]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blob - arch/microblaze/lib/fastcopy.S
Merge branch 'for-2.6.39' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu
[mirror_ubuntu-bionic-kernel.git] / arch / microblaze / lib / fastcopy.S
1 /*
2 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
3 * Copyright (C) 2008-2009 PetaLogix
4 * Copyright (C) 2008 Jim Law - Iris LP All rights reserved.
5 *
6 * This file is subject to the terms and conditions of the GNU General
7 * Public License. See the file COPYING in the main directory of this
8 * archive for more details.
9 *
10 * Written by Jim Law <jlaw@irispower.com>
11 *
12 * intended to replace:
13 * memcpy in memcpy.c and
14 * memmove in memmove.c
15 * ... in arch/microblaze/lib
16 *
17 *
18 * assly_fastcopy.S
19 *
20 * Attempt at quicker memcpy and memmove for MicroBlaze
21 * Input : Operand1 in Reg r5 - destination address
22 * Operand2 in Reg r6 - source address
23 * Operand3 in Reg r7 - number of bytes to transfer
24 * Output: Result in Reg r3 - starting destination address
25 *
26 *
27 * Explanation:
28 * Perform (possibly unaligned) copy of a block of memory
29 * between mem locations with size of xfer spec'd in bytes
30 */
31
32 #ifdef __MICROBLAZEEL__
33 #error Microblaze LE not support ASM optimized lib func. Disable OPT_LIB_ASM.
34 #endif
35
36 #include <linux/linkage.h>
37 .text
38 .globl memcpy
39 .type memcpy, @function
40 .ent memcpy
41
42 memcpy:
43 fast_memcpy_ascending:
44 /* move d to return register as value of function */
45 addi r3, r5, 0
46
47 addi r4, r0, 4 /* n = 4 */
48 cmpu r4, r4, r7 /* n = c - n (unsigned) */
49 blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
50
51 /* transfer first 0~3 bytes to get aligned dest address */
52 andi r4, r5, 3 /* n = d & 3 */
53 /* if zero, destination already aligned */
54 beqi r4, a_dalign_done
55 /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
56 rsubi r4, r4, 4
57 rsub r7, r4, r7 /* c = c - n adjust c */
58
59 a_xfer_first_loop:
60 /* if no bytes left to transfer, transfer the bulk */
61 beqi r4, a_dalign_done
62 lbui r11, r6, 0 /* h = *s */
63 sbi r11, r5, 0 /* *d = h */
64 addi r6, r6, 1 /* s++ */
65 addi r5, r5, 1 /* d++ */
66 brid a_xfer_first_loop /* loop */
67 addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
68
69 a_dalign_done:
70 addi r4, r0, 32 /* n = 32 */
71 cmpu r4, r4, r7 /* n = c - n (unsigned) */
72 /* if n < 0, less than one block to transfer */
73 blti r4, a_block_done
74
75 a_block_xfer:
76 andi r4, r7, 0xffffffe0 /* n = c & ~31 */
77 rsub r7, r4, r7 /* c = c - n */
78
79 andi r9, r6, 3 /* t1 = s & 3 */
80 /* if temp != 0, unaligned transfers needed */
81 bnei r9, a_block_unaligned
82
83 a_block_aligned:
84 lwi r9, r6, 0 /* t1 = *(s + 0) */
85 lwi r10, r6, 4 /* t2 = *(s + 4) */
86 lwi r11, r6, 8 /* t3 = *(s + 8) */
87 lwi r12, r6, 12 /* t4 = *(s + 12) */
88 swi r9, r5, 0 /* *(d + 0) = t1 */
89 swi r10, r5, 4 /* *(d + 4) = t2 */
90 swi r11, r5, 8 /* *(d + 8) = t3 */
91 swi r12, r5, 12 /* *(d + 12) = t4 */
92 lwi r9, r6, 16 /* t1 = *(s + 16) */
93 lwi r10, r6, 20 /* t2 = *(s + 20) */
94 lwi r11, r6, 24 /* t3 = *(s + 24) */
95 lwi r12, r6, 28 /* t4 = *(s + 28) */
96 swi r9, r5, 16 /* *(d + 16) = t1 */
97 swi r10, r5, 20 /* *(d + 20) = t2 */
98 swi r11, r5, 24 /* *(d + 24) = t3 */
99 swi r12, r5, 28 /* *(d + 28) = t4 */
100 addi r6, r6, 32 /* s = s + 32 */
101 addi r4, r4, -32 /* n = n - 32 */
102 bneid r4, a_block_aligned /* while (n) loop */
103 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
104 bri a_block_done
105
106 a_block_unaligned:
107 andi r8, r6, 0xfffffffc /* as = s & ~3 */
108 add r6, r6, r4 /* s = s + n */
109 lwi r11, r8, 0 /* h = *(as + 0) */
110
111 addi r9, r9, -1
112 beqi r9, a_block_u1 /* t1 was 1 => 1 byte offset */
113 addi r9, r9, -1
114 beqi r9, a_block_u2 /* t1 was 2 => 2 byte offset */
115
116 a_block_u3:
117 bslli r11, r11, 24 /* h = h << 24 */
118 a_bu3_loop:
119 lwi r12, r8, 4 /* v = *(as + 4) */
120 bsrli r9, r12, 8 /* t1 = v >> 8 */
121 or r9, r11, r9 /* t1 = h | t1 */
122 swi r9, r5, 0 /* *(d + 0) = t1 */
123 bslli r11, r12, 24 /* h = v << 24 */
124 lwi r12, r8, 8 /* v = *(as + 8) */
125 bsrli r9, r12, 8 /* t1 = v >> 8 */
126 or r9, r11, r9 /* t1 = h | t1 */
127 swi r9, r5, 4 /* *(d + 4) = t1 */
128 bslli r11, r12, 24 /* h = v << 24 */
129 lwi r12, r8, 12 /* v = *(as + 12) */
130 bsrli r9, r12, 8 /* t1 = v >> 8 */
131 or r9, r11, r9 /* t1 = h | t1 */
132 swi r9, r5, 8 /* *(d + 8) = t1 */
133 bslli r11, r12, 24 /* h = v << 24 */
134 lwi r12, r8, 16 /* v = *(as + 16) */
135 bsrli r9, r12, 8 /* t1 = v >> 8 */
136 or r9, r11, r9 /* t1 = h | t1 */
137 swi r9, r5, 12 /* *(d + 12) = t1 */
138 bslli r11, r12, 24 /* h = v << 24 */
139 lwi r12, r8, 20 /* v = *(as + 20) */
140 bsrli r9, r12, 8 /* t1 = v >> 8 */
141 or r9, r11, r9 /* t1 = h | t1 */
142 swi r9, r5, 16 /* *(d + 16) = t1 */
143 bslli r11, r12, 24 /* h = v << 24 */
144 lwi r12, r8, 24 /* v = *(as + 24) */
145 bsrli r9, r12, 8 /* t1 = v >> 8 */
146 or r9, r11, r9 /* t1 = h | t1 */
147 swi r9, r5, 20 /* *(d + 20) = t1 */
148 bslli r11, r12, 24 /* h = v << 24 */
149 lwi r12, r8, 28 /* v = *(as + 28) */
150 bsrli r9, r12, 8 /* t1 = v >> 8 */
151 or r9, r11, r9 /* t1 = h | t1 */
152 swi r9, r5, 24 /* *(d + 24) = t1 */
153 bslli r11, r12, 24 /* h = v << 24 */
154 lwi r12, r8, 32 /* v = *(as + 32) */
155 bsrli r9, r12, 8 /* t1 = v >> 8 */
156 or r9, r11, r9 /* t1 = h | t1 */
157 swi r9, r5, 28 /* *(d + 28) = t1 */
158 bslli r11, r12, 24 /* h = v << 24 */
159 addi r8, r8, 32 /* as = as + 32 */
160 addi r4, r4, -32 /* n = n - 32 */
161 bneid r4, a_bu3_loop /* while (n) loop */
162 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
163 bri a_block_done
164
165 a_block_u1:
166 bslli r11, r11, 8 /* h = h << 8 */
167 a_bu1_loop:
168 lwi r12, r8, 4 /* v = *(as + 4) */
169 bsrli r9, r12, 24 /* t1 = v >> 24 */
170 or r9, r11, r9 /* t1 = h | t1 */
171 swi r9, r5, 0 /* *(d + 0) = t1 */
172 bslli r11, r12, 8 /* h = v << 8 */
173 lwi r12, r8, 8 /* v = *(as + 8) */
174 bsrli r9, r12, 24 /* t1 = v >> 24 */
175 or r9, r11, r9 /* t1 = h | t1 */
176 swi r9, r5, 4 /* *(d + 4) = t1 */
177 bslli r11, r12, 8 /* h = v << 8 */
178 lwi r12, r8, 12 /* v = *(as + 12) */
179 bsrli r9, r12, 24 /* t1 = v >> 24 */
180 or r9, r11, r9 /* t1 = h | t1 */
181 swi r9, r5, 8 /* *(d + 8) = t1 */
182 bslli r11, r12, 8 /* h = v << 8 */
183 lwi r12, r8, 16 /* v = *(as + 16) */
184 bsrli r9, r12, 24 /* t1 = v >> 24 */
185 or r9, r11, r9 /* t1 = h | t1 */
186 swi r9, r5, 12 /* *(d + 12) = t1 */
187 bslli r11, r12, 8 /* h = v << 8 */
188 lwi r12, r8, 20 /* v = *(as + 20) */
189 bsrli r9, r12, 24 /* t1 = v >> 24 */
190 or r9, r11, r9 /* t1 = h | t1 */
191 swi r9, r5, 16 /* *(d + 16) = t1 */
192 bslli r11, r12, 8 /* h = v << 8 */
193 lwi r12, r8, 24 /* v = *(as + 24) */
194 bsrli r9, r12, 24 /* t1 = v >> 24 */
195 or r9, r11, r9 /* t1 = h | t1 */
196 swi r9, r5, 20 /* *(d + 20) = t1 */
197 bslli r11, r12, 8 /* h = v << 8 */
198 lwi r12, r8, 28 /* v = *(as + 28) */
199 bsrli r9, r12, 24 /* t1 = v >> 24 */
200 or r9, r11, r9 /* t1 = h | t1 */
201 swi r9, r5, 24 /* *(d + 24) = t1 */
202 bslli r11, r12, 8 /* h = v << 8 */
203 lwi r12, r8, 32 /* v = *(as + 32) */
204 bsrli r9, r12, 24 /* t1 = v >> 24 */
205 or r9, r11, r9 /* t1 = h | t1 */
206 swi r9, r5, 28 /* *(d + 28) = t1 */
207 bslli r11, r12, 8 /* h = v << 8 */
208 addi r8, r8, 32 /* as = as + 32 */
209 addi r4, r4, -32 /* n = n - 32 */
210 bneid r4, a_bu1_loop /* while (n) loop */
211 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
212 bri a_block_done
213
214 a_block_u2:
215 bslli r11, r11, 16 /* h = h << 16 */
216 a_bu2_loop:
217 lwi r12, r8, 4 /* v = *(as + 4) */
218 bsrli r9, r12, 16 /* t1 = v >> 16 */
219 or r9, r11, r9 /* t1 = h | t1 */
220 swi r9, r5, 0 /* *(d + 0) = t1 */
221 bslli r11, r12, 16 /* h = v << 16 */
222 lwi r12, r8, 8 /* v = *(as + 8) */
223 bsrli r9, r12, 16 /* t1 = v >> 16 */
224 or r9, r11, r9 /* t1 = h | t1 */
225 swi r9, r5, 4 /* *(d + 4) = t1 */
226 bslli r11, r12, 16 /* h = v << 16 */
227 lwi r12, r8, 12 /* v = *(as + 12) */
228 bsrli r9, r12, 16 /* t1 = v >> 16 */
229 or r9, r11, r9 /* t1 = h | t1 */
230 swi r9, r5, 8 /* *(d + 8) = t1 */
231 bslli r11, r12, 16 /* h = v << 16 */
232 lwi r12, r8, 16 /* v = *(as + 16) */
233 bsrli r9, r12, 16 /* t1 = v >> 16 */
234 or r9, r11, r9 /* t1 = h | t1 */
235 swi r9, r5, 12 /* *(d + 12) = t1 */
236 bslli r11, r12, 16 /* h = v << 16 */
237 lwi r12, r8, 20 /* v = *(as + 20) */
238 bsrli r9, r12, 16 /* t1 = v >> 16 */
239 or r9, r11, r9 /* t1 = h | t1 */
240 swi r9, r5, 16 /* *(d + 16) = t1 */
241 bslli r11, r12, 16 /* h = v << 16 */
242 lwi r12, r8, 24 /* v = *(as + 24) */
243 bsrli r9, r12, 16 /* t1 = v >> 16 */
244 or r9, r11, r9 /* t1 = h | t1 */
245 swi r9, r5, 20 /* *(d + 20) = t1 */
246 bslli r11, r12, 16 /* h = v << 16 */
247 lwi r12, r8, 28 /* v = *(as + 28) */
248 bsrli r9, r12, 16 /* t1 = v >> 16 */
249 or r9, r11, r9 /* t1 = h | t1 */
250 swi r9, r5, 24 /* *(d + 24) = t1 */
251 bslli r11, r12, 16 /* h = v << 16 */
252 lwi r12, r8, 32 /* v = *(as + 32) */
253 bsrli r9, r12, 16 /* t1 = v >> 16 */
254 or r9, r11, r9 /* t1 = h | t1 */
255 swi r9, r5, 28 /* *(d + 28) = t1 */
256 bslli r11, r12, 16 /* h = v << 16 */
257 addi r8, r8, 32 /* as = as + 32 */
258 addi r4, r4, -32 /* n = n - 32 */
259 bneid r4, a_bu2_loop /* while (n) loop */
260 addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
261
262 a_block_done:
263 addi r4, r0, 4 /* n = 4 */
264 cmpu r4, r4, r7 /* n = c - n (unsigned) */
265 blti r4, a_xfer_end /* if n < 0, less than one word to transfer */
266
267 a_word_xfer:
268 andi r4, r7, 0xfffffffc /* n = c & ~3 */
269 addi r10, r0, 0 /* offset = 0 */
270
271 andi r9, r6, 3 /* t1 = s & 3 */
272 /* if temp != 0, unaligned transfers needed */
273 bnei r9, a_word_unaligned
274
275 a_word_aligned:
276 lw r9, r6, r10 /* t1 = *(s+offset) */
277 sw r9, r5, r10 /* *(d+offset) = t1 */
278 addi r4, r4,-4 /* n-- */
279 bneid r4, a_word_aligned /* loop */
280 addi r10, r10, 4 /* offset++ (IN DELAY SLOT) */
281
282 bri a_word_done
283
284 a_word_unaligned:
285 andi r8, r6, 0xfffffffc /* as = s & ~3 */
286 lwi r11, r8, 0 /* h = *(as + 0) */
287 addi r8, r8, 4 /* as = as + 4 */
288
289 addi r9, r9, -1
290 beqi r9, a_word_u1 /* t1 was 1 => 1 byte offset */
291 addi r9, r9, -1
292 beqi r9, a_word_u2 /* t1 was 2 => 2 byte offset */
293
294 a_word_u3:
295 bslli r11, r11, 24 /* h = h << 24 */
296 a_wu3_loop:
297 lw r12, r8, r10 /* v = *(as + offset) */
298 bsrli r9, r12, 8 /* t1 = v >> 8 */
299 or r9, r11, r9 /* t1 = h | t1 */
300 sw r9, r5, r10 /* *(d + offset) = t1 */
301 bslli r11, r12, 24 /* h = v << 24 */
302 addi r4, r4,-4 /* n = n - 4 */
303 bneid r4, a_wu3_loop /* while (n) loop */
304 addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
305
306 bri a_word_done
307
308 a_word_u1:
309 bslli r11, r11, 8 /* h = h << 8 */
310 a_wu1_loop:
311 lw r12, r8, r10 /* v = *(as + offset) */
312 bsrli r9, r12, 24 /* t1 = v >> 24 */
313 or r9, r11, r9 /* t1 = h | t1 */
314 sw r9, r5, r10 /* *(d + offset) = t1 */
315 bslli r11, r12, 8 /* h = v << 8 */
316 addi r4, r4,-4 /* n = n - 4 */
317 bneid r4, a_wu1_loop /* while (n) loop */
318 addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
319
320 bri a_word_done
321
322 a_word_u2:
323 bslli r11, r11, 16 /* h = h << 16 */
324 a_wu2_loop:
325 lw r12, r8, r10 /* v = *(as + offset) */
326 bsrli r9, r12, 16 /* t1 = v >> 16 */
327 or r9, r11, r9 /* t1 = h | t1 */
328 sw r9, r5, r10 /* *(d + offset) = t1 */
329 bslli r11, r12, 16 /* h = v << 16 */
330 addi r4, r4,-4 /* n = n - 4 */
331 bneid r4, a_wu2_loop /* while (n) loop */
332 addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */
333
334 a_word_done:
335 add r5, r5, r10 /* d = d + offset */
336 add r6, r6, r10 /* s = s + offset */
337 rsub r7, r10, r7 /* c = c - offset */
338
339 a_xfer_end:
340 a_xfer_end_loop:
341 beqi r7, a_done /* while (c) */
342 lbui r9, r6, 0 /* t1 = *s */
343 addi r6, r6, 1 /* s++ */
344 sbi r9, r5, 0 /* *d = t1 */
345 addi r7, r7, -1 /* c-- */
346 brid a_xfer_end_loop /* loop */
347 addi r5, r5, 1 /* d++ (IN DELAY SLOT) */
348
349 a_done:
350 rtsd r15, 8
351 nop
352
353 .size memcpy, . - memcpy
354 .end memcpy
355 /*----------------------------------------------------------------------------*/
356 .globl memmove
357 .type memmove, @function
358 .ent memmove
359 
/*
 * void *memmove(void *d, const void *s, size_t c)
 * In:   r5 = d (destination), r6 = s (source), r7 = c (byte count)
 * Out:  r3 = original destination pointer
 * If s >= d the regions are safe to copy ascending, so control jumps
 * straight into memcpy's fast_memcpy_ascending.  Otherwise copy
 * descending (from the end of both buffers) so overlap is handled.
 * Mirrors memcpy's structure: align d, 32-byte blocks, words, bytes.
 */
360 memmove:
361 cmpu r4, r5, r6 /* n = s - d */
362 bgei r4,fast_memcpy_ascending
363 
364 fast_memcpy_descending:
365 /* move d to return register as value of function */
366 addi r3, r5, 0
367 
/* point d and s one past the end; all copying walks downward */
368 add r5, r5, r7 /* d = d + c */
369 add r6, r6, r7 /* s = s + c */
370 
371 addi r4, r0, 4 /* n = 4 */
372 cmpu r4, r4, r7 /* n = c - n (unsigned) */
373 blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
374 
375 /* transfer first 0~3 bytes to get aligned dest address */
376 andi r4, r5, 3 /* n = d & 3 */
377 /* if zero, destination already aligned */
378 beqi r4,d_dalign_done
379 rsub r7, r4, r7 /* c = c - n adjust c */
380 
381 d_xfer_first_loop:
382 /* if no bytes left to transfer, transfer the bulk */
383 beqi r4,d_dalign_done
384 addi r6, r6, -1 /* s-- */
385 addi r5, r5, -1 /* d-- */
386 lbui r11, r6, 0 /* h = *s */
387 sbi r11, r5, 0 /* *d = h */
388 brid d_xfer_first_loop /* loop */
389 addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
390 
391 d_dalign_done:
392 addi r4, r0, 32 /* n = 32 */
393 cmpu r4, r4, r7 /* n = c - n (unsigned) */
394 /* if n < 0, less than one block to transfer */
395 blti r4, d_block_done
396 
397 d_block_xfer:
398 andi r4, r7, 0xffffffe0 /* n = c & ~31 */
399 rsub r7, r4, r7 /* c = c - n */
400 
401 andi r9, r6, 3 /* t1 = s & 3 */
402 /* if temp != 0, unaligned transfers needed */
403 bnei r9, d_block_unaligned
404 
405 d_block_aligned:
406 addi r6, r6, -32 /* s = s - 32 */
407 addi r5, r5, -32 /* d = d - 32 */
408 lwi r9, r6, 28 /* t1 = *(s + 28) */
409 lwi r10, r6, 24 /* t2 = *(s + 24) */
410 lwi r11, r6, 20 /* t3 = *(s + 20) */
411 lwi r12, r6, 16 /* t4 = *(s + 16) */
412 swi r9, r5, 28 /* *(d + 28) = t1 */
413 swi r10, r5, 24 /* *(d + 24) = t2 */
414 swi r11, r5, 20 /* *(d + 20) = t3 */
415 swi r12, r5, 16 /* *(d + 16) = t4 */
416 lwi r9, r6, 12 /* t1 = *(s + 12) */
417 lwi r10, r6, 8 /* t2 = *(s + 8) */
418 lwi r11, r6, 4 /* t3 = *(s + 4) */
419 lwi r12, r6, 0 /* t4 = *(s + 0) */
420 swi r9, r5, 12 /* *(d + 12) = t1 */
421 swi r10, r5, 8 /* *(d + 8) = t2 */
422 swi r11, r5, 4 /* *(d + 4) = t3 */
423 addi r4, r4, -32 /* n = n - 32 */
424 bneid r4, d_block_aligned /* while (n) loop */
425 swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */
426 bri d_block_done
427 
/*
 * s is not word-aligned: read aligned words from as = s & ~3 and
 * merge adjacent words with shifts (descending).  r9 = s & 3 selects
 * the 1/2/3-byte variant; r11 carries leftover low bytes between
 * iterations (big-endian layout).
 */
428 d_block_unaligned:
429 andi r8, r6, 0xfffffffc /* as = s & ~3 */
430 rsub r6, r4, r6 /* s = s - n */
431 lwi r11, r8, 0 /* h = *(as + 0) */
432 
433 addi r9, r9, -1
434 beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */
435 addi r9, r9, -1
436 beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */
437 
438 d_block_u3:
439 bsrli r11, r11, 8 /* h = h >> 8 */
440 d_bu3_loop:
441 addi r8, r8, -32 /* as = as - 32 */
442 addi r5, r5, -32 /* d = d - 32 */
443 lwi r12, r8, 28 /* v = *(as + 28) */
444 bslli r9, r12, 24 /* t1 = v << 24 */
445 or r9, r11, r9 /* t1 = h | t1 */
446 swi r9, r5, 28 /* *(d + 28) = t1 */
447 bsrli r11, r12, 8 /* h = v >> 8 */
448 lwi r12, r8, 24 /* v = *(as + 24) */
449 bslli r9, r12, 24 /* t1 = v << 24 */
450 or r9, r11, r9 /* t1 = h | t1 */
451 swi r9, r5, 24 /* *(d + 24) = t1 */
452 bsrli r11, r12, 8 /* h = v >> 8 */
453 lwi r12, r8, 20 /* v = *(as + 20) */
454 bslli r9, r12, 24 /* t1 = v << 24 */
455 or r9, r11, r9 /* t1 = h | t1 */
456 swi r9, r5, 20 /* *(d + 20) = t1 */
457 bsrli r11, r12, 8 /* h = v >> 8 */
458 lwi r12, r8, 16 /* v = *(as + 16) */
459 bslli r9, r12, 24 /* t1 = v << 24 */
460 or r9, r11, r9 /* t1 = h | t1 */
461 swi r9, r5, 16 /* *(d + 16) = t1 */
462 bsrli r11, r12, 8 /* h = v >> 8 */
463 lwi r12, r8, 12 /* v = *(as + 12) */
464 bslli r9, r12, 24 /* t1 = v << 24 */
465 or r9, r11, r9 /* t1 = h | t1 */
466 swi r9, r5, 12 /* *(d + 12) = t1 */
467 bsrli r11, r12, 8 /* h = v >> 8 */
468 lwi r12, r8, 8 /* v = *(as + 8) */
469 bslli r9, r12, 24 /* t1 = v << 24 */
470 or r9, r11, r9 /* t1 = h | t1 */
471 swi r9, r5, 8 /* *(d + 8) = t1 */
472 bsrli r11, r12, 8 /* h = v >> 8 */
473 lwi r12, r8, 4 /* v = *(as + 4) */
474 bslli r9, r12, 24 /* t1 = v << 24 */
475 or r9, r11, r9 /* t1 = h | t1 */
476 swi r9, r5, 4 /* *(d + 4) = t1 */
477 bsrli r11, r12, 8 /* h = v >> 8 */
478 lwi r12, r8, 0 /* v = *(as + 0) */
479 bslli r9, r12, 24 /* t1 = v << 24 */
480 or r9, r11, r9 /* t1 = h | t1 */
481 swi r9, r5, 0 /* *(d + 0) = t1 */
482 addi r4, r4, -32 /* n = n - 32 */
483 bneid r4, d_bu3_loop /* while (n) loop */
484 bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
485 bri d_block_done
486 
487 d_block_u1:
488 bsrli r11, r11, 24 /* h = h >> 24 */
489 d_bu1_loop:
490 addi r8, r8, -32 /* as = as - 32 */
491 addi r5, r5, -32 /* d = d - 32 */
492 lwi r12, r8, 28 /* v = *(as + 28) */
493 bslli r9, r12, 8 /* t1 = v << 8 */
494 or r9, r11, r9 /* t1 = h | t1 */
495 swi r9, r5, 28 /* *(d + 28) = t1 */
496 bsrli r11, r12, 24 /* h = v >> 24 */
497 lwi r12, r8, 24 /* v = *(as + 24) */
498 bslli r9, r12, 8 /* t1 = v << 8 */
499 or r9, r11, r9 /* t1 = h | t1 */
500 swi r9, r5, 24 /* *(d + 24) = t1 */
501 bsrli r11, r12, 24 /* h = v >> 24 */
502 lwi r12, r8, 20 /* v = *(as + 20) */
503 bslli r9, r12, 8 /* t1 = v << 8 */
504 or r9, r11, r9 /* t1 = h | t1 */
505 swi r9, r5, 20 /* *(d + 20) = t1 */
506 bsrli r11, r12, 24 /* h = v >> 24 */
507 lwi r12, r8, 16 /* v = *(as + 16) */
508 bslli r9, r12, 8 /* t1 = v << 8 */
509 or r9, r11, r9 /* t1 = h | t1 */
510 swi r9, r5, 16 /* *(d + 16) = t1 */
511 bsrli r11, r12, 24 /* h = v >> 24 */
512 lwi r12, r8, 12 /* v = *(as + 12) */
513 bslli r9, r12, 8 /* t1 = v << 8 */
514 or r9, r11, r9 /* t1 = h | t1 */
515 swi r9, r5, 12 /* *(d + 12) = t1 */
516 bsrli r11, r12, 24 /* h = v >> 24 */
517 lwi r12, r8, 8 /* v = *(as + 8) */
518 bslli r9, r12, 8 /* t1 = v << 8 */
519 or r9, r11, r9 /* t1 = h | t1 */
520 swi r9, r5, 8 /* *(d + 8) = t1 */
521 bsrli r11, r12, 24 /* h = v >> 24 */
522 lwi r12, r8, 4 /* v = *(as + 4) */
523 bslli r9, r12, 8 /* t1 = v << 8 */
524 or r9, r11, r9 /* t1 = h | t1 */
525 swi r9, r5, 4 /* *(d + 4) = t1 */
526 bsrli r11, r12, 24 /* h = v >> 24 */
527 lwi r12, r8, 0 /* v = *(as + 0) */
528 bslli r9, r12, 8 /* t1 = v << 8 */
529 or r9, r11, r9 /* t1 = h | t1 */
530 swi r9, r5, 0 /* *(d + 0) = t1 */
531 addi r4, r4, -32 /* n = n - 32 */
532 bneid r4, d_bu1_loop /* while (n) loop */
533 bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
534 bri d_block_done
535 
536 d_block_u2:
537 bsrli r11, r11, 16 /* h = h >> 16 */
538 d_bu2_loop:
539 addi r8, r8, -32 /* as = as - 32 */
540 addi r5, r5, -32 /* d = d - 32 */
541 lwi r12, r8, 28 /* v = *(as + 28) */
542 bslli r9, r12, 16 /* t1 = v << 16 */
543 or r9, r11, r9 /* t1 = h | t1 */
544 swi r9, r5, 28 /* *(d + 28) = t1 */
545 bsrli r11, r12, 16 /* h = v >> 16 */
546 lwi r12, r8, 24 /* v = *(as + 24) */
547 bslli r9, r12, 16 /* t1 = v << 16 */
548 or r9, r11, r9 /* t1 = h | t1 */
549 swi r9, r5, 24 /* *(d + 24) = t1 */
550 bsrli r11, r12, 16 /* h = v >> 16 */
551 lwi r12, r8, 20 /* v = *(as + 20) */
552 bslli r9, r12, 16 /* t1 = v << 16 */
553 or r9, r11, r9 /* t1 = h | t1 */
554 swi r9, r5, 20 /* *(d + 20) = t1 */
555 bsrli r11, r12, 16 /* h = v >> 16 */
556 lwi r12, r8, 16 /* v = *(as + 16) */
557 bslli r9, r12, 16 /* t1 = v << 16 */
558 or r9, r11, r9 /* t1 = h | t1 */
559 swi r9, r5, 16 /* *(d + 16) = t1 */
560 bsrli r11, r12, 16 /* h = v >> 16 */
561 lwi r12, r8, 12 /* v = *(as + 12) */
562 bslli r9, r12, 16 /* t1 = v << 16 */
563 or r9, r11, r9 /* t1 = h | t1 */
564 swi r9, r5, 12 /* *(d + 12) = t1 */
565 bsrli r11, r12, 16 /* h = v >> 16 */
566 lwi r12, r8, 8 /* v = *(as + 8) */
567 bslli r9, r12, 16 /* t1 = v << 16 */
568 or r9, r11, r9 /* t1 = h | t1 */
569 swi r9, r5, 8 /* *(d + 8) = t1 */
570 bsrli r11, r12, 16 /* h = v >> 16 */
571 lwi r12, r8, 4 /* v = *(as + 4) */
572 bslli r9, r12, 16 /* t1 = v << 16 */
573 or r9, r11, r9 /* t1 = h | t1 */
574 swi r9, r5, 4 /* *(d + 4) = t1 */
575 bsrli r11, r12, 16 /* h = v >> 16 */
576 lwi r12, r8, 0 /* v = *(as + 0) */
577 bslli r9, r12, 16 /* t1 = v << 16 */
578 or r9, r11, r9 /* t1 = h | t1 */
579 swi r9, r5, 0 /* *(d + 0) = t1 */
580 addi r4, r4, -32 /* n = n - 32 */
581 bneid r4, d_bu2_loop /* while (n) loop */
582 bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
/* falls through to d_block_done */
583 
584 d_block_done:
585 addi r4, r0, 4 /* n = 4 */
586 cmpu r4, r4, r7 /* n = c - n (unsigned) */
587 blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
588 
589 d_word_xfer:
590 andi r4, r7, 0xfffffffc /* n = c & ~3 */
591 rsub r5, r4, r5 /* d = d - n */
592 rsub r6, r4, r6 /* s = s - n */
593 rsub r7, r4, r7 /* c = c - n */
594 
595 andi r9, r6, 3 /* t1 = s & 3 */
596 /* if temp != 0, unaligned transfers needed */
597 bnei r9, d_word_unaligned
598 
599 d_word_aligned:
600 addi r4, r4,-4 /* n = n - 4 */
601 lw r9, r6, r4 /* t1 = *(s+n) */
602 bneid r4, d_word_aligned /* loop */
603 sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */
604 
605 bri d_word_done
606 
607 d_word_unaligned:
608 andi r8, r6, 0xfffffffc /* as = s & ~3 */
609 lw r11, r8, r4 /* h = *(as + n) */
610 
611 addi r9, r9, -1
612 beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */
613 addi r9, r9, -1
614 beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */
615 
616 d_word_u3:
617 bsrli r11, r11, 8 /* h = h >> 8 */
618 d_wu3_loop:
619 addi r4, r4,-4 /* n = n - 4 */
620 lw r12, r8, r4 /* v = *(as + n) */
621 bslli r9, r12, 24 /* t1 = v << 24 */
622 or r9, r11, r9 /* t1 = h | t1 */
623 sw r9, r5, r4 /* *(d + n) = t1 */
624 bneid r4, d_wu3_loop /* while (n) loop */
625 bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
626 
627 bri d_word_done
628 
629 d_word_u1:
630 bsrli r11, r11, 24 /* h = h >> 24 */
631 d_wu1_loop:
632 addi r4, r4,-4 /* n = n - 4 */
633 lw r12, r8, r4 /* v = *(as + n) */
634 bslli r9, r12, 8 /* t1 = v << 8 */
635 or r9, r11, r9 /* t1 = h | t1 */
636 sw r9, r5, r4 /* *(d + n) = t1 */
637 bneid r4, d_wu1_loop /* while (n) loop */
638 bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
639 
640 bri d_word_done
641 
642 d_word_u2:
643 bsrli r11, r11, 16 /* h = h >> 16 */
644 d_wu2_loop:
645 addi r4, r4,-4 /* n = n - 4 */
646 lw r12, r8, r4 /* v = *(as + n) */
647 bslli r9, r12, 16 /* t1 = v << 16 */
648 or r9, r11, r9 /* t1 = h | t1 */
649 sw r9, r5, r4 /* *(d + n) = t1 */
650 bneid r4, d_wu2_loop /* while (n) loop */
651 bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
652 
653 d_word_done:
654 
/* copy the remaining 0~3 head bytes one at a time (descending) */
655 d_xfer_end:
656 d_xfer_end_loop:
/* NOTE: exits via memcpy's a_done label; its epilogue (rtsd r15, 8 /
 * nop) is identical to d_done below, so behavior is unchanged. */
657 beqi r7, a_done /* while (c) */
658 addi r6, r6, -1 /* s-- */
659 lbui r9, r6, 0 /* t1 = *s */
660 addi r5, r5, -1 /* d-- */
661 sbi r9, r5, 0 /* *d = t1 */
662 brid d_xfer_end_loop /* loop */
663 addi r7, r7, -1 /* c-- (IN DELAY SLOT) */
664 
665 d_done:
666 rtsd r15, 8
667 nop
668 
669 .size memmove, . - memmove
670 .end memmove