]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - arch/sparc/lib/NG2memcpy.S
Merge remote-tracking branches 'asoc/topic/sta529', 'asoc/topic/sti', 'asoc/topic...
[mirror_ubuntu-bionic-kernel.git] / arch / sparc / lib / NG2memcpy.S
CommitLineData
/* NG2memcpy.S: Niagara-2 optimized memcpy.
 *
 * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
 */

/* Build configuration.
 *
 * __KERNEL__ defined:   linkage, VIS and ASI definitions come from the
 *                       included headers; the spare global is %g7.
 * __KERNEL__ undefined: the ASI numbers, FPRS_FEF and the
 *                       VISEntryHalf/VISExitHalf pair are defined here;
 *                       the spare global is %g5.
 *
 * The local VISEntryHalf saves %fprs in %o5 and then writes FPRS_FEF to
 * %fprs; VISExitHalf masks %o5 with FPRS_FEF and writes it back.  %o5
 * therefore has to stay intact between the two.  With MEMCPY_DEBUG the
 * entry macro additionally clears %g1/%g2/%g3/%g5 and rewrites the
 * condition codes (presumably to mimic the clobbers of the kernel
 * version of the macro -- confirm against asm/visasm.h).
 */
#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_PNF 0x82
#define ASI_BLK_P 0xf0
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#define GLOBAL_SPARE	%g5
#endif

/* Overridable building blocks.
 *
 * Every macro below is defined only if it is not already defined, so a
 * file that includes this one can substitute its own version first
 * (presumably how the user-copy variants hook in fault handling --
 * confirm against the including files).
 */

/* ASI used by STORE_INIT.  SIMULATE_NIAGARA_ON_NON_NIAGARA swaps the
 * block-init ASI for the plain primary ASI.
 */
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

/* EX_LD/EX_ST (and the _FP forms used inside the VIS section) wrap a
 * memory access 'x' together with the name 'y' of a fault-return stub.
 * The defaults emit only the access and drop the stub name.
 */
#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

/* Plain load: "type [addr], dest". */
#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

/* Block load through ASI_BLK_P. */
#ifndef LOAD_BLK
#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
#endif

/* Plain store: "type src, [addr]".  Under MEMCPY_DEBUG the alternate
 * space form of the instruction is used with an explicit ASI of 0x80.
 */
#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)	type src, [addr]
#else
#define STORE(type,src,addr)	type##a src, [addr] 0x80
#endif
#endif

/* Block store through ASI_BLK_P. */
#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

/* 64-bit store through STORE_ASI. */
#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

/* Symbol name given to the routine. */
#ifndef FUNC_NAME
#define FUNC_NAME	NG2memcpy
#endif

/* Optional code emitted right after the length sanity check. */
#ifndef PREAMBLE
#define PREAMBLE
#endif

/* Condition-code register used by the %XCC branches. */
#ifndef XCC
#define XCC xcc
#endif

/* FREG_FROB(x0, ..., x8)
 *
 * Emits eight faligndata instructions.  The k-th one combines the
 * neighbouring source registers x(k) and x(k+1) into %f(2k), so nine
 * source double registers produce the eight results %f0 ... %f14.
 * Register names are passed without the leading '%'.
 *
 * faligndata takes its byte offset from the GSR alignment field; in
 * this file that field is set by the alignaddr in FUNC_NAME.
 */
#define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8) \
	faligndata	%x0, %x1, %f0; \
	faligndata	%x1, %x2, %f2; \
	faligndata	%x2, %x3, %f4; \
	faligndata	%x3, %x4, %f6; \
	faligndata	%x4, %x5, %f8; \
	faligndata	%x5, %x6, %f10; \
	faligndata	%x6, %x7, %f12; \
	faligndata	%x7, %x8, %f14;

/* FREG_MOVE_n(x0, ..., x(n-1))
 *
 * Copies n double registers, in order, into %f0, %f2, ... %f(2n-2)
 * using fsrc2.  Register names are passed without the leading '%'.
 *
 * Each macro is the previous one plus one more fsrc2, so the
 * source-to-destination pairing is written exactly once per slot.
 * The expansion is the same instruction sequence as n explicit fsrc2
 * lines, each terminated by ';'.
 */
#define FREG_MOVE_1(x0) \
	fsrc2		%x0, %f0;
#define FREG_MOVE_2(x0, x1) \
	FREG_MOVE_1(x0) \
	fsrc2		%x1, %f2;
#define FREG_MOVE_3(x0, x1, x2) \
	FREG_MOVE_2(x0, x1) \
	fsrc2		%x2, %f4;
#define FREG_MOVE_4(x0, x1, x2, x3) \
	FREG_MOVE_3(x0, x1, x2) \
	fsrc2		%x3, %f6;
#define FREG_MOVE_5(x0, x1, x2, x3, x4) \
	FREG_MOVE_4(x0, x1, x2, x3) \
	fsrc2		%x4, %f8;
#define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
	FREG_MOVE_5(x0, x1, x2, x3, x4) \
	fsrc2		%x5, %f10;
#define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
	FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
	fsrc2		%x6, %f12;
#define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
	FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
	fsrc2		%x7, %f14;
/* FREG_LOAD_n(base, x0, ..., x(n-1))
 *
 * Loads n consecutive doublewords with ldd from base + 0x00,
 * base + 0x08, ... into the named double registers (names passed
 * without the leading '%').  Every load is wrapped in EX_LD_FP and
 * names NG2_retl_o2_plus_g1 as its fault-return stub.
 *
 * Each macro is the previous one plus one more load.  As in the
 * original definitions, FREG_LOAD_1 has no trailing ';' while the
 * multi-load forms separate and terminate their loads with ';'.
 */
#define FREG_LOAD_1(base, x0) \
	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1)
#define FREG_LOAD_2(base, x0, x1) \
	FREG_LOAD_1(base, x0); \
	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1);
#define FREG_LOAD_3(base, x0, x1, x2) \
	FREG_LOAD_2(base, x0, x1) \
	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1);
#define FREG_LOAD_4(base, x0, x1, x2, x3) \
	FREG_LOAD_3(base, x0, x1, x2) \
	EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1);
#define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
	FREG_LOAD_4(base, x0, x1, x2, x3) \
	EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1);
#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
	FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
	EX_LD_FP(LOAD(ldd, base + 0x28, %x5), NG2_retl_o2_plus_g1);
#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \
	FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
	EX_LD_FP(LOAD(ldd, base + 0x30, %x6), NG2_retl_o2_plus_g1);
/* Assembler setup: %g2 and %g3 are used as scratch registers. */

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text

/* Fault-return stubs.
 *
 * This section is emitted only when EX_RETVAL has not been defined by
 * an including file.  It supplies the default EX_RETVAL (identity)
 * and, for kernel builds, the NG2_retl_* stubs named as the second
 * argument of the EX_LD/EX_ST wrappers.
 *
 * Each stub computes in %o0 the expression spelled out in its name
 * (for example NG2_retl_o2_plus_o4_plus_8 returns %o2 + %o4 + 8) and
 * returns through __restore_asi, which executes retl with
 * "wr %g0, ASI_AIUS, %asi" in the delay slot.  The *_fp stubs go
 * through __restore_fp instead, which runs VISExitHalf first and then
 * falls into __restore_asi.  Judging by the call sites, the value is
 * the number of bytes still uncopied at the faulting access.
 *
 * Stubs that add a constant update %o4 / %g1 in place, and the and_7
 * stubs mask %o2 in place, before forming the result.
 *
 * The stubs use ENTRY/ENDPROC and ASI_AIUS, which this file obtains
 * only from the headers included under __KERNEL__, so they are built
 * only in that configuration.  The default EX_LD/EX_ST wrappers drop
 * the stub name, so a non-kernel build never references them.
 *
 * NOTE(review): the FP call sites name NG2_retl_o2_plus_g1 and
 * NG2_retl_o2_plus_g1_plus_64 while only the *_fp symbols are defined
 * here; presumably the overriding EX_LD_FP/EX_ST_FP append "_fp" --
 * confirm in the including files.
 */
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#ifdef __KERNEL__
__restore_fp:
	VISExitHalf
__restore_asi:
	retl
	 wr	%g0, ASI_AIUS, %asi
ENTRY(NG2_retl_o2)
	ba,pt	%xcc, __restore_asi
	 mov	%o2, %o0
ENDPROC(NG2_retl_o2)
ENTRY(NG2_retl_o2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 1, %o0
ENDPROC(NG2_retl_o2_plus_1)
ENTRY(NG2_retl_o2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 4, %o0
ENDPROC(NG2_retl_o2_plus_4)
ENTRY(NG2_retl_o2_plus_8)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 8, %o0
ENDPROC(NG2_retl_o2_plus_8)
ENTRY(NG2_retl_o2_plus_o4_plus_1)
	add	%o4, 1, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG2_retl_o2_plus_o4_plus_1)
ENTRY(NG2_retl_o2_plus_o4_plus_8)
	add	%o4, 8, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG2_retl_o2_plus_o4_plus_8)
ENTRY(NG2_retl_o2_plus_o4_plus_16)
	add	%o4, 16, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG2_retl_o2_plus_o4_plus_16)
ENTRY(NG2_retl_o2_plus_g1_fp)
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g1, %o0
ENDPROC(NG2_retl_o2_plus_g1_fp)
ENTRY(NG2_retl_o2_plus_g1_plus_64_fp)
	add	%g1, 64, %g1
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g1, %o0
ENDPROC(NG2_retl_o2_plus_g1_plus_64_fp)
ENTRY(NG2_retl_o2_plus_g1_plus_1)
	add	%g1, 1, %g1
	ba,pt	%xcc, __restore_asi
	 add	%o2, %g1, %o0
ENDPROC(NG2_retl_o2_plus_g1_plus_1)
ENTRY(NG2_retl_o2_and_7_plus_o4)
	and	%o2, 7, %o2
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG2_retl_o2_and_7_plus_o4)
ENTRY(NG2_retl_o2_and_7_plus_o4_plus_8)
	and	%o2, 7, %o2
	add	%o4, 8, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG2_retl_o2_and_7_plus_o4_plus_8)
#endif /* __KERNEL__ */
#endif /* !EX_RETVAL */

/* FUNC_NAME(dst, src, len)
 *
 * In:   %o0 = dst, %o1 = src, %o2 = len
 * Out:  %o0 = EX_RETVAL(original dst)   (original dst is kept in %o3)
 * Uses: %o3, %o4, %o5, %g1, %g2, %g3, GLOBAL_SPARE, condition codes,
 *       and in the block path the FP registers, %fprs and the GSR
 *       alignment field.
 *
 * Traps with "tne 5" when len >> 31 is non-zero.
 *
 * Strategy, by length:
 *   len == 0          -> return immediately (85).
 *   0 < len < 16      -> label 80: word loop when dst, src and len are
 *                        all multiples of 4, otherwise byte loop (90).
 *   16 <= len < 4*64  -> first label 75: 16-byte ldx/stx loop (72) when
 *                        dst and src are both 8-byte aligned; otherwise
 *                        second label 75: align dst to 8, then either
 *                        rejoin 72/73 or run the shift-and-merge loop
 *                        (8) when src is still misaligned.
 *   len >= 4*64       -> align dst to 64 bytes, then copy 64-byte
 *                        blocks with VIS block loads/stores, realigning
 *                        with faligndata according to (src & 63);
 *                        the tail goes through the byte loop (90).
 *
 * In the short paths GLOBAL_SPARE holds (dst - src), so a store to
 * [%o1 + GLOBAL_SPARE] addresses the destination byte matching the
 * source pointer %o1.
 */
	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	/* Sanity check: trap if any of len bits 63..31 is set. */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, %o3		/* keep dst for the return value */
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, GLOBAL_SPARE	/* dst | src, for alignment tests */
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		GLOBAL_SPARE, %o2, GLOBAL_SPARE	/* taken only: dst | src | len */

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 *
	 * However, the cut-off point, performance wise, is around
	 * 4 64-byte blocks.
	 */
	cmp		%o2, (4 * 64)
	blu,pt		%XCC, 75f
	 andcc		GLOBAL_SPARE, 0x7, %g0	/* flags: are dst and src both 8-aligned? */

	/* %o0: dst
	 * %o1: src
	 * %o2: len  (known to be >= 4 * 64 here)
	 *
	 * The block copy loops can use %o4, %g2, %g3 as
	 * temporaries while copying the data.  %o5 must
	 * be preserved between VISEntryHalf and VISExitHalf
	 */

	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	LOAD(prefetch, %o1 + 0x080, #one_read)

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %o4
	be,pt		%XCC, 2f
	 sub		%o4, 64, %o4
	sub		%g0, %o4, %o4	! bytes to align dst
	sub		%o2, %o4, %o2
1:	subcc		%o4, 1, %o4
	EX_LD(LOAD(ldub, %o1, %g1), NG2_retl_o2_plus_o4_plus_1)
	EX_ST(STORE(stb, %g1, %o0), NG2_retl_o2_plus_o4_plus_1)
	add		%o1, 1, %o1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0

2:
	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 */
	VISEntryHalf

	membar		#Sync
	alignaddr	%o1, %g0, %g0

	/* Set up the block loop registers:
	 *   %o4 = src rounded up to a 64-byte boundary (block load cursor)
	 *   %g1 = len & ~63, bytes handled by the block loop
	 *   %o2 = len &  63, tail bytes left for afterwards
	 *   %g2 = src &  63, selects the realignment variant
	 *   %o1 = src + %g1, source position of the tail
	 *   %g3 = dst - %o4, so that %o4 + %g3 is the dst cursor
	 */
	add		%o1, (64 - 1), %o4
	andn		%o4, (64 - 1), %o4
	andn		%o2, (64 - 1), %g1
	sub		%o2, %g1, %o2

	and		%o1, (64 - 1), %g2
	add		%o1, %g1, %o1
	sub		%o0, %o4, %g3

	/* Dispatch on %g2 (src & 63) in steps of 8:
	 *    0     -> 190      32..39 -> 150
	 *    1..7  -> 110      40..47 -> 160
	 *    8..15 -> 120      48..55 -> 170
	 *   16..23 -> 130      56..63 -> 180
	 *   24..31 -> 140
	 * The compares sit in (partly annulled) delay slots so that the
	 * branch at the target label can test them directly.
	 */
	brz,pt		%g2, 190f
	 cmp		%g2, 32
	blu,a		5f
	 cmp		%g2, 16
	cmp		%g2, 48
	blu,a		4f
	 cmp		%g2, 40
	cmp		%g2, 56
	blu		170f
	 nop
	ba,a,pt		%xcc, 180f
	 nop

4:	/* 32 <= low bits < 48 */
	blu		150f
	 nop
	ba,a,pt		%xcc, 160f
	 nop
5:	/* 0 < low bits < 32 */
	blu,a		6f
	 cmp		%g2, 8
	cmp		%g2, 24
	blu		130f
	 nop
	ba,a,pt		%xcc, 140f
	 nop
6:	/* 0 < low bits < 16 */
	bgeu		120f
	 nop
	/* fall through for 0 < low bits < 8 */

	/* Loops 110..180 share one shape.  Before the loop, the source
	 * doublewords lying between the real src and %o4 are loaded into
	 * the low FP registers.  Each iteration then issues STORE_INIT of
	 * %g0 to the dst line, block-loads the next 64 source bytes at
	 * %o4 into %f16..%f30, realigns with FREG_FROB into %f0..%f14,
	 * block-stores %f0.. to dst, and moves the not yet consumed
	 * doublewords down to %f0.. for the next round.
	 */
110:	sub		%o4, 64, %g2
	EX_LD_FP(LOAD_BLK(%g2, %f0), NG2_retl_o2_plus_g1)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

120:	sub		%o4, 56, %g2
	FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

130:	sub		%o4, 48, %g2
	FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

140:	sub		%o4, 40, %g2
	FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_5(f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

150:	sub		%o4, 32, %g2
	FREG_LOAD_4(%g2, f0, f2, f4, f6)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_4(f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

160:	sub		%o4, 24, %g2
	FREG_LOAD_3(%g2, f0, f2, f4)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_3(f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

170:	sub		%o4, 16, %g2
	FREG_LOAD_2(%g2, f0, f2)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_2(f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

180:	sub		%o4, 8, %g2
	FREG_LOAD_1(%g2, f0)
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
	FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
	FREG_MOVE_1(f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

	/* src is 64-byte aligned: straight block load / block store.
	 * %g1 is decremented before the block load here, hence the
	 * "_plus_64" fault stubs on the load and the block store.
	 */
190:
1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
	subcc		%g1, 64, %g1
	EX_LD_FP(LOAD_BLK(%o4, %f0), NG2_retl_o2_plus_g1_plus_64)
	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1_plus_64)
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)

195:
	add		%o4, %g3, %o0		/* dst position after the blocks */
	membar		#Sync

	VISExitHalf

	/* %o2 contains any final bytes still needed to be copied
	 * over. If anything is left, we copy it one byte at a time.
	 */
	brz,pt		%o2, 85f
	 sub		%o0, %o1, GLOBAL_SPARE	/* re-establish dst - src */
	ba,a,pt		%XCC, 90f
	 nop

	.align		64
75:	/* 16 <= len < 4 * 64 */
	/* Flags come from "andcc GLOBAL_SPARE, 0x7" in the delay slot of
	 * the branch that got us here: not-equal means dst or src is not
	 * 8-byte aligned, handled at the second label 75 below.
	 */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, GLOBAL_SPARE

	/* dst and src 8-byte aligned: 16 bytes per iteration.
	 *   %o4 = len & ~15 (loop bytes), %o2 = len & 15 (tail)
	 */
72:
	andn		%o2, 0xf, %o4
	and		%o2, 0xf, %o2
1:	subcc		%o4, 0x10, %o4
	EX_LD(LOAD(ldx, %o1, %o5), NG2_retl_o2_plus_o4_plus_16)
	add		%o1, 0x08, %o1
	EX_LD(LOAD(ldx, %o1, %g1), NG2_retl_o2_plus_o4_plus_16)
	sub		%o1, 0x08, %o1
	EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_o4_plus_16)
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_o4_plus_8)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
	/* Tail of fewer than 16 bytes: one 8-byte piece, one 4-byte
	 * piece, then whatever is left goes to the byte loop.
	 */
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	EX_LD(LOAD(ldx, %o1, %o5), NG2_retl_o2_plus_8)
	EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_8)
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	EX_LD(LOAD(lduw, %o1, %o5), NG2_retl_o2_plus_4)
	EX_ST(STORE(stw, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_4)
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

	/* dst and/or src not 8-byte aligned.  First bring dst up to an
	 * 8-byte boundary one byte at a time; %g1 = 8 - (dst & 7) is the
	 * number of bytes needed.  When dst is already aligned the branch
	 * skips both the length adjustment and the byte loop.
	 */
75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %o1, %o5), NG2_retl_o2_plus_g1_plus_1)
	EX_ST(STORE(stb, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

	/* dst is now 8-byte aligned.  If src is as well, rejoin the
	 * aligned code above; otherwise %g1 = (src & 7) * 8 is the bit
	 * shift used by the merge loop at 8.
	 */
2:	add		%o1, GLOBAL_SPARE, %o0	/* current dst */
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

	/* Shift-and-merge copy for misaligned src:
	 *   %g1          = left shift in bits
	 *   GLOBAL_SPARE = 64 - %g1, right shift in bits
	 *   %g2          = bits carried over from the previous source word
	 *   %o4          = len & ~7, bytes produced by this loop
	 * src is rounded down to 8 bytes; each iteration loads the next
	 * source word and stores one merged destination word.
	 */
8:	mov		64, GLOBAL_SPARE
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2), NG2_retl_o2)
	sub		GLOBAL_SPARE, %g1, GLOBAL_SPARE
	andn		%o2, 0x7, %o4
	sllx		%g2, %g1, %g2
1:	add		%o1, 0x8, %o1
	EX_LD(LOAD(ldx, %o1, %g3), NG2_retl_o2_and_7_plus_o4)
	subcc		%o4, 0x8, %o4
	srlx		%g3, GLOBAL_SPARE, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0), NG2_retl_o2_and_7_plus_o4_plus_8)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	/* Convert the shift back to a byte offset, restore the true src
	 * position and copy the remaining (len & 7) bytes, if any.
	 */
	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, GLOBAL_SPARE

	.align		64
80:	/* 0 < len < 16 */
	/* GLOBAL_SPARE = dst | src | len here.  Use the word loop only
	 * when all three are multiples of 4, else copy bytes.
	 */
	andcc		GLOBAL_SPARE, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, GLOBAL_SPARE

1:
	subcc		%o2, 4, %o2
	EX_LD(LOAD(lduw, %o1, %g1), NG2_retl_o2_plus_4)
	EX_ST(STORE(stw, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_4)
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

	/* Common exit: return the original dst. */
85:	retl
	 mov		EX_RETVAL(%o3), %o0

	/* Byte-at-a-time copy of %o2 (> 0) bytes. */
	.align		32
90:
	subcc		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %g1), NG2_retl_o2_plus_1)
	EX_ST(STORE(stb, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_1)
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o3), %o0

	.size		FUNC_NAME, .-FUNC_NAME