git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blob - arch/sparc/lib/NG2memcpy.S
Merge remote-tracking branches 'asoc/topic/rockchip', 'asoc/topic/rt5514', 'asoc...
[mirror_ubuntu-bionic-kernel.git] / arch / sparc / lib / NG2memcpy.S
1 /* NG2memcpy.S: Niagara-2 optimized memcpy.
2 *
3 * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
4 */
5
6 #ifdef __KERNEL__
7 #include <linux/linkage.h>
8 #include <asm/visasm.h>
9 #include <asm/asi.h>
10 #define GLOBAL_SPARE %g7
11 #else
/* Standalone (non-kernel) build: supply the ASI encodings and the
 * VIS-state save/restore helpers that <asm/visasm.h> and <asm/asi.h>
 * would otherwise provide.
 */
12 #define ASI_PNF 0x82
13 #define ASI_BLK_P 0xf0
14 #define ASI_BLK_INIT_QUAD_LDD_P 0xe2
15 #define FPRS_FEF 0x04
16 #ifdef MEMCPY_DEBUG
/* Debug flavour also clears the scratch globals and the condition
 * codes, so stale values cannot be silently relied upon.
 */
17 #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
18 clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
19 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
20 #else
21 #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
22 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
23 #endif
24 #define GLOBAL_SPARE %g5
25 #endif
26
/* STORE_ASI selects the ASI used by STORE_INIT below; the default
 * block-init quad-LDD ASI initializes the destination cache line on
 * the first store.  SIMULATE_NIAGARA_ON_NON_NIAGARA falls back to the
 * plain primary ASI for non-Niagara parts.
 */
27 #ifndef STORE_ASI
28 #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
29 #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
30 #else
31 #define STORE_ASI 0x80 /* ASI_P */
32 #endif
33 #endif
34
/* EX_LD/EX_ST wrap each faultable access; the second argument is the
 * fault-continuation label recorded in the exception table by the
 * copy_{from,to}_user wrappers.  The plain memcpy build defines them
 * away to just the instruction.  The _FP variants mark accesses made
 * while VIS state is live and must therefore restore it on a fault.
 */
35 #ifndef EX_LD
36 #define EX_LD(x,y) x
37 #endif
38 #ifndef EX_LD_FP
39 #define EX_LD_FP(x,y) x
40 #endif
41
42 #ifndef EX_ST
43 #define EX_ST(x,y) x
44 #endif
45 #ifndef EX_ST_FP
46 #define EX_ST_FP(x,y) x
47 #endif
48
49 #ifndef LOAD
50 #define LOAD(type,addr,dest) type [addr], dest
51 #endif
52
53 #ifndef LOAD_BLK
54 #define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_P, dest
55 #endif
56
57 #ifndef STORE
58 #ifndef MEMCPY_DEBUG
59 #define STORE(type,src,addr) type src, [addr]
60 #else
61 #define STORE(type,src,addr) type##a src, [addr] 0x80
62 #endif
63 #endif
64
65 #ifndef STORE_BLK
66 #define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P
67 #endif
68
69 #ifndef STORE_INIT
70 #define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI
71 #endif
72
73 #ifndef FUNC_NAME
74 #define FUNC_NAME NG2memcpy
75 #endif
76
77 #ifndef PREAMBLE
78 #define PREAMBLE
79 #endif
80
81 #ifndef XCC
82 #define XCC xcc
83 #endif
84
/* Produce 64 aligned output bytes in %f0-%f14 by realigning nine
 * consecutive source double-words (previous tail + new block) with
 * faligndata; the shift amount comes from the earlier alignaddr.
 */
85 #define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8) \
86 faligndata %x0, %x1, %f0; \
87 faligndata %x1, %x2, %f2; \
88 faligndata %x2, %x3, %f4; \
89 faligndata %x3, %x4, %f6; \
90 faligndata %x4, %x5, %f8; \
91 faligndata %x5, %x6, %f10; \
92 faligndata %x6, %x7, %f12; \
93 faligndata %x7, %x8, %f14;
94
/* FREG_MOVE_N: carry the last N double-words of the just-consumed
 * block down into %f0.. so the next iteration can realign against
 * them.
 */
95 #define FREG_MOVE_1(x0) \
96 fsrc2 %x0, %f0;
97 #define FREG_MOVE_2(x0, x1) \
98 fsrc2 %x0, %f0; \
99 fsrc2 %x1, %f2;
100 #define FREG_MOVE_3(x0, x1, x2) \
101 fsrc2 %x0, %f0; \
102 fsrc2 %x1, %f2; \
103 fsrc2 %x2, %f4;
104 #define FREG_MOVE_4(x0, x1, x2, x3) \
105 fsrc2 %x0, %f0; \
106 fsrc2 %x1, %f2; \
107 fsrc2 %x2, %f4; \
108 fsrc2 %x3, %f6;
109 #define FREG_MOVE_5(x0, x1, x2, x3, x4) \
110 fsrc2 %x0, %f0; \
111 fsrc2 %x1, %f2; \
112 fsrc2 %x2, %f4; \
113 fsrc2 %x3, %f6; \
114 fsrc2 %x4, %f8;
115 #define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
116 fsrc2 %x0, %f0; \
117 fsrc2 %x1, %f2; \
118 fsrc2 %x2, %f4; \
119 fsrc2 %x3, %f6; \
120 fsrc2 %x4, %f8; \
121 fsrc2 %x5, %f10;
122 #define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
123 fsrc2 %x0, %f0; \
124 fsrc2 %x1, %f2; \
125 fsrc2 %x2, %f4; \
126 fsrc2 %x3, %f6; \
127 fsrc2 %x4, %f8; \
128 fsrc2 %x5, %f10; \
129 fsrc2 %x6, %f12;
130 #define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
131 fsrc2 %x0, %f0; \
132 fsrc2 %x1, %f2; \
133 fsrc2 %x2, %f4; \
134 fsrc2 %x3, %f6; \
135 fsrc2 %x4, %f8; \
136 fsrc2 %x5, %f10; \
137 fsrc2 %x6, %f12; \
138 fsrc2 %x7, %f14;
/* FREG_LOAD_N: preload the N source double-words that precede the
 * first 64-byte-aligned source block.
 * NOTE(review): the fault label here lacks the _fp suffix even though
 * these are _FP accesses; the user-copy wrappers appear to append
 * "_fp" via token pasting when emitting the exception table - confirm
 * against NG2copy_{from,to}_user.S.
 */
139 #define FREG_LOAD_1(base, x0) \
140 EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1)
141 #define FREG_LOAD_2(base, x0, x1) \
142 EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
143 EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1);
144 #define FREG_LOAD_3(base, x0, x1, x2) \
145 EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
146 EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
147 EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1);
148 #define FREG_LOAD_4(base, x0, x1, x2, x3) \
149 EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
150 EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
151 EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
152 EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1);
153 #define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
154 EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
155 EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
156 EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
157 EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
158 EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1);
159 #define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
160 EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
161 EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
162 EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
163 EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
164 EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1); \
165 EX_LD_FP(LOAD(ldd, base + 0x28, %x5), NG2_retl_o2_plus_g1);
166 #define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \
167 EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
168 EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
169 EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
170 EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
171 EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1); \
172 EX_LD_FP(LOAD(ldd, base + 0x28, %x5), NG2_retl_o2_plus_g1); \
173 EX_LD_FP(LOAD(ldd, base + 0x30, %x6), NG2_retl_o2_plus_g1);
174
175 .register %g2,#scratch
176 .register %g3,#scratch
177
178 .text
179 #ifndef EX_RETVAL
180 #define EX_RETVAL(x) x
/* Fault continuations referenced from the exception-table entries of
 * the copy_{from,to}_user wrappers.  Each stub computes %o2 plus a
 * path-specific adjustment into %o0 (presumably the count of bytes
 * not yet copied at the faulting instruction - confirm against
 * NG2copy_{from,to}_user.S), then restores %asi - or, on the _fp
 * paths, the VIS state first: __restore_fp falls through into
 * __restore_asi.
 */
181 __restore_fp:
182 VISExitHalf
183 __restore_asi:
184 retl
185 wr %g0, ASI_AIUS, %asi          ! delay slot: %asi back to user-space ASI
186 ENTRY(NG2_retl_o2)
187 ba,pt %xcc, __restore_asi
188 mov %o2, %o0                    ! delay slot: residue = %o2
189 ENDPROC(NG2_retl_o2)
190 ENTRY(NG2_retl_o2_plus_1)
191 ba,pt %xcc, __restore_asi
192 add %o2, 1, %o0
193 ENDPROC(NG2_retl_o2_plus_1)
194 ENTRY(NG2_retl_o2_plus_4)
195 ba,pt %xcc, __restore_asi
196 add %o2, 4, %o0
197 ENDPROC(NG2_retl_o2_plus_4)
198 ENTRY(NG2_retl_o2_plus_8)
199 ba,pt %xcc, __restore_asi
200 add %o2, 8, %o0
201 ENDPROC(NG2_retl_o2_plus_8)
202 ENTRY(NG2_retl_o2_plus_o4_plus_1)
203 add %o4, 1, %o4
204 ba,pt %xcc, __restore_asi
205 add %o2, %o4, %o0               ! residue = %o2 + (%o4 + 1)
206 ENDPROC(NG2_retl_o2_plus_o4_plus_1)
207 ENTRY(NG2_retl_o2_plus_o4_plus_8)
208 add %o4, 8, %o4
209 ba,pt %xcc, __restore_asi
210 add %o2, %o4, %o0
211 ENDPROC(NG2_retl_o2_plus_o4_plus_8)
212 ENTRY(NG2_retl_o2_plus_o4_plus_16)
213 add %o4, 16, %o4
214 ba,pt %xcc, __restore_asi
215 add %o2, %o4, %o0
216 ENDPROC(NG2_retl_o2_plus_o4_plus_16)
217 ENTRY(NG2_retl_o2_plus_g1_fp)
218 ba,pt %xcc, __restore_fp        ! VIS state live: restore it on the way out
219 add %o2, %g1, %o0
220 ENDPROC(NG2_retl_o2_plus_g1_fp)
221 ENTRY(NG2_retl_o2_plus_g1_plus_64_fp)
222 add %g1, 64, %g1
223 ba,pt %xcc, __restore_fp
224 add %o2, %g1, %o0
225 ENDPROC(NG2_retl_o2_plus_g1_plus_64_fp)
226 ENTRY(NG2_retl_o2_plus_g1_plus_1)
227 add %g1, 1, %g1
228 ba,pt %xcc, __restore_asi
229 add %o2, %g1, %o0
230 ENDPROC(NG2_retl_o2_plus_g1_plus_1)
231 ENTRY(NG2_retl_o2_and_7_plus_o4)
232 and %o2, 7, %o2                 ! only the sub-dword tail of %o2 remains
233 ba,pt %xcc, __restore_asi
234 add %o2, %o4, %o0
235 ENDPROC(NG2_retl_o2_and_7_plus_o4)
236 ENTRY(NG2_retl_o2_and_7_plus_o4_plus_8)
237 and %o2, 7, %o2
238 add %o4, 8, %o4
239 ba,pt %xcc, __restore_asi
240 add %o2, %o4, %o0
241 ENDPROC(NG2_retl_o2_and_7_plus_o4_plus_8)
242 #endif
243
244 .align 64
245
246 .globl FUNC_NAME
247 .type FUNC_NAME,#function
248 FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
249 srlx %o2, 31, %g2               ! %g2 = len >> 31
250 cmp %g2, 0
251 tne %xcc, 5                     ! sanity trap if len >= 2^31
252 PREAMBLE
253 mov %o0, %o3                    ! save original dst for the return value
254 cmp %o2, 0
255 be,pn %XCC, 85f                 ! len == 0: nothing to copy
256 or %o0, %o1, GLOBAL_SPARE      ! delay slot: collect dst|src alignment bits
257 cmp %o2, 16
258 blu,a,pn %XCC, 80f              ! len < 16: tiny-copy path
259 or GLOBAL_SPARE, %o2, GLOBAL_SPARE ! annulled slot: fold len bits in too
260
261 /* 2 blocks (128 bytes) is the minimum we can do the block
262 * copy with. We need to ensure that we'll iterate at least
263 * once in the block copy loop. At worst we'll need to align
264 * the destination to a 64-byte boundary which can chew up
265 * to (64 - 1) bytes from the length before we perform the
266 * block copy loop.
267 *
268 * However, the cut-off point, performance wise, is around
269 * 4 64-byte blocks.
270 */
271 cmp %o2, (4 * 64)
272 blu,pt %XCC, 75f                ! 16 <= len < 256: medium-copy path
273 andcc GLOBAL_SPARE, 0x7, %g0   ! delay slot: test (dst|src) & 7 for 75f
274
275 /* %o0: dst
276 * %o1: src
277 * %o2: len (known to be >= 128)
278 *
279 * The block copy loops can use %o4, %g2, %g3 as
280 * temporaries while copying the data. %o5 must
281 * be preserved between VISEntryHalf and VISExitHalf
282 */
283
284 LOAD(prefetch, %o1 + 0x000, #one_read)
285 LOAD(prefetch, %o1 + 0x040, #one_read)
286 LOAD(prefetch, %o1 + 0x080, #one_read)
287
288 /* Align destination on 64-byte boundary. */
289 andcc %o0, (64 - 1), %o4
290 be,pt %XCC, 2f
291 sub %o4, 64, %o4
292 sub %g0, %o4, %o4 ! bytes to align dst
293 sub %o2, %o4, %o2
/* Byte-at-a-time loop until dst hits a 64-byte boundary. */
294 1: subcc %o4, 1, %o4
295 EX_LD(LOAD(ldub, %o1, %g1), NG2_retl_o2_plus_o4_plus_1)
296 EX_ST(STORE(stb, %g1, %o0), NG2_retl_o2_plus_o4_plus_1)
297 add %o1, 1, %o1
298 bne,pt %XCC, 1b
299 add %o0, 1, %o0                 ! delay slot
300
301 2:
302 /* Clobbers o5/g1/g2/g3/g7/icc/xcc. We must preserve
303 * o5 from here until we hit VISExitHalf.
304 */
305 VISEntryHalf
306
307 membar #Sync
308 alignaddr %o1, %g0, %g0         ! prime %gsr with src offset for faligndata
309
310 add %o1, (64 - 1), %o4
311 andn %o4, (64 - 1), %o4        ! %o4 = src rounded up to 64-byte boundary
312 andn %o2, (64 - 1), %g1        ! %g1 = bytes handled by the block loop (64n)
313 sub %o2, %g1, %o2              ! %o2 = tail bytes left for the byte loop
314
315 and %o1, (64 - 1), %g2         ! %g2 = src & 63, selects a loop below
316 add %o1, %g1, %o1              ! advance src past the block-copied region
317 sub %o0, %o4, %g3              ! %g3 = dst - aligned src; dst == %o4 + %g3
318 brz,pt %g2, 190f               ! src already 64-byte aligned: simple loop
319 cmp %g2, 32
/* Binary dispatch on %g2 (src & 63) to the loop whose FREG_LOAD_N
 * preloads exactly the source double-words preceding the first
 * aligned block (110: <8, 120: <16, ... 180: >=56).
 */
320 blu,a 5f
321 cmp %g2, 16
322 cmp %g2, 48
323 blu,a 4f
324 cmp %g2, 40
325 cmp %g2, 56
326 blu 170f
327 nop
328 ba,a,pt %xcc, 180f
329 nop
330
331 4: /* 32 <= low bits < 48 */
332 blu 150f
333 nop
334 ba,a,pt %xcc, 160f
335 nop
336 5: /* 0 < low bits < 32 */
337 blu,a 6f
338 cmp %g2, 8
339 cmp %g2, 24
340 blu 130f
341 nop
342 ba,a,pt %xcc, 140f
343 nop
344 6: /* 0 < low bits < 16 */
345 bgeu 120f
346 nop
347 /* fall through for 0 < low bits < 8 */
/* All loops below share the same shape per 64-byte iteration:
 *   init-store the first dst word (claims the cache line),
 *   block-load the next 64 src bytes into %f16-%f30,
 *   FREG_FROB realigns nine dwords into %f0-%f14,
 *   block-store 64 bytes to dst (%o4 + %g3),
 *   FREG_MOVE shifts the unconsumed dwords down for the next pass.
 */
348 110: sub %o4, 64, %g2
349 EX_LD_FP(LOAD_BLK(%g2, %f0), NG2_retl_o2_plus_g1) ! preload prior block
350 1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
351 EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
352 FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
353 EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
354 FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
355 subcc %g1, 64, %g1
356 add %o4, 64, %o4
357 bne,pt %xcc, 1b
358 LOAD(prefetch, %o4 + 64, #one_read) ! delay slot: prefetch next block
359 ba,pt %xcc, 195f
360 nop
361
/* 120-180: same loop with progressively fewer preloaded dwords. */
362 120: sub %o4, 56, %g2
363 FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
364 1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
365 EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
366 FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
367 EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
368 FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
369 subcc %g1, 64, %g1
370 add %o4, 64, %o4
371 bne,pt %xcc, 1b
372 LOAD(prefetch, %o4 + 64, #one_read)
373 ba,pt %xcc, 195f
374 nop
375
376 130: sub %o4, 48, %g2
377 FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
378 1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
379 EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
380 FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
381 EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
382 FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
383 subcc %g1, 64, %g1
384 add %o4, 64, %o4
385 bne,pt %xcc, 1b
386 LOAD(prefetch, %o4 + 64, #one_read)
387 ba,pt %xcc, 195f
388 nop
389
390 140: sub %o4, 40, %g2
391 FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
392 1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
393 EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
394 FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
395 EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
396 FREG_MOVE_5(f22, f24, f26, f28, f30)
397 subcc %g1, 64, %g1
398 add %o4, 64, %o4
399 bne,pt %xcc, 1b
400 LOAD(prefetch, %o4 + 64, #one_read)
401 ba,pt %xcc, 195f
402 nop
403
404 150: sub %o4, 32, %g2
405 FREG_LOAD_4(%g2, f0, f2, f4, f6)
406 1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
407 EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
408 FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
409 EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
410 FREG_MOVE_4(f24, f26, f28, f30)
411 subcc %g1, 64, %g1
412 add %o4, 64, %o4
413 bne,pt %xcc, 1b
414 LOAD(prefetch, %o4 + 64, #one_read)
415 ba,pt %xcc, 195f
416 nop
417
418 160: sub %o4, 24, %g2
419 FREG_LOAD_3(%g2, f0, f2, f4)
420 1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
421 EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
422 FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
423 EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
424 FREG_MOVE_3(f26, f28, f30)
425 subcc %g1, 64, %g1
426 add %o4, 64, %o4
427 bne,pt %xcc, 1b
428 LOAD(prefetch, %o4 + 64, #one_read)
429 ba,pt %xcc, 195f
430 nop
431
432 170: sub %o4, 16, %g2
433 FREG_LOAD_2(%g2, f0, f2)
434 1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
435 EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
436 FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
437 EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
438 FREG_MOVE_2(f28, f30)
439 subcc %g1, 64, %g1
440 add %o4, 64, %o4
441 bne,pt %xcc, 1b
442 LOAD(prefetch, %o4 + 64, #one_read)
443 ba,pt %xcc, 195f
444 nop
445
446 180: sub %o4, 8, %g2
447 FREG_LOAD_1(%g2, f0)
448 1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
449 EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
450 FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
451 EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
452 FREG_MOVE_1(f30)
453 subcc %g1, 64, %g1
454 add %o4, 64, %o4
455 bne,pt %xcc, 1b
456 LOAD(prefetch, %o4 + 64, #one_read)
457 ba,pt %xcc, 195f
458 nop
459
/* Source 64-byte aligned: straight block copy, no realignment. */
460 190:
461 1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
462 subcc %g1, 64, %g1
463 EX_LD_FP(LOAD_BLK(%o4, %f0), NG2_retl_o2_plus_g1_plus_64)
464 EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1_plus_64)
465 add %o4, 64, %o4
466 bne,pt %xcc, 1b
467 LOAD(prefetch, %o4 + 64, #one_read) ! delay slot
468
469 195:
470 add %o4, %g3, %o0               ! recover final dst pointer
471 membar #Sync
472
473 VISExitHalf
474
475 /* %o2 contains any final bytes still needed to be copied
476 * over. If anything is left, we copy it one byte at a time.
477 */
478 brz,pt %o2, 85f
479 sub %o0, %o1, GLOBAL_SPARE     ! delay slot: GLOBAL_SPARE = dst - src
480 ba,a,pt %XCC, 90f
481 nop
482
483 .align 64
484 75: /* 16 < len <= 64 */
/* Reached with the condition codes of "(dst|src) & 7" still live
 * (set in the delay slot of the branch that got here).
 * NOTE(review): given the earlier cmp/blu pair, this path appears to
 * cover 16 <= len < 256 - the label comment may be stale; confirm.
 */
485 bne,pn %XCC, 75f                ! unaligned: go to the second "75:" below
486 sub %o0, %o1, GLOBAL_SPARE     ! delay slot: GLOBAL_SPARE = dst - src
487
/* 8-byte aligned medium copy: 16 bytes per iteration. */
488 72:
489 andn %o2, 0xf, %o4              ! %o4 = len rounded down to 16
490 and %o2, 0xf, %o2               ! %o2 = sub-16 tail
491 1: subcc %o4, 0x10, %o4
492 EX_LD(LOAD(ldx, %o1, %o5), NG2_retl_o2_plus_o4_plus_16)
493 add %o1, 0x08, %o1
494 EX_LD(LOAD(ldx, %o1, %g1), NG2_retl_o2_plus_o4_plus_16)
495 sub %o1, 0x08, %o1
496 EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_o4_plus_16) ! dst = src + (dst-src)
497 add %o1, 0x8, %o1
498 EX_ST(STORE(stx, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_o4_plus_8)
499 bgu,pt %XCC, 1b
500 add %o1, 0x8, %o1               ! delay slot
/* Drain the 8-, 4-, then sub-4-byte remainders. */
501 73: andcc %o2, 0x8, %g0
502 be,pt %XCC, 1f
503 nop
504 sub %o2, 0x8, %o2
505 EX_LD(LOAD(ldx, %o1, %o5), NG2_retl_o2_plus_8)
506 EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_8)
507 add %o1, 0x8, %o1
508 1: andcc %o2, 0x4, %g0
509 be,pt %XCC, 1f
510 nop
511 sub %o2, 0x4, %o2
512 EX_LD(LOAD(lduw, %o1, %o5), NG2_retl_o2_plus_4)
513 EX_ST(STORE(stw, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_4)
514 add %o1, 0x4, %o1
515 1: cmp %o2, 0
516 be,pt %XCC, 85f
517 nop
518 ba,pt %xcc, 90f                 ! 1-3 bytes left: byte loop
519 nop
520
/* Unaligned medium copy: first bring dst up to 8-byte alignment. */
521 75:
522 andcc %o0, 0x7, %g1
523 sub %g1, 0x8, %g1
524 be,pn %icc, 2f
525 sub %g0, %g1, %g1               ! delay slot: %g1 = bytes to align dst
526 sub %o2, %g1, %o2
527
528 1: subcc %g1, 1, %g1
529 EX_LD(LOAD(ldub, %o1, %o5), NG2_retl_o2_plus_g1_plus_1)
530 EX_ST(STORE(stb, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_g1_plus_1)
531 bgu,pt %icc, 1b
532 add %o1, 1, %o1                 ! delay slot
533
534 2: add %o1, GLOBAL_SPARE, %o0   ! %o0 = current dst (src + (dst-src))
535 andcc %o1, 0x7, %g1             ! is src now 8-byte aligned too?
536 bne,pt %icc, 8f                 ! no: shifting-merge loop
537 sll %g1, 3, %g1                 ! delay slot: %g1 = src misalign in bits
538
539 cmp %o2, 16
540 bgeu,pt %icc, 72b               ! both aligned: reuse the 16-byte loop
541 nop
542 ba,a,pt %xcc, 73b
543
/* src misaligned relative to dst: merge successive dwords with
 * sllx/srlx ("shift left by misalign, OR in next dword shifted right").
 */
544 8: mov 64, GLOBAL_SPARE
545 andn %o1, 0x7, %o1              ! round src down to a dword
546 EX_LD(LOAD(ldx, %o1, %g2), NG2_retl_o2)
547 sub GLOBAL_SPARE, %g1, GLOBAL_SPARE ! complementary shift (64 - bits)
548 andn %o2, 0x7, %o4              ! %o4 = dword-copyable byte count
549 sllx %g2, %g1, %g2
550 1: add %o1, 0x8, %o1
551 EX_LD(LOAD(ldx, %o1, %g3), NG2_retl_o2_and_7_plus_o4)
552 subcc %o4, 0x8, %o4
553 srlx %g3, GLOBAL_SPARE, %o5
554 or %o5, %g2, %o5                ! combine the two partial dwords
555 EX_ST(STORE(stx, %o5, %o0), NG2_retl_o2_and_7_plus_o4_plus_8)
556 add %o0, 0x8, %o0
557 bgu,pt %icc, 1b
558 sllx %g3, %g1, %g2              ! delay slot: keep tail for next round
559
560 srl %g1, 3, %g1                 ! bits back to bytes
561 andcc %o2, 0x7, %o2
562 be,pn %icc, 85f
563 add %o1, %g1, %o1               ! delay slot: restore true src position
564 ba,pt %xcc, 90f                 ! finish the last 1-7 bytes
565 sub %o0, %o1, GLOBAL_SPARE     ! delay slot: GLOBAL_SPARE = dst - src
566
567 .align 64
568 80: /* 0 < len <= 16 */
569 andcc GLOBAL_SPARE, 0x3, %g0    ! dst|src|len word-aligned?
570 bne,pn %XCC, 90f                ! no: byte loop
571 sub %o0, %o1, GLOBAL_SPARE     ! delay slot: GLOBAL_SPARE = dst - src
572
573 1:
574 subcc %o2, 4, %o2
575 EX_LD(LOAD(lduw, %o1, %g1), NG2_retl_o2_plus_4)
576 EX_ST(STORE(stw, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_4)
577 bgu,pt %XCC, 1b
578 add %o1, 4, %o1                 ! delay slot
579
580 85: retl
581 mov EX_RETVAL(%o3), %o0         ! delay slot: return original dst
582
583 .align 32
/* Generic byte-at-a-time tail copy; expects GLOBAL_SPARE = dst - src. */
584 90:
585 subcc %o2, 1, %o2
586 EX_LD(LOAD(ldub, %o1, %g1), NG2_retl_o2_plus_1)
587 EX_ST(STORE(stb, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_1)
588 bgu,pt %XCC, 90b
589 add %o1, 1, %o1                 ! delay slot
590 retl
591 mov EX_RETVAL(%o3), %o0         ! delay slot: return original dst
592
593 .size FUNC_NAME, .-FUNC_NAME