]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - arch/sparc/lib/NG4memcpy.S
Merge remote-tracking branches 'spi/topic/devprop', 'spi/topic/fsl', 'spi/topic/fsl...
[mirror_ubuntu-bionic-kernel.git] / arch / sparc / lib / NG4memcpy.S
CommitLineData
ae2c6ca6
DM
1/* NG4memcpy.S: Niagara-4 optimized memcpy.
2 *
3 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
4 */
5
/* Build environment selection.  With __KERNEL__ defined this file includes
 * linux/linkage.h, asm/visasm.h and asm/asi.h and uses %g7 as GLOBAL_SPARE.
 * Otherwise it defines ASI_BLK_INIT_QUAD_LDD_P, FPRS_FEF and its own
 * VISEntryHalf/VISExitHalf macros below and uses %g5 as GLOBAL_SPARE.
 */
6#ifdef __KERNEL__
95707704 7#include <linux/linkage.h>
ae2c6ca6
DM
8#include <asm/visasm.h>
9#include <asm/asi.h>
10#define GLOBAL_SPARE %g7
11#else
12#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
13#define FPRS_FEF 0x04
14
15/* On T4 it is very expensive to access ASRs like %fprs and
16 * %asi, avoiding a read or a write can save ~50 cycles.
17 */
/* FPU_ENTER reads %fprs into %o5 and tests the FPRS_FEF bit.  The branch is
 * annulling ("be,a"), so the wr in its delay slot only executes when the
 * branch is taken, i.e. when the FEF bit was found clear.
 */
18#define FPU_ENTER \
19 rd %fprs, %o5; \
20 andcc %o5, FPRS_FEF, %g0; \
21 be,a,pn %icc, 999f; \
22 wr %g0, FPRS_FEF, %fprs; \
23 999:
24
/* VISExitHalf masks %o5 down to the FPRS_FEF bit and writes the result to
 * %fprs, so it expects %o5 to still hold the value FPU_ENTER read.
 * Under MEMCPY_DEBUG, VISEntryHalf additionally clears %g1, %g2, %g3 and %g5
 * and executes "subcc %g0, %g0, %g0".
 */
25#ifdef MEMCPY_DEBUG
26#define VISEntryHalf FPU_ENTER; \
27 clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
28#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
29#else
30#define VISEntryHalf FPU_ENTER
31#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
32#endif
33
34#define GLOBAL_SPARE %g5
35#endif
36
/* Everything from here to the .text directive is a default that a file
 * including this one may override by defining the macro beforehand.
 */

/* STORE_ASI is the ASI operand used by STORE_INIT.  It defaults to
 * ASI_BLK_INIT_QUAD_LDD_P, or to 0x80 when SIMULATE_NIAGARA_ON_NON_NIAGARA
 * is defined.
 */
37#ifndef STORE_ASI
38#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
39#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
40#else
41#define STORE_ASI 0x80 /* ASI_P */
42#endif
43#endif
44
f4da3628
DM
/* NON_USER_COPY is defined when neither EX_LD nor EX_ST was supplied.  Later
 * in this file it selects VISEntryHalfFast/VISExitHalfFast instead of
 * VISEntryHalf/VISExitHalf.
 */
45#if !defined(EX_LD) && !defined(EX_ST)
46#define NON_USER_COPY
47#endif
48
/* Default EX_LD/EX_LD_FP/EX_ST/EX_ST_FP: emit the wrapped instruction x and
 * discard the second argument y.  Every use in this file passes an NG4_retl_*
 * label as y.
 */
ae2c6ca6 49#ifndef EX_LD
95707704 50#define EX_LD(x,y) x
ae2c6ca6 51#endif
a7c5724b 52#ifndef EX_LD_FP
95707704 53#define EX_LD_FP(x,y) x
a7c5724b 54#endif
ae2c6ca6
DM
55
56#ifndef EX_ST
95707704 57#define EX_ST(x,y) x
ae2c6ca6 58#endif
a7c5724b 59#ifndef EX_ST_FP
95707704 60#define EX_ST_FP(x,y) x
a7c5724b 61#endif
ae2c6ca6 62
ae2c6ca6
DM
63
/* LOAD(type,addr,dest) expands to "type [addr], dest". */
64#ifndef LOAD
65#define LOAD(type,addr,dest) type [addr], dest
66#endif
67
/* STORE(type,src,addr) expands to "type src, [addr]".  Under MEMCPY_DEBUG it
 * becomes the alternate-space form "type##a src, [addr] %asi".
 */
68#ifndef STORE
69#ifndef MEMCPY_DEBUG
70#define STORE(type,src,addr) type src, [addr]
71#else
72#define STORE(type,src,addr) type##a src, [addr] %asi
73#endif
74#endif
75
/* STORE_INIT(src,addr) is an stxa through STORE_ASI. */
76#ifndef STORE_INIT
77#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI
78#endif
79
/* FUNC_NAME is the symbol the routine is emitted under; PREAMBLE expands to
 * nothing by default; XCC is pasted after '%' in the entry-point trap.
 */
80#ifndef FUNC_NAME
81#define FUNC_NAME NG4memcpy
82#endif
83#ifndef PREAMBLE
84#define PREAMBLE
85#endif
86
87#ifndef XCC
88#define XCC xcc
89#endif
90
/* Declare %g2 and %g3 as scratch registers for the assembler. */
91 .register %g2,#scratch
92 .register %g3,#scratch
93
94 .text
95707704
DM
/* Default EX_RETVAL and the NG4_retl_* return stubs.  All of this is only
 * assembled when the includer did not define EX_RETVAL itself.
 *
 * Each stub's label is passed as the second argument of an EX_LD/EX_ST
 * (or *_FP) use in the copy routine below.
 * NOTE(review): presumably these are the fault fixup targets and %o0 is the
 * number of bytes left uncopied -- confirm against the includer's EX_LD/EX_ST
 * definitions, which are not visible here.
 *
 * Every stub sets %o0 to the sum spelled out in its name, in the delay slot
 * of a branch to __restore_asi (or to __restore_asi_fp for the *_fp stubs).
 * Stubs with a "_plus_<constant>" suffix first add the constant to the
 * counter register (%o5, %g1 or %o4) in place.
 */
95#ifndef EX_RETVAL
96#define EX_RETVAL(x) x
/* __restore_asi_fp runs VISExitHalf and falls through into __restore_asi,
 * which returns with "wr %g0, ASI_AIUS, %asi" in the retl delay slot.
 */
97__restore_asi_fp:
98 VISExitHalf
99__restore_asi:
100 retl
101 wr %g0, ASI_AIUS, %asi
102
/* Stubs returning through __restore_asi. */
103ENTRY(NG4_retl_o2)
104 ba,pt %xcc, __restore_asi
105 mov %o2, %o0
106ENDPROC(NG4_retl_o2)
107ENTRY(NG4_retl_o2_plus_1)
108 ba,pt %xcc, __restore_asi
109 add %o2, 1, %o0
110ENDPROC(NG4_retl_o2_plus_1)
111ENTRY(NG4_retl_o2_plus_4)
112 ba,pt %xcc, __restore_asi
113 add %o2, 4, %o0
114ENDPROC(NG4_retl_o2_plus_4)
115ENTRY(NG4_retl_o2_plus_o5)
116 ba,pt %xcc, __restore_asi
117 add %o2, %o5, %o0
118ENDPROC(NG4_retl_o2_plus_o5)
119ENTRY(NG4_retl_o2_plus_o5_plus_4)
120 add %o5, 4, %o5
121 ba,pt %xcc, __restore_asi
122 add %o2, %o5, %o0
123ENDPROC(NG4_retl_o2_plus_o5_plus_4)
124ENTRY(NG4_retl_o2_plus_o5_plus_8)
125 add %o5, 8, %o5
126 ba,pt %xcc, __restore_asi
127 add %o2, %o5, %o0
128ENDPROC(NG4_retl_o2_plus_o5_plus_8)
129ENTRY(NG4_retl_o2_plus_o5_plus_16)
130 add %o5, 16, %o5
131 ba,pt %xcc, __restore_asi
132 add %o2, %o5, %o0
133ENDPROC(NG4_retl_o2_plus_o5_plus_16)
134ENTRY(NG4_retl_o2_plus_o5_plus_24)
135 add %o5, 24, %o5
136 ba,pt %xcc, __restore_asi
137 add %o2, %o5, %o0
138ENDPROC(NG4_retl_o2_plus_o5_plus_24)
139ENTRY(NG4_retl_o2_plus_o5_plus_32)
140 add %o5, 32, %o5
141 ba,pt %xcc, __restore_asi
142 add %o2, %o5, %o0
143ENDPROC(NG4_retl_o2_plus_o5_plus_32)
144ENTRY(NG4_retl_o2_plus_g1)
145 ba,pt %xcc, __restore_asi
146 add %o2, %g1, %o0
147ENDPROC(NG4_retl_o2_plus_g1)
148ENTRY(NG4_retl_o2_plus_g1_plus_1)
149 add %g1, 1, %g1
150 ba,pt %xcc, __restore_asi
151 add %o2, %g1, %o0
152ENDPROC(NG4_retl_o2_plus_g1_plus_1)
153ENTRY(NG4_retl_o2_plus_g1_plus_8)
154 add %g1, 8, %g1
155 ba,pt %xcc, __restore_asi
156 add %o2, %g1, %o0
157ENDPROC(NG4_retl_o2_plus_g1_plus_8)
158ENTRY(NG4_retl_o2_plus_o4)
159 ba,pt %xcc, __restore_asi
160 add %o2, %o4, %o0
161ENDPROC(NG4_retl_o2_plus_o4)
162ENTRY(NG4_retl_o2_plus_o4_plus_8)
163 add %o4, 8, %o4
164 ba,pt %xcc, __restore_asi
165 add %o2, %o4, %o0
166ENDPROC(NG4_retl_o2_plus_o4_plus_8)
167ENTRY(NG4_retl_o2_plus_o4_plus_16)
168 add %o4, 16, %o4
169 ba,pt %xcc, __restore_asi
170 add %o2, %o4, %o0
171ENDPROC(NG4_retl_o2_plus_o4_plus_16)
172ENTRY(NG4_retl_o2_plus_o4_plus_24)
173 add %o4, 24, %o4
174 ba,pt %xcc, __restore_asi
175 add %o2, %o4, %o0
176ENDPROC(NG4_retl_o2_plus_o4_plus_24)
177ENTRY(NG4_retl_o2_plus_o4_plus_32)
178 add %o4, 32, %o4
179 ba,pt %xcc, __restore_asi
180 add %o2, %o4, %o0
181ENDPROC(NG4_retl_o2_plus_o4_plus_32)
182ENTRY(NG4_retl_o2_plus_o4_plus_40)
183 add %o4, 40, %o4
184 ba,pt %xcc, __restore_asi
185 add %o2, %o4, %o0
186ENDPROC(NG4_retl_o2_plus_o4_plus_40)
187ENTRY(NG4_retl_o2_plus_o4_plus_48)
188 add %o4, 48, %o4
189 ba,pt %xcc, __restore_asi
190 add %o2, %o4, %o0
191ENDPROC(NG4_retl_o2_plus_o4_plus_48)
192ENTRY(NG4_retl_o2_plus_o4_plus_56)
193 add %o4, 56, %o4
194 ba,pt %xcc, __restore_asi
195 add %o2, %o4, %o0
196ENDPROC(NG4_retl_o2_plus_o4_plus_56)
197ENTRY(NG4_retl_o2_plus_o4_plus_64)
198 add %o4, 64, %o4
199 ba,pt %xcc, __restore_asi
200 add %o2, %o4, %o0
201ENDPROC(NG4_retl_o2_plus_o4_plus_64)
/* Same sums as the %o4 stubs above, but returning through __restore_asi_fp
 * so that VISExitHalf runs first.
 */
202ENTRY(NG4_retl_o2_plus_o4_fp)
203 ba,pt %xcc, __restore_asi_fp
204 add %o2, %o4, %o0
205ENDPROC(NG4_retl_o2_plus_o4_fp)
206ENTRY(NG4_retl_o2_plus_o4_plus_8_fp)
207 add %o4, 8, %o4
208 ba,pt %xcc, __restore_asi_fp
209 add %o2, %o4, %o0
210ENDPROC(NG4_retl_o2_plus_o4_plus_8_fp)
211ENTRY(NG4_retl_o2_plus_o4_plus_16_fp)
212 add %o4, 16, %o4
213 ba,pt %xcc, __restore_asi_fp
214 add %o2, %o4, %o0
215ENDPROC(NG4_retl_o2_plus_o4_plus_16_fp)
216ENTRY(NG4_retl_o2_plus_o4_plus_24_fp)
217 add %o4, 24, %o4
218 ba,pt %xcc, __restore_asi_fp
219 add %o2, %o4, %o0
220ENDPROC(NG4_retl_o2_plus_o4_plus_24_fp)
221ENTRY(NG4_retl_o2_plus_o4_plus_32_fp)
222 add %o4, 32, %o4
223 ba,pt %xcc, __restore_asi_fp
224 add %o2, %o4, %o0
225ENDPROC(NG4_retl_o2_plus_o4_plus_32_fp)
226ENTRY(NG4_retl_o2_plus_o4_plus_40_fp)
227 add %o4, 40, %o4
228 ba,pt %xcc, __restore_asi_fp
229 add %o2, %o4, %o0
230ENDPROC(NG4_retl_o2_plus_o4_plus_40_fp)
231ENTRY(NG4_retl_o2_plus_o4_plus_48_fp)
232 add %o4, 48, %o4
233 ba,pt %xcc, __restore_asi_fp
234 add %o2, %o4, %o0
235ENDPROC(NG4_retl_o2_plus_o4_plus_48_fp)
236ENTRY(NG4_retl_o2_plus_o4_plus_56_fp)
237 add %o4, 56, %o4
238 ba,pt %xcc, __restore_asi_fp
239 add %o2, %o4, %o0
240ENDPROC(NG4_retl_o2_plus_o4_plus_56_fp)
241ENTRY(NG4_retl_o2_plus_o4_plus_64_fp)
242 add %o4, 64, %o4
243 ba,pt %xcc, __restore_asi_fp
244 add %o2, %o4, %o0
245ENDPROC(NG4_retl_o2_plus_o4_plus_64_fp)
246#endif
ae2c6ca6
DM
/* Entry point and length dispatch.
 *
 * In:  %o0 = dst, %o1 = src, %o2 = len
 * Out: .Lexit returns EX_RETVAL(%o3), where %o3 is the dst saved here.
 *
 * Dispatch on len: 0 -> .Lexit, 1..3 -> .Ltiny, 4..19 -> .Lsmall,
 * 20..127 -> .Lmedium, otherwise fall through into .Llarge.
 */
247 .align 64
248
249 .globl FUNC_NAME
250 .type FUNC_NAME,#function
251FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
/* The MEMCPY_DEBUG form of STORE goes through %asi, so set it to 0x80 first. */
252#ifdef MEMCPY_DEBUG
253 wr %g0, 0x80, %asi
254#endif
/* Trap (software trap 5) when len has any bit set at or above bit 31. */
255 srlx %o2, 31, %g2
256 cmp %g2, 0
257 tne %XCC, 5
258 PREAMBLE
/* Keep the original dst in %o3; %o0 is advanced while copying. */
259 mov %o0, %o3
/* Each branch below carries the next comparison in its delay slot. */
260 brz,pn %o2, .Lexit
261 cmp %o2, 3
262 ble,pn %icc, .Ltiny
263 cmp %o2, 19
264 ble,pn %icc, .Lsmall
/* Delay slot, executed either way: %g2 = dst | src.  .Lsmall and .Lmedium
 * test its low bits for alignment.
 */
265 or %o0, %o1, %g2
266 cmp %o2, 128
267 bl,pn %icc, .Lmedium
268 nop
269
/* Large copy (len >= 0x80): align dst to 8 bytes bytewise, issue source
 * prefetches, then either branch to .Llarge_src_unaligned or align dst to
 * 64 bytes with 8-byte moves and fall through into .Llarge_aligned.
 */
270.Llarge:/* len >= 0x80 */
271 /* First get dest 8 byte aligned. */
/* %g1 = (-dst) & 7 = bytes needed to reach 8-byte alignment.  The delay-slot
 * sub takes them off len whether or not the branch is taken.
 */
272 sub %g0, %o0, %g1
273 and %g1, 0x7, %g1
274 brz,pt %g1, 51f
275 sub %o2, %g1, %o2
42a4172b 276
95707704
DM
277
/* Byte loop: %g1 counts down to zero; the store sits in the branch delay
 * slot and addresses %o0 - 1 because %o0 has already been advanced.
 */
2781: EX_LD(LOAD(ldub, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1)
ae2c6ca6
DM
279 add %o1, 1, %o1
280 subcc %g1, 1, %g1
281 add %o0, 1, %o0
282 bne,pt %icc, 1b
95707704 283 EX_ST(STORE(stb, %g2, %o0 - 0x01), NG4_retl_o2_plus_g1_plus_1)
ae2c6ca6
DM
284
/* Prefetch the source at 0x40-byte steps from +0x40 through +0x200. */
28551: LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
286 LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
287 LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
288 LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
289 LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
290 LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
291 LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
292 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
293
294 /* Check if we can use the straight fully aligned
295 * loop, or we require the alignaddr/faligndata variant.
296 */
/* The delay slot computes %g1 = -dst for the 64-byte alignment step below. */
297 andcc %o1, 0x7, %o5
298 bne,pn %icc, .Llarge_src_unaligned
299 sub %g0, %o0, %g1
300
301 /* Legitimize the use of initializing stores by getting dest
302 * to be 64-byte aligned.
303 */
304 and %g1, 0x3f, %g1
305 brz,pt %g1, .Llarge_aligned
306 sub %o2, %g1, %o2
42a4172b 307
/* 8-byte loop: same shape as the byte loop above, with %g1 stepping by 8. */
95707704 3081: EX_LD(LOAD(ldx, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1)
ae2c6ca6
DM
309 add %o1, 8, %o1
310 subcc %g1, 8, %g1
311 add %o0, 8, %o0
312 bne,pt %icc, 1b
95707704 313 EX_ST(STORE(stx, %g2, %o0 - 0x08), NG4_retl_o2_plus_g1_plus_8)
ae2c6ca6
DM
314
/* 64-bytes-per-iteration loop for 8-byte-aligned src and dst, followed by
 * the tail dispatch and the common exit.
 *
 * %o4 = len rounded down to a multiple of 0x40 (bytes this loop copies),
 * %o2 = the remainder.  Loads are interleaved with STORE_INIT stores; %o0 is
 * advanced by 8 after every store.  Each exception label's constant equals
 * the bytes of the current iteration not yet stored at that instruction:
 * the two loads ahead of the subcc use plain _plus_o4 because %o4 still
 * includes this iteration, and the later ones use _plus_64 down to _plus_8.
 */
315.Llarge_aligned:
316 /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
317 andn %o2, 0x3f, %o4
318 sub %o2, %o4, %o2
319
95707704 3201: EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o4)
ae2c6ca6 321 add %o1, 0x40, %o1
95707704 322 EX_LD(LOAD(ldx, %o1 - 0x38, %g2), NG4_retl_o2_plus_o4)
ae2c6ca6 323 subcc %o4, 0x40, %o4
95707704
DM
324 EX_LD(LOAD(ldx, %o1 - 0x30, %g3), NG4_retl_o2_plus_o4_plus_64)
325 EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE), NG4_retl_o2_plus_o4_plus_64)
326 EX_LD(LOAD(ldx, %o1 - 0x20, %o5), NG4_retl_o2_plus_o4_plus_64)
327 EX_ST(STORE_INIT(%g1, %o0), NG4_retl_o2_plus_o4_plus_64)
ae2c6ca6 328 add %o0, 0x08, %o0
95707704 329 EX_ST(STORE_INIT(%g2, %o0), NG4_retl_o2_plus_o4_plus_56)
ae2c6ca6 330 add %o0, 0x08, %o0
95707704
DM
331 EX_LD(LOAD(ldx, %o1 - 0x18, %g2), NG4_retl_o2_plus_o4_plus_48)
332 EX_ST(STORE_INIT(%g3, %o0), NG4_retl_o2_plus_o4_plus_48)
ae2c6ca6 333 add %o0, 0x08, %o0
95707704
DM
334 EX_LD(LOAD(ldx, %o1 - 0x10, %g3), NG4_retl_o2_plus_o4_plus_40)
335 EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), NG4_retl_o2_plus_o4_plus_40)
ae2c6ca6 336 add %o0, 0x08, %o0
95707704
DM
337 EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE), NG4_retl_o2_plus_o4_plus_32)
338 EX_ST(STORE_INIT(%o5, %o0), NG4_retl_o2_plus_o4_plus_32)
ae2c6ca6 339 add %o0, 0x08, %o0
95707704 340 EX_ST(STORE_INIT(%g2, %o0), NG4_retl_o2_plus_o4_plus_24)
ae2c6ca6 341 add %o0, 0x08, %o0
95707704 342 EX_ST(STORE_INIT(%g3, %o0), NG4_retl_o2_plus_o4_plus_16)
ae2c6ca6 343 add %o0, 0x08, %o0
95707704 344 EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), NG4_retl_o2_plus_o4_plus_8)
ae2c6ca6
DM
345 add %o0, 0x08, %o0
/* Loop while %o4 != 0 (flags from the subcc above); the delay slot issues
 * the next source prefetch.
 */
346 bne,pt %icc, 1b
347 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
348
349 membar #StoreLoad | #StoreStore
350
/* Tail: nothing left -> .Lexit; at most 19 bytes -> .Lsmall_unaligned;
 * otherwise .Lmedium_noprefetch.  The final ba is annulled, so it has no
 * delay-slot instruction.
 */
351 brz,pn %o2, .Lexit
352 cmp %o2, 19
353 ble,pn %icc, .Lsmall_unaligned
354 nop
355 ba,a,pt %icc, .Lmedium_noprefetch
356
/* Common exit: return the dst pointer saved in %o3 at entry. */
357.Lexit: retl
358 mov EX_RETVAL(%o3), %o0
359
/* Large copy with dst 8-byte aligned but src not: 64 bytes per iteration
 * through the floating-point registers using alignaddr/faligndata.
 *
 * NON_USER_COPY builds enter with VISEntryHalfFast and pass it
 * .Lmedium_vis_entry_fail; other builds use VISEntryHalf.
 * NOTE(review): VISEntryHalfFast is defined outside this file -- presumably
 * it branches to the given label when the FPU cannot be used; confirm.
 */
360.Llarge_src_unaligned:
f4da3628
DM
361#ifdef NON_USER_COPY
362 VISEntryHalfFast(.Lmedium_vis_entry_fail)
363#else
364 VISEntryHalf
365#endif
ae2c6ca6
DM
/* %o4 = bytes for the 64-byte loop, %o2 = remainder. */
366 andn %o2, 0x3f, %o4
367 sub %o2, %o4, %o2
ae2c6ca6
DM
/* The loop walks the source through %g1 (set by alignaddr from %o1), so %o1
 * is advanced past the whole loop region up front.
 * NOTE(review): alignaddr is expected to leave the 8-byte-aligned address in
 * %g1 and set the byte offset faligndata uses -- see the VIS documentation.
 */
368 alignaddr %o1, %g0, %g1
369 add %o1, %o4, %o1
95707704
DM
/* %f0 is loaded once before the loop; each iteration reloads it from
 * %g1 + 0x40, so it carries over as the first input of the next iteration.
 */
370 EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0), NG4_retl_o2_plus_o4)
3711: EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2), NG4_retl_o2_plus_o4)
ae2c6ca6 372 subcc %o4, 0x40, %o4
95707704
DM
373 EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4), NG4_retl_o2_plus_o4_plus_64)
374 EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6), NG4_retl_o2_plus_o4_plus_64)
375 EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8), NG4_retl_o2_plus_o4_plus_64)
376 EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10), NG4_retl_o2_plus_o4_plus_64)
377 EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12), NG4_retl_o2_plus_o4_plus_64)
378 EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14), NG4_retl_o2_plus_o4_plus_64)
ae2c6ca6 379 faligndata %f0, %f2, %f16
95707704 380 EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0), NG4_retl_o2_plus_o4_plus_64)
ae2c6ca6
DM
381 faligndata %f2, %f4, %f18
382 add %g1, 0x40, %g1
383 faligndata %f4, %f6, %f20
384 faligndata %f6, %f8, %f22
385 faligndata %f8, %f10, %f24
386 faligndata %f10, %f12, %f26
387 faligndata %f12, %f14, %f28
388 faligndata %f14, %f0, %f30
95707704
DM
/* Store the eight realigned doublewords; each label's constant is the part
 * of this 64-byte iteration not yet stored.
 */
389 EX_ST_FP(STORE(std, %f16, %o0 + 0x00), NG4_retl_o2_plus_o4_plus_64)
390 EX_ST_FP(STORE(std, %f18, %o0 + 0x08), NG4_retl_o2_plus_o4_plus_56)
391 EX_ST_FP(STORE(std, %f20, %o0 + 0x10), NG4_retl_o2_plus_o4_plus_48)
392 EX_ST_FP(STORE(std, %f22, %o0 + 0x18), NG4_retl_o2_plus_o4_plus_40)
393 EX_ST_FP(STORE(std, %f24, %o0 + 0x20), NG4_retl_o2_plus_o4_plus_32)
394 EX_ST_FP(STORE(std, %f26, %o0 + 0x28), NG4_retl_o2_plus_o4_plus_24)
395 EX_ST_FP(STORE(std, %f28, %o0 + 0x30), NG4_retl_o2_plus_o4_plus_16)
396 EX_ST_FP(STORE(std, %f30, %o0 + 0x38), NG4_retl_o2_plus_o4_plus_8)
ae2c6ca6
DM
397 add %o0, 0x40, %o0
398 bne,pt %icc, 1b
399 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
44922150
DM
/* Leave VIS mode with the exit macro that matches the entry macro above. */
400#ifdef NON_USER_COPY
401 VISExitHalfFast
402#else
ae2c6ca6 403 VISExitHalf
44922150 404#endif
ae2c6ca6
DM
/* Tail: nothing left -> .Lexit; at most 19 bytes -> .Lsmall_unaligned;
 * otherwise .Lmedium_unaligned (annulled branch, no delay slot).
 */
405 brz,pn %o2, .Lexit
406 cmp %o2, 19
407 ble,pn %icc, .Lsmall_unaligned
408 nop
409 ba,a,pt %icc, .Lmedium_unaligned
410
f4da3628
DM
#ifdef NON_USER_COPY
/* Fallback label handed to VISEntryHalfFast() in .Llarge_src_unaligned.
 * The .Llarge alignment loop uses %g2 as a data temporary, so dst | src
 * has to be recomputed before the alignment test in .Lmedium.
 */
.Lmedium_vis_entry_fail:
	or		%o0, %o1, %g2
#endif

/* Medium copy.
 *
 * In:  %o0 = dst, %o1 = src, %o2 = bytes left, %g2 = dst | src
 *      (.Lmedium_noprefetch is entered with src and dst 8-byte aligned
 *      and does not look at %g2).
 *
 * Phases:
 *   1) 32 bytes per iteration, %o5 = bytes left for this phase
 *   2)  8 bytes per iteration, %o5 = bytes left for this phase
 *   3) one optional 4-byte move, then .Ltiny for the last 1..3 bytes
 *
 * The second argument of every EX_LD/EX_ST names the stub that yields the
 * residual count at that instruction: %o2 (bytes beyond the current phase)
 * plus %o5 plus whatever part of the current iteration is still unstored.
 */
.Lmedium:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc		%g2, 0x7, %g0
	bne,pn		%icc, .Lmedium_unaligned
	 nop

.Lmedium_noprefetch:
	andncc		%o2, 0x20 - 1, %o5	/* %o5 = len & ~31 */
	be,pn		%icc, 2f
	 sub		%o2, %o5, %o2		/* delay slot: %o2 = len & 31 */

	/* Phase 1.  The loads run before %o5 is decremented, so plain
	 * _plus_o5 already accounts for the whole current iteration.
	 */
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), NG4_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE), NG4_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), NG4_retl_o2_plus_o5)
	add		%o1, 0x20, %o1
	subcc		%o5, 0x20, %o5
	/* %o5 no longer includes this iteration: a store at offset N of the
	 * 0x20-byte group leaves 0x20 - N bytes of the group unstored.  The
	 * store at +0x10 therefore reports 16; it previously used the
	 * _plus_24 stub and over-reported the residual by 8 bytes.
	 */
	EX_ST(STORE(stx, %g1, %o0 + 0x00), NG4_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), NG4_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10), NG4_retl_o2_plus_o5_plus_16)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), NG4_retl_o2_plus_o5_plus_8)
	bne,pt		%icc, 1b
	 add		%o0, 0x20, %o0

	/* Phase 2: %o5 = len & 0x18, i.e. 0, 8, 16 or 24 bytes. */
2:	andcc		%o2, 0x18, %o5
	be,pt		%icc, 3f
	 sub		%o2, %o5, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5)
	add		%o1, 0x08, %o1
	add		%o0, 0x08, %o0
	subcc		%o5, 0x08, %o5
	bne,pt		%icc, 1b
	 /* delay slot: %o0 and %o5 are already updated, hence -8 / _plus_8 */
	 EX_ST(STORE(stx, %g1, %o0 - 0x08), NG4_retl_o2_plus_o5_plus_8)

	/* Phase 3: 0..7 bytes left in %o2. */
3:	brz,pt		%o2, .Lexit
	 cmp		%o2, 0x04
	bl,pn		%icc, .Ltiny
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1), NG4_retl_o2)
	add		%o1, 0x04, %o1
	add		%o0, 0x04, %o0
	subcc		%o2, 0x04, %o2
	bne,pn		%icc, .Ltiny
	 /* delay slot: %o2 is already reduced by 4, hence _plus_4 */
	 EX_ST(STORE(stw, %g1, %o0 - 0x04), NG4_retl_o2_plus_4)
	ba,a,pt		%icc, .Lexit
/* Medium copy where src and dst are not both 8-byte aligned.  dst is first
 * aligned bytewise; if src then turns out to be 8-byte aligned too, control
 * moves to .Lmedium_noprefetch.  Otherwise aligned source doublewords are
 * loaded and merged with shifts so that every store is an aligned stx.
 */
457.Lmedium_unaligned:
458 /* First get dest 8 byte aligned. */
459 sub %g0, %o0, %g1
460 and %g1, 0x7, %g1
461 brz,pt %g1, 2f
462 sub %o2, %g1, %o2
42a4172b 463
95707704 4641: EX_LD(LOAD(ldub, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1)
ae2c6ca6
DM
465 add %o1, 1, %o1
466 subcc %g1, 1, %g1
467 add %o0, 1, %o0
468 bne,pt %icc, 1b
95707704 469 EX_ST(STORE(stb, %g2, %o0 - 0x01), NG4_retl_o2_plus_g1_plus_1)
ae2c6ca6
DM
4702:
/* %g1 = src & 7.  The delay-slot sll turns it into a bit count
 * (%g1 = 8 * misalignment), and %g2 = 64 - %g1 is the complementary shift.
 */
471 and %o1, 0x7, %g1
472 brz,pn %g1, .Lmedium_noprefetch
473 sll %g1, 3, %g1
474 mov 64, %g2
475 sub %g2, %g1, %g2
/* Round src down to 8 bytes and prime %o4 with the first doubleword shifted
 * left by the misalignment.
 */
476 andn %o1, 0x7, %o1
95707704 477 EX_LD(LOAD(ldx, %o1 + 0x00, %o4), NG4_retl_o2)
ae2c6ca6
DM
478 sllx %o4, %g1, %o4
/* %o5 = bytes for the 8-byte loop, %o2 = remainder (0..7). */
479 andn %o2, 0x08 - 1, %o5
480 sub %o2, %o5, %o2
/* Each iteration loads the next aligned doubleword into %g3, stores
 * (%g3 >> %g2) | %o4, and in the delay slot sets %o4 = %g3 << %g1 for the
 * next round.
 */
95707704 4811: EX_LD(LOAD(ldx, %o1 + 0x08, %g3), NG4_retl_o2_plus_o5)
ae2c6ca6
DM
482 add %o1, 0x08, %o1
483 subcc %o5, 0x08, %o5
484 srlx %g3, %g2, GLOBAL_SPARE
485 or GLOBAL_SPARE, %o4, GLOBAL_SPARE
95707704 486 EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00), NG4_retl_o2_plus_o5_plus_8)
ae2c6ca6
DM
487 add %o0, 0x08, %o0
488 bne,pt %icc, 1b
489 sllx %g3, %g1, %o4
/* Convert %g1 back from bits to bytes and add it to the rounded-down %o1 to
 * restore the true (unaligned) source pointer for the tail.
 */
490 srl %g1, 3, %g1
491 add %o1, %g1, %o1
492 brz,pn %o2, .Lexit
493 nop
/* NOTE(review): this ba is not annulled and no delay-slot instruction is
 * written after it, so the next instruction assembled (the first load at
 * .Ltiny) fills its delay slot.  That looks harmless (one extra ldub of
 * [%o1] into %g1, which .Lsmall_unaligned reloads) -- confirm it is intended.
 */
494 ba,pt %icc, .Lsmall_unaligned
495
/* Unrolled copy of up to three bytes; %o2 is the byte count on entry.
 * Each store sits in the delay slot of a non-annulled branch, so it executes
 * whether or not the branch to .Lexit is taken.  %o0 and %o1 are not
 * advanced; the offsets 0x00, 0x01 and 0x02 select the byte.
 */
496.Ltiny:
95707704 497 EX_LD(LOAD(ldub, %o1 + 0x00, %g1), NG4_retl_o2)
ae2c6ca6
DM
498 subcc %o2, 1, %o2
499 be,pn %icc, .Lexit
95707704
DM
500 EX_ST(STORE(stb, %g1, %o0 + 0x00), NG4_retl_o2_plus_1)
501 EX_LD(LOAD(ldub, %o1 + 0x01, %g1), NG4_retl_o2)
ae2c6ca6
DM
502 subcc %o2, 1, %o2
503 be,pn %icc, .Lexit
95707704
DM
504 EX_ST(STORE(stb, %g1, %o0 + 0x01), NG4_retl_o2_plus_1)
/* Third byte: %o2 is not decremented again, so load and store both use
 * NG4_retl_o2.
 */
505 EX_LD(LOAD(ldub, %o1 + 0x02, %g1), NG4_retl_o2)
ae2c6ca6 506 ba,pt %icc, .Lexit
95707704 507 EX_ST(STORE(stb, %g1, %o0 + 0x02), NG4_retl_o2)
ae2c6ca6
DM
508
/* Small copy (4..19 bytes from the entry dispatch).  %g2 = dst | src was
 * set at entry; if either pointer is not 4-byte aligned, go bytewise via
 * .Lsmall_unaligned.  Otherwise copy %o5 = len & ~3 bytes as 32-bit words
 * and hand the remainder to .Ltiny.
 */
509.Lsmall:
510 andcc %g2, 0x3, %g0
511 bne,pn %icc, .Lsmall_unaligned
/* Delay slot, executed on both paths; .Lsmall_unaligned does not read %o5. */
512 andn %o2, 0x4 - 1, %o5
513 sub %o2, %o5, %o2
5141:
95707704 515 EX_LD(LOAD(lduw, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5)
ae2c6ca6
DM
516 add %o1, 0x04, %o1
517 subcc %o5, 0x04, %o5
518 add %o0, 0x04, %o0
519 bne,pt %icc, 1b
/* Delay slot: %o0 and %o5 are already updated, hence -4 and _plus_4. */
95707704 520 EX_ST(STORE(stw, %g1, %o0 - 0x04), NG4_retl_o2_plus_o5_plus_4)
ae2c6ca6
DM
/* Remainder of 0 -> .Lexit, otherwise .Ltiny. */
521 brz,pt %o2, .Lexit
522 nop
523 ba,a,pt %icc, .Ltiny
524
/* Bytewise copy of %o2 bytes.  The loop decrements before testing, so every
 * branch into this label in the file guarantees %o2 != 0.  The store is in
 * the branch delay slot, after %o0 and %o2 have been updated, hence the -1
 * offset and the _plus_1 label.
 */
525.Lsmall_unaligned:
95707704 5261: EX_LD(LOAD(ldub, %o1 + 0x00, %g1), NG4_retl_o2)
ae2c6ca6
DM
527 add %o1, 1, %o1
528 add %o0, 1, %o0
529 subcc %o2, 1, %o2
530 bne,pt %icc, 1b
95707704 531 EX_ST(STORE(stb, %g1, %o0 - 0x01), NG4_retl_o2_plus_1)
/* Annulled branch to the common exit; the nop after it is never executed as
 * a delay slot.
 */
ae2c6ca6 532 ba,a,pt %icc, .Lexit
0ae2d26f 533 nop
/* Record the symbol size for FUNC_NAME. */
ae2c6ca6 534 .size FUNC_NAME, .-FUNC_NAME