]>
Commit | Line | Data |
---|---|---|
ae2c6ca6 DM |
1 | /* NG4memcpy.S: Niagara-4 optimized memcpy. |
2 | * | |
3 | * Copyright (C) 2012 David S. Miller (davem@davemloft.net) | |
4 | */ | |
5 | ||
6 | #ifdef __KERNEL__ | |
95707704 | 7 | #include <linux/linkage.h> |
ae2c6ca6 DM |
8 | #include <asm/visasm.h> |
9 | #include <asm/asi.h> | |
10 | #define GLOBAL_SPARE %g7 | |
11 | #else | |
12 | #define ASI_BLK_INIT_QUAD_LDD_P 0xe2 | |
13 | #define FPRS_FEF 0x04 | |
14 | ||
15 | /* On T4 it is very expensive to access ASRs like %fprs and | |
16 | * %asi; avoiding a read or a write can save ~50 cycles. | |
17 | */ | |
/* FPU_ENTER: read %fprs into %o5 and, only when the FEF bit is clear,
 * write FPRS_FEF to %fprs.  The write sits in the delay slot of an
 * annulling "be,a", so it executes only when the branch is taken, i.e.
 * when the andcc result was zero.  %o5 keeps the value that was read;
 * VISExitHalf below depends on %o5 still holding it.
 */
18 | #define FPU_ENTER \ | |
19 | rd %fprs, %o5; \ | |
20 | andcc %o5, FPRS_FEF, %g0; \ | |
21 | be,a,pn %icc, 999f; \ | |
22 | wr %g0, FPRS_FEF, %fprs; \ | |
23 | 999: | |
24 | ||
/* VISEntryHalf is FPU_ENTER.  Under MEMCPY_DEBUG it additionally zeroes
 * %g1, %g2, %g3 and %g5 and rewrites the condition codes (presumably to
 * expose code that wrongly expects them to survive -- TODO confirm).
 * VISExitHalf masks the saved %fprs value in %o5 down to its FEF bit and
 * writes that back, so every other %fprs bit is written as zero.
 */
25 | #ifdef MEMCPY_DEBUG | |
26 | #define VISEntryHalf FPU_ENTER; \ | |
27 | clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0; | |
28 | #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs | |
29 | #else | |
30 | #define VISEntryHalf FPU_ENTER | |
31 | #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs | |
32 | #endif | |
33 | ||
34 | #define GLOBAL_SPARE %g5 | |
35 | #endif | |
36 | ||
/* Everything from here to the .register directives is an overridable
 * default: a file that includes this one may pre-define any of these
 * macros to specialise the copy routine.
 */
/* ASI used by STORE_INIT: the block-init ASI unless Niagara is being
 * simulated, in which case 0x80 is used.
 */
37 | #ifndef STORE_ASI | |
38 | #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA | |
39 | #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P | |
40 | #else | |
41 | #define STORE_ASI 0x80 /* ASI_P */ | |
42 | #endif | |
43 | #endif | |
44 | ||
/* NON_USER_COPY is set when the includer supplied neither EX_LD nor
 * EX_ST; it selects the VISEntryHalfFast/VISExitHalfFast path below.
 */
f4da3628 DM |
45 | #if !defined(EX_LD) && !defined(EX_ST) |
46 | #define NON_USER_COPY | |
47 | #endif | |
48 | ||
/* EX_LD/EX_ST (and the _FP forms) wrap every load/store as (insn, fixup
 * label).  The defaults emit the instruction unchanged and drop the
 * fixup label.
 */
ae2c6ca6 | 49 | #ifndef EX_LD |
95707704 | 50 | #define EX_LD(x,y) x |
ae2c6ca6 | 51 | #endif |
a7c5724b | 52 | #ifndef EX_LD_FP |
95707704 | 53 | #define EX_LD_FP(x,y) x |
a7c5724b | 54 | #endif |
ae2c6ca6 DM |
55 | |
56 | #ifndef EX_ST | |
95707704 | 57 | #define EX_ST(x,y) x |
ae2c6ca6 | 58 | #endif |
a7c5724b | 59 | #ifndef EX_ST_FP |
95707704 | 60 | #define EX_ST_FP(x,y) x |
a7c5724b | 61 | #endif |
ae2c6ca6 | 62 | |
ae2c6ca6 DM |
63 | |
/* LOAD/STORE spell out the memory instruction itself.  With MEMCPY_DEBUG
 * the default STORE uses the alternate-space form ("type" with an "a"
 * suffix) addressed through %asi.  STORE_INIT is always a stxa through
 * STORE_ASI.
 */
64 | #ifndef LOAD | |
65 | #define LOAD(type,addr,dest) type [addr], dest | |
66 | #endif | |
67 | ||
68 | #ifndef STORE | |
69 | #ifndef MEMCPY_DEBUG | |
70 | #define STORE(type,src,addr) type src, [addr] | |
71 | #else | |
72 | #define STORE(type,src,addr) type##a src, [addr] %asi | |
73 | #endif | |
74 | #endif | |
75 | ||
76 | #ifndef STORE_INIT | |
77 | #define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI | |
78 | #endif | |
79 | ||
/* Symbol name of the routine, optional code inserted after the length
 * check (empty by default), and the condition-code set used by that
 * check.
 */
80 | #ifndef FUNC_NAME | |
81 | #define FUNC_NAME NG4memcpy | |
82 | #endif | |
83 | #ifndef PREAMBLE | |
84 | #define PREAMBLE | |
85 | #endif | |
86 | ||
87 | #ifndef XCC | |
88 | #define XCC xcc | |
89 | #endif | |
90 | ||
91 | .register %g2,#scratch | |
92 | .register %g3,#scratch | |
93 | ||
94 | .text | |
/* Fault fixup stubs.  Their names appear as the second argument of the
 * EX_LD/EX_ST wrappers in the copy routine below.  They are assembled
 * only when the including file has not defined EX_RETVAL.
 *
 * Each stub builds its result in %o0 from the registers and constant
 * spelled out in its name; the final add/mov sits in the delay slot of
 * the branch, so it executes before control reaches __restore_asi.
 * __restore_asi returns with retl and sets %asi to ASI_AIUS in the retl
 * delay slot.  Stubs with an extra constant first fold it into %o5, %g1
 * or %o4, modifying that register.
 *
 * The result is presumably the number of bytes left uncopied -- confirm
 * against the EX_LD/EX_ST definitions in the including files.
 */
95707704 DM |
95 | #ifndef EX_RETVAL |
96 | #define EX_RETVAL(x) x | |
97 | __restore_asi_fp: | |
98 | VISExitHalf | |
99 | __restore_asi: | |
100 | retl | |
101 | wr %g0, ASI_AIUS, %asi | |
102 | ||
103 | ENTRY(NG4_retl_o2) | |
104 | ba,pt %xcc, __restore_asi | |
105 | mov %o2, %o0 | |
106 | ENDPROC(NG4_retl_o2) | |
107 | ENTRY(NG4_retl_o2_plus_1) | |
108 | ba,pt %xcc, __restore_asi | |
109 | add %o2, 1, %o0 | |
110 | ENDPROC(NG4_retl_o2_plus_1) | |
111 | ENTRY(NG4_retl_o2_plus_4) | |
112 | ba,pt %xcc, __restore_asi | |
113 | add %o2, 4, %o0 | |
114 | ENDPROC(NG4_retl_o2_plus_4) | |
115 | ENTRY(NG4_retl_o2_plus_o5) | |
116 | ba,pt %xcc, __restore_asi | |
117 | add %o2, %o5, %o0 | |
118 | ENDPROC(NG4_retl_o2_plus_o5) | |
119 | ENTRY(NG4_retl_o2_plus_o5_plus_4) | |
120 | add %o5, 4, %o5 | |
121 | ba,pt %xcc, __restore_asi | |
122 | add %o2, %o5, %o0 | |
123 | ENDPROC(NG4_retl_o2_plus_o5_plus_4) | |
124 | ENTRY(NG4_retl_o2_plus_o5_plus_8) | |
125 | add %o5, 8, %o5 | |
126 | ba,pt %xcc, __restore_asi | |
127 | add %o2, %o5, %o0 | |
128 | ENDPROC(NG4_retl_o2_plus_o5_plus_8) | |
129 | ENTRY(NG4_retl_o2_plus_o5_plus_16) | |
130 | add %o5, 16, %o5 | |
131 | ba,pt %xcc, __restore_asi | |
132 | add %o2, %o5, %o0 | |
133 | ENDPROC(NG4_retl_o2_plus_o5_plus_16) | |
134 | ENTRY(NG4_retl_o2_plus_o5_plus_24) | |
135 | add %o5, 24, %o5 | |
136 | ba,pt %xcc, __restore_asi | |
137 | add %o2, %o5, %o0 | |
138 | ENDPROC(NG4_retl_o2_plus_o5_plus_24) | |
139 | ENTRY(NG4_retl_o2_plus_o5_plus_32) | |
140 | add %o5, 32, %o5 | |
141 | ba,pt %xcc, __restore_asi | |
142 | add %o2, %o5, %o0 | |
143 | ENDPROC(NG4_retl_o2_plus_o5_plus_32) | |
144 | ENTRY(NG4_retl_o2_plus_g1) | |
145 | ba,pt %xcc, __restore_asi | |
146 | add %o2, %g1, %o0 | |
147 | ENDPROC(NG4_retl_o2_plus_g1) | |
148 | ENTRY(NG4_retl_o2_plus_g1_plus_1) | |
149 | add %g1, 1, %g1 | |
150 | ba,pt %xcc, __restore_asi | |
151 | add %o2, %g1, %o0 | |
152 | ENDPROC(NG4_retl_o2_plus_g1_plus_1) | |
153 | ENTRY(NG4_retl_o2_plus_g1_plus_8) | |
154 | add %g1, 8, %g1 | |
155 | ba,pt %xcc, __restore_asi | |
156 | add %o2, %g1, %o0 | |
157 | ENDPROC(NG4_retl_o2_plus_g1_plus_8) | |
158 | ENTRY(NG4_retl_o2_plus_o4) | |
159 | ba,pt %xcc, __restore_asi | |
160 | add %o2, %o4, %o0 | |
161 | ENDPROC(NG4_retl_o2_plus_o4) | |
162 | ENTRY(NG4_retl_o2_plus_o4_plus_8) | |
163 | add %o4, 8, %o4 | |
164 | ba,pt %xcc, __restore_asi | |
165 | add %o2, %o4, %o0 | |
166 | ENDPROC(NG4_retl_o2_plus_o4_plus_8) | |
167 | ENTRY(NG4_retl_o2_plus_o4_plus_16) | |
168 | add %o4, 16, %o4 | |
169 | ba,pt %xcc, __restore_asi | |
170 | add %o2, %o4, %o0 | |
171 | ENDPROC(NG4_retl_o2_plus_o4_plus_16) | |
172 | ENTRY(NG4_retl_o2_plus_o4_plus_24) | |
173 | add %o4, 24, %o4 | |
174 | ba,pt %xcc, __restore_asi | |
175 | add %o2, %o4, %o0 | |
176 | ENDPROC(NG4_retl_o2_plus_o4_plus_24) | |
177 | ENTRY(NG4_retl_o2_plus_o4_plus_32) | |
178 | add %o4, 32, %o4 | |
179 | ba,pt %xcc, __restore_asi | |
180 | add %o2, %o4, %o0 | |
181 | ENDPROC(NG4_retl_o2_plus_o4_plus_32) | |
182 | ENTRY(NG4_retl_o2_plus_o4_plus_40) | |
183 | add %o4, 40, %o4 | |
184 | ba,pt %xcc, __restore_asi | |
185 | add %o2, %o4, %o0 | |
186 | ENDPROC(NG4_retl_o2_plus_o4_plus_40) | |
187 | ENTRY(NG4_retl_o2_plus_o4_plus_48) | |
188 | add %o4, 48, %o4 | |
189 | ba,pt %xcc, __restore_asi | |
190 | add %o2, %o4, %o0 | |
191 | ENDPROC(NG4_retl_o2_plus_o4_plus_48) | |
192 | ENTRY(NG4_retl_o2_plus_o4_plus_56) | |
193 | add %o4, 56, %o4 | |
194 | ba,pt %xcc, __restore_asi | |
195 | add %o2, %o4, %o0 | |
196 | ENDPROC(NG4_retl_o2_plus_o4_plus_56) | |
197 | ENTRY(NG4_retl_o2_plus_o4_plus_64) | |
198 | add %o4, 64, %o4 | |
199 | ba,pt %xcc, __restore_asi | |
200 | add %o2, %o4, %o0 | |
201 | ENDPROC(NG4_retl_o2_plus_o4_plus_64) | |
/* The _fp variants below leave through __restore_asi_fp, which runs
 * VISExitHalf before falling into __restore_asi.
 */
202 | ENTRY(NG4_retl_o2_plus_o4_fp) | |
203 | ba,pt %xcc, __restore_asi_fp | |
204 | add %o2, %o4, %o0 | |
205 | ENDPROC(NG4_retl_o2_plus_o4_fp) | |
206 | ENTRY(NG4_retl_o2_plus_o4_plus_8_fp) | |
207 | add %o4, 8, %o4 | |
208 | ba,pt %xcc, __restore_asi_fp | |
209 | add %o2, %o4, %o0 | |
210 | ENDPROC(NG4_retl_o2_plus_o4_plus_8_fp) | |
211 | ENTRY(NG4_retl_o2_plus_o4_plus_16_fp) | |
212 | add %o4, 16, %o4 | |
213 | ba,pt %xcc, __restore_asi_fp | |
214 | add %o2, %o4, %o0 | |
215 | ENDPROC(NG4_retl_o2_plus_o4_plus_16_fp) | |
216 | ENTRY(NG4_retl_o2_plus_o4_plus_24_fp) | |
217 | add %o4, 24, %o4 | |
218 | ba,pt %xcc, __restore_asi_fp | |
219 | add %o2, %o4, %o0 | |
220 | ENDPROC(NG4_retl_o2_plus_o4_plus_24_fp) | |
221 | ENTRY(NG4_retl_o2_plus_o4_plus_32_fp) | |
222 | add %o4, 32, %o4 | |
223 | ba,pt %xcc, __restore_asi_fp | |
224 | add %o2, %o4, %o0 | |
225 | ENDPROC(NG4_retl_o2_plus_o4_plus_32_fp) | |
226 | ENTRY(NG4_retl_o2_plus_o4_plus_40_fp) | |
227 | add %o4, 40, %o4 | |
228 | ba,pt %xcc, __restore_asi_fp | |
229 | add %o2, %o4, %o0 | |
230 | ENDPROC(NG4_retl_o2_plus_o4_plus_40_fp) | |
231 | ENTRY(NG4_retl_o2_plus_o4_plus_48_fp) | |
232 | add %o4, 48, %o4 | |
233 | ba,pt %xcc, __restore_asi_fp | |
234 | add %o2, %o4, %o0 | |
235 | ENDPROC(NG4_retl_o2_plus_o4_plus_48_fp) | |
236 | ENTRY(NG4_retl_o2_plus_o4_plus_56_fp) | |
237 | add %o4, 56, %o4 | |
238 | ba,pt %xcc, __restore_asi_fp | |
239 | add %o2, %o4, %o0 | |
240 | ENDPROC(NG4_retl_o2_plus_o4_plus_56_fp) | |
241 | ENTRY(NG4_retl_o2_plus_o4_plus_64_fp) | |
242 | add %o4, 64, %o4 | |
243 | ba,pt %xcc, __restore_asi_fp | |
244 | add %o2, %o4, %o0 | |
245 | ENDPROC(NG4_retl_o2_plus_o4_plus_64_fp) | |
246 | #endif | |
/* Entry point and length dispatch.
 *   In:  %o0 = dst, %o1 = src, %o2 = len
 *   %o3 keeps the original dst; .Lexit returns it through EX_RETVAL.
 *   %g2 = dst | src is prepared for the alignment tests in .Lsmall and
 *   .Lmedium.
 * Dispatch: len == 0 -> .Lexit, len <= 3 -> .Ltiny, len <= 19 -> .Lsmall,
 * len < 128 -> .Lmedium, otherwise fall through into .Llarge.
 * Every instruction directly after a branch is its delay slot and runs
 * whether or not the branch is taken (none of these branches annul).
 */
ae2c6ca6 DM |
247 | .align 64 |
248 | ||
249 | .globl FUNC_NAME | |
250 | .type FUNC_NAME,#function | |
251 | FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ | |
252 | #ifdef MEMCPY_DEBUG | |
253 | wr %g0, 0x80, %asi | |
254 | #endif | |
/* Raise software trap 5 when len >> 31 is non-zero, i.e. len >= 2^31. */
255 | srlx %o2, 31, %g2 | |
256 | cmp %g2, 0 | |
257 | tne %XCC, 5 | |
258 | PREAMBLE | |
259 | mov %o0, %o3 | |
260 | brz,pn %o2, .Lexit | |
261 | cmp %o2, 3 | |
262 | ble,pn %icc, .Ltiny | |
263 | cmp %o2, 19 | |
264 | ble,pn %icc, .Lsmall | |
265 | or %o0, %o1, %g2 | |
266 | cmp %o2, 128 | |
267 | bl,pn %icc, .Lmedium | |
268 | nop | |
269 | ||
/* Large copy set-up.  Steps:
 *   1. byte-copy until dst is 8-byte aligned (%g1 = (-dst) & 7 bytes);
 *   2. issue eight source prefetches, 0x40 .. 0x200 bytes ahead;
 *   3. if src is not 8-byte aligned go to .Llarge_src_unaligned;
 *   4. otherwise copy doublewords until dst is 64-byte aligned and fall
 *      through into .Llarge_aligned.
 * In both alignment steps the "sub %o2, %g1, %o2" sits in a delay slot
 * and always runs, so %o2 already excludes the alignment bytes while the
 * loops count them down in %g1.  The byte loop uses %g2 as its data
 * register, so the dst | src value computed at entry is gone afterwards.
 */
270 | .Llarge:/* len >= 0x80 */ | |
271 | /* First get dest 8 byte aligned. */ | |
272 | sub %g0, %o0, %g1 | |
273 | and %g1, 0x7, %g1 | |
274 | brz,pt %g1, 51f | |
275 | sub %o2, %g1, %o2 | |
42a4172b | 276 | |
95707704 DM |
277 | |
/* The store is in the bne delay slot; %o0 was already advanced, hence
 * the -0x01 offset (and the _plus_1 fixup, since %g1 was already
 * decremented).
 */
278 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1) | |
ae2c6ca6 DM |
279 | add %o1, 1, %o1 |
280 | subcc %g1, 1, %g1 | |
281 | add %o0, 1, %o0 | |
282 | bne,pt %icc, 1b | |
95707704 | 283 | EX_ST(STORE(stb, %g2, %o0 - 0x01), NG4_retl_o2_plus_g1_plus_1) |
ae2c6ca6 DM |
284 | |
285 | 51: LOAD(prefetch, %o1 + 0x040, #n_reads_strong) | |
286 | LOAD(prefetch, %o1 + 0x080, #n_reads_strong) | |
287 | LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong) | |
288 | LOAD(prefetch, %o1 + 0x100, #n_reads_strong) | |
289 | LOAD(prefetch, %o1 + 0x140, #n_reads_strong) | |
290 | LOAD(prefetch, %o1 + 0x180, #n_reads_strong) | |
291 | LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong) | |
292 | LOAD(prefetch, %o1 + 0x200, #n_reads_strong) | |
293 | ||
294 | /* Check if we can use the straight fully aligned | |
295 | * loop, or we require the alignaddr/faligndata variant. | |
296 | */ | |
/* The delay-slot sub computes %g1 = -dst for the 64-byte alignment step
 * below; it also runs when the branch is taken.
 */
297 | andcc %o1, 0x7, %o5 | |
298 | bne,pn %icc, .Llarge_src_unaligned | |
299 | sub %g0, %o0, %g1 | |
300 | ||
301 | /* Legitimize the use of initializing stores by getting dest | |
302 | * to be 64-byte aligned. | |
303 | */ | |
304 | and %g1, 0x3f, %g1 | |
305 | brz,pt %g1, .Llarge_aligned | |
306 | sub %o2, %g1, %o2 | |
42a4172b | 307 | |
95707704 | 308 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1) |
ae2c6ca6 DM |
309 | add %o1, 8, %o1 |
310 | subcc %g1, 8, %g1 | |
311 | add %o0, 8, %o0 | |
312 | bne,pt %icc, 1b | |
95707704 | 313 | EX_ST(STORE(stx, %g2, %o0 - 0x08), NG4_retl_o2_plus_g1_plus_8) |
ae2c6ca6 DM |
314 | |
/* Main aligned loop: 64 bytes per iteration using integer loads and
 * STORE_INIT stores.
 *   %o4 = len rounded down to a multiple of 64 (loop byte count)
 *   %o2 = len - %o4 (tail, < 64)
 * %o1 is advanced by 0x40 right after the first load, which is why the
 * remaining loads use negative offsets.  %o4 is decremented early (the
 * subcc), so every access after it names a fixup that adds back 64 minus
 * the bytes already stored: 64, 56, 48, ... 8.
 * The prefetch in the bne delay slot runs on every iteration.
 */
315 | .Llarge_aligned: | |
316 | /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */ | |
317 | andn %o2, 0x3f, %o4 | |
318 | sub %o2, %o4, %o2 | |
319 | ||
95707704 | 320 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o4) |
ae2c6ca6 | 321 | add %o1, 0x40, %o1 |
95707704 | 322 | EX_LD(LOAD(ldx, %o1 - 0x38, %g2), NG4_retl_o2_plus_o4) |
ae2c6ca6 | 323 | subcc %o4, 0x40, %o4 |
95707704 DM |
324 | EX_LD(LOAD(ldx, %o1 - 0x30, %g3), NG4_retl_o2_plus_o4_plus_64) |
325 | EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE), NG4_retl_o2_plus_o4_plus_64) | |
326 | EX_LD(LOAD(ldx, %o1 - 0x20, %o5), NG4_retl_o2_plus_o4_plus_64) | |
327 | EX_ST(STORE_INIT(%g1, %o0), NG4_retl_o2_plus_o4_plus_64) | |
ae2c6ca6 | 328 | add %o0, 0x08, %o0 |
95707704 | 329 | EX_ST(STORE_INIT(%g2, %o0), NG4_retl_o2_plus_o4_plus_56) |
ae2c6ca6 | 330 | add %o0, 0x08, %o0 |
95707704 DM |
331 | EX_LD(LOAD(ldx, %o1 - 0x18, %g2), NG4_retl_o2_plus_o4_plus_48) |
332 | EX_ST(STORE_INIT(%g3, %o0), NG4_retl_o2_plus_o4_plus_48) | |
ae2c6ca6 | 333 | add %o0, 0x08, %o0 |
95707704 DM |
334 | EX_LD(LOAD(ldx, %o1 - 0x10, %g3), NG4_retl_o2_plus_o4_plus_40) |
335 | EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), NG4_retl_o2_plus_o4_plus_40) | |
ae2c6ca6 | 336 | add %o0, 0x08, %o0 |
95707704 DM |
337 | EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE), NG4_retl_o2_plus_o4_plus_32) |
338 | EX_ST(STORE_INIT(%o5, %o0), NG4_retl_o2_plus_o4_plus_32) | |
ae2c6ca6 | 339 | add %o0, 0x08, %o0 |
95707704 | 340 | EX_ST(STORE_INIT(%g2, %o0), NG4_retl_o2_plus_o4_plus_24) |
ae2c6ca6 | 341 | add %o0, 0x08, %o0 |
95707704 | 342 | EX_ST(STORE_INIT(%g3, %o0), NG4_retl_o2_plus_o4_plus_16) |
ae2c6ca6 | 343 | add %o0, 0x08, %o0 |
95707704 | 344 | EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), NG4_retl_o2_plus_o4_plus_8) |
ae2c6ca6 DM |
345 | add %o0, 0x08, %o0 |
346 | bne,pt %icc, 1b | |
347 | LOAD(prefetch, %o1 + 0x200, #n_reads_strong) | |
348 | ||
/* Barrier between the STORE_INIT stores above and the ordinary
 * loads/stores used for the tail.
 */
349 | membar #StoreLoad | #StoreStore | |
350 | ||
/* Tail: nothing left -> .Lexit; <= 19 bytes -> byte loop; otherwise the
 * 32-byte loop.  "ba,a" annuls its delay slot, so none is needed.
 */
351 | brz,pn %o2, .Lexit | |
352 | cmp %o2, 19 | |
353 | ble,pn %icc, .Lsmall_unaligned | |
354 | nop | |
355 | ba,a,pt %icc, .Lmedium_noprefetch | |
356 | ||
/* Common successful return: %o0 = EX_RETVAL(original dst saved in %o3),
 * loaded in the retl delay slot.
 */
357 | .Lexit: retl | |
358 | mov EX_RETVAL(%o3), %o0 | |
359 | ||
/* Large copy with dst 8-byte aligned but src not: 64 bytes per iteration
 * through the FPU using alignaddr/faligndata.
 *   %o4 = len rounded down to a multiple of 64, %o2 = tail (< 64)
 *   %g1 = src rounded down to 8 bytes (alignaddr also records the byte
 *         offset that faligndata uses)
 *   %o1 is moved to its post-loop position up front; the loop reads
 *   through %g1 only.
 * %f0 always holds the doubleword carried over from the previous
 * iteration; each faligndata merges two neighbouring doublewords into
 * one aligned output doubleword (%f16 .. %f30).
 * Non-user copies use VISEntryHalfFast/VISExitHalfFast (defined outside
 * this file), with .Lmedium_vis_entry_fail passed as the macro argument.
 */
360 | .Llarge_src_unaligned: | |
f4da3628 DM |
361 | #ifdef NON_USER_COPY |
362 | VISEntryHalfFast(.Lmedium_vis_entry_fail) | |
363 | #else | |
364 | VISEntryHalf | |
365 | #endif | |
ae2c6ca6 DM |
366 | andn %o2, 0x3f, %o4 |
367 | sub %o2, %o4, %o2 | |
ae2c6ca6 DM |
368 | alignaddr %o1, %g0, %g1 |
369 | add %o1, %o4, %o1 | |
/* As in the aligned loop, %o4 is decremented early, so accesses after
 * the subcc add back 64 minus the bytes already stored.
 */
95707704 DM |
370 | EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0), NG4_retl_o2_plus_o4) |
371 | 1: EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2), NG4_retl_o2_plus_o4) | |
ae2c6ca6 | 372 | subcc %o4, 0x40, %o4 |
95707704 DM |
373 | EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4), NG4_retl_o2_plus_o4_plus_64) |
374 | EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6), NG4_retl_o2_plus_o4_plus_64) | |
375 | EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8), NG4_retl_o2_plus_o4_plus_64) | |
376 | EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10), NG4_retl_o2_plus_o4_plus_64) | |
377 | EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12), NG4_retl_o2_plus_o4_plus_64) | |
378 | EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14), NG4_retl_o2_plus_o4_plus_64) | |
ae2c6ca6 | 379 | faligndata %f0, %f2, %f16 |
95707704 | 380 | EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0), NG4_retl_o2_plus_o4_plus_64) |
ae2c6ca6 DM |
381 | faligndata %f2, %f4, %f18 |
382 | add %g1, 0x40, %g1 | |
383 | faligndata %f4, %f6, %f20 | |
384 | faligndata %f6, %f8, %f22 | |
385 | faligndata %f8, %f10, %f24 | |
386 | faligndata %f10, %f12, %f26 | |
387 | faligndata %f12, %f14, %f28 | |
388 | faligndata %f14, %f0, %f30 | |
95707704 DM |
389 | EX_ST_FP(STORE(std, %f16, %o0 + 0x00), NG4_retl_o2_plus_o4_plus_64) |
390 | EX_ST_FP(STORE(std, %f18, %o0 + 0x08), NG4_retl_o2_plus_o4_plus_56) | |
391 | EX_ST_FP(STORE(std, %f20, %o0 + 0x10), NG4_retl_o2_plus_o4_plus_48) | |
392 | EX_ST_FP(STORE(std, %f22, %o0 + 0x18), NG4_retl_o2_plus_o4_plus_40) | |
393 | EX_ST_FP(STORE(std, %f24, %o0 + 0x20), NG4_retl_o2_plus_o4_plus_32) | |
394 | EX_ST_FP(STORE(std, %f26, %o0 + 0x28), NG4_retl_o2_plus_o4_plus_24) | |
395 | EX_ST_FP(STORE(std, %f28, %o0 + 0x30), NG4_retl_o2_plus_o4_plus_16) | |
396 | EX_ST_FP(STORE(std, %f30, %o0 + 0x38), NG4_retl_o2_plus_o4_plus_8) | |
ae2c6ca6 DM |
397 | add %o0, 0x40, %o0 |
398 | bne,pt %icc, 1b | |
399 | LOAD(prefetch, %g1 + 0x200, #n_reads_strong) | |
/* Leave FPU mode, then copy the tail: nothing left -> .Lexit; <= 19
 * bytes -> byte loop; otherwise .Lmedium_unaligned.
 */
44922150 DM |
400 | #ifdef NON_USER_COPY |
401 | VISExitHalfFast | |
402 | #else | |
ae2c6ca6 | 403 | VISExitHalf |
44922150 | 404 | #endif |
ae2c6ca6 DM |
405 | brz,pn %o2, .Lexit |
406 | cmp %o2, 19 | |
407 | ble,pn %icc, .Lsmall_unaligned | |
408 | nop | |
409 | ba,a,pt %icc, .Lmedium_unaligned | |
410 | ||
/* .Lmedium_vis_entry_fail is the label handed to VISEntryHalfFast in
 * .Llarge_src_unaligned.  It rebuilds %g2 = dst | src, which the .Llarge
 * byte loop may have overwritten, and then falls into .Lmedium.
 *
 * .Lmedium: prefetch one line ahead, then pick the aligned loop only if
 * both dst and src are 8-byte aligned ((dst | src) & 7 == 0).
 */
f4da3628 DM |
411 | #ifdef NON_USER_COPY |
412 | .Lmedium_vis_entry_fail: | |
413 | or %o0, %o1, %g2 | |
414 | #endif | |
ae2c6ca6 DM |
415 | .Lmedium: |
416 | LOAD(prefetch, %o1 + 0x40, #n_reads_strong) | |
417 | andcc %g2, 0x7, %g0 | |
418 | bne,pn %icc, .Lmedium_unaligned | |
419 | nop | |
/* Integer copy for 8-byte aligned src and dst:
 *   1. 32 bytes per iteration   (%o5 = len & ~31 is the loop count),
 *   2. remaining doublewords    (%o5 = len & 0x18),
 *   3. at most one 4-byte word,
 *   4. anything still left (1-3 bytes) goes to .Ltiny.
 * %o2 always holds the bytes that the current step will NOT copy; the
 * "sub %o2, %o5, %o2" instructions sit in delay slots and always run.
 *
 * Fixup labels: the second EX_LD/EX_ST argument names the stub used when
 * that access faults.  Loads in the 32-byte loop run before %o5 is
 * decremented, so they use plain o2 + o5.  The stores run after the
 * subcc, so a store that faults with k doublewords already written must
 * add back 32 - 8*k: 32, 24, 16, 8.
 */
420 | .Lmedium_noprefetch: | |
421 | andncc %o2, 0x20 - 1, %o5 | |
422 | be,pn %icc, 2f | |
423 | sub %o2, %o5, %o2 | |
95707704 DM |
424 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5) |
425 | EX_LD(LOAD(ldx, %o1 + 0x08, %g2), NG4_retl_o2_plus_o5) | |
426 | EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE), NG4_retl_o2_plus_o5) | |
427 | EX_LD(LOAD(ldx, %o1 + 0x18, %o4), NG4_retl_o2_plus_o5) | |
ae2c6ca6 DM |
428 | add %o1, 0x20, %o1 |
429 | subcc %o5, 0x20, %o5 | |
/* The store at +0x10 faults with 16 bytes already written, so 16 bytes
 * of this iteration remain: use the _plus_16 stub (it previously reused
 * _plus_24, over-reporting the residue by 8).
 */
95707704 DM |
430 | EX_ST(STORE(stx, %g1, %o0 + 0x00), NG4_retl_o2_plus_o5_plus_32) |
431 | EX_ST(STORE(stx, %g2, %o0 + 0x08), NG4_retl_o2_plus_o5_plus_24) | |
432 | EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10), NG4_retl_o2_plus_o5_plus_16) | |
433 | EX_ST(STORE(stx, %o4, %o0 + 0x18), NG4_retl_o2_plus_o5_plus_8) | |
ae2c6ca6 DM |
434 | bne,pt %icc, 1b |
435 | add %o0, 0x20, %o0 | |
/* Remaining whole doublewords (0-3 of them). */
436 | 2: andcc %o2, 0x18, %o5 | |
437 | be,pt %icc, 3f | |
438 | sub %o2, %o5, %o2 | |
95707704 DM |
439 | |
440 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5) | |
ae2c6ca6 DM |
441 | add %o1, 0x08, %o1 |
442 | add %o0, 0x08, %o0 | |
443 | subcc %o5, 0x08, %o5 | |
444 | bne,pt %icc, 1b | |
95707704 | 445 | EX_ST(STORE(stx, %g1, %o0 - 0x08), NG4_retl_o2_plus_o5_plus_8) |
/* 0-7 bytes left: done if zero, .Ltiny if fewer than 4, otherwise copy
 * one word and let .Ltiny finish whatever remains.  The stw sits in the
 * bne delay slot and is executed on both paths.
 */
ae2c6ca6 DM |
446 | 3: brz,pt %o2, .Lexit |
447 | cmp %o2, 0x04 | |
448 | bl,pn %icc, .Ltiny | |
449 | nop | |
95707704 | 450 | EX_LD(LOAD(lduw, %o1 + 0x00, %g1), NG4_retl_o2) |
ae2c6ca6 DM |
451 | add %o1, 0x04, %o1 |
452 | add %o0, 0x04, %o0 | |
453 | subcc %o2, 0x04, %o2 | |
454 | bne,pn %icc, .Ltiny | |
95707704 | 455 | EX_ST(STORE(stw, %g1, %o0 - 0x04), NG4_retl_o2_plus_4) |
ae2c6ca6 DM |
456 | ba,a,pt %icc, .Lexit |
/* Integer copy when dst and src are not both 8-byte aligned.
 *   1. Byte-copy until dst is 8-byte aligned.
 *   2. If src is now 8-byte aligned too, use .Lmedium_noprefetch.
 *   3. Otherwise shift-and-merge: read aligned doublewords from src
 *      rounded down, and build each output doubleword from the tail of
 *      the previous load (kept in %o4, already shifted left) and the
 *      head of the next one.
 *        %g1 = (src & 7) * 8   left-shift count in bits
 *        %g2 = 64 - %g1        right-shift count in bits
 *        %o5 = len & ~7        bytes handled by the loop, %o2 = tail
 *   4. Undo the rounding of src (%g1 >> 3 == src & 7) and hand the tail
 *      to .Lsmall_unaligned.
 */
457 | .Lmedium_unaligned: | |
458 | /* First get dest 8 byte aligned. */ | |
459 | sub %g0, %o0, %g1 | |
460 | and %g1, 0x7, %g1 | |
461 | brz,pt %g1, 2f | |
462 | sub %o2, %g1, %o2 | |
42a4172b | 463 | |
95707704 | 464 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1) |
ae2c6ca6 DM |
465 | add %o1, 1, %o1 |
466 | subcc %g1, 1, %g1 | |
467 | add %o0, 1, %o0 | |
468 | bne,pt %icc, 1b | |
95707704 | 469 | EX_ST(STORE(stb, %g2, %o0 - 0x01), NG4_retl_o2_plus_g1_plus_1) |
ae2c6ca6 DM |
470 | 2: |
471 | and %o1, 0x7, %g1 | |
472 | brz,pn %g1, .Lmedium_noprefetch | |
473 | sll %g1, 3, %g1 | |
474 | mov 64, %g2 | |
475 | sub %g2, %g1, %g2 | |
476 | andn %o1, 0x7, %o1 | |
95707704 | 477 | EX_LD(LOAD(ldx, %o1 + 0x00, %o4), NG4_retl_o2) |
ae2c6ca6 DM |
478 | sllx %o4, %g1, %o4 |
479 | andn %o2, 0x08 - 1, %o5 | |
480 | sub %o2, %o5, %o2 | |
/* The sllx in the bne delay slot prepares %o4 for the next iteration. */
95707704 | 481 | 1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3), NG4_retl_o2_plus_o5) |
ae2c6ca6 DM |
482 | add %o1, 0x08, %o1 |
483 | subcc %o5, 0x08, %o5 | |
484 | srlx %g3, %g2, GLOBAL_SPARE | |
485 | or GLOBAL_SPARE, %o4, GLOBAL_SPARE | |
95707704 | 486 | EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00), NG4_retl_o2_plus_o5_plus_8) |
ae2c6ca6 DM |
487 | add %o0, 0x08, %o0 |
488 | bne,pt %icc, 1b | |
489 | sllx %g3, %g1, %o4 | |
490 | srl %g1, 3, %g1 | |
491 | add %o1, %g1, %o1 | |
492 | brz,pn %o2, .Lexit | |
493 | nop | |
/* NOTE(review): this "ba,pt" has no annul bit and no delay-slot
 * instruction of its own, so the first instruction of .Ltiny (a ldub
 * from %o1 into %g1) executes in its delay slot.  %o2 is non-zero here
 * and .Lsmall_unaligned reloads %g1, so it looks harmless -- confirm
 * whether "ba,a,pt" was intended, as on the other tail branches.
 */
494 | ba,pt %icc, .Lsmall_unaligned | |
495 | ||
/* Copy 1 to 3 bytes, fully unrolled (callers arrive with %o2 != 0).
 * Each stb sits in the delay slot of the branch before it, so the byte
 * just loaded is stored whether or not the branch to .Lexit is taken.
 * %o2 is decremented before the store, which is why the first two store
 * fixups add 1 back.
 */
496 | .Ltiny: | |
95707704 | 497 | EX_LD(LOAD(ldub, %o1 + 0x00, %g1), NG4_retl_o2) |
ae2c6ca6 DM |
498 | subcc %o2, 1, %o2 |
499 | be,pn %icc, .Lexit | |
95707704 DM |
500 | EX_ST(STORE(stb, %g1, %o0 + 0x00), NG4_retl_o2_plus_1) |
501 | EX_LD(LOAD(ldub, %o1 + 0x01, %g1), NG4_retl_o2) | |
ae2c6ca6 DM |
502 | subcc %o2, 1, %o2 |
503 | be,pn %icc, .Lexit | |
95707704 DM |
504 | EX_ST(STORE(stb, %g1, %o0 + 0x01), NG4_retl_o2_plus_1) |
505 | EX_LD(LOAD(ldub, %o1 + 0x02, %g1), NG4_retl_o2) | |
ae2c6ca6 | 506 | ba,pt %icc, .Lexit |
95707704 | 507 | EX_ST(STORE(stb, %g1, %o0 + 0x02), NG4_retl_o2) |
ae2c6ca6 DM |
508 | |
/* Small copy (reached from the entry dispatch with 4 <= len <= 19 and
 * %g2 = dst | src).  If either pointer is not 4-byte aligned, use the
 * byte loop.  Otherwise copy %o5 = len & ~3 bytes one word at a time and
 * leave the last 0-3 bytes (%o2) to .Ltiny.  The andn in the bne delay
 * slot runs on both paths; the stw in the loop's delay slot stores at
 * -0x04 because %o0 was already advanced.
 */
509 | .Lsmall: | |
510 | andcc %g2, 0x3, %g0 | |
511 | bne,pn %icc, .Lsmall_unaligned | |
512 | andn %o2, 0x4 - 1, %o5 | |
513 | sub %o2, %o5, %o2 | |
514 | 1: | |
95707704 | 515 | EX_LD(LOAD(lduw, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5) |
ae2c6ca6 DM |
516 | add %o1, 0x04, %o1 |
517 | subcc %o5, 0x04, %o5 | |
518 | add %o0, 0x04, %o0 | |
519 | bne,pt %icc, 1b | |
95707704 | 520 | EX_ST(STORE(stw, %g1, %o0 - 0x04), NG4_retl_o2_plus_o5_plus_4) |
ae2c6ca6 DM |
521 | brz,pt %o2, .Lexit |
522 | nop | |
523 | ba,a,pt %icc, .Ltiny | |
524 | ||
/* Byte-at-a-time copy of %o2 bytes.  The loop tests its counter only
 * after the first byte has been copied, so %o2 must be non-zero on
 * entry.  The stb sits in the bne delay slot and uses offset -0x01
 * because %o0 was already advanced; %o2 was already decremented, hence
 * the _plus_1 fixup.  The final "ba,a" annuls its delay slot; the nop
 * after it is never executed on that path.
 */
525 | .Lsmall_unaligned: | |
95707704 | 526 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1), NG4_retl_o2) |
ae2c6ca6 DM |
527 | add %o1, 1, %o1 |
528 | add %o0, 1, %o0 | |
529 | subcc %o2, 1, %o2 | |
530 | bne,pt %icc, 1b | |
95707704 | 531 | EX_ST(STORE(stb, %g1, %o0 - 0x01), NG4_retl_o2_plus_1) |
ae2c6ca6 | 532 | ba,a,pt %icc, .Lexit |
0ae2d26f | 533 | nop |
ae2c6ca6 | 534 | .size FUNC_NAME, .-FUNC_NAME |