/* Source file: arch/sparc/lib/NG4memcpy.S (Linux kernel, mirror_ubuntu-zesty-kernel tree) */
1 /* NG4memcpy.S: Niagara-4 optimized memcpy.
2  *
3  * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
4  */
5
/* Build configuration.  Inside the kernel the VIS save/restore helpers
 * and ASI numbers come from <asm/visasm.h> / <asm/asi.h>; a standalone
 * (non-__KERNEL__) build defines minimal local equivalents below.
 * GLOBAL_SPARE names the extra scratch global register the copy loops
 * use: %g7 in the kernel, %g5 otherwise.
 */
6 #ifdef __KERNEL__
7 #include <asm/visasm.h>
8 #include <asm/asi.h>
9 #define GLOBAL_SPARE	%g7
10 #else
11 #define ASI_BLK_INIT_QUAD_LDD_P 0xe2
12 #define FPRS_FEF  0x04
13
14 /* On T4 it is very expensive to access ASRs like %fprs and
15  * %asi, avoiding a read or a write can save ~50 cycles.
16  */
/* Enable the FPU only when FPRS_FEF is not already set, so the common
 * "already enabled" case skips the costly %fprs write (annulled delay
 * slot: the wr executes only on the fall-through path).
 */
17 #define FPU_ENTER			\
18	rd	%fprs, %o5;		\
19	andcc	%o5, FPRS_FEF, %g0;	\
20	be,a,pn	%icc, 999f;		\
21	 wr	%g0, FPRS_FEF, %fprs;	\
22	999:
23
/* Standalone stand-ins for the kernel's VISEntryHalf/VISExitHalf.  The
 * debug variant additionally clears the scratch globals so stale values
 * cannot mask register-usage bugs.  VISExitHalf restores %fprs to the
 * FPRS_FEF state saved in %o5 by FPU_ENTER.
 */
24 #ifdef MEMCPY_DEBUG
25 #define VISEntryHalf FPU_ENTER; \
26		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
27 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
28 #else
29 #define VISEntryHalf FPU_ENTER
30 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
31 #endif
32
33 #define GLOBAL_SPARE	%g5
34 #endif
35
/* ASI used by STORE_INIT.  The block-initializing quad-load/store ASI
 * avoids fetching the destination cache line; on non-Niagara simulation
 * builds fall back to the normal primary ASI.
 */
36 #ifndef STORE_ASI
37 #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
38 #define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
39 #else
40 #define STORE_ASI	0x80		/* ASI_P */
41 #endif
42 #endif
43
/* When neither EX_LD nor EX_ST is supplied by an including wrapper
 * (e.g. a copy_{to,from}_user variant), this is a plain in-kernel copy
 * with no fault handling; NON_USER_COPY selects the cheaper VIS
 * entry/exit sequences below.
 */
44 #if !defined(EX_LD) && !defined(EX_ST)
45 #define NON_USER_COPY
46 #endif
47
/* Default exception wrappers are the identity: the raw load/store is
 * emitted with no fault-table entry.  Wrapper files override these.
 */
48 #ifndef EX_LD
49 #define EX_LD(x)	x
50 #endif
51 #ifndef EX_LD_FP
52 #define EX_LD_FP(x)	x
53 #endif
54
55 #ifndef EX_ST
56 #define EX_ST(x)	x
57 #endif
58 #ifndef EX_ST_FP
59 #define EX_ST_FP(x)	x
60 #endif
61
62 #ifndef EX_RETVAL
63 #define EX_RETVAL(x)	x
64 #endif
65
/* LOAD/STORE expand to plain instructions by default; the MEMCPY_DEBUG
 * store variant uses the "a"-suffixed (alternate-space) form through
 * %asi, which the function entry sets to 0x80 (ASI_P).
 */
66 #ifndef LOAD
67 #define LOAD(type,addr,dest)	type [addr], dest
68 #endif
69
70 #ifndef STORE
71 #ifndef MEMCPY_DEBUG
72 #define STORE(type,src,addr)	type src, [addr]
73 #else
74 #define STORE(type,src,addr)	type##a src, [addr] %asi
75 #endif
76 #endif
77
/* Initializing store: stxa through STORE_ASI (see above). */
78 #ifndef STORE_INIT
79 #define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
80 #endif
81
82 #ifndef FUNC_NAME
83 #define FUNC_NAME	NG4memcpy
84 #endif
85 #ifndef PREAMBLE
86 #define PREAMBLE
87 #endif
88
89 #ifndef XCC
90 #define XCC xcc
91 #endif
92
/* Tell the assembler %g2/%g3 are used as scratch (required for the
 * application registers under the V9 ABI).
 */
93	.register	%g2,#scratch
94	.register	%g3,#scratch
95
96	.text
/* Cache-line align the entry point. */
97	.align		64
98
/* void *FUNC_NAME(void *dst, const void *src, size_t len)
 *
 * In:   %o0 = dst, %o1 = src, %o2 = len
 * Out:  %o0 = original dst (saved in %o3, returned via EX_RETVAL)
 * Strategy by size: len==0 -> exit; <=3 -> .Ltiny; <=19 -> .Lsmall;
 * <0x80 -> .Lmedium; otherwise the .Llarge block-copy paths.
 */
99	.globl	FUNC_NAME
100	.type	FUNC_NAME,#function
101 FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
102 #ifdef MEMCPY_DEBUG
103	wr		%g0, 0x80, %asi		/* debug stores go through %asi = ASI_P */
104 #endif
105	srlx		%o2, 31, %g2
106	cmp		%g2, 0
/* Trap (software trap 5) if any bit >= 31 of len is set, i.e. a
 * negative or >= 2GB length was passed in.
 */
107	tne		%XCC, 5
108	PREAMBLE
109	mov		%o0, %o3		/* save dst: it is the return value */
110	brz,pn		%o2, .Lexit		/* len == 0 */
111	 cmp		%o2, 3
112	ble,pn		%icc, .Ltiny		/* len <= 3 */
113	 cmp		%o2, 19
114	ble,pn		%icc, .Lsmall		/* len <= 19 */
115	 or		%o0, %o1, %g2		/* %g2 = dst|src for later alignment tests */
116	cmp		%o2, 128
117	bl,pn		%icc, .Lmedium		/* len < 0x80 */
118	 nop
119
120 .Llarge:/* len >= 0x80 */
121	/* First get dest 8 byte aligned.  */
122	sub		%g0, %o0, %g1		/* %g1 = (-dst) & 7 = bytes to 8-align dst */
123	and		%g1, 0x7, %g1
124	brz,pt		%g1, 51f
125	 sub		%o2, %g1, %o2		/* account for the alignment bytes */
126
/* Byte loop copying %g1 bytes until dst is 8-byte aligned. */
127 1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
128	add		%o1, 1, %o1
129	subcc		%g1, 1, %g1
130	add		%o0, 1, %o0
131	bne,pt		%icc, 1b
132	 EX_ST(STORE(stb, %g2, %o0 - 0x01))	/* delay slot: store the byte just loaded */
133
/* Prime eight prefetch streams covering the next 0x200 bytes of src. */
134 51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
135	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
136	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
137	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
138	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
139	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
140	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
141	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
142
143	/* Check if we can use the straight fully aligned
144	 * loop, or we require the alignaddr/faligndata variant.
145	 */
146	andcc		%o1, 0x7, %o5		/* src 8-byte aligned too? */
147	bne,pn		%icc, .Llarge_src_unaligned
148	 sub		%g0, %o0, %g1		/* delay slot: %g1 = -dst for 64B alignment below */
149
150	/* Legitimize the use of initializing stores by getting dest
151	 * to be 64-byte aligned.
152	 */
153	and		%g1, 0x3f, %g1		/* bytes needed to 64-align dst */
154	brz,pt		%g1, .Llarge_aligned
155	 sub		%o2, %g1, %o2
156
/* 8-byte loop copying until dst is 64-byte aligned (dst and src are
 * both 8-byte aligned here).
 */
157 1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
158	add		%o1, 8, %o1
159	subcc		%g1, 8, %g1
160	add		%o0, 8, %o0
161	bne,pt		%icc, 1b
162	 EX_ST(STORE(stx, %g2, %o0 - 0x08))
163
164 .Llarge_aligned:
165	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
166	andn		%o2, 0x3f, %o4		/* %o4 = bulk length, multiple of 64 */
167	sub		%o2, %o4, %o2		/* %o2 = tail (< 64 bytes) */
168
/* Main loop: 64 bytes per iteration.  Loads are interleaved ahead of
 * the initializing stores; STORE_INIT (stxa via STORE_ASI) avoids
 * fetching the destination lines, which is why dst had to be 64-byte
 * aligned, and why the membar below is required afterwards.
 */
169 1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
170	add		%o1, 0x40, %o1
171	EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
172	subcc		%o4, 0x40, %o4
173	EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
174	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
175	EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
176	EX_ST(STORE_INIT(%g1, %o0))
177	add		%o0, 0x08, %o0
178	EX_ST(STORE_INIT(%g2, %o0))
179	add		%o0, 0x08, %o0
180	EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
181	EX_ST(STORE_INIT(%g3, %o0))
182	add		%o0, 0x08, %o0
183	EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
184	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
185	add		%o0, 0x08, %o0
186	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
187	EX_ST(STORE_INIT(%o5, %o0))
188	add		%o0, 0x08, %o0
189	EX_ST(STORE_INIT(%g2, %o0))
190	add		%o0, 0x08, %o0
191	EX_ST(STORE_INIT(%g3, %o0))
192	add		%o0, 0x08, %o0
193	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
194	add		%o0, 0x08, %o0
195	bne,pt		%icc, 1b		/* condition codes set by the subcc above */
196	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
197
/* Order the initializing stores against subsequent memory operations. */
198	membar		#StoreLoad | #StoreStore
199
/* Dispatch the remaining tail (< 64 bytes). */
200	brz,pn		%o2, .Lexit
201	 cmp		%o2, 19
202	ble,pn		%icc, .Lsmall_unaligned
203	 nop
204	ba,a,pt		%icc, .Lmedium_noprefetch
205
206 .Lexit:	retl
207	 mov		EX_RETVAL(%o3), %o0	/* return the original dst */
208
/* Large copy, src not co-aligned with dst: use VIS alignaddr/faligndata
 * to merge pairs of aligned doublewords.  Only the "Half" (lower) FP
 * register set is saved/restored.
 */
209 .Llarge_src_unaligned:
210 #ifdef NON_USER_COPY
211	VISEntryHalfFast(.Lmedium_vis_entry_fail)
212 #else
213	VISEntryHalf
214 #endif
215	andn		%o2, 0x3f, %o4		/* %o4 = bulk length, multiple of 64 */
216	sub		%o2, %o4, %o2		/* %o2 = tail */
217	alignaddr	%o1, %g0, %g1		/* %g1 = src & ~7; GSR.align = src & 7 */
218	add		%o1, %o4, %o1		/* advance src past the bulk region now */
/* %f0 carries the previous doubleword across iterations so each
 * faligndata has both halves of the unaligned source window.
 */
219	EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0))
220 1:	EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2))
221	subcc		%o4, 0x40, %o4
222	EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4))
223	EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6))
224	EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8))
225	EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10))
226	EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12))
227	EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14))
228	faligndata	%f0, %f2, %f16
229	EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0))	/* preload next iteration's carry */
230	faligndata	%f2, %f4, %f18
231	add		%g1, 0x40, %g1
232	faligndata	%f4, %f6, %f20
233	faligndata	%f6, %f8, %f22
234	faligndata	%f8, %f10, %f24
235	faligndata	%f10, %f12, %f26
236	faligndata	%f12, %f14, %f28
237	faligndata	%f14, %f0, %f30
238	EX_ST_FP(STORE(std, %f16, %o0 + 0x00))
239	EX_ST_FP(STORE(std, %f18, %o0 + 0x08))
240	EX_ST_FP(STORE(std, %f20, %o0 + 0x10))
241	EX_ST_FP(STORE(std, %f22, %o0 + 0x18))
242	EX_ST_FP(STORE(std, %f24, %o0 + 0x20))
243	EX_ST_FP(STORE(std, %f26, %o0 + 0x28))
244	EX_ST_FP(STORE(std, %f28, %o0 + 0x30))
245	EX_ST_FP(STORE(std, %f30, %o0 + 0x38))
246	add		%o0, 0x40, %o0
247	bne,pt		%icc, 1b
248	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
249 #ifdef NON_USER_COPY
250	VISExitHalfFast
251 #else
252	VISExitHalf
253 #endif
/* Dispatch the remaining tail (< 64 bytes, src still unaligned). */
254	brz,pn		%o2, .Lexit
255	 cmp		%o2, 19
256	ble,pn		%icc, .Lsmall_unaligned
257	 nop
258	ba,a,pt		%icc, .Lmedium_unaligned
259
/* VISEntryHalfFast could not grab the FPU: fall back to the integer
 * medium path.  Recompute %g2 = dst|src, which .Lmedium's alignment
 * test needs.
 */
260 #ifdef NON_USER_COPY
261 .Lmedium_vis_entry_fail:
262	 or		%o0, %o1, %g2
263 #endif
264 .Lmedium:
265	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
266	andcc		%g2, 0x7, %g0		/* dst and src both 8-byte aligned? */
267	bne,pn		%icc, .Lmedium_unaligned
268	 nop
/* Medium copy, both pointers 8-byte aligned: 32 bytes per iteration. */
269 .Lmedium_noprefetch:
270	andncc		%o2, 0x20 - 1, %o5	/* %o5 = len rounded down to 32 */
271	be,pn		%icc, 2f
272	 sub		%o2, %o5, %o2		/* %o2 = remainder (< 32) */
273 1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
274	EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
275	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
276	EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
277	add		%o1, 0x20, %o1
278	subcc		%o5, 0x20, %o5
279	EX_ST(STORE(stx, %g1, %o0 + 0x00))
280	EX_ST(STORE(stx, %g2, %o0 + 0x08))
281	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
282	EX_ST(STORE(stx, %o4, %o0 + 0x18))
283	bne,pt		%icc, 1b
284	 add		%o0, 0x20, %o0
/* Then up to three 8-byte chunks (len & 0x18)... */
285 2:	andcc		%o2, 0x18, %o5
286	be,pt		%icc, 3f
287	 sub		%o2, %o5, %o2
288 1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
289	add		%o1, 0x08, %o1
290	add		%o0, 0x08, %o0
291	subcc		%o5, 0x08, %o5
292	bne,pt		%icc, 1b
293	 EX_ST(STORE(stx, %g1, %o0 - 0x08))
/* ...then one 4-byte word if len >= 4, and finally .Ltiny for 1-3. */
294 3:	brz,pt		%o2, .Lexit
295	 cmp		%o2, 0x04
296	bl,pn		%icc, .Ltiny
297	 nop
298	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
299	add		%o1, 0x04, %o1
300	add		%o0, 0x04, %o0
301	subcc		%o2, 0x04, %o2
302	bne,pn		%icc, .Ltiny
303	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
304	ba,a,pt		%icc, .Lexit
/* Medium copy with unaligned src: integer shift-and-merge.  Each dst
 * doubleword is assembled from two consecutive aligned src doublewords.
 */
305 .Lmedium_unaligned:
306	/* First get dest 8 byte aligned. */
307	sub		%g0, %o0, %g1		/* %g1 = (-dst) & 7 */
308	and		%g1, 0x7, %g1
309	brz,pt		%g1, 2f
310	 sub		%o2, %g1, %o2
311
312 1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
313	add		%o1, 1, %o1
314	subcc		%g1, 1, %g1
315	add		%o0, 1, %o0
316	bne,pt		%icc, 1b
317	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
318 2:
319	and		%o1, 0x7, %g1		/* src misalignment in bytes */
320	brz,pn		%g1, .Lmedium_noprefetch	/* became co-aligned: use fast path */
321	 sll		%g1, 3, %g1		/* %g1 = left-shift count in bits */
322	mov		64, %g2
323	sub		%g2, %g1, %g2		/* %g2 = complementary right-shift count */
324	andn		%o1, 0x7, %o1		/* round src down to 8-byte boundary */
325	EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
326	sllx		%o4, %g1, %o4		/* %o4 = high part carried into the loop */
327	andn		%o2, 0x08 - 1, %o5	/* %o5 = bulk length, multiple of 8 */
328	sub		%o2, %o5, %o2
/* Merge loop: out = (prev << %g1) | (next >> %g2), 8 bytes per pass. */
329 1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
330	add		%o1, 0x08, %o1
331	subcc		%o5, 0x08, %o5
332	srlx		%g3, %g2, GLOBAL_SPARE
333	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE
334	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
335	add		%o0, 0x08, %o0
336	bne,pt		%icc, 1b
337	 sllx		%g3, %g1, %o4		/* delay slot: carry for next iteration */
338	srl		%g1, 3, %g1		/* back from bits to bytes */
339	add		%o1, %g1, %o1		/* restore src to its true (unaligned) position */
340	brz,pn		%o2, .Lexit
341	 nop
342	ba,pt		%icc, .Lsmall_unaligned
343
/* Copy the final 1-3 bytes; %o2 in [1,3] on entry. */
344 .Ltiny:
345	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
346	subcc		%o2, 1, %o2
347	be,pn		%icc, .Lexit
348	 EX_ST(STORE(stb, %g1, %o0 + 0x00))
349	EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
350	subcc		%o2, 1, %o2
351	be,pn		%icc, .Lexit
352	 EX_ST(STORE(stb, %g1, %o0 + 0x01))
353	EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
354	ba,pt		%icc, .Lexit
355	 EX_ST(STORE(stb, %g1, %o0 + 0x02))
356
/* Small copy (4-19 bytes): word loop when dst|src (%g2, set at entry)
 * is 4-byte aligned, remainder via .Ltiny; otherwise byte loop.
 */
357 .Lsmall:
358	andcc		%g2, 0x3, %g0
359	bne,pn		%icc, .Lsmall_unaligned
360	 andn		%o2, 0x4 - 1, %o5	/* %o5 = len rounded down to 4 */
361	sub		%o2, %o5, %o2
362 1:
363	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
364	add		%o1, 0x04, %o1
365	subcc		%o5, 0x04, %o5
366	add		%o0, 0x04, %o0
367	bne,pt		%icc, 1b
368	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
369	brz,pt		%o2, .Lexit
370	 nop
371	ba,a,pt		%icc, .Ltiny
372
/* Byte-at-a-time fallback; %o2 > 0 on entry. */
373 .Lsmall_unaligned:
374 1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
375	add		%o1, 1, %o1
376	add		%o0, 1, %o0
377	subcc		%o2, 1, %o2
378	bne,pt		%icc, 1b
379	 EX_ST(STORE(stb, %g1, %o0 - 0x01))
380	ba,a,pt		%icc, .Lexit
381	.size		FUNC_NAME, .-FUNC_NAME