/* NG4memcpy.S: Niagara-4 optimized memcpy.
 *
 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF  0x04

/* On T4 it is very expensive to access ASRs like %fprs and
 * %asi, avoiding a read or a write can save ~50 cycles.
 */
#define FPU_ENTER			\
	rd	%fprs, %o5;		\
	andcc	%o5, FPRS_FEF, %g0;	\
	be,a,pn	%icc, 999f;		\
	wr	%g0, FPRS_FEF, %fprs;	\
	999:
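/* Note: FPU_ENTER only writes %fprs when FPRS_FEF is not already set,
 * so the common case avoids the costly ASR write described above.
 */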

#ifdef MEMCPY_DEBUG
#define VISEntryHalf FPU_ENTER; \
	clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf FPU_ENTER
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif

#define GLOBAL_SPARE	%g5
#endif

#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif
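/* With no EX_LD/EX_ST wrappers supplied, this builds as a plain memcpy:
 * the defaults below expand to bare loads and stores with no fault handling.
 */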

#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)	type src, [addr]
#else
#define STORE(type,src,addr)	type##a src, [addr] %asi
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NG4memcpy
#endif
#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2, #scratch
	.register	%g3, #scratch

	.text
	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
#ifdef MEMCPY_DEBUG
	wr		%g0, 0x80, %asi
#endif
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%XCC, 5
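	/* Sanity check: trap if the length has any bits set at or above
	 * bit 31 (e.g. a negative size passed by the caller).
	 */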
	PREAMBLE
	mov		%o0, %o3
	brz,pn		%o2, .Lexit
	cmp		%o2, 3
	ble,pn		%icc, .Ltiny
	cmp		%o2, 19
	ble,pn		%icc, .Lsmall
	or		%o0, %o1, %g2
	cmp		%o2, 128
	bl,pn		%icc, .Lmedium
	nop
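	/* Dispatch on length: 0 exits, 1-3 goes to .Ltiny, 4-19 to .Lsmall,
	 * 20-127 to .Lmedium, and 128+ falls through to the bulk copy below.
	 * %g2 carries dst|src so the small/medium paths can test combined
	 * alignment in one step.
	 */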

.Llarge:/* len >= 0x80 */
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 51f
	sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	EX_ST(STORE(stb, %g2, %o0 - 0x01))

51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
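	/* Prime the caches: prefetch roughly the next 512 bytes of source
	 * before entering the main loop.
	 */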

	/* Check if we can use the straight fully aligned
	 * loop, or we require the alignaddr/faligndata variant.
	 */
	andcc		%o1, 0x7, %o5
	bne,pn		%icc, .Llarge_src_unaligned
	sub		%g0, %o0, %g1

	/* Legitimize the use of initializing stores by getting dest
	 * to be 64-byte aligned.
	 */
	and		%g1, 0x3f, %g1
	brz,pt		%g1, .Llarge_aligned
	sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
	add		%o1, 8, %o1
	subcc		%g1, 8, %g1
	add		%o0, 8, %o0
	bne,pt		%icc, 1b
	EX_ST(STORE(stx, %g2, %o0 - 0x08))

.Llarge_aligned:
	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add		%o1, 0x40, %o1
	EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
	subcc		%o4, 0x40, %o4
	EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
	EX_ST(STORE_INIT(%g1, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
	EX_ST(STORE_INIT(%g3, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
	EX_ST(STORE_INIT(%o5, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g3, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

	membar		#StoreLoad | #StoreStore
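	/* The block-init stores above need an explicit barrier before the
	 * remaining tail is copied with ordinary loads and stores.
	 */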

	brz,pn		%o2, .Lexit
	cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	nop
	ba,a,pt		%icc, .Lmedium_noprefetch

.Lexit:	retl
	mov		EX_RETVAL(%o3), %o0

.Llarge_src_unaligned:
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail)
#else
	VISEntryHalf
#endif
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2
	alignaddr	%o1, %g0, %g1
	add		%o1, %o4, %o1
	EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
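	/* Software-pipelined unaligned loop: alignaddr above rounded the
	 * source down into %g1 and latched the byte offset in %gsr; %f0
	 * always holds the previously loaded dword, and each faligndata
	 * below merges two adjacent dwords into one aligned 8-byte result.
	 */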
1:	EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
	subcc		%o4, 0x40, %o4
	EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
	EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
	EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
	EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
	EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
	EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
	faligndata	%f0, %f2, %f16
	EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
	faligndata	%f2, %f4, %f18
	add		%g1, 0x40, %g1
	faligndata	%f4, %f6, %f20
	faligndata	%f6, %f8, %f22
	faligndata	%f8, %f10, %f24
	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST(STORE(std, %f16, %o0 + 0x00))
	EX_ST(STORE(std, %f18, %o0 + 0x08))
	EX_ST(STORE(std, %f20, %o0 + 0x10))
	EX_ST(STORE(std, %f22, %o0 + 0x18))
	EX_ST(STORE(std, %f24, %o0 + 0x20))
	EX_ST(STORE(std, %f26, %o0 + 0x28))
	EX_ST(STORE(std, %f28, %o0 + 0x30))
	EX_ST(STORE(std, %f30, %o0 + 0x38))
	add		%o0, 0x40, %o0
	bne,pt		%icc, 1b
	LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	brz,pn		%o2, .Lexit
	cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	nop
	ba,a,pt		%icc, .Lmedium_unaligned

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail:
	or		%o0, %o1, %g2
#endif
.Lmedium:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc		%g2, 0x7, %g0
	bne,pn		%icc, .Lmedium_unaligned
	nop
.Lmedium_noprefetch:
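	/* Both pointers are 8-byte aligned here: copy 32 bytes per
	 * iteration, then up to 24 bytes in 8-byte steps, then one word,
	 * and finally hand the last 1-3 bytes to .Ltiny.
	 */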
	andncc		%o2, 0x20 - 1, %o5
	be,pn		%icc, 2f
	sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
	add		%o1, 0x20, %o1
	subcc		%o5, 0x20, %o5
	EX_ST(STORE(stx, %g1, %o0 + 0x00))
	EX_ST(STORE(stx, %g2, %o0 + 0x08))
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
	EX_ST(STORE(stx, %o4, %o0 + 0x18))
	bne,pt		%icc, 1b
	add		%o0, 0x20, %o0
2:	andcc		%o2, 0x18, %o5
	be,pt		%icc, 3f
	sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add		%o1, 0x08, %o1
	add		%o0, 0x08, %o0
	subcc		%o5, 0x08, %o5
	bne,pt		%icc, 1b
	EX_ST(STORE(stx, %g1, %o0 - 0x08))
3:	brz,pt		%o2, .Lexit
	cmp		%o2, 0x04
	bl,pn		%icc, .Ltiny
	nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add		%o1, 0x04, %o1
	add		%o0, 0x04, %o0
	subcc		%o2, 0x04, %o2
	bne,pn		%icc, .Ltiny
	EX_ST(STORE(stw, %g1, %o0 - 0x04))
	ba,a,pt		%icc, .Lexit
.Lmedium_unaligned:
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 2f
	sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	EX_ST(STORE(stb, %g2, %o0 - 0x01))
2:
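	/* If the source also ended up 8-byte aligned, fall back to the
	 * aligned medium loop.  Otherwise read aligned dwords and merge
	 * adjacent pairs with shifts: %g1 is the misalignment in bits,
	 * %g2 is 64 - %g1; %o4 carries the leading bytes of the next
	 * result in its upper bits, and each iteration shifts the
	 * following aligned dword right by %g2 to supply the trailing bytes.
	 */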
	and		%o1, 0x7, %g1
	brz,pn		%g1, .Lmedium_noprefetch
	sll		%g1, 3, %g1
	mov		64, %g2
	sub		%g2, %g1, %g2
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
	sllx		%o4, %g1, %o4
	andn		%o2, 0x08 - 1, %o5
	sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
	add		%o1, 0x08, %o1
	subcc		%o5, 0x08, %o5
	srlx		%g3, %g2, GLOBAL_SPARE
	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	sllx		%g3, %g1, %o4
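	/* Convert the shift count back from bits to bytes and advance %o1
	 * to the true (unaligned) source position for the tail copy.
	 */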
	srl		%g1, 3, %g1
	add		%o1, %g1, %o1
	brz,pn		%o2, .Lexit
	nop
	ba,pt		%icc, .Lsmall_unaligned

.Ltiny:
	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	EX_ST(STORE(stb, %g1, %o0 + 0x00))
	EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	EX_ST(STORE(stb, %g1, %o0 + 0x01))
	EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
	ba,pt		%icc, .Lexit
	EX_ST(STORE(stb, %g1, %o0 + 0x02))

.Lsmall:
	andcc		%g2, 0x3, %g0
	bne,pn		%icc, .Lsmall_unaligned
	andn		%o2, 0x4 - 1, %o5
	sub		%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add		%o1, 0x04, %o1
	subcc		%o5, 0x04, %o5
	add		%o0, 0x04, %o0
	bne,pt		%icc, 1b
	EX_ST(STORE(stw, %g1, %o0 - 0x04))
	brz,pt		%o2, .Lexit
	nop
	ba,a,pt		%icc, .Ltiny

.Lsmall_unaligned:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	add		%o1, 1, %o1
	add		%o0, 1, %o0
	subcc		%o2, 1, %o2
	bne,pt		%icc, 1b
	EX_ST(STORE(stb, %g1, %o0 - 0x01))
	ba,a,pt		%icc, .Lexit
	.size		FUNC_NAME, .-FUNC_NAME