]>
git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blob - arch/parisc/lib/memcpy.c
/*
 * Optimized memory copy routines.
 *
 * Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Portions derived from the GNU C Library
 * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are tried to try to get the best performance for various
 * conditions. In the optimal case, we copy 64-bytes in an unrolled loop using
 * fp regs. This is followed by loops that copy 32- or 16-bytes at a time using
 * general registers. Unaligned copies are handled either by aligning the
 * destination and then using shift-and-write method, or in a few cases by
 * falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly optimal. Unfortunately some of the
 * semantics of the copy routine (exception handling) is difficult to express
 * in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version
 * of memcpy (written in C) is actually quite fast already. This routine is
 * able to beat it by 30-40% for aligned copies because of the loop unrolling,
 * but in some cases the glibc version is still slightly faster. This lends
 * more credibility that gcc can generate very good code as long as we are
 * careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create additional
 *   interlocks
 * - replace byte-copy loops with stybs sequences
 */
56 #include <linux/config.h>
57 #include <linux/module.h>
58 #include <linux/compiler.h>
59 #include <asm/uaccess.h>
60 #define s_space "%%sr1"
61 #define d_space "%%sr2"
64 #define s_space "%%sr0"
65 #define d_space "%%sr0"
66 #define pa_memcpy new2_copy
69 DECLARE_PER_CPU ( struct exception_data
, exception_data
);
71 #define preserve_branch(label) do { \
73 /* The following branch is never taken, it's just here to */ \
74 /* prevent gcc from optimizing away our exception code. */ \
75 if (unlikely(dummy != dummy)) \
/*
 * Space id to use for user-side accesses: 0 when the current fs segment is
 * KERNEL_DS, otherwise the value read by mfsp(3).
 */
#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
/* Space id to use for kernel-side accesses: always 0. */
#define get_kernel_space() (0)
82 #define MERGE(w0, sh_1, w1, sh_2) ({ \
86 "shrpw %1, %2, %%sar, %0 \n " \
88 : "r" (w0), "r" (w1), "r" (sh_2) \
95 #define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s " , __FILE__, __LINE__, __FUNCTION__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
97 #define DPRINTF(fmt, args...)
101 #define EXC_WORD ".word"
103 #define EXC_WORD ".dword"
106 #define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \
107 __asm__ __volatile__ ( \
108 "1: \t " #_insn ",ma " #_sz "(" _s ",%1), %0 \n " \
109 " \t .section __ex_table, \" aw \"\n " \
110 " \t " EXC_WORD " \t 1b \n " \
111 " \t " EXC_WORD " \t " #_e " \n " \
113 : _tt(_t), "+r" (_a) \
117 #define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \
118 __asm__ __volatile__ ( \
119 "1: \t " #_insn ",ma %1, " #_sz "(" _s ",%0) \n " \
120 " \t .section __ex_table, \" aw \"\n " \
121 " \t " EXC_WORD " \t 1b \n " \
122 " \t " EXC_WORD " \t " #_e " \n " \
128 #define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1, "=r" ,_s,_a,_t,_e)
129 #define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1, "r" ,_s,_a,_t,_e)
130 #define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4, "=r" ,_s,_a,_t,_e)
131 #define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4, "r" ,_s,_a,_t,_e)
132 #define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8, "=f" ,_s,_a,_t,_e)
133 #define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8, "f" ,_s,_a,_t,_e)
135 #define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e) \
136 __asm__ __volatile__ ( \
137 "1: \t " #_insn " " #_o "(" _s ",%1), %0 \n " \
138 " \t .section __ex_table, \" aw \"\n " \
139 " \t " EXC_WORD " \t 1b \n " \
140 " \t " EXC_WORD " \t " #_e " \n " \
146 #define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e) \
147 __asm__ __volatile__ ( \
148 "1: \t " #_insn " %0, " #_o "(" _s ",%1) \n " \
149 " \t .section __ex_table, \" aw \"\n " \
150 " \t " EXC_WORD " \t 1b \n " \
151 " \t " EXC_WORD " \t " #_e " \n " \
157 #define ldw(_s,_o,_a,_t,_e) def_load_insn(ldw, "=r" ,_s,_o,_a,_t,_e)
158 #define stw(_s,_t,_o,_a,_e) def_store_insn(stw, "r" ,_s,_t,_o,_a,_e)
/*
 * Optional prefetch helpers.
 *
 * With CONFIG_PREFETCH defined, each helper issues a single load from addr
 * whose target is %r0 (a word load through s_space for the source, a
 * doubleword load through d_space for the destination); the loaded value is
 * not used. Without CONFIG_PREFETCH both names expand to nothing, so calls
 * compile away entirely.
 *
 * The helpers are static inline rather than extern inline so that no
 * undefined external reference is left behind if the compiler decides not to
 * inline a call.
 */
#ifdef CONFIG_PREFETCH
static inline void prefetch_src(const void *addr)
{
	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

static inline void prefetch_dst(const void *addr)
{
	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr)
#define prefetch_dst(addr)
#endif
/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
 * per loop. This code is derived from glibc.
 */
178 static inline unsigned long copy_dstaligned ( unsigned long dst
, unsigned long src
, unsigned long len
, unsigned long o_dst
, unsigned long o_src
, unsigned long o_len
)
180 /* gcc complains that a2 and a3 may be uninitialized, but actually
181 * they cannot be. Initialize a2/a3 to shut gcc up.
183 register unsigned int a0
, a1
, a2
= 0 , a3
= 0 ;
185 struct exception_data
* d
;
187 /* prefetch_src((const void *)src); */
189 /* Calculate how to shift a word read at the memory operation
190 aligned srcp to make it aligned for copy. */
191 sh_1
= 8 * ( src
% sizeof ( unsigned int ));
192 sh_2
= 8 * sizeof ( unsigned int ) - sh_1
;
194 /* Make src aligned by rounding it down. */
195 src
&= - sizeof ( unsigned int );
200 /* a1 = ((unsigned int *) src)[0];
201 a2 = ((unsigned int *) src)[1]; */
202 ldw ( s_space
, 0 , src
, a1
, cda_ldw_exc
);
203 ldw ( s_space
, 4 , src
, a2
, cda_ldw_exc
);
204 src
-= 1 * sizeof ( unsigned int );
205 dst
-= 3 * sizeof ( unsigned int );
209 /* a0 = ((unsigned int *) src)[0];
210 a1 = ((unsigned int *) src)[1]; */
211 ldw ( s_space
, 0 , src
, a0
, cda_ldw_exc
);
212 ldw ( s_space
, 4 , src
, a1
, cda_ldw_exc
);
213 src
-= 0 * sizeof ( unsigned int );
214 dst
-= 2 * sizeof ( unsigned int );
220 /* a3 = ((unsigned int *) src)[0];
221 a0 = ((unsigned int *) src)[1]; */
222 ldw ( s_space
, 0 , src
, a3
, cda_ldw_exc
);
223 ldw ( s_space
, 4 , src
, a0
, cda_ldw_exc
);
224 src
-=- 1 * sizeof ( unsigned int );
225 dst
-= 1 * sizeof ( unsigned int );
229 /* a2 = ((unsigned int *) src)[0];
230 a3 = ((unsigned int *) src)[1]; */
231 ldw ( s_space
, 0 , src
, a2
, cda_ldw_exc
);
232 ldw ( s_space
, 4 , src
, a3
, cda_ldw_exc
);
233 src
-=- 2 * sizeof ( unsigned int );
234 dst
-= 0 * sizeof ( unsigned int );
238 goto do4
; /* No-op. */
243 /* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
245 /* a0 = ((unsigned int *) src)[0]; */
246 ldw ( s_space
, 0 , src
, a0
, cda_ldw_exc
);
247 /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
248 stw ( d_space
, MERGE ( a2
, sh_1
, a3
, sh_2
), 0 , dst
, cda_stw_exc
);
250 /* a1 = ((unsigned int *) src)[1]; */
251 ldw ( s_space
, 4 , src
, a1
, cda_ldw_exc
);
252 /* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
253 stw ( d_space
, MERGE ( a3
, sh_1
, a0
, sh_2
), 4 , dst
, cda_stw_exc
);
255 /* a2 = ((unsigned int *) src)[2]; */
256 ldw ( s_space
, 8 , src
, a2
, cda_ldw_exc
);
257 /* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
258 stw ( d_space
, MERGE ( a0
, sh_1
, a1
, sh_2
), 8 , dst
, cda_stw_exc
);
260 /* a3 = ((unsigned int *) src)[3]; */
261 ldw ( s_space
, 12 , src
, a3
, cda_ldw_exc
);
262 /* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
263 stw ( d_space
, MERGE ( a1
, sh_1
, a2
, sh_2
), 12 , dst
, cda_stw_exc
);
265 src
+= 4 * sizeof ( unsigned int );
266 dst
+= 4 * sizeof ( unsigned int );
272 /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
273 stw ( d_space
, MERGE ( a2
, sh_1
, a3
, sh_2
), 0 , dst
, cda_stw_exc
);
275 preserve_branch ( handle_load_error
);
276 preserve_branch ( handle_store_error
);
281 __asm__
__volatile__ ( "cda_ldw_exc: \n " );
282 d
= & __get_cpu_var ( exception_data
);
283 DPRINTF ( "cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu \n " ,
284 o_len
, d
-> fault_addr
, o_src
, o_len
- d
-> fault_addr
+ o_src
);
285 return o_len
* 4 - d
-> fault_addr
+ o_src
;
288 __asm__
__volatile__ ( "cda_stw_exc: \n " );
289 d
= & __get_cpu_var ( exception_data
);
290 DPRINTF ( "cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu \n " ,
291 o_len
, d
-> fault_addr
, o_dst
, o_len
- d
-> fault_addr
+ o_dst
);
292 return o_len
* 4 - d
-> fault_addr
+ o_dst
;
/* Returns 0 for success; otherwise returns the number of bytes not transferred. */
297 unsigned long pa_memcpy ( void * dstp
, const void * srcp
, unsigned long len
)
299 register unsigned long src
, dst
, t1
, t2
, t3
;
300 register unsigned char * pcs
, * pcd
;
301 register unsigned int * pws
, * pwd
;
302 register double * pds
, * pdd
;
303 unsigned long ret
= 0 ;
304 unsigned long o_dst
, o_src
, o_len
;
305 struct exception_data
* d
;
307 src
= ( unsigned long ) srcp
;
308 dst
= ( unsigned long ) dstp
;
309 pcs
= ( unsigned char *) srcp
;
310 pcd
= ( unsigned char *) dstp
;
312 o_dst
= dst
; o_src
= src
; o_len
= len
;
314 /* prefetch_src((const void *)srcp); */
319 /* Check alignment */
321 if ( unlikely ( t1
& ( sizeof ( double )- 1 )))
324 /* src and dst have same alignment. */
326 /* Copy bytes till we are double-aligned. */
327 t2
= src
& ( sizeof ( double ) - 1 );
328 if ( unlikely ( t2
!= 0 )) {
329 t2
= sizeof ( double ) - t2
;
331 /* *pcd++ = *pcs++; */
332 ldbma ( s_space
, pcs
, t3
, pmc_load_exc
);
334 stbma ( d_space
, t3
, pcd
, pmc_store_exc
);
342 /* Copy 8 doubles at a time */
343 while ( len
>= 8 * sizeof ( double )) {
344 register double r1
, r2
, r3
, r4
, r5
, r6
, r7
, r8
;
345 /* prefetch_src((char *)pds + L1_CACHE_BYTES); */
346 flddma ( s_space
, pds
, r1
, pmc_load_exc
);
347 flddma ( s_space
, pds
, r2
, pmc_load_exc
);
348 flddma ( s_space
, pds
, r3
, pmc_load_exc
);
349 flddma ( s_space
, pds
, r4
, pmc_load_exc
);
350 fstdma ( d_space
, r1
, pdd
, pmc_store_exc
);
351 fstdma ( d_space
, r2
, pdd
, pmc_store_exc
);
352 fstdma ( d_space
, r3
, pdd
, pmc_store_exc
);
353 fstdma ( d_space
, r4
, pdd
, pmc_store_exc
);
356 if ( L1_CACHE_BYTES
<= 32 )
357 prefetch_src (( char *) pds
+ L1_CACHE_BYTES
);
359 flddma ( s_space
, pds
, r5
, pmc_load_exc
);
360 flddma ( s_space
, pds
, r6
, pmc_load_exc
);
361 flddma ( s_space
, pds
, r7
, pmc_load_exc
);
362 flddma ( s_space
, pds
, r8
, pmc_load_exc
);
363 fstdma ( d_space
, r5
, pdd
, pmc_store_exc
);
364 fstdma ( d_space
, r6
, pdd
, pmc_store_exc
);
365 fstdma ( d_space
, r7
, pdd
, pmc_store_exc
);
366 fstdma ( d_space
, r8
, pdd
, pmc_store_exc
);
367 len
-= 8 * sizeof ( double );
370 pws
= ( unsigned int *) pds
;
371 pwd
= ( unsigned int *) pdd
;
374 while ( len
>= 8 * sizeof ( unsigned int )) {
375 register unsigned int r1
, r2
, r3
, r4
, r5
, r6
, r7
, r8
;
376 /* prefetch_src((char *)pws + L1_CACHE_BYTES); */
377 ldwma ( s_space
, pws
, r1
, pmc_load_exc
);
378 ldwma ( s_space
, pws
, r2
, pmc_load_exc
);
379 ldwma ( s_space
, pws
, r3
, pmc_load_exc
);
380 ldwma ( s_space
, pws
, r4
, pmc_load_exc
);
381 stwma ( d_space
, r1
, pwd
, pmc_store_exc
);
382 stwma ( d_space
, r2
, pwd
, pmc_store_exc
);
383 stwma ( d_space
, r3
, pwd
, pmc_store_exc
);
384 stwma ( d_space
, r4
, pwd
, pmc_store_exc
);
386 ldwma ( s_space
, pws
, r5
, pmc_load_exc
);
387 ldwma ( s_space
, pws
, r6
, pmc_load_exc
);
388 ldwma ( s_space
, pws
, r7
, pmc_load_exc
);
389 ldwma ( s_space
, pws
, r8
, pmc_load_exc
);
390 stwma ( d_space
, r5
, pwd
, pmc_store_exc
);
391 stwma ( d_space
, r6
, pwd
, pmc_store_exc
);
392 stwma ( d_space
, r7
, pwd
, pmc_store_exc
);
393 stwma ( d_space
, r8
, pwd
, pmc_store_exc
);
394 len
-= 8 * sizeof ( unsigned int );
397 while ( len
>= 4 * sizeof ( unsigned int )) {
398 register unsigned int r1
, r2
, r3
, r4
;
399 ldwma ( s_space
, pws
, r1
, pmc_load_exc
);
400 ldwma ( s_space
, pws
, r2
, pmc_load_exc
);
401 ldwma ( s_space
, pws
, r3
, pmc_load_exc
);
402 ldwma ( s_space
, pws
, r4
, pmc_load_exc
);
403 stwma ( d_space
, r1
, pwd
, pmc_store_exc
);
404 stwma ( d_space
, r2
, pwd
, pmc_store_exc
);
405 stwma ( d_space
, r3
, pwd
, pmc_store_exc
);
406 stwma ( d_space
, r4
, pwd
, pmc_store_exc
);
407 len
-= 4 * sizeof ( unsigned int );
410 pcs
= ( unsigned char *) pws
;
411 pcd
= ( unsigned char *) pwd
;
415 /* *pcd++ = *pcs++; */
416 ldbma ( s_space
, pcs
, t3
, pmc_load_exc
);
417 stbma ( d_space
, t3
, pcd
, pmc_store_exc
);
424 /* possibly we are aligned on a word, but not on a double... */
425 if ( likely ( t1
& ( sizeof ( unsigned int )- 1 )) == 0 ) {
426 t2
= src
& ( sizeof ( unsigned int ) - 1 );
428 if ( unlikely ( t2
!= 0 )) {
429 t2
= sizeof ( unsigned int ) - t2
;
431 /* *pcd++ = *pcs++; */
432 ldbma ( s_space
, pcs
, t3
, pmc_load_exc
);
433 stbma ( d_space
, t3
, pcd
, pmc_store_exc
);
439 pws
= ( unsigned int *) pcs
;
440 pwd
= ( unsigned int *) pcd
;
444 /* Align the destination. */
445 if ( unlikely (( dst
& ( sizeof ( unsigned int ) - 1 )) != 0 )) {
446 t2
= sizeof ( unsigned int ) - ( dst
& ( sizeof ( unsigned int ) - 1 ));
448 /* *pcd++ = *pcs++; */
449 ldbma ( s_space
, pcs
, t3
, pmc_load_exc
);
450 stbma ( d_space
, t3
, pcd
, pmc_store_exc
);
454 dst
= ( unsigned long ) pcd
;
455 src
= ( unsigned long ) pcs
;
458 ret
= copy_dstaligned ( dst
, src
, len
/ sizeof ( unsigned int ),
459 o_dst
, o_src
, o_len
);
463 pcs
+= ( len
& - sizeof ( unsigned int ));
464 pcd
+= ( len
& - sizeof ( unsigned int ));
465 len
%= sizeof ( unsigned int );
467 preserve_branch ( handle_load_error
);
468 preserve_branch ( handle_store_error
);
473 __asm__
__volatile__ ( "pmc_load_exc: \n " );
474 d
= & __get_cpu_var ( exception_data
);
475 DPRINTF ( "pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu \n " ,
476 o_len
, d
-> fault_addr
, o_src
, o_len
- d
-> fault_addr
+ o_src
);
477 return o_len
- d
-> fault_addr
+ o_src
;
480 __asm__
__volatile__ ( "pmc_store_exc: \n " );
481 d
= & __get_cpu_var ( exception_data
);
482 DPRINTF ( "pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu \n " ,
483 o_len
, d
-> fault_addr
, o_dst
, o_len
- d
-> fault_addr
+ o_dst
);
484 return o_len
- d
-> fault_addr
+ o_dst
;
488 unsigned long copy_to_user ( void __user
* dst
, const void * src
, unsigned long len
)
490 mtsp ( get_kernel_space (), 1 );
491 mtsp ( get_user_space (), 2 );
492 return pa_memcpy (( void __force
*) dst
, src
, len
);
495 unsigned long copy_from_user ( void * dst
, const void __user
* src
, unsigned long len
)
497 mtsp ( get_user_space (), 1 );
498 mtsp ( get_kernel_space (), 2 );
499 return pa_memcpy ( dst
, ( void __force
*) src
, len
);
502 unsigned long copy_in_user ( void __user
* dst
, const void __user
* src
, unsigned long len
)
504 mtsp ( get_user_space (), 1 );
505 mtsp ( get_user_space (), 2 );
506 return pa_memcpy (( void __force
*) dst
, ( void __force
*) src
, len
);
/*
 * memcpy - kernel-to-kernel copy of count bytes from src to dst.
 *
 * Both space register 1 and space register 2 are loaded from
 * get_kernel_space() before the copy is handed to pa_memcpy(). The residual
 * count returned by pa_memcpy() is deliberately ignored here.
 *
 * Returns dst, as the standard memcpy() contract requires.
 */
void *memcpy(void *dst, const void *src, size_t count)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_kernel_space(), 2);
	pa_memcpy(dst, src, count);
	return dst;
}
/* Export the copy entry points defined above. */
EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);