/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2015 Cavium, Inc
 */

#ifndef _RTE_MEMCPY_ARM64_H_
#define _RTE_MEMCPY_ARM64_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <stdint.h>
#include <string.h>
#include "generic/rte_memcpy.h"

#ifdef RTE_ARCH_ARM64_MEMCPY
#include <rte_common.h>
#include <rte_branch_prediction.h>
/*
 * Memory copy performance differs across AArch64 micro-architectures.
 * A recent glibc (e.g. 2.23 or later) can also provide better memcpy()
 * performance than older glibc versions, so using a recent glibc is
 * suggested where possible, as the entire system benefits from it.
 *
 * This implementation improves memory copy on some AArch64
 * micro-architectures when an old glibc (e.g. 2.19, 2.17...) is in use.
 * It is disabled by default and is activated by defining
 * "RTE_ARCH_ARM64_MEMCPY". It does not always outperform memcpy(), so
 * users need to run the unit test "memcpy_perf_autotest" and tune the
 * parameters in the customization section below for best performance.
 *
 * Compiler version also impacts rte_memcpy() performance. On some
 * platforms, binaries compiled from the same code with GCC 7.2.0 have
 * been observed to outperform binaries compiled with GCC 4.8.5.
 */
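/*
 * Example (a sketch, not taken from the original build documentation):
 * the macro can be supplied from compiler flags when building, e.g.:
 *
 *   CFLAGS += -DRTE_ARCH_ARM64_MEMCPY
 *
 * With the legacy make-based DPDK configuration this corresponds to
 * setting CONFIG_RTE_ARCH_ARM64_MEMCPY=y in the config file.
 */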
/**************************************
 * Beginning of customization section
 **************************************/
#ifndef RTE_ARM64_MEMCPY_ALIGN_MASK
#define RTE_ARM64_MEMCPY_ALIGN_MASK ((RTE_CACHE_LINE_SIZE >> 3) - 1)
#endif
#ifndef RTE_ARM64_MEMCPY_STRICT_ALIGN
/* Only src unalignment will be treated as unaligned copy */
#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
	((uintptr_t)(src) & RTE_ARM64_MEMCPY_ALIGN_MASK)
#else
/* Both dst and src unalignment will be treated as unaligned copy */
#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
#endif
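/*
 * Worked example (added for illustration): with the common
 * RTE_CACHE_LINE_SIZE of 64, the mask is (64 >> 3) - 1 = 0x7, so a copy
 * counts as "aligned" only when the checked pointer(s) are 8-byte
 * aligned; with a 128-byte cache line the mask is 0xf, requiring
 * 16-byte alignment.
 */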
/*
 * If the copy size is larger than the threshold, memcpy() will be used.
 * Run "memcpy_perf_autotest" to determine the proper threshold.
 */
#ifdef RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD
#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \
(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
n <= (size_t)RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD)
#else
#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \
(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src))
#endif
#ifdef RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD
#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \
(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
n <= (size_t)RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD)
#else
#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \
(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src))
#endif
/*
 * The logic of USE_RTE_MEMCPY() can also be modified to best fit the
 * platform.
 */
#if defined(RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
|| defined(RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD)
#define USE_RTE_MEMCPY(dst, src, n) \
(USE_ALIGNED_RTE_MEMCPY(dst, src, n) || USE_UNALIGNED_RTE_MEMCPY(dst, src, n))
#else
#define USE_RTE_MEMCPY(dst, src, n) (1)
#endif
/**************************************
 * End of customization section
 **************************************/
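/*
 * Tuning sketch (illustrative values only, not from the original code):
 * thresholds picked from "memcpy_perf_autotest" results might be passed
 * at build time, e.g.:
 *
 *   CFLAGS += -DRTE_ARM64_MEMCPY_ALIGNED_THRESHOLD=2048
 *   CFLAGS += -DRTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD=512
 *
 * With these, rte_memcpy() handles aligned copies up to 2048 bytes and
 * unaligned copies up to 512 bytes itself, and falls back to the
 * platform memcpy() for anything larger.
 */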
#if defined(RTE_TOOLCHAIN_GCC) && !defined(RTE_ARM64_MEMCPY_SKIP_GCC_VER_CHECK)
#if (GCC_VERSION < 50400)
#warning "The GCC version is quite old, which may result in sub-optimal \
performance of the compiled code. It is suggested that at least GCC 5.4.0 \
be used."
#endif
#endif
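/*
 * Note on the __uint128_t copies below (explanatory, not from the
 * original): a 16-byte load/store expressed through __uint128_t
 * typically lets the compiler emit a single LDP/STP pair (or a 16-byte
 * NEON LDR/STR) per move, which is the point of these helpers.
 */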
static __rte_always_inline
void rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	*dst128 = *src128;
}
static __rte_always_inline
void rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	const __uint128_t x0 = src128[0], x1 = src128[1];
	dst128[0] = x0;
	dst128[1] = x1;
}
static __rte_always_inline
void rte_mov48(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2];
	dst128[0] = x0;
	dst128[1] = x1;
	dst128[2] = x2;
}
static __rte_always_inline
void rte_mov64(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	const __uint128_t
		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
	dst128[0] = x0;
	dst128[1] = x1;
	dst128[2] = x2;
	dst128[3] = x3;
}
static __rte_always_inline
void rte_mov128(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	/* Keep below declaration & copy sequence for optimized instructions */
	const __uint128_t
		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
	dst128[0] = x0;
	__uint128_t x4 = src128[4];
	dst128[1] = x1;
	__uint128_t x5 = src128[5];
	dst128[2] = x2;
	__uint128_t x6 = src128[6];
	dst128[3] = x3;
	__uint128_t x7 = src128[7];
	dst128[4] = x4;
	dst128[5] = x5;
	dst128[6] = x6;
	dst128[7] = x7;
}
static __rte_always_inline
void rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov128(dst, src);
	rte_mov128(dst + 128, src + 128);
}
static __rte_always_inline void
rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n)
{
	if (n & 0x08) {
		/* copy 8 ~ 15 bytes */
		*(uint64_t *)dst = *(const uint64_t *)src;
		*(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
	} else if (n & 0x04) {
		/* copy 4 ~ 7 bytes */
		*(uint32_t *)dst = *(const uint32_t *)src;
		*(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
	} else if (n & 0x02) {
		/* copy 2 ~ 3 bytes */
		*(uint16_t *)dst = *(const uint16_t *)src;
		*(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
	} else if (n & 0x01) {
		/* copy 1 byte */
		*dst = *src;
	}
}
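/*
 * Worked example (added for illustration): for n = 13, the first
 * 8-byte store above covers bytes [0, 8) and the second covers bytes
 * [5, 13); the two stores overlap, but together they copy all 13 bytes
 * with just two fixed-size accesses and no byte loop.
 */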
static __rte_always_inline
void rte_memcpy_ge16_lt128(uint8_t *dst, const uint8_t *src, size_t n)
{
	if (n < 64) {
		if (n == 16) {
			rte_mov16(dst, src);
		} else if (n <= 32) {
			rte_mov16(dst, src);
			rte_mov16(dst - 16 + n, src - 16 + n);
		} else if (n <= 48) {
			rte_mov32(dst, src);
			rte_mov16(dst - 16 + n, src - 16 + n);
		} else {
			rte_mov48(dst, src);
			rte_mov16(dst - 16 + n, src - 16 + n);
		}
	} else {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		if (n > 48 + 64)
			rte_mov64(dst - 64 + n, src - 64 + n);
		else if (n > 32 + 64)
			rte_mov48(dst - 48 + n, src - 48 + n);
		else if (n > 16 + 64)
			rte_mov32(dst - 32 + n, src - 32 + n);
		else if (n > 64)
			rte_mov16(dst - 16 + n, src - 16 + n);
	}
}
static __rte_always_inline
void rte_memcpy_ge128(uint8_t *dst, const uint8_t *src, size_t n)
{
	do {
		rte_mov128(dst, src);
		src += 128;
		dst += 128;
		n -= 128;
	} while (likely(n >= 128));

	if (likely(n)) {
		if (n <= 16)
			rte_mov16(dst - 16 + n, src - 16 + n);
		else if (n <= 32)
			rte_mov32(dst - 32 + n, src - 32 + n);
		else if (n <= 48)
			rte_mov48(dst - 48 + n, src - 48 + n);
		else if (n <= 64)
			rte_mov64(dst - 64 + n, src - 64 + n);
		else
			rte_memcpy_ge16_lt128(dst, src, n);
	}
}
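/*
 * Note (explanatory, not from the original): the main loop above copies
 * whole 128-byte blocks; any remainder is finished with a single
 * overlapped copy anchored at dst + n, so no byte-granular tail loop is
 * needed.
 */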
static __rte_always_inline
void rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n)
{
	if (n == 16) {
		rte_mov16(dst, src);
	} else if (n <= 32) {
		rte_mov16(dst, src);
		rte_mov16(dst - 16 + n, src - 16 + n);
	} else if (n <= 48) {
		rte_mov32(dst, src);
		rte_mov16(dst - 16 + n, src - 16 + n);
	} else {
		rte_mov48(dst, src);
		rte_mov16(dst - 16 + n, src - 16 + n);
	}
}
static __rte_always_inline
void rte_memcpy_ge64(uint8_t *dst, const uint8_t *src, size_t n)
{
	do {
		rte_mov64(dst, src);
		src += 64;
		dst += 64;
		n -= 64;
	} while (likely(n >= 64));

	if (likely(n)) {
		if (n <= 16)
			rte_mov16(dst - 16 + n, src - 16 + n);
		else if (n <= 32)
			rte_mov32(dst - 32 + n, src - 32 + n);
		else if (n <= 48)
			rte_mov48(dst - 48 + n, src - 48 + n);
		else
			rte_mov64(dst - 64 + n, src - 64 + n);
	}
}
#if RTE_CACHE_LINE_SIZE >= 128
static __rte_always_inline
void *rte_memcpy(void *dst, const void *src, size_t n)
{
	if (n < 16) {
		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	if (n < 128) {
		rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	__builtin_prefetch(src, 0, 0);
	__builtin_prefetch(dst, 1, 0);
	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
		rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	} else
		return memcpy(dst, src, n);
}
#else
static __rte_always_inline
void *rte_memcpy(void *dst, const void *src, size_t n)
{
	if (n < 16) {
		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	if (n < 64) {
		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	__builtin_prefetch(src, 0, 0);
	__builtin_prefetch(dst, 1, 0);
	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	} else
		return memcpy(dst, src, n);
}
#endif /* RTE_CACHE_LINE_SIZE >= 128 */
#else /* RTE_ARCH_ARM64_MEMCPY */
static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 16);
}

static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 32);
}

static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 48);
}

static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 64);
}

static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 128);
}

static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 256);
}
#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
#endif /* RTE_ARCH_ARM64_MEMCPY */
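/*
 * Usage sketch (illustrative, not from the original header): on either
 * path, rte_memcpy() has the same contract as memcpy() - the regions
 * must not overlap, and dst is returned, e.g.:
 *
 *   uint8_t out[256];
 *   rte_memcpy(out, in, sizeof(out));  // "in" is a caller-side buffer
 */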
#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_ARM64_H_ */