/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2015 Cavium, Inc
 */

#ifndef _RTE_MEMCPY_ARM64_H_
#define _RTE_MEMCPY_ARM64_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <stdint.h>
#include <string.h>

#include "generic/rte_memcpy.h"

#ifdef RTE_ARCH_ARM64_MEMCPY
#include <rte_common.h>
#include <rte_branch_prediction.h>

/*
 * Memory copy performance differs across AArch64 micro-architectures.
 * Recent glibc versions (e.g. 2.23 or later) provide a better memcpy()
 * than older releases, so using a more recent glibc is always recommended
 * where possible; the entire system benefits from it.
 *
 * This implementation improves memory copy on some AArch64 micro-architectures
 * when an old glibc (e.g. 2.19, 2.17...) is in use. It is disabled by default
 * and is activated by defining "RTE_ARCH_ARM64_MEMCPY". It does not always
 * outperform memcpy(), so users should run the "memcpy_perf_autotest" unit
 * test and tune the parameters in the customization section below for best
 * performance.
 *
 * The compiler version also affects rte_memcpy() performance. On some
 * platforms, binaries built with GCC 7.2.0 from the same code have been
 * observed to outperform binaries built with GCC 4.8.5.
 */
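
/*
 * Editorial sketch, not part of the original header: the macro is normally
 * supplied at build time, e.g. via EXTRA_CFLAGS:
 *
 *     EXTRA_CFLAGS="-DRTE_ARCH_ARM64_MEMCPY"
 *
 * or, if the DPDK build configuration exposes it, by setting the
 * corresponding CONFIG_RTE_ARCH_ARM64_MEMCPY option. Either way, verify the
 * result with the "memcpy_perf_autotest" unit test before relying on it.
 */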

/**************************************
 * Beginning of customization section
 **************************************/
#ifndef RTE_ARM64_MEMCPY_ALIGN_MASK
#define RTE_ARM64_MEMCPY_ALIGN_MASK ((RTE_CACHE_LINE_SIZE >> 3) - 1)
#endif

#ifndef RTE_ARM64_MEMCPY_STRICT_ALIGN
/* Only src unalignment will be treated as unaligned copy */
#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
	((uintptr_t)(src) & RTE_ARM64_MEMCPY_ALIGN_MASK)
#else
/* Both dst and src unalignment will be treated as unaligned copy */
#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
#endif


/*
 * If the copy size is larger than the threshold, memcpy() will be used.
 * Run "memcpy_perf_autotest" to determine the proper threshold.
 */
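/*
 * Illustrative example only (values are placeholders, not tuned): the
 * thresholds can be provided as compile-time definitions, e.g.
 *
 *     #define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD   2048
 *     #define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD  512
 *
 * With these defined, copies larger than the matching threshold fall back
 * to memcpy(); pick real values from "memcpy_perf_autotest" results.
 */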
#ifdef RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD
#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \
	(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
	n <= (size_t)RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD)
#else
#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \
	(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src))
#endif
#ifdef RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD
#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \
	(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
	n <= (size_t)RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD)
#else
#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \
	(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src))
#endif
/*
 * The logic of USE_RTE_MEMCPY() can also be modified to best fit the platform.
 */
#if defined(RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
	|| defined(RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD)
#define USE_RTE_MEMCPY(dst, src, n) \
	(USE_ALIGNED_RTE_MEMCPY(dst, src, n) || USE_UNALIGNED_RTE_MEMCPY(dst, src, n))
#else
#define USE_RTE_MEMCPY(dst, src, n) (1)
#endif
/**************************************
 * End of customization section
 **************************************/


#if defined(RTE_TOOLCHAIN_GCC) && !defined(RTE_ARM64_MEMCPY_SKIP_GCC_VER_CHECK)
#if (GCC_VERSION < 50400)
#warning "The GCC version is quite old, which may result in sub-optimal \
performance of the compiled code. It is suggested that at least GCC 5.4.0 \
be used."
#endif
#endif

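/*
 * The rte_movXX() helpers below copy fixed-size blocks through __uint128_t
 * loads and stores; on AArch64 the compiler can typically lower these to
 * paired LDP/STP instructions (exact code generation depends on the
 * compiler and its version).
 */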
static __rte_always_inline
void rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	*dst128 = *src128;
}

static __rte_always_inline
void rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	const __uint128_t x0 = src128[0], x1 = src128[1];
	dst128[0] = x0;
	dst128[1] = x1;
}

static __rte_always_inline
void rte_mov48(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2];
	dst128[0] = x0;
	dst128[1] = x1;
	dst128[2] = x2;
}

static __rte_always_inline
void rte_mov64(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	const __uint128_t
		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
	dst128[0] = x0;
	dst128[1] = x1;
	dst128[2] = x2;
	dst128[3] = x3;
}

static __rte_always_inline
void rte_mov128(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	/* Keep below declaration & copy sequence for optimized instructions */
	const __uint128_t
		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
	dst128[0] = x0;
	__uint128_t x4 = src128[4];
	dst128[1] = x1;
	__uint128_t x5 = src128[5];
	dst128[2] = x2;
	__uint128_t x6 = src128[6];
	dst128[3] = x3;
	__uint128_t x7 = src128[7];
	dst128[4] = x4;
	dst128[5] = x5;
	dst128[6] = x6;
	dst128[7] = x7;
}

static __rte_always_inline
void rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov128(dst, src);
	rte_mov128(dst + 128, src + 128);
}

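/*
 * Copy fewer than 16 bytes using two (possibly overlapping) fixed-size
 * accesses, one anchored at the start and one at the end of the region.
 * For example, n == 5 takes the 4-byte branch: it copies bytes 0-3 and
 * then bytes 1-4, covering all five bytes without a byte-by-byte loop.
 */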
static __rte_always_inline void
rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n)
{
	if (n & 0x08) {
		/* copy 8 ~ 15 bytes */
		*(uint64_t *)dst = *(const uint64_t *)src;
		*(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
	} else if (n & 0x04) {
		/* copy 4 ~ 7 bytes */
		*(uint32_t *)dst = *(const uint32_t *)src;
		*(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
	} else if (n & 0x02) {
		/* copy 2 ~ 3 bytes */
		*(uint16_t *)dst = *(const uint16_t *)src;
		*(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
	} else if (n & 0x01) {
		/* copy 1 byte */
		*dst = *src;
	}
}

static __rte_always_inline
void rte_memcpy_ge16_lt128(uint8_t *dst, const uint8_t *src, size_t n)
{
	if (n < 64) {
		if (n == 16) {
			rte_mov16(dst, src);
		} else if (n <= 32) {
			rte_mov16(dst, src);
			rte_mov16(dst - 16 + n, src - 16 + n);
		} else if (n <= 48) {
			rte_mov32(dst, src);
			rte_mov16(dst - 16 + n, src - 16 + n);
		} else {
			rte_mov48(dst, src);
			rte_mov16(dst - 16 + n, src - 16 + n);
		}
	} else {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		if (n > 48 + 64)
			rte_mov64(dst - 64 + n, src - 64 + n);
		else if (n > 32 + 64)
			rte_mov48(dst - 48 + n, src - 48 + n);
		else if (n > 16 + 64)
			rte_mov32(dst - 32 + n, src - 32 + n);
		else if (n > 64)
			rte_mov16(dst - 16 + n, src - 16 + n);
	}
}

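/*
 * Copy n >= 128 bytes: stream 128-byte blocks in the main loop, then
 * finish the remainder (< 128 bytes) with an overlapping tail copy sized
 * to the smallest helper that covers it.
 */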
static __rte_always_inline
void rte_memcpy_ge128(uint8_t *dst, const uint8_t *src, size_t n)
{
	do {
		rte_mov128(dst, src);
		src += 128;
		dst += 128;
		n -= 128;
	} while (likely(n >= 128));

	if (likely(n)) {
		if (n <= 16)
			rte_mov16(dst - 16 + n, src - 16 + n);
		else if (n <= 32)
			rte_mov32(dst - 32 + n, src - 32 + n);
		else if (n <= 48)
			rte_mov48(dst - 48 + n, src - 48 + n);
		else if (n <= 64)
			rte_mov64(dst - 64 + n, src - 64 + n);
		else
			rte_memcpy_ge16_lt128(dst, src, n);
	}
}

static __rte_always_inline
void rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n)
{
	if (n == 16) {
		rte_mov16(dst, src);
	} else if (n <= 32) {
		rte_mov16(dst, src);
		rte_mov16(dst - 16 + n, src - 16 + n);
	} else if (n <= 48) {
		rte_mov32(dst, src);
		rte_mov16(dst - 16 + n, src - 16 + n);
	} else {
		rte_mov48(dst, src);
		rte_mov16(dst - 16 + n, src - 16 + n);
	}
}

static __rte_always_inline
void rte_memcpy_ge64(uint8_t *dst, const uint8_t *src, size_t n)
{
	do {
		rte_mov64(dst, src);
		src += 64;
		dst += 64;
		n -= 64;
	} while (likely(n >= 64));

	if (likely(n)) {
		if (n <= 16)
			rte_mov16(dst - 16 + n, src - 16 + n);
		else if (n <= 32)
			rte_mov32(dst - 32 + n, src - 32 + n);
		else if (n <= 48)
			rte_mov48(dst - 48 + n, src - 48 + n);
		else
			rte_mov64(dst - 64 + n, src - 64 + n);
	}
}

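/*
 * Two rte_memcpy() variants follow, selected by RTE_CACHE_LINE_SIZE: on
 * 128-byte cache line targets the bulk loop works in 128-byte blocks,
 * otherwise in 64-byte blocks. Both prefetch src and dst before entering
 * the bulk path and fall back to memcpy() when USE_RTE_MEMCPY() rejects
 * the copy.
 */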
#if RTE_CACHE_LINE_SIZE >= 128
static __rte_always_inline
void *rte_memcpy(void *dst, const void *src, size_t n)
{
	if (n < 16) {
		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	if (n < 128) {
		rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	__builtin_prefetch(src, 0, 0);
	__builtin_prefetch(dst, 1, 0);
	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
		rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	} else
		return memcpy(dst, src, n);
}

#else
static __rte_always_inline
void *rte_memcpy(void *dst, const void *src, size_t n)
{
	if (n < 16) {
		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	if (n < 64) {
		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	__builtin_prefetch(src, 0, 0);
	__builtin_prefetch(dst, 1, 0);
	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	} else
		return memcpy(dst, src, n);
}
#endif /* RTE_CACHE_LINE_SIZE >= 128 */

#else
static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 16);
}

static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 32);
}

static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 48);
}

static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 64);
}

static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 128);
}

static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 256);
}

#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))

#endif /* RTE_ARCH_ARM64_MEMCPY */

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_ARM64_H_ */