/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2015 Cavium, Inc
 */

#ifndef _RTE_MEMCPY_ARM64_H_
#define _RTE_MEMCPY_ARM64_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <stdint.h>
#include <string.h>
#include "generic/rte_memcpy.h"

#ifdef RTE_ARCH_ARM64_MEMCPY
#include <rte_common.h>
#include <rte_branch_prediction.h>
/*
 * Memory copy performance differs across AArch64 micro-architectures.
 * A recent glibc (e.g. 2.23 or later) can also provide better memcpy()
 * performance than older glibc versions, so using a recent glibc is
 * suggested where possible, as the entire system benefits from it.
 *
 * This implementation improves memory copy on some AArch64
 * micro-architectures when an old glibc (e.g. 2.19, 2.17...) is in use.
 * It is disabled by default and is activated by defining
 * "RTE_ARCH_ARM64_MEMCPY". It does not always outperform memcpy(), so
 * users need to run the unit test "memcpy_perf_autotest" and tune the
 * parameters in the customization section below for best performance.
 *
 * Compiler version also impacts rte_memcpy() performance. On some
 * platforms, binaries compiled from the same code with GCC 7.2.0 have
 * been observed to outperform binaries compiled with GCC 4.8.5.
 */
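/*
 * Example (a sketch, not taken from the original build documentation):
 * the macro can be supplied from compiler flags when building, e.g.:
 *
 *   CFLAGS += -DRTE_ARCH_ARM64_MEMCPY
 *
 * With the legacy make-based DPDK configuration this corresponds to
 * setting CONFIG_RTE_ARCH_ARM64_MEMCPY=y in the config file.
 */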
/**************************************
 * Beginning of customization section
 **************************************/
#ifndef RTE_ARM64_MEMCPY_ALIGN_MASK
#define RTE_ARM64_MEMCPY_ALIGN_MASK ((RTE_CACHE_LINE_SIZE >> 3) - 1)
#endif
#ifndef RTE_ARM64_MEMCPY_STRICT_ALIGN
/* Only src unalignment will be treated as unaligned copy */
#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
	((uintptr_t)(src) & RTE_ARM64_MEMCPY_ALIGN_MASK)
#else
/* Both dst and src unalignment will be treated as unaligned copy */
#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
#endif
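/*
 * Worked example (added for illustration): with the common
 * RTE_CACHE_LINE_SIZE of 64, the mask is (64 >> 3) - 1 = 0x7, so a copy
 * counts as "aligned" only when the checked pointer(s) are 8-byte
 * aligned; with a 128-byte cache line the mask is 0xf, requiring
 * 16-byte alignment.
 */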
/*
 * If the copy size is larger than the threshold, memcpy() will be used.
 * Run "memcpy_perf_autotest" to determine the proper threshold.
 */
#ifdef RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD
#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \
(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
n <= (size_t)RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD)
#else
#define USE_ALIGNED_RTE_MEMCPY(dst, src, n) \
(!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src))
#endif
#ifdef RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD
#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \
(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
n <= (size_t)RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD)
#else
#define USE_UNALIGNED_RTE_MEMCPY(dst, src, n) \
(RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src))
#endif
/*
 * The logic of USE_RTE_MEMCPY() can also be modified to best fit the
 * platform.
 */
#if defined(RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
|| defined(RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD)
#define USE_RTE_MEMCPY(dst, src, n) \
(USE_ALIGNED_RTE_MEMCPY(dst, src, n) || USE_UNALIGNED_RTE_MEMCPY(dst, src, n))
#else
#define USE_RTE_MEMCPY(dst, src, n) (1)
#endif
/**************************************
 * End of customization section
 **************************************/
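/*
 * Tuning sketch (illustrative values only, not from the original code):
 * thresholds picked from "memcpy_perf_autotest" results might be passed
 * at build time, e.g.:
 *
 *   CFLAGS += -DRTE_ARM64_MEMCPY_ALIGNED_THRESHOLD=2048
 *   CFLAGS += -DRTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD=512
 *
 * With these, rte_memcpy() handles aligned copies up to 2048 bytes and
 * unaligned copies up to 512 bytes itself, and falls back to the
 * platform memcpy() for anything larger.
 */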
#if defined(RTE_TOOLCHAIN_GCC) && !defined(RTE_ARM64_MEMCPY_SKIP_GCC_VER_CHECK)
#if (GCC_VERSION < 50400)
#warning "The GCC version is quite old, which may result in sub-optimal \
performance of the compiled code. It is suggested that at least GCC 5.4.0 \
be used."
#endif
#endif
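/*
 * Note on the __uint128_t copies below (explanatory, not from the
 * original): a 16-byte load/store expressed through __uint128_t
 * typically lets the compiler emit a single LDP/STP pair (or a 16-byte
 * NEON LDR/STR) per move, which is the point of these helpers.
 */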
static __rte_always_inline
void rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	*dst128 = *src128;
}
static __rte_always_inline
void rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	const __uint128_t x0 = src128[0], x1 = src128[1];
	dst128[0] = x0;
	dst128[1] = x1;
}
static __rte_always_inline
void rte_mov48(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2];
	dst128[0] = x0;
	dst128[1] = x1;
	dst128[2] = x2;
}
static __rte_always_inline
void rte_mov64(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	const __uint128_t
		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
	dst128[0] = x0;
	dst128[1] = x1;
	dst128[2] = x2;
	dst128[3] = x3;
}
static __rte_always_inline
void rte_mov128(uint8_t *dst, const uint8_t *src)
{
	__uint128_t *dst128 = (__uint128_t *)dst;
	const __uint128_t *src128 = (const __uint128_t *)src;
	/* Keep below declaration & copy sequence for optimized instructions */
	const __uint128_t
		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
	dst128[0] = x0;
	__uint128_t x4 = src128[4];
	dst128[1] = x1;
	__uint128_t x5 = src128[5];
	dst128[2] = x2;
	__uint128_t x6 = src128[6];
	dst128[3] = x3;
	__uint128_t x7 = src128[7];
	dst128[4] = x4;
	dst128[5] = x5;
	dst128[6] = x6;
	dst128[7] = x7;
}
static __rte_always_inline
void rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov128(dst, src);
	rte_mov128(dst + 128, src + 128);
}
static __rte_always_inline void
rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n)
{
	if (n & 0x08) {
		/* copy 8 ~ 15 bytes */
		*(uint64_t *)dst = *(const uint64_t *)src;
		*(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
	} else if (n & 0x04) {
		/* copy 4 ~ 7 bytes */
		*(uint32_t *)dst = *(const uint32_t *)src;
		*(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
	} else if (n & 0x02) {
		/* copy 2 ~ 3 bytes */
		*(uint16_t *)dst = *(const uint16_t *)src;
		*(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
	} else if (n & 0x01) {
		/* copy 1 byte */
		*dst = *src;
	}
}
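/*
 * Worked example (added for illustration): for n = 13, the first
 * 8-byte store above covers bytes [0, 8) and the second covers bytes
 * [5, 13); the two stores overlap, but together they copy all 13 bytes
 * with just two fixed-size accesses and no byte loop.
 */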
static __rte_always_inline
void rte_memcpy_ge16_lt128(uint8_t *dst, const uint8_t *src, size_t n)
{
	if (n < 64) {
		if (n == 16) {
			rte_mov16(dst, src);
		} else if (n <= 32) {
			rte_mov16(dst, src);
			rte_mov16(dst - 16 + n, src - 16 + n);
		} else if (n <= 48) {
			rte_mov32(dst, src);
			rte_mov16(dst - 16 + n, src - 16 + n);
		} else {
			rte_mov48(dst, src);
			rte_mov16(dst - 16 + n, src - 16 + n);
		}
	} else {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		if (n > 48 + 64)
			rte_mov64(dst - 64 + n, src - 64 + n);
		else if (n > 32 + 64)
			rte_mov48(dst - 48 + n, src - 48 + n);
		else if (n > 16 + 64)
			rte_mov32(dst - 32 + n, src - 32 + n);
		else if (n > 64)
			rte_mov16(dst - 16 + n, src - 16 + n);
	}
}
static __rte_always_inline
void rte_memcpy_ge128(uint8_t *dst, const uint8_t *src, size_t n)
{
	do {
		rte_mov128(dst, src);
		src += 128;
		dst += 128;
		n -= 128;
	} while (likely(n >= 128));

	if (likely(n)) {
		if (n <= 16)
			rte_mov16(dst - 16 + n, src - 16 + n);
		else if (n <= 32)
			rte_mov32(dst - 32 + n, src - 32 + n);
		else if (n <= 48)
			rte_mov48(dst - 48 + n, src - 48 + n);
		else if (n <= 64)
			rte_mov64(dst - 64 + n, src - 64 + n);
		else
			rte_memcpy_ge16_lt128(dst, src, n);
	}
}
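/*
 * Note (explanatory, not from the original): the main loop above copies
 * whole 128-byte blocks; any remainder is finished with a single
 * overlapped copy anchored at dst + n, so no byte-granular tail loop is
 * needed.
 */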
static __rte_always_inline
void rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n)
{
	if (n == 16) {
		rte_mov16(dst, src);
	} else if (n <= 32) {
		rte_mov16(dst, src);
		rte_mov16(dst - 16 + n, src - 16 + n);
	} else if (n <= 48) {
		rte_mov32(dst, src);
		rte_mov16(dst - 16 + n, src - 16 + n);
	} else {
		rte_mov48(dst, src);
		rte_mov16(dst - 16 + n, src - 16 + n);
	}
}
static __rte_always_inline
void rte_memcpy_ge64(uint8_t *dst, const uint8_t *src, size_t n)
{
	do {
		rte_mov64(dst, src);
		src += 64;
		dst += 64;
		n -= 64;
	} while (likely(n >= 64));

	if (likely(n)) {
		if (n <= 16)
			rte_mov16(dst - 16 + n, src - 16 + n);
		else if (n <= 32)
			rte_mov32(dst - 32 + n, src - 32 + n);
		else if (n <= 48)
			rte_mov48(dst - 48 + n, src - 48 + n);
		else
			rte_mov64(dst - 64 + n, src - 64 + n);
	}
}
#if RTE_CACHE_LINE_SIZE >= 128
static __rte_always_inline
void *rte_memcpy(void *dst, const void *src, size_t n)
{
	if (n < 16) {
		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	if (n < 128) {
		rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	__builtin_prefetch(src, 0, 0);
	__builtin_prefetch(dst, 1, 0);
	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
		rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	} else
		return memcpy(dst, src, n);
}
#else
static __rte_always_inline
void *rte_memcpy(void *dst, const void *src, size_t n)
{
	if (n < 16) {
		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	if (n < 64) {
		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	}
	__builtin_prefetch(src, 0, 0);
	__builtin_prefetch(dst, 1, 0);
	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
		return dst;
	} else
		return memcpy(dst, src, n);
}
#endif /* RTE_CACHE_LINE_SIZE >= 128 */
#else /* RTE_ARCH_ARM64_MEMCPY */
static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 16);
}

static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 32);
}

static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 48);
}

static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 64);
}

static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 128);
}

static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 256);
}
#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
#endif /* RTE_ARCH_ARM64_MEMCPY */
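/*
 * Usage sketch (illustrative, not from the original header): on either
 * path, rte_memcpy() has the same contract as memcpy() - the regions
 * must not overlap, and dst is returned, e.g.:
 *
 *   uint8_t out[256];
 *   rte_memcpy(out, in, sizeof(out));  // "in" is a caller-side buffer
 */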
#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_ARM64_H_ */