/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
 */

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>

#ifdef __cplusplus
extern "C" {
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so its address should not be taken
 * and care is needed as parameter expressions may be evaluated multiple times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
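
/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * header): rte_memcpy() is a drop-in replacement for memcpy() on
 * non-overlapping buffers and returns the destination pointer.
 *
 *     uint8_t src_buf[256], dst_buf[256];
 *
 *     memset(src_buf, 0xab, sizeof(src_buf));
 *     rte_memcpy(dst_buf, src_buf, sizeof(dst_buf));
 *     assert(memcmp(dst_buf, src_buf, sizeof(dst_buf)) == 0);
 */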

#ifdef RTE_MACHINE_CPUFLAG_AVX512F

#define ALIGNMENT_MASK 0x3F

/**
 * AVX512 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
        __m128i xmm0;

        xmm0 = _mm_loadu_si128((const __m128i *)src);
        _mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
        __m256i ymm0;

        ymm0 = _mm256_loadu_si256((const __m256i *)src);
        _mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
        __m512i zmm0;

        zmm0 = _mm512_loadu_si512((const void *)src);
        _mm512_storeu_si512((void *)dst, zmm0);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
        rte_mov64(dst + 0 * 64, src + 0 * 64);
        rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
        rte_mov64(dst + 0 * 64, src + 0 * 64);
        rte_mov64(dst + 1 * 64, src + 1 * 64);
        rte_mov64(dst + 2 * 64, src + 2 * 64);
        rte_mov64(dst + 3 * 64, src + 3 * 64);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
        __m512i zmm0, zmm1;

        while (n >= 128) {
                zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
                n -= 128;
                zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
                src = src + 128;
                _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
                _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
                dst = dst + 128;
        }
}

/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
        __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

        while (n >= 512) {
                zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
                n -= 512;
                zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
                zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
                zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
                zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
                zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
                zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
                zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
                src = src + 512;
                _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
                _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
                _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
                _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
                _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
                _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
                _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
                _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
                dst = dst + 512;
        }
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
        uintptr_t dstu = (uintptr_t)dst;
        uintptr_t srcu = (uintptr_t)src;
        void *ret = dst;
        size_t dstofss;
        size_t bits;

        /**
         * Copy less than 16 bytes
         */
        if (n < 16) {
                if (n & 0x01) {
                        *(uint8_t *)dstu = *(const uint8_t *)srcu;
                        srcu = (uintptr_t)((const uint8_t *)srcu + 1);
                        dstu = (uintptr_t)((uint8_t *)dstu + 1);
                }
                if (n & 0x02) {
                        *(uint16_t *)dstu = *(const uint16_t *)srcu;
                        srcu = (uintptr_t)((const uint16_t *)srcu + 1);
                        dstu = (uintptr_t)((uint16_t *)dstu + 1);
                }
                if (n & 0x04) {
                        *(uint32_t *)dstu = *(const uint32_t *)srcu;
                        srcu = (uintptr_t)((const uint32_t *)srcu + 1);
                        dstu = (uintptr_t)((uint32_t *)dstu + 1);
                }
                if (n & 0x08)
                        *(uint64_t *)dstu = *(const uint64_t *)srcu;
                return ret;
        }
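
        /*
         * Worked example (editorial note): for n = 13 (binary 1101), the
         * 1-, 4- and 8-byte stores above fire in turn, copying
         * 1 + 4 + 8 = 13 bytes with three scalar stores and no loop.
         */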

        /**
         * Fast way when copy size doesn't exceed 512 bytes
         */
        if (n <= 32) {
                rte_mov16((uint8_t *)dst, (const uint8_t *)src);
                rte_mov16((uint8_t *)dst - 16 + n,
                                (const uint8_t *)src - 16 + n);
                return ret;
        }
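
        /*
         * Editorial note: the head/tail pattern above copies the first 16
         * and the last 16 bytes of the range. For n = 20, the second
         * rte_mov16() starts at offset 4, so bytes 4..15 are written twice;
         * the overlap is harmless because source and destination themselves
         * do not overlap. The larger size buckets below use the same trick.
         */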

        if (n <= 64) {
                rte_mov32((uint8_t *)dst, (const uint8_t *)src);
                rte_mov32((uint8_t *)dst - 32 + n,
                                (const uint8_t *)src - 32 + n);
                return ret;
        }
        if (n <= 512) {
                if (n >= 256) {
                        n -= 256;
                        rte_mov256((uint8_t *)dst, (const uint8_t *)src);
                        src = (const uint8_t *)src + 256;
                        dst = (uint8_t *)dst + 256;
                }
                if (n >= 128) {
                        n -= 128;
                        rte_mov128((uint8_t *)dst, (const uint8_t *)src);
                        src = (const uint8_t *)src + 128;
                        dst = (uint8_t *)dst + 128;
                }
COPY_BLOCK_128_BACK63:
                if (n > 64) {
                        rte_mov64((uint8_t *)dst, (const uint8_t *)src);
                        rte_mov64((uint8_t *)dst - 64 + n,
                                        (const uint8_t *)src - 64 + n);
                        return ret;
                }
                if (n > 0)
                        rte_mov64((uint8_t *)dst - 64 + n,
                                        (const uint8_t *)src - 64 + n);
                return ret;
        }

        /**
         * Make store aligned when copy size exceeds 512 bytes
         */
        dstofss = ((uintptr_t)dst & 0x3F);
        if (dstofss > 0) {
                dstofss = 64 - dstofss;
                n -= dstofss;
                rte_mov64((uint8_t *)dst, (const uint8_t *)src);
                src = (const uint8_t *)src + dstofss;
                dst = (uint8_t *)dst + dstofss;
        }
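
        /*
         * Worked example (editorial note): if dst ends in 0x...30, then
         * dstofss starts at 0x30 and the adjustment is 64 - 0x30 = 16 bytes,
         * after which dst is 64-byte aligned. The rte_mov64() above copies
         * 64 bytes, more than the 16 actually consumed; the surplus is
         * simply rewritten by the aligned block copies that follow.
         */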

        /**
         * Copy 512-byte blocks.
         * Use the block copy function for better instruction order control,
         * which is important when the load is unaligned.
         */
        rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
        bits = n;
        n = n & 511;
        bits -= n;
        src = (const uint8_t *)src + bits;
        dst = (uint8_t *)dst + bits;

        /**
         * Copy 128-byte blocks.
         * Use the block copy function for better instruction order control,
         * which is important when the load is unaligned.
         */
        if (n >= 128) {
                rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
                bits = n;
                n = n & 127;
                bits -= n;
                src = (const uint8_t *)src + bits;
                dst = (uint8_t *)dst + bits;
        }

        /**
         * Copy whatever is left
         */
        goto COPY_BLOCK_128_BACK63;
}

#elif defined RTE_MACHINE_CPUFLAG_AVX2

#define ALIGNMENT_MASK 0x1F

/**
 * AVX2 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
        __m128i xmm0;

        xmm0 = _mm_loadu_si128((const __m128i *)src);
        _mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
        __m256i ymm0;

        ymm0 = _mm256_loadu_si256((const __m256i *)src);
        _mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
        rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
        rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
        rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
        rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
        rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
        rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
        __m256i ymm0, ymm1, ymm2, ymm3;

        while (n >= 128) {
                ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
                n -= 128;
                ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
                ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
                ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
                src = (const uint8_t *)src + 128;
                _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
                _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
                _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
                _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
                dst = (uint8_t *)dst + 128;
        }
}

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
        uintptr_t dstu = (uintptr_t)dst;
        uintptr_t srcu = (uintptr_t)src;
        void *ret = dst;
        size_t dstofss;
        size_t bits;

        /**
         * Copy less than 16 bytes
         */
        if (n < 16) {
                if (n & 0x01) {
                        *(uint8_t *)dstu = *(const uint8_t *)srcu;
                        srcu = (uintptr_t)((const uint8_t *)srcu + 1);
                        dstu = (uintptr_t)((uint8_t *)dstu + 1);
                }
                if (n & 0x02) {
                        *(uint16_t *)dstu = *(const uint16_t *)srcu;
                        srcu = (uintptr_t)((const uint16_t *)srcu + 1);
                        dstu = (uintptr_t)((uint16_t *)dstu + 1);
                }
                if (n & 0x04) {
                        *(uint32_t *)dstu = *(const uint32_t *)srcu;
                        srcu = (uintptr_t)((const uint32_t *)srcu + 1);
                        dstu = (uintptr_t)((uint32_t *)dstu + 1);
                }
                if (n & 0x08) {
                        *(uint64_t *)dstu = *(const uint64_t *)srcu;
                }
                return ret;
        }

        /**
         * Fast way when copy size doesn't exceed 256 bytes
         */
        if (n <= 32) {
                rte_mov16((uint8_t *)dst, (const uint8_t *)src);
                rte_mov16((uint8_t *)dst - 16 + n,
                                (const uint8_t *)src - 16 + n);
                return ret;
        }
        if (n <= 48) {
                rte_mov16((uint8_t *)dst, (const uint8_t *)src);
                rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
                rte_mov16((uint8_t *)dst - 16 + n,
                                (const uint8_t *)src - 16 + n);
                return ret;
        }
        if (n <= 64) {
                rte_mov32((uint8_t *)dst, (const uint8_t *)src);
                rte_mov32((uint8_t *)dst - 32 + n,
                                (const uint8_t *)src - 32 + n);
                return ret;
        }
        if (n <= 256) {
                if (n >= 128) {
                        n -= 128;
                        rte_mov128((uint8_t *)dst, (const uint8_t *)src);
                        src = (const uint8_t *)src + 128;
                        dst = (uint8_t *)dst + 128;
                }
COPY_BLOCK_128_BACK31:
                if (n >= 64) {
                        n -= 64;
                        rte_mov64((uint8_t *)dst, (const uint8_t *)src);
                        src = (const uint8_t *)src + 64;
                        dst = (uint8_t *)dst + 64;
                }
                if (n > 32) {
                        rte_mov32((uint8_t *)dst, (const uint8_t *)src);
                        rte_mov32((uint8_t *)dst - 32 + n,
                                        (const uint8_t *)src - 32 + n);
                        return ret;
                }
                if (n > 0) {
                        rte_mov32((uint8_t *)dst - 32 + n,
                                        (const uint8_t *)src - 32 + n);
                }
                return ret;
        }

        /**
         * Make store aligned when copy size exceeds 256 bytes
         */
        dstofss = (uintptr_t)dst & 0x1F;
        if (dstofss > 0) {
                dstofss = 32 - dstofss;
                n -= dstofss;
                rte_mov32((uint8_t *)dst, (const uint8_t *)src);
                src = (const uint8_t *)src + dstofss;
                dst = (uint8_t *)dst + dstofss;
        }

        /**
         * Copy 128-byte blocks
         */
        rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
        bits = n;
        n = n & 127;
        bits -= n;
        src = (const uint8_t *)src + bits;
        dst = (uint8_t *)dst + bits;

        /**
         * Copy whatever is left
         */
        goto COPY_BLOCK_128_BACK31;
}

#else /* RTE_MACHINE_CPUFLAG */

#define ALIGNMENT_MASK 0x0F

/**
 * SSE & AVX implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
        __m128i xmm0;

        xmm0 = _mm_loadu_si128((const __m128i *)src);
        _mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
        rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
        rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
        rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
        rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
        rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
        rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
        rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
        rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
        rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
        rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
        rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
        rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
        rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
        rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
        rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
        rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
        rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
        rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
        rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
        rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
        rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
        rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
        rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
        rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
        rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
        rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
        rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
        rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
        rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
        rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
}

/**
 * Macro for copying an unaligned block from one location to another with a constant load offset,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
__extension__ ({ \
        size_t tmp; \
        while (len >= 128 + 16 - offset) { \
                xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
                len -= 128; \
                xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
                xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
                xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16)); \
                xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16)); \
                xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16)); \
                xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16)); \
                xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16)); \
                xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16)); \
                src = (const uint8_t *)src + 128; \
                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
                dst = (uint8_t *)dst + 128; \
        } \
        tmp = len; \
        len = ((len - 16 + offset) & 127) + 16 - offset; \
        tmp -= len; \
        src = (const uint8_t *)src + tmp; \
        dst = (uint8_t *)dst + tmp; \
        if (len >= 32 + 16 - offset) { \
                while (len >= 32 + 16 - offset) { \
                        xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
                        len -= 32; \
                        xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
                        xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
                        src = (const uint8_t *)src + 32; \
                        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
                        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
                        dst = (uint8_t *)dst + 32; \
                } \
                tmp = len; \
                len = ((len - 16 + offset) & 31) + 16 - offset; \
                tmp -= len; \
                src = (const uint8_t *)src + tmp; \
                dst = (uint8_t *)dst + tmp; \
        } \
})
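
/*
 * Worked example (editorial note) for MOVEUNALIGNED_LEFT47_IMM with
 * offset = 5: xmm0 is loaded from src - 5 (bytes src[-5..10]) and xmm1 from
 * src + 11 (bytes src[11..26]). _mm_alignr_epi8(xmm1, xmm0, 5) shifts the
 * 32-byte concatenation right by 5 bytes, discarding the 5 stale lead bytes
 * of xmm0 and appending the first 5 bytes of xmm1, which yields exactly
 * src[0..15] for the first aligned 16-byte store.
 */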

/**
 * Macro for copying an unaligned block from one location to another,
 * 47 bytes leftover maximum,
 * locations should not overlap.
 * Use a switch here because the aligning instruction requires an immediate
 * value for the shift count.
 * Requirements:
 * - Store is aligned
 * - Load offset is <offset>, which must be within [1, 15]
 * - For <src>, make sure <offset> bytes backwards & <16 - offset> bytes forwards are available for loading
 * - <dst>, <src>, <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined
 */
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
__extension__ ({ \
        switch (offset) { \
        case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \
        case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \
        case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \
        case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break; \
        case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break; \
        case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break; \
        case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break; \
        case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \
        case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \
        case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \
        case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break; \
        case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break; \
        case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break; \
        case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break; \
        case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \
        default:; \
        } \
})
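
/*
 * Editorial note: srcofs is only known at run time, while _mm_alignr_epi8()
 * needs a compile-time immediate for its shift count, so the switch above
 * selects the matching MOVEUNALIGNED_LEFT47_IMM() expansion. With
 * srcofs = 3, for example, the case taken expands to
 * MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03). Note that the case arms
 * forward the enclosing function's byte count n, which is also what the
 * single call site below passes as len.
 */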

static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
        uintptr_t dstu = (uintptr_t)dst;
        uintptr_t srcu = (uintptr_t)src;
        void *ret = dst;
        size_t dstofss;
        size_t srcofs;

        /**
         * Copy less than 16 bytes
         */
        if (n < 16) {
                if (n & 0x01) {
                        *(uint8_t *)dstu = *(const uint8_t *)srcu;
                        srcu = (uintptr_t)((const uint8_t *)srcu + 1);
                        dstu = (uintptr_t)((uint8_t *)dstu + 1);
                }
                if (n & 0x02) {
                        *(uint16_t *)dstu = *(const uint16_t *)srcu;
                        srcu = (uintptr_t)((const uint16_t *)srcu + 1);
                        dstu = (uintptr_t)((uint16_t *)dstu + 1);
                }
                if (n & 0x04) {
                        *(uint32_t *)dstu = *(const uint32_t *)srcu;
                        srcu = (uintptr_t)((const uint32_t *)srcu + 1);
                        dstu = (uintptr_t)((uint32_t *)dstu + 1);
                }
                if (n & 0x08) {
                        *(uint64_t *)dstu = *(const uint64_t *)srcu;
                }
                return ret;
        }

        /**
         * Fast way when copy size doesn't exceed 512 bytes
         */
        if (n <= 32) {
                rte_mov16((uint8_t *)dst, (const uint8_t *)src);
                rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
                return ret;
        }
        if (n <= 48) {
                rte_mov32((uint8_t *)dst, (const uint8_t *)src);
                rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
                return ret;
        }
        if (n <= 64) {
                rte_mov32((uint8_t *)dst, (const uint8_t *)src);
                rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
                rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
                return ret;
        }
        if (n <= 128) {
                goto COPY_BLOCK_128_BACK15;
        }
        if (n <= 512) {
                if (n >= 256) {
                        n -= 256;
                        rte_mov128((uint8_t *)dst, (const uint8_t *)src);
                        rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
                        src = (const uint8_t *)src + 256;
                        dst = (uint8_t *)dst + 256;
                }
COPY_BLOCK_255_BACK15:
                if (n >= 128) {
                        n -= 128;
                        rte_mov128((uint8_t *)dst, (const uint8_t *)src);
                        src = (const uint8_t *)src + 128;
                        dst = (uint8_t *)dst + 128;
                }
COPY_BLOCK_128_BACK15:
                if (n >= 64) {
                        n -= 64;
                        rte_mov64((uint8_t *)dst, (const uint8_t *)src);
                        src = (const uint8_t *)src + 64;
                        dst = (uint8_t *)dst + 64;
                }
COPY_BLOCK_64_BACK15:
                if (n >= 32) {
                        n -= 32;
                        rte_mov32((uint8_t *)dst, (const uint8_t *)src);
                        src = (const uint8_t *)src + 32;
                        dst = (uint8_t *)dst + 32;
                }
                if (n > 16) {
                        rte_mov16((uint8_t *)dst, (const uint8_t *)src);
                        rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
                        return ret;
                }
                if (n > 0) {
                        rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
                }
                return ret;
        }

        /**
         * Make store aligned when copy size exceeds 512 bytes,
         * and make sure the first 15 bytes are copied, because
         * unaligned copy functions require up to 15 bytes
         * backwards access.
         */
        dstofss = (uintptr_t)dst & 0x0F;
        if (dstofss > 0) {
                dstofss = 16 - dstofss + 16;
                n -= dstofss;
                rte_mov32((uint8_t *)dst, (const uint8_t *)src);
                src = (const uint8_t *)src + dstofss;
                dst = (uint8_t *)dst + dstofss;
        }
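
        /*
         * Worked example (editorial note): if dst is misaligned by 9 bytes,
         * dstofss = 16 - 9 + 16 = 23. Advancing both pointers by 23 leaves
         * dst 16-byte aligned, and because at least 17 bytes have now been
         * copied, the unaligned loads below may safely read up to 15 bytes
         * behind the advanced src without touching memory before the
         * caller's buffer.
         */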

        srcofs = ((uintptr_t)src & 0x0F);

        /**
         * For aligned copy
         */
        if (srcofs == 0) {
                /**
                 * Copy 256-byte blocks
                 */
                for (; n >= 256; n -= 256) {
                        rte_mov256((uint8_t *)dst, (const uint8_t *)src);
                        dst = (uint8_t *)dst + 256;
                        src = (const uint8_t *)src + 256;
                }

                /**
                 * Copy whatever is left
                 */
                goto COPY_BLOCK_255_BACK15;
        }

        /**
         * For copy with unaligned load
         */
        MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);

        /**
         * Copy whatever is left
         */
        goto COPY_BLOCK_64_BACK15;
}

#endif /* RTE_MACHINE_CPUFLAG */

static __rte_always_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
        void *ret = dst;

        /* Copy size < 16 bytes */
        if (n < 16) {
                if (n & 0x01) {
                        *(uint8_t *)dst = *(const uint8_t *)src;
                        src = (const uint8_t *)src + 1;
                        dst = (uint8_t *)dst + 1;
                }
                if (n & 0x02) {
                        *(uint16_t *)dst = *(const uint16_t *)src;
                        src = (const uint16_t *)src + 1;
                        dst = (uint16_t *)dst + 1;
                }
                if (n & 0x04) {
                        *(uint32_t *)dst = *(const uint32_t *)src;
                        src = (const uint32_t *)src + 1;
                        dst = (uint32_t *)dst + 1;
                }
                if (n & 0x08)
                        *(uint64_t *)dst = *(const uint64_t *)src;

                return ret;
        }

        /* Copy 16 <= size <= 32 bytes */
        if (n <= 32) {
                rte_mov16((uint8_t *)dst, (const uint8_t *)src);
                rte_mov16((uint8_t *)dst - 16 + n,
                                (const uint8_t *)src - 16 + n);

                return ret;
        }

        /* Copy 32 < size <= 64 bytes */
        if (n <= 64) {
                rte_mov32((uint8_t *)dst, (const uint8_t *)src);
                rte_mov32((uint8_t *)dst - 32 + n,
                                (const uint8_t *)src - 32 + n);

                return ret;
        }

        /* Copy 64-byte blocks */
        for (; n >= 64; n -= 64) {
                rte_mov64((uint8_t *)dst, (const uint8_t *)src);
                dst = (uint8_t *)dst + 64;
                src = (const uint8_t *)src + 64;
        }

        /* Copy whatever is left */
        rte_mov64((uint8_t *)dst - 64 + n,
                        (const uint8_t *)src - 64 + n);

        return ret;
}

static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
        if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
                return rte_memcpy_aligned(dst, src, n);
        else
                return rte_memcpy_generic(dst, src, n);
}
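
/*
 * Editorial note: ALIGNMENT_MASK matches the widest vector store of the
 * selected implementation (0x3F for AVX512, 0x1F for AVX2, 0x0F for SSE).
 * ORing the two addresses lets a single AND test both pointers at once;
 * e.g. on an AVX2 build, rte_memcpy() takes the rte_memcpy_aligned() path
 * only when dst and src are both 32-byte aligned.
 */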

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_X86_64_H_ */