/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#ifndef _RTE_MEMCPY_X86_64_H_
#define _RTE_MEMCPY_X86_64_H_

/**
 * @file
 *
 * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
 */

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <rte_vect.h>
#include <rte_common.h>
#include <rte_config.h>

#ifdef __cplusplus
extern "C" {
#endif

#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif

/**
 * Copy bytes from one location to another. The locations must not overlap.
 *
 * @note This is implemented as a macro, so its address should not be taken,
 * and care is needed as parameter expressions may be evaluated multiple
 * times.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Pointer to the destination data.
 */
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
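
/*
 * Usage sketch (illustrative; the buffer names are hypothetical):
 *
 *	uint8_t hdr_copy[64];
 *	rte_memcpy(hdr_copy, pkt_hdr, sizeof(hdr_copy));
 *
 * rte_memcpy() is a drop-in replacement for memcpy() on non-overlapping
 * buffers and returns dst, so calls can be chained like memcpy().
 */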

#ifdef RTE_MACHINE_CPUFLAG_AVX512F

#define ALIGNMENT_MASK 0x3F

/**
 * AVX512 implementation below
 */

/**
 * Copy 16 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0;

	xmm0 = _mm_loadu_si128((const __m128i *)src);
	_mm_storeu_si128((__m128i *)dst, xmm0);
}

/**
 * Copy 32 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)src);
	_mm256_storeu_si256((__m256i *)dst, ymm0);
}

/**
 * Copy 64 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	__m512i zmm0;

	zmm0 = _mm512_loadu_si512((const void *)src);
	_mm512_storeu_si512((void *)dst, zmm0);
}

/**
 * Copy 128 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
}

/**
 * Copy 256 bytes from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov64(dst + 0 * 64, src + 0 * 64);
	rte_mov64(dst + 1 * 64, src + 1 * 64);
	rte_mov64(dst + 2 * 64, src + 2 * 64);
	rte_mov64(dst + 3 * 64, src + 3 * 64);
}

/**
 * Copy 128-byte blocks from one location to another,
 * locations should not overlap.
 */
static __rte_always_inline void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1;

	while (n >= 128) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 128;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		src = src + 128;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		dst = dst + 128;
	}
}

/**
 * Copy 512-byte blocks from one location to another,
 * locations should not overlap.
 */
static inline void
rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;

	while (n >= 512) {
		zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
		n -= 512;
		zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
		zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
		zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
		zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
		zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
		zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
		zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
		src = src + 512;
		_mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
		_mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
		_mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
		_mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
		_mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
		_mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
		_mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
		_mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
		dst = dst + 512;
	}
}
170 | ||
f67539c2 | 171 | static __rte_always_inline void * |
11fdf7f2 | 172 | rte_memcpy_generic(void *dst, const void *src, size_t n) |
7c673cae FG |
173 | { |
174 | uintptr_t dstu = (uintptr_t)dst; | |
175 | uintptr_t srcu = (uintptr_t)src; | |
176 | void *ret = dst; | |
177 | size_t dstofss; | |
178 | size_t bits; | |
179 | ||
180 | /** | |
181 | * Copy less than 16 bytes | |
182 | */ | |
183 | if (n < 16) { | |
184 | if (n & 0x01) { | |
185 | *(uint8_t *)dstu = *(const uint8_t *)srcu; | |
186 | srcu = (uintptr_t)((const uint8_t *)srcu + 1); | |
187 | dstu = (uintptr_t)((uint8_t *)dstu + 1); | |
188 | } | |
189 | if (n & 0x02) { | |
190 | *(uint16_t *)dstu = *(const uint16_t *)srcu; | |
191 | srcu = (uintptr_t)((const uint16_t *)srcu + 1); | |
192 | dstu = (uintptr_t)((uint16_t *)dstu + 1); | |
193 | } | |
194 | if (n & 0x04) { | |
195 | *(uint32_t *)dstu = *(const uint32_t *)srcu; | |
196 | srcu = (uintptr_t)((const uint32_t *)srcu + 1); | |
197 | dstu = (uintptr_t)((uint32_t *)dstu + 1); | |
198 | } | |
199 | if (n & 0x08) | |
200 | *(uint64_t *)dstu = *(const uint64_t *)srcu; | |
201 | return ret; | |
202 | } | |
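
	/*
	 * Illustration: n is decomposed into its set bits, so e.g. n = 13
	 * (binary 1101) is copied as 1 + 4 + 8 bytes with three scalar
	 * stores and no loop.
	 */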
203 | ||
204 | /** | |
205 | * Fast way when copy size doesn't exceed 512 bytes | |
206 | */ | |
207 | if (n <= 32) { | |
208 | rte_mov16((uint8_t *)dst, (const uint8_t *)src); | |
209 | rte_mov16((uint8_t *)dst - 16 + n, | |
210 | (const uint8_t *)src - 16 + n); | |
211 | return ret; | |
212 | } | |
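
	/*
	 * Illustration: the two 16-byte moves above cover [0, 16) and
	 * [n - 16, n); for n = 20 they overlap on bytes [4, 16), which is
	 * harmless since the tail move rewrites those bytes with the same
	 * source data.
	 */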
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);
		return ret;
	}
	if (n <= 512) {
		if (n >= 256) {
			n -= 256;
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 256;
			dst = (uint8_t *)dst + 256;
		}
		if (n >= 128) {
			n -= 128;
			rte_mov128((uint8_t *)dst, (const uint8_t *)src);
			src = (const uint8_t *)src + 128;
			dst = (uint8_t *)dst + 128;
		}
COPY_BLOCK_128_BACK63:
		if (n > 64) {
			rte_mov64((uint8_t *)dst, (const uint8_t *)src);
			rte_mov64((uint8_t *)dst - 64 + n,
					(const uint8_t *)src - 64 + n);
			return ret;
		}
		if (n > 0)
			rte_mov64((uint8_t *)dst - 64 + n,
					(const uint8_t *)src - 64 + n);
		return ret;
	}

	/**
	 * Make store aligned when copy size exceeds 512 bytes
	 */
	dstofss = ((uintptr_t)dst & 0x3F);
	if (dstofss > 0) {
		dstofss = 64 - dstofss;
		n -= dstofss;
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		src = (const uint8_t *)src + dstofss;
		dst = (uint8_t *)dst + dstofss;
	}
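
	/*
	 * Illustration: for dst = ...0x1028, dstofss = 64 - 0x28 = 24; a
	 * full 64-byte chunk is stored but the pointers advance only 24
	 * bytes, so dst lands on a 64-byte boundary and every following
	 * store is aligned.
	 */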
256 | ||
257 | /** | |
258 | * Copy 512-byte blocks. | |
259 | * Use copy block function for better instruction order control, | |
260 | * which is important when load is unaligned. | |
261 | */ | |
262 | rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n); | |
263 | bits = n; | |
264 | n = n & 511; | |
265 | bits -= n; | |
266 | src = (const uint8_t *)src + bits; | |
267 | dst = (uint8_t *)dst + bits; | |
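
	/*
	 * Illustration: n & 511 keeps only the sub-block remainder; for
	 * n = 1300 the loop above copies 1024 bytes (bits), leaving
	 * n = 276 for the 128-byte pass below.
	 */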
268 | ||
269 | /** | |
270 | * Copy 128-byte blocks. | |
271 | * Use copy block function for better instruction order control, | |
272 | * which is important when load is unaligned. | |
273 | */ | |
274 | if (n >= 128) { | |
275 | rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n); | |
276 | bits = n; | |
277 | n = n & 127; | |
278 | bits -= n; | |
279 | src = (const uint8_t *)src + bits; | |
280 | dst = (uint8_t *)dst + bits; | |
281 | } | |
282 | ||
283 | /** | |
284 | * Copy whatever left | |
285 | */ | |
286 | goto COPY_BLOCK_128_BACK63; | |
287 | } | |
288 | ||
289 | #elif defined RTE_MACHINE_CPUFLAG_AVX2 | |
290 | ||
11fdf7f2 TL |
291 | #define ALIGNMENT_MASK 0x1F |
292 | ||
7c673cae FG |
293 | /** |
294 | * AVX2 implementation below | |
295 | */ | |
296 | ||
297 | /** | |
298 | * Copy 16 bytes from one location to another, | |
299 | * locations should not overlap. | |
300 | */ | |
11fdf7f2 | 301 | static __rte_always_inline void |
7c673cae FG |
302 | rte_mov16(uint8_t *dst, const uint8_t *src) |
303 | { | |
304 | __m128i xmm0; | |
305 | ||
306 | xmm0 = _mm_loadu_si128((const __m128i *)src); | |
307 | _mm_storeu_si128((__m128i *)dst, xmm0); | |
308 | } | |
309 | ||
310 | /** | |
311 | * Copy 32 bytes from one location to another, | |
312 | * locations should not overlap. | |
313 | */ | |
11fdf7f2 | 314 | static __rte_always_inline void |
7c673cae FG |
315 | rte_mov32(uint8_t *dst, const uint8_t *src) |
316 | { | |
317 | __m256i ymm0; | |
318 | ||
319 | ymm0 = _mm256_loadu_si256((const __m256i *)src); | |
320 | _mm256_storeu_si256((__m256i *)dst, ymm0); | |
321 | } | |
322 | ||
323 | /** | |
324 | * Copy 64 bytes from one location to another, | |
325 | * locations should not overlap. | |
326 | */ | |
11fdf7f2 | 327 | static __rte_always_inline void |
7c673cae FG |
328 | rte_mov64(uint8_t *dst, const uint8_t *src) |
329 | { | |
330 | rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); | |
331 | rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); | |
332 | } | |
333 | ||
334 | /** | |
335 | * Copy 128 bytes from one location to another, | |
336 | * locations should not overlap. | |
337 | */ | |
f67539c2 | 338 | static __rte_always_inline void |
7c673cae FG |
339 | rte_mov128(uint8_t *dst, const uint8_t *src) |
340 | { | |
341 | rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); | |
342 | rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); | |
343 | rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); | |
344 | rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); | |
345 | } | |
346 | ||
347 | /** | |
348 | * Copy 128-byte blocks from one location to another, | |
349 | * locations should not overlap. | |
350 | */ | |
f67539c2 | 351 | static __rte_always_inline void |
7c673cae FG |
352 | rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) |
353 | { | |
354 | __m256i ymm0, ymm1, ymm2, ymm3; | |
355 | ||
356 | while (n >= 128) { | |
357 | ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32)); | |
358 | n -= 128; | |
359 | ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32)); | |
360 | ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32)); | |
361 | ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32)); | |
362 | src = (const uint8_t *)src + 128; | |
363 | _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0); | |
364 | _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1); | |
365 | _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2); | |
366 | _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3); | |
367 | dst = (uint8_t *)dst + 128; | |
368 | } | |
369 | } | |
370 | ||
f67539c2 | 371 | static __rte_always_inline void * |
11fdf7f2 | 372 | rte_memcpy_generic(void *dst, const void *src, size_t n) |
7c673cae FG |
373 | { |
374 | uintptr_t dstu = (uintptr_t)dst; | |
375 | uintptr_t srcu = (uintptr_t)src; | |
376 | void *ret = dst; | |
377 | size_t dstofss; | |
378 | size_t bits; | |
379 | ||
380 | /** | |
381 | * Copy less than 16 bytes | |
382 | */ | |
383 | if (n < 16) { | |
384 | if (n & 0x01) { | |
385 | *(uint8_t *)dstu = *(const uint8_t *)srcu; | |
386 | srcu = (uintptr_t)((const uint8_t *)srcu + 1); | |
387 | dstu = (uintptr_t)((uint8_t *)dstu + 1); | |
388 | } | |
389 | if (n & 0x02) { | |
390 | *(uint16_t *)dstu = *(const uint16_t *)srcu; | |
391 | srcu = (uintptr_t)((const uint16_t *)srcu + 1); | |
392 | dstu = (uintptr_t)((uint16_t *)dstu + 1); | |
393 | } | |
394 | if (n & 0x04) { | |
395 | *(uint32_t *)dstu = *(const uint32_t *)srcu; | |
396 | srcu = (uintptr_t)((const uint32_t *)srcu + 1); | |
397 | dstu = (uintptr_t)((uint32_t *)dstu + 1); | |
398 | } | |
399 | if (n & 0x08) { | |
400 | *(uint64_t *)dstu = *(const uint64_t *)srcu; | |
401 | } | |
402 | return ret; | |
403 | } | |
404 | ||
405 | /** | |
406 | * Fast way when copy size doesn't exceed 256 bytes | |
407 | */ | |
408 | if (n <= 32) { | |
409 | rte_mov16((uint8_t *)dst, (const uint8_t *)src); | |
410 | rte_mov16((uint8_t *)dst - 16 + n, | |
411 | (const uint8_t *)src - 16 + n); | |
412 | return ret; | |
413 | } | |
414 | if (n <= 48) { | |
415 | rte_mov16((uint8_t *)dst, (const uint8_t *)src); | |
416 | rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); | |
417 | rte_mov16((uint8_t *)dst - 16 + n, | |
418 | (const uint8_t *)src - 16 + n); | |
419 | return ret; | |
420 | } | |
421 | if (n <= 64) { | |
422 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |
423 | rte_mov32((uint8_t *)dst - 32 + n, | |
424 | (const uint8_t *)src - 32 + n); | |
425 | return ret; | |
426 | } | |
427 | if (n <= 256) { | |
428 | if (n >= 128) { | |
429 | n -= 128; | |
430 | rte_mov128((uint8_t *)dst, (const uint8_t *)src); | |
431 | src = (const uint8_t *)src + 128; | |
432 | dst = (uint8_t *)dst + 128; | |
433 | } | |
434 | COPY_BLOCK_128_BACK31: | |
435 | if (n >= 64) { | |
436 | n -= 64; | |
437 | rte_mov64((uint8_t *)dst, (const uint8_t *)src); | |
438 | src = (const uint8_t *)src + 64; | |
439 | dst = (uint8_t *)dst + 64; | |
440 | } | |
441 | if (n > 32) { | |
442 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |
443 | rte_mov32((uint8_t *)dst - 32 + n, | |
444 | (const uint8_t *)src - 32 + n); | |
445 | return ret; | |
446 | } | |
447 | if (n > 0) { | |
448 | rte_mov32((uint8_t *)dst - 32 + n, | |
449 | (const uint8_t *)src - 32 + n); | |
450 | } | |
451 | return ret; | |
452 | } | |
453 | ||
454 | /** | |
455 | * Make store aligned when copy size exceeds 256 bytes | |
456 | */ | |
457 | dstofss = (uintptr_t)dst & 0x1F; | |
458 | if (dstofss > 0) { | |
459 | dstofss = 32 - dstofss; | |
460 | n -= dstofss; | |
461 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |
462 | src = (const uint8_t *)src + dstofss; | |
463 | dst = (uint8_t *)dst + dstofss; | |
464 | } | |
465 | ||
466 | /** | |
467 | * Copy 128-byte blocks | |
468 | */ | |
469 | rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n); | |
470 | bits = n; | |
471 | n = n & 127; | |
472 | bits -= n; | |
473 | src = (const uint8_t *)src + bits; | |
474 | dst = (uint8_t *)dst + bits; | |
475 | ||
476 | /** | |
477 | * Copy whatever left | |
478 | */ | |
479 | goto COPY_BLOCK_128_BACK31; | |
480 | } | |
481 | ||
482 | #else /* RTE_MACHINE_CPUFLAG */ | |
483 | ||
11fdf7f2 TL |
484 | #define ALIGNMENT_MASK 0x0F |
485 | ||
7c673cae FG |
486 | /** |
487 | * SSE & AVX implementation below | |
488 | */ | |
489 | ||
490 | /** | |
491 | * Copy 16 bytes from one location to another, | |
492 | * locations should not overlap. | |
493 | */ | |
11fdf7f2 | 494 | static __rte_always_inline void |
7c673cae FG |
495 | rte_mov16(uint8_t *dst, const uint8_t *src) |
496 | { | |
497 | __m128i xmm0; | |
498 | ||
499 | xmm0 = _mm_loadu_si128((const __m128i *)(const __m128i *)src); | |
500 | _mm_storeu_si128((__m128i *)dst, xmm0); | |
501 | } | |
502 | ||
503 | /** | |
504 | * Copy 32 bytes from one location to another, | |
505 | * locations should not overlap. | |
506 | */ | |
11fdf7f2 | 507 | static __rte_always_inline void |
7c673cae FG |
508 | rte_mov32(uint8_t *dst, const uint8_t *src) |
509 | { | |
510 | rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); | |
511 | rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); | |
512 | } | |
513 | ||
514 | /** | |
515 | * Copy 64 bytes from one location to another, | |
516 | * locations should not overlap. | |
517 | */ | |
11fdf7f2 | 518 | static __rte_always_inline void |
7c673cae FG |
519 | rte_mov64(uint8_t *dst, const uint8_t *src) |
520 | { | |
521 | rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); | |
522 | rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); | |
523 | rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); | |
524 | rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); | |
525 | } | |
526 | ||
527 | /** | |
528 | * Copy 128 bytes from one location to another, | |
529 | * locations should not overlap. | |
530 | */ | |
f67539c2 | 531 | static __rte_always_inline void |
7c673cae FG |
532 | rte_mov128(uint8_t *dst, const uint8_t *src) |
533 | { | |
534 | rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); | |
535 | rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); | |
536 | rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); | |
537 | rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); | |
538 | rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); | |
539 | rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); | |
540 | rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); | |
541 | rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); | |
542 | } | |
543 | ||
544 | /** | |
545 | * Copy 256 bytes from one location to another, | |
546 | * locations should not overlap. | |
547 | */ | |
548 | static inline void | |
549 | rte_mov256(uint8_t *dst, const uint8_t *src) | |
550 | { | |
551 | rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); | |
552 | rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); | |
553 | rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); | |
554 | rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); | |
555 | rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); | |
556 | rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); | |
557 | rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); | |
558 | rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); | |
559 | rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); | |
560 | rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); | |
561 | rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); | |
562 | rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); | |
563 | rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); | |
564 | rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); | |
565 | rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); | |
566 | rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); | |
567 | } | |
568 | ||
569 | /** | |
570 | * Macro for copying unaligned block from one location to another with constant load offset, | |
571 | * 47 bytes leftover maximum, | |
572 | * locations should not overlap. | |
573 | * Requirements: | |
574 | * - Store is aligned | |
575 | * - Load offset is <offset>, which must be immediate value within [1, 15] | |
576 | * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading | |
577 | * - <dst>, <src>, <len> must be variables | |
578 | * - __m128i <xmm0> ~ <xmm8> must be pre-defined | |
579 | */ | |
#define MOVEUNALIGNED_LEFT47_IMM(dst, src, len, offset) \
__extension__ ({ \
	size_t tmp; \
	while (len >= 128 + 16 - offset) { \
		xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
		len -= 128; \
		xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
		xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
		xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16)); \
		xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16)); \
		xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16)); \
		xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16)); \
		xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16)); \
		xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16)); \
		src = (const uint8_t *)src + 128; \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
		_mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
		dst = (uint8_t *)dst + 128; \
	} \
	tmp = len; \
	len = ((len - 16 + offset) & 127) + 16 - offset; \
	tmp -= len; \
	src = (const uint8_t *)src + tmp; \
	dst = (uint8_t *)dst + tmp; \
	if (len >= 32 + 16 - offset) { \
		while (len >= 32 + 16 - offset) { \
			xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
			len -= 32; \
			xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
			xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
			src = (const uint8_t *)src + 32; \
			_mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
			_mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
			dst = (uint8_t *)dst + 32; \
		} \
		tmp = len; \
		len = ((len - 16 + offset) & 31) + 16 - offset; \
		tmp -= len; \
		src = (const uint8_t *)src + tmp; \
		dst = (uint8_t *)dst + tmp; \
	} \
})
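
/*
 * Illustration: the loads are taken from (src - offset), which is 16-byte
 * aligned after the offset adjustment, and each store is rebuilt with
 * _mm_alignr_epi8(). For offset = 3, _mm_alignr_epi8(xmm1, xmm0, 3)
 * yields the top 13 bytes of xmm0 followed by the low 3 bytes of xmm1,
 * i.e. exactly src[0..15], so loads and stores both run at aligned
 * addresses.
 */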
628 | ||
629 | /** | |
630 | * Macro for copying unaligned block from one location to another, | |
631 | * 47 bytes leftover maximum, | |
632 | * locations should not overlap. | |
633 | * Use switch here because the aligning instruction requires immediate value for shift count. | |
634 | * Requirements: | |
635 | * - Store is aligned | |
636 | * - Load offset is <offset>, which must be within [1, 15] | |
637 | * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading | |
638 | * - <dst>, <src>, <len> must be variables | |
639 | * - __m128i <xmm0> ~ <xmm8> used in MOVEUNALIGNED_LEFT47_IMM must be pre-defined | |
640 | */ | |
#define MOVEUNALIGNED_LEFT47(dst, src, len, offset) \
__extension__ ({ \
	switch (offset) { \
	case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x01); break; \
	case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x02); break; \
	case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x03); break; \
	case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x04); break; \
	case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x05); break; \
	case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x06); break; \
	case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x07); break; \
	case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x08); break; \
	case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x09); break; \
	case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0A); break; \
	case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0B); break; \
	case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0C); break; \
	case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0D); break; \
	case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0E); break; \
	case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, len, 0x0F); break; \
	default:; \
	} \
})
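
/*
 * Illustration: _mm_alignr_epi8() encodes its byte-shift count as an
 * immediate operand in the instruction, so a runtime offset cannot be
 * passed through directly; the switch above instantiates one copy of the
 * loop per possible offset and dispatches to the matching one.
 */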
662 | ||
f67539c2 | 663 | static __rte_always_inline void * |
11fdf7f2 | 664 | rte_memcpy_generic(void *dst, const void *src, size_t n) |
7c673cae FG |
665 | { |
666 | __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; | |
667 | uintptr_t dstu = (uintptr_t)dst; | |
668 | uintptr_t srcu = (uintptr_t)src; | |
669 | void *ret = dst; | |
670 | size_t dstofss; | |
671 | size_t srcofs; | |
672 | ||
673 | /** | |
674 | * Copy less than 16 bytes | |
675 | */ | |
676 | if (n < 16) { | |
677 | if (n & 0x01) { | |
678 | *(uint8_t *)dstu = *(const uint8_t *)srcu; | |
679 | srcu = (uintptr_t)((const uint8_t *)srcu + 1); | |
680 | dstu = (uintptr_t)((uint8_t *)dstu + 1); | |
681 | } | |
682 | if (n & 0x02) { | |
683 | *(uint16_t *)dstu = *(const uint16_t *)srcu; | |
684 | srcu = (uintptr_t)((const uint16_t *)srcu + 1); | |
685 | dstu = (uintptr_t)((uint16_t *)dstu + 1); | |
686 | } | |
687 | if (n & 0x04) { | |
688 | *(uint32_t *)dstu = *(const uint32_t *)srcu; | |
689 | srcu = (uintptr_t)((const uint32_t *)srcu + 1); | |
690 | dstu = (uintptr_t)((uint32_t *)dstu + 1); | |
691 | } | |
692 | if (n & 0x08) { | |
693 | *(uint64_t *)dstu = *(const uint64_t *)srcu; | |
694 | } | |
695 | return ret; | |
696 | } | |
697 | ||
698 | /** | |
699 | * Fast way when copy size doesn't exceed 512 bytes | |
700 | */ | |
701 | if (n <= 32) { | |
702 | rte_mov16((uint8_t *)dst, (const uint8_t *)src); | |
703 | rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); | |
704 | return ret; | |
705 | } | |
706 | if (n <= 48) { | |
707 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |
708 | rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); | |
709 | return ret; | |
710 | } | |
711 | if (n <= 64) { | |
712 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |
713 | rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); | |
714 | rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); | |
715 | return ret; | |
716 | } | |
717 | if (n <= 128) { | |
718 | goto COPY_BLOCK_128_BACK15; | |
719 | } | |
720 | if (n <= 512) { | |
721 | if (n >= 256) { | |
722 | n -= 256; | |
723 | rte_mov128((uint8_t *)dst, (const uint8_t *)src); | |
724 | rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128); | |
725 | src = (const uint8_t *)src + 256; | |
726 | dst = (uint8_t *)dst + 256; | |
727 | } | |
728 | COPY_BLOCK_255_BACK15: | |
729 | if (n >= 128) { | |
730 | n -= 128; | |
731 | rte_mov128((uint8_t *)dst, (const uint8_t *)src); | |
732 | src = (const uint8_t *)src + 128; | |
733 | dst = (uint8_t *)dst + 128; | |
734 | } | |
735 | COPY_BLOCK_128_BACK15: | |
736 | if (n >= 64) { | |
737 | n -= 64; | |
738 | rte_mov64((uint8_t *)dst, (const uint8_t *)src); | |
739 | src = (const uint8_t *)src + 64; | |
740 | dst = (uint8_t *)dst + 64; | |
741 | } | |
742 | COPY_BLOCK_64_BACK15: | |
743 | if (n >= 32) { | |
744 | n -= 32; | |
745 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |
746 | src = (const uint8_t *)src + 32; | |
747 | dst = (uint8_t *)dst + 32; | |
748 | } | |
749 | if (n > 16) { | |
750 | rte_mov16((uint8_t *)dst, (const uint8_t *)src); | |
751 | rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); | |
752 | return ret; | |
753 | } | |
754 | if (n > 0) { | |
755 | rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); | |
756 | } | |
757 | return ret; | |
758 | } | |
759 | ||
760 | /** | |
761 | * Make store aligned when copy size exceeds 512 bytes, | |
762 | * and make sure the first 15 bytes are copied, because | |
763 | * unaligned copy functions require up to 15 bytes | |
764 | * backwards access. | |
765 | */ | |
766 | dstofss = (uintptr_t)dst & 0x0F; | |
767 | if (dstofss > 0) { | |
768 | dstofss = 16 - dstofss + 16; | |
769 | n -= dstofss; | |
770 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |
771 | src = (const uint8_t *)src + dstofss; | |
772 | dst = (uint8_t *)dst + dstofss; | |
773 | } | |
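
	/*
	 * Illustration: for dst ending in 0x0B, dstofss = 16 - 11 + 16 = 21,
	 * so 32 bytes are stored but the pointers advance only 21; dst is
	 * then 16-byte aligned and more than 15 bytes already lie behind
	 * src, which the backward loads of MOVEUNALIGNED_LEFT47() rely on.
	 */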
	srcofs = ((uintptr_t)src & 0x0F);

	/**
	 * For aligned copy
	 */
	if (srcofs == 0) {
		/**
		 * Copy 256-byte blocks
		 */
		for (; n >= 256; n -= 256) {
			rte_mov256((uint8_t *)dst, (const uint8_t *)src);
			dst = (uint8_t *)dst + 256;
			src = (const uint8_t *)src + 256;
		}

		/**
		 * Copy whatever is left
		 */
		goto COPY_BLOCK_255_BACK15;
	}
794 | ||
795 | /** | |
796 | * For copy with unaligned load | |
797 | */ | |
798 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | |
799 | ||
800 | /** | |
801 | * Copy whatever left | |
802 | */ | |
803 | goto COPY_BLOCK_64_BACK15; | |
804 | } | |
805 | ||
806 | #endif /* RTE_MACHINE_CPUFLAG */ | |
807 | ||
f67539c2 | 808 | static __rte_always_inline void * |
11fdf7f2 TL |
809 | rte_memcpy_aligned(void *dst, const void *src, size_t n) |
810 | { | |
811 | void *ret = dst; | |
812 | ||
813 | /* Copy size <= 16 bytes */ | |
814 | if (n < 16) { | |
815 | if (n & 0x01) { | |
816 | *(uint8_t *)dst = *(const uint8_t *)src; | |
817 | src = (const uint8_t *)src + 1; | |
818 | dst = (uint8_t *)dst + 1; | |
819 | } | |
820 | if (n & 0x02) { | |
821 | *(uint16_t *)dst = *(const uint16_t *)src; | |
822 | src = (const uint16_t *)src + 1; | |
823 | dst = (uint16_t *)dst + 1; | |
824 | } | |
825 | if (n & 0x04) { | |
826 | *(uint32_t *)dst = *(const uint32_t *)src; | |
827 | src = (const uint32_t *)src + 1; | |
828 | dst = (uint32_t *)dst + 1; | |
829 | } | |
830 | if (n & 0x08) | |
831 | *(uint64_t *)dst = *(const uint64_t *)src; | |
832 | ||
833 | return ret; | |
834 | } | |
835 | ||
836 | /* Copy 16 <= size <= 32 bytes */ | |
837 | if (n <= 32) { | |
838 | rte_mov16((uint8_t *)dst, (const uint8_t *)src); | |
839 | rte_mov16((uint8_t *)dst - 16 + n, | |
840 | (const uint8_t *)src - 16 + n); | |
841 | ||
842 | return ret; | |
843 | } | |
844 | ||
845 | /* Copy 32 < size <= 64 bytes */ | |
846 | if (n <= 64) { | |
847 | rte_mov32((uint8_t *)dst, (const uint8_t *)src); | |
848 | rte_mov32((uint8_t *)dst - 32 + n, | |
849 | (const uint8_t *)src - 32 + n); | |
850 | ||
851 | return ret; | |
852 | } | |
853 | ||
854 | /* Copy 64 bytes blocks */ | |
855 | for (; n >= 64; n -= 64) { | |
856 | rte_mov64((uint8_t *)dst, (const uint8_t *)src); | |
857 | dst = (uint8_t *)dst + 64; | |
858 | src = (const uint8_t *)src + 64; | |
859 | } | |
860 | ||
861 | /* Copy whatever left */ | |
862 | rte_mov64((uint8_t *)dst - 64 + n, | |
863 | (const uint8_t *)src - 64 + n); | |
864 | ||
865 | return ret; | |
866 | } | |
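
/*
 * Dispatch note (illustrative): ALIGNMENT_MASK is 0x3F, 0x1F or 0x0F for
 * the AVX512, AVX2 and SSE builds respectively. OR-ing the two addresses
 * sets a low bit if either address has it set, so rte_memcpy() below takes
 * the aligned fast path only when both dst and src are aligned to
 * (ALIGNMENT_MASK + 1) bytes.
 */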
867 | ||
f67539c2 | 868 | static __rte_always_inline void * |
11fdf7f2 TL |
869 | rte_memcpy(void *dst, const void *src, size_t n) |
870 | { | |
871 | if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK)) | |
872 | return rte_memcpy_aligned(dst, src, n); | |
873 | else | |
874 | return rte_memcpy_generic(dst, src, n); | |
875 | } | |
876 | ||
f67539c2 TL |
877 | #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000) |
878 | #pragma GCC diagnostic pop | |
879 | #endif | |
880 | ||
7c673cae FG |
881 | #ifdef __cplusplus |
882 | } | |
883 | #endif | |
884 | ||
885 | #endif /* _RTE_MEMCPY_X86_64_H_ */ |