/*
 * SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) IBM Corporation 2014.
 */

#ifndef _RTE_MEMCPY_PPC_64_H_
#define _RTE_MEMCPY_PPC_64_H_

#include <stdint.h>
#include <string.h>

#include "rte_altivec.h"
#include "rte_common.h"

#ifdef __cplusplus
extern "C" {
#endif

#include "generic/rte_memcpy.h"

#if (GCC_VERSION >= 90000 && GCC_VERSION < 90400)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif

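/*
 * Fixed-size copy helpers: each rte_movN() moves exactly N bytes using
 * unaligned VSX vector loads and stores (vec_vsx_ld/vec_vsx_st), one
 * 16-byte vector per load/store pair; rte_mov256() is composed of two
 * rte_mov128() calls.
 */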
static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
}

static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
}

static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
}

static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
	vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
}

static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
	vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
	vec_vsx_st(vec_vsx_ld(64, src), 64, dst);
	vec_vsx_st(vec_vsx_ld(80, src), 80, dst);
	vec_vsx_st(vec_vsx_ld(96, src), 96, dst);
	vec_vsx_st(vec_vsx_ld(112, src), 112, dst);
}

static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov128(dst, src);
	rte_mov128(dst + 128, src + 128);
}

#define rte_memcpy(dst, src, n)	\
	__extension__ ({		\
	(__builtin_constant_p(n)) ?	\
	memcpy((dst), (src), (n)) :	\
	rte_memcpy_func((dst), (src), (n)); })
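
/*
 * Dispatch note: when the size is a compile-time constant, the macro
 * expands to plain memcpy(), which the compiler can fully inline;
 * otherwise it calls rte_memcpy_func() below. For example:
 *
 *	rte_memcpy(dst, src, 64);	// constant n -> memcpy()
 *	rte_memcpy(dst, src, len);	// runtime n  -> rte_memcpy_func()
 */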

static inline void *
rte_memcpy_func(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* We can't copy < 16 bytes using vector registers, so do it manually,
	 * decomposing n into its low bits: 1-, 2-, 4- and 8-byte scalar moves.
	 */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dst = *(const uint8_t *)src;
			dst = (uint8_t *)dst + 1;
			src = (const uint8_t *)src + 1;
		}
		if (n & 0x02) {
			*(uint16_t *)dst = *(const uint16_t *)src;
			dst = (uint16_t *)dst + 1;
			src = (const uint16_t *)src + 1;
		}
		if (n & 0x04) {
			*(uint32_t *)dst = *(const uint32_t *)src;
			dst = (uint32_t *)dst + 1;
			src = (const uint32_t *)src + 1;
		}
		if (n & 0x08)
			*(uint64_t *)dst = *(const uint64_t *)src;
		return ret;
	}

	/*
	 * Special fast cases for <= 128 bytes: copy the first and the last
	 * half-sized chunk. When n is not an exact multiple of the chunk
	 * size, the two copies overlap in the middle, which avoids any
	 * byte-granular tail handling.
	 */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
		return ret;
	}

	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
			(const uint8_t *)src - 32 + n);
		return ret;
	}

	if (n <= 128) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);
		return ret;
	}

	/*
	 * For large copies (> 128 bytes), this combination of 256-, 64- and
	 * 16-byte copies was found to be faster than also doing 128- and
	 * 32-byte copies.
	 */
	for ( ; n >= 256; n -= 256) {
		rte_mov256((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 256;
		src = (const uint8_t *)src + 256;
	}
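
	/*
	 * Example: for n == 1000, the loop above runs three times, copying
	 * 768 bytes and leaving n == 232 for the chunked switches below.
	 */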

	/*
	 * Split the remaining bytes (fewer than 256) into 64-byte (2^6)
	 * chunks. Using incrementing integers in the case labels of a switch
	 * statement encourages the compiler to use a jump table. To get
	 * incrementing integers, we shift the 2 relevant bits to the LSB
	 * position to first get decrementing integers, and then subtract.
	 */
	switch (3 - (n >> 6)) {
	case 0x00:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
		/* fallthrough */
	case 0x01:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
		/* fallthrough */
	case 0x02:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
		/* fallthrough */
	default:
		;
	}
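
	/*
	 * Continuing the n == 1000 example: n == 232 here, so n >> 6 == 3,
	 * the switch value is 0, and all three case bodies run, copying
	 * 192 bytes and leaving n == 40.
	 */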

	/*
	 * Split the remaining bytes (fewer than 64) into 16-byte (2^4)
	 * chunks, using the same switch structure as above.
	 */
	switch (3 - (n >> 4)) {
	case 0x00:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;
		/* fallthrough */
	case 0x01:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;
		/* fallthrough */
	case 0x02:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;
		/* fallthrough */
	default:
		;
	}
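
	/*
	 * Continuing the example: n == 40, so 3 - (n >> 4) == 1; cases 0x01
	 * and 0x02 run, copying 32 bytes and leaving n == 8 for the final
	 * overlapping 16-byte store below.
	 */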

	/* Copy any remaining bytes by storing the last 16 bytes of the
	 * buffers; this store may overlap bytes already copied above, but
	 * never goes beyond the end of the buffers.
	 */
	if (n != 0)
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
	return ret;
}

#if (GCC_VERSION >= 90000 && GCC_VERSION < 90400)
#pragma GCC diagnostic pop
#endif

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_PPC_64_H_ */