/*
 * BSD LICENSE
 *
 * Copyright (C) IBM Corporation 2014.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of IBM Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _RTE_MEMCPY_PPC_64_H_
#define _RTE_MEMCPY_PPC_64_H_

#include <stdint.h>
#include <string.h>
/* To include altivec.h, the GCC version must be >= 4.8 */
#include <altivec.h>

#ifdef __cplusplus
extern "C" {
#endif

#include "generic/rte_memcpy.h"

static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
}
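
/*
 * Note (editorial): vec_vsx_ld()/vec_vsx_st() are the GCC VSX built-ins
 * for unaligned vector loads and stores. Unlike vec_ld()/vec_st(), they
 * do not round the address down to a 16-byte boundary, so the rte_mov*()
 * helpers here are safe for arbitrarily aligned dst/src pointers.
 */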

static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
}

static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
}

static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
	vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
}

static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
	vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
	vec_vsx_st(vec_vsx_ld(64, src), 64, dst);
	vec_vsx_st(vec_vsx_ld(80, src), 80, dst);
	vec_vsx_st(vec_vsx_ld(96, src), 96, dst);
	vec_vsx_st(vec_vsx_ld(112, src), 112, dst);
}

static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov128(dst, src);
	rte_mov128(dst + 128, src + 128);
}

#define rte_memcpy(dst, src, n) \
	__extension__ ({ \
	(__builtin_constant_p(n)) ? \
	memcpy((dst), (src), (n)) : \
	rte_memcpy_func((dst), (src), (n)); })

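/*
 * Usage sketch (illustrative, not part of the original source): when n
 * is a compile-time constant, the macro resolves to plain memcpy(),
 * which the compiler can expand inline for small fixed sizes; otherwise
 * it dispatches to rte_memcpy_func() below.
 *
 *	uint8_t dst[64], src[64];
 *	rte_memcpy(dst, src, sizeof(dst));	// constant n -> memcpy()
 *	rte_memcpy(dst, src, len);		// runtime n -> rte_memcpy_func()
 */
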
static inline void *
rte_memcpy_func(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* We can't copy < 16 bytes using VSX registers, so do it manually. */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dst = *(const uint8_t *)src;
			dst = (uint8_t *)dst + 1;
			src = (const uint8_t *)src + 1;
		}
		if (n & 0x02) {
			*(uint16_t *)dst = *(const uint16_t *)src;
			dst = (uint16_t *)dst + 1;
			src = (const uint16_t *)src + 1;
		}
		if (n & 0x04) {
			*(uint32_t *)dst = *(const uint32_t *)src;
			dst = (uint32_t *)dst + 1;
			src = (const uint32_t *)src + 1;
		}
		if (n & 0x08)
			*(uint64_t *)dst = *(const uint64_t *)src;
		return ret;
	}
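
	/*
	 * Worked example (illustrative): for n = 11 (binary 1011), the
	 * branches above copy 1 byte (bit 0), then 2 bytes (bit 1), skip
	 * the 4-byte branch (bit 2 is clear), and finish with an 8-byte
	 * copy (bit 3): 1 + 2 + 8 = 11 bytes, with no loop required.
	 */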

	/* Special fast cases for <= 128 bytes */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
		return ret;
	}
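
	/*
	 * The head/tail moves above may overlap: for n = 20, the first
	 * move covers bytes 0-15 and the second bytes 4-19. The overlap
	 * is harmless because the shared bytes are rewritten with the
	 * same source data. The 64- and 128-byte cases below use the
	 * same trick.
	 */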

	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
			(const uint8_t *)src - 32 + n);
		return ret;
	}

	if (n <= 128) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);
		return ret;
	}

	/*
	 * For large copies > 128 bytes. This combination of 256, 64 and
	 * 16 byte copies was found to be faster than doing 128 and 32
	 * byte copies as well.
	 */
	for ( ; n >= 256; n -= 256) {
		rte_mov256((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 256;
		src = (const uint8_t *)src + 256;
	}

	/*
	 * We split the remaining bytes (which will be less than 256)
	 * into 64-byte (2^6) chunks.
	 * Using incrementing integers in the case labels of a switch
	 * statement encourages the compiler to use a jump table. To get
	 * incrementing integers, we shift the 2 relevant bits to the LSB
	 * position to first get decrementing integers, and then subtract.
	 */
	switch (3 - (n >> 6)) {
	case 0x00:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;	/* fallthrough */
	case 0x01:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;	/* fallthrough */
	case 0x02:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;	/* fallthrough */
	default:
		;
	}
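
	/*
	 * Worked example (illustrative): with n = 200 left after the
	 * 256-byte loop, n >> 6 == 3, so 3 - (n >> 6) == 0 and execution
	 * enters at case 0x00, falling through all three 64-byte copies
	 * (192 bytes) and leaving n = 8 for the 16-byte switch and the
	 * tail copy below.
	 */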

	/*
	 * We split the remaining bytes (which will be less than 64) into
	 * 16-byte (2^4) chunks, using the same switch structure as above.
	 */
	switch (3 - (n >> 4)) {
	case 0x00:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;	/* fallthrough */
	case 0x01:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;	/* fallthrough */
	case 0x02:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16;	/* fallthrough */
	default:
		;
	}

	/*
	 * Copy any remaining bytes without going beyond the end of the
	 * buffers: the final move starts 16 - n bytes back from the
	 * current position, rewriting already-copied bytes with the same
	 * data rather than reading or writing past the buffer ends.
	 */
	if (n != 0)
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
	return ret;
}

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_PPC_64_H_ */