/*
 * BSD LICENSE
 *
 * Copyright (C) IBM Corporation 2014.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in
 *   the documentation and/or other materials provided with the
 *   distribution.
 * * Neither the name of IBM Corporation nor the names of its
 *   contributors may be used to endorse or promote products derived
 *   from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _RTE_MEMCPY_PPC_64_H_
#define _RTE_MEMCPY_PPC_64_H_

#include <stdint.h>
#include <string.h>
/* To include altivec.h, the GCC version must be >= 4.8 */
#include <altivec.h>

#ifdef __cplusplus
extern "C" {
#endif

#include "generic/rte_memcpy.h"

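/*
 * Editor's note: each rte_movNN() helper below copies a fixed number of
 * bytes using vec_vsx_ld()/vec_vsx_st(), which perform 16-byte VSX vector
 * loads and stores and tolerate unaligned addresses.
 */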
static inline void
rte_mov16(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
}

static inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
}

static inline void
rte_mov48(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
}

static inline void
rte_mov64(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
	vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
}

static inline void
rte_mov128(uint8_t *dst, const uint8_t *src)
{
	vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
	vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
	vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
	vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
	vec_vsx_st(vec_vsx_ld(64, src), 64, dst);
	vec_vsx_st(vec_vsx_ld(80, src), 80, dst);
	vec_vsx_st(vec_vsx_ld(96, src), 96, dst);
	vec_vsx_st(vec_vsx_ld(112, src), 112, dst);
}

static inline void
rte_mov256(uint8_t *dst, const uint8_t *src)
{
	rte_mov128(dst, src);
	rte_mov128(dst + 128, src + 128);
}

#define rte_memcpy(dst, src, n) \
	__extension__ ({ \
	(__builtin_constant_p(n)) ? \
	memcpy((dst), (src), (n)) : \
	rte_memcpy_func((dst), (src), (n)); })
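
/*
 * Usage sketch (editor's illustration, not part of the original header):
 * rte_memcpy() is a macro, so a size known at compile time resolves to
 * plain memcpy(), while a runtime size dispatches to rte_memcpy_func().
 *
 *	uint8_t dst_buf[256], src_buf[256];
 *	size_t len = get_len();            // hypothetical runtime size
 *	rte_memcpy(dst_buf, src_buf, 64);  // constant n: expands to memcpy()
 *	rte_memcpy(dst_buf, src_buf, len); // variable n: calls rte_memcpy_func()
 */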

static inline void *
rte_memcpy_func(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* We can't copy < 16 bytes using VSX registers, so do it manually. */
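	/*
	 * Editor's note: the low bits of n select which scalar copies run.
	 * For n = 7 (0b0111), the 1-, 2- and 4-byte branches each fire once,
	 * copying all 7 bytes.
	 */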
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dst = *(const uint8_t *)src;
			dst = (uint8_t *)dst + 1;
			src = (const uint8_t *)src + 1;
		}
		if (n & 0x02) {
			*(uint16_t *)dst = *(const uint16_t *)src;
			dst = (uint16_t *)dst + 1;
			src = (const uint16_t *)src + 1;
		}
		if (n & 0x04) {
			*(uint32_t *)dst = *(const uint32_t *)src;
			dst = (uint32_t *)dst + 1;
			src = (const uint32_t *)src + 1;
		}
		if (n & 0x08)
			*(uint64_t *)dst = *(const uint64_t *)src;
		return ret;
	}

	/* Special fast cases for <= 128 bytes */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
		return ret;
	}

	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
			(const uint8_t *)src - 32 + n);
		return ret;
	}

	if (n <= 128) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);
		return ret;
	}

	/*
	 * For large copies (> 128 bytes), this combination of 256-, 64- and
	 * 16-byte copies was found to be faster than also using 128- and
	 * 32-byte copy steps.
	 */
	for ( ; n >= 256; n -= 256) {
		rte_mov256((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 256;
		src = (const uint8_t *)src + 256;
	}

	/*
	 * We split the remaining bytes (which will be less than 256) into
	 * 64-byte (2^6) chunks.
	 * Using incrementing integers in the case labels of a switch statement
	 * encourages the compiler to use a jump table. To get incrementing
	 * integers, we shift the 2 relevant bits to the LSB position to first
	 * get decrementing integers, and then subtract.
	 */
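	/*
	 * Editor's example: with n = 200 here, n >> 6 == 3, so the switch
	 * starts at case 0 and all three 64-byte moves run (192 bytes),
	 * leaving n = 8 for the 16-byte switch and the final tail copy below.
	 */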
	switch (3 - (n >> 6)) {
	case 0x00:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64; /* fallthrough */
	case 0x01:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64; /* fallthrough */
	case 0x02:
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		n -= 64;
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64; /* fallthrough */
	default:
		;
	}

	/*
	 * We split the remaining bytes (which will be less than 64) into
	 * 16-byte (2^4) chunks, using the same switch structure as above.
	 */
	switch (3 - (n >> 4)) {
	case 0x00:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16; /* fallthrough */
	case 0x01:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16; /* fallthrough */
	case 0x02:
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		n -= 16;
		dst = (uint8_t *)dst + 16;
		src = (const uint8_t *)src + 16; /* fallthrough */
	default:
		;
	}

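	/*
	 * Editor's note: at this point 0 <= n < 16. The overlapping 16-byte
	 * store below ends exactly at the last byte of the buffers, re-writing
	 * up to 16 - n bytes that were already copied, which is harmless.
	 */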
	/* Copy any remaining bytes, without going beyond end of buffers */
	if (n != 0)
		rte_mov16((uint8_t *)dst - 16 + n,
			(const uint8_t *)src - 16 + n);
	return ret;
}

#ifdef __cplusplus
}
#endif

#endif /* _RTE_MEMCPY_PPC_64_H_ */