]> git.proxmox.com Git - ceph.git/blob - ceph/src/seastar/dpdk/lib/librte_net/net_crc_sse.h
import 15.2.0 Octopus source
[ceph.git] / ceph / src / seastar / dpdk / lib / librte_net / net_crc_sse.h
1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2017 Intel Corporation
3 */
4
5 #ifndef _RTE_NET_CRC_SSE_H_
6 #define _RTE_NET_CRC_SSE_H_
7
8 #include <rte_branch_prediction.h>
9
10 #include <x86intrin.h>
11 #include <cpuid.h>
12
13 #ifdef __cplusplus
14 extern "C" {
15 #endif
16
17 /** PCLMULQDQ CRC computation context structure */
18 struct crc_pclmulqdq_ctx {
19 __m128i rk1_rk2;
20 __m128i rk5_rk6;
21 __m128i rk7_rk8;
22 };
23
24 static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
25 static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);
26 /**
27 * @brief Performs one folding round
28 *
29 * Logically function operates as follows:
30 * DATA = READ_NEXT_16BYTES();
31 * F1 = LSB8(FOLD)
32 * F2 = MSB8(FOLD)
33 * T1 = CLMUL(F1, RK1)
34 * T2 = CLMUL(F2, RK2)
35 * FOLD = XOR(T1, T2, DATA)
36 *
37 * @param data_block
38 * 16 byte data block
39 * @param precomp
40 * Precomputed rk1 constant
41 * @param fold
42 * Current16 byte folded data
43 *
44 * @return
45 * New 16 byte folded data
46 */
47 static __rte_always_inline __m128i
48 crcr32_folding_round(__m128i data_block,
49 __m128i precomp,
50 __m128i fold)
51 {
52 __m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
53 __m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);
54
55 return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
56 }
57
58 /**
59 * Performs reduction from 128 bits to 64 bits
60 *
61 * @param data128
62 * 128 bits data to be reduced
63 * @param precomp
64 * precomputed constants rk5, rk6
65 *
66 * @return
67 * 64 bits reduced data
68 */
69
70 static __rte_always_inline __m128i
71 crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
72 {
73 __m128i tmp0, tmp1, tmp2;
74
75 /* 64b fold */
76 tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
77 tmp1 = _mm_srli_si128(data128, 8);
78 tmp0 = _mm_xor_si128(tmp0, tmp1);
79
80 /* 32b fold */
81 tmp2 = _mm_slli_si128(tmp0, 4);
82 tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);
83
84 return _mm_xor_si128(tmp1, tmp0);
85 }
86
87 /**
88 * Performs Barret's reduction from 64 bits to 32 bits
89 *
90 * @param data64
91 * 64 bits data to be reduced
92 * @param precomp
93 * rk7 precomputed constant
94 *
95 * @return
96 * reduced 32 bits data
97 */
98
99 static __rte_always_inline uint32_t
100 crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
101 {
102 static const uint32_t mask1[4] __rte_aligned(16) = {
103 0xffffffff, 0xffffffff, 0x00000000, 0x00000000
104 };
105
106 static const uint32_t mask2[4] __rte_aligned(16) = {
107 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
108 };
109 __m128i tmp0, tmp1, tmp2;
110
111 tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));
112
113 tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
114 tmp1 = _mm_xor_si128(tmp1, tmp0);
115 tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));
116
117 tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
118 tmp2 = _mm_xor_si128(tmp2, tmp1);
119 tmp2 = _mm_xor_si128(tmp2, tmp0);
120
121 return _mm_extract_epi32(tmp2, 2);
122 }
123
124 static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
125 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
126 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
127 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
128 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
129 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
130 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
131 };
132
133 /**
134 * Shifts left 128 bit register by specified number of bytes
135 *
136 * @param reg
137 * 128 bit value
138 * @param num
139 * number of bytes to shift left reg by (0-16)
140 *
141 * @return
142 * reg << (num * 8)
143 */
144
145 static __rte_always_inline __m128i
146 xmm_shift_left(__m128i reg, const unsigned int num)
147 {
148 const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);
149
150 return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
151 }
152
153 static __rte_always_inline uint32_t
154 crc32_eth_calc_pclmulqdq(
155 const uint8_t *data,
156 uint32_t data_len,
157 uint32_t crc,
158 const struct crc_pclmulqdq_ctx *params)
159 {
160 __m128i temp, fold, k;
161 uint32_t n;
162
163 /* Get CRC init value */
164 temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);
165
166 /**
167 * Folding all data into single 16 byte data block
168 * Assumes: fold holds first 16 bytes of data
169 */
170
171 if (unlikely(data_len < 32)) {
172 if (unlikely(data_len == 16)) {
173 /* 16 bytes */
174 fold = _mm_loadu_si128((const __m128i *)data);
175 fold = _mm_xor_si128(fold, temp);
176 goto reduction_128_64;
177 }
178
179 if (unlikely(data_len < 16)) {
180 /* 0 to 15 bytes */
181 uint8_t buffer[16] __rte_aligned(16);
182
183 memset(buffer, 0, sizeof(buffer));
184 memcpy(buffer, data, data_len);
185
186 fold = _mm_load_si128((const __m128i *)buffer);
187 fold = _mm_xor_si128(fold, temp);
188 if (unlikely(data_len < 4)) {
189 fold = xmm_shift_left(fold, 8 - data_len);
190 goto barret_reduction;
191 }
192 fold = xmm_shift_left(fold, 16 - data_len);
193 goto reduction_128_64;
194 }
195 /* 17 to 31 bytes */
196 fold = _mm_loadu_si128((const __m128i *)data);
197 fold = _mm_xor_si128(fold, temp);
198 n = 16;
199 k = params->rk1_rk2;
200 goto partial_bytes;
201 }
202
203 /** At least 32 bytes in the buffer */
204 /** Apply CRC initial value */
205 fold = _mm_loadu_si128((const __m128i *)data);
206 fold = _mm_xor_si128(fold, temp);
207
208 /** Main folding loop - the last 16 bytes is processed separately */
209 k = params->rk1_rk2;
210 for (n = 16; (n + 16) <= data_len; n += 16) {
211 temp = _mm_loadu_si128((const __m128i *)&data[n]);
212 fold = crcr32_folding_round(temp, k, fold);
213 }
214
215 partial_bytes:
216 if (likely(n < data_len)) {
217
218 const uint32_t mask3[4] __rte_aligned(16) = {
219 0x80808080, 0x80808080, 0x80808080, 0x80808080
220 };
221
222 const uint8_t shf_table[32] __rte_aligned(16) = {
223 0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
224 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
225 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
226 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
227 };
228
229 __m128i last16, a, b;
230
231 last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);
232
233 temp = _mm_loadu_si128((const __m128i *)
234 &shf_table[data_len & 15]);
235 a = _mm_shuffle_epi8(fold, temp);
236
237 temp = _mm_xor_si128(temp,
238 _mm_load_si128((const __m128i *)mask3));
239 b = _mm_shuffle_epi8(fold, temp);
240 b = _mm_blendv_epi8(b, last16, temp);
241
242 /* k = rk1 & rk2 */
243 temp = _mm_clmulepi64_si128(a, k, 0x01);
244 fold = _mm_clmulepi64_si128(a, k, 0x10);
245
246 fold = _mm_xor_si128(fold, temp);
247 fold = _mm_xor_si128(fold, b);
248 }
249
250 /** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
251 reduction_128_64:
252 k = params->rk5_rk6;
253 fold = crcr32_reduce_128_to_64(fold, k);
254
255 barret_reduction:
256 k = params->rk7_rk8;
257 n = crcr32_reduce_64_to_32(fold, k);
258
259 return n;
260 }
261
262
263 static inline void
264 rte_net_crc_sse42_init(void)
265 {
266 uint64_t k1, k2, k5, k6;
267 uint64_t p = 0, q = 0;
268
269 /** Initialize CRC16 data */
270 k1 = 0x189aeLLU;
271 k2 = 0x8e10LLU;
272 k5 = 0x189aeLLU;
273 k6 = 0x114aaLLU;
274 q = 0x11c581910LLU;
275 p = 0x10811LLU;
276
277 /** Save the params in context structure */
278 crc16_ccitt_pclmulqdq.rk1_rk2 =
279 _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
280 crc16_ccitt_pclmulqdq.rk5_rk6 =
281 _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
282 crc16_ccitt_pclmulqdq.rk7_rk8 =
283 _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));
284
285 /** Initialize CRC32 data */
286 k1 = 0xccaa009eLLU;
287 k2 = 0x1751997d0LLU;
288 k5 = 0xccaa009eLLU;
289 k6 = 0x163cd6124LLU;
290 q = 0x1f7011640LLU;
291 p = 0x1db710641LLU;
292
293 /** Save the params in context structure */
294 crc32_eth_pclmulqdq.rk1_rk2 =
295 _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
296 crc32_eth_pclmulqdq.rk5_rk6 =
297 _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
298 crc32_eth_pclmulqdq.rk7_rk8 =
299 _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));
300
301 /**
302 * Reset the register as following calculation may
303 * use other data types such as float, double, etc.
304 */
305 _mm_empty();
306
307 }
308
309 static inline uint32_t
310 rte_crc16_ccitt_sse42_handler(const uint8_t *data,
311 uint32_t data_len)
312 {
313 /** return 16-bit CRC value */
314 return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
315 data_len,
316 0xffff,
317 &crc16_ccitt_pclmulqdq);
318 }
319
320 static inline uint32_t
321 rte_crc32_eth_sse42_handler(const uint8_t *data,
322 uint32_t data_len)
323 {
324 return ~crc32_eth_calc_pclmulqdq(data,
325 data_len,
326 0xffffffffUL,
327 &crc32_eth_pclmulqdq);
328 }
329
330 #ifdef __cplusplus
331 }
332 #endif
333
334 #endif /* _RTE_NET_CRC_SSE_H_ */