/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2017 6WIND S.A.
 * Copyright 2017 Mellanox Technologies, Ltd
 */

#ifndef RTE_PMD_MLX5_RXTX_VEC_SSE_H_
#define RTE_PMD_MLX5_RXTX_VEC_SSE_H_

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <smmintrin.h>

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_rxtx_vec.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_prm.h"

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif

/**
 * Fill in buffer descriptors in a multi-packet send descriptor.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param dseg
 *   Pointer to buffer descriptor to be written.
 * @param pkts
 *   Pointer to array of packets to be sent.
 * @param n
 *   Number of packets to be filled.
 */
static inline void
txq_wr_dseg_v(struct mlx5_txq_data *txq, __m128i *dseg,
	      struct rte_mbuf **pkts, unsigned int n)
{
	unsigned int pos;
	uintptr_t addr;
	const __m128i shuf_mask_dseg =
		_mm_set_epi8(8, 9, 10, 11, /* addr, bswap64 */
			     12, 13, 14, 15,
			     7, 6, 5, 4, /* lkey */
			     0, 1, 2, 3 /* length, bswap32 */);
#ifdef MLX5_PMD_SOFT_COUNTERS
	uint32_t tx_byte = 0;
#endif

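	/*
	 * Each descriptor is assembled in CPU byte order as
	 * { length, lkey, addr } and converted to the big-endian wire
	 * format of the data segment with a single byte shuffle.
	 */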
	for (pos = 0; pos < n; ++pos, ++dseg) {
		__m128i desc;
		struct rte_mbuf *pkt = pkts[pos];

		addr = rte_pktmbuf_mtod(pkt, uintptr_t);
		desc = _mm_set_epi32(addr >> 32,
				     addr,
				     mlx5_tx_mb2mr(txq, pkt),
				     DATA_LEN(pkt));
		desc = _mm_shuffle_epi8(desc, shuf_mask_dseg);
		_mm_store_si128(dseg, desc);
#ifdef MLX5_PMD_SOFT_COUNTERS
		tx_byte += DATA_LEN(pkt);
#endif
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	txq->stats.obytes += tx_byte;
#endif
}

/**
 * Send multi-segmented packets until a single-segment packet is encountered in
 * the pkts list.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param pkts
 *   Pointer to array of packets to be sent.
 * @param pkts_n
 *   Number of packets to be sent.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
static uint16_t
txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
	      uint16_t pkts_n)
{
	uint16_t elts_head = txq->elts_head;
	const uint16_t elts_n = 1 << txq->elts_n;
	const uint16_t elts_m = elts_n - 1;
	const uint16_t wq_n = 1 << txq->wqe_n;
	const uint16_t wq_mask = wq_n - 1;
	const unsigned int nb_dword_per_wqebb =
		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
	const unsigned int nb_dword_in_hdr =
		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
	unsigned int n;
	volatile struct mlx5_wqe *wqe = NULL;
	bool metadata_ol =
		txq->offloads & DEV_TX_OFFLOAD_MATCH_METADATA ? true : false;

	assert(elts_n > pkts_n);
	mlx5_tx_complete(txq);
	if (unlikely(!pkts_n))
		return 0;
	for (n = 0; n < pkts_n; ++n) {
		struct rte_mbuf *buf = pkts[n];
		unsigned int segs_n = buf->nb_segs;
		unsigned int ds = nb_dword_in_hdr;
		unsigned int len = PKT_LEN(buf);
		uint16_t wqe_ci = txq->wqe_ci;
		const __m128i shuf_mask_ctrl =
			_mm_set_epi8(15, 14, 13, 12,
				     8, 9, 10, 11, /* bswap32 */
				     4, 5, 6, 7, /* bswap32 */
				     0, 1, 2, 3 /* bswap32 */);
		uint8_t cs_flags;
		uint16_t max_elts;
		uint16_t max_wqe;
		__m128i *t_wqe, *dseg;
		__m128i ctrl;
		rte_be32_t metadata =
			metadata_ol && (buf->ol_flags & PKT_TX_METADATA) ?
			buf->tx_metadata : 0;

		assert(segs_n);
		max_elts = elts_n - (elts_head - txq->elts_tail);
		max_wqe = wq_n - (txq->wqe_ci - txq->wqe_pi);
		/*
		 * A MPW session consumes 2 WQEs at most to
		 * include MLX5_MPW_DSEG_MAX pointers.
		 */
		if (segs_n == 1 ||
		    max_elts < segs_n || max_wqe < 2)
			break;
		if (segs_n > MLX5_MPW_DSEG_MAX) {
			txq->stats.oerrors++;
			break;
		}
		wqe = &((volatile struct mlx5_wqe64 *)
			txq->wqes)[wqe_ci & wq_mask].hdr;
		cs_flags = txq_ol_cksum_to_cs(buf);
		/* Title WQEBB pointer. */
		t_wqe = (__m128i *)wqe;
		dseg = (__m128i *)(wqe + 1);
		do {
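			/*
			 * Wrap the dseg pointer to the start of the next
			 * WQEBB whenever a 64B WQEBB boundary is crossed.
			 */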
			if (!(ds++ % nb_dword_per_wqebb)) {
				dseg = (__m128i *)
					&((volatile struct mlx5_wqe64 *)
					   txq->wqes)[++wqe_ci & wq_mask];
			}
			txq_wr_dseg_v(txq, dseg++, &buf, 1);
			(*txq->elts)[elts_head++ & elts_m] = buf;
			buf = buf->next;
		} while (--segs_n);
		++wqe_ci;
		/* Fill CTRL in the header. */
		ctrl = _mm_set_epi32(0, 0, txq->qp_num_8s | ds,
				     MLX5_OPC_MOD_MPW << 24 |
				     txq->wqe_ci << 8 | MLX5_OPCODE_TSO);
		ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
		_mm_store_si128(t_wqe, ctrl);
		/* Fill ESEG in the header. */
		_mm_store_si128(t_wqe + 1,
				_mm_set_epi32(0, metadata,
					      (rte_cpu_to_be_16(len) << 16) |
					      cs_flags, 0));
		txq->wqe_ci = wqe_ci;
	}
	if (!n)
		return 0;
	txq->elts_comp += (uint16_t)(elts_head - txq->elts_head);
	txq->elts_head = elts_head;
	if (txq->elts_comp >= MLX5_TX_COMP_THRESH) {
		/* A CQE slot must always be available. */
		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
		wqe->ctrl[2] = rte_cpu_to_be_32(8);
		wqe->ctrl[3] = txq->elts_head;
		txq->elts_comp = 0;
	}
#ifdef MLX5_PMD_SOFT_COUNTERS
	txq->stats.opackets += n;
#endif
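	/* Ring the QP doorbell to post the WQEs to HW. */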
	mlx5_tx_dbrec(txq, wqe);
	return n;
}

/**
 * Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet,
 * it returns so the packet can be processed by txq_scatter_v(). All packets in
 * the pkts list must be single-segment packets with the same offload flags.
 * This must be checked by txq_count_contig_single_seg() and txq_calc_offload().
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param pkts
 *   Pointer to array of packets to be sent.
 * @param pkts_n
 *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
 * @param cs_flags
 *   Checksum offload flags to be written in the descriptor.
 * @param metadata
 *   Metadata value to be written in the descriptor.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
static inline uint16_t
txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
	    uint8_t cs_flags, rte_be32_t metadata)
{
	struct rte_mbuf **elts;
	uint16_t elts_head = txq->elts_head;
	const uint16_t elts_n = 1 << txq->elts_n;
	const uint16_t elts_m = elts_n - 1;
	const unsigned int nb_dword_per_wqebb =
		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
	const unsigned int nb_dword_in_hdr =
		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
	unsigned int n = 0;
	unsigned int pos;
	uint16_t max_elts;
	uint16_t max_wqe;
	uint32_t comp_req = 0;
	const uint16_t wq_n = 1 << txq->wqe_n;
	const uint16_t wq_mask = wq_n - 1;
	uint16_t wq_idx = txq->wqe_ci & wq_mask;
	volatile struct mlx5_wqe64 *wq =
		&((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx];
	volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq;
	const __m128i shuf_mask_ctrl =
		_mm_set_epi8(15, 14, 13, 12,
			     8, 9, 10, 11, /* bswap32 */
			     4, 5, 6, 7, /* bswap32 */
			     0, 1, 2, 3 /* bswap32 */);
	__m128i *t_wqe, *dseg;
	__m128i ctrl;

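	/*
	 * An Enhanced MPW WQE consists of a single control segment and
	 * Ethernet segment followed by one data segment per packet, so
	 * the whole burst is posted as one multi-WQEBB WQE.
	 */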
	/* Make sure all packets can fit into a single WQE. */
	assert(elts_n > pkts_n);
	mlx5_tx_complete(txq);
	max_elts = (elts_n - (elts_head - txq->elts_tail));
	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
	assert(pkts_n <= MLX5_DSEG_MAX - nb_dword_in_hdr);
	if (unlikely(!pkts_n))
		return 0;
	elts = &(*txq->elts)[elts_head & elts_m];
	/* Loop for available tailroom first. */
	n = RTE_MIN(elts_n - (elts_head & elts_m), pkts_n);
	for (pos = 0; pos < (n & -2); pos += 2)
		_mm_storeu_si128((__m128i *)&elts[pos],
				 _mm_loadu_si128((__m128i *)&pkts[pos]));
	if (n & 1)
		elts[pos] = pkts[pos];
	/* Check if it crosses the end of the queue. */
	if (unlikely(n < pkts_n)) {
		elts = &(*txq->elts)[0];
		for (pos = 0; pos < pkts_n - n; ++pos)
			elts[pos] = pkts[n + pos];
	}
	txq->elts_head += pkts_n;
	/* Save title WQEBB pointer. */
	t_wqe = (__m128i *)wqe;
	dseg = (__m128i *)(wqe + 1);
	/* Calculate the number of entries to the end. */
	n = RTE_MIN(
		(wq_n - wq_idx) * nb_dword_per_wqebb - nb_dword_in_hdr,
		pkts_n);
	/* Fill DSEGs. */
	txq_wr_dseg_v(txq, dseg, pkts, n);
	/* Check if it crosses the end of the queue. */
	if (n < pkts_n) {
		dseg = (__m128i *)txq->wqes;
		txq_wr_dseg_v(txq, dseg, &pkts[n], pkts_n - n);
	}
	if (txq->elts_comp + pkts_n < MLX5_TX_COMP_THRESH) {
		txq->elts_comp += pkts_n;
	} else {
		/* A CQE slot must always be available. */
		assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
		/* Request a completion. */
		txq->elts_comp = 0;
		comp_req = 8;
	}
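	/*
	 * The control segment carries the opcode and WQE index, the DS
	 * count (pkts_n data segments plus the 2-dword header), the
	 * completion request flag and elts_head as the completion tag.
	 */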
	/* Fill CTRL in the header. */
	ctrl = _mm_set_epi32(txq->elts_head, comp_req,
			     txq->qp_num_8s | (pkts_n + 2),
			     MLX5_OPC_MOD_ENHANCED_MPSW << 24 |
			     txq->wqe_ci << 8 | MLX5_OPCODE_ENHANCED_MPSW);
	ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
	_mm_store_si128(t_wqe, ctrl);
	/* Fill ESEG in the header. */
	_mm_store_si128(t_wqe + 1, _mm_set_epi32(0, metadata, cs_flags, 0));
#ifdef MLX5_PMD_SOFT_COUNTERS
	txq->stats.opackets += pkts_n;
#endif
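	/* Advance wqe_ci by the number of 64B WQEBBs consumed, rounded up. */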
	txq->wqe_ci += (nb_dword_in_hdr + pkts_n + (nb_dword_per_wqebb - 1)) /
		       nb_dword_per_wqebb;
	/* Ring QP doorbell. */
	mlx5_tx_dbrec_cond_wmb(txq, wqe, pkts_n < MLX5_VPMD_TX_MAX_BURST);
	return pkts_n;
}

/**
 * Copy mbufs from the RX SW ring to the pkts array.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param pkts
 *   Pointer to array of packets to be filled.
 * @param n
 *   Number of packets to be copied.
 */
static inline void
rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
{
	const uint16_t q_mask = (1 << rxq->elts_n) - 1;
	struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
	unsigned int pos;
	uint16_t p = n & -2;

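	/*
	 * Copy mbuf pointers in pairs; with 8B pointers, each 16B vector
	 * store moves two of them at once.
	 */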
	for (pos = 0; pos < p; pos += 2) {
		__m128i mbp;

		mbp = _mm_loadu_si128((__m128i *)&elts[pos]);
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp);
	}
	if (n & 1)
		pkts[pos] = elts[pos];
}

/**
 * Decompress a compressed completion and fill in mbufs in RX SW ring with data
 * extracted from the title completion descriptor.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cq
 *   Pointer to completion array whose first entry is a compressed completion.
 * @param elts
 *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
 *   the title completion descriptor to be copied to the rest of the mbufs.
 */
static inline void
rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
		    struct rte_mbuf **elts)
{
	volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + 1);
	struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
	unsigned int pos;
	unsigned int i;
	unsigned int inv = 0;
	/* Mask to shuffle from extracted mini CQE to mbuf. */
	const __m128i shuf_mask1 =
		_mm_set_epi8(0, 1, 2, 3, /* rss, bswap32 */
			     -1, -1, /* skip vlan_tci */
			     6, 7, /* data_len, bswap16 */
			     -1, -1, 6, 7, /* pkt_len, bswap16 */
			     -1, -1, -1, -1 /* skip packet_type */);
	const __m128i shuf_mask2 =
		_mm_set_epi8(8, 9, 10, 11, /* rss, bswap32 */
			     -1, -1, /* skip vlan_tci */
			     14, 15, /* data_len, bswap16 */
			     -1, -1, 14, 15, /* pkt_len, bswap16 */
			     -1, -1, -1, -1 /* skip packet_type */);
	/* Restore the compressed count. Must be 16 bits. */
	const uint16_t mcqe_n = t_pkt->data_len +
				(rxq->crc_present * ETHER_CRC_LEN);
	const __m128i rearm =
		_mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
	const __m128i rxdf =
		_mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0,
			      rxq->crc_present * ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * ETHER_CRC_LEN,
			      0, 0);
	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
#ifdef MLX5_PMD_SOFT_COUNTERS
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     14, 15, 6, 7,
			     10, 11, 2, 3);
#endif

	/*
	 * A. load mCQEs into a 128bit register.
	 * B. store rearm data to mbuf.
	 * C. combine data from mCQEs with rx_descriptor_fields1.
	 * D. store rx_descriptor_fields1.
	 * E. store flow tag (rte_flow mark).
	 */
	for (pos = 0; pos < mcqe_n; ) {
		__m128i mcqe1, mcqe2;
		__m128i rxdf1, rxdf2;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt, invalid_mask;
#endif

		if (!(pos & 0x7) && pos + 8 < mcqe_n)
			rte_prefetch0((void *)(cq + pos + 8));
		/* A.1 load mCQEs into a 128bit register. */
		mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
		mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				 &elts[pos]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 1]->rx_descriptor_fields1,
				 rxdf2);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 2]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 3]->rx_descriptor_fields1,
				 rxdf2);
#ifdef MLX5_PMD_SOFT_COUNTERS
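		/*
		 * Build a mask that is all-ones above 16 bits per remaining
		 * mini-CQE so that byte counts of lanes past the last entry
		 * are zeroed (a shift count >= 64 clears the mask entirely).
		 */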
		invalid_mask = _mm_set_epi64x(0,
					      (mcqe_n - pos) *
					      sizeof(uint16_t) * 8);
		invalid_mask = _mm_sll_epi64(ones, invalid_mask);
		mcqe1 = _mm_srli_si128(mcqe1, 4);
		byte_cnt = _mm_blend_epi16(mcqe1, mcqe2, 0xcc);
		byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		if (rxq->mark) {
			/* E.1 store flow tag (rte_flow mark). */
			elts[pos]->hash.fdir.hi = flow_tag;
			elts[pos + 1]->hash.fdir.hi = flow_tag;
			elts[pos + 2]->hash.fdir.hi = flow_tag;
			elts[pos + 3]->hash.fdir.hi = flow_tag;
		}
		pos += MLX5_VPMD_DESCS_PER_LOOP;
		/* Move to next CQE and invalidate consumed CQEs. */
		if (!(pos & 0x7) && pos < mcqe_n) {
			mcq = (void *)(cq + pos);
			for (i = 0; i < 8; ++i)
				cq[inv++].op_own = MLX5_CQE_INVALIDATE;
		}
	}
	/* Invalidate the rest of CQEs. */
	for (; inv < mcqe_n; ++inv)
		cq[inv].op_own = MLX5_CQE_INVALIDATE;
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += mcqe_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	rxq->cq_ci += mcqe_n;
}

/**
 * Calculate packet type and offload flag for mbuf and store it.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cqes[4]
 *   Array of four 16-byte completions extracted from the original completion
 *   descriptor.
 * @param op_err
 *   Opcode vector having responder error status. Each field is 4B.
 * @param pkts
 *   Pointer to array of packets to be filled.
 */
static inline void
rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
			 __m128i op_err, struct rte_mbuf **pkts)
{
	__m128i pinfo0, pinfo1;
	__m128i pinfo, ptype;
	__m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH |
					  rxq->hw_timestamp * PKT_RX_TIMESTAMP);
	__m128i cv_flags;
	const __m128i zero = _mm_setzero_si128();
	const __m128i ptype_mask =
		_mm_set_epi32(0xfd06, 0xfd06, 0xfd06, 0xfd06);
	const __m128i ptype_ol_mask =
		_mm_set_epi32(0x106, 0x106, 0x106, 0x106);
	const __m128i pinfo_mask =
		_mm_set_epi32(0x3, 0x3, 0x3, 0x3);
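	/*
	 * cv_flag_sel is a PSHUFB lookup table: bytes of the merged
	 * l3_ok/l4_ok/cv bits select the matching checksum/VLAN mbuf
	 * flag bytes.
	 */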
	const __m128i cv_flag_sel =
		_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
			     (uint8_t)((PKT_RX_IP_CKSUM_GOOD |
					PKT_RX_L4_CKSUM_GOOD) >> 1),
			     0,
			     (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1),
			     0,
			     (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1),
			     (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
			     0);
	const __m128i cv_mask =
		_mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
	const __m128i mbuf_init =
		_mm_loadl_epi64((__m128i *)&rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;
	uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3;

	/* Extract pkt_info field. */
	pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
	pinfo = _mm_unpacklo_epi64(pinfo0, pinfo1);
	/* Extract hdr_type_etc field. */
	pinfo0 = _mm_unpackhi_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]);
	ptype = _mm_unpacklo_epi64(pinfo0, pinfo1);
	if (rxq->mark) {
		const __m128i pinfo_ft_mask =
			_mm_set_epi32(0xffffff00, 0xffffff00,
				      0xffffff00, 0xffffff00);
		const __m128i fdir_flags = _mm_set1_epi32(PKT_RX_FDIR);
		__m128i fdir_id_flags = _mm_set1_epi32(PKT_RX_FDIR_ID);
		__m128i flow_tag, invalid_mask;

		flow_tag = _mm_and_si128(pinfo, pinfo_ft_mask);
		/* Check if flow tag is non-zero then set PKT_RX_FDIR. */
		invalid_mask = _mm_cmpeq_epi32(flow_tag, zero);
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(invalid_mask,
							 fdir_flags));
		/* Mask out invalid entries. */
		fdir_id_flags = _mm_andnot_si128(invalid_mask, fdir_id_flags);
		/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(
						_mm_cmpeq_epi32(flow_tag,
								pinfo_ft_mask),
						fdir_id_flags));
	}
	/*
	 * Merge the two fields to generate the following:
	 * bit[1]     = l3_ok
	 * bit[2]     = l4_ok
	 * bit[8]     = cv
	 * bit[11:10] = l3_hdr_type
	 * bit[14:12] = l4_hdr_type
	 * bit[15]    = ip_frag
	 * bit[16]    = tunneled
	 * bit[17]    = outer_l3_type
	 */
	ptype = _mm_and_si128(ptype, ptype_mask);
	pinfo = _mm_and_si128(pinfo, pinfo_mask);
	pinfo = _mm_slli_epi32(pinfo, 16);
	/* Merge the fields of pinfo for the ol_flags calculation. */
	pinfo = _mm_or_si128(ptype, pinfo);
	ptype = _mm_srli_epi32(pinfo, 10);
	ptype = _mm_packs_epi32(ptype, zero);
	/* Errored packets will have RTE_PTYPE_ALL_MASK. */
	op_err = _mm_srli_epi16(op_err, 8);
	ptype = _mm_or_si128(ptype, op_err);
	pt_idx0 = _mm_extract_epi8(ptype, 0);
	pt_idx1 = _mm_extract_epi8(ptype, 2);
	pt_idx2 = _mm_extract_epi8(ptype, 4);
	pt_idx3 = _mm_extract_epi8(ptype, 6);
	pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] |
			       !!(pt_idx0 & (1 << 6)) * rxq->tunnel;
	pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] |
			       !!(pt_idx1 & (1 << 6)) * rxq->tunnel;
	pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] |
			       !!(pt_idx2 & (1 << 6)) * rxq->tunnel;
	pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] |
			       !!(pt_idx3 & (1 << 6)) * rxq->tunnel;
	/* Fill flags for checksum and VLAN. */
	pinfo = _mm_and_si128(pinfo, ptype_ol_mask);
	pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo);
	/* Locate checksum flags at byte[2:1] and merge with VLAN flags. */
	cv_flags = _mm_slli_epi32(pinfo, 9);
	cv_flags = _mm_or_si128(pinfo, cv_flags);
	/* Move back flags to start from byte[0]. */
	cv_flags = _mm_srli_epi32(cv_flags, 8);
	/* Mask out garbage bits. */
	cv_flags = _mm_and_si128(cv_flags, cv_mask);
	/* Merge to ol_flags. */
	ol_flags = _mm_or_si128(ol_flags, cv_flags);
	/* Merge mbuf_init and ol_flags. */
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30);
	rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30);
	/* Write 8B rearm_data and 8B ol_flags. */
	_mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3);
}

/**
 * Receive burst of packets. An errored completion also consumes an mbuf, but
 * the packet_type is set to RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
 * before returning to the application.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 * @param[out] err
 *   Pointer to a flag. Set to a non-zero value if the pkts array has at least
 *   one error packet to handle.
 *
 * @return
 *   Number of packets received including errors (<= pkts_n).
 */
static inline uint16_t
rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
	    uint64_t *err)
{
	const uint16_t q_n = 1 << rxq->cqe_n;
	const uint16_t q_mask = q_n - 1;
	volatile struct mlx5_cqe *cq;
	struct rte_mbuf **elts;
	unsigned int pos;
	uint64_t n;
	uint16_t repl_n;
	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
	uint16_t nocmp_n = 0;
	uint16_t rcvd_pkt = 0;
	unsigned int cq_idx = rxq->cq_ci & q_mask;
	unsigned int elts_idx;
	unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
	const __m128i owner_check =
		_mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL);
	const __m128i opcode_check =
		_mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL);
	const __m128i format_check =
		_mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL);
	const __m128i resp_err_check =
		_mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL);
#ifdef MLX5_PMD_SOFT_COUNTERS
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     12, 13, 8, 9,
			     4, 5, 0, 1);
#endif
	/* Mask to shuffle from extracted CQE to mbuf. */
	const __m128i shuf_mask =
		_mm_set_epi8(-1, 3, 2, 1, /* fdir.hi */
			     12, 13, 14, 15, /* rss, bswap32 */
			     10, 11, /* vlan_tci, bswap16 */
			     4, 5, /* data_len, bswap16 */
			     -1, -1, /* zero out 2nd half of pkt_len */
			     4, 5 /* pkt_len, bswap16 */);
	/* Mask to blend from the last Qword to the first DQword. */
	const __m128i blend_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     0, 0, 0, 0,
			     0, 0, 0, -1);
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0, 0, 0,
			      rxq->crc_present * ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * ETHER_CRC_LEN);
	const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);

	assert(rxq->sges_n == 0);
	assert(rxq->cqe_n == rxq->elts_n);
	cq = &(*rxq->cqes)[cq_idx];
	rte_prefetch0(cq);
	rte_prefetch0(cq + 1);
	rte_prefetch0(cq + 2);
	rte_prefetch0(cq + 3);
	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
	/*
	 * Order of indexes:
	 *   rq_ci >= cq_ci >= rq_pi
	 * Definition of indexes:
	 *   rq_ci - cq_ci := # of buffers owned by HW (posted).
	 *   cq_ci - rq_pi := # of buffers not returned to app (decompressed).
	 *   N - (rq_ci - rq_pi) := # of buffers consumed (to be replenished).
	 */
	repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
	if (repl_n >= rxq->rq_repl_thresh)
		mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
	/* See if there are unreturned mbufs from a compressed CQE. */
	rcvd_pkt = rxq->cq_ci - rxq->rq_pi;
	if (rcvd_pkt > 0) {
		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
		rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt);
		rxq->rq_pi += rcvd_pkt;
		pkts += rcvd_pkt;
	}
	elts_idx = rxq->rq_pi & q_mask;
	elts = &(*rxq->elts)[elts_idx];
	/* Not to overflow pkts array. */
	pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP);
	/* Not to cross queue end. */
	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
	if (!pkts_n)
		return rcvd_pkt;
	/* At this point, there shouldn't be any remaining packets. */
	assert(rxq->rq_pi == rxq->cq_ci);
	/*
	 * A. load first Qword (8bytes) in one loop.
	 * B. copy 4 mbuf pointers from elts ring to returning pkts.
	 * C. load remaining CQE data and extract necessary fields.
	 *    Final 16bytes cqes[] extracted from original 64bytes CQE has the
	 *    following structure:
	 *        struct {
	 *            uint8_t  pkt_info;
	 *            uint8_t  flow_tag[3];
	 *            uint16_t byte_cnt;
	 *            uint8_t  rsvd4;
	 *            uint8_t  op_own;
	 *            uint16_t hdr_type_etc;
	 *            uint16_t vlan_info;
	 *            uint32_t rx_has_res;
	 *        } c;
	 * D. fill in mbuf.
	 * E. get valid CQEs.
	 * F. find compressed CQE.
	 */
	for (pos = 0;
	     pos < pkts_n;
	     pos += MLX5_VPMD_DESCS_PER_LOOP) {
		__m128i cqes[MLX5_VPMD_DESCS_PER_LOOP];
		__m128i cqe_tmp1, cqe_tmp2;
		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
		__m128i op_own, op_own_tmp1, op_own_tmp2;
		__m128i opcode, owner_mask, invalid_mask;
		__m128i comp_mask;
		__m128i mask;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt;
#endif
		__m128i mbp1, mbp2;
		__m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
		unsigned int p1, p2, p3;

		/* Prefetch next 4 CQEs. */
		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
		}
		/* A.0 do not cross the end of CQ. */
		mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		p = _mm_andnot_si128(mask, p);
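		/*
		 * Lanes of p hold the per-slot CQE offsets 0..3; offsets
		 * beyond the remaining count are zeroed so the tail loads
		 * stay within the ring (re-reading cq[pos] is harmless as
		 * those lanes are masked out later).
		 */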
		/* A.1 load cqes. */
		p3 = _mm_extract_epi16(p, 3);
		cqes[3] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p3].sop_drop_qpn);
		rte_compiler_barrier();
		p2 = _mm_extract_epi16(p, 2);
		cqes[2] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p2].sop_drop_qpn);
		rte_compiler_barrier();
		/* B.1 load mbuf pointers. */
		mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]);
		mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]);
		/* A.1 load a block having op_own. */
		p1 = _mm_extract_epi16(p, 1);
		cqes[1] = _mm_loadl_epi64((__m128i *)
					   &cq[pos + p1].sop_drop_qpn);
		rte_compiler_barrier();
		cqes[0] = _mm_loadl_epi64((__m128i *)
					   &cq[pos].sop_drop_qpn);
		/* B.2 copy mbuf pointers. */
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
		_mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
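		/*
		 * The I/O read barrier below keeps the loads of the op_own
		 * qwords above ordered before the loads of the remaining
		 * CQE data.
		 */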
		rte_cio_rmb();
		/* C.1 load remaining CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
		cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask);
		cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].rsvd1[3]);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].rsvd1[3]);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd2[10]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd2[10]);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask);
		pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj);
		pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj);
		pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3);
		_mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2);
		/* E.1 extract op_own field. */
		op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
		/* C.1 load remaining CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]);
		cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask);
		cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].rsvd1[3]);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].rsvd1[3]);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd2[10]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd2[10]);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask);
		pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj);
		pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj);
		pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj);
		/* E.1 extract op_own byte. */
		op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
		op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
		_mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
		/* E.2 flip owner bit to mark CQEs from last round. */
		owner_mask = _mm_and_si128(op_own, owner_check);
		if (ownership)
			owner_mask = _mm_xor_si128(owner_mask, owner_check);
		owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check);
		owner_mask = _mm_packs_epi32(owner_mask, zero);
		/* E.3 get mask for invalidated CQEs. */
		opcode = _mm_and_si128(op_own, opcode_check);
		invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode);
		invalid_mask = _mm_packs_epi32(invalid_mask, zero);
		/* E.4 mask out beyond boundary. */
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.5 merge invalid_mask with invalid owner. */
		invalid_mask = _mm_or_si128(invalid_mask, owner_mask);
		/* F.1 find compressed CQE format. */
		comp_mask = _mm_and_si128(op_own, format_check);
		comp_mask = _mm_cmpeq_epi32(comp_mask, format_check);
		comp_mask = _mm_packs_epi32(comp_mask, zero);
		/* F.2 mask out invalid entries. */
		comp_mask = _mm_andnot_si128(invalid_mask, comp_mask);
		comp_idx = _mm_cvtsi128_si64(comp_mask);
		/* F.3 get the first compressed CQE. */
		comp_idx = comp_idx ?
			   __builtin_ctzll(comp_idx) /
			   (sizeof(uint16_t) * 8) :
			   MLX5_VPMD_DESCS_PER_LOOP;
		/* E.6 mask out entries after the compressed CQE. */
		mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.7 count non-compressed valid CQEs. */
		n = _mm_cvtsi128_si64(invalid_mask);
		n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :
		    MLX5_VPMD_DESCS_PER_LOOP;
		nocmp_n += n;
		/* D.2 get the final invalid mask. */
		mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* D.3 check error in opcode. */
		opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
		opcode = _mm_packs_epi32(opcode, zero);
		opcode = _mm_andnot_si128(invalid_mask, opcode);
		/* D.4 mark if any error is set. */
		*err |= _mm_cvtsi128_si64(opcode);
		/* D.5 fill in mbuf - rearm_data and packet_type. */
		rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
		if (rxq->hw_timestamp) {
			pkts[pos]->timestamp =
				rte_be_to_cpu_64(cq[pos].timestamp);
			pkts[pos + 1]->timestamp =
				rte_be_to_cpu_64(cq[pos + p1].timestamp);
			pkts[pos + 2]->timestamp =
				rte_be_to_cpu_64(cq[pos + p2].timestamp);
			pkts[pos + 3]->timestamp =
				rte_be_to_cpu_64(cq[pos + p3].timestamp);
		}
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Add up received bytes count. */
		byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		/*
		 * Break the loop unless more valid CQEs are expected, or if
		 * there's a compressed CQE.
		 */
		if (n != MLX5_VPMD_DESCS_PER_LOOP)
			break;
	}
	/* If no new CQE seen, return without updating cq_db. */
	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
		return rcvd_pkt;
	/* Update the consumer indexes for non-compressed CQEs. */
	assert(nocmp_n <= pkts_n);
	rxq->cq_ci += nocmp_n;
	rxq->rq_pi += nocmp_n;
	rcvd_pkt += nocmp_n;
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += nocmp_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	/* Decompress the last CQE if compressed. */
	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
		assert(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
		rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]);
		/* Return more packets if needed. */
		if (nocmp_n < pkts_n) {
			uint16_t n = rxq->cq_ci - rxq->rq_pi;

			n = RTE_MIN(n, pkts_n - nocmp_n);
			rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
			rxq->rq_pi += n;
			rcvd_pkt += n;
		}
	}
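	/* Publish the new consumer index to the CQ doorbell record. */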
	rte_compiler_barrier();
	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
	return rcvd_pkt;
}

#endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */