/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2017 6WIND S.A.
 * Copyright 2017 Mellanox Technologies, Ltd
 */

#ifndef RTE_PMD_MLX5_RXTX_VEC_SSE_H_
#define RTE_PMD_MLX5_RXTX_VEC_SSE_H_

#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <smmintrin.h>

#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>

#include <mlx5_prm.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_rxtx_vec.h"
#include "mlx5_autoconf.h"

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif

/**
 * Store free buffers to RX SW ring.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param pkts
 *   Pointer to array of packets to be stored.
 * @param pkts_n
 *   Number of packets to be stored.
 */
static inline void
rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
{
	const uint16_t q_mask = (1 << rxq->elts_n) - 1;
	struct rte_mbuf **elts = &(*rxq->elts)[rxq->rq_pi & q_mask];
	unsigned int pos;
	uint16_t p = n & -2;

	for (pos = 0; pos < p; pos += 2) {
		__m128i mbp;

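		/* Two 8B mbuf pointers are copied per 16B vector load/store. */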
		mbp = _mm_loadu_si128((__m128i *)&elts[pos]);
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp);
	}
	if (n & 1)
		pkts[pos] = elts[pos];
}

/**
 * Decompress a compressed completion and fill in mbufs in RX SW ring with data
 * extracted from the title completion descriptor.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cq
 *   Pointer to completion array having a compressed completion at first.
 * @param elts
 *   Pointer to SW ring to be filled. The first mbuf has to be pre-built from
 *   the title completion descriptor to be copied to the rest of mbufs.
 *
 * @return
 *   Number of mini-CQEs successfully decompressed.
 */
static inline uint16_t
rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
		    struct rte_mbuf **elts)
{
	volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + 1);
	struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
	unsigned int pos;
	unsigned int i;
	unsigned int inv = 0;
	/* Mask to shuffle from extracted mini CQE to mbuf. */
	const __m128i shuf_mask1 =
		_mm_set_epi8(0,  1,  2,  3, /* rss, bswap32 */
			     -1, -1,         /* skip vlan_tci */
			      6,  7,         /* data_len, bswap16 */
			     -1, -1,  6,  7, /* pkt_len, bswap16 */
			     -1, -1, -1, -1  /* skip packet_type */);
	const __m128i shuf_mask2 =
		_mm_set_epi8(8,  9, 10, 11, /* rss, bswap32 */
			     -1, -1,         /* skip vlan_tci */
			     14, 15,         /* data_len, bswap16 */
			     -1, -1, 14, 15, /* pkt_len, bswap16 */
			     -1, -1, -1, -1  /* skip packet_type */);
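	/*
	 * shuf_mask1/shuf_mask2 pick the first/second 8B mini-CQE out of a
	 * single 16B load of the mini-CQE array.
	 */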
	/* Restore the compressed count. Must be 16 bits. */
	const uint16_t mcqe_n = t_pkt->data_len +
				(rxq->crc_present * RTE_ETHER_CRC_LEN);
	const __m128i rearm =
		_mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
	const __m128i rxdf =
		_mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN,
			      0, 0);
	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
#ifdef MLX5_PMD_SOFT_COUNTERS
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     14, 15,  6,  7,
			     10, 11,  2,  3);
#endif
	/*
	 * A. load mCQEs into a 128bit register.
	 * B. store rearm data to mbuf.
	 * C. combine data from mCQEs with rx_descriptor_fields1.
	 * D. store rx_descriptor_fields1.
	 * E. store flow tag (rte_flow mark).
	 */
	for (pos = 0; pos < mcqe_n; ) {
		__m128i mcqe1, mcqe2;
		__m128i rxdf1, rxdf2;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt, invalid_mask;
#endif

		for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
			if (likely(pos + i < mcqe_n))
				rte_prefetch0((void *)(cq + pos + i));
		/* A.1 load mCQEs into a 128bit register. */
		mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
		mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
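		/*
		 * Mini-CQEs come in groups of eight right after the title CQE;
		 * pos % 8 indexes within the current group (mcq is re-based
		 * every eight entries below).
		 */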
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
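		/*
		 * The 0x23 blends above keep packet_type and vlan_tci from the
		 * title packet's descriptor; length and RSS hash come from the
		 * mini-CQE.
		 */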
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				 &elts[pos]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 1]->rx_descriptor_fields1,
				 rxdf2);
		/* B.1 store rearm data to mbuf. */
		_mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm);
		_mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm);
		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
		rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
		rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
		/* D.1 store rx_descriptor_fields1. */
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 2]->rx_descriptor_fields1,
				 rxdf1);
		_mm_storeu_si128((__m128i *)
				 &elts[pos + 3]->rx_descriptor_fields1,
				 rxdf2);
#ifdef MLX5_PMD_SOFT_COUNTERS
		invalid_mask = _mm_set_epi64x(0,
					      (mcqe_n - pos) *
					      sizeof(uint16_t) * 8);
		invalid_mask = _mm_sll_epi64(ones, invalid_mask);
		mcqe1 = _mm_srli_si128(mcqe1, 4);
		byte_cnt = _mm_blend_epi16(mcqe1, mcqe2, 0xcc);
		byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		if (rxq->mark) {
			/* E.1 store flow tag (rte_flow mark). */
			elts[pos]->hash.fdir.hi = flow_tag;
			elts[pos + 1]->hash.fdir.hi = flow_tag;
			elts[pos + 2]->hash.fdir.hi = flow_tag;
			elts[pos + 3]->hash.fdir.hi = flow_tag;
		}
		if (rxq->dynf_meta) {
			int32_t offs = rxq->flow_meta_offset;
			const uint32_t meta =
				*RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *);

			/* Check if title packet has valid metadata. */
			if (meta) {
				MLX5_ASSERT(t_pkt->ol_flags &
					    rxq->flow_meta_mask);
				*RTE_MBUF_DYNFIELD(elts[pos], offs,
						   uint32_t *) = meta;
				*RTE_MBUF_DYNFIELD(elts[pos + 1], offs,
						   uint32_t *) = meta;
				*RTE_MBUF_DYNFIELD(elts[pos + 2], offs,
						   uint32_t *) = meta;
				*RTE_MBUF_DYNFIELD(elts[pos + 3], offs,
						   uint32_t *) = meta;
			}
		}
		pos += MLX5_VPMD_DESCS_PER_LOOP;
		/* Move to next CQE and invalidate consumed CQEs. */
		if (!(pos & 0x7) && pos < mcqe_n) {
			mcq = (void *)(cq + pos);
			for (i = 0; i < 8; ++i)
				cq[inv++].op_own = MLX5_CQE_INVALIDATE;
		}
	}
	/* Invalidate the rest of CQEs. */
	for (; inv < mcqe_n; ++inv)
		cq[inv].op_own = MLX5_CQE_INVALIDATE;
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += mcqe_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	rxq->cq_ci += mcqe_n;
	return mcqe_n;
}

/**
 * Calculate packet type and offload flag for mbuf and store it.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param cqes[4]
 *   Array of four 16bytes completions extracted from the original completion
 *   descriptor.
 * @param op_err
 *   Opcode vector having responder error status. Each field is 4B.
 * @param pkts
 *   Pointer to array of packets to be filled.
 */
static inline void
rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
			 __m128i op_err, struct rte_mbuf **pkts)
{
	__m128i pinfo0, pinfo1;
	__m128i pinfo, ptype;
	__m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH |
					  rxq->hw_timestamp * PKT_RX_TIMESTAMP);
	__m128i cv_flags;
	const __m128i zero = _mm_setzero_si128();
	const __m128i ptype_mask =
		_mm_set_epi32(0xfd06, 0xfd06, 0xfd06, 0xfd06);
	const __m128i ptype_ol_mask =
		_mm_set_epi32(0x106, 0x106, 0x106, 0x106);
	const __m128i pinfo_mask =
		_mm_set_epi32(0x3, 0x3, 0x3, 0x3);
	const __m128i cv_flag_sel =
		_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
			     (uint8_t)((PKT_RX_IP_CKSUM_GOOD |
					PKT_RX_L4_CKSUM_GOOD) >> 1),
			     0,
			     (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1),
			     0,
			     (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1),
			     (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
			     0);
	const __m128i cv_mask =
		_mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
	const __m128i mbuf_init =
		_mm_load_si128((__m128i *)&rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;
	uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3;

	/* Extract pkt_info field. */
	pinfo0 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
	pinfo = _mm_unpacklo_epi64(pinfo0, pinfo1);
	/* Extract hdr_type_etc field. */
	pinfo0 = _mm_unpackhi_epi32(cqes[0], cqes[1]);
	pinfo1 = _mm_unpackhi_epi32(cqes[2], cqes[3]);
	ptype = _mm_unpacklo_epi64(pinfo0, pinfo1);
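	/*
	 * After the unpacks each 32-bit lane holds one packet: pinfo carries
	 * dword 0 (pkt_info/flow tag) and ptype carries dword 2
	 * (hdr_type_etc/vlan_info) of the extracted CQE.
	 */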
	if (rxq->mark) {
		const __m128i pinfo_ft_mask =
			_mm_set_epi32(0xffffff00, 0xffffff00,
				      0xffffff00, 0xffffff00);
		const __m128i fdir_flags = _mm_set1_epi32(PKT_RX_FDIR);
		__m128i fdir_id_flags = _mm_set1_epi32(PKT_RX_FDIR_ID);
		__m128i flow_tag, invalid_mask;

		flow_tag = _mm_and_si128(pinfo, pinfo_ft_mask);
		/* Check if flow tag is non-zero then set PKT_RX_FDIR. */
		invalid_mask = _mm_cmpeq_epi32(flow_tag, zero);
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(invalid_mask,
							 fdir_flags));
		/* Mask out invalid entries. */
		fdir_id_flags = _mm_andnot_si128(invalid_mask, fdir_id_flags);
		/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
		ol_flags = _mm_or_si128(ol_flags,
					_mm_andnot_si128(
						_mm_cmpeq_epi32(flow_tag,
								pinfo_ft_mask),
						fdir_id_flags));
	}
	/*
	 * Merge the two fields to generate the following:
	 * bit[1]     = l3_ok
	 * bit[2]     = l4_ok
	 * bit[8]     = cv
	 * bit[11:10] = l3_hdr_type
	 * bit[14:12] = l4_hdr_type
	 * bit[15]    = ip_frag
	 * bit[16]    = tunneled
	 * bit[17]    = outer_l3_type
	 */
	ptype = _mm_and_si128(ptype, ptype_mask);
	pinfo = _mm_and_si128(pinfo, pinfo_mask);
	pinfo = _mm_slli_epi32(pinfo, 16);
	/* Make pinfo have merged fields for ol_flags calculation. */
	pinfo = _mm_or_si128(ptype, pinfo);
	ptype = _mm_srli_epi32(pinfo, 10);
	ptype = _mm_packs_epi32(ptype, zero);
	/* Errored packets will have RTE_PTYPE_ALL_MASK. */
	op_err = _mm_srli_epi16(op_err, 8);
	ptype = _mm_or_si128(ptype, op_err);
	pt_idx0 = _mm_extract_epi8(ptype, 0);
	pt_idx1 = _mm_extract_epi8(ptype, 2);
	pt_idx2 = _mm_extract_epi8(ptype, 4);
	pt_idx3 = _mm_extract_epi8(ptype, 6);
	pkts[0]->packet_type = mlx5_ptype_table[pt_idx0] |
			       !!(pt_idx0 & (1 << 6)) * rxq->tunnel;
	pkts[1]->packet_type = mlx5_ptype_table[pt_idx1] |
			       !!(pt_idx1 & (1 << 6)) * rxq->tunnel;
	pkts[2]->packet_type = mlx5_ptype_table[pt_idx2] |
			       !!(pt_idx2 & (1 << 6)) * rxq->tunnel;
	pkts[3]->packet_type = mlx5_ptype_table[pt_idx3] |
			       !!(pt_idx3 & (1 << 6)) * rxq->tunnel;
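	/*
	 * Bit 6 of the table index marks a tunneled packet; the !!(...)
	 * multiply ORs in rxq->tunnel only for those entries.
	 */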
	/* Fill flags for checksum and VLAN. */
	pinfo = _mm_and_si128(pinfo, ptype_ol_mask);
	pinfo = _mm_shuffle_epi8(cv_flag_sel, pinfo);
	/* Locate checksum flags at byte[2:1] and merge with VLAN flags. */
	cv_flags = _mm_slli_epi32(pinfo, 9);
	cv_flags = _mm_or_si128(pinfo, cv_flags);
	/* Move back flags to start from byte[0]. */
	cv_flags = _mm_srli_epi32(cv_flags, 8);
	/* Mask out garbage bits. */
	cv_flags = _mm_and_si128(cv_flags, cv_mask);
	/* Merge to ol_flags. */
	ol_flags = _mm_or_si128(ol_flags, cv_flags);
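	/*
	 * Each packet's 32-bit ol_flags lane is shifted into bytes 8-11 so the
	 * 0x30 word-blend below lays it over mbuf_init next to rearm_data.
	 */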
	/* Merge mbuf_init and ol_flags. */
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 8), 0x30);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(ol_flags, 4), 0x30);
	rearm2 = _mm_blend_epi16(mbuf_init, ol_flags, 0x30);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(ol_flags, 4), 0x30);
	/* Write 8B rearm_data and 8B ol_flags. */
	_mm_store_si128((__m128i *)&pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&pkts[3]->rearm_data, rearm3);
}

/**
 * Receive burst of packets. An errored completion also consumes a mbuf, but the
 * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
 * before returning to application.
 *
 * @param rxq
 *   Pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 * @param[out] err
 *   Pointer to a flag. Set non-zero value if pkts array has at least one error
 *   packet to handle.
 *
 * @return
 *   Number of packets received including errors (<= pkts_n).
 */
static inline uint16_t
rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
	    uint64_t *err)
{
	const uint16_t q_n = 1 << rxq->cqe_n;
	const uint16_t q_mask = q_n - 1;
	volatile struct mlx5_cqe *cq;
	struct rte_mbuf **elts;
	unsigned int pos;
	uint64_t n;
	uint16_t repl_n;
	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
	uint16_t nocmp_n = 0;
	uint16_t rcvd_pkt = 0;
	unsigned int cq_idx = rxq->cq_ci & q_mask;
	unsigned int elts_idx;
	unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
	const __m128i owner_check =
		_mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL);
	const __m128i opcode_check =
		_mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL);
	const __m128i format_check =
		_mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL);
	const __m128i resp_err_check =
		_mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL);
#ifdef MLX5_PMD_SOFT_COUNTERS
	uint32_t rcvd_byte = 0;
	/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
	const __m128i len_shuf_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			     12, 13,  8,  9,
			      4,  5,  0,  1);
#endif
	/* Mask to shuffle from extracted CQE to mbuf. */
	const __m128i shuf_mask =
		_mm_set_epi8(-1,  3,  2,  1, /* fdir.hi */
			     12, 13, 14, 15, /* rss, bswap32 */
			     10, 11,         /* vlan_tci, bswap16 */
			      4,  5,         /* data_len, bswap16 */
			     -1, -1,         /* zero out 2nd half of pkt_len */
			      4,  5          /* pkt_len, bswap16 */);
	/* Mask to blend from the last Qword to the first DQword. */
	const __m128i blend_mask =
		_mm_set_epi8(-1, -1, -1, -1,
			     -1, -1, -1, -1,
			      0,  0,  0,  0,
			      0,  0,  0, -1);
	const __m128i zero = _mm_setzero_si128();
	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
	const __m128i crc_adj =
		_mm_set_epi16(0, 0, 0, 0, 0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN,
			      0,
			      rxq->crc_present * RTE_ETHER_CRC_LEN);
	const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);

	MLX5_ASSERT(rxq->sges_n == 0);
	MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
	cq = &(*rxq->cqes)[cq_idx];
	rte_prefetch0(cq);
	rte_prefetch0(cq + 1);
	rte_prefetch0(cq + 2);
	rte_prefetch0(cq + 3);
	pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
	repl_n = q_n - (rxq->rq_ci - rxq->rq_pi);
	if (repl_n >= rxq->rq_repl_thresh)
		mlx5_rx_replenish_bulk_mbuf(rxq, repl_n);
	/* See if there're unreturned mbufs from compressed CQE. */
	rcvd_pkt = rxq->decompressed;
	if (rcvd_pkt > 0) {
		rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
		rxq_copy_mbuf_v(rxq, pkts, rcvd_pkt);
		rxq->rq_pi += rcvd_pkt;
		rxq->decompressed -= rcvd_pkt;
		pkts += rcvd_pkt;
	}
	elts_idx = rxq->rq_pi & q_mask;
	elts = &(*rxq->elts)[elts_idx];
	/* Not to overflow pkts array. */
	pkts_n = RTE_ALIGN_FLOOR(pkts_n - rcvd_pkt, MLX5_VPMD_DESCS_PER_LOOP);
	/* Not to cross queue end. */
	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
	if (!pkts_n)
		return rcvd_pkt;
	/* At this point, there shouldn't be any remained packets. */
	MLX5_ASSERT(rxq->decompressed == 0);
	/*
	 * A. load first Qword (8bytes) in one loop.
	 * B. copy 4 mbuf pointers from elts ring to returning pkts.
	 * C. load remained CQE data and extract necessary fields.
	 *    Final 16bytes cqes[] extracted from original 64bytes CQE has the
	 *    following structure:
	 *        struct {
	 *          uint8_t  pkt_info;
	 *          uint8_t  flow_tag[3];
	 *          uint16_t byte_cnt;
	 *          uint8_t  rsvd4;
	 *          uint8_t  op_own;
	 *          uint16_t hdr_type_etc;
	 *          uint16_t vlan_info;
	 *          uint32_t rx_has_res;
	 *        } c;
	 * D. fill in mbuf.
	 * E. get valid CQEs.
	 * F. find compressed CQE.
	 */
	for (pos = 0;
	     pos < pkts_n;
	     pos += MLX5_VPMD_DESCS_PER_LOOP) {
		__m128i cqes[MLX5_VPMD_DESCS_PER_LOOP];
		__m128i cqe_tmp1, cqe_tmp2;
		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
		__m128i op_own, op_own_tmp1, op_own_tmp2;
		__m128i opcode, owner_mask, invalid_mask;
		__m128i comp_mask;
		__m128i mask;
#ifdef MLX5_PMD_SOFT_COUNTERS
		__m128i byte_cnt;
#endif
		__m128i mbp1, mbp2;
		__m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
		unsigned int p1, p2, p3;

		/* Prefetch next 4 CQEs. */
		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
			rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
		}
		/* A.0 do not cross the end of CQ. */
		mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		p = _mm_andnot_si128(mask, p);
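		/*
		 * p holds per-lane CQE offsets {0,1,2,3}; lanes beyond the
		 * burst end were zeroed above so out-of-range slots re-read
		 * cq[pos] and are later masked out as invalid.
		 */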
		/* A.1 load cqes. */
		p3 = _mm_extract_epi16(p, 3);
		cqes[3] = _mm_loadl_epi64((__m128i *)
					  &cq[pos + p3].sop_drop_qpn);
		rte_compiler_barrier();
		p2 = _mm_extract_epi16(p, 2);
		cqes[2] = _mm_loadl_epi64((__m128i *)
					  &cq[pos + p2].sop_drop_qpn);
		rte_compiler_barrier();
		/* B.1 load mbuf pointers. */
		mbp1 = _mm_loadu_si128((__m128i *)&elts[pos]);
		mbp2 = _mm_loadu_si128((__m128i *)&elts[pos + 2]);
		/* A.1 load a block having op_own. */
		p1 = _mm_extract_epi16(p, 1);
		cqes[1] = _mm_loadl_epi64((__m128i *)
					  &cq[pos + p1].sop_drop_qpn);
		rte_compiler_barrier();
		cqes[0] = _mm_loadl_epi64((__m128i *)
					  &cq[pos].sop_drop_qpn);
		/* B.2 copy mbuf pointers. */
		_mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
		_mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
		rte_cio_rmb();
		/* C.1 load remained CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
		cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask);
		cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].csum);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].csum);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd4[2]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd4[2]);
		cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04);
		cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask);
		pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj);
		pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj);
		pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3);
		_mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2);
		/* E.1 extract op_own field. */
		op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
		/* C.1 load remained CQE data and extract necessary fields. */
		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]);
		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]);
		cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask);
		cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask);
		cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].csum);
		cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].csum);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30);
		cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd4[2]);
		cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd4[2]);
		cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04);
		cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04);
		/* C.2 generate final structure for mbuf with swapping bytes. */
		pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask);
		pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask);
		/* C.3 adjust CRC length. */
		pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj);
		pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj);
		/* C.4 adjust flow mark. */
		pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj);
		pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj);
		/* E.1 extract op_own byte. */
		op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
		op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2);
		/* D.1 fill in mbuf - rx_descriptor_fields1. */
		_mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
		_mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
		/* E.2 flip owner bit to mark CQEs from last round. */
		owner_mask = _mm_and_si128(op_own, owner_check);
		if (ownership)
			owner_mask = _mm_xor_si128(owner_mask, owner_check);
		owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check);
		owner_mask = _mm_packs_epi32(owner_mask, zero);
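		/*
		 * A CQE is valid only when its owner bit matches the current
		 * phase of cq_ci ('ownership'); lanes with a stale owner bit
		 * are folded into invalid_mask below.
		 */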
		/* E.3 get mask for invalidated CQEs. */
		opcode = _mm_and_si128(op_own, opcode_check);
		invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode);
		invalid_mask = _mm_packs_epi32(invalid_mask, zero);
		/* E.4 mask out beyond boundary. */
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.5 merge invalid_mask with invalid owner. */
		invalid_mask = _mm_or_si128(invalid_mask, owner_mask);
		/* F.1 find compressed CQE format. */
		comp_mask = _mm_and_si128(op_own, format_check);
		comp_mask = _mm_cmpeq_epi32(comp_mask, format_check);
		comp_mask = _mm_packs_epi32(comp_mask, zero);
		/* F.2 mask out invalid entries. */
		comp_mask = _mm_andnot_si128(invalid_mask, comp_mask);
		comp_idx = _mm_cvtsi128_si64(comp_mask);
		/* F.3 get the first compressed CQE. */
		comp_idx = comp_idx ?
			   __builtin_ctzll(comp_idx) /
			   (sizeof(uint16_t) * 8) :
			   MLX5_VPMD_DESCS_PER_LOOP;
		/* E.6 mask out entries after the compressed CQE. */
		mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* E.7 count non-compressed valid CQEs. */
		n = _mm_cvtsi128_si64(invalid_mask);
		n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :
		    MLX5_VPMD_DESCS_PER_LOOP;
		nocmp_n += n;
		/* D.2 get the final invalid mask. */
		mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);
		mask = _mm_sll_epi64(ones, mask);
		invalid_mask = _mm_or_si128(invalid_mask, mask);
		/* D.3 check error in opcode. */
		opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
		opcode = _mm_packs_epi32(opcode, zero);
		opcode = _mm_andnot_si128(invalid_mask, opcode);
		/* D.4 mark if any error is set */
		*err |= _mm_cvtsi128_si64(opcode);
		/* D.5 fill in mbuf - rearm_data and packet_type. */
		rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
		if (rxq->hw_timestamp) {
			pkts[pos]->timestamp =
				rte_be_to_cpu_64(cq[pos].timestamp);
			pkts[pos + 1]->timestamp =
				rte_be_to_cpu_64(cq[pos + p1].timestamp);
			pkts[pos + 2]->timestamp =
				rte_be_to_cpu_64(cq[pos + p2].timestamp);
			pkts[pos + 3]->timestamp =
				rte_be_to_cpu_64(cq[pos + p3].timestamp);
		}
		if (rxq->dynf_meta) {
			/* This code is subject for further optimization. */
			int32_t offs = rxq->flow_meta_offset;

			*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
				cq[pos].flow_table_metadata;
			*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
				cq[pos + p1].flow_table_metadata;
			*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
				cq[pos + p2].flow_table_metadata;
			*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
				cq[pos + p3].flow_table_metadata;
			if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
				pkts[pos]->ol_flags |= rxq->flow_meta_mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
				pkts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
				pkts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
			if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
				pkts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
		}
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Add up received bytes count. */
		byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
#endif
		/*
		 * Break the loop unless more valid CQE is expected, or if
		 * there's a compressed CQE.
		 */
		if (n != MLX5_VPMD_DESCS_PER_LOOP)
			break;
	}
	/* If no new CQE seen, return without updating cq_db. */
	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
		return rcvd_pkt;
	/* Update the consumer indexes for non-compressed CQEs. */
	MLX5_ASSERT(nocmp_n <= pkts_n);
	rxq->cq_ci += nocmp_n;
	rxq->rq_pi += nocmp_n;
	rcvd_pkt += nocmp_n;
#ifdef MLX5_PMD_SOFT_COUNTERS
	rxq->stats.ipackets += nocmp_n;
	rxq->stats.ibytes += rcvd_byte;
#endif
	/* Decompress the last CQE if compressed. */
	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
		MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
		rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
							&elts[nocmp_n]);
		/* Return more packets if needed. */
		if (nocmp_n < pkts_n) {
			uint16_t n = rxq->decompressed;

			n = RTE_MIN(n, pkts_n - nocmp_n);
			rxq_copy_mbuf_v(rxq, &pkts[nocmp_n], n);
			rxq->rq_pi += n;
			rcvd_pkt += n;
			rxq->decompressed -= n;
		}
	}
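	/*
	 * Publish the new consumer index to the CQ doorbell record; the
	 * compiler barrier keeps this store after the CQE processing above.
	 */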
	rte_compiler_barrier();
	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
	return rcvd_pkt;
}

#endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */