/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include "ice_rxtx_vec_common.h"

#include <x86intrin.h>

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif
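
/*
 * AVX2 vector routines for the ice PMD: RX descriptor ring rearm, raw burst
 * receive (optionally recording split-packet flags so scattered packets can
 * be reassembled), and fixed-size burst transmit.
 */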

static inline void
ice_rxq_rearm(struct ice_rx_queue *rxq)
{
	int i;
	uint16_t rx_id;
	volatile union ice_rx_desc *rxdp;
	struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];

	rxdp = rxq->rx_ring + rxq->rxrearm_start;

	/* Pull 'n' more MBUFs into the software ring */
	if (rte_mempool_get_bulk(rxq->mp,
				 (void *)rxep,
				 ICE_RXQ_REARM_THRESH) < 0) {
		if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
		    rxq->nb_rx_desc) {
			__m128i dma_addr0;

			dma_addr0 = _mm_setzero_si128();
			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
				rxep[i].mbuf = &rxq->fake_mbuf;
				_mm_store_si128((__m128i *)&rxdp[i].read,
						dma_addr0);
			}
		}
		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
			ICE_RXQ_REARM_THRESH;
		return;
	}

#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
	struct rte_mbuf *mb0, *mb1;
	__m128i dma_addr0, dma_addr1;
	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
					  RTE_PKTMBUF_HEADROOM);
	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
	for (i = 0; i < ICE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
		__m128i vaddr0, vaddr1;

		mb0 = rxep[0].mbuf;
		mb1 = rxep[1].mbuf;

		/* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_physaddr) !=
				 offsetof(struct rte_mbuf, buf_addr) + 8);
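		/*
		 * Because buf_physaddr sits immediately after buf_addr
		 * (checked above), one 128-bit load picks up the virtual
		 * address in the low quadword and the physical address in
		 * the high quadword; unpackhi_epi64 below then leaves the
		 * physical address in both halves.
		 */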
		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);

		/* convert pa to dma_addr hdr/data */
		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);

		/* add headroom to pa values */
		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);

		/* flush desc with pa dma_addr */
		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
	}
#else
	struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
	__m256i dma_addr0_1, dma_addr2_3;
	__m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
	/* Initialize the mbufs in vector, process 4 mbufs in one loop */
	for (i = 0; i < ICE_RXQ_REARM_THRESH;
			i += 4, rxep += 4, rxdp += 4) {
		__m128i vaddr0, vaddr1, vaddr2, vaddr3;
		__m256i vaddr0_1, vaddr2_3;

		mb0 = rxep[0].mbuf;
		mb1 = rxep[1].mbuf;
		mb2 = rxep[2].mbuf;
		mb3 = rxep[3].mbuf;

		/* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_physaddr) !=
				 offsetof(struct rte_mbuf, buf_addr) + 8);
		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
		vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
		vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);

		/*
		 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
		 * into the high lanes. Similarly for 2 & 3
		 */
		vaddr0_1 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
						vaddr1, 1);
		vaddr2_3 =
			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
						vaddr3, 1);

		/* convert pa to dma_addr hdr/data */
		dma_addr0_1 = _mm256_unpackhi_epi64(vaddr0_1, vaddr0_1);
		dma_addr2_3 = _mm256_unpackhi_epi64(vaddr2_3, vaddr2_3);

		/* add headroom to pa values */
		dma_addr0_1 = _mm256_add_epi64(dma_addr0_1, hdr_room);
		dma_addr2_3 = _mm256_add_epi64(dma_addr2_3, hdr_room);

		/* flush desc with pa dma_addr */
		_mm256_store_si256((__m256i *)&rxdp->read, dma_addr0_1);
		_mm256_store_si256((__m256i *)&(rxdp + 2)->read, dma_addr2_3);
	}
#endif

	rxq->rxrearm_start += ICE_RXQ_REARM_THRESH;
	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
		rxq->rxrearm_start = 0;

	rxq->rxrearm_nb -= ICE_RXQ_REARM_THRESH;

	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
			   (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
}

#define PKTLEN_SHIFT 10

static inline uint16_t
_ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
			    uint16_t nb_pkts, uint8_t *split_packet)
{
#define ICE_DESCS_PER_LOOP_AVX 8

	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
			0, rxq->mbuf_initializer);
	struct ice_rx_entry *sw_ring = &rxq->sw_ring[rxq->rx_tail];
	volatile union ice_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
	const int avx_aligned = ((rxq->rx_tail & 1) == 0);

	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP_AVX */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP_AVX);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > ICE_RXQ_REARM_THRESH)
		ice_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
	      rte_cpu_to_le_32(1 << ICE_RX_DESC_STATUS_DD_S)))
		return 0;

	/* constants used in processing loop */
	const __m256i crc_adjust =
		_mm256_set_epi16
			(/* first descriptor */
			 0, 0, 0,       /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 0,             /* ignore high-16bits of pkt_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0, 0,          /* ignore pkt_type field */
			 /* second descriptor */
			 0, 0, 0,       /* ignore non-length fields */
			 -rxq->crc_len, /* sub crc on data_len */
			 0,             /* ignore high-16bits of pkt_len */
			 -rxq->crc_len, /* sub crc on pkt_len */
			 0, 0           /* ignore pkt_type field */
			);

	/* 8 packets DD mask, LSB in each 32-bit value */
	const __m256i dd_check = _mm256_set1_epi32(1);

	/* 8 packets EOP mask, second-LSB in each 32-bit value */
	const __m256i eop_check = _mm256_slli_epi32(dd_check,
						    ICE_RX_DESC_STATUS_EOF_S);

	/* mask to shuffle from desc. to mbuf (2 descriptors) */
	const __m256i shuf_msk =
		_mm256_set_epi8
			(/* first descriptor */
			 7, 6, 5, 4,  /* octet 4~7, 32bits rss */
			 3, 2,        /* octet 2~3, low 16 bits vlan_macip */
			 15, 14,      /* octet 15~14, 16 bits data_len */
			 0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
			 15, 14,      /* octet 15~14, low 16 bits pkt_len */
			 0xFF, 0xFF,  /* pkt_type set as unknown */
			 0xFF, 0xFF,  /* pkt_type set as unknown */
			 /* second descriptor */
			 7, 6, 5, 4,  /* octet 4~7, 32bits rss */
			 3, 2,        /* octet 2~3, low 16 bits vlan_macip */
			 15, 14,      /* octet 15~14, 16 bits data_len */
			 0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
			 15, 14,      /* octet 15~14, low 16 bits pkt_len */
			 0xFF, 0xFF,  /* pkt_type set as unknown */
			 0xFF, 0xFF   /* pkt_type set as unknown */
			);
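
	/*
	 * Each 16-byte descriptor half is shuffled straight into the mbuf's
	 * rx_descriptor_fields1 layout: pkt_type (filled in later from the
	 * ptype table), pkt_len, data_len, vlan_tci and the 32-bit RSS hash,
	 * which is what the offsetof() checks below verify.
	 */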
	/*
	 * compile-time check the above crc and shuffle layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi
	 * calls above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* Status/Error flag masks */
	/*
	 * mask everything except RSS, flow director and VLAN flags
	 * bit2 is for VLAN tag, bit11 for flow director indication
	 * bit13:12 for RSS indication. Bits 3-5 of error
	 * field (bits 22-24) are for IP/L4 checksum errors
	 */
	const __m256i flags_mask =
		_mm256_set1_epi32((1 << 2) | (1 << 11) |
				  (3 << 12) | (7 << 22));
	/*
	 * data to be shuffled by result of flag mask. If VLAN bit is set,
	 * (bit 2), then position 4 in this array will be used in the
	 * shuffle
	 */
	const __m256i vlan_flags_shuf =
		_mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
				 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
	/*
	 * data to be shuffled by result of flag mask, shifted down 11.
	 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
	 * place.
	 */
	const __m256i rss_flags_shuf =
		_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
				0, 0, 0, 0, PKT_RX_FDIR, 0, /* end up 128-bits */
				0, 0, 0, 0, 0, 0, 0, 0,
				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
				0, 0, 0, 0, PKT_RX_FDIR, 0);
	/*
	 * data to be shuffled by the result of the flags mask shifted by 22
	 * bits. This gives us the l3_l4 flags.
	 */
	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
			/* shift right 1 bit to make sure it not exceed 255 */
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
			 PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
			 PKT_RX_L4_CKSUM_BAD) >> 1,
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
			PKT_RX_IP_CKSUM_BAD >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
			/* second 128-bits */
			0, 0, 0, 0, 0, 0, 0, 0,
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
			 PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
			 PKT_RX_L4_CKSUM_BAD) >> 1,
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
			PKT_RX_IP_CKSUM_BAD >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);

	const __m256i cksum_mask =
		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
				  PKT_RX_EIP_CKSUM_BAD);
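
	/*
	 * The table entries above are the final ol_flags values pre-shifted
	 * right by one so each fits in a byte; after the shuffle the result
	 * is shifted left by one again and masked with cksum_mask so only
	 * checksum-related flag bits survive.
	 */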

	RTE_SET_USED(avx_aligned); /* for 32B descriptors we don't use this */

	uint16_t i, received;

	for (i = 0, received = 0; i < nb_pkts;
	     i += ICE_DESCS_PER_LOOP_AVX,
	     rxdp += ICE_DESCS_PER_LOOP_AVX) {
		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
		_mm256_storeu_si256((void *)&rx_pkts[i],
				    _mm256_loadu_si256((void *)&sw_ring[i]));
#ifdef RTE_ARCH_X86_64
		_mm256_storeu_si256
			((void *)&rx_pkts[i + 4],
			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
#endif

		__m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;
#ifdef RTE_LIBRTE_ICE_16BYTE_RX_DESC
		/* for AVX we need alignment otherwise loads are not atomic */
		if (avx_aligned) {
			/* load in descriptors, 2 at a time, in reverse order */
			raw_desc6_7 = _mm256_load_si256((void *)(rxdp + 6));
			rte_compiler_barrier();
			raw_desc4_5 = _mm256_load_si256((void *)(rxdp + 4));
			rte_compiler_barrier();
			raw_desc2_3 = _mm256_load_si256((void *)(rxdp + 2));
			rte_compiler_barrier();
			raw_desc0_1 = _mm256_load_si256((void *)(rxdp + 0));
		} else
#endif
		{
			const __m128i raw_desc7 =
				_mm_load_si128((void *)(rxdp + 7));
			rte_compiler_barrier();
			const __m128i raw_desc6 =
				_mm_load_si128((void *)(rxdp + 6));
			rte_compiler_barrier();
			const __m128i raw_desc5 =
				_mm_load_si128((void *)(rxdp + 5));
			rte_compiler_barrier();
			const __m128i raw_desc4 =
				_mm_load_si128((void *)(rxdp + 4));
			rte_compiler_barrier();
			const __m128i raw_desc3 =
				_mm_load_si128((void *)(rxdp + 3));
			rte_compiler_barrier();
			const __m128i raw_desc2 =
				_mm_load_si128((void *)(rxdp + 2));
			rte_compiler_barrier();
			const __m128i raw_desc1 =
				_mm_load_si128((void *)(rxdp + 1));
			rte_compiler_barrier();
			const __m128i raw_desc0 =
				_mm_load_si128((void *)(rxdp + 0));

			raw_desc6_7 =
				_mm256_inserti128_si256
					(_mm256_castsi128_si256(raw_desc6),
					 raw_desc7, 1);
			raw_desc4_5 =
				_mm256_inserti128_si256
					(_mm256_castsi128_si256(raw_desc4),
					 raw_desc5, 1);
			raw_desc2_3 =
				_mm256_inserti128_si256
					(_mm256_castsi128_si256(raw_desc2),
					 raw_desc3, 1);
			raw_desc0_1 =
				_mm256_inserti128_si256
					(_mm256_castsi128_si256(raw_desc0),
					 raw_desc1, 1);
		}

		if (split_packet) {
			int j;

			for (j = 0; j < ICE_DESCS_PER_LOOP_AVX; j++)
				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
		}

		/*
		 * convert descriptors 4-7 into mbufs, adjusting length and
		 * re-arranging fields. Then write into the mbuf
		 */
		const __m256i len6_7 = _mm256_slli_epi32(raw_desc6_7,
							 PKTLEN_SHIFT);
		const __m256i len4_5 = _mm256_slli_epi32(raw_desc4_5,
							 PKTLEN_SHIFT);
		const __m256i desc6_7 = _mm256_blend_epi16(raw_desc6_7,
							   len6_7, 0x80);
		const __m256i desc4_5 = _mm256_blend_epi16(raw_desc4_5,
							   len4_5, 0x80);
		__m256i mb6_7 = _mm256_shuffle_epi8(desc6_7, shuf_msk);
		__m256i mb4_5 = _mm256_shuffle_epi8(desc4_5, shuf_msk);

		mb6_7 = _mm256_add_epi16(mb6_7, crc_adjust);
		mb4_5 = _mm256_add_epi16(mb4_5, crc_adjust);
		/*
		 * to get packet types, shift 64-bit values down 30 bits
		 * and so ptype is in lower 8-bits in each
		 */
		const __m256i ptypes6_7 = _mm256_srli_epi64(desc6_7, 30);
		const __m256i ptypes4_5 = _mm256_srli_epi64(desc4_5, 30);
		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 24);
		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 8);
		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 24);
		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 8);

		mb6_7 = _mm256_insert_epi32(mb6_7, ptype_tbl[ptype7], 4);
		mb6_7 = _mm256_insert_epi32(mb6_7, ptype_tbl[ptype6], 0);
		mb4_5 = _mm256_insert_epi32(mb4_5, ptype_tbl[ptype5], 4);
		mb4_5 = _mm256_insert_epi32(mb4_5, ptype_tbl[ptype4], 0);
		/* merge the status bits into one register */
		const __m256i status4_7 = _mm256_unpackhi_epi32(desc6_7,
								desc4_5);

		/*
		 * convert descriptors 0-3 into mbufs, adjusting length and
		 * re-arranging fields. Then write into the mbuf
		 */
		const __m256i len2_3 = _mm256_slli_epi32(raw_desc2_3,
							 PKTLEN_SHIFT);
		const __m256i len0_1 = _mm256_slli_epi32(raw_desc0_1,
							 PKTLEN_SHIFT);
		const __m256i desc2_3 = _mm256_blend_epi16(raw_desc2_3,
							   len2_3, 0x80);
		const __m256i desc0_1 = _mm256_blend_epi16(raw_desc0_1,
							   len0_1, 0x80);
		__m256i mb2_3 = _mm256_shuffle_epi8(desc2_3, shuf_msk);
		__m256i mb0_1 = _mm256_shuffle_epi8(desc0_1, shuf_msk);

		mb2_3 = _mm256_add_epi16(mb2_3, crc_adjust);
		mb0_1 = _mm256_add_epi16(mb0_1, crc_adjust);
		/* get the packet types */
		const __m256i ptypes2_3 = _mm256_srli_epi64(desc2_3, 30);
		const __m256i ptypes0_1 = _mm256_srli_epi64(desc0_1, 30);
		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 24);
		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 8);
		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 24);
		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 8);

		mb2_3 = _mm256_insert_epi32(mb2_3, ptype_tbl[ptype3], 4);
		mb2_3 = _mm256_insert_epi32(mb2_3, ptype_tbl[ptype2], 0);
		mb0_1 = _mm256_insert_epi32(mb0_1, ptype_tbl[ptype1], 4);
		mb0_1 = _mm256_insert_epi32(mb0_1, ptype_tbl[ptype0], 0);
		/* merge the status bits into one register */
		const __m256i status0_3 = _mm256_unpackhi_epi32(desc2_3,
								desc0_1);

		/*
		 * take the two sets of status bits and merge to one
		 * After merge, the packets status flags are in the
		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
		 */
		__m256i status0_7 = _mm256_unpacklo_epi64(status4_7,
							  status0_3);

		/* now do flag manipulation */

		/* get only flag/error bits we want */
		const __m256i flag_bits =
			_mm256_and_si256(status0_7, flags_mask);
		/* set vlan and rss flags */
		const __m256i vlan_flags =
			_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
		const __m256i rss_flags =
			_mm256_shuffle_epi8(rss_flags_shuf,
					    _mm256_srli_epi32(flag_bits, 11));
		/*
		 * l3_l4_error flags, shuffle, then shift to correct adjustment
		 * of flags in flags_shuf, and finally mask out extra bits
		 */
		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
				_mm256_srli_epi32(flag_bits, 22));
		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);

		const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
				_mm256_or_si256(rss_flags, vlan_flags));
		/*
		 * At this point, we have the 8 sets of flags in the low 16-bits
		 * of each 32-bit value in vlan0.
		 * We want to extract these, and merge them with the mbuf init
		 * data so we can do a single write to the mbuf to set the flags
		 * and all the other initialization fields. Extracting the
		 * appropriate flags means that we have to do a shift and blend
		 * for each mbuf before we do the write. However, we can also
		 * add in the previously computed rx_descriptor fields to
		 * make a single 256-bit write per mbuf
		 */
		/* check the structure matches expectations */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
				 offsetof(struct rte_mbuf, rearm_data) + 8);
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
				 RTE_ALIGN(offsetof(struct rte_mbuf,
						    rearm_data),
					   16));
		/* build up data and do writes */
		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
			rearm6, rearm7;
		rearm6 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(mbuf_flags, 8),
					    0x04);
		rearm4 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(mbuf_flags, 4),
					    0x04);
		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
		rearm0 = _mm256_blend_epi32(mbuf_init,
					    _mm256_srli_si256(mbuf_flags, 4),
					    0x04);
		/* permute to add in the rx_descriptor e.g. rss fields */
		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);

		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
				    rearm6);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
				    rearm4);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
				    rearm2);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
				    rearm0);

		/* repeat for the odd mbufs */
		const __m256i odd_flags =
			_mm256_castsi128_si256
				(_mm256_extracti128_si256(mbuf_flags, 1));
		rearm7 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(odd_flags, 8),
					    0x04);
		rearm5 = _mm256_blend_epi32(mbuf_init,
					    _mm256_slli_si256(odd_flags, 4),
					    0x04);
		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
		rearm1 = _mm256_blend_epi32(mbuf_init,
					    _mm256_srli_si256(odd_flags, 4),
					    0x04);
		/* since odd mbufs are already in hi 128-bits use blend */
		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
		/* again write to mbufs */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
				    rearm7);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
				    rearm5);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
				    rearm3);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
				    rearm1);

		/* extract and record EOP bit */
		if (split_packet) {
			const __m128i eop_mask =
				_mm_set1_epi16(1 << ICE_RX_DESC_STATUS_EOF_S);
			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
								     eop_check);
			/* pack status bits into a single 128-bit register */
			const __m128i eop_bits =
				_mm_packus_epi32
					(_mm256_castsi256_si128(eop_bits256),
					 _mm256_extractf128_si256(eop_bits256,
								  1));
			/*
			 * flip bits, and mask out the EOP bit, which is now
			 * a split-packet bit i.e. !EOP, rather than EOP one.
			 */
			__m128i split_bits = _mm_andnot_si128(eop_bits,
							      eop_mask);
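			/*
			 * andnot computes (~eop_bits) & eop_mask, so a set
			 * bit here now means "not end of packet", i.e. the
			 * buffer belongs to a packet that continues in the
			 * next descriptor.
			 */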
			/*
			 * eop bits are out of order, so we need to shuffle them
			 * back into order again. In doing so, only use low 8
			 * bits, which acts like another pack instruction
			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
			 * [Since we use epi8, the 16-bit positions are
			 * multiplied by 2 in the eop_shuffle value.]
			 */
			__m128i eop_shuffle =
				_mm_set_epi8(/* zero hi 64b */
					     0xFF, 0xFF, 0xFF, 0xFF,
					     0xFF, 0xFF, 0xFF, 0xFF,
					     /* move values to lo 64b */
					     8, 0, 10, 2,
					     12, 4, 14, 6);
			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
			*(uint64_t *)split_packet =
				_mm_cvtsi128_si64(split_bits);
			split_packet += ICE_DESCS_PER_LOOP_AVX;
		}

		/* perform dd_check */
		status0_7 = _mm256_and_si256(status0_7, dd_check);
		status0_7 = _mm256_packs_epi32(status0_7,
					       _mm256_setzero_si256());
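
		/*
		 * After masking with dd_check, each completed descriptor
		 * contributes exactly one set bit, so popcounting the packed
		 * status from both 128-bit halves gives the number of packets
		 * finished in this group of 8; anything less ends the loop.
		 */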
		uint64_t burst = __builtin_popcountll
					(_mm_cvtsi128_si64
						(_mm256_extracti128_si256
							(status0_7, 1)));
		burst += __builtin_popcountll
				(_mm_cvtsi128_si64
					(_mm256_castsi256_si128(status0_7)));
		received += burst;
		if (burst != ICE_DESCS_PER_LOOP_AVX)
			break;
	}

	/* update tail pointers */
	rxq->rx_tail += received;
	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
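	/*
	 * Keep rx_tail even: the 16-byte-descriptor path uses aligned
	 * 32-byte loads when the tail is even (see avx_aligned above), so
	 * if an odd number of packets was received, hand one back and let
	 * the next burst start aligned.
	 */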
	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
		rxq->rx_tail--;
		received--;
	}
	rxq->rxrearm_nb += received;

	return received;
}

/*
 * Notice:
 * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
 */
uint16_t
ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
		       uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
}

/*
 * vPMD receive routine that reassembles single burst of 32 scattered packets
 * Notice:
 * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
 */
static uint16_t
ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
				  uint16_t nb_pkts)
{
	struct ice_rx_queue *rxq = rx_queue;
	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
						       split_flags);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (!rxq->pkt_first_seg &&
	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
	    split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble then */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + ice_rx_reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
					     &split_flags[i]);
}

/*
 * vPMD receive routine that reassembles scattered packets.
 * Main receive routine that can handle arbitrary burst sizes
 * Notice:
 * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
 */
uint16_t
ice_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
				 uint16_t nb_pkts)
{
	uint16_t retval = 0;

	while (nb_pkts > ICE_VPMD_RX_BURST) {
		uint16_t burst = ice_recv_scattered_burst_vec_avx2(rx_queue,
				rx_pkts + retval, ICE_VPMD_RX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < ICE_VPMD_RX_BURST)
			return retval;
	}
	return retval + ice_recv_scattered_burst_vec_avx2(rx_queue,
				rx_pkts + retval, nb_pkts);
}
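
/*
 * ice_vtx1() below writes a single 16-byte data descriptor: the low
 * quadword carries the buffer DMA address (buf_physaddr + data_off) and
 * the high quadword packs the descriptor type, the command flags and the
 * buffer length.
 */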
static inline void
ice_vtx1(volatile struct ice_tx_desc *txdp,
	 struct rte_mbuf *pkt, uint64_t flags)
{
	uint64_t high_qw =
		(ICE_TX_DESC_DTYPE_DATA |
		 ((uint64_t)flags << ICE_TXD_QW1_CMD_S) |
		 ((uint64_t)pkt->data_len << ICE_TXD_QW1_TX_BUF_SZ_S));

	__m128i descriptor = _mm_set_epi64x(high_qw,
				pkt->buf_physaddr + pkt->data_off);
	_mm_store_si128((__m128i *)txdp, descriptor);
}

static inline void
ice_vtx(volatile struct ice_tx_desc *txdp,
	struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
{
	const uint64_t hi_qw_tmpl = (ICE_TX_DESC_DTYPE_DATA |
			((uint64_t)flags << ICE_TXD_QW1_CMD_S));

	/* if unaligned on 32-byte boundary, do one to align */
	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
		ice_vtx1(txdp, *pkt, flags);
		nb_pkts--, txdp++, pkt++;
	}
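
	/*
	 * An ice_tx_desc is 16 bytes, so after at most one scalar write the
	 * descriptor pointer is 32-byte aligned and the 256-bit stores in
	 * the loop below land on aligned addresses.
	 */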
	/* do two at a time while possible, in bursts */
	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
		uint64_t hi_qw3 =
			hi_qw_tmpl |
			((uint64_t)pkt[3]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		uint64_t hi_qw2 =
			hi_qw_tmpl |
			((uint64_t)pkt[2]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		uint64_t hi_qw1 =
			hi_qw_tmpl |
			((uint64_t)pkt[1]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);
		uint64_t hi_qw0 =
			hi_qw_tmpl |
			((uint64_t)pkt[0]->data_len <<
			 ICE_TXD_QW1_TX_BUF_SZ_S);

		__m256i desc2_3 =
			_mm256_set_epi64x
				(hi_qw3,
				 pkt[3]->buf_physaddr + pkt[3]->data_off,
				 hi_qw2,
				 pkt[2]->buf_physaddr + pkt[2]->data_off);
		__m256i desc0_1 =
			_mm256_set_epi64x
				(hi_qw1,
				 pkt[1]->buf_physaddr + pkt[1]->data_off,
				 hi_qw0,
				 pkt[0]->buf_physaddr + pkt[0]->data_off);
		_mm256_store_si256((void *)(txdp + 2), desc2_3);
		_mm256_store_si256((void *)txdp, desc0_1);
	}

	/* do any last ones */
	while (nb_pkts) {
		ice_vtx1(txdp, *pkt, flags);
		txdp++, pkt++, nb_pkts--;
	}
}

static inline uint16_t
ice_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
			      uint16_t nb_pkts)
{
	struct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;
	volatile struct ice_tx_desc *txdp;
	struct ice_tx_entry *txep;
	uint16_t n, nb_commit, tx_id;
	uint64_t flags = ICE_TD_CMD;
	uint64_t rs = ICE_TX_DESC_CMD_RS | ICE_TD_CMD;

	/* crossing the tx_rs_thresh boundary is not allowed */
	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

	if (txq->nb_tx_free < txq->tx_free_thresh)
		ice_tx_free_bufs(txq);

	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->tx_tail;
	txdp = &txq->tx_ring[tx_id];
	txep = &txq->sw_ring[tx_id];

	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);

	n = (uint16_t)(txq->nb_tx_desc - tx_id);
	if (nb_commit >= n) {
		ice_tx_backlog_entry(txep, tx_pkts, n);

		ice_vtx(txdp, tx_pkts, n - 1, flags);
		tx_pkts += (n - 1);
		txdp += (n - 1);

		ice_vtx1(txdp, *tx_pkts++, rs);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = &txq->tx_ring[tx_id];
		txep = &txq->sw_ring[tx_id];
	}

	ice_tx_backlog_entry(txep, tx_pkts, nb_commit);

	ice_vtx(txdp, tx_pkts, nb_commit, flags);

	tx_id = (uint16_t)(tx_id + nb_commit);
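	/*
	 * Once the tail has moved past tx_next_rs, request a completion
	 * report by setting the RS bit on that descriptor, then push
	 * tx_next_rs forward by another tx_rs_thresh descriptors.
	 */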
	if (tx_id > txq->tx_next_rs) {
		txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
			rte_cpu_to_le_64(((uint64_t)ICE_TX_DESC_CMD_RS) <<
					 ICE_TXD_QW1_CMD_S);
		txq->tx_next_rs =
			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
	}

	txq->tx_tail = tx_id;

	ICE_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);

	return nb_pkts;
}

uint16_t
ice_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
		       uint16_t nb_pkts)
{
	uint16_t nb_tx = 0;
	struct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
		ret = ice_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx],
						    num);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}