/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2013-2015 Intel Corporation
 */

#include <inttypes.h>

#include <rte_ethdev_driver.h>
#include <rte_common.h>
#include "fm10k.h"
#include "base/fm10k_type.h"

#include <tmmintrin.h>

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif

static void
fm10k_reset_tx_queue(struct fm10k_tx_queue *txq);

/* Handling the offload flags (olflags) field takes computation
 * time when receiving packets. Therefore we provide a flag to disable
 * the processing of the olflags field when it is not needed. This
 * gives improved performance, at the cost of losing the offload info
 * in the received packet.
 */
#ifdef RTE_LIBRTE_FM10K_RX_OLFLAGS_ENABLE

/* VLAN present flag shift */
#define VP_SHIFT (2)
/* L3 type shift */
#define L3TYPE_SHIFT (4)
/* L4 type shift */
#define L4TYPE_SHIFT (7)
/* HBO flag shift */
#define HBOFLAG_SHIFT (10)
/* RXE flag shift */
#define RXEFLAG_SHIFT (13)
/* IPE/L4E flag shift */
#define L3L4EFLAG_SHIFT (14)
/* shift PKT_RX_L4_CKSUM_GOOD into one byte by 1 bit */
#define CKSUM_SHIFT (1)

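/* The shift values above index bits within the low 16 status bits of each
 * Rx descriptor as unpacked below: VP at bit 2, L3 type at bits 4-6
 * (mask 0x0070), L4 type at bits 7-9 (mask 0x0380), HBO at bit 10, RXE at
 * bit 13 and the IPE/L4E checksum-error bits from bit 14 up. Positions
 * are inferred from the masks and shuffles in this file, not from a
 * datasheet.
 */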
static inline void
fm10k_desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
	__m128i ptype0, ptype1, vtag0, vtag1, eflag0, eflag1, cksumflag;
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	const __m128i pkttype_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);

	/* mask everything except rss type */
	const __m128i rsstype_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x000F, 0x000F, 0x000F, 0x000F);

	/* mask for HBO and RXE flags */
	const __m128i rxe_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x0001, 0x0001, 0x0001, 0x0001);

	/* mask the lower byte of ol_flags */
	const __m128i ol_flags_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x00FF, 0x00FF, 0x00FF, 0x00FF);

	const __m128i l3l4cksum_flag = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			(PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> CKSUM_SHIFT,
			(PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> CKSUM_SHIFT,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> CKSUM_SHIFT,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> CKSUM_SHIFT);

	const __m128i rxe_flag = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0);

	/* map rss type to rss hash flag */
	const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

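	/* Implementation note: each _mm_shuffle_epi8() below treats its
	 * first operand as a 16-entry byte lookup table indexed by each
	 * masked (and shifted) descriptor byte. E.g. with rss_flags, an RSS
	 * type of 1 selects byte 1 (PKT_RX_RSS_HASH) while type 0 selects
	 * byte 0 (no flag), converting types to flags without branches.
	 */
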
	/* Calculate RSS_hash and Vlan fields */
	ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
	ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
	vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
	vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

	ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
	ptype0 = _mm_and_si128(ptype0, rsstype_msk);
	ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

	vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
	eflag0 = vtag1;
	cksumflag = vtag1;
	vtag1 = _mm_srli_epi16(vtag1, VP_SHIFT);
	vtag1 = _mm_and_si128(vtag1, pkttype_msk);

	vtag1 = _mm_or_si128(ptype0, vtag1);

	/* Process err flags, simply set RECIP_ERR bit if HBO/IXE is set */
	eflag1 = _mm_srli_epi16(eflag0, RXEFLAG_SHIFT);
	eflag0 = _mm_srli_epi16(eflag0, HBOFLAG_SHIFT);
	eflag0 = _mm_or_si128(eflag0, eflag1);
	eflag0 = _mm_and_si128(eflag0, rxe_msk);
	eflag0 = _mm_shuffle_epi8(rxe_flag, eflag0);

	vtag1 = _mm_or_si128(eflag0, vtag1);

	/* Process L4/L3 checksum error flags */
	cksumflag = _mm_srli_epi16(cksumflag, L3L4EFLAG_SHIFT);
	cksumflag = _mm_shuffle_epi8(l3l4cksum_flag, cksumflag);

	/* clean the higher byte and shift back the flag bits */
	cksumflag = _mm_and_si128(cksumflag, ol_flags_msk);
	cksumflag = _mm_slli_epi16(cksumflag, CKSUM_SHIFT);
	vtag1 = _mm_or_si128(cksumflag, vtag1);

	vol.dword = _mm_cvtsi128_si64(vtag1);

	rx_pkts[0]->ol_flags = vol.e[0];
	rx_pkts[1]->ol_flags = vol.e[1];
	rx_pkts[2]->ol_flags = vol.e[2];
	rx_pkts[3]->ol_flags = vol.e[3];
}

/* @note: When this function is changed, make corresponding change to
 * fm10k_dev_supported_ptypes_get().
 */
static inline void
fm10k_desc_to_pktype_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
	__m128i l3l4type0, l3l4type1, l3type, l4type;
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	/* L3 pkt type mask, Bit4 to Bit6 */
	const __m128i l3type_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x0070, 0x0070, 0x0070, 0x0070);

	/* L4 pkt type mask, Bit7 to Bit9 */
	const __m128i l4type_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x0380, 0x0380, 0x0380, 0x0380);

	/* convert RRC l3 type to mbuf format */
	const __m128i l3type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
			0, 0, 0, RTE_PTYPE_L3_IPV6_EXT,
			RTE_PTYPE_L3_IPV6, RTE_PTYPE_L3_IPV4_EXT,
			RTE_PTYPE_L3_IPV4, 0);

	/* Convert RRC l4 type to mbuf format. The l4type_flags entries are
	 * shifted right by 8 bits so that each fits into one byte.
	 */
	const __m128i l4type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
			RTE_PTYPE_TUNNEL_GENEVE >> 8,
			RTE_PTYPE_TUNNEL_NVGRE >> 8,
			RTE_PTYPE_TUNNEL_VXLAN >> 8,
			RTE_PTYPE_TUNNEL_GRE >> 8,
			RTE_PTYPE_L4_UDP >> 8,
			RTE_PTYPE_L4_TCP >> 8,
			0);
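	/* Implementation note: L4/tunnel ptype constants such as
	 * RTE_PTYPE_L4_TCP (0x0100) do not fit in the byte-wide shuffle
	 * table, hence the >> 8 above; _mm_slli_epi16(l4type, 8) after the
	 * lookup restores the original values.
	 */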

	l3l4type0 = _mm_unpacklo_epi16(descs[0], descs[1]);
	l3l4type1 = _mm_unpacklo_epi16(descs[2], descs[3]);
	l3l4type0 = _mm_unpacklo_epi32(l3l4type0, l3l4type1);

	l3type = _mm_and_si128(l3l4type0, l3type_msk);
	l4type = _mm_and_si128(l3l4type0, l4type_msk);

	l3type = _mm_srli_epi16(l3type, L3TYPE_SHIFT);
	l4type = _mm_srli_epi16(l4type, L4TYPE_SHIFT);

	l3type = _mm_shuffle_epi8(l3type_flags, l3type);
	/* l4type_flags entries were pre-shifted right by 8 bits; the
	 * looked-up values are shifted back left below.
	 */
	l4type = _mm_shuffle_epi8(l4type_flags, l4type);

	l4type = _mm_slli_epi16(l4type, 8);
	l3l4type0 = _mm_or_si128(l3type, l4type);
	vol.dword = _mm_cvtsi128_si64(l3l4type0);

	rx_pkts[0]->packet_type = vol.e[0];
	rx_pkts[1]->packet_type = vol.e[1];
	rx_pkts[2]->packet_type = vol.e[2];
	rx_pkts[3]->packet_type = vol.e[3];
}
#else
#define fm10k_desc_to_olflags_v(desc, rx_pkts) do {} while (0)
#define fm10k_desc_to_pktype_v(desc, rx_pkts) do {} while (0)
#endif

int __attribute__((cold))
fm10k_rx_vec_condition_check(struct rte_eth_dev *dev)
{
#ifndef RTE_LIBRTE_IEEE1588
	struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
	struct rte_fdir_conf *fconf = &dev->data->dev_conf.fdir_conf;

#ifndef RTE_LIBRTE_FM10K_RX_OLFLAGS_ENABLE
	/* without Rx ol_flags, no VP flag report */
	if (rxmode->offloads & DEV_RX_OFFLOAD_VLAN_EXTEND)
		return -1;
#endif

	/* no fdir support */
	if (fconf->mode != RTE_FDIR_MODE_NONE)
		return -1;

	/* no header split support */
	if (rxmode->offloads & DEV_RX_OFFLOAD_HEADER_SPLIT)
		return -1;

	return 0;
#else
	RTE_SET_USED(dev);
	return -1;
#endif
}

int __attribute__((cold))
fm10k_rxq_vec_setup(struct fm10k_rx_queue *rxq)
{
	uintptr_t p;
	struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */

	mb_def.nb_segs = 1;
	/* data_off will be adjusted after a new mbuf is allocated, for
	 * 512-byte alignment.
	 */
	mb_def.data_off = RTE_PKTMBUF_HEADROOM;
	mb_def.port = rxq->port_id;
	rte_mbuf_refcnt_set(&mb_def, 1);

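	/* The 64 bits captured below span the mbuf fields grouped under
	 * rearm_data (data_off, refcnt, nb_segs, port, per the rte_mbuf
	 * layout at the time of writing), so the rearm path can
	 * re-initialize all of them with a single 8-byte store per mbuf.
	 */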
	/* prevent compiler reordering: rearm_data covers previous fields */
	rte_compiler_barrier();
	p = (uintptr_t)&mb_def.rearm_data;
	rxq->mbuf_initializer = *(uint64_t *)p;
	return 0;
}

static inline void
fm10k_rxq_rearm(struct fm10k_rx_queue *rxq)
{
	int i;
	uint16_t rx_id;
	volatile union fm10k_rx_desc *rxdp;
	struct rte_mbuf **mb_alloc = &rxq->sw_ring[rxq->rxrearm_start];
	struct rte_mbuf *mb0, *mb1;
	__m128i head_off = _mm_set_epi64x(
			RTE_PKTMBUF_HEADROOM + FM10K_RX_DATABUF_ALIGN - 1,
			RTE_PKTMBUF_HEADROOM + FM10K_RX_DATABUF_ALIGN - 1);
	__m128i dma_addr0, dma_addr1;
	/* Rx buffers need to be aligned on a 512-byte boundary */
	const __m128i hba_msk = _mm_set_epi64x(0,
			UINT64_MAX - FM10K_RX_DATABUF_ALIGN + 1);
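	/* head_off and hba_msk together implement a round-up:
	 * (addr + HEADROOM + ALIGN - 1) & ~(ALIGN - 1) yields the first
	 * FM10K_RX_DATABUF_ALIGN boundary at or past the headroom, while
	 * the zero in the mask's upper 64-bit lane clears the descriptor's
	 * header buffer address, as noted below.
	 */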

	rxdp = rxq->hw_ring + rxq->rxrearm_start;

	/* Pull 'n' more MBUFs into the software ring */
	if (rte_mempool_get_bulk(rxq->mp,
				(void *)mb_alloc,
				RTE_FM10K_RXQ_REARM_THRESH) < 0) {
		dma_addr0 = _mm_setzero_si128();
		/* Clean up all the HW/SW ring content */
		for (i = 0; i < RTE_FM10K_RXQ_REARM_THRESH; i++) {
			mb_alloc[i] = &rxq->fake_mbuf;
			_mm_store_si128((__m128i *)&rxdp[i].q,
					dma_addr0);
		}

		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
			RTE_FM10K_RXQ_REARM_THRESH;
		return;
	}

	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
	for (i = 0; i < RTE_FM10K_RXQ_REARM_THRESH; i += 2, mb_alloc += 2) {
		__m128i vaddr0, vaddr1;
		uintptr_t p0, p1;

		mb0 = mb_alloc[0];
		mb1 = mb_alloc[1];

		/* Flush mbuf with pkt template.
		 * Data to be rearmed is 6 bytes long.
		 */
		p0 = (uintptr_t)&mb0->rearm_data;
		*(uint64_t *)p0 = rxq->mbuf_initializer;
		p1 = (uintptr_t)&mb1->rearm_data;
		*(uint64_t *)p1 = rxq->mbuf_initializer;

		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
				offsetof(struct rte_mbuf, buf_addr) + 8);
		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);

		/* convert pa to dma_addr hdr/data */
		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);

		/* add headroom to pa values */
		dma_addr0 = _mm_add_epi64(dma_addr0, head_off);
		dma_addr1 = _mm_add_epi64(dma_addr1, head_off);

		/* Do 512-byte alignment to satisfy the HW requirement and,
		 * at the same time, set the Header Buffer Address to zero.
		 */
		dma_addr0 = _mm_and_si128(dma_addr0, hba_msk);
		dma_addr1 = _mm_and_si128(dma_addr1, hba_msk);

		/* flush desc with pa dma_addr */
		_mm_store_si128((__m128i *)&rxdp++->q, dma_addr0);
		_mm_store_si128((__m128i *)&rxdp++->q, dma_addr1);

		/* enforce 512B alignment on default Rx virtual addresses */
		mb0->data_off = (uint16_t)(RTE_PTR_ALIGN((char *)mb0->buf_addr
				+ RTE_PKTMBUF_HEADROOM, FM10K_RX_DATABUF_ALIGN)
				- (char *)mb0->buf_addr);
		mb1->data_off = (uint16_t)(RTE_PTR_ALIGN((char *)mb1->buf_addr
				+ RTE_PKTMBUF_HEADROOM, FM10K_RX_DATABUF_ALIGN)
				- (char *)mb1->buf_addr);
	}

	rxq->rxrearm_start += RTE_FM10K_RXQ_REARM_THRESH;
	if (rxq->rxrearm_start >= rxq->nb_desc)
		rxq->rxrearm_start = 0;

	rxq->rxrearm_nb -= RTE_FM10K_RXQ_REARM_THRESH;

	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
			(rxq->nb_desc - 1) : (rxq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	FM10K_PCI_REG_WRITE(rxq->tail_ptr, rx_id);
}

void __attribute__((cold))
fm10k_rx_queue_release_mbufs_vec(struct fm10k_rx_queue *rxq)
{
	const unsigned int mask = rxq->nb_desc - 1;
	unsigned int i;

	if (rxq->sw_ring == NULL || rxq->rxrearm_nb >= rxq->nb_desc)
		return;

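	/* Entries between next_dd and rxrearm_start still hold driver-owned
	 * mbufs; the (i + 1) & mask walk below assumes nb_desc is a power
	 * of two, as does the mask derivation above.
	 */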
	/* free all mbufs that are valid in the ring */
	for (i = rxq->next_dd; i != rxq->rxrearm_start; i = (i + 1) & mask)
		rte_pktmbuf_free_seg(rxq->sw_ring[i]);
	rxq->rxrearm_nb = rxq->nb_desc;

	/* set all entries to NULL */
	memset(rxq->sw_ring, 0, sizeof(rxq->sw_ring[0]) * rxq->nb_desc);
}

static inline uint16_t
fm10k_recv_raw_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union fm10k_rx_desc *rxdp;
	struct rte_mbuf **mbufp;
	uint16_t nb_pkts_recd;
	int pos;
	struct fm10k_rx_queue *rxq = rx_queue;
	uint64_t var;
	__m128i shuf_msk;
	__m128i dd_check, eop_check;
	uint16_t next_dd;

	next_dd = rxq->next_dd;

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->hw_ring + next_dd;

	rte_prefetch0(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_FM10K_RXQ_REARM_THRESH)
		fm10k_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->d.staterr & FM10K_RXD_STATUS_DD))
		return 0;

	/* Vector Rx processes 4 packets at a time; strip the unaligned
	 * tail in case nb_pkts is not a multiple of 4.
	 */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_FM10K_DESCS_PER_LOOP);

	/* 4 packets DD mask */
	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

	/* 4 packets EOP mask */
	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

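	/* Once four 32-bit status words are gathered into one register
	 * below, dd_check selects bit 0 of each word (the DD bit, cf.
	 * FM10K_RXD_STATUS_DD above) and eop_check selects bit 1, taken to
	 * be the end-of-packet bit based on its use for split packet
	 * tracking further down.
	 */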
	/* mask to shuffle from desc. to mbuf */
	shuf_msk = _mm_set_epi8(
			7, 6, 5, 4,	/* octet 4~7, 32bits rss */
			15, 14,		/* octet 14~15, low 16 bits vlan_macip */
			13, 12,		/* octet 12~13, 16 bits data_len */
			0xFF, 0xFF,	/* skip high 16 bits pkt_len, zero out */
			13, 12,		/* octet 12~13, low 16 bits pkt_len */
			0xFF, 0xFF,	/* skip high 16 bits pkt_type */
			0xFF, 0xFF	/* Skip pkt_type field in shuffle operation */
			);
	/*
	 * Compile-time verify the shuffle mask
	 * NOTE: some field positions already verified above, but duplicated
	 * here for completeness in case of future modifications.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	mbufp = &rxq->sw_ring[next_dd];

	/* A. load 4 packets in one loop
	 * [A*. mask out 4 unused dirty fields in desc]
	 * B. copy 4 mbuf pointers from swring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_FM10K_DESCS_PER_LOOP,
			rxdp += RTE_FM10K_DESCS_PER_LOOP) {
		__m128i descs0[RTE_FM10K_DESCS_PER_LOOP];
		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
		__m128i mbp1;
		/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
#if defined(RTE_ARCH_X86_64)
		__m128i mbp2;
#endif

		/* B.1 load 2 (64 bit) or 4 (32 bit) mbuf pointers */
		mbp1 = _mm_loadu_si128((__m128i *)&mbufp[pos]);

		/* Read desc statuses backwards to avoid a race: the NIC
		 * writes descriptors in order, so once the last-loaded
		 * descriptor shows DD, the earlier ones are complete too.
		 */
		/* A.1 load 4 pkts desc */
		descs0[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));
		rte_compiler_barrier();

		/* B.2 copy 2 64 bit or 4 32 bit mbuf pointers into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

#if defined(RTE_ARCH_X86_64)
		/* B.1 load 2 64 bit mbuf pointers */
		mbp2 = _mm_loadu_si128((__m128i *)&mbufp[pos+2]);
#endif

		descs0[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
		rte_compiler_barrier();
		/* A.1 load the remaining descs */
		descs0[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
		rte_compiler_barrier();
		descs0[0] = _mm_loadu_si128((__m128i *)(rxdp));

#if defined(RTE_ARCH_X86_64)
		/* B.2 copy 2 mbuf pointers into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
#endif

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = _mm_shuffle_epi8(descs0[3], shuf_msk);
		pkt_mb3 = _mm_shuffle_epi8(descs0[2], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs0[3], descs0[2]);
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = _mm_unpackhi_epi32(descs0[1], descs0[0]);

		/* set ol_flags with vlan packet type */
		fm10k_desc_to_olflags_v(descs0, &rx_pkts[pos]);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = _mm_shuffle_epi8(descs0[1], shuf_msk);
		pkt_mb1 = _mm_shuffle_epi8(descs0[0], shuf_msk);

		/* C.2 get 4 pkts staterr value */
		zero = _mm_xor_si128(dd_check, dd_check);
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
				pkt_mb4);
		_mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
				pkt_mb3);

		/* C* extract and record EOP bit */
		if (split_packet) {
			__m128i eop_shuf_mask = _mm_set_epi8(
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0x04, 0x0C, 0x00, 0x08
					);

			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the count
			 * of DD bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit.
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += RTE_FM10K_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
				pkt_mb2);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				pkt_mb1);

		fm10k_desc_to_pktype_v(descs0, &rx_pkts[pos]);

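		/* _mm_packs_epi32 above squeezed the four DD flags into the
		 * low 64 bits, so the popcount below counts received
		 * packets; a count short of RTE_FM10K_DESCS_PER_LOOP means
		 * a descriptor without DD was hit and scanning must stop.
		 */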
		/* C.4 calc available number of desc */
		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != RTE_FM10K_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->next_dd = (uint16_t)(rxq->next_dd + nb_pkts_recd);
	rxq->next_dd = (uint16_t)(rxq->next_dd & (rxq->nb_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}

/* vPMD receive routine
 *
 * Notice:
 * - doesn't support ol_flags for rss and csum err
 */
uint16_t
fm10k_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	return fm10k_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}

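/* split_flags[i] is set non-zero by fm10k_recv_raw_pkts_vec() when buffer
 * i lacks the EOP bit, i.e. the packet continues in the next buffer; the
 * loop below stitches such runs of buffers back into a single
 * multi-segment mbuf chain.
 */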
static inline uint16_t
fm10k_reassemble_packets(struct fm10k_rx_queue *rxq,
		struct rte_mbuf **rx_bufs,
		uint16_t nb_bufs, uint8_t *split_flags)
{
	struct rte_mbuf *pkts[RTE_FM10K_MAX_RX_BURST]; /* finished pkts */
	struct rte_mbuf *start = rxq->pkt_first_seg;
	struct rte_mbuf *end = rxq->pkt_last_seg;
	unsigned int pkt_idx, buf_idx;

	for (buf_idx = 0, pkt_idx = 0; buf_idx < nb_bufs; buf_idx++) {
		if (end != NULL) {
			/* processing a split packet */
			end->next = rx_bufs[buf_idx];
			start->nb_segs++;
			start->pkt_len += rx_bufs[buf_idx]->data_len;
			end = end->next;

			if (!split_flags[buf_idx]) {
				/* it's the last packet of the set */
#ifdef RTE_LIBRTE_FM10K_RX_OLFLAGS_ENABLE
				start->hash = end->hash;
				start->ol_flags = end->ol_flags;
				start->packet_type = end->packet_type;
#endif
				pkts[pkt_idx++] = start;
				start = end = NULL;
			}
		} else {
			/* not processing a split packet */
			if (!split_flags[buf_idx]) {
				/* not a split packet, save and skip */
				pkts[pkt_idx++] = rx_bufs[buf_idx];
				continue;
			}
			end = start = rx_bufs[buf_idx];
		}
	}

	/* save the partial packet for next time */
	rxq->pkt_first_seg = start;
	rxq->pkt_last_seg = end;
	memcpy(rx_bufs, pkts, pkt_idx * (sizeof(*pkts)));
	return pkt_idx;
}

/*
 * vPMD receive routine that reassembles scattered packets
 *
 * Notice:
 * - doesn't support ol_flags for rss and csum err
 * - if nb_pkts > RTE_FM10K_MAX_RX_BURST, only RTE_FM10K_MAX_RX_BURST
 *   descriptors are scanned for DD bits
 */
uint16_t
fm10k_recv_scattered_pkts_vec(void *rx_queue,
		struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	struct fm10k_rx_queue *rxq = rx_queue;
	uint8_t split_flags[RTE_FM10K_MAX_RX_BURST] = {0};
	unsigned int i = 0;

	/* split_flags can only hold a max of RTE_FM10K_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_FM10K_MAX_RX_BURST);
	/* get some new buffers */
	uint16_t nb_bufs = fm10k_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
			split_flags);
	if (nb_bufs == 0)
		return 0;

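	/* Reading four 64-bit words covers all RTE_FM10K_MAX_RX_BURST
	 * (assumed to be 32, matching the size of split_flags) flags at
	 * once; any non-zero byte means at least one buffer lacked EOP.
	 */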
	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (rxq->pkt_first_seg == NULL &&
			split_fl64[0] == 0 && split_fl64[1] == 0 &&
			split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	if (rxq->pkt_first_seg == NULL) {
		/* find the first split flag and only reassemble from there */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
	}
	return i + fm10k_reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
			&split_flags[i]);
}

static const struct fm10k_txq_ops vec_txq_ops = {
	.reset = fm10k_reset_tx_queue,
};

void __attribute__((cold))
fm10k_txq_vec_setup(struct fm10k_tx_queue *txq)
{
	txq->ops = &vec_txq_ops;
}

int __attribute__((cold))
fm10k_tx_vec_condition_check(struct fm10k_tx_queue *txq)
{
	/* Vector TX can't offload any features yet */
	if (txq->offloads != 0)
		return -1;

	if (txq->tx_ftag_en)
		return -1;

	return 0;
}

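/* The high qword of the descriptor built in vtx1() packs flags into bits
 * 56-63, vlan_tci into bits 16-31 and data_len into bits 0-15; the low
 * qword carries the buffer DMA address. The whole 16-byte descriptor is
 * written with a single aligned store.
 */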
static inline void
vtx1(volatile struct fm10k_tx_desc *txdp,
		struct rte_mbuf *pkt, uint64_t flags)
{
	__m128i descriptor = _mm_set_epi64x(flags << 56 |
			pkt->vlan_tci << 16 | pkt->data_len,
			MBUF_DMA_ADDR(pkt));
	_mm_store_si128((__m128i *)txdp, descriptor);
}

static inline void
vtx(volatile struct fm10k_tx_desc *txdp,
		struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
{
	int i;

	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
		vtx1(txdp, *pkt, flags);
}

static __rte_always_inline int
fm10k_tx_free_bufs(struct fm10k_tx_queue *txq)
{
	struct rte_mbuf **txep;
	uint8_t flags;
	uint32_t n;
	uint32_t i;
	int nb_free = 0;
	struct rte_mbuf *m, *free[RTE_FM10K_TX_MAX_FREE_BUF_SZ];

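	/* Completions are harvested rs_thresh descriptors at a time, gated
	 * on the hardware-set DONE flag of the threshold descriptor; freed
	 * mbufs from the same mempool are batched into one
	 * rte_mempool_put_bulk() call to amortize its cost.
	 */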
	/* check DD bit on threshold descriptor */
	flags = txq->hw_ring[txq->next_dd].flags;
	if (!(flags & FM10K_TXD_FLAG_DONE))
		return 0;

	n = txq->rs_thresh;

	/* First buffer to free from S/W ring is at index
	 * next_dd - (rs_thresh - 1)
	 */
	txep = &txq->sw_ring[txq->next_dd - (n - 1)];
	m = rte_pktmbuf_prefree_seg(txep[0]);
	if (likely(m != NULL)) {
		free[0] = m;
		nb_free = 1;
		for (i = 1; i < n; i++) {
			m = rte_pktmbuf_prefree_seg(txep[i]);
			if (likely(m != NULL)) {
				if (likely(m->pool == free[0]->pool))
					free[nb_free++] = m;
				else {
					rte_mempool_put_bulk(free[0]->pool,
							(void *)free, nb_free);
					free[0] = m;
					nb_free = 1;
				}
			}
		}
		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
	} else {
		for (i = 1; i < n; i++) {
			m = rte_pktmbuf_prefree_seg(txep[i]);
			if (m != NULL)
				rte_mempool_put(m->pool, m);
		}
	}

	/* buffers were freed, update counters */
	txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
	txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);
	if (txq->next_dd >= txq->nb_desc)
		txq->next_dd = (uint16_t)(txq->rs_thresh - 1);

	return txq->rs_thresh;
}

static __rte_always_inline void
tx_backlog_entry(struct rte_mbuf **txep,
		struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	int i;

	for (i = 0; i < (int)nb_pkts; ++i)
		txep[i] = tx_pkts[i];
}

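/* When a burst would run past the end of the ring it is written in two
 * contiguous chunks, with an RS (report status) flag forced on the last
 * descriptor before the wrap; otherwise RS is set on every rs_thresh-th
 * descriptor so that fm10k_tx_free_bufs() can detect completed batches.
 */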
uint16_t
fm10k_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
		uint16_t nb_pkts)
{
	struct fm10k_tx_queue *txq = (struct fm10k_tx_queue *)tx_queue;
	volatile struct fm10k_tx_desc *txdp;
	struct rte_mbuf **txep;
	uint16_t n, nb_commit, tx_id;
	uint64_t flags = FM10K_TXD_FLAG_LAST;
	uint64_t rs = FM10K_TXD_FLAG_RS | FM10K_TXD_FLAG_LAST;
	int i;

	/* crossing the rs_thresh boundary is not allowed */
	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);

	if (txq->nb_free < txq->free_thresh)
		fm10k_tx_free_bufs(txq);

	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->next_free;
	txdp = &txq->hw_ring[tx_id];
	txep = &txq->sw_ring[tx_id];

	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);

	n = (uint16_t)(txq->nb_desc - tx_id);
	if (nb_commit >= n) {
		tx_backlog_entry(txep, tx_pkts, n);

		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
			vtx1(txdp, *tx_pkts, flags);

		vtx1(txdp, *tx_pkts++, rs);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = &(txq->hw_ring[tx_id]);
		txep = &txq->sw_ring[tx_id];
	}

	tx_backlog_entry(txep, tx_pkts, nb_commit);

	vtx(txdp, tx_pkts, nb_commit, flags);

	tx_id = (uint16_t)(tx_id + nb_commit);
	if (tx_id > txq->next_rs) {
		txq->hw_ring[txq->next_rs].flags |= FM10K_TXD_FLAG_RS;
		txq->next_rs = (uint16_t)(txq->next_rs + txq->rs_thresh);
	}

	txq->next_free = tx_id;

	FM10K_PCI_REG_WRITE(txq->tail_ptr, txq->next_free);

	return nb_pkts;
}

static void __attribute__((cold))
fm10k_reset_tx_queue(struct fm10k_tx_queue *txq)
{
	static const struct fm10k_tx_desc zeroed_desc = {0};
	struct rte_mbuf **txe = txq->sw_ring;
	uint16_t i;

	/* Zero out HW ring memory */
	for (i = 0; i < txq->nb_desc; i++)
		txq->hw_ring[i] = zeroed_desc;

	/* Initialize SW ring entries */
	for (i = 0; i < txq->nb_desc; i++)
		txe[i] = NULL;

	txq->next_dd = (uint16_t)(txq->rs_thresh - 1);
	txq->next_rs = (uint16_t)(txq->rs_thresh - 1);

	txq->next_free = 0;
	txq->nb_used = 0;
	/* Always allow 1 descriptor to be un-allocated to avoid
	 * a H/W race condition
	 */
	txq->nb_free = (uint16_t)(txq->nb_desc - 1);
	FM10K_PCI_REG_WRITE(txq->tail_ptr, 0);
}