/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2016 Intel Corporation
 */

#include <rte_interrupts.h>
#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_log.h>
#include <rte_debug.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_launch.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_atomic.h>
#include <rte_branch_prediction.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_ether.h>
#include <rte_ethdev_driver.h>
#include <rte_prefetch.h>
#include <rte_ip.h>
#include <rte_udp.h>
#include <rte_tcp.h>
#include <rte_net.h>
#include <rte_string_fns.h>

#include "e1000_logs.h"
#include "base/e1000_api.h"
#include "e1000_ethdev.h"
#include "base/e1000_osdep.h"
#define E1000_TXD_VLAN_SHIFT	16

#define E1000_RXDCTL_GRAN	0x01000000 /* RXDCTL Granularity */

#define E1000_TX_OFFLOAD_MASK ( \
		PKT_TX_IPV6 |           \
		PKT_TX_IPV4 |           \
		PKT_TX_IP_CKSUM |       \
		PKT_TX_L4_MASK |        \
		PKT_TX_VLAN_PKT)

#define E1000_TX_OFFLOAD_NOTSUP_MASK \
		(PKT_TX_OFFLOAD_MASK ^ E1000_TX_OFFLOAD_MASK)
/**
 * Structure associated with each descriptor of the RX ring of a RX queue.
 */
struct em_rx_entry {
	struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
};
/**
 * Structure associated with each descriptor of the TX ring of a TX queue.
 */
struct em_tx_entry {
	struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
	uint16_t next_id;      /**< Index of next descriptor in ring. */
	uint16_t last_id;      /**< Index of last scattered descriptor. */
};
/**
 * Structure associated with each RX queue.
 */
struct em_rx_queue {
	struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
	volatile struct e1000_rx_desc *rx_ring; /**< RX ring virtual address. */
	uint64_t            rx_ring_phys_addr;  /**< RX ring DMA address. */
	volatile uint32_t   *rdt_reg_addr;      /**< RDT register address. */
	volatile uint32_t   *rdh_reg_addr;      /**< RDH register address. */
	struct em_rx_entry  *sw_ring;   /**< address of RX software ring. */
	struct rte_mbuf     *pkt_first_seg; /**< First segment of current packet. */
	struct rte_mbuf     *pkt_last_seg;  /**< Last segment of current packet. */
	uint64_t            offloads;   /**< Offloads of DEV_RX_OFFLOAD_* */
	uint16_t            nb_rx_desc; /**< number of RX descriptors. */
	uint16_t            rx_tail;    /**< current value of RDT register. */
	uint16_t            nb_rx_hold; /**< number of held free RX desc. */
	uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
	uint16_t            queue_id;   /**< RX queue index. */
	uint16_t            port_id;    /**< Device port identifier. */
	uint8_t             pthresh;    /**< Prefetch threshold register. */
	uint8_t             hthresh;    /**< Host threshold register. */
	uint8_t             wthresh;    /**< Write-back threshold register. */
	uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
};
/**
 * Hardware context number
 */
enum {
	EM_CTX_0    = 0, /**< CTX0 */
	EM_CTX_NUM  = 1, /**< CTX NUM */
};
/** Offload features */
union em_vlan_macip {
	uint32_t data;
	struct {
		uint16_t l3_len:9; /**< L3 (IP) Header Length. */
		uint16_t l2_len:7; /**< L2 (MAC) Header Length. */
		uint16_t vlan_tci;
		/**< VLAN Tag Control Identifier (CPU order). */
	} f;
};
/*
 * Compare mask for vlan_macip_len.data,
 * should be in sync with em_vlan_macip.f layout.
 */
#define TX_VLAN_CMP_MASK	0xFFFF0000  /**< VLAN length - 16-bits. */
#define TX_MAC_LEN_CMP_MASK	0x0000FE00  /**< MAC length - 7-bits. */
#define TX_IP_LEN_CMP_MASK	0x000001FF  /**< IP  length - 9-bits. */
/** MAC+IP length. */
#define TX_MACIP_LEN_CMP_MASK	(TX_MAC_LEN_CMP_MASK | TX_IP_LEN_CMP_MASK)
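
/*
 * Illustrative note (not part of the driver): with the bit-field layout
 * above on a typical little-endian build, the low 16 bits of
 * em_vlan_macip.data hold l3_len (bits 0-8) and l2_len (bits 9-15), and
 * the VLAN TCI sits in the upper 16 bits.  For a plain Ethernet/IPv4
 * packet:
 *
 *	union em_vlan_macip h;
 *	h.f.l2_len = 14;	// Ethernet header
 *	h.f.l3_len = 20;	// IPv4 header without options
 *	// h.data & TX_MACIP_LEN_CMP_MASK == (14 << 9) | 20 == 0x1C14
 *
 * so two packets with identical L2/L3 lengths compare equal under
 * TX_MACIP_LEN_CMP_MASK even if their VLAN tags differ.
 */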
/**
 * Structure to check whether a new context needs to be built
 */
struct em_ctx_info {
	uint64_t flags;             /**< ol_flags related to context build. */
	uint32_t cmp_mask;          /**< compare mask */
	union em_vlan_macip hdrlen; /**< L2 and L3 header lengths */
};
/**
 * Structure associated with each TX queue.
 */
struct em_tx_queue {
	volatile struct e1000_data_desc *tx_ring; /**< TX ring address */
	uint64_t               tx_ring_phys_addr; /**< TX ring DMA address. */
	struct em_tx_entry    *sw_ring; /**< virtual address of SW ring. */
	volatile uint32_t     *tdt_reg_addr; /**< Address of TDT register. */
	uint16_t               nb_tx_desc;   /**< number of TX descriptors. */
	uint16_t               tx_tail; /**< Current value of TDT register. */
	/**< Start freeing TX buffers if there are fewer free descriptors than
	     this value. */
	uint16_t               tx_free_thresh;
	/**< Number of TX descriptors to use before RS bit is set. */
	uint16_t               tx_rs_thresh;
	/** Number of TX descriptors used since RS bit was set. */
	uint16_t               nb_tx_used;
	/** Index to last TX descriptor to have been cleaned. */
	uint16_t               last_desc_cleaned;
	/** Total number of TX descriptors ready to be allocated. */
	uint16_t               nb_tx_free;
	uint16_t               queue_id; /**< TX queue index. */
	uint16_t               port_id;  /**< Device port identifier. */
	uint8_t                pthresh;  /**< Prefetch threshold register. */
	uint8_t                hthresh;  /**< Host threshold register. */
	uint8_t                wthresh;  /**< Write-back threshold register. */
	struct em_ctx_info ctx_cache;
	/**< Hardware context history.*/
	uint64_t               offloads; /**< offloads of DEV_TX_OFFLOAD_* */
};
#define	RTE_PMD_USE_PREFETCH

#ifdef RTE_PMD_USE_PREFETCH
#define rte_em_prefetch(p)	rte_prefetch0(p)
#else
#define rte_em_prefetch(p)	do {} while(0)
#endif

#ifdef RTE_PMD_PACKET_PREFETCH
#define rte_packet_prefetch(p)	rte_prefetch1(p)
#else
#define rte_packet_prefetch(p)	do {} while(0)
#endif

#ifndef DEFAULT_TX_FREE_THRESH
#define DEFAULT_TX_FREE_THRESH  32
#endif /* DEFAULT_TX_FREE_THRESH */

#ifndef DEFAULT_TX_RS_THRESH
#define DEFAULT_TX_RS_THRESH  32
#endif /* DEFAULT_TX_RS_THRESH */
/*********************************************************************
 *
 *  TX functions
 *
 **********************************************************************/
/*
 * Populates TX context descriptor.
 */
static inline void
em_set_xmit_ctx(struct em_tx_queue* txq,
		volatile struct e1000_context_desc *ctx_txd,
		uint64_t flags,
		union em_vlan_macip hdrlen)
{
	uint32_t cmp_mask, cmd_len;
	uint16_t ipcse, l2len;
	struct e1000_context_desc ctx;

	cmp_mask = 0;
	cmd_len = E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_C;

	l2len = hdrlen.f.l2_len;
	ipcse = (uint16_t)(l2len + hdrlen.f.l3_len);

	/* setup IPCS* fields */
	ctx.lower_setup.ip_fields.ipcss = (uint8_t)l2len;
	ctx.lower_setup.ip_fields.ipcso = (uint8_t)(l2len +
			offsetof(struct ipv4_hdr, hdr_checksum));

	/*
	 * When doing checksum or TCP segmentation with IPv6 headers,
	 * IPCSE field should be set to 0.
	 */
	if (flags & PKT_TX_IP_CKSUM) {
		ctx.lower_setup.ip_fields.ipcse =
			(uint16_t)rte_cpu_to_le_16(ipcse - 1);
		cmd_len |= E1000_TXD_CMD_IP;
		cmp_mask |= TX_MACIP_LEN_CMP_MASK;
	} else {
		ctx.lower_setup.ip_fields.ipcse = 0;
	}

	/* setup TUCS* fields */
	ctx.upper_setup.tcp_fields.tucss = (uint8_t)ipcse;
	ctx.upper_setup.tcp_fields.tucse = 0;

	switch (flags & PKT_TX_L4_MASK) {
	case PKT_TX_UDP_CKSUM:
		ctx.upper_setup.tcp_fields.tucso = (uint8_t)(ipcse +
				offsetof(struct udp_hdr, dgram_cksum));
		cmp_mask |= TX_MACIP_LEN_CMP_MASK;
		break;
	case PKT_TX_TCP_CKSUM:
		ctx.upper_setup.tcp_fields.tucso = (uint8_t)(ipcse +
				offsetof(struct tcp_hdr, cksum));
		cmd_len |= E1000_TXD_CMD_TCP;
		cmp_mask |= TX_MACIP_LEN_CMP_MASK;
		break;
	default:
		ctx.upper_setup.tcp_fields.tucso = 0;
	}

	ctx.cmd_and_length = rte_cpu_to_le_32(cmd_len);
	ctx.tcp_seg_setup.data = 0;

	*ctx_txd = ctx;

	txq->ctx_cache.flags = flags;
	txq->ctx_cache.cmp_mask = cmp_mask;
	txq->ctx_cache.hdrlen = hdrlen;
}
/*
 * Check which hardware context can be used. Use the existing match
 * or create a new context descriptor.
 */
static inline uint32_t
what_ctx_update(struct em_tx_queue *txq, uint64_t flags,
		union em_vlan_macip hdrlen)
{
	/* If match with the current context */
	if (likely (txq->ctx_cache.flags == flags &&
			((txq->ctx_cache.hdrlen.data ^ hdrlen.data) &
			txq->ctx_cache.cmp_mask) == 0))
		return EM_CTX_0;

	/* Mismatch, a new context descriptor is needed */
	return EM_CTX_NUM;
}
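
/*
 * Example (illustration only, not part of the driver): if the previous
 * packet requested PKT_TX_TCP_CKSUM with l2_len=14/l3_len=20, the context
 * it built is cached with cmp_mask = TX_MACIP_LEN_CMP_MASK.  A following
 * packet with the same flags and header lengths - even with a different
 * vlan_tci, which lies outside the compare mask - gets EM_CTX_0 and reuses
 * the cached context; changing l3_len to 24 (IPv4 options) returns
 * EM_CTX_NUM and forces a new context descriptor.
 */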
/* Reset transmit descriptors after they have been used */
static inline int
em_xmit_cleanup(struct em_tx_queue *txq)
{
	struct em_tx_entry *sw_ring = txq->sw_ring;
	volatile struct e1000_data_desc *txr = txq->tx_ring;
	uint16_t last_desc_cleaned = txq->last_desc_cleaned;
	uint16_t nb_tx_desc = txq->nb_tx_desc;
	uint16_t desc_to_clean_to;
	uint16_t nb_tx_to_clean;

	/* Determine the last descriptor needing to be cleaned */
	desc_to_clean_to = (uint16_t)(last_desc_cleaned + txq->tx_rs_thresh);
	if (desc_to_clean_to >= nb_tx_desc)
		desc_to_clean_to = (uint16_t)(desc_to_clean_to - nb_tx_desc);

	/* Check to make sure the last descriptor to clean is done */
	desc_to_clean_to = sw_ring[desc_to_clean_to].last_id;
	if (! (txr[desc_to_clean_to].upper.fields.status & E1000_TXD_STAT_DD))
	{
		PMD_TX_FREE_LOG(DEBUG,
				"TX descriptor %4u is not done "
				"(port=%d queue=%d)", desc_to_clean_to,
				txq->port_id, txq->queue_id);
		/* Failed to clean any descriptors, better luck next time */
		return -(1);
	}

	/* Figure out how many descriptors will be cleaned */
	if (last_desc_cleaned > desc_to_clean_to)
		nb_tx_to_clean = (uint16_t)((nb_tx_desc - last_desc_cleaned) +
							desc_to_clean_to);
	else
		nb_tx_to_clean = (uint16_t)(desc_to_clean_to -
						last_desc_cleaned);

	PMD_TX_FREE_LOG(DEBUG,
			"Cleaning %4u TX descriptors: %4u to %4u "
			"(port=%d queue=%d)", nb_tx_to_clean,
			last_desc_cleaned, desc_to_clean_to, txq->port_id,
			txq->queue_id);

	/*
	 * The last descriptor to clean is done, so that means all the
	 * descriptors from the last descriptor that was cleaned
	 * up to the last descriptor with the RS bit set
	 * are done. Only reset the threshold descriptor.
	 */
	txr[desc_to_clean_to].upper.fields.status = 0;

	/* Update the txq to reflect the last descriptor that was cleaned */
	txq->last_desc_cleaned = desc_to_clean_to;
	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + nb_tx_to_clean);

	/* No error */
	return 0;
}
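
/*
 * Worked example (illustration only): with nb_tx_desc=512, tx_rs_thresh=32
 * and last_desc_cleaned=500, desc_to_clean_to becomes 532 and wraps to 20.
 * If that descriptor reports DD, nb_tx_to_clean = (512 - 500) + 20 = 32
 * descriptors are returned to the free pool in a single call.
 */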
static inline uint32_t
tx_desc_cksum_flags_to_upper(uint64_t ol_flags)
{
	static const uint32_t l4_olinfo[2] = {0, E1000_TXD_POPTS_TXSM << 8};
	static const uint32_t l3_olinfo[2] = {0, E1000_TXD_POPTS_IXSM << 8};
	uint32_t tmp;

	tmp  = l4_olinfo[(ol_flags & PKT_TX_L4_MASK)  != PKT_TX_L4_NO_CKSUM];
	tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
	return tmp;
}
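
/*
 * Illustration (not part of the driver): the two lookup tables turn the
 * flag tests into a branchless translation.  For
 * ol_flags = PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM the result is
 * (E1000_TXD_POPTS_TXSM | E1000_TXD_POPTS_IXSM) << 8, which lands in the
 * POPTS field of the first data descriptor of the packet.
 */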
uint16_t
eth_em_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
		uint16_t nb_pkts)
{
	struct em_tx_queue *txq;
	struct em_tx_entry *sw_ring;
	struct em_tx_entry *txe, *txn;
	volatile struct e1000_data_desc *txr;
	volatile struct e1000_data_desc *txd;
	struct rte_mbuf     *tx_pkt;
	struct rte_mbuf     *m_seg;
	uint64_t buf_dma_addr;
	uint32_t popts_spec;
	uint32_t cmd_type_len;
	uint16_t slen;
	uint64_t ol_flags;
	uint16_t tx_id;
	uint16_t tx_last;
	uint16_t nb_tx;
	uint16_t nb_used;
	uint64_t tx_ol_req;
	uint32_t ctx;
	uint32_t new_ctx;
	union em_vlan_macip hdrlen;

	txq = tx_queue;
	sw_ring = txq->sw_ring;
	txr     = txq->tx_ring;
	tx_id   = txq->tx_tail;
	txe = &sw_ring[tx_id];

	/* Determine if the descriptor ring needs to be cleaned. */
	if (txq->nb_tx_free < txq->tx_free_thresh)
		em_xmit_cleanup(txq);

	/* TX loop */
	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
		new_ctx = 0;
		tx_pkt = *tx_pkts++;

		RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);

		/*
		 * Determine how many (if any) context descriptors
		 * are needed for offload functionality.
		 */
		ol_flags = tx_pkt->ol_flags;

		/* If hardware offload required */
		tx_ol_req = (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_L4_MASK));
		if (tx_ol_req) {
			hdrlen.f.vlan_tci = tx_pkt->vlan_tci;
			hdrlen.f.l2_len = tx_pkt->l2_len;
			hdrlen.f.l3_len = tx_pkt->l3_len;
			/* Either a new context is built or the existing one is reused. */
			ctx = what_ctx_update(txq, tx_ol_req, hdrlen);

			/* Only allocate context descriptor if required */
			new_ctx = (ctx == EM_CTX_NUM);
		}

		/*
		 * Keep track of how many descriptors are used this loop.
		 * This will always be the number of segments + the number of
		 * Context descriptors required to transmit the packet.
		 */
		nb_used = (uint16_t)(tx_pkt->nb_segs + new_ctx);

		/*
		 * The number of descriptors that must be allocated for a
		 * packet is the number of segments of that packet, plus 1
		 * Context Descriptor for the hardware offload, if any.
		 * Determine the last TX descriptor to allocate in the TX ring
		 * for the packet, starting from the current position (tx_id)
		 * in the ring.
		 */
		tx_last = (uint16_t) (tx_id + nb_used - 1);

		/* Circular ring */
		if (tx_last >= txq->nb_tx_desc)
			tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);

		PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
			   " tx_first=%u tx_last=%u",
			   (unsigned) txq->port_id,
			   (unsigned) txq->queue_id,
			   (unsigned) tx_pkt->pkt_len,
			   (unsigned) tx_id,
			   (unsigned) tx_last);

		/*
		 * Make sure there are enough TX descriptors available to
		 * transmit the entire packet.
		 * nb_used better be less than or equal to txq->tx_rs_thresh
		 */
		while (unlikely (nb_used > txq->nb_tx_free)) {
			PMD_TX_FREE_LOG(DEBUG, "Not enough free TX descriptors "
					"nb_used=%4u nb_free=%4u "
					"(port=%d queue=%d)",
					nb_used, txq->nb_tx_free,
					txq->port_id, txq->queue_id);

			if (em_xmit_cleanup(txq) != 0) {
				/* Could not clean any descriptors */
				if (nb_tx == 0)
					return 0;
				goto end_of_tx;
			}
		}

		/*
		 * By now there are enough free TX descriptors to transmit
		 * the packet.
		 */

		/*
		 * Set common flags of all TX Data Descriptors.
		 *
		 * The following bits must be set in all Data Descriptors:
		 *    - E1000_TXD_DTYP_DATA
		 *    - E1000_TXD_DTYP_DEXT
		 *
		 * The following bits must be set in the first Data Descriptor
		 * and are ignored in the other ones:
		 *    - E1000_TXD_POPTS_IXSM
		 *    - E1000_TXD_POPTS_TXSM
		 *
		 * The following bits must be set in the last Data Descriptor
		 * and are ignored in the other ones:
		 *    - E1000_TXD_CMD_VLE
		 *    - E1000_TXD_CMD_IFCS
		 *
		 * The following bits must only be set in the last Data
		 * Descriptor:
		 *    - E1000_TXD_CMD_EOP
		 *
		 * The following bits can be set in any Data Descriptor, but
		 * are only set in the last Data Descriptor:
		 *    - E1000_TXD_CMD_RS
		 */
		cmd_type_len = E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D |
			E1000_TXD_CMD_IFCS;
		popts_spec = 0;

		/* Set VLAN Tag offload fields. */
		if (ol_flags & PKT_TX_VLAN_PKT) {
			cmd_type_len |= E1000_TXD_CMD_VLE;
			popts_spec = tx_pkt->vlan_tci << E1000_TXD_VLAN_SHIFT;
		}

		if (tx_ol_req) {
			/*
			 * Setup the TX Context Descriptor if required
			 */
			if (new_ctx) {
				volatile struct e1000_context_desc *ctx_txd;

				ctx_txd = (volatile struct e1000_context_desc *)
					&txr[tx_id];

				txn = &sw_ring[txe->next_id];
				RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);

				if (txe->mbuf != NULL) {
					rte_pktmbuf_free_seg(txe->mbuf);
					txe->mbuf = NULL;
				}

				em_set_xmit_ctx(txq, ctx_txd, tx_ol_req,
					hdrlen);

				txe->last_id = tx_last;
				tx_id = txe->next_id;
				txe = txn;
			}

			/*
			 * Setup the TX Data Descriptor,
			 * This path is taken whether the context descriptor
			 * is new or reused.
			 */
			popts_spec |= tx_desc_cksum_flags_to_upper(ol_flags);
		}

		m_seg = tx_pkt;
		do {
			txd = &txr[tx_id];
			txn = &sw_ring[txe->next_id];

			if (txe->mbuf != NULL)
				rte_pktmbuf_free_seg(txe->mbuf);
			txe->mbuf = m_seg;

			/*
			 * Set up Transmit Data Descriptor.
			 */
			slen = m_seg->data_len;
			buf_dma_addr = rte_mbuf_data_iova(m_seg);

			txd->buffer_addr = rte_cpu_to_le_64(buf_dma_addr);
			txd->lower.data = rte_cpu_to_le_32(cmd_type_len | slen);
			txd->upper.data = rte_cpu_to_le_32(popts_spec);

			txe->last_id = tx_last;
			tx_id = txe->next_id;
			txe = txn;
			m_seg = m_seg->next;
		} while (m_seg != NULL);

		/*
		 * The last packet data descriptor needs End Of Packet (EOP)
		 */
		cmd_type_len |= E1000_TXD_CMD_EOP;
		txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used);
		txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_used);

		/* Set RS bit only on threshold packets' last descriptor */
		if (txq->nb_tx_used >= txq->tx_rs_thresh) {
			PMD_TX_FREE_LOG(DEBUG,
					"Setting RS bit on TXD id=%4u "
					"(port=%d queue=%d)",
					tx_last, txq->port_id, txq->queue_id);

			cmd_type_len |= E1000_TXD_CMD_RS;

			/* Update txq RS bit counters */
			txq->nb_tx_used = 0;
		}
		txd->lower.data |= rte_cpu_to_le_32(cmd_type_len);
	}
end_of_tx:
	rte_wmb();

	/*
	 * Set the Transmit Descriptor Tail (TDT)
	 */
	PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
		(unsigned) txq->port_id, (unsigned) txq->queue_id,
		(unsigned) tx_id, (unsigned) nb_tx);
	E1000_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
	txq->tx_tail = tx_id;

	return nb_tx;
}
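
/*
 * Usage sketch (illustration only, not part of the driver): applications
 * reach this function through the generic burst API once the port has been
 * started, e.g.:
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t n = ...;                        // packets built by the app
 *	uint16_t sent = rte_eth_tx_burst(port_id, 0, pkts, n);
 *	// 'sent' may be < n when fewer than nb_used descriptors are free
 *	// and em_xmit_cleanup() could not reclaim any.
 *
 * port_id and the queue index 0 are assumptions for the example.
 */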
/*********************************************************************
 *
 *  TX prep functions
 *
 **********************************************************************/
uint16_t
eth_em_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
		uint16_t nb_pkts)
{
	int i, ret;
	struct rte_mbuf *m;

	for (i = 0; i < nb_pkts; i++) {
		m = tx_pkts[i];

		if (m->ol_flags & E1000_TX_OFFLOAD_NOTSUP_MASK) {
			rte_errno = ENOTSUP;
			return i;
		}

#ifdef RTE_LIBRTE_ETHDEV_DEBUG
		ret = rte_validate_tx_offload(m);
		if (ret != 0) {
			rte_errno = -ret;
			return i;
		}
#endif
		ret = rte_net_intel_cksum_prepare(m);
		if (ret != 0) {
			rte_errno = -ret;
			return i;
		}
	}

	return i;
}
/*********************************************************************
 *
 *  RX functions
 *
 **********************************************************************/
static inline uint64_t
rx_desc_status_to_pkt_flags(uint32_t rx_status)
{
	uint64_t pkt_flags;

	/* Check if VLAN present */
	pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
		PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED : 0);

	return pkt_flags;
}
static inline uint64_t
rx_desc_error_to_pkt_flags(uint32_t rx_error)
{
	uint64_t pkt_flags = 0;

	if (rx_error & E1000_RXD_ERR_IPE)
		pkt_flags |= PKT_RX_IP_CKSUM_BAD;
	if (rx_error & E1000_RXD_ERR_TCPE)
		pkt_flags |= PKT_RX_L4_CKSUM_BAD;
	return pkt_flags;
}
uint16_t
eth_em_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	volatile struct e1000_rx_desc *rx_ring;
	volatile struct e1000_rx_desc *rxdp;
	struct em_rx_queue *rxq;
	struct em_rx_entry *sw_ring;
	struct em_rx_entry *rxe;
	struct rte_mbuf *rxm;
	struct rte_mbuf *nmb;
	struct e1000_rx_desc rxd;
	uint64_t dma_addr;
	uint16_t pkt_len;
	uint16_t rx_id;
	uint16_t nb_rx;
	uint16_t nb_hold;
	uint8_t status;

	rxq = rx_queue;

	nb_rx = 0;
	nb_hold = 0;
	rx_id = rxq->rx_tail;
	rx_ring = rxq->rx_ring;
	sw_ring = rxq->sw_ring;
	while (nb_rx < nb_pkts) {
		/*
		 * The order of operations here is important as the DD status
		 * bit must not be read after any other descriptor fields.
		 * rx_ring and rxdp are pointing to volatile data so the order
		 * of accesses cannot be reordered by the compiler. If they were
		 * not volatile, they could be reordered which could lead to
		 * using invalid descriptor fields when read from rxd.
		 */
		rxdp = &rx_ring[rx_id];
		status = rxdp->status;
		if (! (status & E1000_RXD_STAT_DD))
			break;
		rxd = *rxdp;

		/*
		 * End of packet.
		 *
		 * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
		 * likely to be invalid and to be dropped by the various
		 * validation checks performed by the network stack.
		 *
		 * Allocate a new mbuf to replenish the RX ring descriptor.
		 * If the allocation fails:
		 *    - arrange for that RX descriptor to be the first one
		 *      being parsed the next time the receive function is
		 *      invoked [on the same queue].
		 *
		 *    - Stop parsing the RX ring and return immediately.
		 *
		 * This policy does not drop the packet received in the RX
		 * descriptor for which the allocation of a new mbuf failed.
		 * Thus, it allows that packet to be later retrieved if
		 * mbufs have been freed in the meantime.
		 * As a side effect, holding RX descriptors instead of
		 * systematically giving them back to the NIC may lead to
		 * RX ring exhaustion situations.
		 * However, the NIC can gracefully prevent such situations
		 * by sending specific "back-pressure" flow control
		 * frames to its peer(s).
		 */
		PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
			   "status=0x%x pkt_len=%u",
			   (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
			   (unsigned) rx_id, (unsigned) status,
			   (unsigned) rte_le_to_cpu_16(rxd.length));

		nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
		if (nmb == NULL) {
			PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
				   "queue_id=%u",
				   (unsigned) rxq->port_id,
				   (unsigned) rxq->queue_id);
			rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
			break;
		}

		nb_hold++;
		rxe = &sw_ring[rx_id];
		rx_id++;
		if (rx_id == rxq->nb_rx_desc)
			rx_id = 0;

		/* Prefetch next mbuf while processing current one. */
		rte_em_prefetch(sw_ring[rx_id].mbuf);

		/*
		 * When next RX descriptor is on a cache-line boundary,
		 * prefetch the next 4 RX descriptors and the next 8 pointers
		 * to mbufs.
		 */
		if ((rx_id & 0x3) == 0) {
			rte_em_prefetch(&rx_ring[rx_id]);
			rte_em_prefetch(&sw_ring[rx_id]);
		}

		/* Rearm RXD: attach new mbuf and reset status to zero. */
		rxm = rxe->mbuf;
		rxe->mbuf = nmb;
		dma_addr =
			rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
		rxdp->buffer_addr = dma_addr;
		rxdp->status = 0;

		/*
		 * Initialize the returned mbuf.
		 * 1) setup generic mbuf fields:
		 *    - number of segments,
		 *    - next segment,
		 *    - packet length,
		 *    - RX port identifier.
		 * 2) integrate hardware offload data, if any:
		 *    - IP checksum flag,
		 *    - VLAN TCI, if any,
		 *    - error flags.
		 */
		pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.length) -
				rxq->crc_len);
		rxm->data_off = RTE_PKTMBUF_HEADROOM;
		rte_packet_prefetch((char *)rxm->buf_addr + rxm->data_off);
		rxm->nb_segs = 1;
		rxm->next = NULL;
		rxm->pkt_len = pkt_len;
		rxm->data_len = pkt_len;
		rxm->port = rxq->port_id;

		rxm->ol_flags = rx_desc_status_to_pkt_flags(status);
		rxm->ol_flags = rxm->ol_flags |
				rx_desc_error_to_pkt_flags(rxd.errors);

		/* Only valid if PKT_RX_VLAN set in pkt_flags */
		rxm->vlan_tci = rte_le_to_cpu_16(rxd.special);

		/*
		 * Store the mbuf address into the next entry of the array
		 * of returned packets.
		 */
		rx_pkts[nb_rx++] = rxm;
	}
	rxq->rx_tail = rx_id;

	/*
	 * If the number of free RX descriptors is greater than the RX free
	 * threshold of the queue, advance the Receive Descriptor Tail (RDT)
	 * register.
	 * Update the RDT with the value of the last processed RX descriptor
	 * minus 1, to guarantee that the RDT register is never equal to the
	 * RDH register, which creates a "full" ring situation from the
	 * hardware point of view...
	 */
	nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
	if (nb_hold > rxq->rx_free_thresh) {
		PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
			   "nb_hold=%u nb_rx=%u",
			   (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
			   (unsigned) rx_id, (unsigned) nb_hold,
			   (unsigned) nb_rx);
		rx_id = (uint16_t) ((rx_id == 0) ?
			(rxq->nb_rx_desc - 1) : (rx_id - 1));
		E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
		nb_hold = 0;
	}
	rxq->nb_rx_hold = nb_hold;
	return nb_rx;
}
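
/*
 * Usage sketch (illustration only): this is the default rx_pkt_burst
 * handler installed by eth_em_rx_init(), invoked through the generic API:
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t n = rte_eth_rx_burst(port_id, 0, pkts, 32);
 *
 * Each returned mbuf is a single segment; scattered reception uses
 * eth_em_recv_scattered_pkts() below instead.  port_id and the burst size
 * are assumptions for the example.
 */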
uint16_t
eth_em_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	struct em_rx_queue *rxq;
	volatile struct e1000_rx_desc *rx_ring;
	volatile struct e1000_rx_desc *rxdp;
	struct em_rx_entry *sw_ring;
	struct em_rx_entry *rxe;
	struct rte_mbuf *first_seg;
	struct rte_mbuf *last_seg;
	struct rte_mbuf *rxm;
	struct rte_mbuf *nmb;
	struct e1000_rx_desc rxd;
	uint64_t dma; /* Physical address of mbuf data buffer */
	uint16_t rx_id;
	uint16_t nb_rx;
	uint16_t nb_hold;
	uint16_t data_len;
	uint8_t status;

	rxq = rx_queue;

	nb_rx = 0;
	nb_hold = 0;
	rx_id = rxq->rx_tail;
	rx_ring = rxq->rx_ring;
	sw_ring = rxq->sw_ring;

	/*
	 * Retrieve RX context of current packet, if any.
	 */
	first_seg = rxq->pkt_first_seg;
	last_seg = rxq->pkt_last_seg;

	while (nb_rx < nb_pkts) {
next_desc:
		/*
		 * The order of operations here is important as the DD status
		 * bit must not be read after any other descriptor fields.
		 * rx_ring and rxdp are pointing to volatile data so the order
		 * of accesses cannot be reordered by the compiler. If they were
		 * not volatile, they could be reordered which could lead to
		 * using invalid descriptor fields when read from rxd.
		 */
		rxdp = &rx_ring[rx_id];
		status = rxdp->status;
		if (! (status & E1000_RXD_STAT_DD))
			break;
		rxd = *rxdp;

		/*
		 * Descriptor done.
		 *
		 * Allocate a new mbuf to replenish the RX ring descriptor.
		 * If the allocation fails:
		 *    - arrange for that RX descriptor to be the first one
		 *      being parsed the next time the receive function is
		 *      invoked [on the same queue].
		 *
		 *    - Stop parsing the RX ring and return immediately.
		 *
		 * This policy does not drop the packet received in the RX
		 * descriptor for which the allocation of a new mbuf failed.
		 * Thus, it allows that packet to be later retrieved if
		 * mbufs have been freed in the meantime.
		 * As a side effect, holding RX descriptors instead of
		 * systematically giving them back to the NIC may lead to
		 * RX ring exhaustion situations.
		 * However, the NIC can gracefully prevent such situations
		 * by sending specific "back-pressure" flow control
		 * frames to its peer(s).
		 */
		PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
			   "status=0x%x data_len=%u",
			   (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
			   (unsigned) rx_id, (unsigned) status,
			   (unsigned) rte_le_to_cpu_16(rxd.length));

		nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
		if (nmb == NULL) {
			PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
				   "queue_id=%u", (unsigned) rxq->port_id,
				   (unsigned) rxq->queue_id);
			rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
			break;
		}

		nb_hold++;
		rxe = &sw_ring[rx_id];
		rx_id++;
		if (rx_id == rxq->nb_rx_desc)
			rx_id = 0;

		/* Prefetch next mbuf while processing current one. */
		rte_em_prefetch(sw_ring[rx_id].mbuf);

		/*
		 * When next RX descriptor is on a cache-line boundary,
		 * prefetch the next 4 RX descriptors and the next 8 pointers
		 * to mbufs.
		 */
		if ((rx_id & 0x3) == 0) {
			rte_em_prefetch(&rx_ring[rx_id]);
			rte_em_prefetch(&sw_ring[rx_id]);
		}

		/*
		 * Update RX descriptor with the physical address of the new
		 * data buffer of the new allocated mbuf.
		 */
		rxm = rxe->mbuf;
		rxe->mbuf = nmb;
		dma = rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
		rxdp->buffer_addr = dma;
		rxdp->status = 0;

		/*
		 * Set data length & data buffer address of mbuf.
		 */
		data_len = rte_le_to_cpu_16(rxd.length);
		rxm->data_len = data_len;
		rxm->data_off = RTE_PKTMBUF_HEADROOM;

		/*
		 * If this is the first buffer of the received packet,
		 * set the pointer to the first mbuf of the packet and
		 * initialize its context.
		 * Otherwise, update the total length and the number of segments
		 * of the current scattered packet, and update the pointer to
		 * the last mbuf of the current packet.
		 */
		if (first_seg == NULL) {
			first_seg = rxm;
			first_seg->pkt_len = data_len;
			first_seg->nb_segs = 1;
		} else {
			first_seg->pkt_len += data_len;
			first_seg->nb_segs++;
			last_seg->next = rxm;
		}

		/*
		 * If this is not the last buffer of the received packet,
		 * update the pointer to the last mbuf of the current scattered
		 * packet and continue to parse the RX ring.
		 */
		if (! (status & E1000_RXD_STAT_EOP)) {
			last_seg = rxm;
			goto next_desc;
		}

		/*
		 * This is the last buffer of the received packet.
		 * If the CRC is not stripped by the hardware:
		 *   - Subtract the CRC length from the total packet length.
		 *   - If the last buffer only contains the whole CRC or a part
		 *     of it, free the mbuf associated to the last buffer.
		 *     If part of the CRC is also contained in the previous
		 *     mbuf, subtract the length of that CRC part from the
		 *     data length of the previous mbuf.
		 */
		rxm->next = NULL;
		if (unlikely(rxq->crc_len > 0)) {
			first_seg->pkt_len -= ETHER_CRC_LEN;
			if (data_len <= ETHER_CRC_LEN) {
				rte_pktmbuf_free_seg(rxm);
				first_seg->nb_segs--;
				last_seg->data_len = (uint16_t)
					(last_seg->data_len -
					 (ETHER_CRC_LEN - data_len));
				last_seg->next = NULL;
			} else
				rxm->data_len =
					(uint16_t) (data_len - ETHER_CRC_LEN);
		}

		/*
		 * Initialize the first mbuf of the returned packet:
		 *    - RX port identifier,
		 *    - hardware offload data, if any:
		 *      - IP checksum flag,
		 *      - error flags.
		 */
		first_seg->port = rxq->port_id;

		first_seg->ol_flags = rx_desc_status_to_pkt_flags(status);
		first_seg->ol_flags = first_seg->ol_flags |
					rx_desc_error_to_pkt_flags(rxd.errors);

		/* Only valid if PKT_RX_VLAN set in pkt_flags */
		rxm->vlan_tci = rte_le_to_cpu_16(rxd.special);

		/* Prefetch data of first segment, if configured to do so. */
		rte_packet_prefetch((char *)first_seg->buf_addr +
					first_seg->data_off);

		/*
		 * Store the mbuf address into the next entry of the array
		 * of returned packets.
		 */
		rx_pkts[nb_rx++] = first_seg;

		/*
		 * Setup receipt context for a new packet.
		 */
		first_seg = NULL;
	}

	/*
	 * Record index of the next RX descriptor to probe.
	 */
	rxq->rx_tail = rx_id;

	/*
	 * Save receive context.
	 */
	rxq->pkt_first_seg = first_seg;
	rxq->pkt_last_seg = last_seg;

	/*
	 * If the number of free RX descriptors is greater than the RX free
	 * threshold of the queue, advance the Receive Descriptor Tail (RDT)
	 * register.
	 * Update the RDT with the value of the last processed RX descriptor
	 * minus 1, to guarantee that the RDT register is never equal to the
	 * RDH register, which creates a "full" ring situation from the
	 * hardware point of view...
	 */
	nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
	if (nb_hold > rxq->rx_free_thresh) {
		PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
			   "nb_hold=%u nb_rx=%u",
			   (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
			   (unsigned) rx_id, (unsigned) nb_hold,
			   (unsigned) nb_rx);
		rx_id = (uint16_t) ((rx_id == 0) ?
			(rxq->nb_rx_desc - 1) : (rx_id - 1));
		E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
		nb_hold = 0;
	}
	rxq->nb_rx_hold = nb_hold;
	return nb_rx;
}
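
/*
 * Worked example (illustration only): with DEV_RX_OFFLOAD_KEEP_CRC and a
 * 2048-byte buffer, a 2050-byte frame (including the 4-byte CRC) arrives as
 * a 2048-byte segment followed by a 2-byte segment.  The 2-byte segment
 * holds only CRC, so it is freed, nb_segs drops back to 1 and the first
 * segment's data_len is trimmed by the remaining 2 CRC bytes, leaving
 * pkt_len = data_len = 2046.
 */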
#define EM_MAX_BUF_SIZE     16384
#define EM_RCTL_FLXBUF_STEP 1024
static void
em_tx_queue_release_mbufs(struct em_tx_queue *txq)
{
	unsigned i;

	if (txq->sw_ring != NULL) {
		for (i = 0; i != txq->nb_tx_desc; i++) {
			if (txq->sw_ring[i].mbuf != NULL) {
				rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
				txq->sw_ring[i].mbuf = NULL;
			}
		}
	}
}
static void
em_tx_queue_release(struct em_tx_queue *txq)
{
	if (txq != NULL) {
		em_tx_queue_release_mbufs(txq);
		rte_free(txq->sw_ring);
		rte_free(txq);
	}
}

void
eth_em_tx_queue_release(void *txq)
{
	em_tx_queue_release(txq);
}
/* (Re)set dynamic em_tx_queue fields to defaults */
static void
em_reset_tx_queue(struct em_tx_queue *txq)
{
	uint16_t i, nb_desc, prev;
	static const struct e1000_data_desc txd_init = {
		.upper.fields = {.status = E1000_TXD_STAT_DD},
	};

	nb_desc = txq->nb_tx_desc;

	/* Initialize ring entries */

	prev = (uint16_t) (nb_desc - 1);

	for (i = 0; i < nb_desc; i++) {
		txq->tx_ring[i] = txd_init;
		txq->sw_ring[i].mbuf = NULL;
		txq->sw_ring[i].last_id = i;
		txq->sw_ring[prev].next_id = i;
		prev = i;
	}

	/*
	 * Always allow 1 descriptor to be un-allocated to avoid
	 * a H/W race condition
	 */
	txq->nb_tx_free = (uint16_t)(nb_desc - 1);
	txq->last_desc_cleaned = (uint16_t)(nb_desc - 1);
	txq->nb_tx_used = 0;
	txq->tx_tail = 0;

	memset((void*)&txq->ctx_cache, 0, sizeof (txq->ctx_cache));
}
uint64_t
em_get_tx_port_offloads_capa(struct rte_eth_dev *dev)
{
	uint64_t tx_offload_capa;

	RTE_SET_USED(dev);
	tx_offload_capa =
		DEV_TX_OFFLOAD_MULTI_SEGS  |
		DEV_TX_OFFLOAD_VLAN_INSERT |
		DEV_TX_OFFLOAD_IPV4_CKSUM  |
		DEV_TX_OFFLOAD_UDP_CKSUM   |
		DEV_TX_OFFLOAD_TCP_CKSUM;

	return tx_offload_capa;
}

uint64_t
em_get_tx_queue_offloads_capa(struct rte_eth_dev *dev)
{
	uint64_t tx_queue_offload_capa;

	/*
	 * As only one Tx queue can be used, let the per queue offloading
	 * capability be the same as the per port offloading capability
	 * for better convenience.
	 */
	tx_queue_offload_capa = em_get_tx_port_offloads_capa(dev);

	return tx_queue_offload_capa;
}
int
eth_em_tx_queue_setup(struct rte_eth_dev *dev,
			 uint16_t queue_idx,
			 uint16_t nb_desc,
			 unsigned int socket_id,
			 const struct rte_eth_txconf *tx_conf)
{
	const struct rte_memzone *tz;
	struct em_tx_queue *txq;
	struct e1000_hw *hw;
	uint32_t tsize;
	uint16_t tx_rs_thresh, tx_free_thresh;
	uint64_t offloads;

	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);

	offloads = tx_conf->offloads | dev->data->dev_conf.txmode.offloads;

	/*
	 * Validate number of transmit descriptors.
	 * It must not exceed hardware maximum, and must be a multiple
	 * of EM_TXD_ALIGN.
	 */
	if (nb_desc % EM_TXD_ALIGN != 0 ||
			(nb_desc > E1000_MAX_RING_DESC) ||
			(nb_desc < E1000_MIN_RING_DESC)) {
		return -(EINVAL);
	}

	tx_free_thresh = tx_conf->tx_free_thresh;
	if (tx_free_thresh == 0)
		tx_free_thresh = (uint16_t)RTE_MIN(nb_desc / 4,
					DEFAULT_TX_FREE_THRESH);

	tx_rs_thresh = tx_conf->tx_rs_thresh;
	if (tx_rs_thresh == 0)
		tx_rs_thresh = (uint16_t)RTE_MIN(tx_free_thresh,
					DEFAULT_TX_RS_THRESH);

	if (tx_free_thresh >= (nb_desc - 3)) {
		PMD_INIT_LOG(ERR, "tx_free_thresh must be less than the "
			     "number of TX descriptors minus 3. "
			     "(tx_free_thresh=%u port=%d queue=%d)",
			     (unsigned int)tx_free_thresh,
			     (int)dev->data->port_id, (int)queue_idx);
		return -(EINVAL);
	}
	if (tx_rs_thresh > tx_free_thresh) {
		PMD_INIT_LOG(ERR, "tx_rs_thresh must be less than or equal to "
			     "tx_free_thresh. (tx_free_thresh=%u "
			     "tx_rs_thresh=%u port=%d queue=%d)",
			     (unsigned int)tx_free_thresh,
			     (unsigned int)tx_rs_thresh,
			     (int)dev->data->port_id,
			     (int)queue_idx);
		return -(EINVAL);
	}

	/*
	 * If rs_bit_thresh is greater than 1, then TX WTHRESH should be
	 * set to 0. If WTHRESH is greater than zero, the RS bit is ignored
	 * by the NIC and all descriptors are written back after the NIC
	 * accumulates WTHRESH descriptors.
	 */
	if (tx_conf->tx_thresh.wthresh != 0 && tx_rs_thresh != 1) {
		PMD_INIT_LOG(ERR, "TX WTHRESH must be set to 0 if "
			     "tx_rs_thresh is greater than 1. (tx_rs_thresh=%u "
			     "port=%d queue=%d)", (unsigned int)tx_rs_thresh,
			     (int)dev->data->port_id, (int)queue_idx);
		return -(EINVAL);
	}

	/* Free memory prior to re-allocation if needed... */
	if (dev->data->tx_queues[queue_idx] != NULL) {
		em_tx_queue_release(dev->data->tx_queues[queue_idx]);
		dev->data->tx_queues[queue_idx] = NULL;
	}

	/*
	 * Allocate TX ring hardware descriptors. A memzone large enough to
	 * handle the maximum ring size is allocated in order to allow for
	 * resizing in later calls to the queue setup function.
	 */
	tsize = sizeof(txq->tx_ring[0]) * E1000_MAX_RING_DESC;
	tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx, tsize,
				      RTE_CACHE_LINE_SIZE, socket_id);
	if (tz == NULL)
		return -ENOMEM;

	/* Allocate the tx queue data structure. */
	if ((txq = rte_zmalloc("ethdev TX queue", sizeof(*txq),
			RTE_CACHE_LINE_SIZE)) == NULL)
		return -ENOMEM;

	/* Allocate software ring */
	if ((txq->sw_ring = rte_zmalloc("txq->sw_ring",
			sizeof(txq->sw_ring[0]) * nb_desc,
			RTE_CACHE_LINE_SIZE)) == NULL) {
		em_tx_queue_release(txq);
		return -ENOMEM;
	}

	txq->nb_tx_desc = nb_desc;
	txq->tx_free_thresh = tx_free_thresh;
	txq->tx_rs_thresh = tx_rs_thresh;
	txq->pthresh = tx_conf->tx_thresh.pthresh;
	txq->hthresh = tx_conf->tx_thresh.hthresh;
	txq->wthresh = tx_conf->tx_thresh.wthresh;
	txq->queue_id = queue_idx;
	txq->port_id = dev->data->port_id;

	txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(queue_idx));
	txq->tx_ring_phys_addr = tz->iova;
	txq->tx_ring = (struct e1000_data_desc *) tz->addr;

	PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
		     txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);

	em_reset_tx_queue(txq);

	dev->data->tx_queues[queue_idx] = txq;
	txq->offloads = offloads;
	return 0;
}
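
/*
 * Usage sketch (illustration only): a typical application configures the
 * single TX queue of an em port as
 *
 *	struct rte_eth_txconf txconf = dev_info.default_txconf;
 *	rte_eth_tx_queue_setup(port_id, 0, 512, rte_socket_id(), &txconf);
 *
 * 512 descriptors with the default tx_free_thresh/tx_rs_thresh of 32
 * satisfy all of the checks above; the values are assumptions chosen for
 * the example.
 */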
static void
em_rx_queue_release_mbufs(struct em_rx_queue *rxq)
{
	unsigned i;

	if (rxq->sw_ring != NULL) {
		for (i = 0; i != rxq->nb_rx_desc; i++) {
			if (rxq->sw_ring[i].mbuf != NULL) {
				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
				rxq->sw_ring[i].mbuf = NULL;
			}
		}
	}
}
static void
em_rx_queue_release(struct em_rx_queue *rxq)
{
	if (rxq != NULL) {
		em_rx_queue_release_mbufs(rxq);
		rte_free(rxq->sw_ring);
		rte_free(rxq);
	}
}

void
eth_em_rx_queue_release(void *rxq)
{
	em_rx_queue_release(rxq);
}
/* Reset dynamic em_rx_queue fields back to defaults */
static void
em_reset_rx_queue(struct em_rx_queue *rxq)
{
	rxq->rx_tail = 0;
	rxq->nb_rx_hold = 0;
	rxq->pkt_first_seg = NULL;
	rxq->pkt_last_seg = NULL;
}
uint64_t
em_get_rx_port_offloads_capa(struct rte_eth_dev *dev)
{
	uint64_t rx_offload_capa;
	uint32_t max_rx_pktlen;

	max_rx_pktlen = em_get_max_pktlen(dev);

	rx_offload_capa =
		DEV_RX_OFFLOAD_VLAN_STRIP  |
		DEV_RX_OFFLOAD_VLAN_FILTER |
		DEV_RX_OFFLOAD_IPV4_CKSUM  |
		DEV_RX_OFFLOAD_UDP_CKSUM   |
		DEV_RX_OFFLOAD_TCP_CKSUM   |
		DEV_RX_OFFLOAD_KEEP_CRC    |
		DEV_RX_OFFLOAD_SCATTER;
	if (max_rx_pktlen > ETHER_MAX_LEN)
		rx_offload_capa |= DEV_RX_OFFLOAD_JUMBO_FRAME;

	return rx_offload_capa;
}

uint64_t
em_get_rx_queue_offloads_capa(struct rte_eth_dev *dev)
{
	uint64_t rx_queue_offload_capa;

	/*
	 * As only one Rx queue can be used, let the per queue offloading
	 * capability be the same as the per port offloading capability
	 * for better convenience.
	 */
	rx_queue_offload_capa = em_get_rx_port_offloads_capa(dev);

	return rx_queue_offload_capa;
}
int
eth_em_rx_queue_setup(struct rte_eth_dev *dev,
		uint16_t queue_idx,
		uint16_t nb_desc,
		unsigned int socket_id,
		const struct rte_eth_rxconf *rx_conf,
		struct rte_mempool *mp)
{
	const struct rte_memzone *rz;
	struct em_rx_queue *rxq;
	struct e1000_hw *hw;
	uint32_t rsize;
	uint64_t offloads;

	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);

	offloads = rx_conf->offloads | dev->data->dev_conf.rxmode.offloads;

	/*
	 * Validate number of receive descriptors.
	 * It must not exceed hardware maximum, and must be a multiple
	 * of EM_RXD_ALIGN.
	 */
	if (nb_desc % EM_RXD_ALIGN != 0 ||
			(nb_desc > E1000_MAX_RING_DESC) ||
			(nb_desc < E1000_MIN_RING_DESC)) {
		return -EINVAL;
	}

	/*
	 * EM devices don't support drop_en functionality.
	 * It's an optimization that does nothing on single-queue devices,
	 * so just log the issue and carry on.
	 */
	if (rx_conf->rx_drop_en) {
		PMD_INIT_LOG(NOTICE, "drop_en functionality not supported by "
			     "device");
	}

	/* Free memory prior to re-allocation if needed. */
	if (dev->data->rx_queues[queue_idx] != NULL) {
		em_rx_queue_release(dev->data->rx_queues[queue_idx]);
		dev->data->rx_queues[queue_idx] = NULL;
	}

	/* Allocate RX ring for max possible number of hardware descriptors. */
	rsize = sizeof(rxq->rx_ring[0]) * E1000_MAX_RING_DESC;
	rz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx, rsize,
				      RTE_CACHE_LINE_SIZE, socket_id);
	if (rz == NULL)
		return -ENOMEM;

	/* Allocate the RX queue data structure. */
	if ((rxq = rte_zmalloc("ethdev RX queue", sizeof(*rxq),
			RTE_CACHE_LINE_SIZE)) == NULL)
		return -ENOMEM;

	/* Allocate software ring. */
	if ((rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
			sizeof (rxq->sw_ring[0]) * nb_desc,
			RTE_CACHE_LINE_SIZE)) == NULL) {
		em_rx_queue_release(rxq);
		return -ENOMEM;
	}

	rxq->mb_pool = mp;
	rxq->nb_rx_desc = nb_desc;
	rxq->pthresh = rx_conf->rx_thresh.pthresh;
	rxq->hthresh = rx_conf->rx_thresh.hthresh;
	rxq->wthresh = rx_conf->rx_thresh.wthresh;
	rxq->rx_free_thresh = rx_conf->rx_free_thresh;
	rxq->queue_id = queue_idx;
	rxq->port_id = dev->data->port_id;
	if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
		rxq->crc_len = ETHER_CRC_LEN;
	else
		rxq->crc_len = 0;

	rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(queue_idx));
	rxq->rdh_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDH(queue_idx));
	rxq->rx_ring_phys_addr = rz->iova;
	rxq->rx_ring = (struct e1000_rx_desc *) rz->addr;

	PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
		     rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);

	dev->data->rx_queues[queue_idx] = rxq;
	em_reset_rx_queue(rxq);
	rxq->offloads = offloads;

	return 0;
}
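
/*
 * Usage sketch (illustration only): the matching RX side is typically
 *
 *	rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(),
 *			       NULL, mbuf_pool);
 *
 * where mbuf_pool is an application-created rte_mempool; passing NULL for
 * rx_conf picks up the defaults reported in dev_info.  The names used here
 * are assumptions for the example.
 */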
uint32_t
eth_em_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
#define EM_RXQ_SCAN_INTERVAL 4
	volatile struct e1000_rx_desc *rxdp;
	struct em_rx_queue *rxq;
	uint32_t desc = 0;

	rxq = dev->data->rx_queues[rx_queue_id];
	rxdp = &(rxq->rx_ring[rxq->rx_tail]);

	while ((desc < rxq->nb_rx_desc) &&
		(rxdp->status & E1000_RXD_STAT_DD)) {
		desc += EM_RXQ_SCAN_INTERVAL;
		rxdp += EM_RXQ_SCAN_INTERVAL;
		if (rxq->rx_tail + desc >= rxq->nb_rx_desc)
			rxdp = &(rxq->rx_ring[rxq->rx_tail +
				desc - rxq->nb_rx_desc]);
	}

	return desc;
}
int
eth_em_rx_descriptor_done(void *rx_queue, uint16_t offset)
{
	volatile struct e1000_rx_desc *rxdp;
	struct em_rx_queue *rxq = rx_queue;
	uint32_t desc;

	if (unlikely(offset >= rxq->nb_rx_desc))
		return 0;
	desc = rxq->rx_tail + offset;
	if (desc >= rxq->nb_rx_desc)
		desc -= rxq->nb_rx_desc;

	rxdp = &rxq->rx_ring[desc];
	return !!(rxdp->status & E1000_RXD_STAT_DD);
}
int
eth_em_rx_descriptor_status(void *rx_queue, uint16_t offset)
{
	struct em_rx_queue *rxq = rx_queue;
	volatile uint8_t *status;
	uint32_t desc;

	if (unlikely(offset >= rxq->nb_rx_desc))
		return -EINVAL;

	if (offset >= rxq->nb_rx_desc - rxq->nb_rx_hold)
		return RTE_ETH_RX_DESC_UNAVAIL;

	desc = rxq->rx_tail + offset;
	if (desc >= rxq->nb_rx_desc)
		desc -= rxq->nb_rx_desc;

	status = &rxq->rx_ring[desc].status;
	if (*status & E1000_RXD_STAT_DD)
		return RTE_ETH_RX_DESC_DONE;

	return RTE_ETH_RX_DESC_AVAIL;
}
int
eth_em_tx_descriptor_status(void *tx_queue, uint16_t offset)
{
	struct em_tx_queue *txq = tx_queue;
	volatile uint8_t *status;
	uint32_t desc;

	if (unlikely(offset >= txq->nb_tx_desc))
		return -EINVAL;

	desc = txq->tx_tail + offset;
	/* go to next desc that has the RS bit */
	desc = ((desc + txq->tx_rs_thresh - 1) / txq->tx_rs_thresh) *
		txq->tx_rs_thresh;
	if (desc >= txq->nb_tx_desc) {
		desc -= txq->nb_tx_desc;
		if (desc >= txq->nb_tx_desc)
			desc -= txq->nb_tx_desc;
	}

	status = &txq->tx_ring[desc].upper.fields.status;
	if (*status & E1000_TXD_STAT_DD)
		return RTE_ETH_TX_DESC_DONE;

	return RTE_ETH_TX_DESC_FULL;
}
void
em_dev_clear_queues(struct rte_eth_dev *dev)
{
	uint16_t i;
	struct em_tx_queue *txq;
	struct em_rx_queue *rxq;

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		txq = dev->data->tx_queues[i];
		if (txq != NULL) {
			em_tx_queue_release_mbufs(txq);
			em_reset_tx_queue(txq);
		}
	}

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		rxq = dev->data->rx_queues[i];
		if (rxq != NULL) {
			em_rx_queue_release_mbufs(rxq);
			em_reset_rx_queue(rxq);
		}
	}
}
void
em_dev_free_queues(struct rte_eth_dev *dev)
{
	uint16_t i;

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		eth_em_rx_queue_release(dev->data->rx_queues[i]);
		dev->data->rx_queues[i] = NULL;
	}
	dev->data->nb_rx_queues = 0;

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		eth_em_tx_queue_release(dev->data->tx_queues[i]);
		dev->data->tx_queues[i] = NULL;
	}
	dev->data->nb_tx_queues = 0;
}
/*
 * Takes as input/output parameter RX buffer size.
 * Returns (BSIZE | BSEX | FLXBUF) fields of RCTL register.
 */
static int
em_rctl_bsize(__rte_unused enum e1000_mac_type hwtyp, uint32_t *bufsz)
{
	/*
	 * For BSIZE & BSEX all configurable sizes are:
	 * 16384: rctl |= (E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX);
	 *  8192: rctl |= (E1000_RCTL_SZ_8192  | E1000_RCTL_BSEX);
	 *  4096: rctl |= (E1000_RCTL_SZ_4096  | E1000_RCTL_BSEX);
	 *  2048: rctl |= E1000_RCTL_SZ_2048;
	 *  1024: rctl |= E1000_RCTL_SZ_1024;
	 *   512: rctl |= E1000_RCTL_SZ_512;
	 *   256: rctl |= E1000_RCTL_SZ_256;
	 */
	static const struct {
		uint32_t bufsz;
		uint32_t rctl;
	} bufsz_to_rctl[] = {
		{16384, (E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX)},
		{8192,  (E1000_RCTL_SZ_8192  | E1000_RCTL_BSEX)},
		{4096,  (E1000_RCTL_SZ_4096  | E1000_RCTL_BSEX)},
		{2048,  E1000_RCTL_SZ_2048},
		{1024,  E1000_RCTL_SZ_1024},
		{512,   E1000_RCTL_SZ_512},
		{256,   E1000_RCTL_SZ_256},
	};

	int i;
	uint32_t rctl_bsize;

	rctl_bsize = *bufsz;

	/*
	 * Starting from 82571 it is possible to specify RX buffer size
	 * by RCTL.FLXBUF. When this field is different from zero, the
	 * RX buffer size = RCTL.FLXBUF * 1K
	 * (e.g. it is possible to specify RX buffer size 1,2,...,15KB).
	 * It works fine on real HW, but for some reason doesn't work
	 * on a VMware emulated 82574L.
	 * So for now, always use BSIZE/BSEX to setup RX buffer size.
	 * If you don't plan to use it on VMware emulated 82574L and
	 * would like to specify RX buffer size in 1K granularity,
	 * uncomment the following lines:
	 * ***************************************************************
	 * if (hwtyp >= e1000_82571 && hwtyp <= e1000_82574 &&
	 *		rctl_bsize >= EM_RCTL_FLXBUF_STEP) {
	 *	rctl_bsize /= EM_RCTL_FLXBUF_STEP;
	 *	*bufsz = rctl_bsize;
	 *	return (rctl_bsize << E1000_RCTL_FLXBUF_SHIFT &
	 *		E1000_RCTL_FLXBUF_MASK);
	 * }
	 * ***************************************************************
	 */

	for (i = 0; i != sizeof(bufsz_to_rctl) / sizeof(bufsz_to_rctl[0]);
			i++) {
		if (rctl_bsize >= bufsz_to_rctl[i].bufsz) {
			*bufsz = bufsz_to_rctl[i].bufsz;
			return bufsz_to_rctl[i].rctl;
		}
	}

	/* Should never happen. */
	return -EINVAL;
}
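
/*
 * Worked example (illustration only): a mempool created with
 * RTE_MBUF_DEFAULT_BUF_SIZE leaves 2048 usable bytes after headroom, so the
 * lookup selects E1000_RCTL_SZ_2048 and leaves *bufsz at 2048; a 3000-byte
 * data room also rounds down to the 2048 entry, since the table is scanned
 * from the largest size that still fits.
 */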
static int
em_alloc_rx_queue_mbufs(struct em_rx_queue *rxq)
{
	struct em_rx_entry *rxe = rxq->sw_ring;
	uint64_t dma_addr;
	unsigned i;
	static const struct e1000_rx_desc rxd_init = {
		.buffer_addr = 0,
	};

	/* Initialize software ring entries */
	for (i = 0; i < rxq->nb_rx_desc; i++) {
		volatile struct e1000_rx_desc *rxd;
		struct rte_mbuf *mbuf = rte_mbuf_raw_alloc(rxq->mb_pool);

		if (mbuf == NULL) {
			PMD_INIT_LOG(ERR, "RX mbuf alloc failed "
				     "queue_id=%hu", rxq->queue_id);
			return -ENOMEM;
		}

		dma_addr =
			rte_cpu_to_le_64(rte_mbuf_data_iova_default(mbuf));

		/* Clear HW ring memory */
		rxq->rx_ring[i] = rxd_init;

		rxd = &rxq->rx_ring[i];
		rxd->buffer_addr = dma_addr;
		rxe[i].mbuf = mbuf;
	}

	return 0;
}
/*********************************************************************
 *
 *  Enable receive unit.
 *
 **********************************************************************/
int
eth_em_rx_init(struct rte_eth_dev *dev)
{
	struct e1000_hw *hw;
	struct em_rx_queue *rxq;
	struct rte_eth_rxmode *rxmode;
	uint32_t rctl;
	uint32_t rfctl;
	uint32_t rxcsum;
	uint32_t rctl_bsize;
	uint16_t i;
	int ret;

	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
	rxmode = &dev->data->dev_conf.rxmode;

	/*
	 * Make sure receives are disabled while setting
	 * up the descriptor ring.
	 */
	rctl = E1000_READ_REG(hw, E1000_RCTL);
	E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);

	rfctl = E1000_READ_REG(hw, E1000_RFCTL);

	/* Disable extended descriptor type. */
	rfctl &= ~E1000_RFCTL_EXTEN;
	/* Disable accelerated acknowledge */
	if (hw->mac.type == e1000_82574)
		rfctl |= E1000_RFCTL_ACK_DIS;

	E1000_WRITE_REG(hw, E1000_RFCTL, rfctl);

	/*
	 * XXX TEMPORARY WORKAROUND: on some systems with 82573
	 * long latencies are observed, like Lenovo X60. This
	 * change eliminates the problem, but since having positive
	 * values in RDTR is a known source of problems on other
	 * platforms another solution is being sought.
	 */
	if (hw->mac.type == e1000_82573)
		E1000_WRITE_REG(hw, E1000_RDTR, 0x20);

	dev->rx_pkt_burst = (eth_rx_burst_t)eth_em_recv_pkts;

	/* Determine RX bufsize. */
	rctl_bsize = EM_MAX_BUF_SIZE;
	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		uint32_t buf_size;

		rxq = dev->data->rx_queues[i];
		buf_size = rte_pktmbuf_data_room_size(rxq->mb_pool) -
			RTE_PKTMBUF_HEADROOM;
		rctl_bsize = RTE_MIN(rctl_bsize, buf_size);
	}

	rctl |= em_rctl_bsize(hw->mac.type, &rctl_bsize);

	/* Configure and enable each RX queue. */
	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		uint64_t bus_addr;
		uint32_t rxdctl;

		rxq = dev->data->rx_queues[i];

		/* Allocate buffers for descriptor rings and setup queue */
		ret = em_alloc_rx_queue_mbufs(rxq);
		if (ret)
			return ret;

		/*
		 * Reset crc_len in case it was changed after queue setup by a
		 * call to configure
		 */
		if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
			rxq->crc_len = ETHER_CRC_LEN;
		else
			rxq->crc_len = 0;

		bus_addr = rxq->rx_ring_phys_addr;
		E1000_WRITE_REG(hw, E1000_RDLEN(i),
				rxq->nb_rx_desc *
				sizeof(*rxq->rx_ring));
		E1000_WRITE_REG(hw, E1000_RDBAH(i),
				(uint32_t)(bus_addr >> 32));
		E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);

		E1000_WRITE_REG(hw, E1000_RDH(i), 0);
		E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);

		rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(0));
		rxdctl &= 0xFE000000;
		rxdctl |= rxq->pthresh & 0x3F;
		rxdctl |= (rxq->hthresh & 0x3F) << 8;
		rxdctl |= (rxq->wthresh & 0x3F) << 16;
		rxdctl |= E1000_RXDCTL_GRAN;
		E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);

		/*
		 * Due to EM devices not having any sort of hardware
		 * limit for packet length, jumbo frame of any size
		 * can be accepted, thus we have to enable scattered
		 * rx if jumbo frames are enabled (or if buffer size
		 * is too small to accommodate non-jumbo packets)
		 * to avoid splitting packets that don't fit into
		 * one buffer.
		 */
		if (rxmode->offloads & DEV_RX_OFFLOAD_JUMBO_FRAME ||
				rctl_bsize < ETHER_MAX_LEN) {
			if (!dev->data->scattered_rx)
				PMD_INIT_LOG(DEBUG, "forcing scatter mode");
			dev->rx_pkt_burst =
				(eth_rx_burst_t)eth_em_recv_scattered_pkts;
			dev->data->scattered_rx = 1;
		}
	}

	if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_SCATTER) {
		if (!dev->data->scattered_rx)
			PMD_INIT_LOG(DEBUG, "forcing scatter mode");
		dev->rx_pkt_burst = eth_em_recv_scattered_pkts;
		dev->data->scattered_rx = 1;
	}

	/*
	 * Setup the Checksum Register.
	 * Receive Full-Packet Checksum Offload is mutually exclusive with RSS.
	 */
	rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);

	if (rxmode->offloads & DEV_RX_OFFLOAD_CHECKSUM)
		rxcsum |= E1000_RXCSUM_IPOFL;
	else
		rxcsum &= ~E1000_RXCSUM_IPOFL;
	E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);

	/* No MRQ or RSS support for now */

	/* Set early receive threshold on appropriate hw */
	if ((hw->mac.type == e1000_ich9lan ||
			hw->mac.type == e1000_pch2lan ||
			hw->mac.type == e1000_ich10lan) &&
			rxmode->offloads & DEV_RX_OFFLOAD_JUMBO_FRAME) {
		u32 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(0));
		E1000_WRITE_REG(hw, E1000_RXDCTL(0), rxdctl | 3);
		E1000_WRITE_REG(hw, E1000_ERT, 0x100 | (1 << 13));
	}

	if (hw->mac.type == e1000_pch2lan) {
		if (rxmode->offloads & DEV_RX_OFFLOAD_JUMBO_FRAME)
			e1000_lv_jumbo_workaround_ich8lan(hw, TRUE);
		else
			e1000_lv_jumbo_workaround_ich8lan(hw, FALSE);
	}

	/* Setup the Receive Control Register. */
	if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
		rctl &= ~E1000_RCTL_SECRC; /* Do not Strip Ethernet CRC. */
	else
		rctl |= E1000_RCTL_SECRC; /* Strip Ethernet CRC. */

	rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
	rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
		E1000_RCTL_RDMTS_HALF |
		(hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);

	/* Make sure VLAN Filters are off. */
	rctl &= ~E1000_RCTL_VFE;
	/* Don't store bad packets. */
	rctl &= ~E1000_RCTL_SBP;
	/* Legacy descriptor type. */
	rctl &= ~E1000_RCTL_DTYP_MASK;

	/*
	 * Configure support of jumbo frames, if any.
	 */
	if (rxmode->offloads & DEV_RX_OFFLOAD_JUMBO_FRAME)
		rctl |= E1000_RCTL_LPE;
	else
		rctl &= ~E1000_RCTL_LPE;

	/* Enable Receives. */
	E1000_WRITE_REG(hw, E1000_RCTL, rctl);

	return 0;
}
/*********************************************************************
 *
 *  Enable transmit unit.
 *
 **********************************************************************/
void
eth_em_tx_init(struct rte_eth_dev *dev)
{
	struct e1000_hw     *hw;
	struct em_tx_queue *txq;
	uint32_t tctl;
	uint32_t txdctl;
	uint16_t i;

	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);

	/* Setup the Base and Length of the Tx Descriptor Rings. */
	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		uint64_t bus_addr;

		txq = dev->data->tx_queues[i];
		bus_addr = txq->tx_ring_phys_addr;
		E1000_WRITE_REG(hw, E1000_TDLEN(i),
				txq->nb_tx_desc *
				sizeof(*txq->tx_ring));
		E1000_WRITE_REG(hw, E1000_TDBAH(i),
				(uint32_t)(bus_addr >> 32));
		E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);

		/* Setup the HW Tx Head and Tail descriptor pointers. */
		E1000_WRITE_REG(hw, E1000_TDT(i), 0);
		E1000_WRITE_REG(hw, E1000_TDH(i), 0);

		/* Setup Transmit threshold registers. */
		txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
		/*
		 * bit 22 is reserved, on some models should always be 0,
		 * on others - always 1.
		 */
		txdctl &= E1000_TXDCTL_COUNT_DESC;
		txdctl |= txq->pthresh & 0x3F;
		txdctl |= (txq->hthresh & 0x3F) << 8;
		txdctl |= (txq->wthresh & 0x3F) << 16;
		txdctl |= E1000_TXDCTL_GRAN;
		E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
	}

	/* Program the Transmit Control Register. */
	tctl = E1000_READ_REG(hw, E1000_TCTL);
	tctl &= ~E1000_TCTL_CT;
	tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
		 (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));

	/* This write will effectively turn on the transmit unit. */
	E1000_WRITE_REG(hw, E1000_TCTL, tctl);
}
void
em_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
	struct rte_eth_rxq_info *qinfo)
{
	struct em_rx_queue *rxq;

	rxq = dev->data->rx_queues[queue_id];

	qinfo->mp = rxq->mb_pool;
	qinfo->scattered_rx = dev->data->scattered_rx;
	qinfo->nb_desc = rxq->nb_rx_desc;
	qinfo->conf.rx_free_thresh = rxq->rx_free_thresh;
	qinfo->conf.offloads = rxq->offloads;
}
void
em_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
	struct rte_eth_txq_info *qinfo)
{
	struct em_tx_queue *txq;

	txq = dev->data->tx_queues[queue_id];

	qinfo->nb_desc = txq->nb_tx_desc;

	qinfo->conf.tx_thresh.pthresh = txq->pthresh;
	qinfo->conf.tx_thresh.hthresh = txq->hthresh;
	qinfo->conf.tx_thresh.wthresh = txq->wthresh;
	qinfo->conf.tx_free_thresh = txq->tx_free_thresh;
	qinfo->conf.tx_rs_thresh = txq->tx_rs_thresh;
	qinfo->conf.offloads = txq->offloads;
}