// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2018, Intel Corporation. */

/* The driver transmit and receive code */

#include <linux/prefetch.h>
#include <linux/mm.h>
#include <linux/bpf_trace.h>
#include <net/xdp.h>
#include "ice_txrx_lib.h"
#include "ice_lib.h"
#include "ice.h"
#include "ice_dcb_lib.h"
#include "ice_xsk.h"

#define ICE_RX_HDR_SIZE		256

/**
 * ice_unmap_and_free_tx_buf - Release a Tx buffer
 * @ring: the ring that owns the buffer
 * @tx_buf: the buffer to free
 */
static void
ice_unmap_and_free_tx_buf(struct ice_ring *ring, struct ice_tx_buf *tx_buf)
{
	if (tx_buf->skb) {
		if (ice_ring_is_xdp(ring))
			page_frag_free(tx_buf->raw_buf);
		else
			dev_kfree_skb_any(tx_buf->skb);
		if (dma_unmap_len(tx_buf, len))
			dma_unmap_single(ring->dev,
					 dma_unmap_addr(tx_buf, dma),
					 dma_unmap_len(tx_buf, len),
					 DMA_TO_DEVICE);
	} else if (dma_unmap_len(tx_buf, len)) {
		dma_unmap_page(ring->dev,
			       dma_unmap_addr(tx_buf, dma),
			       dma_unmap_len(tx_buf, len),
			       DMA_TO_DEVICE);
	}

	tx_buf->next_to_watch = NULL;
	tx_buf->skb = NULL;
	dma_unmap_len_set(tx_buf, len, 0);
	/* tx_buf must be completely set up in the transmit path */
}

static struct netdev_queue *txring_txq(const struct ice_ring *ring)
{
	return netdev_get_tx_queue(ring->netdev, ring->q_index);
}

/**
 * ice_clean_tx_ring - Free any empty Tx buffers
 * @tx_ring: ring to be cleaned
 */
void ice_clean_tx_ring(struct ice_ring *tx_ring)
{
	u16 i;

	if (ice_ring_is_xdp(tx_ring) && tx_ring->xsk_umem) {
		ice_xsk_clean_xdp_ring(tx_ring);
		goto tx_skip_free;
	}

	/* ring already cleared, nothing to do */
	if (!tx_ring->tx_buf)
		return;

	/* Free all the Tx ring sk_buffs */
	for (i = 0; i < tx_ring->count; i++)
		ice_unmap_and_free_tx_buf(tx_ring, &tx_ring->tx_buf[i]);

tx_skip_free:
	memset(tx_ring->tx_buf, 0, sizeof(*tx_ring->tx_buf) * tx_ring->count);

	/* Zero out the descriptor ring */
	memset(tx_ring->desc, 0, tx_ring->size);

	tx_ring->next_to_use = 0;
	tx_ring->next_to_clean = 0;

	if (!tx_ring->netdev)
		return;

	/* cleanup Tx queue statistics */
	netdev_tx_reset_queue(txring_txq(tx_ring));
}

/**
 * ice_free_tx_ring - Free Tx resources per queue
 * @tx_ring: Tx descriptor ring for a specific queue
 *
 * Free all transmit software resources
 */
void ice_free_tx_ring(struct ice_ring *tx_ring)
{
	ice_clean_tx_ring(tx_ring);
	devm_kfree(tx_ring->dev, tx_ring->tx_buf);
	tx_ring->tx_buf = NULL;

	if (tx_ring->desc) {
		dmam_free_coherent(tx_ring->dev, tx_ring->size,
				   tx_ring->desc, tx_ring->dma);
		tx_ring->desc = NULL;
	}
}

/**
 * ice_clean_tx_irq - Reclaim resources after transmit completes
 * @tx_ring: Tx ring to clean
 * @napi_budget: Used to determine if we are in netpoll
 *
 * Returns true if there's any budget left (e.g. the clean is finished)
 */
static bool ice_clean_tx_irq(struct ice_ring *tx_ring, int napi_budget)
{
	unsigned int total_bytes = 0, total_pkts = 0;
	unsigned int budget = ICE_DFLT_IRQ_WORK;
	struct ice_vsi *vsi = tx_ring->vsi;
	s16 i = tx_ring->next_to_clean;
	struct ice_tx_desc *tx_desc;
	struct ice_tx_buf *tx_buf;

	tx_buf = &tx_ring->tx_buf[i];
	tx_desc = ICE_TX_DESC(tx_ring, i);
	i -= tx_ring->count;

	prefetch(&vsi->state);

	do {
		struct ice_tx_desc *eop_desc = tx_buf->next_to_watch;

		/* if next_to_watch is not set then there is no work pending */
		if (!eop_desc)
			break;

		smp_rmb();	/* prevent any other reads prior to eop_desc */

		/* if the descriptor isn't done, no work yet to do */
		if (!(eop_desc->cmd_type_offset_bsz &
		      cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)))
			break;

		/* clear next_to_watch to prevent false hangs */
		tx_buf->next_to_watch = NULL;

		/* update the statistics for this packet */
		total_bytes += tx_buf->bytecount;
		total_pkts += tx_buf->gso_segs;

		if (ice_ring_is_xdp(tx_ring))
			page_frag_free(tx_buf->raw_buf);
		else
			/* free the skb */
			napi_consume_skb(tx_buf->skb, napi_budget);

		/* unmap skb header data */
		dma_unmap_single(tx_ring->dev,
				 dma_unmap_addr(tx_buf, dma),
				 dma_unmap_len(tx_buf, len),
				 DMA_TO_DEVICE);

		/* clear tx_buf data */
		tx_buf->skb = NULL;
		dma_unmap_len_set(tx_buf, len, 0);

		/* unmap remaining buffers */
		while (tx_desc != eop_desc) {
			tx_buf++;
			tx_desc++;
			i++;
			if (unlikely(!i)) {
				i -= tx_ring->count;
				tx_buf = tx_ring->tx_buf;
				tx_desc = ICE_TX_DESC(tx_ring, 0);
			}

			/* unmap any remaining paged data */
			if (dma_unmap_len(tx_buf, len)) {
				dma_unmap_page(tx_ring->dev,
					       dma_unmap_addr(tx_buf, dma),
					       dma_unmap_len(tx_buf, len),
					       DMA_TO_DEVICE);
				dma_unmap_len_set(tx_buf, len, 0);
			}
		}

		/* move us one more past the eop_desc for start of next pkt */
		tx_buf++;
		tx_desc++;
		i++;
		if (unlikely(!i)) {
			i -= tx_ring->count;
			tx_buf = tx_ring->tx_buf;
			tx_desc = ICE_TX_DESC(tx_ring, 0);
		}

		prefetch(tx_desc);

		/* update budget accounting */
		budget--;
	} while (likely(budget));

	i += tx_ring->count;
	tx_ring->next_to_clean = i;

	ice_update_tx_ring_stats(tx_ring, total_pkts, total_bytes);

	if (ice_ring_is_xdp(tx_ring))
		return !!budget;

	netdev_tx_completed_queue(txring_txq(tx_ring), total_pkts,
				  total_bytes);

#define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2))
	if (unlikely(total_pkts && netif_carrier_ok(tx_ring->netdev) &&
		     (ICE_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
		/* Make sure that anybody stopping the queue after this
		 * sees the new next_to_clean.
		 */
		smp_mb();
		if (__netif_subqueue_stopped(tx_ring->netdev,
					     tx_ring->q_index) &&
		    !test_bit(__ICE_DOWN, vsi->state)) {
			netif_wake_subqueue(tx_ring->netdev,
					    tx_ring->q_index);
			++tx_ring->tx_stats.restart_q;
		}
	}

	return !!budget;
}

/**
 * ice_setup_tx_ring - Allocate the Tx descriptors
 * @tx_ring: the Tx ring to set up
 *
 * Return 0 on success, negative on error
 */
int ice_setup_tx_ring(struct ice_ring *tx_ring)
{
	struct device *dev = tx_ring->dev;

	if (!dev)
		return -ENOMEM;

	/* warn if we are about to overwrite the pointer */
	WARN_ON(tx_ring->tx_buf);
	tx_ring->tx_buf =
		devm_kzalloc(dev, sizeof(*tx_ring->tx_buf) * tx_ring->count,
			     GFP_KERNEL);
	if (!tx_ring->tx_buf)
		return -ENOMEM;

	/* round up to nearest page */
	tx_ring->size = ALIGN(tx_ring->count * sizeof(struct ice_tx_desc),
			      PAGE_SIZE);
	tx_ring->desc = dmam_alloc_coherent(dev, tx_ring->size, &tx_ring->dma,
					    GFP_KERNEL);
	if (!tx_ring->desc) {
		dev_err(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n",
			tx_ring->size);
		goto err;
	}

	tx_ring->next_to_use = 0;
	tx_ring->next_to_clean = 0;
	tx_ring->tx_stats.prev_pkt = -1;
	return 0;

err:
	devm_kfree(dev, tx_ring->tx_buf);
	tx_ring->tx_buf = NULL;
	return -ENOMEM;
}

/**
 * ice_clean_rx_ring - Free Rx buffers
 * @rx_ring: ring to be cleaned
 */
void ice_clean_rx_ring(struct ice_ring *rx_ring)
{
	struct device *dev = rx_ring->dev;
	u16 i;

	/* ring already cleared, nothing to do */
	if (!rx_ring->rx_buf)
		return;

	if (rx_ring->xsk_umem) {
		ice_xsk_clean_rx_ring(rx_ring);
		goto rx_skip_free;
	}

	/* Free all the Rx ring sk_buffs */
	for (i = 0; i < rx_ring->count; i++) {
		struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i];

		if (rx_buf->skb) {
			dev_kfree_skb(rx_buf->skb);
			rx_buf->skb = NULL;
		}
		if (!rx_buf->page)
			continue;

		/* Invalidate cache lines that may have been written to by
		 * device so that we avoid corrupting memory.
		 */
		dma_sync_single_range_for_cpu(dev, rx_buf->dma,
					      rx_buf->page_offset,
					      rx_ring->rx_buf_len,
					      DMA_FROM_DEVICE);

		/* free resources associated with mapping */
		dma_unmap_page_attrs(dev, rx_buf->dma, ice_rx_pg_size(rx_ring),
				     DMA_FROM_DEVICE, ICE_RX_DMA_ATTR);
		__page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias);

		rx_buf->page = NULL;
		rx_buf->page_offset = 0;
	}

rx_skip_free:
	memset(rx_ring->rx_buf, 0, sizeof(*rx_ring->rx_buf) * rx_ring->count);

	/* Zero out the descriptor ring */
	memset(rx_ring->desc, 0, rx_ring->size);

	rx_ring->next_to_alloc = 0;
	rx_ring->next_to_clean = 0;
	rx_ring->next_to_use = 0;
}

/**
 * ice_free_rx_ring - Free Rx resources
 * @rx_ring: ring to clean the resources from
 *
 * Free all receive software resources
 */
void ice_free_rx_ring(struct ice_ring *rx_ring)
{
	ice_clean_rx_ring(rx_ring);
	if (rx_ring->vsi->type == ICE_VSI_PF)
		if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
			xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
	rx_ring->xdp_prog = NULL;
	devm_kfree(rx_ring->dev, rx_ring->rx_buf);
	rx_ring->rx_buf = NULL;

	if (rx_ring->desc) {
		dmam_free_coherent(rx_ring->dev, rx_ring->size,
				   rx_ring->desc, rx_ring->dma);
		rx_ring->desc = NULL;
	}
}

/**
 * ice_setup_rx_ring - Allocate the Rx descriptors
 * @rx_ring: the Rx ring to set up
 *
 * Return 0 on success, negative on error
 */
int ice_setup_rx_ring(struct ice_ring *rx_ring)
{
	struct device *dev = rx_ring->dev;

	if (!dev)
		return -ENOMEM;

	/* warn if we are about to overwrite the pointer */
	WARN_ON(rx_ring->rx_buf);
	rx_ring->rx_buf =
		devm_kzalloc(dev, sizeof(*rx_ring->rx_buf) * rx_ring->count,
			     GFP_KERNEL);
	if (!rx_ring->rx_buf)
		return -ENOMEM;

	/* round up to nearest page */
	rx_ring->size = ALIGN(rx_ring->count * sizeof(union ice_32byte_rx_desc),
			      PAGE_SIZE);
	rx_ring->desc = dmam_alloc_coherent(dev, rx_ring->size, &rx_ring->dma,
					    GFP_KERNEL);
	if (!rx_ring->desc) {
		dev_err(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
			rx_ring->size);
		goto err;
	}

	rx_ring->next_to_use = 0;
	rx_ring->next_to_clean = 0;

	if (ice_is_xdp_ena_vsi(rx_ring->vsi))
		WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog);

	if (rx_ring->vsi->type == ICE_VSI_PF &&
	    !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
		if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
				     rx_ring->q_index))
			goto err;
	return 0;

err:
	devm_kfree(dev, rx_ring->rx_buf);
	rx_ring->rx_buf = NULL;
	return -ENOMEM;
}

/**
 * ice_rx_offset - Return expected offset into page to access data
 * @rx_ring: Ring we are requesting offset of
 *
 * Returns the offset value for ring into the data buffer.
 */
static unsigned int ice_rx_offset(struct ice_ring *rx_ring)
{
	if (ice_ring_uses_build_skb(rx_ring))
		return ICE_SKB_PAD;
	else if (ice_is_xdp_ena_vsi(rx_ring->vsi))
		return XDP_PACKET_HEADROOM;

	return 0;
}

/**
 * ice_run_xdp - Executes an XDP program on initialized xdp_buff
 * @rx_ring: Rx ring
 * @xdp: xdp_buff used as input to the XDP program
 * @xdp_prog: XDP program to run
 *
 * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR}
 */
static int
ice_run_xdp(struct ice_ring *rx_ring, struct xdp_buff *xdp,
	    struct bpf_prog *xdp_prog)
{
	int err, result = ICE_XDP_PASS;
	struct ice_ring *xdp_ring;
	u32 act;

	act = bpf_prog_run_xdp(xdp_prog, xdp);
	switch (act) {
	case XDP_PASS:
		break;
	case XDP_TX:
		xdp_ring = rx_ring->vsi->xdp_rings[smp_processor_id()];
		result = ice_xmit_xdp_buff(xdp, xdp_ring);
		break;
	case XDP_REDIRECT:
		err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
		result = !err ? ICE_XDP_REDIR : ICE_XDP_CONSUMED;
		break;
	default:
		bpf_warn_invalid_xdp_action(act);
		/* fallthrough -- not supported action */
	case XDP_ABORTED:
		trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
		/* fallthrough -- handle aborts by dropping frame */
	case XDP_DROP:
		result = ICE_XDP_CONSUMED;
		break;
	}

	return result;
}

/**
 * ice_xdp_xmit - submit packets to XDP ring for transmission
 * @dev: netdev
 * @n: number of XDP frames to be transmitted
 * @frames: XDP frames to be transmitted
 * @flags: transmit flags
 *
 * Returns number of frames successfully sent. Frames that fail are
 * free'ed via XDP return API.
 * For error cases, a negative errno code is returned and no-frames
 * are transmitted (caller must handle freeing frames).
 */
int
ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
	     u32 flags)
{
	struct ice_netdev_priv *np = netdev_priv(dev);
	unsigned int queue_index = smp_processor_id();
	struct ice_vsi *vsi = np->vsi;
	struct ice_ring *xdp_ring;
	int drops = 0, i;

	if (test_bit(__ICE_DOWN, vsi->state))
		return -ENETDOWN;

	if (!ice_is_xdp_ena_vsi(vsi) || queue_index >= vsi->num_xdp_txq)
		return -ENXIO;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	xdp_ring = vsi->xdp_rings[queue_index];
	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];
		int err;

		err = ice_xmit_xdp_ring(xdpf->data, xdpf->len, xdp_ring);
		if (err != ICE_XDP_TX) {
			xdp_return_frame_rx_napi(xdpf);
			drops++;
		}
	}

	if (unlikely(flags & XDP_XMIT_FLUSH))
		ice_xdp_ring_update_tail(xdp_ring);

	return n - drops;
}

/**
 * ice_alloc_mapped_page - recycle or make a new page
 * @rx_ring: ring to use
 * @bi: rx_buf struct to modify
 *
 * Returns true if the page was successfully allocated or
 * reused.
 */
static bool
ice_alloc_mapped_page(struct ice_ring *rx_ring, struct ice_rx_buf *bi)
{
	struct page *page = bi->page;
	dma_addr_t dma;

	/* since we are recycling buffers we should seldom need to alloc */
	if (likely(page)) {
		rx_ring->rx_stats.page_reuse_count++;
		return true;
	}

	/* alloc new page for storage */
	page = dev_alloc_pages(ice_rx_pg_order(rx_ring));
	if (unlikely(!page)) {
		rx_ring->rx_stats.alloc_page_failed++;
		return false;
	}

	/* map page for use */
	dma = dma_map_page_attrs(rx_ring->dev, page, 0, ice_rx_pg_size(rx_ring),
				 DMA_FROM_DEVICE, ICE_RX_DMA_ATTR);

	/* if mapping failed free memory back to system since
	 * there isn't much point in holding memory we can't use
	 */
	if (dma_mapping_error(rx_ring->dev, dma)) {
		__free_pages(page, ice_rx_pg_order(rx_ring));
		rx_ring->rx_stats.alloc_page_failed++;
		return false;
	}

	bi->dma = dma;
	bi->page = page;
	bi->page_offset = ice_rx_offset(rx_ring);
	page_ref_add(page, USHRT_MAX - 1);
	bi->pagecnt_bias = USHRT_MAX;

	return true;
}

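/* Illustrative note on the pagecnt_bias scheme above: page_ref_add() bumps
 * the struct page refcount to USHRT_MAX up front, and the driver then
 * tracks the references it hands out in bi->pagecnt_bias. Consuming a
 * buffer (see ice_get_rx_buf()) only decrements the bias, a plain u16, so
 * the hot path avoids an atomic page refcount operation per packet; the
 * two counters are reconciled in ice_can_reuse_rx_page().
 */
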
/**
 * ice_alloc_rx_bufs - Replace used receive buffers
 * @rx_ring: ring to place buffers on
 * @cleaned_count: number of buffers to replace
 *
 * Returns false if all allocations were successful, true if any fail. Returning
 * true signals to the caller that we didn't replace cleaned_count buffers and
 * there is more work to do.
 *
 * First, try to clean "cleaned_count" Rx buffers. Then refill the cleaned Rx
 * buffers. Then bump tail at most one time. Grouping like this lets us avoid
 * multiple tail writes per call.
 */
bool ice_alloc_rx_bufs(struct ice_ring *rx_ring, u16 cleaned_count)
{
	union ice_32b_rx_flex_desc *rx_desc;
	u16 ntu = rx_ring->next_to_use;
	struct ice_rx_buf *bi;

	/* do nothing if no valid netdev defined */
	if (!rx_ring->netdev || !cleaned_count)
		return false;

	/* get the Rx descriptor and buffer based on next_to_use */
	rx_desc = ICE_RX_DESC(rx_ring, ntu);
	bi = &rx_ring->rx_buf[ntu];

	do {
		/* if we fail here, we have work remaining */
		if (!ice_alloc_mapped_page(rx_ring, bi))
			break;

		/* sync the buffer for use by the device */
		dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
						 bi->page_offset,
						 rx_ring->rx_buf_len,
						 DMA_FROM_DEVICE);

		/* Refresh the desc even if buffer_addrs didn't change
		 * because each write-back erases this info.
		 */
		rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);

		rx_desc++;
		bi++;
		ntu++;
		if (unlikely(ntu == rx_ring->count)) {
			rx_desc = ICE_RX_DESC(rx_ring, 0);
			bi = rx_ring->rx_buf;
			ntu = 0;
		}

		/* clear the status bits for the next_to_use descriptor */
		rx_desc->wb.status_error0 = 0;

		cleaned_count--;
	} while (cleaned_count);

	if (rx_ring->next_to_use != ntu)
		ice_release_rx_desc(rx_ring, ntu);

	return !!cleaned_count;
}

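/* Usage note: the caller typically passes ICE_DESC_UNUSED(rx_ring) as
 * cleaned_count; ice_clean_rx_irq() below accumulates the count while
 * consuming descriptors and refills once per poll, so the tail register
 * is written at most one time per invocation, as the comment above says.
 */
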
/**
 * ice_page_is_reserved - check if reuse is possible
 * @page: page struct to check
 */
static bool ice_page_is_reserved(struct page *page)
{
	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
}

/**
 * ice_rx_buf_adjust_pg_offset - Prepare Rx buffer for reuse
 * @rx_buf: Rx buffer to adjust
 * @size: Size of adjustment
 *
 * Update the offset within page so that Rx buf will be ready to be reused.
 * For systems with PAGE_SIZE < 8192 this function will flip the page offset
 * so the second half of page assigned to Rx buffer will be used, otherwise
 * the offset is moved by "size" bytes
 */
static void
ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size)
{
#if (PAGE_SIZE < 8192)
	/* flip page offset to other buffer */
	rx_buf->page_offset ^= size;
#else
	/* move offset up to the next cache line */
	rx_buf->page_offset += size;
#endif
}

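/* Worked example (assuming a 4K PAGE_SIZE and 2K half-page buffers): the
 * XOR flip above toggles page_offset between the two halves, 0 ^ 2048 ==
 * 2048 and 2048 ^ 2048 == 0, so successive reuses of the same page
 * ping-pong between its first and second half.
 */
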
/**
 * ice_can_reuse_rx_page - Determine if page can be reused for another Rx
 * @rx_buf: buffer containing the page
 *
 * If page is reusable, we have a green light for calling ice_reuse_rx_page,
 * which will assign the current buffer to the buffer that next_to_alloc is
 * pointing to; otherwise, the DMA mapping needs to be destroyed and
 * page freed
 */
static bool ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf)
{
	unsigned int pagecnt_bias = rx_buf->pagecnt_bias;
	struct page *page = rx_buf->page;

	/* avoid re-using remote pages */
	if (unlikely(ice_page_is_reserved(page)))
		return false;

#if (PAGE_SIZE < 8192)
	/* if we are only owner of page we can reuse it */
	if (unlikely((page_count(page) - pagecnt_bias) > 1))
		return false;
#else
#define ICE_LAST_OFFSET \
	(SKB_WITH_OVERHEAD(PAGE_SIZE) - ICE_RXBUF_2048)
	if (rx_buf->page_offset > ICE_LAST_OFFSET)
		return false;
#endif /* PAGE_SIZE < 8192) */

	/* If we have drained the page fragment pool we need to update
	 * the pagecnt_bias and page count so that we fully restock the
	 * number of references the driver holds.
	 */
	if (unlikely(pagecnt_bias == 1)) {
		page_ref_add(page, USHRT_MAX - 1);
		rx_buf->pagecnt_bias = USHRT_MAX;
	}

	return true;
}

/**
 * ice_add_rx_frag - Add contents of Rx buffer to sk_buff as a frag
 * @rx_ring: Rx descriptor ring to transact packets on
 * @rx_buf: buffer containing page to add
 * @skb: sk_buff to place the data into
 * @size: packet length from rx_desc
 *
 * This function will add the data contained in rx_buf->page to the skb.
 * It will just attach the page as a frag to the skb.
 * The function will then update the page offset.
 */
static void
ice_add_rx_frag(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
		struct sk_buff *skb, unsigned int size)
{
#if (PAGE_SIZE >= 8192)
	unsigned int truesize = SKB_DATA_ALIGN(size + ice_rx_offset(rx_ring));
#else
	unsigned int truesize = ice_rx_pg_size(rx_ring) / 2;
#endif

	if (!size)
		return;
	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buf->page,
			rx_buf->page_offset, size, truesize);

	/* page is being used so we must update the page offset */
	ice_rx_buf_adjust_pg_offset(rx_buf, truesize);
}

/**
 * ice_reuse_rx_page - page flip buffer and store it back on the ring
 * @rx_ring: Rx descriptor ring to store buffers on
 * @old_buf: donor buffer to have page reused
 *
 * Synchronizes page for reuse by the adapter
 */
static void
ice_reuse_rx_page(struct ice_ring *rx_ring, struct ice_rx_buf *old_buf)
{
	u16 nta = rx_ring->next_to_alloc;
	struct ice_rx_buf *new_buf;

	new_buf = &rx_ring->rx_buf[nta];

	/* update, and store next to alloc */
	nta++;
	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;

	/* Transfer page from old buffer to new buffer.
	 * Move each member individually to avoid possible store
	 * forwarding stalls and unnecessary copy of skb.
	 */
	new_buf->dma = old_buf->dma;
	new_buf->page = old_buf->page;
	new_buf->page_offset = old_buf->page_offset;
	new_buf->pagecnt_bias = old_buf->pagecnt_bias;
}

/**
 * ice_get_rx_buf - Fetch Rx buffer and synchronize data for use
 * @rx_ring: Rx descriptor ring to transact packets on
 * @skb: skb to be used
 * @size: size of buffer to add to skb
 *
 * This function will pull an Rx buffer from the ring and synchronize it
 * for use by the CPU.
 */
static struct ice_rx_buf *
ice_get_rx_buf(struct ice_ring *rx_ring, struct sk_buff **skb,
	       const unsigned int size)
{
	struct ice_rx_buf *rx_buf;

	rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
	prefetchw(rx_buf->page);
	*skb = rx_buf->skb;

	if (!size)
		return rx_buf;
	/* we are reusing so sync this buffer for CPU use */
	dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma,
				      rx_buf->page_offset, size,
				      DMA_FROM_DEVICE);

	/* We have pulled a buffer for use, so decrement pagecnt_bias */
	rx_buf->pagecnt_bias--;

	return rx_buf;
}

/**
 * ice_build_skb - Build skb around an existing buffer
 * @rx_ring: Rx descriptor ring to transact packets on
 * @rx_buf: Rx buffer to pull data from
 * @xdp: xdp_buff pointing to the data
 *
 * This function builds an skb around an existing Rx buffer, taking care
 * to set up the skb correctly and avoid any memcpy overhead.
 */
static struct sk_buff *
ice_build_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
	      struct xdp_buff *xdp)
{
	unsigned int metasize = xdp->data - xdp->data_meta;
#if (PAGE_SIZE < 8192)
	unsigned int truesize = ice_rx_pg_size(rx_ring) / 2;
#else
	unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
				SKB_DATA_ALIGN(xdp->data_end -
					       xdp->data_hard_start);
#endif
	struct sk_buff *skb;

	/* Prefetch first cache line of first page. If xdp->data_meta
	 * is unused, this points exactly as xdp->data, otherwise we
	 * likely have a consumer accessing first few bytes of meta
	 * data, and then actual data.
	 */
	prefetch(xdp->data_meta);
#if L1_CACHE_BYTES < 128
	prefetch((void *)(xdp->data + L1_CACHE_BYTES));
#endif
	/* build an skb around the page buffer */
	skb = build_skb(xdp->data_hard_start, truesize);
	if (unlikely(!skb))
		return NULL;

	/* must record Rx queue, otherwise OS features such as
	 * symmetric queue won't work
	 */
	skb_record_rx_queue(skb, rx_ring->q_index);

	/* update pointers within the skb to store the data */
	skb_reserve(skb, xdp->data - xdp->data_hard_start);
	__skb_put(skb, xdp->data_end - xdp->data);
	if (metasize)
		skb_metadata_set(skb, metasize);

	/* buffer is used by skb, update page_offset */
	ice_rx_buf_adjust_pg_offset(rx_buf, truesize);

	return skb;
}

/**
 * ice_construct_skb - Allocate skb and populate it
 * @rx_ring: Rx descriptor ring to transact packets on
 * @rx_buf: Rx buffer to pull data from
 * @xdp: xdp_buff pointing to the data
 *
 * This function allocates an skb. It then populates it with the page
 * data from the current receive descriptor, taking care to set up the
 * skb correctly.
 */
static struct sk_buff *
ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
		  struct xdp_buff *xdp)
{
	unsigned int size = xdp->data_end - xdp->data;
	unsigned int headlen;
	struct sk_buff *skb;

	/* prefetch first cache line of first page */
	prefetch(xdp->data);
#if L1_CACHE_BYTES < 128
	prefetch((void *)(xdp->data + L1_CACHE_BYTES));
#endif /* L1_CACHE_BYTES */

	/* allocate a skb to store the frags */
	skb = __napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE,
			       GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!skb))
		return NULL;

	skb_record_rx_queue(skb, rx_ring->q_index);
	/* Determine available headroom for copy */
	headlen = size;
	if (headlen > ICE_RX_HDR_SIZE)
		headlen = eth_get_headlen(skb->dev, xdp->data, ICE_RX_HDR_SIZE);

	/* align pull length to size of long to optimize memcpy performance */
	memcpy(__skb_put(skb, headlen), xdp->data, ALIGN(headlen,
							 sizeof(long)));

	/* if we exhaust the linear part then add what is left as a frag */
	size -= headlen;
	if (size) {
#if (PAGE_SIZE >= 8192)
		unsigned int truesize = SKB_DATA_ALIGN(size);
#else
		unsigned int truesize = ice_rx_pg_size(rx_ring) / 2;
#endif
		skb_add_rx_frag(skb, 0, rx_buf->page,
				rx_buf->page_offset + headlen, size, truesize);
		/* buffer is used by skb, update page_offset */
		ice_rx_buf_adjust_pg_offset(rx_buf, truesize);
	} else {
		/* buffer is unused, reset bias back to rx_buf; data was copied
		 * onto skb's linear part so there's no need for adjusting
		 * page offset and we can reuse this buffer as-is
		 */
		rx_buf->pagecnt_bias++;
	}

	return skb;
}

/**
 * ice_put_rx_buf - Clean up used buffer and either recycle or free
 * @rx_ring: Rx descriptor ring to transact packets on
 * @rx_buf: Rx buffer to pull data from
 *
 * This function will update next_to_clean and then clean up the contents
 * of the rx_buf. It will either recycle the buffer or unmap it and free
 * the associated resources.
 */
static void ice_put_rx_buf(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
{
	u32 ntc = rx_ring->next_to_clean + 1;

	/* fetch, update, and store next to clean */
	ntc = (ntc < rx_ring->count) ? ntc : 0;
	rx_ring->next_to_clean = ntc;

	if (!rx_buf)
		return;

	if (ice_can_reuse_rx_page(rx_buf)) {
		/* hand second half of page back to the ring */
		ice_reuse_rx_page(rx_ring, rx_buf);
		rx_ring->rx_stats.page_reuse_count++;
	} else {
		/* we are not reusing the buffer so unmap it */
		dma_unmap_page_attrs(rx_ring->dev, rx_buf->dma,
				     ice_rx_pg_size(rx_ring), DMA_FROM_DEVICE,
				     ICE_RX_DMA_ATTR);
		__page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias);
	}

	/* clear contents of buffer_info */
	rx_buf->page = NULL;
	rx_buf->skb = NULL;
}

/**
 * ice_is_non_eop - process handling of non-EOP buffers
 * @rx_ring: Rx ring being processed
 * @rx_desc: Rx descriptor for current buffer
 * @skb: Current socket buffer containing buffer in progress
 *
 * If the buffer is an EOP buffer, this function exits returning false,
 * otherwise return true indicating that this is in fact a non-EOP buffer.
 */
static bool
ice_is_non_eop(struct ice_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc,
	       struct sk_buff *skb)
{
	/* if we are the last buffer then there is nothing else to do */
#define ICE_RXD_EOF BIT(ICE_RX_FLEX_DESC_STATUS0_EOF_S)
	if (likely(ice_test_staterr(rx_desc, ICE_RXD_EOF)))
		return false;

	/* place skb in next buffer to be received */
	rx_ring->rx_buf[rx_ring->next_to_clean].skb = skb;
	rx_ring->rx_stats.non_eop_descs++;

	return true;
}

/**
 * ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
 * @rx_ring: Rx descriptor ring to transact packets on
 * @budget: Total limit on number of packets to process
 *
 * This function provides a "bounce buffer" approach to Rx interrupt
 * processing. The advantage to this is that on systems that have
 * expensive overhead for IOMMU access this provides a means of avoiding
 * it by maintaining the mapping of the page to the system.
 *
 * Returns amount of work completed
 */
static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget)
{
	unsigned int total_rx_bytes = 0, total_rx_pkts = 0;
	u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
	unsigned int xdp_res, xdp_xmit = 0;
	struct bpf_prog *xdp_prog = NULL;
	struct xdp_buff xdp;
	bool failure;

	xdp.rxq = &rx_ring->xdp_rxq;

	/* start the loop to process Rx packets bounded by 'budget' */
	while (likely(total_rx_pkts < (unsigned int)budget)) {
		union ice_32b_rx_flex_desc *rx_desc;
		struct ice_rx_buf *rx_buf;
		struct sk_buff *skb;
		unsigned int size;
		u16 stat_err_bits;
		u16 vlan_tag = 0;
		u8 rx_ptype;

		/* get the Rx desc from Rx ring based on 'next_to_clean' */
		rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean);

		/* status_error_len will always be zero for unused descriptors
		 * because it's cleared in cleanup, and overlaps with hdr_addr
		 * which is always zero because packet split isn't used, if the
		 * hardware wrote DD then it will be non-zero
		 */
		stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S);
		if (!ice_test_staterr(rx_desc, stat_err_bits))
			break;

		/* This memory barrier is needed to keep us from reading
		 * any other fields out of the rx_desc until we know the
		 * DD bit is set.
		 */
		dma_rmb();

		size = le16_to_cpu(rx_desc->wb.pkt_len) &
			ICE_RX_FLX_DESC_PKT_LEN_M;

		/* retrieve a buffer from the ring */
		rx_buf = ice_get_rx_buf(rx_ring, &skb, size);

		if (!size) {
			xdp.data = NULL;
			xdp.data_end = NULL;
			xdp.data_hard_start = NULL;
			xdp.data_meta = NULL;
			goto construct_skb;
		}

		xdp.data = page_address(rx_buf->page) + rx_buf->page_offset;
		xdp.data_hard_start = xdp.data - ice_rx_offset(rx_ring);
		xdp.data_meta = xdp.data;
		xdp.data_end = xdp.data + size;

		rcu_read_lock();
		xdp_prog = READ_ONCE(rx_ring->xdp_prog);
		if (!xdp_prog) {
			rcu_read_unlock();
			goto construct_skb;
		}

		xdp_res = ice_run_xdp(rx_ring, &xdp, xdp_prog);
		rcu_read_unlock();
		if (!xdp_res)
			goto construct_skb;
		if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) {
			unsigned int truesize;

#if (PAGE_SIZE < 8192)
			truesize = ice_rx_pg_size(rx_ring) / 2;
#else
			truesize = SKB_DATA_ALIGN(ice_rx_offset(rx_ring) +
						  size);
#endif
			xdp_xmit |= xdp_res;
			ice_rx_buf_adjust_pg_offset(rx_buf, truesize);
		} else {
			rx_buf->pagecnt_bias++;
		}
		total_rx_bytes += size;
		total_rx_pkts++;

		cleaned_count++;
		ice_put_rx_buf(rx_ring, rx_buf);
		continue;
construct_skb:
		if (skb) {
			ice_add_rx_frag(rx_ring, rx_buf, skb, size);
		} else if (likely(xdp.data)) {
			if (ice_ring_uses_build_skb(rx_ring))
				skb = ice_build_skb(rx_ring, rx_buf, &xdp);
			else
				skb = ice_construct_skb(rx_ring, rx_buf, &xdp);
		}
		/* exit if we failed to retrieve a buffer */
		if (!skb) {
			rx_ring->rx_stats.alloc_buf_failed++;
			if (rx_buf)
				rx_buf->pagecnt_bias++;
			break;
		}

		ice_put_rx_buf(rx_ring, rx_buf);
		cleaned_count++;

		/* skip if it is NOP desc */
		if (ice_is_non_eop(rx_ring, rx_desc, skb))
			continue;

		stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
		if (unlikely(ice_test_staterr(rx_desc, stat_err_bits))) {
			dev_kfree_skb_any(skb);
			continue;
		}

		stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S);
		if (ice_test_staterr(rx_desc, stat_err_bits))
			vlan_tag = le16_to_cpu(rx_desc->wb.l2tag1);

		/* pad the skb if needed, to make a valid ethernet frame */
		if (eth_skb_pad(skb)) {
			skb = NULL;
			continue;
		}

		/* probably a little skewed due to removing CRC */
		total_rx_bytes += skb->len;

		/* populate checksum, VLAN, and protocol */
		rx_ptype = le16_to_cpu(rx_desc->wb.ptype_flex_flags0) &
			ICE_RX_FLEX_DESC_PTYPE_M;

		ice_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);

		/* send completed skb up the stack */
		ice_receive_skb(rx_ring, skb, vlan_tag);

		/* update budget accounting */
		total_rx_pkts++;
	}

	/* return up to cleaned_count buffers to hardware */
	failure = ice_alloc_rx_bufs(rx_ring, cleaned_count);

	if (xdp_prog)
		ice_finalize_xdp_rx(rx_ring, xdp_xmit);

	ice_update_rx_ring_stats(rx_ring, total_rx_pkts, total_rx_bytes);

	/* guarantee a trip back through this routine if there was a failure */
	return failure ? budget : (int)total_rx_pkts;
}

/**
 * ice_adjust_itr_by_size_and_speed - Adjust ITR based on current traffic
 * @port_info: port_info structure containing the current link speed
 * @avg_pkt_size: average size of Tx or Rx packets based on clean routine
 * @itr: ITR value to update
 *
 * Calculate how big of an increment should be applied to the ITR value passed
 * in based on wmem_default, SKB overhead, Ethernet overhead, and the current
 * link speed.
 *
 * The following is a calculation derived from:
 *   wmem_default / (size + overhead) = desired_pkts_per_int
 *   rate / bits_per_byte / (size + Ethernet overhead) = pkt_rate
 *   (desired_pkt_rate / pkt_rate) * usecs_per_sec = ITR value
 *
 * Assuming wmem_default is 212992 and overhead is 640 bytes per
 * packet, (256 skb, 64 headroom, 320 shared info), we can reduce the
 * formula down to
 *
 *	 wmem_default * bits_per_byte * usecs_per_sec   pkt_size + 24
 * ITR = -------------------------------------------- * --------------
 *			     rate			pkt_size + 640
 */
static unsigned int
ice_adjust_itr_by_size_and_speed(struct ice_port_info *port_info,
				 unsigned int avg_pkt_size,
				 unsigned int itr)
{
	switch (port_info->phy.link_info.link_speed) {
	case ICE_AQ_LINK_SPEED_100GB:
		itr += DIV_ROUND_UP(17 * (avg_pkt_size + 24),
				    avg_pkt_size + 640);
		break;
	case ICE_AQ_LINK_SPEED_50GB:
		itr += DIV_ROUND_UP(34 * (avg_pkt_size + 24),
				    avg_pkt_size + 640);
		break;
	case ICE_AQ_LINK_SPEED_40GB:
		itr += DIV_ROUND_UP(43 * (avg_pkt_size + 24),
				    avg_pkt_size + 640);
		break;
	case ICE_AQ_LINK_SPEED_25GB:
		itr += DIV_ROUND_UP(68 * (avg_pkt_size + 24),
				    avg_pkt_size + 640);
		break;
	case ICE_AQ_LINK_SPEED_20GB:
		itr += DIV_ROUND_UP(85 * (avg_pkt_size + 24),
				    avg_pkt_size + 640);
		break;
	case ICE_AQ_LINK_SPEED_10GB:
		/* fall through */
	default:
		itr += DIV_ROUND_UP(170 * (avg_pkt_size + 24),
				    avg_pkt_size + 640);
		break;
	}

	if ((itr & ICE_ITR_MASK) > ICE_ITR_ADAPTIVE_MAX_USECS) {
		itr &= ICE_ITR_ADAPTIVE_LATENCY;
		itr += ICE_ITR_ADAPTIVE_MAX_USECS;
	}

	return itr;
}

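/* Worked example for the per-speed multipliers above (assuming the
 * wmem_default of 212992 from the formula in the comment): at 25 Gbps the
 * multiplier is 68 because 212992 * 8 / 25000 rounds to 68. With an
 * average packet size of 1500 bytes the increment is
 * DIV_ROUND_UP(68 * 1524, 2140) = 49 usecs, which the clamp below then
 * caps at ICE_ITR_ADAPTIVE_MAX_USECS if needed.
 */
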
/**
 * ice_update_itr - update the adaptive ITR value based on statistics
 * @q_vector: structure containing interrupt and ring information
 * @rc: structure containing ring performance data
 *
 * Stores a new ITR value based on packets and byte
 * counts during the last interrupt. The advantage of per interrupt
 * computation is faster updates and more accurate ITR for the current
 * traffic pattern. Constants in this function were computed
 * based on theoretical maximum wire speed and thresholds were set based
 * on testing data as well as attempting to minimize response time
 * while increasing bulk throughput.
 */
static void
ice_update_itr(struct ice_q_vector *q_vector, struct ice_ring_container *rc)
{
	unsigned long next_update = jiffies;
	unsigned int packets, bytes, itr;
	bool container_is_rx;

	if (!rc->ring || !ITR_IS_DYNAMIC(rc->itr_setting))
		return;

	/* If itr_countdown is set it means we programmed an ITR within
	 * the last 4 interrupt cycles. This has a side effect of us
	 * potentially firing an early interrupt. In order to work around
	 * this we need to throw out any data received for a few
	 * interrupts following the update.
	 */
	if (q_vector->itr_countdown) {
		itr = rc->target_itr;
		goto clear_counts;
	}

	container_is_rx = (&q_vector->rx == rc);
	/* For Rx we want to push the delay up and default to low latency.
	 * for Tx we want to pull the delay down and default to high latency.
	 */
	itr = container_is_rx ?
		ICE_ITR_ADAPTIVE_MIN_USECS | ICE_ITR_ADAPTIVE_LATENCY :
		ICE_ITR_ADAPTIVE_MAX_USECS | ICE_ITR_ADAPTIVE_LATENCY;

	/* If we didn't update within up to 1 - 2 jiffies we can assume
	 * that either packets are coming in so slow there hasn't been
	 * any work, or that there is so much work that NAPI is dealing
	 * with interrupt moderation and we don't need to do anything.
	 */
	if (time_after(next_update, rc->next_update))
		goto clear_counts;

	prefetch(q_vector->vsi->port_info);

	packets = rc->total_pkts;
	bytes = rc->total_bytes;

	if (container_is_rx) {
		/* If Rx there are 1 to 4 packets and bytes are less than
		 * 9000 assume insufficient data to use bulk rate limiting
		 * approach unless Tx is already in bulk rate limiting. We
		 * are likely latency driven.
		 */
		if (packets && packets < 4 && bytes < 9000 &&
		    (q_vector->tx.target_itr & ICE_ITR_ADAPTIVE_LATENCY)) {
			itr = ICE_ITR_ADAPTIVE_LATENCY;
			goto adjust_by_size_and_speed;
		}
	} else if (packets < 4) {
		/* If we have Tx and Rx ITR maxed and Tx ITR is running in
		 * bulk mode and we are receiving 4 or fewer packets just
		 * reset the ITR_ADAPTIVE_LATENCY bit for latency mode so
		 * that the Rx can relax.
		 */
		if (rc->target_itr == ICE_ITR_ADAPTIVE_MAX_USECS &&
		    (q_vector->rx.target_itr & ICE_ITR_MASK) ==
		    ICE_ITR_ADAPTIVE_MAX_USECS)
			goto clear_counts;
	} else if (packets > 32) {
		/* If we have processed over 32 packets in a single interrupt
		 * for Tx assume we need to switch over to "bulk" mode.
		 */
		rc->target_itr &= ~ICE_ITR_ADAPTIVE_LATENCY;
	}

	/* We have no packets to actually measure against. This means
	 * either one of the other queues on this vector is active or
	 * we are a Tx queue doing TSO with too high of an interrupt rate.
	 *
	 * Between 4 and 56 we can assume that our current interrupt delay
	 * is only slightly too low. As such we should increase it by a small
	 * fixed amount.
	 */
	if (packets < 56) {
		itr = rc->target_itr + ICE_ITR_ADAPTIVE_MIN_INC;
		if ((itr & ICE_ITR_MASK) > ICE_ITR_ADAPTIVE_MAX_USECS) {
			itr &= ICE_ITR_ADAPTIVE_LATENCY;
			itr += ICE_ITR_ADAPTIVE_MAX_USECS;
		}
		goto clear_counts;
	}

	if (packets <= 256) {
		itr = min(q_vector->tx.current_itr, q_vector->rx.current_itr);
		itr &= ICE_ITR_MASK;

		/* Between 56 and 112 is our "goldilocks" zone where we are
		 * working out "just right". Just report that our current
		 * ITR is good for us.
		 */
		if (packets <= 112)
			goto clear_counts;

		/* If packet count is 128 or greater we are likely looking
		 * at a slight overrun of the delay we want. Try halving
		 * our delay to see if that will cut the number of packets
		 * in half per interrupt.
		 */
		itr >>= 1;
		itr &= ICE_ITR_MASK;
		if (itr < ICE_ITR_ADAPTIVE_MIN_USECS)
			itr = ICE_ITR_ADAPTIVE_MIN_USECS;

		goto clear_counts;
	}

	/* The paths below assume we are dealing with a bulk ITR since
	 * number of packets is greater than 256. We are just going to have
	 * to compute a value and try to bring the count under control,
	 * though for smaller packet sizes there isn't much we can do as
	 * NAPI polling will likely be kicking in sooner rather than later.
	 */
	itr = ICE_ITR_ADAPTIVE_BULK;

adjust_by_size_and_speed:

	/* based on checks above packets cannot be 0 so division is safe */
	itr = ice_adjust_itr_by_size_and_speed(q_vector->vsi->port_info,
					       bytes / packets, itr);

clear_counts:
	/* write back value */
	rc->target_itr = itr;

	/* next update should occur within next jiffy */
	rc->next_update = next_update + 1;

	rc->total_bytes = 0;
	rc->total_pkts = 0;
}

/**
 * ice_buildreg_itr - build value for writing to the GLINT_DYN_CTL register
 * @itr_idx: interrupt throttling index
 * @itr: interrupt throttling value in usecs
 */
static u32 ice_buildreg_itr(u16 itr_idx, u16 itr)
{
	/* The ITR value is reported in microseconds, and the register value is
	 * recorded in 2 microsecond units. For this reason we only need to
	 * shift by the GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S to apply this
	 * granularity as a shift instead of division. The mask makes sure the
	 * ITR value is never odd so we don't accidentally write into the field
	 * prior to the ITR field.
	 */
	itr &= ICE_ITR_MASK;

	return GLINT_DYN_CTL_INTENA_M | GLINT_DYN_CTL_CLEARPBA_M |
		(itr_idx << GLINT_DYN_CTL_ITR_INDX_S) |
		(itr << (GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S));
}

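/* Worked example (assuming the usual GLINT_DYN_CTL layout: INTENA at bit
 * 0, CLEARPBA at bit 1, ITR_INDX at bit 3 and INTERVAL at bit 5 in 2 usec
 * units): ice_buildreg_itr(1, 50) would yield
 * 0x1 | 0x2 | (1 << 3) | (50 << 4) = 0x32b, i.e. an interval field of 25
 * two-usec ticks, programming a 50 usec throttle on ITR index 1.
 */
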
/* The act of updating the ITR will cause it to immediately trigger. In order
 * to prevent this from throwing off adaptive update statistics we defer the
 * update so that it can only happen so often. So after either Tx or Rx are
 * updated we make the adaptive scheme wait until either the ITR completely
 * expires via the next_update expiration or we have been through at least
 * 3 interrupts.
 */
#define ITR_COUNTDOWN_START 3

/**
 * ice_update_ena_itr - Update ITR and re-enable MSIX interrupt
 * @q_vector: q_vector for which ITR is being updated and interrupt enabled
 */
static void ice_update_ena_itr(struct ice_q_vector *q_vector)
{
	struct ice_ring_container *tx = &q_vector->tx;
	struct ice_ring_container *rx = &q_vector->rx;
	struct ice_vsi *vsi = q_vector->vsi;
	u32 itr_val;

	/* when exiting WB_ON_ITR lets set a low ITR value and trigger
	 * interrupts to expire right away in case we have more work ready to go
	 * already
	 */
	if (q_vector->itr_countdown == ICE_IN_WB_ON_ITR_MODE) {
		itr_val = ice_buildreg_itr(rx->itr_idx, ICE_WB_ON_ITR_USECS);
		wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), itr_val);
		/* set target back to last user set value */
		rx->target_itr = rx->itr_setting;
		/* set current to what we just wrote and dynamic if needed */
		rx->current_itr = ICE_WB_ON_ITR_USECS |
			(rx->itr_setting & ICE_ITR_DYNAMIC);
		/* allow normal interrupt flow to start */
		q_vector->itr_countdown = 0;
		return;
	}

	/* This will do nothing if dynamic updates are not enabled */
	ice_update_itr(q_vector, tx);
	ice_update_itr(q_vector, rx);

	/* This block of logic allows us to get away with only updating
	 * one ITR value with each interrupt. The idea is to perform a
	 * pseudo-lazy update with the following criteria.
	 *
	 * 1. Rx is given higher priority than Tx if both are in same state
	 * 2. If we must reduce an ITR that is given highest priority.
	 * 3. We then give priority to increasing ITR based on amount.
	 */
	if (rx->target_itr < rx->current_itr) {
		/* Rx ITR needs to be reduced, this is highest priority */
		itr_val = ice_buildreg_itr(rx->itr_idx, rx->target_itr);
		rx->current_itr = rx->target_itr;
		q_vector->itr_countdown = ITR_COUNTDOWN_START;
	} else if ((tx->target_itr < tx->current_itr) ||
		   ((rx->target_itr - rx->current_itr) <
		    (tx->target_itr - tx->current_itr))) {
		/* Tx ITR needs to be reduced, this is second priority
		 * Tx ITR needs to be increased more than Rx, fourth priority
		 */
		itr_val = ice_buildreg_itr(tx->itr_idx, tx->target_itr);
		tx->current_itr = tx->target_itr;
		q_vector->itr_countdown = ITR_COUNTDOWN_START;
	} else if (rx->current_itr != rx->target_itr) {
		/* Rx ITR needs to be increased, third priority */
		itr_val = ice_buildreg_itr(rx->itr_idx, rx->target_itr);
		rx->current_itr = rx->target_itr;
		q_vector->itr_countdown = ITR_COUNTDOWN_START;
	} else {
		/* Still have to re-enable the interrupts */
		itr_val = ice_buildreg_itr(ICE_ITR_NONE, 0);
		if (q_vector->itr_countdown)
			q_vector->itr_countdown--;
	}

	if (!test_bit(__ICE_DOWN, q_vector->vsi->state))
		wr32(&q_vector->vsi->back->hw,
		     GLINT_DYN_CTL(q_vector->reg_idx),
		     itr_val);
}

/**
 * ice_set_wb_on_itr - set WB_ON_ITR for this q_vector
 * @q_vector: q_vector to set WB_ON_ITR on
 *
 * We need to tell hardware to write-back completed descriptors even when
 * interrupts are disabled. Descriptors will be written back on cache line
 * boundaries without WB_ON_ITR enabled, but if we don't enable WB_ON_ITR
 * descriptors may not be written back if they don't fill a cache line until
 * the next interrupt.
 *
 * This sets the write-back frequency to 2 microseconds as that is the minimum
 * value that's not 0 due to ITR granularity. Also, set the INTENA_MSK bit to
 * make sure hardware knows we aren't meddling with the INTENA_M bit.
 */
static void ice_set_wb_on_itr(struct ice_q_vector *q_vector)
{
	struct ice_vsi *vsi = q_vector->vsi;

	/* already in WB_ON_ITR mode no need to change it */
	if (q_vector->itr_countdown == ICE_IN_WB_ON_ITR_MODE)
		return;

	if (q_vector->num_ring_rx)
		wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx),
		     ICE_GLINT_DYN_CTL_WB_ON_ITR(ICE_WB_ON_ITR_USECS,
						 ICE_RX_ITR));

	if (q_vector->num_ring_tx)
		wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx),
		     ICE_GLINT_DYN_CTL_WB_ON_ITR(ICE_WB_ON_ITR_USECS,
						 ICE_TX_ITR));

	q_vector->itr_countdown = ICE_IN_WB_ON_ITR_MODE;
}

/**
 * ice_napi_poll - NAPI polling Rx/Tx cleanup routine
 * @napi: napi struct with our devices info in it
 * @budget: amount of work driver is allowed to do this pass, in packets
 *
 * This function will clean all queues associated with a q_vector.
 *
 * Returns the amount of work done
 */
int ice_napi_poll(struct napi_struct *napi, int budget)
{
	struct ice_q_vector *q_vector =
				container_of(napi, struct ice_q_vector, napi);
	bool clean_complete = true;
	struct ice_ring *ring;
	int budget_per_ring;
	int work_done = 0;

	/* Since the actual Tx work is minimal, we can give the Tx a larger
	 * budget and be more aggressive about cleaning up the Tx descriptors.
	 */
	ice_for_each_ring(ring, q_vector->tx) {
		bool wd = ring->xsk_umem ?
			  ice_clean_tx_irq_zc(ring, budget) :
			  ice_clean_tx_irq(ring, budget);

		if (!wd)
			clean_complete = false;
	}

	/* Handle case where we are called by netpoll with a budget of 0 */
	if (unlikely(budget <= 0))
		return budget;

	/* normally we have 1 Rx ring per q_vector */
	if (unlikely(q_vector->num_ring_rx > 1))
		/* We attempt to distribute budget to each Rx queue fairly, but
		 * don't allow the budget to go below 1 because that would exit
		 * polling early.
		 */
		budget_per_ring = max(budget / q_vector->num_ring_rx, 1);
	else
		/* Max of 1 Rx ring in this q_vector so give it the budget */
		budget_per_ring = budget;

	ice_for_each_ring(ring, q_vector->rx) {
		int cleaned;

		/* A dedicated path for zero-copy allows making a single
		 * comparison in the irq context instead of many inside the
		 * ice_clean_rx_irq function and makes the codebase cleaner.
		 */
		cleaned = ring->xsk_umem ?
			  ice_clean_rx_irq_zc(ring, budget_per_ring) :
			  ice_clean_rx_irq(ring, budget_per_ring);
		work_done += cleaned;
		/* if we clean as many as budgeted, we must not be done */
		if (cleaned >= budget_per_ring)
			clean_complete = false;
	}

	/* If work not completed, return budget and polling will return */
	if (!clean_complete)
		return budget;

	/* Exit the polling mode, but don't re-enable interrupts if stack might
	 * poll us due to busy-polling
	 */
	if (likely(napi_complete_done(napi, work_done)))
		ice_update_ena_itr(q_vector);
	else
		ice_set_wb_on_itr(q_vector);

	return min_t(int, work_done, budget - 1);
}

/**
 * __ice_maybe_stop_tx - 2nd level check for Tx stop conditions
 * @tx_ring: the ring to be checked
 * @size: the size buffer we want to assure is available
 *
 * Returns -EBUSY if a stop is needed, else 0
 */
static int __ice_maybe_stop_tx(struct ice_ring *tx_ring, unsigned int size)
{
	netif_stop_subqueue(tx_ring->netdev, tx_ring->q_index);
	/* Memory barrier before checking head and tail */
	smp_mb();

	/* Check again in a case another CPU has just made room available. */
	if (likely(ICE_DESC_UNUSED(tx_ring) < size))
		return -EBUSY;

	/* A reprieve! - use start_subqueue because it doesn't call schedule */
	netif_start_subqueue(tx_ring->netdev, tx_ring->q_index);
	++tx_ring->tx_stats.restart_q;
	return 0;
}

/**
 * ice_maybe_stop_tx - 1st level check for Tx stop conditions
 * @tx_ring: the ring to be checked
 * @size: the size buffer we want to assure is available
 *
 * Returns 0 if stop is not needed
 */
static int ice_maybe_stop_tx(struct ice_ring *tx_ring, unsigned int size)
{
	if (likely(ICE_DESC_UNUSED(tx_ring) >= size))
		return 0;

	return __ice_maybe_stop_tx(tx_ring, size);
}

/**
 * ice_tx_map - Build the Tx descriptor
 * @tx_ring: ring to send buffer on
 * @first: first buffer info buffer to use
 * @off: pointer to struct that holds offload parameters
 *
 * This function loops over the skb data pointed to by *first
 * and gets a physical address for each memory location and programs
 * it and the length into the transmit descriptor.
 */
static void
ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first,
	   struct ice_tx_offload_params *off)
{
	u64 td_offset, td_tag, td_cmd;
	u16 i = tx_ring->next_to_use;
	unsigned int data_len, size;
	struct ice_tx_desc *tx_desc;
	struct ice_tx_buf *tx_buf;
	struct sk_buff *skb;
	skb_frag_t *frag;
	dma_addr_t dma;

	td_tag = off->td_l2tag1;
	td_cmd = off->td_cmd;
	td_offset = off->td_offset;
	skb = first->skb;

	data_len = skb->data_len;
	size = skb_headlen(skb);

	tx_desc = ICE_TX_DESC(tx_ring, i);

	if (first->tx_flags & ICE_TX_FLAGS_HW_VLAN) {
		td_cmd |= (u64)ICE_TX_DESC_CMD_IL2TAG1;
		td_tag = (first->tx_flags & ICE_TX_FLAGS_VLAN_M) >>
			  ICE_TX_FLAGS_VLAN_S;
	}

	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);

	tx_buf = first;

	for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
		unsigned int max_data = ICE_MAX_DATA_PER_TXD_ALIGNED;

		if (dma_mapping_error(tx_ring->dev, dma))
			goto dma_error;

		/* record length, and DMA address */
		dma_unmap_len_set(tx_buf, len, size);
		dma_unmap_addr_set(tx_buf, dma, dma);

		/* align size to end of page */
		max_data += -dma & (ICE_MAX_READ_REQ_SIZE - 1);
		tx_desc->buf_addr = cpu_to_le64(dma);

		/* account for data chunks larger than the hardware
		 * can handle
		 */
		while (unlikely(size > ICE_MAX_DATA_PER_TXD)) {
			tx_desc->cmd_type_offset_bsz =
				build_ctob(td_cmd, td_offset, max_data, td_tag);

			tx_desc++;
			i++;

			if (i == tx_ring->count) {
				tx_desc = ICE_TX_DESC(tx_ring, 0);
				i = 0;
			}

			dma += max_data;
			size -= max_data;

			max_data = ICE_MAX_DATA_PER_TXD_ALIGNED;
			tx_desc->buf_addr = cpu_to_le64(dma);
		}

		if (likely(!data_len))
			break;

		tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset,
							  size, td_tag);

		tx_desc++;
		i++;

		if (i == tx_ring->count) {
			tx_desc = ICE_TX_DESC(tx_ring, 0);
			i = 0;
		}

		size = skb_frag_size(frag);
		data_len -= size;

		dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
				       DMA_TO_DEVICE);

		tx_buf = &tx_ring->tx_buf[i];
	}

	/* record bytecount for BQL */
	netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount);

	/* record SW timestamp if HW timestamp is not available */
	skb_tx_timestamp(first->skb);

	i++;
	if (i == tx_ring->count)
		i = 0;

	/* write last descriptor with RS and EOP bits */
	td_cmd |= (u64)ICE_TXD_LAST_DESC_CMD;
	tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset, size,
						  td_tag);

	/* Force memory writes to complete before letting h/w know there
	 * are new descriptors to fetch.
	 *
	 * We also use this memory barrier to make certain all of the
	 * status bits have been updated before next_to_watch is written.
	 */
	wmb();

	/* set next_to_watch value indicating a packet is present */
	first->next_to_watch = tx_desc;

	tx_ring->next_to_use = i;

	ice_maybe_stop_tx(tx_ring, DESC_NEEDED);

	/* notify HW of packet */
	if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more())
		writel(i, tx_ring->tail);

	return;

dma_error:
	/* clear DMA mappings for failed tx_buf map */
	for (;;) {
		tx_buf = &tx_ring->tx_buf[i];
		ice_unmap_and_free_tx_buf(tx_ring, tx_buf);
		if (tx_buf == first)
			break;
		if (i == 0)
			i = tx_ring->count;
		i--;
	}

	tx_ring->next_to_use = i;
}

/**
 * ice_tx_csum - Enable Tx checksum offloads
 * @first: pointer to the first descriptor
 * @off: pointer to struct that holds offload parameters
 *
 * Returns 0 or error (negative) if checksum offload can't happen, 1 otherwise.
 */
static
int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off)
{
	u32 l4_len = 0, l3_len = 0, l2_len = 0;
	struct sk_buff *skb = first->skb;
	union {
		struct iphdr *v4;
		struct ipv6hdr *v6;
		unsigned char *hdr;
	} ip;
	union {
		struct tcphdr *tcp;
		unsigned char *hdr;
	} l4;
	__be16 frag_off, protocol;
	unsigned char *exthdr;
	u32 offset, cmd = 0;
	u8 l4_proto = 0;

	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;

	ip.hdr = skb_network_header(skb);
	l4.hdr = skb_transport_header(skb);

	/* compute outer L2 header size */
	l2_len = ip.hdr - skb->data;
	offset = (l2_len / 2) << ICE_TX_DESC_LEN_MACLEN_S;

	if (skb->encapsulation)
		return -1;

	/* Enable IP checksum offloads */
	protocol = vlan_get_protocol(skb);
	if (protocol == htons(ETH_P_IP)) {
		l4_proto = ip.v4->protocol;
		/* the stack computes the IP header already, the only time we
		 * need the hardware to recompute it is in the case of TSO.
		 */
		if (first->tx_flags & ICE_TX_FLAGS_TSO)
			cmd |= ICE_TX_DESC_CMD_IIPT_IPV4_CSUM;
		else
			cmd |= ICE_TX_DESC_CMD_IIPT_IPV4;

	} else if (protocol == htons(ETH_P_IPV6)) {
		cmd |= ICE_TX_DESC_CMD_IIPT_IPV6;
		exthdr = ip.hdr + sizeof(*ip.v6);
		l4_proto = ip.v6->nexthdr;
		if (l4.hdr != exthdr)
			ipv6_skip_exthdr(skb, exthdr - skb->data, &l4_proto,
					 &frag_off);
	} else {
		return -1;
	}

	/* compute inner L3 header size */
	l3_len = l4.hdr - ip.hdr;
	offset |= (l3_len / 4) << ICE_TX_DESC_LEN_IPLEN_S;

	/* Enable L4 checksum offloads */
	switch (l4_proto) {
	case IPPROTO_TCP:
		/* enable checksum offloads */
		cmd |= ICE_TX_DESC_CMD_L4T_EOFT_TCP;
		l4_len = l4.tcp->doff;
		offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S;
		break;
	case IPPROTO_UDP:
		/* enable UDP checksum offload */
		cmd |= ICE_TX_DESC_CMD_L4T_EOFT_UDP;
		l4_len = (sizeof(struct udphdr) >> 2);
		offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S;
		break;
	case IPPROTO_SCTP:
		/* enable SCTP checksum offload */
		cmd |= ICE_TX_DESC_CMD_L4T_EOFT_SCTP;
		l4_len = sizeof(struct sctphdr) >> 2;
		offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S;
		break;

	default:
		if (first->tx_flags & ICE_TX_FLAGS_TSO)
			return -1;
		skb_checksum_help(skb);
		return 0;
	}

	off->td_cmd |= cmd;
	off->td_offset |= offset;
	return 1;
}

/**
 * ice_tx_prepare_vlan_flags - prepare generic Tx VLAN tagging flags for HW
 * @tx_ring: ring to send buffer on
 * @first: pointer to struct ice_tx_buf
 *
 * Checks the skb and sets up correspondingly several generic transmit flags
 * related to VLAN tagging for the HW, such as VLAN, DCB, etc.
 *
 * Returns an error code to indicate the frame should be dropped upon error,
 * and otherwise returns 0 to indicate the flags have been set properly.
 */
static int
ice_tx_prepare_vlan_flags(struct ice_ring *tx_ring, struct ice_tx_buf *first)
{
	struct sk_buff *skb = first->skb;
	__be16 protocol = skb->protocol;

	if (protocol == htons(ETH_P_8021Q) &&
	    !(tx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) {
		/* when HW VLAN acceleration is turned off by the user the
		 * stack sets the protocol to 8021q so that the driver
		 * can take any steps required to support the SW only
		 * VLAN handling. In our case the driver doesn't need
		 * to take any further steps so just set the protocol
		 * to the encapsulated ethertype.
		 */
		skb->protocol = vlan_get_protocol(skb);
		return 0;
	}

	/* if we have a HW VLAN tag being added, default to the HW one */
	if (skb_vlan_tag_present(skb)) {
		first->tx_flags |= skb_vlan_tag_get(skb) << ICE_TX_FLAGS_VLAN_S;
		first->tx_flags |= ICE_TX_FLAGS_HW_VLAN;
	} else if (protocol == htons(ETH_P_8021Q)) {
		struct vlan_hdr *vhdr, _vhdr;

		/* for SW VLAN, check the next protocol and store the tag */
		vhdr = (struct vlan_hdr *)skb_header_pointer(skb, ETH_HLEN,
							     sizeof(_vhdr),
							     &_vhdr);
		if (!vhdr)
			return -EINVAL;

		first->tx_flags |= ntohs(vhdr->h_vlan_TCI) <<
				   ICE_TX_FLAGS_VLAN_S;
		first->tx_flags |= ICE_TX_FLAGS_SW_VLAN;
	}

	return ice_tx_prepare_vlan_flags_dcb(tx_ring, first);
}

/**
 * ice_tso - computes mss and TSO length to prepare for TSO
 * @first: pointer to struct ice_tx_buf
 * @off: pointer to struct that holds offload parameters
 *
 * Returns 0 or error (negative) if TSO can't happen, 1 otherwise.
 */
static
int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off)
{
	struct sk_buff *skb = first->skb;
	union {
		struct iphdr *v4;
		struct ipv6hdr *v6;
		unsigned char *hdr;
	} ip;
	union {
		struct tcphdr *tcp;
		struct udphdr *udp;
		unsigned char *hdr;
	} l4;
	u64 cd_mss, cd_tso_len;
	u32 paylen, l4_start;
	int err;

	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;

	if (!skb_is_gso(skb))
		return 0;

	err = skb_cow_head(skb, 0);
	if (err < 0)
		return err;

	/* cppcheck-suppress unreadVariable */
	ip.hdr = skb_network_header(skb);
	l4.hdr = skb_transport_header(skb);

	/* initialize outer IP header fields */
	if (ip.v4->version == 4) {
		ip.v4->tot_len = 0;
		ip.v4->check = 0;
	} else {
		ip.v6->payload_len = 0;
	}

	/* determine offset of transport header */
	l4_start = l4.hdr - skb->data;

	/* remove payload length from checksum */
	paylen = skb->len - l4_start;

	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
		csum_replace_by_diff(&l4.udp->check,
				     (__force __wsum)htonl(paylen));
		/* compute length of UDP segmentation header */
		off->header_len = sizeof(struct udphdr) + l4_start;
	} else {
		csum_replace_by_diff(&l4.tcp->check,
				     (__force __wsum)htonl(paylen));
		/* compute length of TCP segmentation header */
		off->header_len = (l4.tcp->doff * 4) + l4_start;
	}

	/* update gso_segs and bytecount */
	first->gso_segs = skb_shinfo(skb)->gso_segs;
	first->bytecount += (first->gso_segs - 1) * off->header_len;

	cd_tso_len = skb->len - off->header_len;
	cd_mss = skb_shinfo(skb)->gso_size;

	/* record cdesc_qw1 with TSO parameters */
	off->cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
			     (ICE_TX_CTX_DESC_TSO << ICE_TXD_CTX_QW1_CMD_S) |
			     (cd_tso_len << ICE_TXD_CTX_QW1_TSO_LEN_S) |
			     (cd_mss << ICE_TXD_CTX_QW1_MSS_S));
	first->tx_flags |= ICE_TX_FLAGS_TSO;
	return 1;
}

/**
 * ice_txd_use_count - estimate the number of descriptors needed for Tx
 * @size: transmit request size in bytes
 *
 * Due to hardware alignment restrictions (4K alignment), we need to
 * assume that we can have no more than 12K of data per descriptor, even
 * though each descriptor can take up to 16K - 1 bytes of aligned memory.
 * Thus, we need to divide by 12K. But division is slow! Instead,
 * we decompose the operation into shifts and one relatively cheap
 * multiply operation.
 *
 * To divide by 12K, we first divide by 4K, then divide by 3:
 *     To divide by 4K, shift right by 12 bits
 *     To divide by 3, multiply by 85, then divide by 256
 *     (Divide by 256 is done by shifting right by 8 bits)
 * Finally, we add one to round up. Because 256 isn't an exact multiple of
 * 3, we'll underestimate near each multiple of 12K. This is actually more
 * accurate as we have 4K - 1 of wiggle room that we can fit into the last
 * segment. For our purposes this is accurate out to 1M which is orders of
 * magnitude greater than our largest possible GSO size.
 *
 * This would then be implemented as:
 *     return (((size >> 12) * 85) >> 8) + ICE_DESCS_FOR_SKB_DATA_PTR;
 *
 * Since multiplication and division are commutative, we can reorder
 * operations into:
 *     return ((size * 85) >> 20) + ICE_DESCS_FOR_SKB_DATA_PTR;
 */
static unsigned int ice_txd_use_count(unsigned int size)
{
	return ((size * 85) >> 20) + ICE_DESCS_FOR_SKB_DATA_PTR;
}

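/* Worked example of the estimate above: a 16384-byte fragment gives
 * (16384 * 85) >> 20 = 1392640 >> 20 = 1, plus one for
 * ICE_DESCS_FOR_SKB_DATA_PTR, i.e. 2 descriptors; a 3000-byte linear head
 * works out to (3000 * 85) >> 20 = 0, plus one, a single descriptor.
 */
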
/**
 * ice_xmit_desc_count - calculate number of Tx descriptors needed
 * @skb: send buffer
 *
 * Returns number of data descriptors needed for this skb.
 */
static unsigned int ice_xmit_desc_count(struct sk_buff *skb)
{
	const skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
	unsigned int nr_frags = skb_shinfo(skb)->nr_frags;
	unsigned int count = 0, size = skb_headlen(skb);

	for (;;) {
		count += ice_txd_use_count(size);

		if (!nr_frags--)
			break;

		size = skb_frag_size(frag++);
	}

	return count;
}

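/* Worked example: an skb with a 3000-byte linear head and two 5000-byte
 * frags makes three passes through the loop above, each contributing one
 * descriptor via ice_txd_use_count(), for a total count of 3.
 */
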
/**
 * __ice_chk_linearize - Check if there are more than 8 buffers per packet
 * @skb: send buffer
 *
 * Note: This HW can't DMA more than 8 buffers to build a packet on the wire
 * and so we need to figure out the cases where we need to linearize the skb.
 *
 * For TSO we need to count the TSO header and segment payload separately.
 * As such we need to check cases where we have 7 fragments or more as we
 * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for
 * the segment payload in the first descriptor, and another 7 for the
 * fragments.
 */
static bool __ice_chk_linearize(struct sk_buff *skb)
{
	const skb_frag_t *frag, *stale;
	int nr_frags, sum;

	/* no need to check if number of frags is less than 7 */
	nr_frags = skb_shinfo(skb)->nr_frags;
	if (nr_frags < (ICE_MAX_BUF_TXD - 1))
		return false;

	/* We need to walk through the list and validate that each group
	 * of 6 fragments totals at least gso_size.
	 */
	nr_frags -= ICE_MAX_BUF_TXD - 2;
	frag = &skb_shinfo(skb)->frags[0];

	/* Initialize size to the negative value of gso_size minus 1. We
	 * use this as the worst case scenario in which the frag ahead
	 * of us only provides one byte which is why we are limited to 6
	 * descriptors for a single transmit as the header and previous
	 * fragment are already consuming 2 descriptors.
	 */
	sum = 1 - skb_shinfo(skb)->gso_size;

	/* Add size of frags 0 through 4 to create our initial sum */
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);
	sum += skb_frag_size(frag++);

	/* Walk through fragments adding latest fragment, testing it, and
	 * then removing stale fragments from the sum.
	 */
	stale = &skb_shinfo(skb)->frags[0];
	for (;;) {
		sum += skb_frag_size(frag++);

		/* if sum is negative we failed to make sufficient progress */
		if (sum < 0)
			return true;

		if (!nr_frags--)
			break;

		sum -= skb_frag_size(stale++);
	}

	return false;
}

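/* Worked example of the sliding window above: with gso_size = 2000 and
 * seven 1500-byte frags, sum starts at 1 - 2000 = -1999, the first five
 * frags bring it to 5501, and each later step adds one frag and removes a
 * stale one, so the sum stays positive and no linearization is needed. If
 * any six consecutive frags ever total less than gso_size, sum dips below
 * zero and the function returns true.
 */
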
/**
 * ice_chk_linearize - Check if there are more than 8 fragments per packet
 * @skb: send buffer
 * @count: number of buffers used
 *
 * Note: Our HW can't scatter-gather more than 8 fragments to build
 * a packet on the wire and so we need to figure out the cases where we
 * need to linearize the skb.
 */
static bool ice_chk_linearize(struct sk_buff *skb, unsigned int count)
{
	/* Both TSO and single send will work if count is less than 8 */
	if (likely(count < ICE_MAX_BUF_TXD))
		return false;

	if (skb_is_gso(skb))
		return __ice_chk_linearize(skb);

	/* we can support up to 8 data buffers for a single send */
	return count != ICE_MAX_BUF_TXD;
}

/**
 * ice_xmit_frame_ring - Sends buffer on Tx ring
 * @skb: send buffer
 * @tx_ring: ring to send buffer on
 *
 * Returns NETDEV_TX_OK if sent, else an error code
 */
static netdev_tx_t
ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring)
{
	struct ice_tx_offload_params offload = { 0 };
	struct ice_vsi *vsi = tx_ring->vsi;
	struct ice_tx_buf *first;
	unsigned int count;
	int tso, csum;

	count = ice_xmit_desc_count(skb);
	if (ice_chk_linearize(skb, count)) {
		if (__skb_linearize(skb))
			goto out_drop;
		count = ice_txd_use_count(skb->len);
		tx_ring->tx_stats.tx_linearize++;
	}

	/* need: 1 descriptor per page * PAGE_SIZE/ICE_MAX_DATA_PER_TXD,
	 *       + 1 desc for skb_head_len/ICE_MAX_DATA_PER_TXD,
	 *       + 4 desc gap to avoid the cache line where head is,
	 *       + 1 desc for context descriptor,
	 * otherwise try next time
	 */
	if (ice_maybe_stop_tx(tx_ring, count + ICE_DESCS_PER_CACHE_LINE +
			      ICE_DESCS_FOR_CTX_DESC)) {
		tx_ring->tx_stats.tx_busy++;
		return NETDEV_TX_BUSY;
	}

	offload.tx_ring = tx_ring;

	/* record the location of the first descriptor for this packet */
	first = &tx_ring->tx_buf[tx_ring->next_to_use];
	first->skb = skb;
	first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
	first->gso_segs = 1;
	first->tx_flags = 0;

	/* prepare the VLAN tagging flags for Tx */
	if (ice_tx_prepare_vlan_flags(tx_ring, first))
		goto out_drop;

	/* set up TSO offload */
	tso = ice_tso(first, &offload);
	if (tso < 0)
		goto out_drop;

	/* always set up Tx checksum offload */
	csum = ice_tx_csum(first, &offload);
	if (csum < 0)
		goto out_drop;

	/* allow CONTROL frames egress from main VSI if FW LLDP disabled */
	if (unlikely(skb->priority == TC_PRIO_CONTROL &&
		     vsi->type == ICE_VSI_PF &&
		     vsi->port_info->is_sw_lldp))
		offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
					ICE_TX_CTX_DESC_SWTCH_UPLINK <<
					ICE_TXD_CTX_QW1_CMD_S);

	if (offload.cd_qw1 & ICE_TX_DESC_DTYPE_CTX) {
		struct ice_tx_ctx_desc *cdesc;
		int i = tx_ring->next_to_use;

		/* grab the next descriptor */
		cdesc = ICE_TX_CTX_DESC(tx_ring, i);
		i++;
		tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;

		/* setup context descriptor */
		cdesc->tunneling_params = cpu_to_le32(offload.cd_tunnel_params);
		cdesc->l2tag2 = cpu_to_le16(offload.cd_l2tag2);
		cdesc->rsvd = cpu_to_le16(0);
		cdesc->qw1 = cpu_to_le64(offload.cd_qw1);
	}

	ice_tx_map(tx_ring, first, &offload);
	return NETDEV_TX_OK;

out_drop:
	dev_kfree_skb_any(skb);
	return NETDEV_TX_OK;
}

/**
 * ice_start_xmit - Selects the correct VSI and Tx queue to send buffer
 * @skb: send buffer
 * @netdev: network interface device structure
 *
 * Returns NETDEV_TX_OK if sent, else an error code
 */
netdev_tx_t ice_start_xmit(struct sk_buff *skb, struct net_device *netdev)
{
	struct ice_netdev_priv *np = netdev_priv(netdev);
	struct ice_vsi *vsi = np->vsi;
	struct ice_ring *tx_ring;

	tx_ring = vsi->tx_rings[skb->queue_mapping];

	/* hardware can't handle really short frames, hardware padding works
	 * beyond this point
	 */
	if (skb_put_padto(skb, ICE_MIN_TX_LEN))
		return NETDEV_TX_OK;

	return ice_xmit_frame_ring(skb, tx_ring);
}