/*
 * This file is open source software, licensed to you under the terms
 * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
 * distributed with this work for additional information regarding copyright
 * ownership.  You may not use this file except in compliance with the License.
 *
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/*
 * Copyright (C) 2014 Cloudius Systems, Ltd.
 */
#ifdef SEASTAR_HAVE_DPDK

#include <seastar/core/posix.hh>
#include "core/vla.hh"
#include <seastar/core/reactor.hh>
#include <seastar/net/virtio-interface.hh>
#include <seastar/core/stream.hh>
#include <seastar/core/circular_buffer.hh>
#include <seastar/core/align.hh>
#include <seastar/core/sstring.hh>
#include <seastar/core/memory.hh>
#include <seastar/core/metrics.hh>
#include <seastar/util/function_input_iterator.hh>
#include <seastar/util/transform_iterator.hh>
#include <seastar/util/std-compat.hh>
#include <boost/preprocessor.hpp>
#include <seastar/net/ip.hh>
#include <seastar/net/const.hh>
#include <seastar/core/dpdk_rte.hh>
#include <seastar/net/dpdk.hh>
#include <seastar/net/toeplitz.hh>
#include <seastar/net/native-stack.hh>

#include <rte_config.h>
#include <rte_common.h>
#include <rte_ethdev.h>
#include <rte_cycles.h>
#include <rte_memzone.h>
#if RTE_VERSION <= RTE_VERSION_NUM(2,0,0,16)

static inline char*
rte_mbuf_to_baddr(rte_mbuf* mbuf) {
    return reinterpret_cast<char*>(RTE_MBUF_TO_BADDR(mbuf));
}

void* as_cookie(struct rte_pktmbuf_pool_private& p) {
    return reinterpret_cast<void*>(uint64_t(p.mbuf_data_room_size));
}

#else

void* as_cookie(struct rte_pktmbuf_pool_private& p) {
    return &p;
}

#endif
typedef void *MARKER[0];   /**< generic marker for a point in a structure */
// Calculate maximum amount of memory required to store given number of objects
static size_t
get_mempool_xmem_size(uint32_t elt_num, size_t total_elt_sz, uint32_t pg_shift)
{
    size_t obj_per_page, pg_num, pg_sz;

    if (total_elt_sz == 0) {
        return 0;
    }

    if (pg_shift == 0) {
        return total_elt_sz * elt_num;
    }

    pg_sz = (size_t)1 << pg_shift;
    obj_per_page = pg_sz / total_elt_sz;
    if (obj_per_page == 0) {
        return RTE_ALIGN_CEIL(total_elt_sz, pg_sz) * elt_num;
    }

    pg_num = (elt_num + obj_per_page - 1) / obj_per_page;
    return pg_num << pg_shift;
}
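
// Example (illustrative numbers): with total_elt_sz == 2048 and pg_shift == 12
// (4K pages) we get obj_per_page == 2, so 512 objects need 256 pages and the
// function returns 256 << 12 == 1MB. The sizing assumes objects never straddle
// a page boundary, which is what makes a per-page virt-to-phys translation of
// the pool possible.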
using namespace seastar::net;
/******************* Net device related constants ****************************/
static constexpr uint16_t default_ring_size      = 512;
//
// We need 2 times the ring size of buffers because of the way PMDs
// refill the ring.
//
static constexpr uint16_t mbufs_per_queue_rx     = 2 * default_ring_size;
static constexpr uint16_t rx_gc_thresh           = 64;

//
// No need to keep more descriptors in the air than can be sent in a single
// rte_eth_tx_burst() call.
//
static constexpr uint16_t mbufs_per_queue_tx     = 2 * default_ring_size;

static constexpr uint16_t mbuf_cache_size        = 512;
static constexpr uint16_t mbuf_overhead          =
                                 sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM;
//
// We'll allocate 2K data buffers for an inline case because this would require
// a single page per mbuf. If we used 4K data buffers here it would require 2
// pages for a single buffer (due to "mbuf_overhead") and this is a much more
// demanding memory constraint.
//
static constexpr size_t inline_mbuf_data_size    = 2048;

//
// Size of the data buffer in the non-inline case.
//
// We may want to change (increase) this value in future, while the
// inline_mbuf_data_size value will unlikely change due to reasons described
// above.
//
static constexpr size_t mbuf_data_size           = 2048;

// (INLINE_MBUF_DATA_SIZE(2K)*32 = 64K = Max TSO/LRO size) + 1 mbuf for headers
static constexpr uint8_t max_frags               = 32 + 1;

//
// Intel's 40G NIC HW limit for a number of fragments in an xmit segment.
//
// See Chapter 8.4.1 "Transmit Packet in System Memory" of the xl710 devices
// spec. for more details.
//
static constexpr uint8_t i40e_max_xmit_segment_frags = 8;

//
// VMWare's virtual NIC limit for a number of fragments in an xmit segment.
//
// see drivers/net/vmxnet3/base/vmxnet3_defs.h VMXNET3_MAX_TXD_PER_PKT
//
static constexpr uint8_t vmxnet3_max_xmit_segment_frags = 16;

static constexpr uint16_t inline_mbuf_size       =
                                 inline_mbuf_data_size + mbuf_overhead;
uint32_t qp_mempool_obj_size(bool hugetlbfs_membackend)
{
    uint32_t mp_size = 0;
    struct rte_mempool_objsz mp_obj_sz = {};

    //
    // We will align each size to huge page size because DPDK allocates
    // physically contiguous memory region for each pool object.
    //

    // Rx
    if (hugetlbfs_membackend) {
        mp_size +=
            align_up(rte_mempool_calc_obj_size(mbuf_overhead, 0, &mp_obj_sz) +
                     sizeof(struct rte_pktmbuf_pool_private),
                     memory::huge_page_size);
    } else {
        mp_size +=
            align_up(rte_mempool_calc_obj_size(inline_mbuf_size, 0, &mp_obj_sz) +
                     sizeof(struct rte_pktmbuf_pool_private),
                     memory::huge_page_size);
    }

    // Tx
    std::memset(&mp_obj_sz, 0, sizeof(mp_obj_sz));
    mp_size += align_up(rte_mempool_calc_obj_size(inline_mbuf_size, 0,
                                                  &mp_obj_sz) +
                        sizeof(struct rte_pktmbuf_pool_private),
                        memory::huge_page_size);
    return mp_size;
}
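
// Note on the two element sizes above: with a hugetlbfs memory backend the Rx
// pool objects carry only the mbuf metadata (mbuf_overhead) because the data
// buffers come from seastar-allocated memory (see init_rx_mbuf_pool() below),
// while without it every pool object also embeds the inline 2K data buffer.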
static constexpr const char* pktmbuf_pool_name   = "dpdk_pktmbuf_pool";

/*
 * When doing reads from the NIC queues, use this batch size
 */
static constexpr uint8_t packet_read_size        = 32;
/******************************************************************************/
struct port_stats {
    port_stats() : rx{}, tx{} {}

    struct {
        struct {
            uint64_t mcast;        // number of received multicast packets
            uint64_t pause_xon;    // number of received PAUSE XON frames
            uint64_t pause_xoff;   // number of received PAUSE XOFF frames
        } good;

        struct {
            uint64_t dropped;      // missed packets (e.g. full FIFO)
            uint64_t crc;          // packets with CRC error
            uint64_t len;          // packets with a bad length
            uint64_t total;        // total number of erroneous received packets
        } bad;
    } rx;

    struct {
        struct {
            uint64_t pause_xon;    // number of sent PAUSE XON frames
            uint64_t pause_xoff;   // number of sent PAUSE XOFF frames
        } good;

        struct {
            uint64_t total;        // total number of failed transmitted packets
        } bad;
    } tx;
};
#define XSTATS_ID_LIST \
    (rx_multicast_packets) \
    (rx_xon_packets) \
    (rx_xoff_packets) \
    (tx_xon_packets) \
    (tx_xoff_packets) \
    (rx_crc_errors) \
    (rx_length_errors) \
    (rx_undersize_errors) \
    (rx_oversize_errors)
class dpdk_xstats {
public:
    dpdk_xstats(uint16_t port_id)
        : _port_id(port_id)
    {
    }

    enum class xstat_id {
        BOOST_PP_SEQ_ENUM(XSTATS_ID_LIST)
    };

    void start() {
        _len = rte_eth_xstats_get_names(_port_id, NULL, 0);
        _xstats = new rte_eth_xstat[_len];
        _xstat_names = new rte_eth_xstat_name[_len];
        update_xstats();
        update_xstat_names();
        update_offsets();
    }

    void update_xstats() {
        auto len = rte_eth_xstats_get(_port_id, _xstats, _len);
        assert(len == _len);
    }

    uint64_t get_value(const xstat_id id) {
        auto off = _offsets[static_cast<int>(id)];
        if (off == -1) {
            return 0;
        }
        return _xstats[off].value;
    }

private:
    uint16_t _port_id;
    int _len = 0;
    struct rte_eth_xstat *_xstats = nullptr;
    struct rte_eth_xstat_name *_xstat_names = nullptr;
    int _offsets[BOOST_PP_SEQ_SIZE(XSTATS_ID_LIST)];

    static const sstring id_to_str(const xstat_id id) {
#define ENUM_TO_STR(r, data, elem) \
        if (id == xstat_id::elem)  \
            return BOOST_PP_STRINGIZE(elem);
        BOOST_PP_SEQ_FOR_EACH(ENUM_TO_STR, ~, XSTATS_ID_LIST)
        return "";
    }

    int get_offset_by_name(const xstat_id id, const int len) {
        for (int i = 0; i < len; i++) {
            if (id_to_str(id) == _xstat_names[i].name)
                return i;
        }
        return -1;
    }

    void update_xstat_names() {
        auto len = rte_eth_xstats_get_names(_port_id, _xstat_names, _len);
        assert(len == _len);
    }

    void update_offsets() {
#define FIND_OFFSET(r, data, elem)                   \
        _offsets[static_cast<int>(xstat_id::elem)] = \
            get_offset_by_name(xstat_id::elem, _len);
        BOOST_PP_SEQ_FOR_EACH(FIND_OFFSET, ~, XSTATS_ID_LIST)
    }
};
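
// Note: rte_eth_xstats_get() returns counters in a driver-specific order, so
// dpdk_xstats resolves the offsets of the counters it cares about once by
// name (update_offsets()) and then uses them for cheap O(1) lookups in
// get_value().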
class dpdk_device : public device {
    uint16_t _port_idx;
    uint16_t _num_queues;
    net::hw_features _hw_features;
    uint16_t _queues_ready = 0;
    unsigned _home_cpu;
    bool _use_lro;
    bool _enable_fc;
    std::vector<uint8_t> _redir_table;
    rss_key_type _rss_key;
    port_stats _stats;
    timer<> _stats_collector;
    const std::string _stats_plugin_name;
    const std::string _stats_plugin_inst;
    seastar::metrics::metric_groups _metrics;
    bool _is_i40e_device = false;
    bool _is_vmxnet3_device = false;
    dpdk_xstats _xstats;
    uint8_t _rss_table_bits = 0;

public:
    rte_eth_dev_info _dev_info = {};
    promise<> _link_ready_promise;
private:
    /**
     * Port initialization consists of 3 main stages:
     * 1) General port initialization which ends with a call to
     *    rte_eth_dev_configure() where we request the needed number of Rx and
     *    Tx queues.
     * 2) Individual queues initialization. This is done in the constructor of
     *    dpdk_qp class. In particular the memory pools for queues are allocated
     *    in this stage.
     * 3) The final stage of the initialization which starts with the call of
     *    rte_eth_dev_start() after which the port becomes fully functional. We
     *    will also wait for a link to get up in this stage.
     */

    /**
     * First stage of the port initialization.
     *
     * @return 0 in case of success and an appropriate error code in case of an
     *         error.
     */
    int init_port_start();

    /**
     * The final stage of a port initialization.
     * @note Must be called *after* all queues from stage (2) have been
     *       initialized.
     */
    void init_port_fini();

    /**
     * Check the link status of our port in up to 9s, and print it finally.
     */
    void check_port_link_status();

    /**
     * Configures the HW Flow Control
     */
    void set_hw_flow_control();
public:
    dpdk_device(uint16_t port_idx, uint16_t num_queues, bool use_lro,
                bool enable_fc)
        : _port_idx(port_idx)
        , _num_queues(num_queues)
        , _home_cpu(this_shard_id())
        , _use_lro(use_lro)
        , _enable_fc(enable_fc)
        , _stats_plugin_name("network")
        , _stats_plugin_inst(std::string("port") + std::to_string(_port_idx))
        , _xstats(port_idx)
    {
        /* now initialise the port we will use */
        int ret = init_port_start();
        if (ret != 0) {
            rte_exit(EXIT_FAILURE, "Cannot initialise port %u\n", _port_idx);
        }
        // Register port statistics pollers
        namespace sm = seastar::metrics;
        _metrics.add_group(_stats_plugin_name, {
            sm::make_derive("rx_multicast", _stats.rx.good.mcast,
                sm::description("Counts a number of received multicast packets."), {sm::shard_label(_stats_plugin_inst)}),

            sm::make_derive("rx_crc_errors", _stats.rx.bad.crc,
                sm::description("Counts a number of received packets with a bad CRC value. "
                    "A non-zero value of this metric usually indicates a HW problem, e.g. a bad cable."), {sm::shard_label(_stats_plugin_inst)}),

            sm::make_derive("rx_dropped", _stats.rx.bad.dropped,
                sm::description("Counts a number of dropped received packets. "
                    "A non-zero value of this counter indicates an overflow of ingress HW buffers. "
                    "This usually happens because the rate of the sender on the other side of the link is higher than we can process as a receiver."), {sm::shard_label(_stats_plugin_inst)}),

            sm::make_derive("rx_bad_length_errors", _stats.rx.bad.len,
                sm::description("Counts a number of received packets with a bad length value. "
                    "A non-zero value of this metric usually indicates a HW issue: e.g. bad cable."), {sm::shard_label(_stats_plugin_inst)}),

            sm::make_derive("rx_pause_xon", _stats.rx.good.pause_xon,
                sm::description("Counts a number of received PAUSE XON frames (PAUSE frame with a quanta of zero). "
                    "When a PAUSE XON frame is received our port may resume sending L2 frames. "
                    "PAUSE XON frames are sent to resume sending that was previously paused with a PAUSE XOFF frame. If the ingress "
                    "buffer falls below the low watermark threshold before the timeout configured in the original PAUSE XOFF frame the receiver may decide to send a PAUSE XON frame. "
                    "A non-zero value of this metric may mean that our sender is bursty and that the spikes overwhelm the receiver on the other side of the link."), {sm::shard_label(_stats_plugin_inst)}),

            sm::make_derive("tx_pause_xon", _stats.tx.good.pause_xon,
                sm::description("Counts a number of sent PAUSE XON frames (L2 flow control frames). "
                    "A non-zero value of this metric indicates that our ingress path doesn't keep up with the rate of a sender on the other side of the link. "
                    "Note that if a sender port respects PAUSE frames this will prevent it from sending from ALL its egress queues because L2 flow control is defined "
                    "on a per-link resolution."), {sm::shard_label(_stats_plugin_inst)}),

            sm::make_derive("rx_pause_xoff", _stats.rx.good.pause_xoff,
                sm::description("Counts a number of received PAUSE XOFF frames. "
                    "A non-zero value of this metric indicates that our egress overwhelms the receiver on the other side of the link and it has to send PAUSE frames to make us stop sending. "
                    "Note that if our port respects PAUSE frames a reception of a PAUSE XOFF frame will cause ALL egress queues of this port to stop sending."), {sm::shard_label(_stats_plugin_inst)}),

            sm::make_derive("tx_pause_xoff", _stats.tx.good.pause_xoff,
                sm::description("Counts a number of sent PAUSE XOFF frames. "
                    "A non-zero value of this metric indicates that our ingress path (SW and HW) doesn't keep up with the rate of a sender on the other side of the link and as a result "
                    "our ingress HW buffers overflow."), {sm::shard_label(_stats_plugin_inst)}),

            sm::make_derive("rx_errors", _stats.rx.bad.total,
                sm::description("Counts the total number of ingress errors: CRC errors, bad length errors, etc."), {sm::shard_label(_stats_plugin_inst)}),

            sm::make_derive("tx_errors", _stats.tx.bad.total,
                sm::description("Counts a total number of egress errors. A non-zero value usually indicates a problem with the HW or the SW driver."), {sm::shard_label(_stats_plugin_inst)}),
        });
    }
    ~dpdk_device() {
        _stats_collector.cancel();
    }

    ethernet_address hw_address() override {
        struct ether_addr mac;
        rte_eth_macaddr_get(_port_idx, &mac);

        return mac.addr_bytes;
    }
hw_features() override
{
481 net::hw_features
& hw_features_ref() { return _hw_features
; }
483 const rte_eth_rxconf
* def_rx_conf() const {
484 return &_dev_info
.default_rxconf
;
487 const rte_eth_txconf
* def_tx_conf() const {
488 return &_dev_info
.default_txconf
;
492 * Set the RSS table in the device and store it in the internal vector.
494 void set_rss_table();
496 virtual uint16_t hw_queues_count() override
{ return _num_queues
; }
497 virtual future
<> link_ready() override
{ return _link_ready_promise
.get_future(); }
498 virtual std::unique_ptr
<qp
> init_local_queue(const program_options::option_group
& opts
, uint16_t qid
) override
;
499 virtual unsigned hash2qid(uint32_t hash
) override
{
500 assert(_redir_table
.size());
501 return _redir_table
[hash
& (_redir_table
.size() - 1)];
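    // Note: the RETA size is asserted to be a power of two in
    // init_port_start(), so with a 128-entry table hash2qid() above reduces
    // to _redir_table[hash & 127] - a cheap mask instead of a modulo.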
    uint16_t port_idx() { return _port_idx; }
    bool is_i40e_device() const {
        return _is_i40e_device;
    }
    bool is_vmxnet3_device() const {
        return _is_vmxnet3_device;
    }
    virtual rss_key_type rss_key() const override { return _rss_key; }
};
template <bool HugetlbfsMemBackend>
class dpdk_qp : public net::qp {
    class tx_buf_factory;

    class tx_buf {
    friend class dpdk_qp;
    public:
        static tx_buf* me(rte_mbuf* mbuf) {
            return reinterpret_cast<tx_buf*>(mbuf);
        }
    private:
        /**
         * Checks if the original packet of a given cluster should be linearized
         * due to HW limitations.
         *
         * @param head head of a cluster to check
         *
         * @return TRUE if a packet should be linearized.
         */
        static bool i40e_should_linearize(rte_mbuf *head) {
            bool is_tso = head->ol_flags & PKT_TX_TCP_SEG;

            // For a non-TSO case: number of fragments should not exceed 8
            if (!is_tso) {
                return head->nb_segs > i40e_max_xmit_segment_frags;
            }

            //
            // For a TSO case each MSS window should not include more than 8
            // fragments including headers.
            //

            // Calculate the number of frags containing headers.
            //
            // Note: we support neither VLAN nor tunneling thus headers size
            // accounting is super simple.
            //
            size_t headers_size = head->l2_len + head->l3_len + head->l4_len;
            unsigned hdr_frags = 0;
            size_t cur_payload_len = 0;
            rte_mbuf *cur_seg = head;

            while (cur_seg && cur_payload_len < headers_size) {
                cur_payload_len += cur_seg->data_len;
                cur_seg = cur_seg->next;
                hdr_frags++;
            }

            //
            // Header fragments will be used for each TSO segment, thus the
            // maximum number of data segments will be 8 minus the number of
            // header fragments.
            //
            // It's unclear from the spec how the first TSO segment is treated
            // if the last fragment with headers contains some data bytes:
            // whether this fragment will be accounted as a single fragment or
            // as two separate fragments. We prefer to play it safe and assume
            // that this fragment will be accounted as two separate fragments.
            //
            size_t max_win_size = i40e_max_xmit_segment_frags - hdr_frags;

            if (head->nb_segs <= max_win_size) {
                return false;
            }

            // Get the data (without headers) part of the first data fragment
            size_t prev_frag_data = cur_payload_len - headers_size;
            auto mss = head->tso_segsz;

            while (cur_seg) {
                unsigned frags_in_seg = 0;
                size_t cur_seg_size = 0;

                if (prev_frag_data) {
                    cur_seg_size = prev_frag_data;
                    frags_in_seg++;
                    prev_frag_data = 0;
                }

                while (cur_seg_size < mss && cur_seg) {
                    cur_seg_size += cur_seg->data_len;
                    cur_seg = cur_seg->next;
                    frags_in_seg++;
                }

                if (frags_in_seg > max_win_size) {
                    return true;
                }

                if (cur_seg_size > mss) {
                    prev_frag_data = cur_seg_size - mss;
                }
            }

            return false;
        }
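
        //
        // Worked example for i40e_should_linearize() (illustrative numbers):
        // a TSO packet with an MSS of 1448 bytes whose headers span 2
        // fragments leaves a window of 8 - 2 = 6 data fragments per MSS. If
        // any MSS worth of payload is spread over more than 6 fragments the
        // packet has to be linearized.
        //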
        /**
         * Sets the offload info in the head buffer of an rte_mbufs cluster.
         *
         * @param p an original packet the cluster is built for
         * @param qp QP handle
         * @param head a head of an rte_mbufs cluster
         */
        static void set_cluster_offload_info(const packet& p, const dpdk_qp& qp, rte_mbuf* head) {
            // Handle TCP checksum offload
            auto oi = p.offload_info();
            if (oi.needs_ip_csum) {
                head->ol_flags |= PKT_TX_IP_CKSUM;
                // TODO: Take a VLAN header into an account here
                head->l2_len = sizeof(struct ether_hdr);
                head->l3_len = oi.ip_hdr_len;
            }
            if (qp.port().hw_features().tx_csum_l4_offload) {
                if (oi.protocol == ip_protocol_num::tcp) {
                    head->ol_flags |= PKT_TX_TCP_CKSUM;
                    // TODO: Take a VLAN header into an account here
                    head->l2_len = sizeof(struct ether_hdr);
                    head->l3_len = oi.ip_hdr_len;

                    if (oi.tso_seg_size) {
                        assert(oi.needs_ip_csum);
                        head->ol_flags |= PKT_TX_TCP_SEG;
                        head->l4_len = oi.tcp_hdr_len;
                        head->tso_segsz = oi.tso_seg_size;
                    }
                } else if (oi.protocol == ip_protocol_num::udp) {
                    head->ol_flags |= PKT_TX_UDP_CKSUM;
                    // TODO: Take a VLAN header into an account here
                    head->l2_len = sizeof(struct ether_hdr);
                    head->l3_len = oi.ip_hdr_len;
                }
            }
        }
        /**
         * Creates a tx_buf cluster representing a given packet in a "zero-copy"
         * way.
         *
         * @param p packet to translate
         * @param qp dpdk_qp handle
         *
         * @return the HEAD tx_buf of the cluster or nullptr in case of a
         *         failure
         */
        static tx_buf* from_packet_zc(packet&& p, dpdk_qp& qp) {
            // Too fragmented - linearize
            if (p.nr_frags() > max_frags) {
                p.linearize();
                ++qp._stats.tx.linearized;
            }

build_mbuf_cluster:
            rte_mbuf *head = nullptr, *last_seg = nullptr;
            unsigned nsegs = 0;

            // Create a HEAD of the fragmented packet
            if (!translate_one_frag(qp, p.frag(0), head, last_seg, nsegs)) {
                return nullptr;
            }

            unsigned total_nsegs = nsegs;

            for (unsigned i = 1; i < p.nr_frags(); i++) {
                rte_mbuf *h = nullptr, *new_last_seg = nullptr;
                if (!translate_one_frag(qp, p.frag(i), h, new_last_seg, nsegs)) {
                    me(head)->recycle();
                    return nullptr;
                }

                total_nsegs += nsegs;

                // Attach a new buffers' chain to the packet chain
                last_seg->next = h;
                last_seg = new_last_seg;
            }

            // Update the HEAD buffer with the packet info
            head->pkt_len = p.len();
            head->nb_segs = total_nsegs;

            set_cluster_offload_info(p, qp, head);

            //
            // If a packet hasn't been linearized already and the resulting
            // cluster requires the linearisation due to HW limitation:
            //
            //    - Recycle the cluster.
            //    - Linearize the packet.
            //    - Build the cluster once again
            //
            if (head->nb_segs > max_frags ||
                (p.nr_frags() > 1 && qp.port().is_i40e_device() && i40e_should_linearize(head)) ||
                (p.nr_frags() > vmxnet3_max_xmit_segment_frags && qp.port().is_vmxnet3_device())) {
                me(head)->recycle();
                p.linearize();
                ++qp._stats.tx.linearized;

                goto build_mbuf_cluster;
            }

            me(last_seg)->set_packet(std::move(p));

            return me(head);
        }
        /**
         * Copy the contents of the "packet" into the given cluster of
         * rte_mbuf's.
         *
         * @note Size of the cluster has to be big enough to accommodate all the
         *       contents of the given packet.
         *
         * @param p packet to copy
         * @param head head of the rte_mbuf's cluster
         */
        static void copy_packet_to_cluster(const packet& p, rte_mbuf* head) {
            rte_mbuf* cur_seg = head;
            size_t cur_seg_offset = 0;
            unsigned cur_frag_idx = 0;
            size_t cur_frag_offset = 0;

            while (true) {
                size_t to_copy = std::min(p.frag(cur_frag_idx).size - cur_frag_offset,
                                          inline_mbuf_data_size - cur_seg_offset);

                memcpy(rte_pktmbuf_mtod_offset(cur_seg, void*, cur_seg_offset),
                       p.frag(cur_frag_idx).base + cur_frag_offset, to_copy);

                cur_frag_offset += to_copy;
                cur_seg_offset += to_copy;

                if (cur_frag_offset >= p.frag(cur_frag_idx).size) {
                    cur_frag_idx++;
                    if (cur_frag_idx >= p.nr_frags()) {
                        //
                        // We are done - set the data size of the last segment
                        // in the cluster.
                        //
                        cur_seg->data_len = cur_seg_offset;
                        break;
                    }

                    cur_frag_offset = 0;
                }

                if (cur_seg_offset >= inline_mbuf_data_size) {
                    cur_seg->data_len = inline_mbuf_data_size;
                    cur_seg = cur_seg->next;
                    cur_seg_offset = 0;

                    // FIXME: assert in a fast-path - remove!!!
                    assert(cur_seg);
                }
            }
        }
        /**
         * Creates a tx_buf cluster representing a given packet in a "copy" way.
         *
         * @param p packet to translate
         * @param qp dpdk_qp handle
         *
         * @return the HEAD tx_buf of the cluster or nullptr in case of a
         *         failure
         */
        static tx_buf* from_packet_copy(packet&& p, dpdk_qp& qp) {
            /*
             * Here we are going to use the fact that the inline data size is a
             * power of two.
             *
             * We will first try to allocate the cluster and only if we are
             * successful - we will go and copy the data.
             */
            auto aligned_len = align_up((size_t)p.len(), inline_mbuf_data_size);
            unsigned nsegs = aligned_len / inline_mbuf_data_size;
            rte_mbuf *head = nullptr, *last_seg = nullptr;

            tx_buf* buf = qp.get_tx_buf();
            if (!buf) {
                return nullptr;
            }

            head = buf->rte_mbuf_p();
            last_seg = head;
            for (unsigned i = 1; i < nsegs; i++) {
                buf = qp.get_tx_buf();
                if (!buf) {
                    me(head)->recycle();
                    return nullptr;
                }

                last_seg->next = buf->rte_mbuf_p();
                last_seg = last_seg->next;
            }

            //
            // If we've got here means that we have succeeded already!
            // We only need to copy the data and set the head buffer with the
            // relevant info.
            //
            head->pkt_len = p.len();
            head->nb_segs = nsegs;

            copy_packet_to_cluster(p, head);
            set_cluster_offload_info(p, qp, head);

            return me(head);
        }
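
        //
        // Sizing note for from_packet_copy() above: a 3000B packet with
        // inline_mbuf_data_size == 2048 gives aligned_len == 4096 and
        // nsegs == 2. Because the inline data size is a power of two (enforced
        // by a static_assert in the dpdk_qp constructor) align_up() is a
        // simple mask operation.
        //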
        /**
         * Zero-copy handling of a single net::fragment.
         *
         * @param do_one_buf Functor responsible for a single rte_mbuf
         *                   handling
         * @param qp dpdk_qp handle (in)
         * @param frag Fragment to copy (in)
         * @param head Head of the cluster (out)
         * @param last_seg Last segment of the cluster (out)
         * @param nsegs Number of segments in the cluster (out)
         *
         * @return TRUE in case of success
         */
        template <class DoOneBufFunc>
        static bool do_one_frag(DoOneBufFunc do_one_buf, dpdk_qp& qp,
                                fragment& frag, rte_mbuf*& head,
                                rte_mbuf*& last_seg, unsigned& nsegs) {
            size_t len, left_to_set = frag.size;
            char* base = frag.base;

            rte_mbuf* m;

            // TODO: assert() in a fast path! Remove me ASAP!
            assert(frag.size);

            // Create a HEAD of mbufs' cluster and set the first bytes into it
            len = do_one_buf(qp, head, base, left_to_set);
            if (!len) {
                return false;
            }

            left_to_set -= len;
            base += len;
            nsegs = 1;

            //
            // Set the rest of the data into the new mbufs and chain them to
            // the cluster.
            //
            rte_mbuf* prev_seg = head;
            while (left_to_set) {
                len = do_one_buf(qp, m, base, left_to_set);
                if (!len) {
                    me(head)->recycle();
                    return false;
                }

                left_to_set -= len;
                base += len;
                nsegs++;

                prev_seg->next = m;
                prev_seg = m;
            }

            // Return the last mbuf in the cluster
            last_seg = prev_seg;

            return true;
        }
        /**
         * Zero-copy handling of a single net::fragment.
         *
         * @param qp dpdk_qp handle (in)
         * @param frag Fragment to copy (in)
         * @param head Head of the cluster (out)
         * @param last_seg Last segment of the cluster (out)
         * @param nsegs Number of segments in the cluster (out)
         *
         * @return TRUE in case of success
         */
        static bool translate_one_frag(dpdk_qp& qp, fragment& frag,
                                       rte_mbuf*& head, rte_mbuf*& last_seg,
                                       unsigned& nsegs) {
            return do_one_frag(set_one_data_buf, qp, frag, head,
                               last_seg, nsegs);
        }

        /**
         * Copies one net::fragment into the cluster of rte_mbuf's.
         *
         * @param qp dpdk_qp handle (in)
         * @param frag Fragment to copy (in)
         * @param head Head of the cluster (out)
         * @param last_seg Last segment of the cluster (out)
         * @param nsegs Number of segments in the cluster (out)
         *
         * We return the "last_seg" to avoid traversing the cluster in order to
         * get it.
         *
         * @return TRUE in case of success
         */
        static bool copy_one_frag(dpdk_qp& qp, fragment& frag,
                                  rte_mbuf*& head, rte_mbuf*& last_seg,
                                  unsigned& nsegs) {
            return do_one_frag(copy_one_data_buf, qp, frag, head,
                               last_seg, nsegs);
        }
        /**
         * Allocates a single rte_mbuf and sets it to point to a given data
         * buffer.
         *
         * @param qp dpdk_qp handle (in)
         * @param m New allocated rte_mbuf (out)
         * @param va virtual address of a data buffer (in)
         * @param buf_len length of the data to copy (in)
         *
         * @return The actual number of bytes that has been set in the mbuf
         */
        static size_t set_one_data_buf(
            dpdk_qp& qp, rte_mbuf*& m, char* va, size_t buf_len) {
            static constexpr size_t max_frag_len = 15 * 1024; // 15K

            //
            // Currently we break a buffer on a 15K boundary because 82599
            // devices have a 15.5K limitation on a maximum single fragment
            // size.
            //
            rte_iova_t iova = rte_mem_virt2iova(va);

            if (iova == RTE_BAD_IOVA) {
                return copy_one_data_buf(qp, m, va, buf_len);
            }

            tx_buf* buf = qp.get_tx_buf();
            if (!buf) {
                return 0;
            }

            size_t len = std::min(buf_len, max_frag_len);

            buf->set_zc_info(va, iova, len);
            m = buf->rte_mbuf_p();

            return len;
        }
        /**
         * Allocates a single rte_mbuf and copies a given data into it.
         *
         * @param qp dpdk_qp handle (in)
         * @param m New allocated rte_mbuf (out)
         * @param data Data to copy from (in)
         * @param buf_len length of the data to copy (in)
         *
         * @return The actual number of bytes that has been copied
         */
        static size_t copy_one_data_buf(
            dpdk_qp& qp, rte_mbuf*& m, char* data, size_t buf_len)
        {
            tx_buf* buf = qp.get_tx_buf();
            if (!buf) {
                return 0;
            }

            size_t len = std::min(buf_len, inline_mbuf_data_size);

            m = buf->rte_mbuf_p();

            // mbuf_put()
            m->data_len = len;
            m->pkt_len  = len;

            qp._stats.tx.good.update_copy_stats(1, len);

            memcpy(rte_pktmbuf_mtod(m, void*), data, len);

            return len;
        }
    public:
        tx_buf(tx_buf_factory& fc) : _fc(fc) {
            _buf_iova = _mbuf.buf_iova;
            _data_off = _mbuf.data_off;
        }

        rte_mbuf* rte_mbuf_p() { return &_mbuf; }
        void set_zc_info(void* va, rte_iova_t iova, size_t len) {
            _mbuf.data_len = len;
            _mbuf.pkt_len  = len;

            // Set the mbuf to point to our data
            _mbuf.buf_addr = va;
            _mbuf.buf_iova = iova;
            _mbuf.data_off = 0;
            _is_zc         = true;
        }
    private:
        void reset_zc() {
            //
            // If this mbuf was the last in a cluster and contains an
            // original packet object then call the destructor of the
            // original packet object.
            //
            if (_p) {
                //
                // Reset the std::optional. This in particular is going
                // to call the "packet"'s destructor and reset the
                // "optional" state to "nonengaged".
                //
                _p = std::nullopt;
            } else if (!_is_zc) {
                return;
            }

            // Restore the rte_mbuf fields we trashed in set_zc_info()
            _mbuf.buf_iova = _buf_iova;
            _mbuf.buf_addr = rte_mbuf_to_baddr(&_mbuf);
            _mbuf.data_off = _data_off;

            _is_zc = false;
        }

    public:
        void recycle() {
            struct rte_mbuf *m = &_mbuf, *m_next;

            while (m != nullptr) {
                m_next = m->next;
                rte_pktmbuf_reset(m);
                _fc.put(me(m));
                m = m_next;
            }
        }
        void set_packet(packet&& p) {
            _p = std::move(p);
        }

    private:
        struct rte_mbuf _mbuf;
        MARKER private_start;
        std::optional<packet> _p;
        rte_iova_t _buf_iova;
        uint16_t _data_off;
        // TRUE if underlying mbuf has been used in the zero-copy flow
        bool _is_zc = false;
        // buffers' factory the buffer came from
        tx_buf_factory& _fc;
        MARKER private_end;
    };
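
    //
    // Layout note: a tx_buf overlays the rte_mbuf it manages - _mbuf must be
    // the very first member and everything between private_start and
    // private_end has to fit into RTE_PKTMBUF_HEADROOM, since the headroom is
    // the only extra room a pool object gives us. Both invariants are enforced
    // by static_asserts in the dpdk_qp constructor.
    //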
    class tx_buf_factory {
        //
        // Number of buffers to free in each GC iteration:
        // We want the buffers to be allocated from the mempool as many as
        // possible.
        //
        // On the other hand if there is no Tx for some time we want the
        // completions to be eventually handled. Thus we choose the smallest
        // possible packets count number here.
        //
        static constexpr int gc_count = 1;
    public:
        tx_buf_factory(uint16_t qid) {
            using namespace memory;

            sstring name = sstring(pktmbuf_pool_name) + to_sstring(qid) + "_tx";
            printf("Creating Tx mbuf pool '%s' [%u mbufs] ...\n",
                   name.c_str(), mbufs_per_queue_tx);
            if (HugetlbfsMemBackend) {
                size_t xmem_size;

                _xmem.reset(dpdk_qp::alloc_mempool_xmem(mbufs_per_queue_tx,
                                                        inline_mbuf_size,
                                                        xmem_size));
                if (!_xmem.get()) {
                    printf("Can't allocate a memory for Tx buffers\n");
                    exit(1);
                }

                //
                // We are going to push the buffers from the mempool into
                // the circular_buffer and then poll them from there anyway, so
                // we prefer to make a mempool non-atomic in this case.
                //
                _pool =
                    rte_mempool_create_empty(name.c_str(),
                                             mbufs_per_queue_tx,
                                             inline_mbuf_size,
                                             mbuf_cache_size,
                                             sizeof(struct rte_pktmbuf_pool_private),
                                             rte_socket_id(), 0);
                if (_pool) {
                    rte_pktmbuf_pool_init(_pool, nullptr);

                    if (rte_mempool_populate_virt(_pool, (char*)(_xmem.get()),
                                                  xmem_size, page_size,
                                                  nullptr, nullptr) <= 0) {
                        printf("Failed to populate mempool for Tx\n");
                        exit(1);
                    }

                    rte_mempool_obj_iter(_pool, rte_pktmbuf_init, nullptr);
                }
            } else {
                _pool =
                    rte_mempool_create(name.c_str(),
                                       mbufs_per_queue_tx, inline_mbuf_size,
                                       mbuf_cache_size,
                                       sizeof(struct rte_pktmbuf_pool_private),
                                       rte_pktmbuf_pool_init, nullptr,
                                       rte_pktmbuf_init, nullptr,
                                       rte_socket_id(), 0);
            }
1150 // Fill the factory with the buffers from the mempool allocated
1157 * @note Should not be called if there are no free tx_buf's
1159 * @return a free tx_buf object
1162 // Take completed from the HW first
1163 tx_buf
*pkt
= get_one_completed();
1165 if (HugetlbfsMemBackend
) {
1173 // If there are no completed at the moment - take from the
1176 if (_ring
.empty()) {
1186 void put(tx_buf
* buf
) {
1187 if (HugetlbfsMemBackend
) {
1190 _ring
.push_back(buf
);
1194 for (int cnt
= 0; cnt
< gc_count
; ++cnt
) {
1195 auto tx_buf_p
= get_one_completed();
    private:
        /**
         * Fill the mbufs circular buffer: after this the _pool will become
         * empty. We will use it to catch the completed buffers:
         *
         * - Underlying PMD drivers will "free" the mbufs once they are
         *   done with them.
         * - We will poll the _pktmbuf_pool_tx till it's empty and release
         *   all the buffers from the freed mbufs.
         */
        void init_factory() {
            while (rte_mbuf* mbuf = rte_pktmbuf_alloc(_pool)) {
                _ring.push_back(new(tx_buf::me(mbuf)) tx_buf{*this});
            }
        }
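
        //
        // In other words: the factory deliberately drains _pool at startup and
        // never allocates from it on the Tx path. From that point on the only
        // way an mbuf can re-enter _pool is the PMD freeing it after
        // transmission, so rte_pktmbuf_alloc(_pool) doubles as a Tx completion
        // poll (see get_one_completed() below).
        //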
        /**
         * PMD puts the completed buffers back into the mempool they have
         * originally come from.
         *
         * @note rte_pktmbuf_alloc() resets the mbuf so there is no need to call
         *       rte_pktmbuf_reset() here again.
         *
         * @return a single tx_buf that has been completed by HW.
         */
        tx_buf* get_one_completed() {
            return tx_buf::me(rte_pktmbuf_alloc(_pool));
        }

    private:
        std::vector<tx_buf*> _ring;
        rte_mempool* _pool = nullptr;
        std::unique_ptr<void, free_deleter> _xmem;
    };
    public:
        explicit dpdk_qp(dpdk_device* dev, uint16_t qid,
                         const std::string stats_plugin_name);

        virtual void rx_start() override;
        virtual future<> send(packet p) override {
            abort();
        }

        virtual ~dpdk_qp() {}

        virtual uint32_t send(circular_buffer<packet>& pb) override {
            if (HugetlbfsMemBackend) {
                // Zero-copy send
                return _send(pb, [&] (packet&& p) {
                    return tx_buf::from_packet_zc(std::move(p), *this);
                });
            } else {
                // "Copy"-send
                return _send(pb, [&](packet&& p) {
                    return tx_buf::from_packet_copy(std::move(p), *this);
                });
            }
        }

        dpdk_device& port() const { return *_dev; }
        tx_buf* get_tx_buf() { return _tx_buf_factory.get(); }
    private:
        template <class Func>
        uint32_t _send(circular_buffer<packet>& pb, Func packet_to_tx_buf_p) {
            if (_tx_burst.size() == 0) {
                for (auto&& p : pb) {
                    // TODO: assert() in a fast path! Remove me ASAP!
                    assert(p.len());

                    tx_buf* buf = packet_to_tx_buf_p(std::move(p));
                    if (!buf) {
                        break;
                    }

                    _tx_burst.push_back(buf->rte_mbuf_p());
                }
            }

            uint16_t sent = rte_eth_tx_burst(_dev->port_idx(), _qid,
                                             _tx_burst.data() + _tx_burst_idx,
                                             _tx_burst.size() - _tx_burst_idx);

            uint64_t nr_frags = 0, bytes = 0;

            for (int i = 0; i < sent; i++) {
                rte_mbuf* m = _tx_burst[_tx_burst_idx + i];
                bytes    += m->pkt_len;
                nr_frags += m->nb_segs;
                pb.pop_front();
            }

            _stats.tx.good.update_frags_stats(nr_frags, bytes);

            _tx_burst_idx += sent;

            if (_tx_burst_idx == _tx_burst.size()) {
                _tx_burst_idx = 0;
                _tx_burst.clear();
            }

            return sent;
        }
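
    // Note: rte_eth_tx_burst() may accept only a part of the burst when the
    // HW ring is full. The unsent tail is kept in _tx_burst and retried on the
    // next _send() invocation; _tx_burst_idx marks how far we have gotten.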
        /**
         * Allocate a new data buffer and set the mbuf to point to it.
         *
         * Do some DPDK hacks to work on PMD: it assumes that the buf_addr
         * points to the private data of RTE_PKTMBUF_HEADROOM before the actual
         * data buffer.
         *
         * @param m mbuf to update
         */
        static bool refill_rx_mbuf(rte_mbuf* m, size_t size = mbuf_data_size) {
            char* data;

            if (posix_memalign((void**)&data, size, size)) {
                return false;
            }

            rte_iova_t iova = rte_mem_virt2iova(data);

            //
            // Set the mbuf to point to our data.
            //
            // Do some DPDK hacks to work on PMD: it assumes that the buf_addr
            // points to the private data of RTE_PKTMBUF_HEADROOM before the
            // actual data buffer.
            //
            m->buf_addr = data - RTE_PKTMBUF_HEADROOM;
            m->buf_iova = iova - RTE_PKTMBUF_HEADROOM;
            return true;
        }
        static bool init_noninline_rx_mbuf(rte_mbuf* m,
                                           size_t size = mbuf_data_size) {
            if (!refill_rx_mbuf(m, size)) {
                return false;
            }
            // The below fields stay constant during the execution.
            m->buf_len  = size + RTE_PKTMBUF_HEADROOM;
            m->data_off = RTE_PKTMBUF_HEADROOM;
            return true;
        }
        bool init_rx_mbuf_pool();
        bool map_dma();
        bool rx_gc();
        bool refill_one_cluster(rte_mbuf* head);

        /**
         * Allocates a memory chunk to accommodate the given number of buffers of
         * the given size and fills a vector with underlying physical pages.
         *
         * The chunk is going to be used as an external memory buffer of the DPDK
         * memory pool.
         *
         * The chunk size is calculated using the get_mempool_xmem_size() function.
         *
         * @param num_bufs  Number of buffers (in)
         * @param buf_sz    Size of each buffer (in)
         * @param xmem_size Size of allocated memory chunk (out)
         *
         * @return a virtual address of the allocated memory chunk or nullptr in
         *         case of a failure.
         */
        static void* alloc_mempool_xmem(uint16_t num_bufs, uint16_t buf_sz,
                                        size_t& xmem_size);
        /**
         * Polls for a burst of incoming packets. This function will not block and
         * will immediately return after processing all available packets.
         */
        bool poll_rx_once();

        /**
         * Translates rte_mbuf's into net::packet's and feeds them to _rx_stream.
         *
         * @param bufs An array of received rte_mbuf's
         * @param count Number of buffers in the bufs[]
         */
        void process_packets(struct rte_mbuf **bufs, uint16_t count);

        /**
         * Translate rte_mbuf into the "packet".
         * @param m mbuf to translate
         *
         * @return an "optional" object representing the newly received data if in an
         *         "engaged" state or an error if in a "disengaged" state.
         */
        std::optional<packet> from_mbuf(rte_mbuf* m);

        /**
         * Transform an LRO rte_mbuf cluster into the "packet" object.
         * @param m HEAD of the mbufs' cluster to transform
         *
         * @return an "optional" object representing the newly received LRO packet if
         *         in an "engaged" state or an error if in a "disengaged" state.
         */
        std::optional<packet> from_mbuf_lro(rte_mbuf* m);
    private:
        dpdk_device* _dev;
        uint16_t _qid;
        rte_mempool *_pktmbuf_pool_rx;
        std::vector<rte_mbuf*> _rx_free_pkts;
        std::vector<rte_mbuf*> _rx_free_bufs;
        std::vector<fragment> _frags;
        std::vector<char*> _bufs;
        size_t _num_rx_free_segs = 0;
        reactor::poller _rx_gc_poller;
        std::unique_ptr<void, free_deleter> _rx_xmem;
        tx_buf_factory _tx_buf_factory;
        std::optional<reactor::poller> _rx_poller;
        reactor::poller _tx_gc_poller;
        std::vector<rte_mbuf*> _tx_burst;
        uint16_t _tx_burst_idx = 0;
        static constexpr phys_addr_t page_mask = ~(memory::page_size - 1);
};
int dpdk_device::init_port_start()
{
    assert(_port_idx < rte_eth_dev_count_avail());

    rte_eth_dev_info_get(_port_idx, &_dev_info);

    //
    // This is a workaround for a missing handling of a HW limitation in the
    // DPDK i40e driver. This and all the related _is_i40e_device code should
    // be removed once this handling is added.
    //
    if (sstring("rte_i40evf_pmd") == _dev_info.driver_name ||
        sstring("rte_i40e_pmd") == _dev_info.driver_name) {
        printf("Device is an Intel's 40G NIC. Enabling 8 fragments hack!\n");
        _is_i40e_device = true;
    }

    if (std::string("rte_vmxnet3_pmd") == _dev_info.driver_name) {
        printf("Device is a VMWare Virtual NIC. Enabling 16 fragments hack!\n");
        _is_vmxnet3_device = true;
    }
    //
    // Another workaround: this time for a lack of number of RSS bits.
    // ixgbe PF NICs support up to 16 RSS queues.
    // ixgbe VF NICs support up to 4 RSS queues.
    // i40e PF NICs support up to 64 RSS queues.
    // i40e VF NICs support up to 16 RSS queues.
    //
    if (sstring("rte_ixgbe_pmd") == _dev_info.driver_name) {
        _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16);
    } else if (sstring("rte_ixgbevf_pmd") == _dev_info.driver_name) {
        _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)4);
    } else if (sstring("rte_i40e_pmd") == _dev_info.driver_name) {
        _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)64);
    } else if (sstring("rte_i40evf_pmd") == _dev_info.driver_name) {
        _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16);
    }
    // Hardware offload capabilities
    // https://github.com/DPDK/dpdk/blob/v19.05/lib/librte_ethdev/rte_ethdev.h#L993-L1074
    //
    // We want to support all available offload features
    // TODO: below features are implemented in 17.05, should support new ones
    const uint64_t tx_offloads_wanted =
        DEV_TX_OFFLOAD_VLAN_INSERT      |
        DEV_TX_OFFLOAD_IPV4_CKSUM       |
        DEV_TX_OFFLOAD_UDP_CKSUM        |
        DEV_TX_OFFLOAD_TCP_CKSUM        |
        DEV_TX_OFFLOAD_SCTP_CKSUM       |
        DEV_TX_OFFLOAD_TCP_TSO          |
        DEV_TX_OFFLOAD_UDP_TSO          |
        DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM |
        DEV_TX_OFFLOAD_QINQ_INSERT      |
        DEV_TX_OFFLOAD_VXLAN_TNL_TSO    |
        DEV_TX_OFFLOAD_GRE_TNL_TSO      |
        DEV_TX_OFFLOAD_IPIP_TNL_TSO     |
        DEV_TX_OFFLOAD_GENEVE_TNL_TSO   |
        DEV_TX_OFFLOAD_MACSEC_INSERT;
    _dev_info.default_txconf.offloads =
        _dev_info.tx_offload_capa & tx_offloads_wanted;

    /* for port configuration all features are off by default */
    rte_eth_conf port_conf = { 0 };

    /* setting tx offloads for port */
    port_conf.txmode.offloads = _dev_info.default_txconf.offloads;

    printf("Port %d: max_rx_queues %d max_tx_queues %d\n",
           _port_idx, _dev_info.max_rx_queues, _dev_info.max_tx_queues);

    _num_queues = std::min({_num_queues, _dev_info.max_rx_queues, _dev_info.max_tx_queues});

    printf("Port %d: using %d %s\n", _port_idx, _num_queues,
           (_num_queues > 1) ? "queues" : "queue");
    // Set RSS mode: enable RSS if seastar is configured with more than 1 CPU.
    // Even if port has a single queue we still want the RSS feature to be
    // available in order to make HW calculate RSS hash for us.
    if (smp::count > 1) {
        if (_dev_info.hash_key_size == 40) {
            _rss_key = default_rsskey_40bytes;
        } else if (_dev_info.hash_key_size == 52) {
            _rss_key = default_rsskey_52bytes;
        } else if (_dev_info.hash_key_size != 0) {
            rte_exit(EXIT_FAILURE,
                "Port %d: We support only 40 or 52 bytes RSS hash keys, %d bytes key requested",
                _port_idx, _dev_info.hash_key_size);
        } else {
            _rss_key = default_rsskey_40bytes;
            _dev_info.hash_key_size = 40;
        }

        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        /* enable all supported rss offloads */
        port_conf.rx_adv_conf.rss_conf.rss_hf = _dev_info.flow_type_rss_offloads;
        if (_dev_info.hash_key_size) {
            port_conf.rx_adv_conf.rss_conf.rss_key = const_cast<uint8_t *>(_rss_key.data());
            port_conf.rx_adv_conf.rss_conf.rss_key_len = _dev_info.hash_key_size;
        }
    } else {
        port_conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
    }
    if (_num_queues > 1) {
        if (_dev_info.reta_size) {
            // RETA size should be a power of 2
            assert((_dev_info.reta_size & (_dev_info.reta_size - 1)) == 0);

            // Set the RSS table to the correct size
            _redir_table.resize(_dev_info.reta_size);
            _rss_table_bits = std::lround(std::log2(_dev_info.reta_size));
            printf("Port %d: RSS table size is %d\n",
                   _port_idx, _dev_info.reta_size);
        } else {
            _rss_table_bits = std::lround(std::log2(_dev_info.max_rx_queues));
        }
    } else {
        _redir_table.push_back(0);
    }
    // Set Rx VLAN stripping
    if (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
        port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
    }

#ifdef RTE_ETHDEV_HAS_LRO_SUPPORT
    // Enable LRO
    if (_use_lro && (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO)) {
        printf("LRO is on\n");
        port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
        _hw_features.rx_lro = true;
    } else
#endif
        printf("LRO is off\n");
    // Check that all CSUM features are either all set all together or not set
    // all together. If this assumption breaks we need to rework the below logic
    // by splitting the csum offload feature bit into separate bits for IPv4,
    // TCP and UDP.
    assert(((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) ||
           (!(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            !(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            !(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)));

    // Set Rx checksum checking
    if ((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
        (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
        (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
        printf("RX checksum offload supported\n");
        port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
        _hw_features.rx_csum_offload = 1;
    }

    if ((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
        printf("TX ip checksum offload supported\n");
        _hw_features.tx_csum_ip_offload = 1;
    }
    // TSO is supported starting from DPDK v1.8
    if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
        printf("TSO is supported\n");
        _hw_features.tx_tso = 1;
    }

    // There is no UFO support in the PMDs yet.
    if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO) {
        printf("UFO is supported\n");
        _hw_features.tx_ufo = 1;
    }

    // Check that Tx TCP and UDP CSUM features are either all set all together
    // or not set all together. If this assumption breaks we need to rework the
    // below logic by splitting the csum offload feature bit into separate bits
    // for TCP and UDP.
    assert(((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) ||
           (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            !(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)));

    if ((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
        (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
        printf("TX TCP&UDP checksum offload supported\n");
        _hw_features.tx_csum_l4_offload = 1;
    }

    printf("Port %u init ... ", _port_idx);

    /*
     * Standard DPDK port initialisation - config port, then set up
     * tx and rx rings.
     */
    int retval;
    if ((retval = rte_eth_dev_configure(_port_idx, _num_queues, _num_queues,
                                        &port_conf)) != 0) {
        return retval;
    }

    //rte_eth_promiscuous_enable(port_num);
    return 0;
}
void dpdk_device::set_hw_flow_control()
{
    // Read the port's current/default flow control settings
    struct rte_eth_fc_conf fc_conf;
    auto ret = rte_eth_dev_flow_ctrl_get(_port_idx, &fc_conf);

    if (ret == -ENOTSUP) {
        goto not_supported;
    }

    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Port %u: failed to get hardware flow control settings: (error %d)\n", _port_idx, ret);
    }

    if (_enable_fc) {
        fc_conf.mode = RTE_FC_FULL;
    } else {
        fc_conf.mode = RTE_FC_NONE;
    }

    ret = rte_eth_dev_flow_ctrl_set(_port_idx, &fc_conf);
    if (ret == -ENOTSUP) {
        goto not_supported;
    }

    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Port %u: failed to set hardware flow control (error %d)\n", _port_idx, ret);
    }

    printf("Port %u: %s HW FC\n", _port_idx,
           (_enable_fc ? "Enabling" : "Disabling"));
    return;

not_supported:
    printf("Port %u: Changing HW FC settings is not supported\n", _port_idx);
}
void dpdk_device::init_port_fini()
{
    // Changing FC requires HW reset, so set it before the port is initialized.
    set_hw_flow_control();

    if (rte_eth_dev_start(_port_idx) < 0) {
        rte_exit(EXIT_FAILURE, "Cannot start port %d\n", _port_idx);
    }

    /* need to defer initialize xstats since NIC specific xstat entries
       show up only after port initialization */
    _xstats.start();

    _stats_collector.set_callback([&] {
        rte_eth_stats rte_stats = {};
        int rc = rte_eth_stats_get(_port_idx, &rte_stats);
        if (rc) {
            printf("Failed to get port statistics: %s\n", strerror(rc));
        }
        _xstats.update_xstats();

        _stats.rx.good.mcast      =
            _xstats.get_value(dpdk_xstats::xstat_id::rx_multicast_packets);
        _stats.rx.good.pause_xon  =
            _xstats.get_value(dpdk_xstats::xstat_id::rx_xon_packets);
        _stats.rx.good.pause_xoff =
            _xstats.get_value(dpdk_xstats::xstat_id::rx_xoff_packets);

        _stats.rx.bad.crc         =
            _xstats.get_value(dpdk_xstats::xstat_id::rx_crc_errors);
        _stats.rx.bad.len         =
            _xstats.get_value(dpdk_xstats::xstat_id::rx_length_errors) +
            _xstats.get_value(dpdk_xstats::xstat_id::rx_undersize_errors) +
            _xstats.get_value(dpdk_xstats::xstat_id::rx_oversize_errors);
        _stats.rx.bad.total       = rte_stats.ierrors;

        _stats.tx.good.pause_xon  =
            _xstats.get_value(dpdk_xstats::xstat_id::tx_xon_packets);
        _stats.tx.good.pause_xoff =
            _xstats.get_value(dpdk_xstats::xstat_id::tx_xoff_packets);

        _stats.tx.bad.total       = rte_stats.oerrors;
    });
    // TODO: replace deprecated filter api with generic flow api
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
    if (_num_queues > 1) {
        if (!rte_eth_dev_filter_supported(_port_idx, RTE_ETH_FILTER_HASH)) {
            printf("Port %d: HASH FILTER configuration is supported\n", _port_idx);

            // Setup HW to use the TOEPLITZ hash function as an RSS hash function
            struct rte_eth_hash_filter_info info = {};

            info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
            info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

            if (rte_eth_dev_filter_ctrl(_port_idx, RTE_ETH_FILTER_HASH,
                                        RTE_ETH_FILTER_SET, &info) < 0) {
                rte_exit(EXIT_FAILURE, "Cannot set hash function on a port %d\n", _port_idx);
            }
        }

        set_rss_table();
    }
#pragma GCC diagnostic pop

    // Wait for a link
    check_port_link_status();

    printf("Created DPDK device\n");
}
template <bool HugetlbfsMemBackend>
void* dpdk_qp<HugetlbfsMemBackend>::alloc_mempool_xmem(
    uint16_t num_bufs, uint16_t buf_sz, size_t& xmem_size)
{
    using namespace memory;
    char* xmem;
    struct rte_mempool_objsz mp_obj_sz = {};

    rte_mempool_calc_obj_size(buf_sz, 0, &mp_obj_sz);

    xmem_size =
        get_mempool_xmem_size(num_bufs,
                              mp_obj_sz.elt_size + mp_obj_sz.header_size +
                                                   mp_obj_sz.trailer_size,
                              page_bits);

    // Aligning to 2M causes the further failure in small allocations.
    // TODO: Check why - and fix.
    if (posix_memalign((void**)&xmem, page_size, xmem_size)) {
        printf("Can't allocate %ld bytes aligned to %ld\n",
               xmem_size, page_size);
        return nullptr;
    }

    return xmem;
}
template <bool HugetlbfsMemBackend>
bool dpdk_qp<HugetlbfsMemBackend>::init_rx_mbuf_pool()
{
    using namespace memory;
    sstring name = sstring(pktmbuf_pool_name) + to_sstring(_qid) + "_rx";

    printf("Creating Rx mbuf pool '%s' [%u mbufs] ...\n",
           name.c_str(), mbufs_per_queue_rx);

    //
    // If we have a hugetlbfs memory backend we may perform a virt2phys
    // translation and memory is "pinned". Therefore we may provide an external
    // memory for DPDK pools and this way significantly reduce the memory needed
    // for the DPDK in this case.
    //
    if (HugetlbfsMemBackend) {
        size_t xmem_size;

        _rx_xmem.reset(alloc_mempool_xmem(mbufs_per_queue_rx, mbuf_overhead,
                                          xmem_size));
        if (!_rx_xmem.get()) {
            printf("Can't allocate a memory for Rx buffers\n");
            return false;
        }
        //
        // Don't pass single-producer/single-consumer flags to mbuf create as it
        // seems faster to use a cache instead.
        //
        struct rte_pktmbuf_pool_private roomsz = {};
        roomsz.mbuf_data_room_size = mbuf_data_size + RTE_PKTMBUF_HEADROOM;
        _pktmbuf_pool_rx =
            rte_mempool_create_empty(name.c_str(),
                                     mbufs_per_queue_rx, mbuf_overhead,
                                     mbuf_cache_size,
                                     sizeof(struct rte_pktmbuf_pool_private),
                                     rte_socket_id(), 0);
        if (!_pktmbuf_pool_rx) {
            printf("Failed to create mempool for Rx\n");
            return false;
        }

        rte_pktmbuf_pool_init(_pktmbuf_pool_rx, as_cookie(roomsz));

        if (rte_mempool_populate_virt(_pktmbuf_pool_rx,
                                      (char*)(_rx_xmem.get()), xmem_size,
                                      page_size,
                                      nullptr, nullptr) < 0) {
            printf("Failed to populate mempool for Rx\n");
            return false;
        }

        rte_mempool_obj_iter(_pktmbuf_pool_rx, rte_pktmbuf_init, nullptr);
        // reserve the memory for Rx buffers containers
        _rx_free_pkts.reserve(mbufs_per_queue_rx);
        _rx_free_bufs.reserve(mbufs_per_queue_rx);

        //
        // 1) Pull all entries from the pool.
        // 2) Bind data buffers to each of them.
        // 3) Return them back to the pool.
        //
        for (int i = 0; i < mbufs_per_queue_rx; i++) {
            rte_mbuf* m = rte_pktmbuf_alloc(_pktmbuf_pool_rx);
            assert(m);
            _rx_free_bufs.push_back(m);
        }

        for (auto&& m : _rx_free_bufs) {
            if (!init_noninline_rx_mbuf(m)) {
                printf("Failed to allocate data buffers for Rx ring. "
                       "Consider increasing the amount of memory.\n");
                exit(1);
            }
        }

        rte_mempool_put_bulk(_pktmbuf_pool_rx, (void**)_rx_free_bufs.data(),
                             _rx_free_bufs.size());

        _rx_free_bufs.clear();
    } else {
        struct rte_pktmbuf_pool_private roomsz = {};
        roomsz.mbuf_data_room_size = inline_mbuf_data_size + RTE_PKTMBUF_HEADROOM;
        _pktmbuf_pool_rx =
            rte_mempool_create(name.c_str(),
                               mbufs_per_queue_rx, inline_mbuf_size,
                               mbuf_cache_size,
                               sizeof(struct rte_pktmbuf_pool_private),
                               rte_pktmbuf_pool_init, as_cookie(roomsz),
                               rte_pktmbuf_init, nullptr,
                               rte_socket_id(), 0);
    }

    return _pktmbuf_pool_rx != nullptr;
}
// Map DMA address explicitly.
// XXX: does NOT work with Mellanox NICs as they use IB libs instead of VFIO.
template <bool HugetlbfsMemBackend>
bool dpdk_qp<HugetlbfsMemBackend>::map_dma()
{
    auto m = memory::get_memory_layout();
    rte_iova_t iova = rte_mem_virt2iova((const void*)m.start);

    return rte_vfio_dma_map(m.start, iova, m.end - m.start) == 0;
}
void dpdk_device::check_port_link_status()
{
    using namespace std::literals::chrono_literals;
    int count = 0;
    constexpr auto check_interval = 100ms;

    std::cout << "\nChecking link status " << std::endl;
    auto t = new timer<>;
    t->set_callback([this, count, t] () mutable {
        const int max_check_time = 90;  /* 9s (90 * 100ms) in total */
        struct rte_eth_link link;
        memset(&link, 0, sizeof(link));
        rte_eth_link_get_nowait(_port_idx, &link);

        if (link.link_status) {
            std::cout <<
                "done\nPort " << static_cast<unsigned>(_port_idx) <<
                " Link Up - speed " << link.link_speed <<
                " Mbps - " << ((link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                               ("full-duplex") : ("half-duplex\n")) <<
                std::endl;

            _link_ready_promise.set_value();

            // We may start collecting statistics only after the Link is UP.
            _stats_collector.arm_periodic(2s);
        } else if (count++ < max_check_time) {
            std::cout << "." << std::flush;
            return;
        } else {
            std::cout << "done\nPort " << _port_idx << " Link Down" << std::endl;
        }
        t->cancel();
        delete t;
    });
    t->arm_periodic(check_interval);
}
// This function uses offsetof with non POD types.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winvalid-offsetof"

template <bool HugetlbfsMemBackend>
dpdk_qp<HugetlbfsMemBackend>::dpdk_qp(dpdk_device* dev, uint16_t qid,
                                      const std::string stats_plugin_name)
    : qp(true, stats_plugin_name, qid), _dev(dev), _qid(qid),
      _rx_gc_poller(reactor::poller::simple([&] { return rx_gc(); })),
      _tx_buf_factory(qid),
      _tx_gc_poller(reactor::poller::simple([&] { return _tx_buf_factory.gc(); }))
{
    if (!init_rx_mbuf_pool()) {
        rte_exit(EXIT_FAILURE, "Cannot initialize mbuf pools\n");
    }

    if (HugetlbfsMemBackend && !map_dma()) {
        rte_exit(EXIT_FAILURE, "Cannot map DMA\n");
    }

    static_assert(offsetof(class tx_buf, private_end) -
                  offsetof(class tx_buf, private_start) <= RTE_PKTMBUF_HEADROOM,
                  "RTE_PKTMBUF_HEADROOM is less than dpdk_qp::tx_buf size! "
                  "Increase the headroom size in the DPDK configuration");
    static_assert(offsetof(class tx_buf, _mbuf) == 0,
                  "There is a pad at the beginning of the tx_buf before _mbuf "
                  "field!");
    static_assert((inline_mbuf_data_size & (inline_mbuf_data_size - 1)) == 0,
                  "inline_mbuf_data_size has to be a power of two!");

    if (rte_eth_rx_queue_setup(_dev->port_idx(), _qid, default_ring_size,
                               rte_eth_dev_socket_id(_dev->port_idx()),
                               _dev->def_rx_conf(), _pktmbuf_pool_rx) < 0) {
        rte_exit(EXIT_FAILURE, "Cannot initialize rx queue\n");
    }

    if (rte_eth_tx_queue_setup(_dev->port_idx(), _qid, default_ring_size,
                               rte_eth_dev_socket_id(_dev->port_idx()), _dev->def_tx_conf()) < 0) {
        rte_exit(EXIT_FAILURE, "Cannot initialize tx queue\n");
    }

    // Register error statistics: Rx total and checksum errors
    namespace sm = seastar::metrics;
    _metrics.add_group(_stats_plugin_name, {
        sm::make_derive(_queue_name + "_rx_csum_errors", _stats.rx.bad.csum,
            sm::description("Counts a number of packets received by this queue that have a bad CSUM value. "
                "A non-zero value of this metric usually indicates a HW issue, e.g. a bad cable.")),

        sm::make_derive(_queue_name + "_rx_errors", _stats.rx.bad.total,
            sm::description("Counts a total number of errors in the ingress path for this queue: CSUM errors, etc.")),

        sm::make_derive(_queue_name + "_rx_no_memory_errors", _stats.rx.bad.no_mem,
            sm::description("Counts a number of ingress packets received by this HW queue but dropped by the SW due to low memory. "
                "A non-zero value indicates that seastar doesn't have enough memory to handle the packet reception or the memory is too fragmented.")),
    });
}

#pragma GCC diagnostic pop
template <bool HugetlbfsMemBackend>
void dpdk_qp<HugetlbfsMemBackend>::rx_start() {
    _rx_poller = reactor::poller::simple([&] { return poll_rx_once(); });
}
template <>
inline std::optional<packet>
dpdk_qp<false>::from_mbuf_lro(rte_mbuf* m)
{
    //
    // Try to allocate a buffer for the whole packet's data.
    // If we fail - construct the packet from mbufs.
    // If we succeed - copy the data into this buffer, create a packet based on
    // this buffer and return the mbuf to its pool.
    //
    auto pkt_len = rte_pktmbuf_pkt_len(m);
    char* buf = (char*)malloc(pkt_len);
    if (buf) {
        // Copy the contents of the packet into the buffer we've just allocated
        size_t offset = 0;
        for (rte_mbuf* m1 = m; m1 != nullptr; m1 = m1->next) {
            char* data = rte_pktmbuf_mtod(m1, char*);
            auto len = rte_pktmbuf_data_len(m1);

            rte_memcpy(buf + offset, data, len);
            offset += len;
        }

        rte_pktmbuf_free(m);

        return packet(fragment{buf, pkt_len}, make_free_deleter(buf));
    }

    // Drop if allocation failed
    rte_pktmbuf_free(m);

    return std::nullopt;
}
template <>
inline std::optional<packet>
dpdk_qp<false>::from_mbuf(rte_mbuf* m)
{
    if (!_dev->hw_features_ref().rx_lro || rte_pktmbuf_is_contiguous(m)) {
        //
        // Try to allocate a buffer for packet's data. If we fail - give the
        // application an mbuf itself. If we succeed - copy the data into this
        // buffer, create a packet based on this buffer and return the mbuf to
        // its pool.
        //
        auto len = rte_pktmbuf_data_len(m);
        char* buf = (char*)malloc(len);

        if (!buf) {
            // Drop if allocation failed
            rte_pktmbuf_free(m);

            return std::nullopt;
        }

        rte_memcpy(buf, rte_pktmbuf_mtod(m, char*), len);
        rte_pktmbuf_free(m);

        return packet(fragment{buf, len}, make_free_deleter(buf));
    } else {
        return from_mbuf_lro(m);
    }
}
template <>
inline std::optional<packet>
dpdk_qp<true>::from_mbuf_lro(rte_mbuf* m)
{
    _frags.clear();
    _bufs.clear();

    for (; m != nullptr; m = m->next) {
        char* data = rte_pktmbuf_mtod(m, char*);

        _frags.emplace_back(fragment{data, rte_pktmbuf_data_len(m)});
        _bufs.push_back(data);
    }

    return packet(_frags.begin(), _frags.end(),
                  make_deleter(deleter(),
                               [bufs_vec = std::move(_bufs)] {
                                   for (auto&& b : bufs_vec) {
                                       free(b);
                                   }
                               }));
}
template <>
inline std::optional<packet> dpdk_qp<true>::from_mbuf(rte_mbuf* m)
{
    _rx_free_pkts.push_back(m);
    _num_rx_free_segs += m->nb_segs;

    if (!_dev->hw_features_ref().rx_lro || rte_pktmbuf_is_contiguous(m)) {
        char* data = rte_pktmbuf_mtod(m, char*);

        return packet(fragment{data, rte_pktmbuf_data_len(m)},
                      make_free_deleter(data));
    } else {
        return from_mbuf_lro(m);
    }
}
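
//
// To summarize the two from_mbuf() specializations above: with a hugetlbfs
// memory backend the mbuf's data buffer is handed to the stack as-is
// (zero-copy) and the mbuf itself is queued on _rx_free_pkts for a later
// refill by rx_gc(); without it the data is copied into a malloc()'ed buffer
// and the mbuf is returned to its pool immediately.
//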
template <bool HugetlbfsMemBackend>
inline bool dpdk_qp<HugetlbfsMemBackend>::refill_one_cluster(rte_mbuf* head)
{
    for (; head != nullptr; head = head->next) {
        if (!refill_rx_mbuf(head)) {
            //
            // If we failed to allocate a new buffer - push the rest of the
            // cluster back to the free_packets list for a later retry.
            //
            _rx_free_pkts.push_back(head);
            return false;
        }
        _rx_free_bufs.push_back(head);
    }

    return true;
}
template <bool HugetlbfsMemBackend>
bool dpdk_qp<HugetlbfsMemBackend>::rx_gc()
{
    if (_num_rx_free_segs >= rx_gc_thresh) {
        while (!_rx_free_pkts.empty()) {
            //
            // Use back() + pop_back() semantics to avoid an extra
            // _rx_free_pkts.clear() at the end of the function - clear() has a
            // linear complexity.
            //
            auto m = _rx_free_pkts.back();
            _rx_free_pkts.pop_back();

            if (!refill_one_cluster(m)) {
                break;
            }
        }

        if (_rx_free_bufs.size()) {
            rte_mempool_put_bulk(_pktmbuf_pool_rx,
                                 (void **)_rx_free_bufs.data(),
                                 _rx_free_bufs.size());

            // TODO: assert() in a fast path! Remove me ASAP!
            assert(_num_rx_free_segs >= _rx_free_bufs.size());

            _num_rx_free_segs -= _rx_free_bufs.size();
            _rx_free_bufs.clear();

            // TODO: assert() in a fast path! Remove me ASAP!
            assert((_rx_free_pkts.empty() && !_num_rx_free_segs) ||
                   (!_rx_free_pkts.empty() && _num_rx_free_segs));
        }
    }

    return _num_rx_free_segs >= rx_gc_thresh;
}
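
// Poller contract: rx_gc() returns true when enough unreleased segments remain
// to cross rx_gc_thresh again, i.e. there is still garbage-collection work
// pending for the next reactor poll.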
template <bool HugetlbfsMemBackend>
void dpdk_qp<HugetlbfsMemBackend>::process_packets(
    struct rte_mbuf **bufs, uint16_t count)
{
    uint64_t nr_frags = 0, bytes = 0;

    for (uint16_t i = 0; i < count; i++) {
        struct rte_mbuf *m = bufs[i];
        offload_info oi;

        std::optional<packet> p = from_mbuf(m);

        // Drop the packet if translation above has failed
        if (!p) {
            _stats.rx.bad.inc_no_mem();
            continue;
        }

        nr_frags += m->nb_segs;
        bytes    += m->pkt_len;

        // Set stripped VLAN value if available
        if ((m->ol_flags & PKT_RX_VLAN_STRIPPED) &&
            (m->ol_flags & PKT_RX_VLAN)) {
            oi.vlan_tci = m->vlan_tci;
        }

        if (_dev->hw_features().rx_csum_offload) {
            if (m->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
                // Packet with bad checksum, just drop it.
                _stats.rx.bad.inc_csum_err();
                continue;
            }
            // Note that when _hw_features.rx_csum_offload is on, the receive
            // code for ip, tcp and udp will assume they don't need to check
            // the checksum again, because we did this here.
        }

        (*p).set_offload_info(oi);
        if (m->ol_flags & PKT_RX_RSS_HASH) {
            (*p).set_rss_hash(m->hash.rss);
        }

        _dev->l2receive(std::move(*p));
    }

    _stats.rx.good.update_pkts_bunch(count);
    _stats.rx.good.update_frags_stats(nr_frags, bytes);

    if (!HugetlbfsMemBackend) {
        _stats.rx.good.copy_frags = _stats.rx.good.nr_frags;
        _stats.rx.good.copy_bytes = _stats.rx.good.bytes;
    }
}
>
2201 bool dpdk_qp
<HugetlbfsMemBackend
>::poll_rx_once()
2203 struct rte_mbuf
*buf
[packet_read_size
];
2206 uint16_t rx_count
= rte_eth_rx_burst(_dev
->port_idx(), _qid
,
2207 buf
, packet_read_size
);
2209 /* Now process the NIC packets read */
2210 if (likely(rx_count
> 0)) {
2211 process_packets(buf
, rx_count
);
void dpdk_device::set_rss_table()
{
    if (_dev_info.reta_size == 0)
        return;

    int reta_conf_size =
        std::max(1, _dev_info.reta_size / RTE_RETA_GROUP_SIZE);
    std::vector<rte_eth_rss_reta_entry64> reta_conf(reta_conf_size);

    // Configure the HW indirection table
    unsigned i = 0;
    for (auto& x : reta_conf) {
        x.mask = ~0ULL;
        for (auto& r : x.reta) {
            r = i++ % _num_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(_port_idx, reta_conf.data(), _dev_info.reta_size)) {
        rte_exit(EXIT_FAILURE, "Port %d: Failed to update an RSS indirection table", _port_idx);
    }

    // Fill our local indirection table. Make it in a separate loop to keep things simple.
    i = 0;
    for (auto& r : _redir_table) {
        r = i++ % _num_queues;
    }
}
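
// Example: with reta_size == 128 and 4 queues the table becomes
// 0,1,2,3,0,1,... so hash2qid() effectively computes (hash & 127) % 4. The HW
// table and the local copy use the same round-robin layout, keeping HW
// dispatch and SW hash2qid() in agreement.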
std::unique_ptr<qp> dpdk_device::init_local_queue(const program_options::option_group& opts, uint16_t qid) {
    auto net_opts = dynamic_cast<const net::native_stack_options*>(&opts);
    assert(net_opts);

    std::unique_ptr<qp> qp;
    if (net_opts->_hugepages) {
        qp = std::make_unique<dpdk_qp<true>>(this, qid,
                _stats_plugin_name + "-" + _stats_plugin_inst);
    } else {
        qp = std::make_unique<dpdk_qp<false>>(this, qid,
                _stats_plugin_name + "-" + _stats_plugin_inst);
    }

    // FIXME: future is discarded
    (void)smp::submit_to(_home_cpu, [this] () mutable {
        if (++_queues_ready == _num_queues) {
            init_port_fini();
        }
    });

    return qp;
}
/******************************** Interface functions *************************/

std::unique_ptr<net::device> create_dpdk_net_device(
                                    uint16_t port_idx,
                                    uint16_t num_queues,
                                    bool use_lro,
                                    bool enable_fc)
{
    static bool called = false;

    assert(!called);
    assert(dpdk::eal::initialized);

    called = true;

    // Check that we have at least one DPDK-able port
    if (rte_eth_dev_count_avail() == 0) {
        rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
    } else {
        printf("ports number: %d\n", rte_eth_dev_count_avail());
    }

    return std::make_unique<dpdk::dpdk_device>(port_idx, num_queues, use_lro,
                                               enable_fc);
}

std::unique_ptr<net::device> create_dpdk_net_device(
                                    const hw_config& hw_cfg)
{
    return create_dpdk_net_device(*hw_cfg.port_index, smp::count, hw_cfg.lro, hw_cfg.hw_fc);
}
#else

#include <seastar/net/dpdk.hh>

#endif // SEASTAR_HAVE_DPDK
namespace seastar::net {

dpdk_options::dpdk_options(program_options::option_group* parent_group)
#ifdef SEASTAR_HAVE_DPDK
    : program_options::option_group(parent_group, "DPDK net options")
    , dpdk_port_index(*this, "dpdk-port-index",
            0,
            "DPDK Port Index")
    , hw_fc(*this, "hw-fc",
            "on",
            "Enable HW Flow Control (on / off)")
#else
    : program_options::option_group(parent_group, "DPDK net options", program_options::unused{})
    , dpdk_port_index(*this, "dpdk-port-index", program_options::unused{})
    , hw_fc(*this, "hw-fc", program_options::unused{})
#endif
{
}

}

#if 0
        ("csum-offload",
                boost::program_options::value<std::string>()->default_value("on"),
                "Enable checksum offload feature (on / off)")
        ("tso",
                boost::program_options::value<std::string>()->default_value("on"),
                "Enable TCP segment offload feature (on / off)")
        ("ufo",
                boost::program_options::value<std::string>()->default_value("on"),
                "Enable UDP fragmentation offload feature (on / off)")
#endif