int DPDKDevice::init_port_start()
{
- ceph_assert(_port_idx < rte_eth_dev_count());
+ ceph_assert(_port_idx < rte_eth_dev_count_avail());
rte_eth_dev_info_get(_port_idx, &_dev_info);
_dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16);
}
- // Clear txq_flags - we want to support all available offload features
- // except for multi-mempool and refcnt'ing which we don't need
- _dev_info.default_txconf.txq_flags =
- ETH_TXQ_FLAGS_NOMULTMEMP | ETH_TXQ_FLAGS_NOREFCOUNT;
-
- //
- // Disable features that are not supported by port's HW
- //
- if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
- _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
- }
-
- if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
- _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
- }
-
- if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
- _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
- }
-
- if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
- _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
- }
-
- if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
- _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
- }
-
- if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO)) {
- _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
- }
+ // Hardware offload capabilities
+ // https://github.com/DPDK/dpdk/blob/v19.05/lib/librte_ethdev/rte_ethdev.h#L993-L1074
+ // We want to support all available offload features
+ // TODO: the features below are the ones implemented as of DPDK 17.05; newer ones should be supported too
+ const uint64_t tx_offloads_wanted =
+ DEV_TX_OFFLOAD_VLAN_INSERT |
+ DEV_TX_OFFLOAD_IPV4_CKSUM |
+ DEV_TX_OFFLOAD_UDP_CKSUM |
+ DEV_TX_OFFLOAD_TCP_CKSUM |
+ DEV_TX_OFFLOAD_SCTP_CKSUM |
+ DEV_TX_OFFLOAD_TCP_TSO |
+ DEV_TX_OFFLOAD_UDP_TSO |
+ DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM |
+ DEV_TX_OFFLOAD_QINQ_INSERT |
+ DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+ DEV_TX_OFFLOAD_GRE_TNL_TSO |
+ DEV_TX_OFFLOAD_IPIP_TNL_TSO |
+ DEV_TX_OFFLOAD_GENEVE_TNL_TSO |
+ DEV_TX_OFFLOAD_MACSEC_INSERT;
+
+ _dev_info.default_txconf.offloads =
+ _dev_info.tx_offload_capa & tx_offloads_wanted;
/* for port configuration all features are off by default */
rte_eth_conf port_conf = { 0 };
+ /* setting tx offloads for port */
+ port_conf.txmode.offloads = _dev_info.default_txconf.offloads;
+
ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": max_rx_queues "
<< _dev_info.max_rx_queues << " max_tx_queues "
<< _dev_info.max_tx_queues << dendl;
_num_queues = std::min({_num_queues, _dev_info.max_rx_queues, _dev_info.max_tx_queues});
ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": using "
- << _num_queues << " queues" << dendl;;
+ << _num_queues << " queues" << dendl;
// Set RSS mode: enable RSS if seastar is configured with more than 1 CPU.
// Even if port has a single queue we still want the RSS feature to be
} else if (_dev_info.hash_key_size == 52) {
_rss_key = default_rsskey_52bytes;
} else if (_dev_info.hash_key_size != 0) {
- // WTF?!!
rte_exit(EXIT_FAILURE,
"Port %d: We support only 40 or 52 bytes RSS hash keys, %d bytes key requested",
_port_idx, _dev_info.hash_key_size);
}
port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
- port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
+ /* enable all supported rss offloads */
+ port_conf.rx_adv_conf.rss_conf.rss_hf = _dev_info.flow_type_rss_offloads;
if (_dev_info.hash_key_size) {
port_conf.rx_adv_conf.rss_conf.rss_key = const_cast<uint8_t *>(_rss_key.data());
port_conf.rx_adv_conf.rss_conf.rss_key_len = _dev_info.hash_key_size;
// Set Rx VLAN stripping
if (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
- port_conf.rxmode.hw_vlan_strip = 1;
+ port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
}
- // Enable HW CRC stripping
- port_conf.rxmode.hw_strip_crc = 1;
-
#ifdef RTE_ETHDEV_HAS_LRO_SUPPORT
// Enable LRO
if (_use_lro && (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO)) {
ldout(cct, 1) << __func__ << " LRO is on" << dendl;
- port_conf.rxmode.enable_lro = 1;
+ port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
_hw_features.rx_lro = true;
} else
#endif
if ((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
ldout(cct, 1) << __func__ << " RX checksum offload supported" << dendl;
- port_conf.rxmode.hw_ip_checksum = 1;
+ port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
_hw_features.rx_csum_offload = 1;
}
return -1;
}
- if (_num_queues > 1) {
- if (!rte_eth_dev_filter_supported(_port_idx, RTE_ETH_FILTER_HASH)) {
- ldout(cct, 5) << __func__ << " Port " << _port_idx << ": HASH FILTER configuration is supported" << dendl;
-
- // Setup HW touse the TOEPLITZ hash function as an RSS hash function
- struct rte_eth_hash_filter_info info = {};
-
- info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
- info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
-
- if (rte_eth_dev_filter_ctrl(_port_idx, RTE_ETH_FILTER_HASH,
- RTE_ETH_FILTER_SET, &info) < 0) {
- lderr(cct) << __func__ << " cannot set hash function on a port " << _port_idx << dendl;
- return -1;
- }
- }
-
+ if (_num_queues > 1)
set_rss_table();
- }
// Wait for a link
if (check_port_link_status() < 0) {
return 0;
}
+/**
+ * Fill the local redirection table and try to install an rte_flow rule
+ * that applies an RSS action using the Toeplitz hash function to ingress
+ * packets.
+ *
+ * _redir_table is always filled round-robin with queue indexes in
+ * [0, _num_queues), whether or not the flow rule can be installed.
+ *
+ * The rule is only created when rte_flow_validate() accepts it. A rejected
+ * or failed rule is logged and otherwise ignored, so the caller continues
+ * without it. On success the handle is stored in _flow.
+ */
+void DPDKDevice::set_rss_table()
+{
+  struct rte_flow_attr attr;
+  struct rte_flow_item pattern[1];
+  struct rte_flow_action action[2];
+  struct rte_flow_action_rss rss_conf;
+  struct rte_flow_error error;
+
+  /*
+   * Zero every structure handed to the flow API so that the members that
+   * are not assigned explicitly below (e.g. spec/last/mask of the END item
+   * and conf of the END action) never hold indeterminate values.
+   */
+  memset(&attr, 0, sizeof(attr));
+  memset(pattern, 0, sizeof(pattern));
+  memset(action, 0, sizeof(action));
+  memset(&rss_conf, 0, sizeof(rss_conf));
+  memset(&error, 0, sizeof(error));
+
+  /*
+   * set the rule attribute.
+   * in this case only ingress packets will be checked.
+   */
+  attr.ingress = 1;
+
+  /* the final level must be always type end */
+  pattern[0].type = RTE_FLOW_ITEM_TYPE_END;
+
+  /* always fill our local indirection table, round-robin over the queues */
+  uint16_t i = 0;
+  for (auto& r : _redir_table) {
+    r = i++ % _num_queues;
+  }
+
+  /*
+   * create the action sequence.
+   * one action only, set rss hash func to toeplitz.
+   */
+  rss_conf.func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
+  rss_conf.types = ETH_RSS_FRAG_IPV4 | ETH_RSS_NONFRAG_IPV4_TCP;
+  rss_conf.queue_num = _num_queues;
+  rss_conf.queue = const_cast<uint16_t *>(_redir_table.data());
+  rss_conf.key_len = _dev_info.hash_key_size;
+  rss_conf.key = const_cast<uint8_t *>(_rss_key.data());
+  rss_conf.level = 0;
+  action[0].type = RTE_FLOW_ACTION_TYPE_RSS;
+  action[0].conf = &rss_conf;
+  action[1].type = RTE_FLOW_ACTION_TYPE_END;
+
+  if (rte_flow_validate(_port_idx, &attr, pattern, action, &error) != 0) {
+    // int() cast keeps the port number numeric even if _port_idx is a
+    // narrow integer type, matching the other log lines of this device.
+    ldout(cct, 0) << __func__ << " Port " << int(_port_idx)
+                  << ": flow rss func configuration is unsupported"
+                  << " (" << (error.message ? error.message : "no reason given")
+                  << ")" << dendl;
+    return;
+  }
+
+  _flow = rte_flow_create(_port_idx, &attr, pattern, action, &error);
+  if (!_flow) {
+    // A rule that validated can still fail to install; report it instead
+    // of silently continuing without a flow handle.
+    ldout(cct, 0) << __func__ << " Port " << int(_port_idx)
+                  << ": failed to create flow rss rule"
+                  << " (" << (error.message ? error.message : "no reason given")
+                  << ")" << dendl;
+  }
+}
+
+
void DPDKQueuePair::configure_proxies(const std::map<unsigned, float>& cpu_weights) {
ceph_assert(!cpu_weights.empty());
if (cpu_weights.size() == 1 && cpu_weights.begin()->first == _qid) {
// Update the HEAD buffer with the packet info
head->pkt_len = p.len();
head->nb_segs = total_nsegs;
+ // tx_pkt_burst loops until the next pointer is null, so last_seg->next must
+ // be null.
+ last_seg->next = nullptr;
set_cluster_offload_info(p, qp, head);
//
head->pkt_len = p.len();
head->nb_segs = nsegs;
+ // tx_pkt_burst loops until the next pointer is null, so last_seg->next must
+ // be null.
+ last_seg->next = nullptr;
copy_packet_to_cluster(p, head);
set_cluster_offload_info(p, qp, head);
return len;
}
-void DPDKDevice::set_rss_table()
-{
- // always fill our local indirection table.
- unsigned i = 0;
- for (auto& r : _redir_table) {
- r = i++ % _num_queues;
- }
-
- if (_dev_info.reta_size == 0)
- return;
-
- int reta_conf_size = std::max(1, _dev_info.reta_size / RTE_RETA_GROUP_SIZE);
- rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];
-
- // Configure the HW indirection table
- i = 0;
- for (auto& x : reta_conf) {
- x.mask = ~0ULL;
- for (auto& r: x.reta) {
- r = i++ % _num_queues;
- }
- }
-
- if (rte_eth_dev_rss_reta_update(_port_idx, reta_conf, _dev_info.reta_size)) {
- rte_exit(EXIT_FAILURE, "Port %d: Failed to update an RSS indirection table", _port_idx);
- }
-}
-
/******************************** Interface functions *************************/
std::unique_ptr<DPDKDevice> create_dpdk_net_device(
bool enable_fc)
{
// Check that we have at least one DPDK-able port
- if (rte_eth_dev_count() == 0) {
+ if (rte_eth_dev_count_avail() == 0) {
rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
} else {
- ldout(cct, 10) << __func__ << " ports number: " << int(rte_eth_dev_count()) << dendl;
+ ldout(cct, 10) << __func__ << " ports number: " << int(rte_eth_dev_count_avail()) << dendl;
}
return std::unique_ptr<DPDKDevice>(