/*
 * Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "netdev-dpdk.h"

#include <linux/virtio_net.h>
#include <sys/socket.h>
/* Include rte_compat.h first to allow experimental APIs needed for the
 * rte_meter.h rfc4115 functions.  Once they are no longer marked as
 * experimental, the #define and the rte_compat.h include can be removed. */
#define ALLOW_EXPERIMENTAL_API
#include <rte_compat.h>
#include <rte_bus_pci.h>
#include <rte_config.h>
#include <rte_cycles.h>
#include <rte_errno.h>
#include <rte_ethdev.h>
#include <rte_malloc.h>
#include <rte_meter.h>
#include <rte_version.h>
#include <rte_vhost.h>
#include "dp-packet.h"
#include "dpif-netdev.h"
#include "fatal-signal.h"
#include "if-notifier.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/list.h"
#include "openvswitch/match.h"
#include "openvswitch/ofp-print.h"
#include "openvswitch/shash.h"
#include "openvswitch/vlog.h"
#include "ovs-thread.h"
#include "unaligned.h"
#include "userspace-tso.h"
enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);

COVERAGE_DEFINE(vhost_tx_contention);
COVERAGE_DEFINE(vhost_notification);
#define DPDK_PORT_WATCHDOG_INTERVAL 5

#define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
#define OVS_VPORT_DPDK "ovs_dpdk"
/*
 * We need to reserve tons of extra space in the mbufs so we can align the
 * DMA addresses to 4KB.
 * The minimum mbuf size is limited to avoid scatter behaviour and drop in
 * performance for standard Ethernet MTU.
 */
#define ETHER_HDR_MAX_LEN           (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN \
                                     + (2 * VLAN_HEADER_LEN))
#define MTU_TO_FRAME_LEN(mtu)       ((mtu) + RTE_ETHER_HDR_LEN \
                                     + RTE_ETHER_CRC_LEN)
#define MTU_TO_MAX_FRAME_LEN(mtu)   ((mtu) + ETHER_HDR_MAX_LEN)
#define FRAME_LEN_TO_MTU(frame_len) ((frame_len)                    \
                                     - RTE_ETHER_HDR_LEN - RTE_ETHER_CRC_LEN)
#define NETDEV_DPDK_MBUF_ALIGN      1024
#define NETDEV_DPDK_MAX_PKT_LEN     9728
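
/* Illustrative sanity checks (editor's addition): with the standard
 * Ethernet MTU of 1500, the 14-byte Ethernet header and 4-byte CRC give a
 * 1518-byte frame, and the two optional VLAN tags accounted for in
 * ETHER_HDR_MAX_LEN add 8 more bytes. */
BUILD_ASSERT_DECL(MTU_TO_FRAME_LEN(1500) == 1518);
BUILD_ASSERT_DECL(MTU_TO_MAX_FRAME_LEN(1500) == 1526);
BUILD_ASSERT_DECL(FRAME_LEN_TO_MTU(MTU_TO_FRAME_LEN(1500)) == 1500);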
/* Max and min number of packets in the mempool.  OVS tries to allocate a
 * mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
 * enough hugepages) we keep halving the number until the allocation succeeds
 * or we reach MIN_NB_MBUF. */
#define MAX_NB_MBUF          (4096 * 64)
#define MIN_NB_MBUF          (4096 * 4)
#define MP_CACHE_SZ          RTE_MEMPOOL_CACHE_MAX_SIZE

/* MAX_NB_MBUF can be divided by 2 many times, until MIN_NB_MBUF. */
BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF)
                  == 0);

/* The smallest possible NB_MBUF that we're going to try should be a multiple
 * of MP_CACHE_SZ.  This is advised by DPDK documentation. */
BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF))
                  % MP_CACHE_SZ == 0);
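
/* Worked example (editor's note): with the values above, MAX_NB_MBUF is
 * 262144 and MIN_NB_MBUF is 16384, so the halving sequence tried at
 * allocation time is 262144, 131072, 65536, 32768, 16384.  The two
 * assertions above guarantee that this sequence ends exactly at
 * MIN_NB_MBUF and that its smallest member is still a multiple of
 * MP_CACHE_SZ. */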
/* Default size of Physical NIC RXQ. */
#define NIC_PORT_DEFAULT_RXQ_SIZE 2048
/* Default size of Physical NIC TXQ. */
#define NIC_PORT_DEFAULT_TXQ_SIZE 2048
/* Maximum size of Physical NIC Queues. */
#define NIC_PORT_MAX_Q_SIZE 4096

#define OVS_VHOST_MAX_QUEUE_NUM 1024     /* Maximum number of vHost TX
                                          * queues. */
#define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
#define OVS_VHOST_QUEUE_DISABLED    (-2) /* Queue was disabled by guest and not
                                          * yet mapped to another queue. */

#define DPDK_ETH_PORT_ID_INVALID    RTE_MAX_ETHPORTS
/* DPDK library uses uint16_t for port_id. */
typedef uint16_t dpdk_port_t;
#define DPDK_PORT_ID_FMT "%"PRIu16
/* Minimum number of vhost tx retries; setting it effectively disables
 * retrying. */
#define VHOST_ENQ_RETRY_MIN 0
/* Maximum number of vhost tx retries. */
#define VHOST_ENQ_RETRY_MAX 32
/* Legacy default value for vhost tx retries. */
#define VHOST_ENQ_RETRY_DEF 8

#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
/* List of required flags advertised by the hardware that will be used
 * if TSO is enabled.  Ideally this should include DEV_TX_OFFLOAD_SCTP_CKSUM.
 * However, very few drivers support it at the moment, and SCTP is not as
 * widely used a protocol as TCP and UDP, so it's optional. */
#define DPDK_TX_TSO_OFFLOAD_FLAGS (DEV_TX_OFFLOAD_TCP_TSO        \
                                   | DEV_TX_OFFLOAD_TCP_CKSUM    \
                                   | DEV_TX_OFFLOAD_UDP_CKSUM    \
                                   | DEV_TX_OFFLOAD_IPV4_CKSUM)
static const struct rte_eth_conf port_conf = {
    .rx_adv_conf = {
        .rss_conf = {
            .rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};
/*
 * These callbacks allow virtio-net devices to be added to vhost ports when
 * configuration has been fully completed.
 */
static int new_device(int vid);
static void destroy_device(int vid);
static int vring_state_changed(int vid, uint16_t queue_id, int enable);
static void destroy_connection(int vid);
static void vhost_guest_notified(int vid);
static const struct vhost_device_ops virtio_net_device_ops =
{
    .new_device = new_device,
    .destroy_device = destroy_device,
    .vring_state_changed = vring_state_changed,
    .features_changed = NULL,
    .new_connection = NULL,
    .destroy_connection = destroy_connection,
    .guest_notified = vhost_guest_notified,
};
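
/* Editor's note: this table is registered with the vhost library through
 * rte_vhost_driver_callback_register() in netdev_dpdk_vhost_construct()
 * below; the library then invokes these callbacks as guests attach,
 * detach and toggle their virtqueues. */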
/* Custom software stats for dpdk ports. */
struct netdev_dpdk_sw_stats {
    /* No. of retries when unable to transmit. */
    uint64_t tx_retries;
    /* Packet drops when unable to transmit; probably the Tx queue is full. */
    uint64_t tx_failure_drops;
    /* Packet length greater than device MTU. */
    uint64_t tx_mtu_exceeded_drops;
    /* Packet drops in egress policer processing. */
    uint64_t tx_qos_drops;
    /* Packet drops in ingress policer processing. */
    uint64_t rx_qos_drops;
    /* Packet drops in HWOL processing. */
    uint64_t tx_invalid_hwol_drops;
};
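
/* Editor's note: these counters are surfaced as custom statistics via
 * netdev_dpdk_get_sw_custom_stats() (declared below), so they can be
 * inspected with, e.g., 'ovs-vsctl get Interface <port> statistics'. */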
/* Quality of Service */

/* An instance of a QoS configuration.  Always associated with a particular
 * netdev.
 *
 * Each QoS implementation subclasses this with whatever additional data it
 * needs.
 */
struct qos_conf {
    const struct dpdk_qos_ops *ops;
    rte_spinlock_t lock;
};

/* QoS queue information used by the netdev queue dump functions. */
struct netdev_dpdk_queue_state {
    uint32_t *queues;
    size_t cur_queue;
    size_t n_queues;
};
/* A particular implementation of dpdk QoS operations.
 *
 * The functions below return 0 if successful or a positive errno value on
 * failure, except where otherwise noted.  All of them must be provided,
 * except where otherwise noted.
 */
struct dpdk_qos_ops {

    /* Name of the QoS type. */
    const char *qos_name;

    /* Called to construct a qos_conf object.  The implementation should make
     * the appropriate calls to configure QoS according to 'details'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets '*conf' to an
     * initialized 'struct qos_conf'.
     *
     * For all QoS implementations it should always be non-null.
     */
    int (*qos_construct)(const struct smap *details, struct qos_conf **conf);

    /* Destroys the data structures allocated by the implementation as part of
     * 'qos_conf'.
     *
     * For all QoS implementations it should always be non-null.
     */
    void (*qos_destruct)(struct qos_conf *conf);

    /* Retrieves details of 'conf' configuration into 'details'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     */
    int (*qos_get)(const struct qos_conf *conf, struct smap *details);

    /* Returns true if 'conf' is already configured according to 'details'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * For all QoS implementations it should always be non-null.
     */
    bool (*qos_is_equal)(const struct qos_conf *conf,
                         const struct smap *details);

    /* Modify an array of rte_mbufs.  The modification is specific to
     * each qos implementation.
     *
     * The function should take an array of mbufs and an int representing
     * the current number of mbufs present in the array.
     *
     * After the function has performed a qos modification to the array of
     * mbufs it returns an int representing the number of mbufs now present in
     * the array.  This value can then be passed to the port send function
     * along with the modified array for transmission.
     *
     * For all QoS implementations it should always be non-null.
     */
    int (*qos_run)(struct qos_conf *qos_conf, struct rte_mbuf **pkts,
                   int pkt_cnt, bool should_steal);

    /* Called to construct a QoS Queue.  The implementation should make
     * the appropriate calls to configure the QoS Queue according to
     * 'details'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it constructs the
     * QoS queue successfully.
     */
    int (*qos_queue_construct)(const struct smap *details,
                               uint32_t queue_id, struct qos_conf *conf);

    /* Destroys the QoS Queue. */
    void (*qos_queue_destruct)(struct qos_conf *conf, uint32_t queue_id);

    /* Retrieves details of QoS Queue configuration into 'details'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     */
    int (*qos_queue_get)(struct smap *details, uint32_t queue_id,
                         const struct qos_conf *conf);

    /* Retrieves statistics of QoS Queue configuration into 'stats'. */
    int (*qos_queue_get_stats)(const struct qos_conf *conf, uint32_t queue_id,
                               struct netdev_queue_stats *stats);

    /* Setup the 'netdev_dpdk_queue_state' structure used by the dpdk queue
     * dump functions.
     */
    int (*qos_queue_dump_state_init)(const struct qos_conf *conf,
                                     struct netdev_dpdk_queue_state *state);
};
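
/* A minimal sketch of a 'dpdk_qos_ops' implementation (editor's
 * illustration, not part of OVS): a pass-through policer that accepts any
 * configuration and never drops packets.  The 'noop_*' names are
 * hypothetical, only the mandatory callbacks are filled in, and xzalloc()
 * from util.h is assumed to be available as elsewhere in this file. */
static const struct dpdk_qos_ops noop_qos_ops;

static int
noop_qos_construct(const struct smap *details OVS_UNUSED,
                   struct qos_conf **conf)
{
    struct qos_conf *c = xzalloc(sizeof *c);

    c->ops = &noop_qos_ops;
    *conf = c;
    return 0;
}

static void
noop_qos_destruct(struct qos_conf *conf)
{
    free(conf);
}

static bool
noop_qos_is_equal(const struct qos_conf *conf OVS_UNUSED,
                  const struct smap *details OVS_UNUSED)
{
    return true;
}

static int
noop_qos_run(struct qos_conf *conf OVS_UNUSED,
             struct rte_mbuf **pkts OVS_UNUSED,
             int pkt_cnt, bool should_steal OVS_UNUSED)
{
    /* Leave the burst untouched: every mbuf is accepted. */
    return pkt_cnt;
}

static const struct dpdk_qos_ops noop_qos_ops = {
    .qos_name = "noop",
    .qos_construct = noop_qos_construct,
    .qos_destruct = noop_qos_destruct,
    .qos_is_equal = noop_qos_is_equal,
    .qos_run = noop_qos_run,
};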
/* dpdk_qos_ops for each type of user space QoS implementation. */
static const struct dpdk_qos_ops egress_policer_ops;
static const struct dpdk_qos_ops trtcm_policer_ops;

/*
 * Array of dpdk_qos_ops, contains pointers to all supported QoS
 * operations.
 */
static const struct dpdk_qos_ops *const qos_confs[] = {
    &egress_policer_ops,
    &trtcm_policer_ops,
    NULL
};
static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dpdk_dev's. */
static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_list);

static struct ovs_mutex dpdk_mp_mutex OVS_ACQ_AFTER(dpdk_mutex)
    = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dpdk_mp's. */
static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mp_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_mp_list);
struct dpdk_mp {
    struct rte_mempool *mp;
    int mtu;
    int socket_id;
    int refcount;
    struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex);
};
/* There should be one 'struct dpdk_tx_queue' created for
 * each netdev tx queue. */
struct dpdk_tx_queue {
    /* Padding to make dpdk_tx_queue exactly one cache line long. */
    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* Protects the members and the NIC queue from concurrent access.
         * It is used only if the queue is shared among different pmd threads
         * (see 'concurrent_txq'). */
        rte_spinlock_t tx_lock;
        /* Mapping of configured vhost-user queue to enabled by guest. */
        int map;
    );
};
struct ingress_policer {
    struct rte_meter_srtcm_params app_srtcm_params;
    struct rte_meter_srtcm in_policer;
    struct rte_meter_srtcm_profile in_prof;
    rte_spinlock_t policer_lock;
};
enum dpdk_hw_ol_features {
    NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
    NETDEV_RX_HW_CRC_STRIP = 1 << 1,
    NETDEV_RX_HW_SCATTER = 1 << 2,
    NETDEV_TX_TSO_OFFLOAD = 1 << 3,
    NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4,
};
/*
 * In order to avoid confusion in variable names, the following naming
 * convention should be used, if possible:
 *
 *     'struct netdev'          : 'netdev'
 *     'struct netdev_dpdk'     : 'dev'
 *     'struct netdev_rxq'      : 'rxq'
 *     'struct netdev_rxq_dpdk' : 'rx'
 *
 * Example:
 *     struct netdev *netdev = netdev_from_name(name);
 *     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 *
 * Also, 'netdev' should be used instead of 'dev->up', where 'netdev' was
 * already defined.
 */
struct netdev_dpdk {
    PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
        dpdk_port_t port_id;

        /* If true, device was attached by rte_eth_dev_attach(). */
        bool attached;
        /* If true, rte_eth_dev_start() was successfully called. */
        bool started;
        bool reset_needed;
        /* 1 pad byte here. */
        struct eth_addr hwaddr;
        int mtu;
        int socket_id;
        int buf_size;
        int max_packet_len;
        enum dpdk_dev_type type;
        enum netdev_flags flags;
        int link_reset_cnt;

        /* Device arguments for dpdk ports. */
        char *devargs;
        /* Identifier used to distinguish vhost devices from each other. */
        char *vhost_id;
        struct dpdk_tx_queue *tx_q;
        struct rte_eth_link link;
    );

    PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
        struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
        struct dpdk_mp *dpdk_mp;

        /* virtio identifier for vhost devices. */
        ovsrcu_index vid;

        /* True if vHost device is 'up' and has been reconfigured at least
         * once. */
        bool vhost_reconfigured;

        atomic_uint8_t vhost_tx_retries_max;
        /* 2 pad bytes here. */
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        struct netdev up;
        /* In dpdk_list. */
        struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);

        /* QoS configuration and lock for the device. */
        OVSRCU_TYPE(struct qos_conf *) qos_conf;

        /* Ingress Policer. */
        OVSRCU_TYPE(struct ingress_policer *) ingress_policer;
        uint32_t policer_rate;
        uint32_t policer_burst;

        /* Array of vhost rxq states, see vring_state_changed. */
        bool *vhost_rxq_enabled;
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        struct netdev_stats stats;
        struct netdev_dpdk_sw_stats *sw_stats;
        /* Protects stats. */
        rte_spinlock_t stats_lock;
        /* 36 pad bytes here. */
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* The following properties cannot be changed when a device is
         * running, so we remember the request and update them next time
         * netdev_dpdk*_reconfigure() is called. */
        int requested_mtu;
        int requested_n_txq;
        int requested_n_rxq;
        int requested_rxq_size;
        int requested_txq_size;

        /* Number of rx/tx descriptors for physical devices. */
        int rxq_size;
        int txq_size;

        /* Socket ID detected when vHost device is brought up. */
        int requested_socket_id;

        /* Denotes whether vHost port is client/server mode. */
        uint64_t vhost_driver_flags;

        /* DPDK-ETH Flow control. */
        struct rte_eth_fc_conf fc_conf;

        /* DPDK-ETH hardware offload features,
         * from the enum set 'dpdk_hw_ol_features'. */
        uint32_t hw_ol_features;

        /* Properties for link state change detection mode.
         * If lsc_interrupt_mode is set to false, poll mode is used,
         * otherwise interrupt mode is used. */
        bool requested_lsc_interrupt_mode;
        bool lsc_interrupt_mode;

        /* VF configuration. */
        struct eth_addr requested_hwaddr;
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* Names of all XSTATS counters. */
        struct rte_eth_xstat_name *rte_xstats_names;
        int rte_xstats_names_size;
        int rte_xstats_ids_size;
        uint64_t *rte_xstats_ids;
    );
};
struct netdev_rxq_dpdk {
    struct netdev_rxq up;
    dpdk_port_t port_id;
};
static void netdev_dpdk_destruct(struct netdev *netdev);
static void netdev_dpdk_vhost_destruct(struct netdev *netdev);

static int netdev_dpdk_get_sw_custom_stats(const struct netdev *,
                                           struct netdev_custom_stats *);
static void netdev_dpdk_clear_xstats(struct netdev_dpdk *dev);

int netdev_dpdk_get_vid(const struct netdev_dpdk *dev);

struct ingress_policer *
netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);
static bool
is_dpdk_class(const struct netdev_class *class)
{
    return class->destruct == netdev_dpdk_destruct
           || class->destruct == netdev_dpdk_vhost_destruct;
}
/* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
 * aligned at 1k or less.  If a declared mbuf size is not a multiple of this
 * value, insufficient buffers are allocated to accommodate the packet in its
 * entirety.  Furthermore, certain drivers need to ensure that there is also
 * sufficient space in the Rx buffer to accommodate two VLAN tags (for QinQ
 * frames).  If the RX buffer is too small, then the driver enables scatter RX
 * behaviour, which reduces performance.  To prevent this, use a buffer size
 * that is closest to 'mtu', but which satisfies the aforementioned criteria.
 */
static uint32_t
dpdk_buf_size(int mtu)
{
    return ROUND_UP(MTU_TO_MAX_FRAME_LEN(mtu), NETDEV_DPDK_MBUF_ALIGN)
           + RTE_PKTMBUF_HEADROOM;
}
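
/* Worked example (editor's note): for mtu = 1500, MTU_TO_MAX_FRAME_LEN()
 * gives 1526 bytes, which ROUND_UP() takes to 2048 with the 1024-byte
 * NETDEV_DPDK_MBUF_ALIGN; adding the (default) 128-byte
 * RTE_PKTMBUF_HEADROOM yields a 2176-byte buffer. */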
/* Allocates an area of 'sz' bytes from DPDK.  The memory is zero'ed.
 *
 * Unlike xmalloc(), this function can return NULL on failure. */
static void *
dpdk_rte_mzalloc(size_t sz)
{
    return rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);
}
void
free_dpdk_buf(struct dp_packet *p)
{
    struct rte_mbuf *pkt = (struct rte_mbuf *) p;

    rte_pktmbuf_free(pkt);
}
static void
ovs_rte_pktmbuf_init(struct rte_mempool *mp OVS_UNUSED,
                     void *opaque_arg OVS_UNUSED,
                     void *_p,
                     unsigned i OVS_UNUSED)
{
    struct rte_mbuf *pkt = _p;

    dp_packet_init_dpdk((struct dp_packet *) pkt);
}
static int
dpdk_mp_full(const struct rte_mempool *mp) OVS_REQUIRES(dpdk_mp_mutex)
{
    /* At this point we want to know if all the mbufs are back
     * in the mempool.  rte_mempool_full() is not atomic but it's
     * the best available and as we are no longer requesting mbufs
     * from the mempool, it means mbufs will not move from
     * 'mempool ring' --> 'mempool cache'.  In rte_mempool_full()
     * the ring is counted before caches, so we won't get false
     * positives in this use case and we handle false negatives.
     *
     * If future implementations of rte_mempool_full() were to change
     * it could be possible for a false positive.  Even that would
     * likely be ok, as there are additional checks during mempool
     * freeing but it would make things racy.
     */
    return rte_mempool_full(mp);
}
/* Free unused mempools. */
static void
dpdk_mp_sweep(void) OVS_REQUIRES(dpdk_mp_mutex)
{
    struct dpdk_mp *dmp, *next;

    LIST_FOR_EACH_SAFE (dmp, next, list_node, &dpdk_mp_list) {
        if (!dmp->refcount && dpdk_mp_full(dmp->mp)) {
            VLOG_DBG("Freeing mempool \"%s\"", dmp->mp->name);
            ovs_list_remove(&dmp->list_node);
            rte_mempool_free(dmp->mp);
            rte_free(dmp);
        }
    }
}
/* Calculating the required number of mbufs differs depending on the
 * mempool model being used.  Check if per port memory is in use before
 * calculating. */
static uint32_t
dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
{
    uint32_t n_mbufs;

    if (!per_port_mp) {
        /* Shared memory is being used.
         * XXX: this is a really rough method of provisioning memory.
         * It's impossible to determine what the exact memory requirements are
         * when the number of ports and rxqs that utilize a particular mempool
         * can change dynamically at runtime.  For now, use this rough
         * heuristic.
         */
        if (mtu >= RTE_ETHER_MTU) {
            n_mbufs = MAX_NB_MBUF;
        } else {
            n_mbufs = MIN_NB_MBUF;
        }
    } else {
        /* Per port memory is being used.
         * XXX: rough estimation of number of mbufs required for this port:
         * <packets required to fill the device rxqs>
         * + <packets that could be stuck on other ports txqs>
         * + <packets in the pmd threads>
         * + <additional memory for corner cases>
         */
        n_mbufs = dev->requested_n_rxq * dev->requested_rxq_size
                  + dev->requested_n_txq * dev->requested_txq_size
                  + MIN(RTE_MAX_LCORE, dev->requested_n_rxq) * NETDEV_MAX_BURST
                  + MIN_NB_MBUF;
    }

    return n_mbufs;
}
*
682 dpdk_mp_create(struct netdev_dpdk
*dev
, int mtu
, bool per_port_mp
)
684 char mp_name
[RTE_MEMPOOL_NAMESIZE
];
685 const char *netdev_name
= netdev_get_name(&dev
->up
);
686 int socket_id
= dev
->requested_socket_id
;
687 uint32_t n_mbufs
= 0;
688 uint32_t mbuf_size
= 0;
689 uint32_t aligned_mbuf_size
= 0;
690 uint32_t mbuf_priv_data_len
= 0;
691 uint32_t pkt_size
= 0;
692 uint32_t hash
= hash_string(netdev_name
, 0);
693 struct dpdk_mp
*dmp
= NULL
;
696 dmp
= dpdk_rte_mzalloc(sizeof *dmp
);
700 dmp
->socket_id
= socket_id
;
704 /* Get the size of each mbuf, based on the MTU */
705 mbuf_size
= MTU_TO_FRAME_LEN(mtu
);
707 n_mbufs
= dpdk_calculate_mbufs(dev
, mtu
, per_port_mp
);
710 /* Full DPDK memory pool name must be unique and cannot be
711 * longer than RTE_MEMPOOL_NAMESIZE. Note that for the shared
712 * mempool case this can result in one device using a mempool
713 * which references a different device in it's name. However as
714 * mempool names are hashed, the device name will not be readable
715 * so this is not an issue for tasks such as debugging.
717 ret
= snprintf(mp_name
, RTE_MEMPOOL_NAMESIZE
,
718 "ovs%08x%02d%05d%07u",
719 hash
, socket_id
, mtu
, n_mbufs
);
720 if (ret
< 0 || ret
>= RTE_MEMPOOL_NAMESIZE
) {
721 VLOG_DBG("snprintf returned %d. "
722 "Failed to generate a mempool name for \"%s\". "
723 "Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.",
724 ret
, netdev_name
, hash
, socket_id
, mtu
, n_mbufs
);
728 VLOG_DBG("Port %s: Requesting a mempool of %u mbufs of size %u "
729 "on socket %d for %d Rx and %d Tx queues, "
730 "cache line size of %u",
731 netdev_name
, n_mbufs
, mbuf_size
, socket_id
,
732 dev
->requested_n_rxq
, dev
->requested_n_txq
,
733 RTE_CACHE_LINE_SIZE
);
735 /* The size of the mbuf's private area (i.e. area that holds OvS'
737 mbuf_priv_data_len
= sizeof(struct dp_packet
) -
738 sizeof(struct rte_mbuf
);
739 /* The size of the entire dp_packet. */
740 pkt_size
= sizeof(struct dp_packet
) + mbuf_size
;
741 /* mbuf size, rounded up to cacheline size. */
742 aligned_mbuf_size
= ROUND_UP(pkt_size
, RTE_CACHE_LINE_SIZE
);
743 /* If there is a size discrepancy, add padding to mbuf_priv_data_len.
744 * This maintains mbuf size cache alignment, while also honoring RX
745 * buffer alignment in the data portion of the mbuf. If this adjustment
746 * is not made, there is a possiblity later on that for an element of
747 * the mempool, buf, buf->data_len < (buf->buf_len - buf->data_off).
748 * This is problematic in the case of multi-segment mbufs, particularly
749 * when an mbuf segment needs to be resized (when [push|popp]ing a VLAN
750 * header, for example.
752 mbuf_priv_data_len
+= (aligned_mbuf_size
- pkt_size
);
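
        /* Worked example (editor's note, hypothetical sizes): if
         * sizeof(struct dp_packet) were 704 bytes and mbuf_size 1518 bytes,
         * pkt_size would be 2222 bytes, aligned_mbuf_size would round up to
         * 2240 bytes, and 18 bytes of padding would be added to
         * mbuf_priv_data_len. */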
        dmp->mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ,
                                          mbuf_priv_data_len,
                                          mbuf_size + RTE_PKTMBUF_HEADROOM,
                                          socket_id);

        if (dmp->mp) {
            VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
                     mp_name, n_mbufs);
            /* rte_pktmbuf_pool_create has done some initialization of the
             * rte_mbuf part of each dp_packet, while ovs_rte_pktmbuf_init
             * initializes some OVS specific fields of dp_packet.
             */
            rte_mempool_obj_iter(dmp->mp, ovs_rte_pktmbuf_init, NULL);
            return dmp;
        } else if (rte_errno == EEXIST) {
            /* A mempool with the same name already exists.  We just
             * retrieve its pointer to be returned to the caller. */
            dmp->mp = rte_mempool_lookup(mp_name);
            /* As the mempool create returned EEXIST we can expect the
             * lookup has returned a valid pointer.  If for some reason
             * that's not the case we keep track of it. */
            VLOG_DBG("A mempool with name \"%s\" already exists at %p.",
                     mp_name, dmp->mp);
            return dmp;
        } else {
            VLOG_DBG("Failed to create mempool \"%s\" with a request of "
                     "%u mbufs, retrying with %u mbufs",
                     mp_name, n_mbufs, n_mbufs / 2);
        }
    } while (!dmp->mp && rte_errno == ENOMEM && (n_mbufs /= 2) >= MIN_NB_MBUF);

    VLOG_ERR("Failed to create mempool \"%s\" with a request of %u mbufs",
             mp_name, n_mbufs);
    rte_free(dmp);
    return NULL;
}
static struct dpdk_mp *
dpdk_mp_get(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
{
    struct dpdk_mp *dmp, *next;
    bool reuse = false;

    ovs_mutex_lock(&dpdk_mp_mutex);
    /* Check if shared memory is being used, if so check existing mempools
     * to see if reuse is possible. */
    if (!per_port_mp) {
        LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
            if (dmp->socket_id == dev->requested_socket_id
                && dmp->mtu == mtu) {
                VLOG_DBG("Reusing mempool \"%s\"", dmp->mp->name);
                dmp->refcount++;
                reuse = true;
                break;
            }
        }
    }
    /* Sweep mempools after reuse or before create. */
    dpdk_mp_sweep();

    if (!reuse) {
        dmp = dpdk_mp_create(dev, mtu, per_port_mp);
        if (dmp) {
            /* Shared memory will hit the reuse case above so will not
             * request a mempool that already exists but we need to check
             * for the EEXIST case for the per port memory case.  Compare the
             * mempool returned by dmp to each entry in dpdk_mp_list.  If a
             * match is found, free dmp as a new entry is not required, set
             * dmp to point to the existing entry and increment the refcount
             * to avoid being freed at a later stage.
             */
            if (per_port_mp && rte_errno == EEXIST) {
                LIST_FOR_EACH (next, list_node, &dpdk_mp_list) {
                    if (dmp->mp == next->mp) {
                        rte_free(dmp);
                        dmp = next;
                        dmp->refcount++;
                    }
                }
            } else {
                ovs_list_push_back(&dpdk_mp_list, &dmp->list_node);
            }
        }
    }

    ovs_mutex_unlock(&dpdk_mp_mutex);

    return dmp;
}
/* Decrement reference to a mempool. */
static void
dpdk_mp_put(struct dpdk_mp *dmp)
{
    if (!dmp) {
        return;
    }

    ovs_mutex_lock(&dpdk_mp_mutex);
    ovs_assert(dmp->refcount);
    dmp->refcount--;
    ovs_mutex_unlock(&dpdk_mp_mutex);
}
/* Depending on the memory model being used this function tries to
 * identify and reuse an existing mempool or tries to allocate a new
 * mempool on requested_socket_id with mbuf size corresponding to the
 * requested_mtu.  On success, a new configuration will be applied.
 * On error, device will be left unchanged. */
static int
netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
    struct dpdk_mp *dmp;
    int ret = 0;
    bool per_port_mp = dpdk_per_port_memory();

    /* With shared memory we do not need to configure a mempool if the MTU
     * and socket ID have not changed, the previous configuration is still
     * valid so return 0. */
    if (!per_port_mp && dev->mtu == dev->requested_mtu
        && dev->socket_id == dev->requested_socket_id) {
        return ret;
    }

    dmp = dpdk_mp_get(dev, FRAME_LEN_TO_MTU(buf_size), per_port_mp);
    if (!dmp) {
        VLOG_ERR("Failed to create memory pool for netdev "
                 "%s, with MTU %d on socket %d: %s\n",
                 dev->up.name, dev->requested_mtu, dev->requested_socket_id,
                 rte_strerror(rte_errno));
        return rte_errno;
    } else {
        /* Check for any pre-existing dpdk_mp for the device before accessing
         * the associated mempool.
         */
        if (dev->dpdk_mp != NULL) {
            /* A new MTU was requested, decrement the reference count for the
             * devices current dpdk_mp.  This is required even if a pointer to
             * same dpdk_mp is returned by dpdk_mp_get.  The refcount for dmp
             * has already been incremented by dpdk_mp_get at this stage so it
             * must be decremented to keep an accurate refcount for the
             * dpdk_mp.
             */
            dpdk_mp_put(dev->dpdk_mp);
        }
        dev->dpdk_mp = dmp;
        dev->mtu = dev->requested_mtu;
        dev->socket_id = dev->requested_socket_id;
        dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
    }

    return ret;
}
static void
check_link_status(struct netdev_dpdk *dev)
{
    struct rte_eth_link link;

    rte_eth_link_get_nowait(dev->port_id, &link);

    if (dev->link.link_status != link.link_status) {
        netdev_change_seq_changed(&dev->up);

        dev->link_reset_cnt++;
        dev->link = link;
        if (dev->link.link_status) {
            VLOG_DBG_RL(&rl,
                        "Port "DPDK_PORT_ID_FMT" Link Up - speed %u Mbps - %s",
                        dev->port_id, (unsigned) dev->link.link_speed,
                        (dev->link.link_duplex == ETH_LINK_FULL_DUPLEX)
                        ? "full-duplex" : "half-duplex");
        } else {
            VLOG_DBG_RL(&rl, "Port "DPDK_PORT_ID_FMT" Link Down",
                        dev->port_id);
        }
    }
}
static void *
dpdk_watchdog(void *dummy OVS_UNUSED)
{
    struct netdev_dpdk *dev;

    pthread_detach(pthread_self());

    for (;;) {
        ovs_mutex_lock(&dpdk_mutex);
        LIST_FOR_EACH (dev, list_node, &dpdk_list) {
            ovs_mutex_lock(&dev->mutex);
            if (dev->type == DPDK_DEV_ETH) {
                check_link_status(dev);
            }
            ovs_mutex_unlock(&dev->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);
        xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
    }

    return NULL;
}
static int
dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
{
    int diag = 0;
    int i;
    struct rte_eth_conf conf = port_conf;
    struct rte_eth_dev_info info;
    uint16_t conf_mtu;

    rte_eth_dev_info_get(dev->port_id, &info);

    /* As of DPDK 19.11, it is not allowed to set a mq_mode for
     * virtio PMD driver. */
    if (!strcmp(info.driver_name, "net_virtio")) {
        conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
    } else {
        conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
    }

    /* As of DPDK 17.11.1 a few PMDs require to explicitly enable
     * scatter to support jumbo RX.
     * Setting scatter for the device is done after checking for
     * scatter support in the device capabilities. */
    if (dev->mtu > RTE_ETHER_MTU) {
        if (dev->hw_ol_features & NETDEV_RX_HW_SCATTER) {
            conf.rxmode.offloads |= DEV_RX_OFFLOAD_SCATTER;
        }
    }

    conf.intr_conf.lsc = dev->lsc_interrupt_mode;

    if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
        conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
    }

    if (!(dev->hw_ol_features & NETDEV_RX_HW_CRC_STRIP)
        && info.rx_offload_capa & DEV_RX_OFFLOAD_KEEP_CRC) {
        conf.rxmode.offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
    }

    if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
        conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS;
        if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
            conf.txmode.offloads |= DEV_TX_OFFLOAD_SCTP_CKSUM;
        }
    }

    /* Limit configured rss hash functions to only those supported
     * by the eth device. */
    conf.rx_adv_conf.rss_conf.rss_hf &= info.flow_type_rss_offloads;

    /* A device may report more queues than it makes available (this has
     * been observed for Intel xl710, which reserves some of them for
     * SRIOV): rte_eth_*_queue_setup will fail if a queue is not
     * available.  When this happens we can retry the configuration
     * and request fewer queues. */
    while (n_rxq && n_txq) {
        if (diag) {
            VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
        }

        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
        if (diag) {
            VLOG_WARN("Interface %s eth_dev setup error %s\n",
                      dev->up.name, rte_strerror(-diag));
            break;
        }

        diag = rte_eth_dev_set_mtu(dev->port_id, dev->mtu);
        if (diag) {
            /* A device may not support rte_eth_dev_set_mtu, in this case
             * flag a warning to the user and include the devices configured
             * MTU value that will be used instead. */
            if (-ENOTSUP == diag) {
                rte_eth_dev_get_mtu(dev->port_id, &conf_mtu);
                VLOG_WARN("Interface %s does not support MTU configuration, "
                          "max packet size supported is %"PRIu16".",
                          dev->up.name, conf_mtu);
            } else {
                VLOG_ERR("Interface %s MTU (%d) setup error: %s",
                         dev->up.name, dev->mtu, rte_strerror(-diag));
                break;
            }
        }

        for (i = 0; i < n_txq; i++) {
            diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
                                          dev->socket_id, NULL);
            if (diag) {
                VLOG_INFO("Interface %s unable to setup txq(%d): %s",
                          dev->up.name, i, rte_strerror(-diag));
                break;
            }
        }

        if (i != n_txq) {
            /* Retry with less tx queues. */
            n_txq = i;
            continue;
        }

        for (i = 0; i < n_rxq; i++) {
            diag = rte_eth_rx_queue_setup(dev->port_id, i, dev->rxq_size,
                                          dev->socket_id, NULL,
                                          dev->dpdk_mp->mp);
            if (diag) {
                VLOG_INFO("Interface %s unable to setup rxq(%d): %s",
                          dev->up.name, i, rte_strerror(-diag));
                break;
            }
        }

        if (i != n_rxq) {
            /* Retry with less rx queues. */
            n_rxq = i;
            continue;
        }

        dev->up.n_rxq = n_rxq;
        dev->up.n_txq = n_txq;

        return 0;
    }

    return diag;
}
static void
dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex)
{
    if (rte_eth_dev_flow_ctrl_set(dev->port_id, &dev->fc_conf)) {
        VLOG_WARN("Failed to enable flow control on device "DPDK_PORT_ID_FMT,
                  dev->port_id);
    }
}
static int
dpdk_eth_dev_init(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    struct rte_pktmbuf_pool_private *mbp_priv;
    struct rte_eth_dev_info info;
    struct rte_ether_addr eth_addr;
    int diag;
    int n_rxq, n_txq;
    uint32_t tx_tso_offload_capa = DPDK_TX_TSO_OFFLOAD_FLAGS;
    uint32_t rx_chksm_offload_capa = DEV_RX_OFFLOAD_UDP_CKSUM |
                                     DEV_RX_OFFLOAD_TCP_CKSUM |
                                     DEV_RX_OFFLOAD_IPV4_CKSUM;

    rte_eth_dev_info_get(dev->port_id, &info);

    if (strstr(info.driver_name, "vf") != NULL) {
        VLOG_INFO("Virtual function detected, HW_CRC_STRIP will be enabled");
        dev->hw_ol_features |= NETDEV_RX_HW_CRC_STRIP;
    } else {
        dev->hw_ol_features &= ~NETDEV_RX_HW_CRC_STRIP;
    }

    if ((info.rx_offload_capa & rx_chksm_offload_capa) !=
            rx_chksm_offload_capa) {
        VLOG_WARN("Rx checksum offload is not supported on port "
                  DPDK_PORT_ID_FMT, dev->port_id);
        dev->hw_ol_features &= ~NETDEV_RX_CHECKSUM_OFFLOAD;
    } else {
        dev->hw_ol_features |= NETDEV_RX_CHECKSUM_OFFLOAD;
    }

    if (info.rx_offload_capa & DEV_RX_OFFLOAD_SCATTER) {
        dev->hw_ol_features |= NETDEV_RX_HW_SCATTER;
    } else {
        /* Do not warn on lack of scatter support. */
        dev->hw_ol_features &= ~NETDEV_RX_HW_SCATTER;
    }

    dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
    if (userspace_tso_enabled()) {
        if ((info.tx_offload_capa & tx_tso_offload_capa)
            == tx_tso_offload_capa) {
            dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
            if (info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM) {
                dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD;
            } else {
                VLOG_WARN("%s: Tx SCTP checksum offload is not supported, "
                          "SCTP packets sent to this device will be dropped",
                          netdev_get_name(&dev->up));
            }
        } else {
            VLOG_WARN("%s: Tx TSO offload is not supported.",
                      netdev_get_name(&dev->up));
        }
    }

    n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
    n_txq = MIN(info.max_tx_queues, dev->up.n_txq);

    diag = dpdk_eth_dev_port_config(dev, n_rxq, n_txq);
    if (diag) {
        VLOG_ERR("Interface %s(rxq:%d txq:%d lsc interrupt mode:%s) "
                 "configure error: %s",
                 dev->up.name, n_rxq, n_txq,
                 dev->lsc_interrupt_mode ? "true" : "false",
                 rte_strerror(-diag));
        return -diag;
    }

    diag = rte_eth_dev_start(dev->port_id);
    if (diag) {
        VLOG_ERR("Interface %s start error: %s", dev->up.name,
                 rte_strerror(-diag));
        return -diag;
    }
    dev->started = true;

    rte_eth_promiscuous_enable(dev->port_id);
    rte_eth_allmulticast_enable(dev->port_id);

    memset(&eth_addr, 0x0, sizeof(eth_addr));
    rte_eth_macaddr_get(dev->port_id, &eth_addr);
    VLOG_INFO_RL(&rl, "Port "DPDK_PORT_ID_FMT": "ETH_ADDR_FMT,
                 dev->port_id, ETH_ADDR_BYTES_ARGS(eth_addr.addr_bytes));

    memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
    rte_eth_link_get_nowait(dev->port_id, &dev->link);

    mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
    dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;

    return 0;
}
static struct netdev_dpdk *
netdev_dpdk_cast(const struct netdev *netdev)
{
    return CONTAINER_OF(netdev, struct netdev_dpdk, up);
}
static struct netdev *
netdev_dpdk_alloc(void)
{
    struct netdev_dpdk *dev;

    dev = dpdk_rte_mzalloc(sizeof *dev);
    if (dev) {
        return &dev->up;
    }

    return NULL;
}
*
1209 netdev_dpdk_alloc_txq(unsigned int n_txqs
)
1211 struct dpdk_tx_queue
*txqs
;
1214 txqs
= dpdk_rte_mzalloc(n_txqs
* sizeof *txqs
);
1216 for (i
= 0; i
< n_txqs
; i
++) {
1217 /* Initialize map for vhost devices. */
1218 txqs
[i
].map
= OVS_VHOST_QUEUE_MAP_UNKNOWN
;
1219 rte_spinlock_init(&txqs
[i
].tx_lock
);
static int
common_construct(struct netdev *netdev, dpdk_port_t port_no,
                 enum dpdk_dev_type type, int socket_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_init(&dev->mutex);

    rte_spinlock_init(&dev->stats_lock);

    /* If the 'sid' is negative, it means that the kernel fails
     * to obtain the pci numa info.  In that situation, always
     * use 'SOCKET0'. */
    dev->socket_id = socket_id < 0 ? SOCKET0 : socket_id;
    dev->requested_socket_id = dev->socket_id;
    dev->port_id = port_no;
    dev->type = type;
    dev->flags = 0;
    dev->requested_mtu = RTE_ETHER_MTU;
    dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
    dev->requested_lsc_interrupt_mode = 0;
    ovsrcu_index_init(&dev->vid, -1);
    dev->vhost_reconfigured = false;
    dev->attached = false;
    dev->started = false;
    dev->reset_needed = false;

    ovsrcu_init(&dev->qos_conf, NULL);

    ovsrcu_init(&dev->ingress_policer, NULL);
    dev->policer_rate = 0;
    dev->policer_burst = 0;

    dev->requested_n_rxq = NR_QUEUE;
    dev->requested_n_txq = NR_QUEUE;
    dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
    dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;

    /* Initialize the flow control to NULL. */
    memset(&dev->fc_conf, 0, sizeof dev->fc_conf);

    /* Initialize the hardware offload flags to 0. */
    dev->hw_ol_features = 0;

    dev->flags = NETDEV_UP | NETDEV_PROMISC;

    ovs_list_push_back(&dpdk_list, &dev->list_node);

    netdev_request_reconfigure(netdev);

    dev->rte_xstats_names = NULL;
    dev->rte_xstats_names_size = 0;

    dev->rte_xstats_ids = NULL;
    dev->rte_xstats_ids_size = 0;

    dev->sw_stats = xzalloc(sizeof *dev->sw_stats);
    dev->sw_stats->tx_retries = (dev->type == DPDK_DEV_VHOST) ? 0 : UINT64_MAX;

    return 0;
}
/* Get the number of OVS interfaces which have the same DPDK
 * rte device (e.g. same pci bus address).
 * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
 */
static int
netdev_dpdk_get_num_ports(struct rte_device *device)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *dev;
    int count = 0;

    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        if (rte_eth_devices[dev->port_id].device == device
            && rte_eth_devices[dev->port_id].state != RTE_ETH_DEV_UNUSED) {
            count++;
        }
    }

    return count;
}
static int
vhost_common_construct(struct netdev *netdev)
    OVS_REQUIRES(dpdk_mutex)
{
    int socket_id = rte_lcore_to_socket_id(rte_get_master_lcore());
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    dev->vhost_rxq_enabled = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM *
                                              sizeof *dev->vhost_rxq_enabled);
    if (!dev->vhost_rxq_enabled) {
        return ENOMEM;
    }
    dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
    if (!dev->tx_q) {
        rte_free(dev->vhost_rxq_enabled);
        return ENOMEM;
    }

    atomic_init(&dev->vhost_tx_retries_max, VHOST_ENQ_RETRY_DEF);

    return common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
                            DPDK_DEV_VHOST, socket_id);
}
static int
netdev_dpdk_vhost_construct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const char *name = netdev->name;
    int err;

    /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
     * the file system.  '/' or '\' would traverse directories, so they're not
     * acceptable in 'name'. */
    if (strchr(name, '/') || strchr(name, '\\')) {
        VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
                 "A valid name must not include '/' or '\\'",
                 name);
        return EINVAL;
    }

    ovs_mutex_lock(&dpdk_mutex);
    /* Take the name of the vhost-user port and append it to the location
     * where the socket is to be created, then register the socket.
     */
    dev->vhost_id = xasprintf("%s/%s", dpdk_get_vhost_sock_dir(), name);

    dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;

    /* There is no support for multi-segments buffers. */
    dev->vhost_driver_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
    err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags);
    if (err) {
        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
                 dev->vhost_id);
        goto out;
    } else {
        fatal_signal_add_file_to_unlink(dev->vhost_id);
        VLOG_INFO("Socket %s created for vhost-user port %s\n",
                  dev->vhost_id, name);
    }

    err = rte_vhost_driver_callback_register(dev->vhost_id,
                                             &virtio_net_device_ops);
    if (err) {
        VLOG_ERR("rte_vhost_driver_callback_register failed for vhost user "
                 "port: %s\n", name);
        goto out;
    }

    if (!userspace_tso_enabled()) {
        err = rte_vhost_driver_disable_features(dev->vhost_id,
                                    1ULL << VIRTIO_NET_F_HOST_TSO4
                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6
                                    | 1ULL << VIRTIO_NET_F_CSUM);
        if (err) {
            VLOG_ERR("rte_vhost_driver_disable_features failed for "
                     "vhost user port: %s\n", name);
            goto out;
        }
    }

    err = rte_vhost_driver_start(dev->vhost_id);
    if (err) {
        VLOG_ERR("rte_vhost_driver_start failed for vhost user "
                 "port: %s\n", name);
        goto out;
    }

    err = vhost_common_construct(netdev);
    if (err) {
        VLOG_ERR("vhost_common_construct failed for vhost user "
                 "port: %s\n", name);
    }

out:
    if (err) {
        free(dev->vhost_id);
        dev->vhost_id = NULL;
    }

    ovs_mutex_unlock(&dpdk_mutex);
    VLOG_WARN_ONCE("dpdkvhostuser ports are considered deprecated; "
                   "please migrate to dpdkvhostuserclient ports.");
    return err;
}
static int
netdev_dpdk_vhost_client_construct(struct netdev *netdev)
{
    int err;

    ovs_mutex_lock(&dpdk_mutex);
    err = vhost_common_construct(netdev);
    if (err) {
        VLOG_ERR("vhost_common_construct failed for vhost user client "
                 "port: %s\n", netdev->name);
    }
    ovs_mutex_unlock(&dpdk_mutex);
    return err;
}
static int
netdev_dpdk_construct(struct netdev *netdev)
{
    int err;

    ovs_mutex_lock(&dpdk_mutex);
    err = common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
                           DPDK_DEV_ETH, SOCKET0);
    ovs_mutex_unlock(&dpdk_mutex);
    return err;
}
static void
common_destruct(struct netdev_dpdk *dev)
    OVS_REQUIRES(dpdk_mutex)
    OVS_EXCLUDED(dev->mutex)
{
    rte_free(dev->tx_q);
    dpdk_mp_put(dev->dpdk_mp);

    ovs_list_remove(&dev->list_node);
    free(ovsrcu_get_protected(struct ingress_policer *,
                              &dev->ingress_policer));
    free(dev->sw_stats);
    ovs_mutex_destroy(&dev->mutex);
}
static void
netdev_dpdk_destruct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_device *rte_dev;
    struct rte_eth_dev *eth_dev;
    bool remove_on_close;

    ovs_mutex_lock(&dpdk_mutex);

    rte_eth_dev_stop(dev->port_id);
    dev->started = false;

    if (dev->attached) {
        /* Retrieve eth device data before closing it.
         * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
         */
        eth_dev = &rte_eth_devices[dev->port_id];
        remove_on_close =
            (eth_dev->data->dev_flags & RTE_ETH_DEV_CLOSE_REMOVE);
        rte_dev = eth_dev->device;

        /* Remove the eth device. */
        rte_eth_dev_close(dev->port_id);

        /* Remove this rte device and all its eth devices if flag
         * RTE_ETH_DEV_CLOSE_REMOVE is not supported (which means representors
         * are not supported), or if all the eth devices belonging to the rte
         * device are closed.
         */
        if (!remove_on_close || !netdev_dpdk_get_num_ports(rte_dev)) {
            int ret = rte_dev_remove(rte_dev);

            if (ret < 0) {
                VLOG_ERR("Device '%s' can not be detached: %s.",
                         dev->devargs, rte_strerror(-ret));
            } else {
                /* Device was closed and detached. */
                VLOG_INFO("Device '%s' has been removed and detached",
                          dev->devargs);
            }
        } else {
            /* Device was only closed.  rte_dev_remove() was not called. */
            VLOG_INFO("Device '%s' has been removed", dev->devargs);
        }
    }

    netdev_dpdk_clear_xstats(dev);
    free(dev->devargs);
    common_destruct(dev);

    ovs_mutex_unlock(&dpdk_mutex);
}
/* rte_vhost_driver_unregister() can call back destroy_device(), which will
 * try to acquire 'dpdk_mutex' and possibly 'dev->mutex'.  To avoid a
 * deadlock, none of the mutexes must be held while calling this function. */
static int
dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
                             char *vhost_id)
    OVS_EXCLUDED(dpdk_mutex)
    OVS_EXCLUDED(dev->mutex)
{
    return rte_vhost_driver_unregister(vhost_id);
}
static void
netdev_dpdk_vhost_destruct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    char *vhost_id;

    ovs_mutex_lock(&dpdk_mutex);

    /* Guest becomes an orphan if still attached. */
    if (netdev_dpdk_get_vid(dev) >= 0
        && !(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
        VLOG_ERR("Removing port '%s' while vhost device still attached.",
                 netdev->name);
        VLOG_ERR("To restore connectivity after re-adding of port, VM on "
                 "socket '%s' must be restarted.", dev->vhost_id);
    }

    vhost_id = dev->vhost_id;
    dev->vhost_id = NULL;
    rte_free(dev->vhost_rxq_enabled);

    common_destruct(dev);

    ovs_mutex_unlock(&dpdk_mutex);

    if (!vhost_id) {
        return;
    }

    if (dpdk_vhost_driver_unregister(dev, vhost_id)) {
        VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
                 netdev->name, vhost_id);
    } else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
        /* OVS server mode - remove this socket from list for deletion. */
        fatal_signal_remove_file_to_unlink(vhost_id);
    }
    free(vhost_id);
}
static void
netdev_dpdk_dealloc(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    rte_free(dev);
}
static void
netdev_dpdk_clear_xstats(struct netdev_dpdk *dev)
{
    /* If statistics are already allocated, we have to
     * reconfigure, as port_id could have been changed. */
    if (dev->rte_xstats_names) {
        free(dev->rte_xstats_names);
        dev->rte_xstats_names = NULL;
        dev->rte_xstats_names_size = 0;
    }
    if (dev->rte_xstats_ids) {
        free(dev->rte_xstats_ids);
        dev->rte_xstats_ids = NULL;
        dev->rte_xstats_ids_size = 0;
    }
}
static const char *
netdev_dpdk_get_xstat_name(struct netdev_dpdk *dev, uint64_t id)
{
    if (id >= dev->rte_xstats_names_size) {
        return "UNKNOWN";
    }
    return dev->rte_xstats_names[id].name;
}
static void
netdev_dpdk_configure_xstats(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    struct rte_eth_xstat *rte_xstats;
    int rte_xstats_len;
    bool ret;
    uint64_t id;
    int xstats_no;
    const char *name;

    /* Retrieving all XSTATS names.  If something will go wrong
     * or amount of counters will be equal 0, rte_xstats_names
     * buffer will be marked as NULL, and any further xstats
     * query won't be performed (e.g. during netdev_dpdk_get_stats
     * execution). */

    ret = false;
    rte_xstats = NULL;

    if (dev->rte_xstats_names == NULL || dev->rte_xstats_ids == NULL) {
        dev->rte_xstats_names_size =
                rte_eth_xstats_get_names(dev->port_id, NULL, 0);

        if (dev->rte_xstats_names_size < 0) {
            VLOG_WARN("Cannot get XSTATS for port: "DPDK_PORT_ID_FMT,
                      dev->port_id);
            dev->rte_xstats_names_size = 0;
        } else {
            /* Reserve memory for xstats names and values. */
            dev->rte_xstats_names = xcalloc(dev->rte_xstats_names_size,
                                            sizeof *dev->rte_xstats_names);

            if (dev->rte_xstats_names) {
                /* Retrieve xstats names. */
                rte_xstats_len =
                        rte_eth_xstats_get_names(dev->port_id,
                                                 dev->rte_xstats_names,
                                                 dev->rte_xstats_names_size);
                if (rte_xstats_len < 0) {
                    VLOG_WARN("Cannot get XSTATS names for port: "
                              DPDK_PORT_ID_FMT, dev->port_id);
                    goto out;
                } else if (rte_xstats_len != dev->rte_xstats_names_size) {
                    VLOG_WARN("XSTATS size doesn't match for port: "
                              DPDK_PORT_ID_FMT, dev->port_id);
                    goto out;
                }

                dev->rte_xstats_ids = xcalloc(dev->rte_xstats_names_size,
                                              sizeof(uint64_t));

                /* We have to calculate number of counters. */
                rte_xstats = xmalloc(rte_xstats_len * sizeof *rte_xstats);
                memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);

                /* Retrieve xstats values. */
                if (rte_eth_xstats_get(dev->port_id, rte_xstats,
                                       rte_xstats_len) > 0) {
                    dev->rte_xstats_ids_size = 0;
                    xstats_no = 0;
                    for (uint32_t i = 0; i < rte_xstats_len; i++) {
                        id = rte_xstats[i].id;
                        name = netdev_dpdk_get_xstat_name(dev, id);
                        /* We need to filter out everything except
                         * dropped, error and management counters. */
                        if (string_ends_with(name, "_errors") ||
                            strstr(name, "_management_") ||
                            string_ends_with(name, "_dropped")) {
                            dev->rte_xstats_ids[xstats_no] = id;
                            xstats_no++;
                        }
                    }
                    dev->rte_xstats_ids_size = xstats_no;
                    ret = true;
                } else {
                    VLOG_WARN("Can't get XSTATS IDs for port: "
                              DPDK_PORT_ID_FMT, dev->port_id);
                }

                free(rte_xstats);
            }
        }
    } else {
        /* Already configured. */
        ret = true;
    }

out:
    if (!ret) {
        netdev_dpdk_clear_xstats(dev);
    }
}
static bool
dpdk_port_is_representor(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    struct rte_eth_dev_info dev_info;

    rte_eth_dev_info_get(dev->port_id, &dev_info);
    return (*dev_info.dev_flags) & RTE_ETH_DEV_REPRESENTOR;
}
static int
netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    smap_add_format(args, "requested_rx_queues", "%d", dev->requested_n_rxq);
    smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq);
    smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq);
    smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq);
    smap_add_format(args, "mtu", "%d", dev->mtu);

    if (dev->type == DPDK_DEV_ETH) {
        smap_add_format(args, "requested_rxq_descriptors", "%d",
                        dev->requested_rxq_size);
        smap_add_format(args, "configured_rxq_descriptors", "%d",
                        dev->rxq_size);
        smap_add_format(args, "requested_txq_descriptors", "%d",
                        dev->requested_txq_size);
        smap_add_format(args, "configured_txq_descriptors", "%d",
                        dev->txq_size);
        if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
            smap_add(args, "rx_csum_offload", "true");
        } else {
            smap_add(args, "rx_csum_offload", "false");
        }
        if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
            smap_add(args, "tx_tso_offload", "true");
        } else {
            smap_add(args, "tx_tso_offload", "false");
        }
        smap_add(args, "lsc_interrupt_mode",
                 dev->lsc_interrupt_mode ? "true" : "false");

        if (dpdk_port_is_representor(dev)) {
            smap_add_format(args, "dpdk-vf-mac", ETH_ADDR_FMT,
                            ETH_ADDR_ARGS(dev->requested_hwaddr));
        }
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
*
1754 netdev_dpdk_lookup_by_port_id(dpdk_port_t port_id
)
1755 OVS_REQUIRES(dpdk_mutex
)
1757 struct netdev_dpdk
*dev
;
1759 LIST_FOR_EACH (dev
, list_node
, &dpdk_list
) {
1760 if (dev
->port_id
== port_id
) {
static dpdk_port_t
netdev_dpdk_get_port_by_mac(const char *mac_str)
{
    dpdk_port_t port_id;
    struct eth_addr mac, port_mac;

    if (!eth_addr_from_string(mac_str, &mac)) {
        VLOG_ERR("invalid mac: %s", mac_str);
        return DPDK_ETH_PORT_ID_INVALID;
    }

    RTE_ETH_FOREACH_DEV (port_id) {
        struct rte_ether_addr ea;

        rte_eth_macaddr_get(port_id, &ea);
        memcpy(port_mac.ea, ea.addr_bytes, ETH_ADDR_LEN);
        if (eth_addr_equals(mac, port_mac)) {
            return port_id;
        }
    }

    return DPDK_ETH_PORT_ID_INVALID;
}
/* Return the first DPDK port id matching the devargs pattern. */
static dpdk_port_t
netdev_dpdk_get_port_by_devargs(const char *devargs)
    OVS_REQUIRES(dpdk_mutex)
{
    dpdk_port_t port_id;
    struct rte_dev_iterator iterator;

    RTE_ETH_FOREACH_MATCHING_DEV (port_id, devargs, &iterator) {
        /* If a break is done - must call rte_eth_iterator_cleanup. */
        rte_eth_iterator_cleanup(&iterator);
        break;
    }

    return port_id;
}
/*
 * Normally, a PCI id (optionally followed by a representor number)
 * is enough for identifying a specific DPDK port.
 * However, for some NICs having multiple ports sharing the same PCI
 * id, using the PCI id alone won't work.
 *
 * To fix that, here one more method is introduced: "class=eth,mac=$MAC".
 *
 * Note that the compatibility is fully kept: user can still use the
 * PCI id for adding ports (when it's enough for them).
 */
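
/* For example (editor's note, hypothetical addresses), both of these forms
 * select a port:
 *
 *     ovs-vsctl set Interface p0 options:dpdk-devargs=0000:01:00.0
 *     ovs-vsctl set Interface p0 \
 *         options:dpdk-devargs="class=eth,mac=00:11:22:33:44:55"
 */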
static dpdk_port_t
netdev_dpdk_process_devargs(struct netdev_dpdk *dev,
                            const char *devargs, char **errp)
    OVS_REQUIRES(dpdk_mutex)
{
    dpdk_port_t new_port_id;

    if (strncmp(devargs, "class=eth,mac=", 14) == 0) {
        new_port_id = netdev_dpdk_get_port_by_mac(&devargs[14]);
    } else {
        new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
        if (!rte_eth_dev_is_valid_port(new_port_id)) {
            /* Device not found in DPDK, attempt to attach it. */
            if (rte_dev_probe(devargs)) {
                new_port_id = DPDK_ETH_PORT_ID_INVALID;
            } else {
                new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
                if (rte_eth_dev_is_valid_port(new_port_id)) {
                    /* Attach successful. */
                    dev->attached = true;
                    VLOG_INFO("Device '%s' attached to DPDK", devargs);
                } else {
                    /* Attach unsuccessful. */
                    new_port_id = DPDK_ETH_PORT_ID_INVALID;
                }
            }
        }
    }

    if (new_port_id == DPDK_ETH_PORT_ID_INVALID) {
        VLOG_WARN_BUF(errp, "Error attaching device '%s' to DPDK", devargs);
    }

    return new_port_id;
}
static int
dpdk_eth_event_callback(dpdk_port_t port_id, enum rte_eth_event_type type,
                        void *param OVS_UNUSED, void *ret_param OVS_UNUSED)
{
    struct netdev_dpdk *dev;

    switch ((int) type) {
    case RTE_ETH_EVENT_INTR_RESET:
        ovs_mutex_lock(&dpdk_mutex);
        dev = netdev_dpdk_lookup_by_port_id(port_id);
        if (dev) {
            ovs_mutex_lock(&dev->mutex);
            dev->reset_needed = true;
            netdev_request_reconfigure(&dev->up);
            VLOG_DBG_RL(&rl, "%s: Device reset requested.",
                        netdev_get_name(&dev->up));
            ovs_mutex_unlock(&dev->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);
        break;

    default:
        /* Ignore all other types. */
        break;
    }

    return 0;
}
static void
dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args)
    OVS_REQUIRES(dev->mutex)
{
    int new_n_rxq;

    new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
    if (new_n_rxq != dev->requested_n_rxq) {
        dev->requested_n_rxq = new_n_rxq;
        netdev_request_reconfigure(&dev->up);
    }
}
static void
dpdk_process_queue_size(struct netdev *netdev, const struct smap *args,
                        const char *flag, int default_size, int *new_size)
{
    int queue_size = smap_get_int(args, flag, default_size);

    if (queue_size <= 0 || queue_size > NIC_PORT_MAX_Q_SIZE
            || !is_pow2(queue_size)) {
        queue_size = default_size;
    }

    if (queue_size != *new_size) {
        *new_size = queue_size;
        netdev_request_reconfigure(netdev);
    }
}
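
/* For example (editor's note): options:n_rxq_desc=1000 falls back to the
 * 2048 default because 1000 is not a power of two, whereas 1024 would be
 * accepted and trigger a reconfiguration. */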
static int
netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
                       char **errp)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    bool rx_fc_en, tx_fc_en, autoneg, lsc_interrupt_mode;
    bool flow_control_requested = true;
    enum rte_eth_fc_mode fc_mode;
    static const enum rte_eth_fc_mode fc_mode_set[2][2] = {
        {RTE_FC_NONE,     RTE_FC_TX_PAUSE},
        {RTE_FC_RX_PAUSE, RTE_FC_FULL    }
    };
    const char *new_devargs;
    const char *vf_mac;
    int err = 0;

    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);

    dpdk_set_rxq_config(dev, args);

    dpdk_process_queue_size(netdev, args, "n_rxq_desc",
                            NIC_PORT_DEFAULT_RXQ_SIZE,
                            &dev->requested_rxq_size);
    dpdk_process_queue_size(netdev, args, "n_txq_desc",
                            NIC_PORT_DEFAULT_TXQ_SIZE,
                            &dev->requested_txq_size);

    new_devargs = smap_get(args, "dpdk-devargs");

    if (dev->devargs && new_devargs && strcmp(new_devargs, dev->devargs)) {
        /* The user requested a new device.  If we return error, the caller
         * will delete this netdev and try to recreate it. */
        err = EAGAIN;
        goto out;
    }

    /* dpdk-devargs is required for device configuration. */
    if (new_devargs && new_devargs[0]) {
        /* Don't process dpdk-devargs if value is unchanged and port id
         * is valid. */
        if (!(dev->devargs && !strcmp(dev->devargs, new_devargs)
              && rte_eth_dev_is_valid_port(dev->port_id))) {
            dpdk_port_t new_port_id = netdev_dpdk_process_devargs(dev,
                                                                  new_devargs,
                                                                  errp);
            if (!rte_eth_dev_is_valid_port(new_port_id)) {
                err = EINVAL;
            } else if (new_port_id == dev->port_id) {
                /* Already configured, do not reconfigure again. */
                err = 0;
            } else {
                struct netdev_dpdk *dup_dev;

                dup_dev = netdev_dpdk_lookup_by_port_id(new_port_id);
                if (dup_dev) {
                    VLOG_WARN_BUF(errp, "'%s' is trying to use device '%s' "
                                        "which is already in use by '%s'",
                                  netdev_get_name(netdev), new_devargs,
                                  netdev_get_name(&dup_dev->up));
                    err = EADDRINUSE;
                } else {
                    int sid = rte_eth_dev_socket_id(new_port_id);

                    dev->requested_socket_id = sid < 0 ? SOCKET0 : sid;
                    dev->devargs = xstrdup(new_devargs);
                    dev->port_id = new_port_id;
                    netdev_request_reconfigure(&dev->up);
                    netdev_dpdk_clear_xstats(dev);
                    err = 0;
                }
            }
        }
    } else {
        VLOG_WARN_BUF(errp, "'%s' is missing 'options:dpdk-devargs'. "
                            "The old 'dpdk<port_id>' names are not supported",
                      netdev_get_name(netdev));
        err = EINVAL;
    }

    if (err) {
        goto out;
    }

    vf_mac = smap_get(args, "dpdk-vf-mac");
    if (vf_mac) {
        struct eth_addr mac;

        if (!dpdk_port_is_representor(dev)) {
            VLOG_WARN_BUF(errp, "'%s' is trying to set the VF MAC '%s' "
                                "but 'options:dpdk-vf-mac' is only supported "
                                "for VF representors.",
                          netdev_get_name(netdev), vf_mac);
        } else if (!eth_addr_from_string(vf_mac, &mac)) {
            VLOG_WARN_BUF(errp, "interface '%s': cannot parse VF MAC '%s'.",
                          netdev_get_name(netdev), vf_mac);
        } else if (eth_addr_is_multicast(mac)) {
            VLOG_WARN_BUF(errp,
                          "interface '%s': cannot set VF MAC to multicast "
                          "address '%s'.", netdev_get_name(netdev), vf_mac);
        } else if (!eth_addr_equals(dev->requested_hwaddr, mac)) {
            dev->requested_hwaddr = mac;
            netdev_request_reconfigure(netdev);
        }
    }

    lsc_interrupt_mode = smap_get_bool(args, "dpdk-lsc-interrupt", false);
    if (dev->requested_lsc_interrupt_mode != lsc_interrupt_mode) {
        dev->requested_lsc_interrupt_mode = lsc_interrupt_mode;
        netdev_request_reconfigure(netdev);
    }

    rx_fc_en = smap_get_bool(args, "rx-flow-ctrl", false);
    tx_fc_en = smap_get_bool(args, "tx-flow-ctrl", false);
    autoneg = smap_get_bool(args, "flow-ctrl-autoneg", false);

    fc_mode = fc_mode_set[tx_fc_en][rx_fc_en];

    if (!smap_get(args, "rx-flow-ctrl") && !smap_get(args, "tx-flow-ctrl")
        && !smap_get(args, "flow-ctrl-autoneg")) {
        /* FIXME: User didn't ask for flow control configuration.
         * For now we'll not print a warning if flow control is not
         * supported by the DPDK port. */
        flow_control_requested = false;
    }

    /* Get the flow control configuration. */
    err = -rte_eth_dev_flow_ctrl_get(dev->port_id, &dev->fc_conf);
    if (err) {
        if (err == ENOTSUP) {
            if (flow_control_requested) {
                VLOG_WARN("%s: Flow control is not supported.",
                          netdev_get_name(netdev));
            }
            err = 0; /* Not fatal. */
        } else {
            VLOG_WARN("%s: Cannot get flow control parameters: %s",
                      netdev_get_name(netdev), rte_strerror(err));
        }
        goto out;
    }

    if (dev->fc_conf.mode != fc_mode || autoneg != dev->fc_conf.autoneg) {
        dev->fc_conf.mode = fc_mode;
        dev->fc_conf.autoneg = autoneg;
        dpdk_eth_flow_ctrl_setup(dev);
    }

out:
    ovs_mutex_unlock(&dev->mutex);
    ovs_mutex_unlock(&dpdk_mutex);

    return err;
}
static int
netdev_dpdk_vhost_client_set_config(struct netdev *netdev,
                                    const struct smap *args,
                                    char **errp OVS_UNUSED)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const char *path;
    int max_tx_retries, cur_max_tx_retries;

    ovs_mutex_lock(&dev->mutex);
    if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
        path = smap_get(args, "vhost-server-path");
        if (!nullable_string_is_equal(path, dev->vhost_id)) {
            free(dev->vhost_id);
            dev->vhost_id = nullable_xstrdup(path);
            netdev_request_reconfigure(netdev);
        }
    }

    max_tx_retries = smap_get_int(args, "tx-retries-max",
                                  VHOST_ENQ_RETRY_DEF);
    if (max_tx_retries < VHOST_ENQ_RETRY_MIN
        || max_tx_retries > VHOST_ENQ_RETRY_MAX) {
        max_tx_retries = VHOST_ENQ_RETRY_DEF;
    }
    atomic_read_relaxed(&dev->vhost_tx_retries_max, &cur_max_tx_retries);
    if (max_tx_retries != cur_max_tx_retries) {
        atomic_store_relaxed(&dev->vhost_tx_retries_max, max_tx_retries);
        VLOG_INFO("Max Tx retries for vhost device '%s' set to %d",
                  netdev_get_name(netdev), max_tx_retries);
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_numa_id(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    return dev->socket_id;
}

/* Sets the number of tx queues for the dpdk interface. */
static int
netdev_dpdk_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    if (dev->requested_n_txq == n_txq) {
        goto out;
    }

    dev->requested_n_txq = n_txq;
    netdev_request_reconfigure(netdev);

out:
    ovs_mutex_unlock(&dev->mutex);
    return 0;
}

static struct netdev_rxq *
netdev_dpdk_rxq_alloc(void)
{
    struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);

    if (!rx) {
        return NULL;
    }

    return &rx->up;
}

static struct netdev_rxq_dpdk *
netdev_rxq_dpdk_cast(const struct netdev_rxq *rxq)
{
    return CONTAINER_OF(rxq, struct netdev_rxq_dpdk, up);
}

static int
netdev_dpdk_rxq_construct(struct netdev_rxq *rxq)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);

    ovs_mutex_lock(&dev->mutex);
    rx->port_id = dev->port_id;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static void
netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq OVS_UNUSED)
{
}

static void
netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);

    rte_free(rx);
}

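/* Note on the hardware offload (HWOL) helpers below: 'l2_len' and 'l3_len'
 * tell the PMD where each header starts so it can insert checksums, and
 * 'tso_segsz' caps each TSO segment's payload at the device MTU minus the
 * L3 and L4 header lengths. */
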
/* Prepare the packet for HWOL.
 * Return True if the packet is OK to continue. */
static bool
netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
{
    struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);

    if (mbuf->ol_flags & PKT_TX_L4_MASK) {
        mbuf->l2_len = (char *)dp_packet_l3(pkt) - (char *)dp_packet_eth(pkt);
        mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
        mbuf->outer_l2_len = 0;
        mbuf->outer_l3_len = 0;
    }

    if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
        struct tcp_header *th = dp_packet_l4(pkt);

        if (!th) {
            VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
            return false;
        }

        mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
        mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
        mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;

        if (mbuf->ol_flags & PKT_TX_IPV4) {
            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
        }
    }
    return true;
}

/* Prepare a batch for HWOL.
 * Return the number of good packets in the batch. */
static int
netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
                            int pkt_cnt)
{
    int i = 0;
    int cnt = 0;
    struct rte_mbuf *pkt;

    /* Prepare and filter bad HWOL packets. */
    for (i = 0; i < pkt_cnt; i++) {
        pkt = pkts[i];
        if (!netdev_dpdk_prep_hwol_packet(dev, pkt)) {
            rte_pktmbuf_free(pkt);
            continue;
        }

        if (OVS_UNLIKELY(i != cnt)) {
            pkts[cnt] = pkt;
        }
        cnt++;
    }

    return cnt;
}

/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
 * 'pkts', even in case of failure.
 *
 * Returns the number of packets that weren't transmitted. */
static inline int
netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
                         struct rte_mbuf **pkts, int cnt)
{
    uint32_t nb_tx = 0;
    uint16_t nb_tx_prep = cnt;

    if (userspace_tso_enabled()) {
        nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
        if (nb_tx_prep != cnt) {
            VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
                         "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep,
                         cnt, rte_strerror(rte_errno));
        }
    }

    while (nb_tx != nb_tx_prep) {
        uint32_t ret;

        ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx,
                               nb_tx_prep - nb_tx);
        if (!ret) {
            break;
        }

        nb_tx += ret;
    }

    if (OVS_UNLIKELY(nb_tx != cnt)) {
        /* Free buffers, which we couldn't transmit, one at a time (each
         * packet could come from a different mempool) */
        int i;

        for (i = nb_tx; i < cnt; i++) {
            rte_pktmbuf_free(pkts[i]);
        }
    }

    return cnt - nb_tx;
}

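/* Ingress policing below uses a color-blind single-rate three-color marker
 * (srTCM, cf. RFC 2697): packets metered as green are kept, everything else
 * is dropped.  The Ethernet header is not counted against the meter. */
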
static bool
netdev_dpdk_srtcm_policer_pkt_handle(struct rte_meter_srtcm *meter,
                                     struct rte_meter_srtcm_profile *profile,
                                     struct rte_mbuf *pkt, uint64_t time)
{
    uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct rte_ether_hdr);

    return rte_meter_srtcm_color_blind_check(meter, profile, time, pkt_len) ==
                                             RTE_COLOR_GREEN;
}

static int
srtcm_policer_run_single_packet(struct rte_meter_srtcm *meter,
                                struct rte_meter_srtcm_profile *profile,
                                struct rte_mbuf **pkts, int pkt_cnt,
                                bool should_steal)
{
    int i = 0;
    int cnt = 0;
    struct rte_mbuf *pkt = NULL;
    uint64_t current_time = rte_rdtsc();

    for (i = 0; i < pkt_cnt; i++) {
        pkt = pkts[i];
        /* Handle current packet. */
        if (netdev_dpdk_srtcm_policer_pkt_handle(meter, profile,
                                                 pkt, current_time)) {
            if (cnt != i) {
                pkts[cnt] = pkt;
            }
            cnt++;
        } else {
            if (should_steal) {
                rte_pktmbuf_free(pkt);
            }
        }
    }

    return cnt;
}

static int
ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,
                    int pkt_cnt, bool should_steal)
{
    int cnt = 0;

    rte_spinlock_lock(&policer->policer_lock);
    cnt = srtcm_policer_run_single_packet(&policer->in_policer,
                                          &policer->in_prof,
                                          pkts, pkt_cnt, should_steal);
    rte_spinlock_unlock(&policer->policer_lock);

    return cnt;
}

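/* In the policer helpers above, 'should_steal' selects ownership of dropped
 * packets: when true the policer frees non-conforming mbufs, when false they
 * are only filtered out of 'pkts' and remain owned by the caller. */
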
static bool
is_vhost_running(struct netdev_dpdk *dev)
{
    return (netdev_dpdk_get_vid(dev) >= 0 && dev->vhost_reconfigured);
}

static inline void
netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
                                          unsigned int packet_size)
{
    /* Hard-coded search for the size bucket. */
    if (packet_size < 256) {
        if (packet_size >= 128) {
            stats->rx_128_to_255_packets++;
        } else if (packet_size <= 64) {
            stats->rx_1_to_64_packets++;
        } else {
            stats->rx_65_to_127_packets++;
        }
    } else {
        if (packet_size >= 1523) {
            stats->rx_1523_to_max_packets++;
        } else if (packet_size >= 1024) {
            stats->rx_1024_to_1522_packets++;
        } else if (packet_size < 512) {
            stats->rx_256_to_511_packets++;
        } else {
            stats->rx_512_to_1023_packets++;
        }
    }
}

static inline void
netdev_dpdk_vhost_update_rx_counters(struct netdev_dpdk *dev,
                                     struct dp_packet **packets, int count,
                                     int qos_drops)
{
    struct netdev_stats *stats = &dev->stats;
    struct dp_packet *packet;
    unsigned int packet_size;
    int i;

    stats->rx_packets += count;
    stats->rx_dropped += qos_drops;
    for (i = 0; i < count; i++) {
        packet = packets[i];
        packet_size = dp_packet_size(packet);

        if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) {
            /* This only protects the following multicast counting from
             * too short packets, but it does not stop the packet from
             * further processing. */
            stats->rx_errors++;
            stats->rx_length_errors++;
            continue;
        }

        netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);

        struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);
        if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {
            stats->multicast++;
        }

        stats->rx_bytes += packet_size;
    }

    if (OVS_UNLIKELY(qos_drops)) {
        dev->sw_stats->rx_qos_drops += qos_drops;
    }
}

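/* The rx counter helpers above do not take 'stats_lock' themselves; callers
 * in the receive paths below hold that spinlock around the update. */
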
/*
 * The receive path for the vhost port is the TX path out from guest.
 */
static int
netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
                           struct dp_packet_batch *batch, int *qfill)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
    struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
    uint16_t nb_rx = 0;
    uint16_t qos_drops = 0;
    int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
    int vid = netdev_dpdk_get_vid(dev);

    if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
                     || !(dev->flags & NETDEV_UP))) {
        return EAGAIN;
    }

    nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp,
                                    (struct rte_mbuf **) batch->packets,
                                    NETDEV_MAX_BURST);
    if (!nb_rx) {
        return EAGAIN;
    }

    if (qfill) {
        if (nb_rx == NETDEV_MAX_BURST) {
            /* The DPDK API returns a uint32_t which often has invalid bits in
             * the upper 16-bits. Need to restrict the value to uint16_t. */
            *qfill = rte_vhost_rx_queue_count(vid, qid) & UINT16_MAX;
        } else {
            *qfill = 0;
        }
    }

    if (policer) {
        qos_drops = nb_rx;
        nb_rx = ingress_policer_run(policer,
                                    (struct rte_mbuf **) batch->packets,
                                    nb_rx, true);
        qos_drops -= nb_rx;
    }

    rte_spinlock_lock(&dev->stats_lock);
    netdev_dpdk_vhost_update_rx_counters(dev, batch->packets,
                                         nb_rx, qos_drops);
    rte_spinlock_unlock(&dev->stats_lock);

    batch->count = nb_rx;
    dp_packet_batch_init_packet_fields(batch);

    return 0;
}

static bool
netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);

    return dev->vhost_rxq_enabled[rxq->queue_id];
}

static int
netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
                     int *qfill)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
    struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
    int nb_rx;
    int dropped = 0;

    if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
        return EAGAIN;
    }

    nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
                             (struct rte_mbuf **) batch->packets,
                             NETDEV_MAX_BURST);
    if (!nb_rx) {
        return EAGAIN;
    }

    if (policer) {
        dropped = nb_rx;
        nb_rx = ingress_policer_run(policer,
                                    (struct rte_mbuf **) batch->packets,
                                    nb_rx, true);
        dropped -= nb_rx;
    }

    /* Update stats to reflect dropped packets. */
    if (OVS_UNLIKELY(dropped)) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.rx_dropped += dropped;
        dev->sw_stats->rx_qos_drops += dropped;
        rte_spinlock_unlock(&dev->stats_lock);
    }

    batch->count = nb_rx;
    dp_packet_batch_init_packet_fields(batch);

    if (qfill) {
        if (nb_rx == NETDEV_MAX_BURST) {
            *qfill = rte_eth_rx_queue_count(rx->port_id, rxq->queue_id);
        } else {
            *qfill = 0;
        }
    }

    return 0;
}

static inline int
netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
                    int cnt, bool should_steal)
{
    struct qos_conf *qos_conf = ovsrcu_get(struct qos_conf *, &dev->qos_conf);

    if (qos_conf) {
        rte_spinlock_lock(&qos_conf->lock);
        cnt = qos_conf->ops->qos_run(qos_conf, pkts, cnt, should_steal);
        rte_spinlock_unlock(&qos_conf->lock);
    }

    return cnt;
}

static int
netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
                              int pkt_cnt)
{
    int i = 0;
    int cnt = 0;
    struct rte_mbuf *pkt;

    /* Filter oversized packets, unless they are marked for TSO. */
    for (i = 0; i < pkt_cnt; i++) {
        pkt = pkts[i];
        if (OVS_UNLIKELY((pkt->pkt_len > dev->max_packet_len)
            && !(pkt->ol_flags & PKT_TX_TCP_SEG))) {
            VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " "
                         "max_packet_len %d", dev->up.name, pkt->pkt_len,
                         dev->max_packet_len);
            rte_pktmbuf_free(pkt);
            continue;
        }

        if (OVS_UNLIKELY(i != cnt)) {
            pkts[cnt] = pkt;
        }
        cnt++;
    }

    return cnt;
}

static void
netdev_dpdk_vhost_update_tx_counters(struct netdev_dpdk *dev,
                                     struct dp_packet **packets,
                                     int attempted,
                                     struct netdev_dpdk_sw_stats *sw_stats_add)
{
    int dropped = sw_stats_add->tx_mtu_exceeded_drops +
                  sw_stats_add->tx_qos_drops +
                  sw_stats_add->tx_failure_drops +
                  sw_stats_add->tx_invalid_hwol_drops;
    struct netdev_stats *stats = &dev->stats;
    int sent = attempted - dropped;
    int i;

    stats->tx_packets += sent;
    stats->tx_dropped += dropped;

    for (i = 0; i < sent; i++) {
        stats->tx_bytes += dp_packet_size(packets[i]);
    }

    if (OVS_UNLIKELY(dropped || sw_stats_add->tx_retries)) {
        struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;

        sw_stats->tx_retries            += sw_stats_add->tx_retries;
        sw_stats->tx_failure_drops      += sw_stats_add->tx_failure_drops;
        sw_stats->tx_mtu_exceeded_drops += sw_stats_add->tx_mtu_exceeded_drops;
        sw_stats->tx_qos_drops          += sw_stats_add->tx_qos_drops;
        sw_stats->tx_invalid_hwol_drops += sw_stats_add->tx_invalid_hwol_drops;
    }
}

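/* The vhost transmit function below takes ownership of 'pkts': every packet
 * is either enqueued to the guest or deleted here.  A partially successful
 * enqueue is retried while the guest keeps draining the virtqueue, bounded
 * by the 'tx-retries-max' option. */
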
static void
__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
                         struct dp_packet **pkts, int cnt)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
    struct netdev_dpdk_sw_stats sw_stats_add;
    unsigned int n_packets_to_free = cnt;
    unsigned int total_packets = cnt;
    int i, retries = 0;
    int max_retries = VHOST_ENQ_RETRY_MIN;
    int vid = netdev_dpdk_get_vid(dev);

    qid = dev->tx_q[qid % netdev->n_txq].map;

    if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0
                     || !(dev->flags & NETDEV_UP))) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped += cnt;
        rte_spinlock_unlock(&dev->stats_lock);
        goto out;
    }

    if (OVS_UNLIKELY(!rte_spinlock_trylock(&dev->tx_q[qid].tx_lock))) {
        COVERAGE_INC(vhost_tx_contention);
        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
    }

    sw_stats_add.tx_invalid_hwol_drops = cnt;
    if (userspace_tso_enabled()) {
        cnt = netdev_dpdk_prep_hwol_batch(dev, cur_pkts, cnt);
    }

    sw_stats_add.tx_invalid_hwol_drops -= cnt;
    sw_stats_add.tx_mtu_exceeded_drops = cnt;
    cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
    sw_stats_add.tx_mtu_exceeded_drops -= cnt;

    /* Check if QoS has been configured for the netdev. */
    sw_stats_add.tx_qos_drops = cnt;
    cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt, true);
    sw_stats_add.tx_qos_drops -= cnt;

    n_packets_to_free = cnt;

    do {
        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
        unsigned int tx_pkts;

        tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt);
        if (OVS_LIKELY(tx_pkts)) {
            /* Packets have been sent. */
            cnt -= tx_pkts;
            /* Prepare for possible retry. */
            cur_pkts = &cur_pkts[tx_pkts];
            if (OVS_UNLIKELY(cnt && !retries)) {
                /*
                 * Read max retries as there are packets not sent
                 * and no retries have already occurred.
                 */
                atomic_read_relaxed(&dev->vhost_tx_retries_max, &max_retries);
            }
        } else {
            /* No packets sent - do not retry. */
            break;
        }
    } while (cnt && (retries++ < max_retries));

    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

    sw_stats_add.tx_failure_drops = cnt;
    sw_stats_add.tx_retries = MIN(retries, max_retries);

    rte_spinlock_lock(&dev->stats_lock);
    netdev_dpdk_vhost_update_tx_counters(dev, pkts, total_packets,
                                         &sw_stats_add);
    rte_spinlock_unlock(&dev->stats_lock);

out:
    for (i = 0; i < n_packets_to_free; i++) {
        dp_packet_delete(pkts[i]);
    }
}

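/* The helpers below copy arbitrary dp_packets into DPDK mbufs.  Payloads too
 * large for a mempool buffer are placed in an external buffer allocated with
 * rte_malloc() and attached to the mbuf, with a shared-info area tracking
 * the free callback. */
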
static void
netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque)
{
    rte_free(opaque);
}

static struct rte_mbuf *
dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len)
{
    uint32_t total_len = RTE_PKTMBUF_HEADROOM + data_len;
    struct rte_mbuf_ext_shared_info *shinfo = NULL;
    uint16_t buf_len;
    void *buf;

    if (rte_pktmbuf_tailroom(pkt) >= sizeof *shinfo) {
        shinfo = rte_pktmbuf_mtod(pkt, struct rte_mbuf_ext_shared_info *);
    } else {
        total_len += sizeof *shinfo + sizeof(uintptr_t);
        total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
    }

    if (OVS_UNLIKELY(total_len > UINT16_MAX)) {
        VLOG_ERR("Can't copy packet: too big %u", total_len);
        return NULL;
    }

    buf_len = total_len;
    buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
    if (OVS_UNLIKELY(buf == NULL)) {
        VLOG_ERR("Failed to allocate memory using rte_malloc: %u", buf_len);
        return NULL;
    }

    /* Initialize shinfo. */
    if (shinfo) {
        shinfo->free_cb = netdev_dpdk_extbuf_free;
        shinfo->fcb_opaque = buf;
        rte_mbuf_ext_refcnt_set(shinfo, 1);
    } else {
        shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
                                                    netdev_dpdk_extbuf_free,
                                                    buf);
        if (OVS_UNLIKELY(shinfo == NULL)) {
            rte_free(buf);
            VLOG_ERR("Failed to initialize shared info for mbuf while "
                     "attempting to attach an external buffer.");
            return NULL;
        }
    }

    rte_pktmbuf_attach_extbuf(pkt, buf, rte_malloc_virt2iova(buf), buf_len,
                              shinfo);
    rte_pktmbuf_reset_headroom(pkt);

    return pkt;
}

static struct rte_mbuf *
dpdk_pktmbuf_alloc(struct rte_mempool *mp, uint32_t data_len)
{
    struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);

    if (OVS_UNLIKELY(!pkt)) {
        return NULL;
    }

    if (rte_pktmbuf_tailroom(pkt) >= data_len) {
        return pkt;
    }

    if (dpdk_pktmbuf_attach_extbuf(pkt, data_len)) {
        return pkt;
    }

    rte_pktmbuf_free(pkt);

    return NULL;
}

static struct dp_packet *
dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
{
    struct rte_mbuf *mbuf_dest;
    struct dp_packet *pkt_dest;
    uint32_t pkt_len;

    pkt_len = dp_packet_size(pkt_orig);
    mbuf_dest = dpdk_pktmbuf_alloc(mp, pkt_len);
    if (OVS_UNLIKELY(mbuf_dest == NULL)) {
        return NULL;
    }

    pkt_dest = CONTAINER_OF(mbuf_dest, struct dp_packet, mbuf);
    memcpy(dp_packet_data(pkt_dest), dp_packet_data(pkt_orig), pkt_len);
    dp_packet_set_size(pkt_dest, pkt_len);

    mbuf_dest->tx_offload = pkt_orig->mbuf.tx_offload;
    mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
    mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
                            ~(EXT_ATTACHED_MBUF | IND_ATTACHED_MBUF));

    memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
           sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));

    if (mbuf_dest->ol_flags & PKT_TX_L4_MASK) {
        mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest)
                                - (char *)dp_packet_eth(pkt_dest);
        mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest)
                                - (char *) dp_packet_l3(pkt_dest);
    }

    return pkt_dest;
}

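/* This copy path handles batches whose packets do not come from a DPDK
 * mempool (batch->packets[0]->source != DPBUF_DPDK, e.g. packets received on
 * a non-DPDK netdev): each packet is copied into an mbuf from this device's
 * mempool before transmission. */
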
/* Tx function.  Transmit packets indefinitely. */
static void
dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    const size_t batch_cnt = dp_packet_batch_size(batch);
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t PKT_ARRAY_SIZE = batch_cnt;
#else
    /* Sparse or MSVC doesn't like variable length array. */
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct dp_packet *pkts[PKT_ARRAY_SIZE];
    struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
    uint32_t cnt = batch_cnt;
    uint32_t dropped = 0;
    uint32_t tx_failure = 0;
    uint32_t mtu_drops = 0;
    uint32_t qos_drops = 0;

    if (dev->type != DPDK_DEV_VHOST) {
        /* Check if QoS has been configured for this netdev. */
        cnt = netdev_dpdk_qos_run(dev, (struct rte_mbuf **) batch->packets,
                                  batch_cnt, false);
        qos_drops = batch_cnt - cnt;
    }

    uint32_t txcnt = 0;

    for (uint32_t i = 0; i < cnt; i++) {
        struct dp_packet *packet = batch->packets[i];
        uint32_t size = dp_packet_size(packet);

        if (size > dev->max_packet_len
            && !(packet->mbuf.ol_flags & PKT_TX_TCP_SEG)) {
            VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d", size,
                         dev->max_packet_len);
            mtu_drops++;
            continue;
        }

        pkts[txcnt] = dpdk_copy_dp_packet_to_mbuf(dev->dpdk_mp->mp, packet);
        if (OVS_UNLIKELY(!pkts[txcnt])) {
            dropped = cnt - i;
            break;
        }

        txcnt++;
    }

    if (OVS_LIKELY(txcnt)) {
        if (dev->type == DPDK_DEV_VHOST) {
            __netdev_dpdk_vhost_send(netdev, qid, pkts, txcnt);
        } else {
            tx_failure += netdev_dpdk_eth_tx_burst(dev, qid,
                                                   (struct rte_mbuf **)pkts,
                                                   txcnt);
        }
    }

    dropped += qos_drops + mtu_drops + tx_failure;
    if (OVS_UNLIKELY(dropped)) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped += dropped;
        sw_stats->tx_failure_drops += tx_failure;
        sw_stats->tx_mtu_exceeded_drops += mtu_drops;
        sw_stats->tx_qos_drops += qos_drops;
        rte_spinlock_unlock(&dev->stats_lock);
    }
}

static int
netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
                       struct dp_packet_batch *batch,
                       bool concurrent_txq OVS_UNUSED)
{
    if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
        dpdk_do_tx_copy(netdev, qid, batch);
        dp_packet_delete_batch(batch, true);
    } else {
        __netdev_dpdk_vhost_send(netdev, qid, batch->packets,
                                 dp_packet_batch_size(batch));
    }
    return 0;
}

static inline void
netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
                   struct dp_packet_batch *batch,
                   bool concurrent_txq)
{
    if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
        dp_packet_delete_batch(batch, true);
        return;
    }

    if (OVS_UNLIKELY(concurrent_txq)) {
        qid = qid % dev->up.n_txq;
        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
    }

    if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
        struct netdev *netdev = &dev->up;

        dpdk_do_tx_copy(netdev, qid, batch);
        dp_packet_delete_batch(batch, true);
    } else {
        struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
        int dropped;
        int tx_failure, mtu_drops, qos_drops, hwol_drops;
        int batch_cnt = dp_packet_batch_size(batch);
        struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;

        hwol_drops = batch_cnt;
        if (userspace_tso_enabled()) {
            batch_cnt = netdev_dpdk_prep_hwol_batch(dev, pkts, batch_cnt);
        }
        hwol_drops -= batch_cnt;
        mtu_drops = batch_cnt;
        batch_cnt = netdev_dpdk_filter_packet_len(dev, pkts, batch_cnt);
        mtu_drops -= batch_cnt;
        qos_drops = batch_cnt;
        batch_cnt = netdev_dpdk_qos_run(dev, pkts, batch_cnt, true);
        qos_drops -= batch_cnt;

        tx_failure = netdev_dpdk_eth_tx_burst(dev, qid, pkts, batch_cnt);

        dropped = tx_failure + mtu_drops + qos_drops + hwol_drops;
        if (OVS_UNLIKELY(dropped)) {
            rte_spinlock_lock(&dev->stats_lock);
            dev->stats.tx_dropped += dropped;
            sw_stats->tx_failure_drops += tx_failure;
            sw_stats->tx_mtu_exceeded_drops += mtu_drops;
            sw_stats->tx_qos_drops += qos_drops;
            sw_stats->tx_invalid_hwol_drops += hwol_drops;
            rte_spinlock_unlock(&dev->stats_lock);
        }
    }

    if (OVS_UNLIKELY(concurrent_txq)) {
        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
    }
}

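/* 'concurrent_txq' above is set when multiple PMD threads may share a tx
 * queue; in that case the qid is wrapped to a valid queue index and the
 * per-queue spinlock serializes the burst. */
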
static int
netdev_dpdk_eth_send(struct netdev *netdev, int qid,
                     struct dp_packet_batch *batch, bool concurrent_txq)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
    return 0;
}

static int
netdev_dpdk_set_etheraddr__(struct netdev_dpdk *dev, const struct eth_addr mac)
    OVS_REQUIRES(dev->mutex)
{
    int err = 0;

    if (dev->type == DPDK_DEV_ETH) {
        struct rte_ether_addr ea;

        memcpy(ea.addr_bytes, mac.ea, ETH_ADDR_LEN);
        err = -rte_eth_dev_default_mac_addr_set(dev->port_id, &ea);
    }
    if (err) {
        VLOG_WARN("%s: Failed to set requested mac("ETH_ADDR_FMT"): %s",
                  netdev_get_name(&dev->up), ETH_ADDR_ARGS(mac),
                  rte_strerror(err));
    }

    return err;
}

static int
netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int err = 0;

    ovs_mutex_lock(&dev->mutex);
    if (!eth_addr_equals(dev->hwaddr, mac)) {
        err = netdev_dpdk_set_etheraddr__(dev, mac);
        if (!err) {
            dev->hwaddr = mac;
            netdev_change_seq_changed(netdev);
        }
    }
    ovs_mutex_unlock(&dev->mutex);

    return err;
}

static int
netdev_dpdk_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    *mac = dev->hwaddr;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    *mtup = dev->mtu;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    /* XXX: Ensure that the overall frame length of the requested MTU does not
     * surpass the NETDEV_DPDK_MAX_PKT_LEN. DPDK device drivers differ in how
     * the L2 frame length is calculated for a given MTU when
     * rte_eth_dev_set_mtu(mtu) is called e.g. i40e driver includes 2 x vlan
     * headers, the em driver includes 1 x vlan header, the ixgbe driver does
     * not include vlan headers. As such we should use
     * MTU_TO_MAX_FRAME_LEN(mtu) which includes an additional 2 x vlan headers
     * (8 bytes) for comparison. This avoids a failure later with
     * rte_eth_dev_set_mtu(). This approach should be used until DPDK provides
     * a method to retrieve the upper bound MTU for a given device.
     */
    if (MTU_TO_MAX_FRAME_LEN(mtu) > NETDEV_DPDK_MAX_PKT_LEN
        || mtu < RTE_ETHER_MIN_MTU) {
        VLOG_WARN("%s: unsupported MTU %d\n", dev->up.name, mtu);
        return EINVAL;
    }

    ovs_mutex_lock(&dev->mutex);
    if (dev->requested_mtu != mtu) {
        dev->requested_mtu = mtu;
        netdev_request_reconfigure(netdev);
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);

static int
netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
                            struct netdev_stats *stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    rte_spinlock_lock(&dev->stats_lock);
    /* Supported Stats */
    stats->rx_packets = dev->stats.rx_packets;
    stats->tx_packets = dev->stats.tx_packets;
    stats->rx_dropped = dev->stats.rx_dropped;
    stats->tx_dropped = dev->stats.tx_dropped;
    stats->multicast = dev->stats.multicast;
    stats->rx_bytes = dev->stats.rx_bytes;
    stats->tx_bytes = dev->stats.tx_bytes;
    stats->rx_errors = dev->stats.rx_errors;
    stats->rx_length_errors = dev->stats.rx_length_errors;

    stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
    stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;
    stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;
    stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;
    stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;
    stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;
    stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;

    rte_spinlock_unlock(&dev->stats_lock);

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static void
netdev_dpdk_convert_xstats(struct netdev_stats *stats,
                           const struct rte_eth_xstat *xstats,
                           const struct rte_eth_xstat_name *names,
                           const unsigned int size)
{
/* DPDK XSTATS Counter names definition. */
#define DPDK_XSTATS \
    DPDK_XSTAT(multicast,               "rx_multicast_packets"            ) \
    DPDK_XSTAT(tx_multicast_packets,    "tx_multicast_packets"            ) \
    DPDK_XSTAT(rx_broadcast_packets,    "rx_broadcast_packets"            ) \
    DPDK_XSTAT(tx_broadcast_packets,    "tx_broadcast_packets"            ) \
    DPDK_XSTAT(rx_undersized_errors,    "rx_undersized_errors"            ) \
    DPDK_XSTAT(rx_oversize_errors,      "rx_oversize_errors"              ) \
    DPDK_XSTAT(rx_fragmented_errors,    "rx_fragmented_errors"            ) \
    DPDK_XSTAT(rx_jabber_errors,        "rx_jabber_errors"                ) \
    DPDK_XSTAT(rx_1_to_64_packets,      "rx_size_64_packets"              ) \
    DPDK_XSTAT(rx_65_to_127_packets,    "rx_size_65_to_127_packets"       ) \
    DPDK_XSTAT(rx_128_to_255_packets,   "rx_size_128_to_255_packets"      ) \
    DPDK_XSTAT(rx_256_to_511_packets,   "rx_size_256_to_511_packets"      ) \
    DPDK_XSTAT(rx_512_to_1023_packets,  "rx_size_512_to_1023_packets"     ) \
    DPDK_XSTAT(rx_1024_to_1522_packets, "rx_size_1024_to_1522_packets"    ) \
    DPDK_XSTAT(rx_1523_to_max_packets,  "rx_size_1523_to_max_packets"     ) \
    DPDK_XSTAT(tx_1_to_64_packets,      "tx_size_64_packets"              ) \
    DPDK_XSTAT(tx_65_to_127_packets,    "tx_size_65_to_127_packets"       ) \
    DPDK_XSTAT(tx_128_to_255_packets,   "tx_size_128_to_255_packets"      ) \
    DPDK_XSTAT(tx_256_to_511_packets,   "tx_size_256_to_511_packets"      ) \
    DPDK_XSTAT(tx_512_to_1023_packets,  "tx_size_512_to_1023_packets"     ) \
    DPDK_XSTAT(tx_1024_to_1522_packets, "tx_size_1024_to_1522_packets"    ) \
    DPDK_XSTAT(tx_1523_to_max_packets,  "tx_size_1523_to_max_packets"     )

    for (unsigned int i = 0; i < size; i++) {
#define DPDK_XSTAT(MEMBER, NAME)                  \
        if (strcmp(NAME, names[i].name) == 0) {   \
            stats->MEMBER = xstats[i].value;      \
            continue;                             \
        }
        DPDK_XSTATS;
    }
#undef DPDK_XSTAT
#undef DPDK_XSTATS
}

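/* Which extended statistics exist depends on the PMD; names that the driver
 * does not report simply leave the corresponding netdev_stats member
 * untouched. */
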
static int
netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_stats rte_stats;
    bool gg;

    netdev_dpdk_get_carrier(netdev, &gg);
    ovs_mutex_lock(&dev->mutex);

    struct rte_eth_xstat *rte_xstats = NULL;
    struct rte_eth_xstat_name *rte_xstats_names = NULL;
    int rte_xstats_len, rte_xstats_new_len, rte_xstats_ret;

    if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
        VLOG_ERR("Can't get ETH statistics for port: "DPDK_PORT_ID_FMT,
                 dev->port_id);
        ovs_mutex_unlock(&dev->mutex);
        return EPROTO;
    }

    /* Get length of statistics. */
    rte_xstats_len = rte_eth_xstats_get_names(dev->port_id, NULL, 0);
    if (rte_xstats_len < 0) {
        VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
                  dev->port_id);
        goto out;
    }
    /* Reserve memory for xstats names and values. */
    rte_xstats_names = xcalloc(rte_xstats_len, sizeof *rte_xstats_names);
    rte_xstats = xcalloc(rte_xstats_len, sizeof *rte_xstats);

    /* Retrieve xstats names. */
    rte_xstats_new_len = rte_eth_xstats_get_names(dev->port_id,
                                                  rte_xstats_names,
                                                  rte_xstats_len);
    if (rte_xstats_new_len != rte_xstats_len) {
        VLOG_WARN("Cannot get XSTATS names for port: "DPDK_PORT_ID_FMT,
                  dev->port_id);
        goto out;
    }
    /* Retrieve xstats values. */
    memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);
    rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
                                        rte_xstats_len);
    if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
        netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_names,
                                   rte_xstats_len);
    } else {
        VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
                  dev->port_id);
    }

out:
    free(rte_xstats);
    free(rte_xstats_names);

    stats->rx_packets = rte_stats.ipackets;
    stats->tx_packets = rte_stats.opackets;
    stats->rx_bytes = rte_stats.ibytes;
    stats->tx_bytes = rte_stats.obytes;
    stats->rx_errors = rte_stats.ierrors;
    stats->tx_errors = rte_stats.oerrors;

    rte_spinlock_lock(&dev->stats_lock);
    stats->tx_dropped = dev->stats.tx_dropped;
    stats->rx_dropped = dev->stats.rx_dropped;
    rte_spinlock_unlock(&dev->stats_lock);

    /* These are the available DPDK counters for packets not received due to
     * local resource constraints in DPDK and NIC respectively. */
    stats->rx_dropped += rte_stats.rx_nombuf + rte_stats.imissed;
    stats->rx_missed_errors = rte_stats.imissed;

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_custom_stats(const struct netdev *netdev,
                             struct netdev_custom_stats *custom_stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int rte_xstats_ret, sw_stats_size;
    int i;

    netdev_dpdk_get_sw_custom_stats(netdev, custom_stats);

    ovs_mutex_lock(&dev->mutex);

    if (netdev_dpdk_configure_xstats(dev)) {
        uint64_t *values = xcalloc(dev->rte_xstats_ids_size,
                                   sizeof(uint64_t));

        rte_xstats_ret =
                rte_eth_xstats_get_by_id(dev->port_id, dev->rte_xstats_ids,
                                         values, dev->rte_xstats_ids_size);

        if (rte_xstats_ret > 0 &&
            rte_xstats_ret <= dev->rte_xstats_ids_size) {

            sw_stats_size = custom_stats->size;
            custom_stats->size += rte_xstats_ret;
            custom_stats->counters = xrealloc(custom_stats->counters,
                                              custom_stats->size *
                                              sizeof *custom_stats->counters);

            for (i = 0; i < rte_xstats_ret; i++) {
                ovs_strlcpy(custom_stats->counters[sw_stats_size + i].name,
                            netdev_dpdk_get_xstat_name(dev,
                                                       dev->rte_xstats_ids[i]),
                            NETDEV_CUSTOM_STATS_NAME_SIZE);
                custom_stats->counters[sw_stats_size + i].value = values[i];
            }
        } else {
            VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
                      dev->port_id);
            /* Let's clear statistics cache, so it will be
             * reconfigured. */
            netdev_dpdk_clear_xstats(dev);
        }

        free(values);
    }

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_sw_custom_stats(const struct netdev *netdev,
                                struct netdev_custom_stats *custom_stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int i, n;

#define SW_CSTATS                    \
    SW_CSTAT(tx_retries)             \
    SW_CSTAT(tx_failure_drops)       \
    SW_CSTAT(tx_mtu_exceeded_drops)  \
    SW_CSTAT(tx_qos_drops)           \
    SW_CSTAT(rx_qos_drops)           \
    SW_CSTAT(tx_invalid_hwol_drops)

#define SW_CSTAT(NAME) + 1
    custom_stats->size = SW_CSTATS;
#undef SW_CSTAT
    custom_stats->counters = xcalloc(custom_stats->size,
                                     sizeof *custom_stats->counters);

    ovs_mutex_lock(&dev->mutex);

    rte_spinlock_lock(&dev->stats_lock);
    i = 0;
#define SW_CSTAT(NAME) \
    custom_stats->counters[i++].value = dev->sw_stats->NAME;
    SW_CSTATS;
#undef SW_CSTAT
    rte_spinlock_unlock(&dev->stats_lock);

    ovs_mutex_unlock(&dev->mutex);

    i = 0;
    n = 0;
#define SW_CSTAT(NAME) \
    if (custom_stats->counters[i].value != UINT64_MAX) {                   \
        ovs_strlcpy(custom_stats->counters[n].name,                        \
                    "ovs_"#NAME, NETDEV_CUSTOM_STATS_NAME_SIZE);           \
        custom_stats->counters[n].value = custom_stats->counters[i].value; \
        n++;                                                               \
    }                                                                      \
    i++;
    SW_CSTATS;
#undef SW_CSTAT

    custom_stats->size = n;
    return 0;
}

static int
netdev_dpdk_get_features(const struct netdev *netdev,
                         enum netdev_features *current,
                         enum netdev_features *advertised,
                         enum netdev_features *supported,
                         enum netdev_features *peer)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_link link;
    uint32_t feature = 0;

    ovs_mutex_lock(&dev->mutex);
    link = dev->link;
    ovs_mutex_unlock(&dev->mutex);

    /* Match against OpenFlow defined link speed values. */
    if (link.link_duplex == ETH_LINK_FULL_DUPLEX) {
        switch (link.link_speed) {
        case ETH_SPEED_NUM_10M:
            feature |= NETDEV_F_10MB_FD;
            break;
        case ETH_SPEED_NUM_100M:
            feature |= NETDEV_F_100MB_FD;
            break;
        case ETH_SPEED_NUM_1G:
            feature |= NETDEV_F_1GB_FD;
            break;
        case ETH_SPEED_NUM_10G:
            feature |= NETDEV_F_10GB_FD;
            break;
        case ETH_SPEED_NUM_40G:
            feature |= NETDEV_F_40GB_FD;
            break;
        case ETH_SPEED_NUM_100G:
            feature |= NETDEV_F_100GB_FD;
            break;
        default:
            feature |= NETDEV_F_OTHER;
        }
    } else if (link.link_duplex == ETH_LINK_HALF_DUPLEX) {
        switch (link.link_speed) {
        case ETH_SPEED_NUM_10M:
            feature |= NETDEV_F_10MB_HD;
            break;
        case ETH_SPEED_NUM_100M:
            feature |= NETDEV_F_100MB_HD;
            break;
        case ETH_SPEED_NUM_1G:
            feature |= NETDEV_F_1GB_HD;
            break;
        default:
            feature |= NETDEV_F_OTHER;
        }
    }

    if (link.link_autoneg) {
        feature |= NETDEV_F_AUTONEG;
    }

    *current = feature;
    *advertised = *supported = *peer = 0;

    return 0;
}

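/* The ingress policer meters in bytes: the kbits-per-second rate and kbits
 * burst configured by the user are converted below (1 kbit = 1000 bits).
 * The excess burst size (ebs) is left at zero, so the srTCM effectively
 * yields only green or red outcomes. */
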
static struct ingress_policer *
netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
{
    struct ingress_policer *policer = NULL;
    uint64_t rate_bytes;
    uint64_t burst_bytes;
    int err = 0;

    policer = xmalloc(sizeof *policer);
    rte_spinlock_init(&policer->policer_lock);

    /* rte_meter requires bytes so convert kbits rate and burst to bytes. */
    rate_bytes = rate * 1000ULL / 8;
    burst_bytes = burst * 1000ULL / 8;

    policer->app_srtcm_params.cir = rate_bytes;
    policer->app_srtcm_params.cbs = burst_bytes;
    policer->app_srtcm_params.ebs = 0;
    err = rte_meter_srtcm_profile_config(&policer->in_prof,
                                         &policer->app_srtcm_params);
    if (!err) {
        err = rte_meter_srtcm_config(&policer->in_policer,
                                     &policer->in_prof);
    }
    if (err) {
        VLOG_ERR("Could not create rte meter for ingress policer");
        free(policer);
        return NULL;
    }

    return policer;
}

static int
netdev_dpdk_set_policing(struct netdev *netdev, uint32_t policer_rate,
                         uint32_t policer_burst)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct ingress_policer *policer;

    /* Force to 0 if no rate specified,
     * default to 8000 kbits if burst is 0,
     * else stick with user-specified value.
     */
    policer_burst = (!policer_rate ? 0
                     : !policer_burst ? 8000
                     : policer_burst);

    ovs_mutex_lock(&dev->mutex);

    policer = ovsrcu_get_protected(struct ingress_policer *,
                                   &dev->ingress_policer);

    if (dev->policer_rate == policer_rate &&
        dev->policer_burst == policer_burst) {
        /* Assume that settings haven't changed since we last set them. */
        ovs_mutex_unlock(&dev->mutex);
        return 0;
    }

    /* Destroy any existing ingress policer for the device if one exists. */
    if (policer) {
        ovsrcu_postpone(free, policer);
    }

    if (policer_rate != 0) {
        policer = netdev_dpdk_policer_construct(policer_rate, policer_burst);
    } else {
        policer = NULL;
    }
    ovsrcu_set(&dev->ingress_policer, policer);
    dev->policer_rate = policer_rate;
    dev->policer_burst = policer_burst;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_ifindex(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    /* Calculate hash from the netdev name. Ensure that ifindex is a 24-bit
     * positive integer to meet RFC 2863 recommendations.
     */
    int ifindex = hash_string(netdev->name, 0) % 0xfffffe + 1;
    ovs_mutex_unlock(&dev->mutex);

    return ifindex;
}

static int
netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    check_link_status(dev);
    *carrier = dev->link.link_status;

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    if (is_vhost_running(dev)) {
        *carrier = 1;
    } else {
        *carrier = 0;
    }

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static long long int
netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    long long int carrier_resets;

    ovs_mutex_lock(&dev->mutex);
    carrier_resets = dev->link_reset_cnt;
    ovs_mutex_unlock(&dev->mutex);

    return carrier_resets;
}

static int
netdev_dpdk_set_miimon(struct netdev *netdev OVS_UNUSED,
                       long long int interval OVS_UNUSED)
{
    return EOPNOTSUPP;
}

static int
netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
                           enum netdev_flags off, enum netdev_flags on,
                           enum netdev_flags *old_flagsp)
    OVS_REQUIRES(dev->mutex)
{
    if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
        return EINVAL;
    }

    *old_flagsp = dev->flags;
    dev->flags |= on;
    dev->flags &= ~off;

    if (dev->flags == *old_flagsp) {
        return 0;
    }

    if (dev->type == DPDK_DEV_ETH) {

        if ((dev->flags ^ *old_flagsp) & NETDEV_UP) {
            int err;

            if (dev->flags & NETDEV_UP) {
                err = rte_eth_dev_set_link_up(dev->port_id);
            } else {
                err = rte_eth_dev_set_link_down(dev->port_id);
            }
            if (err == -ENOTSUP) {
                VLOG_INFO("Interface %s does not support link state "
                          "configuration", netdev_get_name(&dev->up));
            } else if (err < 0) {
                VLOG_ERR("Interface %s link change error: %s",
                         netdev_get_name(&dev->up), rte_strerror(-err));
                dev->flags = *old_flagsp;
                return -err;
            }
        }

        if (dev->flags & NETDEV_PROMISC) {
            rte_eth_promiscuous_enable(dev->port_id);
        }

        netdev_change_seq_changed(&dev->up);
    } else {
        /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is
         * running then change netdev's change_seq to trigger link state
         * update. */

        if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
            && is_vhost_running(dev)) {
            netdev_change_seq_changed(&dev->up);

            /* Clear statistics if device is getting up. */
            if (NETDEV_UP & on) {
                rte_spinlock_lock(&dev->stats_lock);
                memset(&dev->stats, 0, sizeof dev->stats);
                rte_spinlock_unlock(&dev->stats_lock);
            }
        }
    }

    return 0;
}

static int
netdev_dpdk_update_flags(struct netdev *netdev,
                         enum netdev_flags off, enum netdev_flags on,
                         enum netdev_flags *old_flagsp)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int error;

    ovs_mutex_lock(&dev->mutex);
    error = netdev_dpdk_update_flags__(dev, off, on, old_flagsp);
    ovs_mutex_unlock(&dev->mutex);

    return error;
}

static int
netdev_dpdk_vhost_user_get_status(const struct netdev *netdev,
                                  struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    bool client_mode = dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT;
    smap_add_format(args, "mode", "%s", client_mode ? "client" : "server");

    int vid = netdev_dpdk_get_vid(dev);
    if (vid < 0) {
        smap_add_format(args, "status", "disconnected");
        ovs_mutex_unlock(&dev->mutex);
        return 0;
    } else {
        smap_add_format(args, "status", "connected");
    }

    char socket_name[PATH_MAX];
    if (!rte_vhost_get_ifname(vid, socket_name, PATH_MAX)) {
        smap_add_format(args, "socket", "%s", socket_name);
    }

    uint64_t features;
    if (!rte_vhost_get_negotiated_features(vid, &features)) {
        smap_add_format(args, "features", "0x%016"PRIx64, features);
    }

    uint16_t mtu;
    if (!rte_vhost_get_mtu(vid, &mtu)) {
        smap_add_format(args, "mtu", "%d", mtu);
    }

    int numa = rte_vhost_get_numa_node(vid);
    if (numa >= 0) {
        smap_add_format(args, "numa", "%d", numa);
    }

    uint16_t vring_num = rte_vhost_get_vring_num(vid);
    if (vring_num) {
        smap_add_format(args, "num_of_vrings", "%d", vring_num);
    }

    for (int i = 0; i < vring_num; i++) {
        struct rte_vhost_vring vring;

        rte_vhost_get_vhost_vring(vid, i, &vring);
        smap_add_nocopy(args, xasprintf("vring_%d_size", i),
                        xasprintf("%d", vring.size));
    }

    ovs_mutex_unlock(&dev->mutex);
    return 0;
}

/*
 * Convert a given uint32_t link speed defined in DPDK to a string
 * equivalent.
 */
static const char *
netdev_dpdk_link_speed_to_str__(uint32_t link_speed)
{
    switch (link_speed) {
    case ETH_SPEED_NUM_10M:    return "10Mbps";
    case ETH_SPEED_NUM_100M:   return "100Mbps";
    case ETH_SPEED_NUM_1G:     return "1Gbps";
    case ETH_SPEED_NUM_2_5G:   return "2.5Gbps";
    case ETH_SPEED_NUM_5G:     return "5Gbps";
    case ETH_SPEED_NUM_10G:    return "10Gbps";
    case ETH_SPEED_NUM_20G:    return "20Gbps";
    case ETH_SPEED_NUM_25G:    return "25Gbps";
    case ETH_SPEED_NUM_40G:    return "40Gbps";
    case ETH_SPEED_NUM_50G:    return "50Gbps";
    case ETH_SPEED_NUM_56G:    return "56Gbps";
    case ETH_SPEED_NUM_100G:   return "100Gbps";
    default:                   return "Not Defined";
    }
}

static int
netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_dev_info dev_info;
    uint32_t link_speed;
    uint32_t dev_flags;

    if (!rte_eth_dev_is_valid_port(dev->port_id)) {
        return ENODEV;
    }

    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);
    rte_eth_dev_info_get(dev->port_id, &dev_info);
    link_speed = dev->link.link_speed;
    dev_flags = *dev_info.dev_flags;
    ovs_mutex_unlock(&dev->mutex);
    const struct rte_bus *bus;
    const struct rte_pci_device *pci_dev;
    uint16_t vendor_id = PCI_ANY_ID;
    uint16_t device_id = PCI_ANY_ID;
    bus = rte_bus_find_by_device(dev_info.device);
    if (bus && !strcmp(bus->name, "pci")) {
        pci_dev = RTE_DEV_TO_PCI(dev_info.device);
        if (pci_dev) {
            vendor_id = pci_dev->id.vendor_id;
            device_id = pci_dev->id.device_id;
        }
    }
    ovs_mutex_unlock(&dpdk_mutex);

    smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id);
    smap_add_format(args, "numa_id", "%d",
                    rte_eth_dev_socket_id(dev->port_id));
    smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
    smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
    smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
    smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
    smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
    smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
    smap_add_format(args, "max_hash_mac_addrs", "%u",
                    dev_info.max_hash_mac_addrs);
    smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
    smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);

    /* Querying the DPDK library for iftype may be done in future, pending
     * support; cf. RFC 3635 Section 3.2.4. */
    enum { IF_TYPE_ETHERNETCSMACD = 6 };

    smap_add_format(args, "if_type", "%"PRIu32, IF_TYPE_ETHERNETCSMACD);
    smap_add_format(args, "if_descr", "%s %s", rte_version(),
                    dev_info.driver_name);
    smap_add_format(args, "pci-vendor_id", "0x%x", vendor_id);
    smap_add_format(args, "pci-device_id", "0x%x", device_id);

    /* Not all link speeds are defined in the OpenFlow specs e.g. 25 Gbps.
     * In that case the speed will not be reported as part of the usual
     * call to get_features(). Get the link speed of the device and add it
     * to the device status in an easy to read string format.
     */
    smap_add(args, "link_speed",
             netdev_dpdk_link_speed_to_str__(link_speed));

    if (dev_flags & RTE_ETH_DEV_REPRESENTOR) {
        smap_add_format(args, "dpdk-vf-mac", ETH_ADDR_FMT,
                        ETH_ADDR_ARGS(dev->hwaddr));
    }

    return 0;
}

static void
netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
    OVS_REQUIRES(dev->mutex)
{
    enum netdev_flags old_flags;

    if (admin_state) {
        netdev_dpdk_update_flags__(dev, 0, NETDEV_UP, &old_flags);
    } else {
        netdev_dpdk_update_flags__(dev, NETDEV_UP, 0, &old_flags);
    }
}

static void
netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
                            const char *argv[], void *aux OVS_UNUSED)
{
    bool up;

    if (!strcasecmp(argv[argc - 1], "up")) {
        up = true;
    } else if (!strcasecmp(argv[argc - 1], "down")) {
        up = false;
    } else {
        unixctl_command_reply_error(conn, "Invalid Admin State");
        return;
    }

    if (argc > 2) {
        struct netdev *netdev = netdev_from_name(argv[1]);

        if (netdev && is_dpdk_class(netdev->netdev_class)) {
            struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

            ovs_mutex_lock(&dev->mutex);
            netdev_dpdk_set_admin_state__(dev, up);
            ovs_mutex_unlock(&dev->mutex);

            netdev_close(netdev);
        } else {
            unixctl_command_reply_error(conn, "Not a DPDK Interface");
            netdev_close(netdev);
            return;
        }
    } else {
        struct netdev_dpdk *dev;

        ovs_mutex_lock(&dpdk_mutex);
        LIST_FOR_EACH (dev, list_node, &dpdk_list) {
            ovs_mutex_lock(&dev->mutex);
            netdev_dpdk_set_admin_state__(dev, up);
            ovs_mutex_unlock(&dev->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);
    }
    unixctl_command_reply(conn, "OK");
}

static void
netdev_dpdk_detach(struct unixctl_conn *conn, int argc OVS_UNUSED,
                   const char *argv[], void *aux OVS_UNUSED)
{
    char *response;
    dpdk_port_t port_id;
    struct netdev_dpdk *dev;
    struct rte_device *rte_dev;
    struct ds used_interfaces = DS_EMPTY_INITIALIZER;
    bool used = false;

    ovs_mutex_lock(&dpdk_mutex);

    port_id = netdev_dpdk_get_port_by_devargs(argv[1]);
    if (!rte_eth_dev_is_valid_port(port_id)) {
        response = xasprintf("Device '%s' not found in DPDK", argv[1]);
        goto error;
    }

    rte_dev = rte_eth_devices[port_id].device;
    ds_put_format(&used_interfaces,
                  "Device '%s' is being used by the following interfaces:",
                  argv[1]);

    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        /* FIXME: avoid direct access to DPDK array rte_eth_devices. */
        if (rte_eth_devices[dev->port_id].device == rte_dev
            && rte_eth_devices[dev->port_id].state != RTE_ETH_DEV_UNUSED) {
            used = true;
            ds_put_format(&used_interfaces, " %s",
                          netdev_get_name(&dev->up));
        }
    }

    if (used) {
        ds_put_cstr(&used_interfaces, ". Remove them before detaching.");
        response = ds_steal_cstr(&used_interfaces);
        ds_destroy(&used_interfaces);
        goto error;
    }
    ds_destroy(&used_interfaces);

    rte_eth_dev_close(port_id);
    if (rte_dev_remove(rte_dev) < 0) {
        response = xasprintf("Device '%s' can not be detached", argv[1]);
        goto error;
    }

    response = xasprintf("All devices shared with device '%s' "
                         "have been detached", argv[1]);

    ovs_mutex_unlock(&dpdk_mutex);
    unixctl_command_reply(conn, response);
    free(response);
    return;

error:
    ovs_mutex_unlock(&dpdk_mutex);
    unixctl_command_reply_error(conn, response);
    free(response);
}

static void
netdev_dpdk_get_mempool_info(struct unixctl_conn *conn,
                             int argc, const char *argv[],
                             void *aux OVS_UNUSED)
{
    size_t size;
    FILE *stream;
    char *response = NULL;
    struct netdev *netdev = NULL;

    if (argc == 2) {
        netdev = netdev_from_name(argv[1]);
        if (!netdev || !is_dpdk_class(netdev->netdev_class)) {
            unixctl_command_reply_error(conn, "Not a DPDK Interface");
            goto out;
        }
    }

    stream = open_memstream(&response, &size);
    if (!stream) {
        response = xasprintf("Unable to open memstream: %s.",
                             ovs_strerror(errno));
        unixctl_command_reply_error(conn, response);
        goto out;
    }

    if (netdev) {
        struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

        ovs_mutex_lock(&dev->mutex);
        ovs_mutex_lock(&dpdk_mp_mutex);

        rte_mempool_dump(stream, dev->dpdk_mp->mp);

        ovs_mutex_unlock(&dpdk_mp_mutex);
        ovs_mutex_unlock(&dev->mutex);
    } else {
        ovs_mutex_lock(&dpdk_mp_mutex);
        rte_mempool_list_dump(stream);
        ovs_mutex_unlock(&dpdk_mp_mutex);
    }

    fclose(stream);

    unixctl_command_reply(conn, response);
out:
    free(response);
    netdev_close(netdev);
}

/*
 * Set virtqueue flags so that we do not receive interrupts.
 */
static void
set_irq_status(int vid)
{
    uint32_t i;

    for (i = 0; i < rte_vhost_get_vring_num(vid); i++) {
        rte_vhost_enable_guest_notification(vid, i, 0);
    }
}

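/* Worked example for the remapping below: with 4 tx queues of which only
 * queues 0 and 2 are enabled by the guest, the resulting map is
 * [0, 0, 2, 2]: disabled queues are spread round-robin over the enabled
 * ones so every OVS tx queue still reaches a live virtqueue. */
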
/*
 * Fixes mapping for vhost-user tx queues. Must be called after each
 * enabling/disabling of queues and n_txq modifications.
 */
static void
netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int *enabled_queues, n_enabled = 0;
    int i, k, total_txqs = dev->up.n_txq;

    enabled_queues = xcalloc(total_txqs, sizeof *enabled_queues);

    for (i = 0; i < total_txqs; i++) {
        /* Enabled queues always mapped to themselves. */
        if (dev->tx_q[i].map == i) {
            enabled_queues[n_enabled++] = i;
        }
    }

    if (n_enabled == 0 && total_txqs != 0) {
        enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
        n_enabled = 1;
    }

    k = 0;
    for (i = 0; i < total_txqs; i++) {
        if (dev->tx_q[i].map != i) {
            dev->tx_q[i].map = enabled_queues[k];
            k = (k + 1) % n_enabled;
        }
    }

    if (VLOG_IS_DBG_ENABLED()) {
        struct ds mapping = DS_EMPTY_INITIALIZER;

        ds_put_format(&mapping, "TX queue mapping for port '%s':\n",
                      netdev_get_name(&dev->up));
        for (i = 0; i < total_txqs; i++) {
            ds_put_format(&mapping, "%2d --> %2d\n", i, dev->tx_q[i].map);
        }

        VLOG_DBG("%s", ds_cstr(&mapping));
        ds_destroy(&mapping);
    }

    free(enabled_queues);
}

/*
 * A new virtio-net device is added to a vhost port.
 */
static int
new_device(int vid)
{
    struct netdev_dpdk *dev;
    bool exists = false;
    int newnode = 0;
    char ifname[IF_NAME_SZ];

    rte_vhost_get_ifname(vid, ifname, sizeof ifname);

    ovs_mutex_lock(&dpdk_mutex);
    /* Add device to the vhost port with the same name as that passed down. */
    LIST_FOR_EACH(dev, list_node, &dpdk_list) {
        ovs_mutex_lock(&dev->mutex);
        if (nullable_string_is_equal(ifname, dev->vhost_id)) {
            uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM;

            /* Get NUMA information. */
            newnode = rte_vhost_get_numa_node(vid);
            if (newnode == -1) {
#ifdef VHOST_NUMA
                VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
                          ifname);
#endif
                newnode = dev->socket_id;
            }

            if (dev->requested_n_txq < qp_num
                || dev->requested_n_rxq < qp_num
                || dev->requested_socket_id != newnode) {
                dev->requested_socket_id = newnode;
                dev->requested_n_rxq = qp_num;
                dev->requested_n_txq = qp_num;
                netdev_request_reconfigure(&dev->up);
            } else {
                /* Reconfiguration not required. */
                dev->vhost_reconfigured = true;
            }

            ovsrcu_index_set(&dev->vid, vid);
            exists = true;

            /* Disable notifications. */
            set_irq_status(vid);
            netdev_change_seq_changed(&dev->up);
            ovs_mutex_unlock(&dev->mutex);
            break;
        }
        ovs_mutex_unlock(&dev->mutex);
    }
    ovs_mutex_unlock(&dpdk_mutex);

    if (!exists) {
        VLOG_INFO("vHost Device '%s' can't be added - name not found", ifname);

        return -1;
    }

    VLOG_INFO("vHost Device '%s' has been added on numa node %i",
              ifname, newnode);

    return 0;
}

/* Clears mapping for all available queues of vhost interface. */
static void
netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int i;

    for (i = 0; i < dev->up.n_txq; i++) {
        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
    }
}

/*
 * Remove a virtio-net device from the specific vhost port.  Use dev->remove
 * flag to stop any more packets from being sent or received to/from a VM and
 * ensure all currently queued packets have been sent/received before removing
 * the device.
 */
static void
destroy_device(int vid)
{
    struct netdev_dpdk *dev;
    bool exists = false;
    char ifname[IF_NAME_SZ];

    rte_vhost_get_ifname(vid, ifname, sizeof ifname);

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        if (netdev_dpdk_get_vid(dev) == vid) {

            ovs_mutex_lock(&dev->mutex);
            dev->vhost_reconfigured = false;
            ovsrcu_index_set(&dev->vid, -1);
            memset(dev->vhost_rxq_enabled, 0,
                   dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled);
            netdev_dpdk_txq_map_clear(dev);

            netdev_change_seq_changed(&dev->up);
            ovs_mutex_unlock(&dev->mutex);
            exists = true;
            break;
        }
    }

    ovs_mutex_unlock(&dpdk_mutex);

    if (exists) {
        /*
         * Wait for other threads to quiesce after setting the 'virtio_dev'
         * to NULL, before returning.
         */
        ovsrcu_synchronize();
        /*
         * As call to ovsrcu_synchronize() will end the quiescent state,
         * put thread back into quiescent state before returning.
         */
        ovsrcu_quiesce_start();
        VLOG_INFO("vHost Device '%s' has been removed", ifname);
    } else {
        VLOG_INFO("vHost Device '%s' not found", ifname);
    }
}

static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
    struct netdev_dpdk *dev;
    bool exists = false;
    int qid = queue_id / VIRTIO_QNUM;
    bool is_rx = (queue_id % VIRTIO_QNUM) == VIRTIO_TXQ;
    char ifname[IF_NAME_SZ];

    rte_vhost_get_ifname(vid, ifname, sizeof ifname);

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        ovs_mutex_lock(&dev->mutex);
        if (nullable_string_is_equal(ifname, dev->vhost_id)) {
            if (is_rx) {
                bool old_state = dev->vhost_rxq_enabled[qid];

                dev->vhost_rxq_enabled[qid] = enable != 0;
                if (old_state != dev->vhost_rxq_enabled[qid]) {
                    netdev_change_seq_changed(&dev->up);
                }
            } else {
                if (enable) {
                    dev->tx_q[qid].map = qid;
                } else {
                    dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
                }
                netdev_dpdk_remap_txqs(dev);
            }
            exists = true;
            ovs_mutex_unlock(&dev->mutex);
            break;
        }
        ovs_mutex_unlock(&dev->mutex);
    }
    ovs_mutex_unlock(&dpdk_mutex);

    if (exists) {
        VLOG_INFO("State of queue %d ( %s_qid %d ) of vhost device '%s' "
                  "changed to \'%s\'", queue_id, is_rx == true ? "rx" : "tx",
                  qid, ifname, (enable == 1) ? "enabled" : "disabled");
    } else {
        VLOG_INFO("vHost Device '%s' not found", ifname);
        return -1;
    }

    return 0;
}

static int
destroy_connection(int vid)
{
    struct netdev_dpdk *dev;
    char ifname[IF_NAME_SZ];
    bool exists = false;

    rte_vhost_get_ifname(vid, ifname, sizeof ifname);

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        ovs_mutex_lock(&dev->mutex);
        if (nullable_string_is_equal(ifname, dev->vhost_id)) {
            uint32_t qp_num = NR_QUEUE;

            if (netdev_dpdk_get_vid(dev) >= 0) {
                VLOG_ERR("Connection on socket '%s' destroyed while vhost "
                         "device still attached.", dev->vhost_id);
            }

            /* Restore the number of queue pairs to default. */
            if (dev->requested_n_txq != qp_num
                || dev->requested_n_rxq != qp_num) {
                dev->requested_n_rxq = qp_num;
                dev->requested_n_txq = qp_num;
                netdev_request_reconfigure(&dev->up);
            }
            ovs_mutex_unlock(&dev->mutex);
            exists = true;
            break;
        }
        ovs_mutex_unlock(&dev->mutex);
    }
    ovs_mutex_unlock(&dpdk_mutex);

    if (exists) {
        VLOG_INFO("vHost Device '%s' connection has been destroyed", ifname);
    } else {
        VLOG_INFO("vHost Device '%s' not found", ifname);
    }

    return 0;
}

static void
vhost_guest_notified(int vid OVS_UNUSED)
{
    COVERAGE_INC(vhost_notification);
}

/*
 * Retrieve the DPDK virtio device ID (vid) associated with a vhostuser
 * or vhostuserclient netdev.
 *
 * Returns a value greater or equal to zero for a valid vid or '-1' if
 * there is no valid vid associated. A vid of '-1' must not be used in
 * rte_vhost_ API calls.
 *
 * Once obtained and validated, a vid can be used by a PMD for multiple
 * subsequent rte_vhost API calls until the PMD quiesces. A PMD should
 * not fetch the vid again for each of a series of API calls.
 */
int
netdev_dpdk_get_vid(const struct netdev_dpdk *dev)
{
    return ovsrcu_index_get(&dev->vid);
}

struct ingress_policer *
netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
{
    return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
}


static int
netdev_dpdk_class_init(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;

    /* This function can be called for different classes.  The initialization
     * needs to be done only once. */
    if (ovsthread_once_start(&once)) {
        int ret;

        ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
        unixctl_command_register("netdev-dpdk/set-admin-state",
                                 "[netdev] up|down", 1, 2,
                                 netdev_dpdk_set_admin_state, NULL);

        unixctl_command_register("netdev-dpdk/detach",
                                 "pci address of device", 1, 1,
                                 netdev_dpdk_detach, NULL);

        unixctl_command_register("netdev-dpdk/get-mempool-info",
                                 "[netdev]", 0, 1,
                                 netdev_dpdk_get_mempool_info, NULL);

        ret = rte_eth_dev_callback_register(RTE_ETH_ALL,
                                            RTE_ETH_EVENT_INTR_RESET,
                                            dpdk_eth_event_callback, NULL);
        if (ret != 0) {
            VLOG_ERR("Ethernet device callback register error: %s",
                     rte_strerror(-ret));
        }

        ovsthread_once_done(&once);
    }

    return 0;
}
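
/* The unixctl commands registered above are driven from the CLI, e.g.
 * (the port name "dpdk-p0" is a placeholder):
 *
 *     $ ovs-appctl netdev-dpdk/set-admin-state dpdk-p0 down
 *     $ ovs-appctl netdev-dpdk/get-mempool-info dpdk-p0
 */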

/*
 * Initialize QoS configuration operations.
 */
static void
qos_conf_init(struct qos_conf *conf, const struct dpdk_qos_ops *ops)
{
    conf->ops = ops;
    rte_spinlock_init(&conf->lock);
}

/*
 * Search existing QoS operations in qos_ops and compare each set of
 * operations qos_name to name.  Return a dpdk_qos_ops pointer to a match,
 * or NULL if no match is found.
 */
static const struct dpdk_qos_ops *
qos_lookup_name(const char *name)
{
    const struct dpdk_qos_ops *const *opsp;

    for (opsp = qos_confs; *opsp != NULL; opsp++) {
        const struct dpdk_qos_ops *ops = *opsp;
        if (!strcmp(name, ops->qos_name)) {
            return ops;
        }
    }
    return NULL;
}

static int
netdev_dpdk_get_qos_types(const struct netdev *netdev OVS_UNUSED,
                          struct sset *types)
{
    const struct dpdk_qos_ops *const *opsp;

    for (opsp = qos_confs; *opsp != NULL; opsp++) {
        const struct dpdk_qos_ops *ops = *opsp;
        if (ops->qos_construct && ops->qos_name[0] != '\0') {
            sset_add(types, ops->qos_name);
        }
    }

    return 0;
}

static int
netdev_dpdk_get_qos(const struct netdev *netdev,
                    const char **typep, struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct qos_conf *qos_conf;
    int error = 0;

    ovs_mutex_lock(&dev->mutex);
    qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
    if (qos_conf) {
        *typep = qos_conf->ops->qos_name;
        error = (qos_conf->ops->qos_get
                 ? qos_conf->ops->qos_get(qos_conf, details) : 0);
    } else {
        /* No QoS configuration set, return an empty string. */
        *typep = "";
    }
    ovs_mutex_unlock(&dev->mutex);

    return error;
}

static int
netdev_dpdk_set_qos(struct netdev *netdev, const char *type,
                    const struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const struct dpdk_qos_ops *new_ops = NULL;
    struct qos_conf *qos_conf, *new_qos_conf = NULL;
    int error = 0;

    ovs_mutex_lock(&dev->mutex);

    qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);

    new_ops = qos_lookup_name(type);

    if (!new_ops || !new_ops->qos_construct) {
        new_qos_conf = NULL;
        if (type && type[0]) {
            error = EOPNOTSUPP;
        }
    } else if (qos_conf && qos_conf->ops == new_ops
               && qos_conf->ops->qos_is_equal(qos_conf, details)) {
        new_qos_conf = qos_conf;
    } else {
        error = new_ops->qos_construct(details, &new_qos_conf);
    }

    if (error) {
        VLOG_ERR("Failed to set QoS type %s on port %s: %s",
                 type, netdev->name, rte_strerror(error));
    }

    if (new_qos_conf != qos_conf) {
        ovsrcu_set(&dev->qos_conf, new_qos_conf);
        if (qos_conf) {
            ovsrcu_postpone(qos_conf->ops->qos_destruct, qos_conf);
        }
    }

    ovs_mutex_unlock(&dev->mutex);

    return error;
}
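
/* The function above swaps in the new configuration with ovsrcu_set() and
 * defers freeing the old one with ovsrcu_postpone(), so datapath threads
 * reading dev->qos_conf never observe a half-destroyed policer.  A minimal
 * sketch of the lock-free reader side, modeled on netdev_dpdk_qos_run()
 * earlier in this file ('pkts', 'cnt' and 'should_steal' are assumed
 * locals):
 *
 *     struct qos_conf *conf = ovsrcu_get(struct qos_conf *, &dev->qos_conf);
 *     if (conf) {
 *         rte_spinlock_lock(&conf->lock);
 *         cnt = conf->ops->qos_run(conf, pkts, cnt, should_steal);
 *         rte_spinlock_unlock(&conf->lock);
 *     }
 */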

static int
netdev_dpdk_get_queue(const struct netdev *netdev, uint32_t queue_id,
                      struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct qos_conf *qos_conf;
    int error = 0;

    ovs_mutex_lock(&dev->mutex);

    qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
    if (!qos_conf || !qos_conf->ops || !qos_conf->ops->qos_queue_get) {
        error = EOPNOTSUPP;
    } else {
        error = qos_conf->ops->qos_queue_get(details, queue_id, qos_conf);
    }

    ovs_mutex_unlock(&dev->mutex);

    return error;
}

static int
netdev_dpdk_set_queue(struct netdev *netdev, uint32_t queue_id,
                      const struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct qos_conf *qos_conf;
    int error = 0;

    ovs_mutex_lock(&dev->mutex);

    qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
    if (!qos_conf || !qos_conf->ops || !qos_conf->ops->qos_queue_construct) {
        error = EOPNOTSUPP;
    } else {
        error = qos_conf->ops->qos_queue_construct(details, queue_id,
                                                   qos_conf);
    }

    if (error && error != EOPNOTSUPP) {
        VLOG_ERR("Failed to set QoS queue %d on port %s: %s",
                 queue_id, netdev_get_name(netdev), rte_strerror(error));
    }

    ovs_mutex_unlock(&dev->mutex);

    return error;
}

static void
netdev_dpdk_delete_queue(struct netdev *netdev, uint32_t queue_id)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct qos_conf *qos_conf;

    ovs_mutex_lock(&dev->mutex);

    qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
    if (qos_conf && qos_conf->ops && qos_conf->ops->qos_queue_destruct) {
        qos_conf->ops->qos_queue_destruct(qos_conf, queue_id);
    }

    ovs_mutex_unlock(&dev->mutex);
}

static int
netdev_dpdk_get_queue_stats(const struct netdev *netdev, uint32_t queue_id,
                            struct netdev_queue_stats *stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct qos_conf *qos_conf;
    int error = 0;

    ovs_mutex_lock(&dev->mutex);

    qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
    if (qos_conf && qos_conf->ops && qos_conf->ops->qos_queue_get_stats) {
        qos_conf->ops->qos_queue_get_stats(qos_conf, queue_id, stats);
    } else {
        error = EOPNOTSUPP;
    }

    ovs_mutex_unlock(&dev->mutex);

    return error;
}

static int
netdev_dpdk_queue_dump_start(const struct netdev *netdev, void **statep)
{
    int error = 0;
    struct qos_conf *qos_conf;
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
    if (qos_conf && qos_conf->ops
        && qos_conf->ops->qos_queue_dump_state_init) {
        struct netdev_dpdk_queue_state *state;

        *statep = state = xmalloc(sizeof *state);
        error = qos_conf->ops->qos_queue_dump_state_init(qos_conf, state);
    } else {
        error = EOPNOTSUPP;
    }

    ovs_mutex_unlock(&dev->mutex);

    return error;
}

static int
netdev_dpdk_queue_dump_next(const struct netdev *netdev, void *state_,
                            uint32_t *queue_idp, struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct netdev_dpdk_queue_state *state = state_;
    struct qos_conf *qos_conf;
    int error = EOF;

    ovs_mutex_lock(&dev->mutex);

    while (state->cur_queue < state->n_queues) {
        uint32_t queue_id = state->queues[state->cur_queue++];

        qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
        if (qos_conf && qos_conf->ops && qos_conf->ops->qos_queue_get) {
            *queue_idp = queue_id;
            error = qos_conf->ops->qos_queue_get(details, queue_id, qos_conf);
            break;
        }
    }

    ovs_mutex_unlock(&dev->mutex);

    return error;
}

static int
netdev_dpdk_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
                            void *state_)
{
    struct netdev_dpdk_queue_state *state = state_;

    free(state->queues);
    free(state);
    return 0;
}
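
/* The three callbacks above implement the usual netdev queue dump
 * protocol: dump_start() allocates the iteration state, dump_next() is
 * called repeatedly until it returns EOF, and dump_done() releases the
 * state.  A caller-side sketch, with hypothetical locals:
 *
 *     void *state;
 *     if (!netdev_dpdk_queue_dump_start(netdev, &state)) {
 *         uint32_t queue_id;
 *         while (!netdev_dpdk_queue_dump_next(netdev, state,
 *                                             &queue_id, &details)) {
 *             ...use queue_id and details...
 *         }
 *         netdev_dpdk_queue_dump_done(netdev, state);
 *     }
 */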

/* egress-policer details */

struct egress_policer {
    struct qos_conf qos_conf;
    struct rte_meter_srtcm_params app_srtcm_params;
    struct rte_meter_srtcm egress_meter;
    struct rte_meter_srtcm_profile egress_prof;
};

static void
egress_policer_details_to_param(const struct smap *details,
                                struct rte_meter_srtcm_params *params)
{
    memset(params, 0, sizeof *params);
    params->cir = smap_get_ullong(details, "cir", 0);
    params->cbs = smap_get_ullong(details, "cbs", 0);
    params->ebs = 0;
}

static int
egress_policer_qos_construct(const struct smap *details,
                             struct qos_conf **conf)
{
    struct egress_policer *policer;
    int err = 0;

    policer = xmalloc(sizeof *policer);
    qos_conf_init(&policer->qos_conf, &egress_policer_ops);
    egress_policer_details_to_param(details, &policer->app_srtcm_params);
    err = rte_meter_srtcm_profile_config(&policer->egress_prof,
                                         &policer->app_srtcm_params);
    if (!err) {
        err = rte_meter_srtcm_config(&policer->egress_meter,
                                     &policer->egress_prof);
    }

    if (!err) {
        *conf = &policer->qos_conf;
    } else {
        VLOG_ERR("Could not create rte meter for egress policer");
        free(policer);
        *conf = NULL;
        err = -err;
    }

    return err;
}

static void
egress_policer_qos_destruct(struct qos_conf *conf)
{
    struct egress_policer *policer = CONTAINER_OF(conf, struct egress_policer,
                                                  qos_conf);
    free(policer);
}

static int
egress_policer_qos_get(const struct qos_conf *conf, struct smap *details)
{
    struct egress_policer *policer =
        CONTAINER_OF(conf, struct egress_policer, qos_conf);

    smap_add_format(details, "cir", "%"PRIu64, policer->app_srtcm_params.cir);
    smap_add_format(details, "cbs", "%"PRIu64, policer->app_srtcm_params.cbs);

    return 0;
}

static bool
egress_policer_qos_is_equal(const struct qos_conf *conf,
                            const struct smap *details)
{
    struct egress_policer *policer =
        CONTAINER_OF(conf, struct egress_policer, qos_conf);
    struct rte_meter_srtcm_params params;

    egress_policer_details_to_param(details, &params);

    return !memcmp(&params, &policer->app_srtcm_params, sizeof params);
}

static int
egress_policer_run(struct qos_conf *conf, struct rte_mbuf **pkts, int pkt_cnt,
                   bool should_steal)
{
    int cnt = 0;
    struct egress_policer *policer =
        CONTAINER_OF(conf, struct egress_policer, qos_conf);

    cnt = srtcm_policer_run_single_packet(&policer->egress_meter,
                                          &policer->egress_prof, pkts,
                                          pkt_cnt, should_steal);

    return cnt;
}
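
/* The srTCM meter (RFC 2697) used here colors each packet against CIR/CBS
 * and drops packets marked red; srtcm_policer_run_single_packet(), defined
 * earlier in this file, compacts the surviving mbufs to the front of the
 * array and returns the new count.  Since CIR is expressed in bytes/sec,
 * policing a port to roughly 1 Mbit/s means cir = 1000000 / 8 = 125000. */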

static const struct dpdk_qos_ops egress_policer_ops = {
    .qos_name = "egress-policer",
    .qos_construct = egress_policer_qos_construct,
    .qos_destruct = egress_policer_qos_destruct,
    .qos_get = egress_policer_qos_get,
    .qos_is_equal = egress_policer_qos_is_equal,
    .qos_run = egress_policer_run
};

/* trtcm-policer details */

struct trtcm_policer {
    struct qos_conf qos_conf;
    struct rte_meter_trtcm_rfc4115_params meter_params;
    struct rte_meter_trtcm_rfc4115_profile meter_profile;
    struct rte_meter_trtcm_rfc4115 meter;
    struct netdev_queue_stats stats;
    struct hmap queues;
};

struct trtcm_policer_queue {
    struct hmap_node hmap_node;
    uint32_t queue_id;
    struct rte_meter_trtcm_rfc4115_params meter_params;
    struct rte_meter_trtcm_rfc4115_profile meter_profile;
    struct rte_meter_trtcm_rfc4115 meter;
    struct netdev_queue_stats stats;
};

static void
trtcm_policer_details_to_param(const struct smap *details,
                               struct rte_meter_trtcm_rfc4115_params *params)
{
    memset(params, 0, sizeof *params);
    params->cir = smap_get_ullong(details, "cir", 0);
    params->eir = smap_get_ullong(details, "eir", 0);
    params->cbs = smap_get_ullong(details, "cbs", 0);
    params->ebs = smap_get_ullong(details, "ebs", 0);
}

static void
trtcm_policer_param_to_detail(
    const struct rte_meter_trtcm_rfc4115_params *params,
    struct smap *details)
{
    smap_add_format(details, "cir", "%"PRIu64, params->cir);
    smap_add_format(details, "eir", "%"PRIu64, params->eir);
    smap_add_format(details, "cbs", "%"PRIu64, params->cbs);
    smap_add_format(details, "ebs", "%"PRIu64, params->ebs);
}
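
/* All four trTCM parameters come from other_config, in bytes/sec (cir,
 * eir) and bytes (cbs, ebs).  An illustrative configuration, with
 * arbitrary values and a placeholder port name:
 *
 *     $ ovs-vsctl set port dpdk1 qos=@qos -- \
 *         --id=@qos create qos type=trtcm-policer \
 *         other-config:cir=52000 other-config:cbs=2048 \
 *         other-config:eir=52000 other-config:ebs=2048
 */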

static int
trtcm_policer_qos_construct(const struct smap *details,
                            struct qos_conf **conf)
{
    struct trtcm_policer *policer;
    int err = 0;

    policer = xmalloc(sizeof *policer);
    qos_conf_init(&policer->qos_conf, &trtcm_policer_ops);
    trtcm_policer_details_to_param(details, &policer->meter_params);
    err = rte_meter_trtcm_rfc4115_profile_config(&policer->meter_profile,
                                                 &policer->meter_params);
    if (!err) {
        err = rte_meter_trtcm_rfc4115_config(&policer->meter,
                                             &policer->meter_profile);
    }

    if (!err) {
        *conf = &policer->qos_conf;
        memset(&policer->stats, 0, sizeof policer->stats);
        hmap_init(&policer->queues);
    } else {
        free(policer);
        *conf = NULL;
        err = -err;
    }

    return err;
}

static void
trtcm_policer_qos_destruct(struct qos_conf *conf)
{
    struct trtcm_policer_queue *queue, *next_queue;
    struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
                                                 qos_conf);

    HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node, &policer->queues) {
        hmap_remove(&policer->queues, &queue->hmap_node);
        free(queue);
    }

    hmap_destroy(&policer->queues);
    free(policer);
}

static int
trtcm_policer_qos_get(const struct qos_conf *conf, struct smap *details)
{
    struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
                                                 qos_conf);

    trtcm_policer_param_to_detail(&policer->meter_params, details);
    return 0;
}

static bool
trtcm_policer_qos_is_equal(const struct qos_conf *conf,
                           const struct smap *details)
{
    struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
                                                 qos_conf);
    struct rte_meter_trtcm_rfc4115_params params;

    trtcm_policer_details_to_param(details, &params);

    return !memcmp(&params, &policer->meter_params, sizeof params);
}

static struct trtcm_policer_queue *
trtcm_policer_qos_find_queue(struct trtcm_policer *policer, uint32_t queue_id)
{
    struct trtcm_policer_queue *queue;
    HMAP_FOR_EACH_WITH_HASH (queue, hmap_node, hash_2words(queue_id, 0),
                             &policer->queues) {
        if (queue->queue_id == queue_id) {
            return queue;
        }
    }
    return NULL;
}

static bool
trtcm_policer_run_single_packet(struct trtcm_policer *policer,
                                struct rte_mbuf *pkt, uint64_t time)
{
    enum rte_color pkt_color;
    struct trtcm_policer_queue *queue;
    uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct rte_ether_hdr);
    struct dp_packet *dpkt = CONTAINER_OF(pkt, struct dp_packet, mbuf);

    queue = trtcm_policer_qos_find_queue(policer, dpkt->md.skb_priority);
    if (!queue) {
        /* If no queue is found, use the default queue, which MUST exist. */
        queue = trtcm_policer_qos_find_queue(policer, 0);
        if (!queue) {
            return false;
        }
    }

    pkt_color = rte_meter_trtcm_rfc4115_color_blind_check(&queue->meter,
                                                          &queue->meter_profile,
                                                          time, pkt_len);

    if (pkt_color == RTE_COLOR_RED) {
        queue->stats.tx_errors++;
    } else {
        queue->stats.tx_bytes += pkt_len;
        queue->stats.tx_packets++;
    }

    pkt_color =
        rte_meter_trtcm_rfc4115_color_aware_check(&policer->meter,
                                                  &policer->meter_profile,
                                                  time, pkt_len, pkt_color);

    if (pkt_color == RTE_COLOR_RED) {
        policer->stats.tx_errors++;
        return false;
    }

    policer->stats.tx_bytes += pkt_len;
    policer->stats.tx_packets++;
    return true;
}
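
/* Note the two-stage metering above: the packet is first checked
 * color-blind against its queue's meter, then color-aware against the
 * port-level meter with the queue's verdict as the input color.  Because
 * a color-aware check can only keep or degrade the input color, a packet
 * already red at queue level stays red at port level; only packets that
 * exit both stages non-red are forwarded and counted as transmitted. */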

static int
trtcm_policer_run(struct qos_conf *conf, struct rte_mbuf **pkts, int pkt_cnt,
                  bool should_steal)
{
    int i = 0;
    int cnt = 0;
    struct rte_mbuf *pkt = NULL;
    uint64_t current_time = rte_rdtsc();

    struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
                                                 qos_conf);

    for (i = 0; i < pkt_cnt; i++) {
        pkt = pkts[i];

        if (trtcm_policer_run_single_packet(policer, pkt, current_time)) {
            if (cnt != i) {
                pkts[cnt] = pkt;
            }
            cnt++;
        } else {
            if (should_steal) {
                rte_pktmbuf_free(pkt);
            }
        }
    }
    return cnt;
}

static int
trtcm_policer_qos_queue_construct(const struct smap *details,
                                  uint32_t queue_id, struct qos_conf *conf)
{
    int err = 0;
    struct trtcm_policer_queue *queue;
    struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
                                                 qos_conf);

    queue = trtcm_policer_qos_find_queue(policer, queue_id);
    if (!queue) {
        queue = xmalloc(sizeof *queue);
        queue->queue_id = queue_id;
        memset(&queue->stats, 0, sizeof queue->stats);
        queue->stats.created = time_msec();
        hmap_insert(&policer->queues, &queue->hmap_node,
                    hash_2words(queue_id, 0));
    }
    if (queue_id == 0 && smap_is_empty(details)) {
        /* No default queue configured, use port values. */
        memcpy(&queue->meter_params, &policer->meter_params,
               sizeof queue->meter_params);
    } else {
        trtcm_policer_details_to_param(details, &queue->meter_params);
    }

    err = rte_meter_trtcm_rfc4115_profile_config(&queue->meter_profile,
                                                 &queue->meter_params);
    if (!err) {
        err = rte_meter_trtcm_rfc4115_config(&queue->meter,
                                             &queue->meter_profile);
    }
    if (err) {
        hmap_remove(&policer->queues, &queue->hmap_node);
        free(queue);
        err = -err;
    }
    return err;
}

static void
trtcm_policer_qos_queue_destruct(struct qos_conf *conf, uint32_t queue_id)
{
    struct trtcm_policer_queue *queue;
    struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
                                                 qos_conf);

    queue = trtcm_policer_qos_find_queue(policer, queue_id);
    if (queue) {
        hmap_remove(&policer->queues, &queue->hmap_node);
        free(queue);
    }
}

static int
trtcm_policer_qos_queue_get(struct smap *details, uint32_t queue_id,
                            const struct qos_conf *conf)
{
    struct trtcm_policer_queue *queue;
    struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
                                                 qos_conf);

    queue = trtcm_policer_qos_find_queue(policer, queue_id);
    if (!queue) {
        return EINVAL;
    }

    trtcm_policer_param_to_detail(&queue->meter_params, details);
    return 0;
}

static int
trtcm_policer_qos_queue_get_stats(const struct qos_conf *conf,
                                  uint32_t queue_id,
                                  struct netdev_queue_stats *stats)
{
    struct trtcm_policer_queue *queue;
    struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
                                                 qos_conf);

    queue = trtcm_policer_qos_find_queue(policer, queue_id);
    if (!queue) {
        return EINVAL;
    }
    memcpy(stats, &queue->stats, sizeof *stats);
    return 0;
}

static int
trtcm_policer_qos_queue_dump_state_init(const struct qos_conf *conf,
                                        struct netdev_dpdk_queue_state *state)
{
    uint32_t i = 0;
    struct trtcm_policer_queue *queue;
    struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
                                                 qos_conf);

    state->n_queues = hmap_count(&policer->queues);
    state->cur_queue = 0;
    state->queues = xmalloc(state->n_queues * sizeof *state->queues);

    HMAP_FOR_EACH (queue, hmap_node, &policer->queues) {
        state->queues[i++] = queue->queue_id;
    }
    return 0;
}

static const struct dpdk_qos_ops trtcm_policer_ops = {
    .qos_name = "trtcm-policer",
    .qos_construct = trtcm_policer_qos_construct,
    .qos_destruct = trtcm_policer_qos_destruct,
    .qos_get = trtcm_policer_qos_get,
    .qos_is_equal = trtcm_policer_qos_is_equal,
    .qos_run = trtcm_policer_run,
    .qos_queue_construct = trtcm_policer_qos_queue_construct,
    .qos_queue_destruct = trtcm_policer_qos_queue_destruct,
    .qos_queue_get = trtcm_policer_qos_queue_get,
    .qos_queue_get_stats = trtcm_policer_qos_queue_get_stats,
    .qos_queue_dump_state_init = trtcm_policer_qos_queue_dump_state_init
};

static int
netdev_dpdk_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int err = 0;

    ovs_mutex_lock(&dev->mutex);

    if (netdev->n_txq == dev->requested_n_txq
        && netdev->n_rxq == dev->requested_n_rxq
        && dev->mtu == dev->requested_mtu
        && dev->lsc_interrupt_mode == dev->requested_lsc_interrupt_mode
        && dev->rxq_size == dev->requested_rxq_size
        && dev->txq_size == dev->requested_txq_size
        && eth_addr_equals(dev->hwaddr, dev->requested_hwaddr)
        && dev->socket_id == dev->requested_socket_id
        && dev->started && !dev->reset_needed) {
        /* Reconfiguration is unnecessary. */
        goto out;
    }

    if (dev->reset_needed) {
        rte_eth_dev_reset(dev->port_id);
        if_notifier_manual_report();
        dev->reset_needed = false;
    } else {
        rte_eth_dev_stop(dev->port_id);
    }

    dev->started = false;

    err = netdev_dpdk_mempool_configure(dev);
    if (err && err != EEXIST) {
        goto out;
    }

    dev->lsc_interrupt_mode = dev->requested_lsc_interrupt_mode;

    netdev->n_txq = dev->requested_n_txq;
    netdev->n_rxq = dev->requested_n_rxq;

    dev->rxq_size = dev->requested_rxq_size;
    dev->txq_size = dev->requested_txq_size;

    rte_free(dev->tx_q);

    if (!eth_addr_equals(dev->hwaddr, dev->requested_hwaddr)) {
        err = netdev_dpdk_set_etheraddr__(dev, dev->requested_hwaddr);
        if (err) {
            goto out;
        }
    }

    err = dpdk_eth_dev_init(dev);
    if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
        netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
        netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
        netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
        netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
        if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
            netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
        }
    }

    /* If both requested and actual hwaddr were previously
     * unset (initialized to 0), then first device init above
     * will have set actual hwaddr to something new.
     * This would trigger spurious MAC reconfiguration unless
     * the requested MAC is kept in sync.
     *
     * This is harmless in case requested_hwaddr was
     * configured by the user, as netdev_dpdk_set_etheraddr__()
     * will have succeeded to get to this point.
     */
    dev->requested_hwaddr = dev->hwaddr;

    dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
    if (!dev->tx_q) {
        err = ENOMEM;
    }

    netdev_change_seq_changed(netdev);

out:
    ovs_mutex_unlock(&dev->mutex);
    return err;
}
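
/* Reconfiguration is requested elsewhere (via netdev_request_reconfigure())
 * whenever one of the requested_* fields changes, for example after
 * ("dpdk-p0" is a placeholder name):
 *
 *     $ ovs-vsctl set Interface dpdk-p0 options:n_rxq=4 mtu_request=9000
 *
 * The early-exit test at the top makes the call idempotent when nothing
 * actually changed. */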

static int
dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    dev->up.n_txq = dev->requested_n_txq;
    dev->up.n_rxq = dev->requested_n_rxq;
    int err;

    /* Always keep RX queue 0 enabled for implementations that won't
     * report vring states. */
    dev->vhost_rxq_enabled[0] = true;

    /* Enable TX queue 0 by default if it wasn't disabled. */
    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
        dev->tx_q[0].map = 0;
    }

    if (userspace_tso_enabled()) {
        dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
        VLOG_DBG("%s: TSO enabled on vhost port", netdev_get_name(&dev->up));
    }

    netdev_dpdk_remap_txqs(dev);

    err = netdev_dpdk_mempool_configure(dev);
    if (!err) {
        /* A new mempool was created or re-used. */
        netdev_change_seq_changed(&dev->up);
    } else if (err != EEXIST) {
        return err;
    }
    if (netdev_dpdk_get_vid(dev) >= 0) {
        if (dev->vhost_reconfigured == false) {
            dev->vhost_reconfigured = true;
            /* Carrier status may need updating. */
            netdev_change_seq_changed(&dev->up);
        }
    }

    return 0;
}

static int
netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int err;

    ovs_mutex_lock(&dev->mutex);
    err = dpdk_vhost_reconfigure_helper(dev);
    ovs_mutex_unlock(&dev->mutex);

    return err;
}

static int
netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int err;
    uint64_t vhost_flags = 0;
    uint64_t vhost_unsup_flags;

    ovs_mutex_lock(&dev->mutex);

    /* Configure vHost client mode if requested and if the following criteria
     * are met:
     *  1. Device hasn't been registered yet.
     *  2. A path has been specified.
     */
    if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && dev->vhost_id) {
        /* Register client-mode device. */
        vhost_flags |= RTE_VHOST_USER_CLIENT;

        /* There is no support for multi-segments buffers. */
        vhost_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;

        /* Enable IOMMU support, if explicitly requested. */
        if (dpdk_vhost_iommu_enabled()) {
            vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
        }

        /* Enable POSTCOPY support, if explicitly requested. */
        if (dpdk_vhost_postcopy_enabled()) {
            vhost_flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
        }

        /* Enable External Buffers if TCP Segmentation Offload is enabled. */
        if (userspace_tso_enabled()) {
            vhost_flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
        }

        err = rte_vhost_driver_register(dev->vhost_id, vhost_flags);
        if (err) {
            VLOG_ERR("vhost-user device setup failure for device %s\n",
                     dev->vhost_id);
            goto unlock;
        } else {
            /* Configuration successful. */
            dev->vhost_driver_flags |= vhost_flags;
            VLOG_INFO("vHost User device '%s' created in 'client' mode, "
                      "using client socket '%s'",
                      dev->up.name, dev->vhost_id);
        }

        err = rte_vhost_driver_callback_register(dev->vhost_id,
                                                 &virtio_net_device_ops);
        if (err) {
            VLOG_ERR("rte_vhost_driver_callback_register failed for "
                     "vhost user client port: %s\n", dev->up.name);
            goto unlock;
        }

        if (userspace_tso_enabled()) {
            netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
            netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
            netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
            netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
            netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
            vhost_unsup_flags = 1ULL << VIRTIO_NET_F_HOST_ECN
                                | 1ULL << VIRTIO_NET_F_HOST_UFO;
        } else {
            /* This disables checksum offloading and all the features
             * that depend on it (TSO, UFO, ECN) according to the virtio
             * specification. */
            vhost_unsup_flags = 1ULL << VIRTIO_NET_F_CSUM;
        }

        err = rte_vhost_driver_disable_features(dev->vhost_id,
                                                vhost_unsup_flags);
        if (err) {
            VLOG_ERR("rte_vhost_driver_disable_features failed for "
                     "vhost user client port: %s\n", dev->up.name);
            goto unlock;
        }

        err = rte_vhost_driver_start(dev->vhost_id);
        if (err) {
            VLOG_ERR("rte_vhost_driver_start failed for vhost user "
                     "client port: %s\n", dev->up.name);
            goto unlock;
        }
    }

    err = dpdk_vhost_reconfigure_helper(dev);

unlock:
    ovs_mutex_unlock(&dev->mutex);

    return err;
}
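
/* A dpdkvhostuserclient port only reaches the registration path above
 * once a server socket path has been configured, e.g. (names and the
 * socket path are placeholders):
 *
 *     $ ovs-vsctl add-port br0 vhost-client-1 -- \
 *         set Interface vhost-client-1 type=dpdkvhostuserclient \
 *         options:vhost-server-path=/tmp/dpdkvhostclient0
 */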

int
netdev_dpdk_get_port_id(struct netdev *netdev)
{
    struct netdev_dpdk *dev;
    int ret = -1;

    if (!is_dpdk_class(netdev->netdev_class)) {
        goto out;
    }

    dev = netdev_dpdk_cast(netdev);
    ovs_mutex_lock(&dev->mutex);
    ret = dev->port_id;
    ovs_mutex_unlock(&dev->mutex);
out:
    return ret;
}

bool
netdev_dpdk_flow_api_supported(struct netdev *netdev)
{
    struct netdev_dpdk *dev;
    bool ret = false;

    if (!is_dpdk_class(netdev->netdev_class)) {
        goto out;
    }

    dev = netdev_dpdk_cast(netdev);
    ovs_mutex_lock(&dev->mutex);
    if (dev->type == DPDK_DEV_ETH) {
        /* TODO: Check if we are able to offload some minimal flow. */
        ret = true;
    }
    ovs_mutex_unlock(&dev->mutex);
out:
    return ret;
}

int
netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
                             struct rte_flow *rte_flow,
                             struct rte_flow_error *error)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int ret;

    ovs_mutex_lock(&dev->mutex);
    ret = rte_flow_destroy(dev->port_id, rte_flow, error);
    ovs_mutex_unlock(&dev->mutex);
    return ret;
}

struct rte_flow *
netdev_dpdk_rte_flow_create(struct netdev *netdev,
                            const struct rte_flow_attr *attr,
                            const struct rte_flow_item *items,
                            const struct rte_flow_action *actions,
                            struct rte_flow_error *error)
{
    struct rte_flow *flow;
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    flow = rte_flow_create(dev->port_id, attr, items, actions, error);
    ovs_mutex_unlock(&dev->mutex);
    return flow;
}

int
netdev_dpdk_rte_flow_query_count(struct netdev *netdev,
                                 struct rte_flow *rte_flow,
                                 struct rte_flow_query_count *query,
                                 struct rte_flow_error *error)
{
    struct rte_flow_action_count count = { .shared = 0, .id = 0 };
    const struct rte_flow_action actions[] = {
        {
            .type = RTE_FLOW_ACTION_TYPE_COUNT,
            .conf = &count,
        },
        {
            .type = RTE_FLOW_ACTION_TYPE_END,
        },
    };
    struct netdev_dpdk *dev;
    int ret;

    if (!is_dpdk_class(netdev->netdev_class)) {
        return -1;
    }

    dev = netdev_dpdk_cast(netdev);
    ovs_mutex_lock(&dev->mutex);
    ret = rte_flow_query(dev->port_id, rte_flow, actions, query, error);
    ovs_mutex_unlock(&dev->mutex);
    return ret;
}
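
/* The query above passes a single COUNT action terminated by END: with
 * rte_flow_query(), the action array identifies which action of the
 * offloaded flow to read back, and the result lands in 'query'. */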

#define NETDEV_DPDK_CLASS_COMMON                            \
    .is_pmd = true,                                         \
    .alloc = netdev_dpdk_alloc,                             \
    .dealloc = netdev_dpdk_dealloc,                         \
    .get_config = netdev_dpdk_get_config,                   \
    .get_numa_id = netdev_dpdk_get_numa_id,                 \
    .set_etheraddr = netdev_dpdk_set_etheraddr,             \
    .get_etheraddr = netdev_dpdk_get_etheraddr,             \
    .get_mtu = netdev_dpdk_get_mtu,                         \
    .set_mtu = netdev_dpdk_set_mtu,                         \
    .get_ifindex = netdev_dpdk_get_ifindex,                 \
    .get_carrier_resets = netdev_dpdk_get_carrier_resets,   \
    .set_miimon_interval = netdev_dpdk_set_miimon,          \
    .set_policing = netdev_dpdk_set_policing,               \
    .get_qos_types = netdev_dpdk_get_qos_types,             \
    .get_qos = netdev_dpdk_get_qos,                         \
    .set_qos = netdev_dpdk_set_qos,                         \
    .get_queue = netdev_dpdk_get_queue,                     \
    .set_queue = netdev_dpdk_set_queue,                     \
    .delete_queue = netdev_dpdk_delete_queue,               \
    .get_queue_stats = netdev_dpdk_get_queue_stats,         \
    .queue_dump_start = netdev_dpdk_queue_dump_start,       \
    .queue_dump_next = netdev_dpdk_queue_dump_next,         \
    .queue_dump_done = netdev_dpdk_queue_dump_done,         \
    .update_flags = netdev_dpdk_update_flags,               \
    .rxq_alloc = netdev_dpdk_rxq_alloc,                     \
    .rxq_construct = netdev_dpdk_rxq_construct,             \
    .rxq_destruct = netdev_dpdk_rxq_destruct,               \
    .rxq_dealloc = netdev_dpdk_rxq_dealloc

#define NETDEV_DPDK_CLASS_BASE                          \
    NETDEV_DPDK_CLASS_COMMON,                           \
    .init = netdev_dpdk_class_init,                     \
    .destruct = netdev_dpdk_destruct,                   \
    .set_tx_multiq = netdev_dpdk_set_tx_multiq,         \
    .get_carrier = netdev_dpdk_get_carrier,             \
    .get_stats = netdev_dpdk_get_stats,                 \
    .get_custom_stats = netdev_dpdk_get_custom_stats,   \
    .get_features = netdev_dpdk_get_features,           \
    .get_status = netdev_dpdk_get_status,               \
    .reconfigure = netdev_dpdk_reconfigure,             \
    .rxq_recv = netdev_dpdk_rxq_recv

static const struct netdev_class dpdk_class = {
    .type = "dpdk",
    NETDEV_DPDK_CLASS_BASE,
    .construct = netdev_dpdk_construct,
    .set_config = netdev_dpdk_set_config,
    .send = netdev_dpdk_eth_send,
};

static const struct netdev_class dpdk_vhost_class = {
    .type = "dpdkvhostuser",
    NETDEV_DPDK_CLASS_COMMON,
    .construct = netdev_dpdk_vhost_construct,
    .destruct = netdev_dpdk_vhost_destruct,
    .send = netdev_dpdk_vhost_send,
    .get_carrier = netdev_dpdk_vhost_get_carrier,
    .get_stats = netdev_dpdk_vhost_get_stats,
    .get_custom_stats = netdev_dpdk_get_sw_custom_stats,
    .get_status = netdev_dpdk_vhost_user_get_status,
    .reconfigure = netdev_dpdk_vhost_reconfigure,
    .rxq_recv = netdev_dpdk_vhost_rxq_recv,
    .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
};

static const struct netdev_class dpdk_vhost_client_class = {
    .type = "dpdkvhostuserclient",
    NETDEV_DPDK_CLASS_COMMON,
    .construct = netdev_dpdk_vhost_client_construct,
    .destruct = netdev_dpdk_vhost_destruct,
    .set_config = netdev_dpdk_vhost_client_set_config,
    .send = netdev_dpdk_vhost_send,
    .get_carrier = netdev_dpdk_vhost_get_carrier,
    .get_stats = netdev_dpdk_vhost_get_stats,
    .get_custom_stats = netdev_dpdk_get_sw_custom_stats,
    .get_status = netdev_dpdk_vhost_user_get_status,
    .reconfigure = netdev_dpdk_vhost_client_reconfigure,
    .rxq_recv = netdev_dpdk_vhost_rxq_recv,
    .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
};

void
netdev_dpdk_register(void)
{
    netdev_register_provider(&dpdk_class);
    netdev_register_provider(&dpdk_vhost_class);
    netdev_register_provider(&dpdk_vhost_client_class);
}
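
/* Once these providers are registered, a port of each type can be created
 * from the database, e.g. (names and the PCI address are placeholders):
 *
 *     $ ovs-vsctl add-port br0 dpdk-p0 -- \
 *         set Interface dpdk-p0 type=dpdk \
 *         options:dpdk-devargs=0000:01:00.0
 */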