/*
 * Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "netdev-dpdk.h"

#include <linux/virtio_net.h>
#include <sys/socket.h>

#include <rte_bus_pci.h>
#include <rte_config.h>
#include <rte_cycles.h>
#include <rte_errno.h>
#include <rte_eth_ring.h>
#include <rte_ethdev.h>
#include <rte_malloc.h>
#include <rte_meter.h>
#include <rte_version.h>
#include <rte_vhost.h>

#include "dp-packet.h"
#include "dpif-netdev.h"
#include "fatal-signal.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/list.h"
#include "openvswitch/match.h"
#include "openvswitch/ofp-print.h"
#include "openvswitch/shash.h"
#include "openvswitch/vlog.h"
#include "ovs-thread.h"
#include "unaligned.h"
enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};

VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);

COVERAGE_DEFINE(vhost_tx_contention);
#define DPDK_PORT_WATCHDOG_INTERVAL 5

#define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
#define OVS_VPORT_DPDK "ovs_dpdk"
/*
 * need to reserve tons of extra space in the mbufs so we can align the
 * DMA addresses to 4KB.
 * The minimum mbuf size is limited to avoid scatter behaviour and drop in
 * performance for standard Ethernet MTU.
 */
#define ETHER_HDR_MAX_LEN           (ETHER_HDR_LEN + ETHER_CRC_LEN \
                                     + (2 * VLAN_HEADER_LEN))
#define MTU_TO_FRAME_LEN(mtu)       ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
#define MTU_TO_MAX_FRAME_LEN(mtu)   ((mtu) + ETHER_HDR_MAX_LEN)
#define FRAME_LEN_TO_MTU(frame_len) ((frame_len)                    \
                                     - ETHER_HDR_LEN - ETHER_CRC_LEN)
#define NETDEV_DPDK_MBUF_ALIGN      1024
#define NETDEV_DPDK_MAX_PKT_LEN     9728
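
/* Worked example of the macros above (illustrative, not in the original
 * code): for the standard Ethernet MTU of 1500, MTU_TO_FRAME_LEN(1500) is
 * 1500 + 14 + 4 = 1518 bytes, while MTU_TO_MAX_FRAME_LEN(1500) also reserves
 * room for two VLAN tags: 1500 + 14 + 4 + (2 * 4) = 1526 bytes. */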
/* Max and min number of packets in the mempool. OVS tries to allocate a
 * mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
 * enough hugepages) we keep halving the number until the allocation succeeds
 * or we reach MIN_NB_MBUF. */
#define MAX_NB_MBUF          (4096 * 64)
#define MIN_NB_MBUF          (4096 * 4)
#define MP_CACHE_SZ          RTE_MEMPOOL_CACHE_MAX_SIZE
/* MAX_NB_MBUF can be divided by 2 many times, until MIN_NB_MBUF */
BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF)
                  == 0);

/* The smallest possible NB_MBUF that we're going to try should be a multiple
 * of MP_CACHE_SZ. This is advised by DPDK documentation. */
BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF))
                  % MP_CACHE_SZ == 0);
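
/* Worked example (illustrative): MAX_NB_MBUF is 262144 and MIN_NB_MBUF is
 * 16384, so allocation tries 262144, 131072, 65536, 32768 and finally 16384
 * mbufs; the asserts above guarantee that every value in this halving
 * sequence is a multiple of MP_CACHE_SZ. */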
/* Default size of Physical NIC RXQ */
#define NIC_PORT_DEFAULT_RXQ_SIZE 2048
/* Default size of Physical NIC TXQ */
#define NIC_PORT_DEFAULT_TXQ_SIZE 2048
/* Maximum size of Physical NIC Queues */
#define NIC_PORT_MAX_Q_SIZE 4096

#define OVS_VHOST_MAX_QUEUE_NUM 1024  /* Maximum number of vHost TX queues. */
#define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
#define OVS_VHOST_QUEUE_DISABLED    (-2) /* Queue was disabled by guest and not
                                          * yet mapped to another queue. */

#define DPDK_ETH_PORT_ID_INVALID    RTE_MAX_ETHPORTS
/* DPDK library uses uint16_t for port_id. */
typedef uint16_t dpdk_port_t;
#define DPDK_PORT_ID_FMT "%"PRIu16
/* Minimum number of vhost tx retries; zero effectively disables retrying. */
#define VHOST_ENQ_RETRY_MIN 0
/* Maximum number of vhost tx retries. */
#define VHOST_ENQ_RETRY_MAX 32
/* Legacy default value for vhost tx retries. */
#define VHOST_ENQ_RETRY_DEF 8

#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
static const struct rte_eth_conf port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};
/*
 * These callbacks allow virtio-net devices to be added to vhost ports when
 * configuration has been fully completed.
 */
static int new_device(int vid);
static void destroy_device(int vid);
static int vring_state_changed(int vid, uint16_t queue_id, int enable);
static void destroy_connection(int vid);
static const struct vhost_device_ops virtio_net_device_ops =
{
    .new_device =  new_device,
    .destroy_device = destroy_device,
    .vring_state_changed = vring_state_changed,
    .features_changed = NULL,
    .new_connection = NULL,
    .destroy_connection = destroy_connection,
};
/* Custom software stats for dpdk ports */
struct netdev_dpdk_sw_stats {
    /* No. of retries when unable to transmit. */
    uint64_t tx_retries;
    /* Packet drops when unable to transmit; probably the Tx queue is full. */
    uint64_t tx_failure_drops;
    /* Packet length greater than device MTU. */
    uint64_t tx_mtu_exceeded_drops;
    /* Packet drops in egress policer processing. */
    uint64_t tx_qos_drops;
    /* Packet drops in ingress policer processing. */
    uint64_t rx_qos_drops;
};
enum { DPDK_RING_SIZE = 256 };
BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
enum { DRAIN_TSC = 200000ULL };
/* Quality of Service */

/* An instance of a QoS configuration. Always associated with a particular
 * network device.
 *
 * Each QoS implementation subclasses this with whatever additional data it
 * needs. */
struct qos_conf {
    const struct dpdk_qos_ops *ops;
    rte_spinlock_t lock;
};
/* A particular implementation of dpdk QoS operations.
 *
 * The functions below return 0 if successful or a positive errno value on
 * failure, except where otherwise noted. All of them must be provided, except
 * where otherwise noted. */
struct dpdk_qos_ops {

    /* Name of the QoS type */
    const char *qos_name;

    /* Called to construct a qos_conf object. The implementation should make
     * the appropriate calls to configure QoS according to 'details'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets '*conf' to an
     * initialized 'struct qos_conf'.
     *
     * For all QoS implementations it should always be non-null.
     */
    int (*qos_construct)(const struct smap *details, struct qos_conf **conf);

    /* Destroys the data structures allocated by the implementation as part of
     * 'qos_conf'.
     *
     * For all QoS implementations it should always be non-null.
     */
    void (*qos_destruct)(struct qos_conf *conf);

    /* Retrieves details of 'conf' configuration into 'details'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     */
    int (*qos_get)(const struct qos_conf *conf, struct smap *details);

    /* Returns true if 'conf' is already configured according to 'details'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * For all QoS implementations it should always be non-null.
     */
    bool (*qos_is_equal)(const struct qos_conf *conf,
                         const struct smap *details);

    /* Modify an array of rte_mbufs. The modification is specific to
     * each qos implementation.
     *
     * The function should take an array of mbufs and an int representing
     * the current number of mbufs present in the array.
     *
     * After the function has performed a qos modification to the array of
     * mbufs it returns an int representing the number of mbufs now present in
     * the array. This value can then be passed to the port send function
     * along with the modified array for transmission.
     *
     * For all QoS implementations it should always be non-null.
     */
    int (*qos_run)(struct qos_conf *qos_conf, struct rte_mbuf **pkts,
                   int pkt_cnt, bool should_steal);
};
/* dpdk_qos_ops for each type of user space QoS implementation */
static const struct dpdk_qos_ops egress_policer_ops;

/*
 * Array of dpdk_qos_ops, contains pointer to all supported QoS
 * operations.
 */
static const struct dpdk_qos_ops *const qos_confs[] = {
    &egress_policer_ops,
    NULL
};
static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dpdk_dev's. */
static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_list);

static struct ovs_mutex dpdk_mp_mutex OVS_ACQ_AFTER(dpdk_mutex)
    = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dpdk_mp's. */
static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mp_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_mp_list);
struct dpdk_mp {
    struct rte_mempool *mp;
    int mtu;
    int socket_id;
    int refcount;
    struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex);
};
/* There should be one 'struct dpdk_tx_queue' created for
 * each netdev tx queue. */
struct dpdk_tx_queue {
    /* Padding to make dpdk_tx_queue exactly one cache line long. */
    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* Protects the members and the NIC queue from concurrent access.
         * It is used only if the queue is shared among different pmd threads
         * (see 'concurrent_txq'). */
        rte_spinlock_t tx_lock;
        /* Mapping of configured vhost-user queue to the queue enabled by
         * the guest. */
        int map;
    );
};
/* DPDK has no way to remove dpdk ring ethernet devices,
 * so we have to keep them around once they've been created. */
static struct ovs_list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_ring_list);
struct dpdk_ring {
    /* For the client rings */
    struct rte_ring *cring_tx;
    struct rte_ring *cring_rx;
    unsigned int user_port_id; /* User-given port number, parsed from the
                                * port name. */
    dpdk_port_t eth_port_id; /* ethernet device port id */
    struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
};
struct ingress_policer {
    struct rte_meter_srtcm_params app_srtcm_params;
    struct rte_meter_srtcm in_policer;
    struct rte_meter_srtcm_profile in_prof;
    rte_spinlock_t policer_lock;
};
enum dpdk_hw_ol_features {
    NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
    NETDEV_RX_HW_CRC_STRIP = 1 << 1,
    NETDEV_RX_HW_SCATTER = 1 << 2
};
/*
 * In order to avoid confusion in variable names, the following naming
 * convention should be used, if possible:
 *
 *     'struct netdev'          : 'netdev'
 *     'struct netdev_dpdk'     : 'dev'
 *     'struct netdev_rxq'      : 'rxq'
 *     'struct netdev_rxq_dpdk' : 'rx'
 *
 * Example:
 *     struct netdev *netdev = netdev_from_name(name);
 *     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 *
 *     Also, 'netdev' should be used instead of 'dev->up', where 'netdev' was
 *     already defined.
 */
struct netdev_dpdk {
    PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
        dpdk_port_t port_id;

        /* If true, device was attached by rte_eth_dev_attach(). */
        bool attached;
        /* If true, rte_eth_dev_start() was successfully called */
        bool started;
        struct eth_addr hwaddr;
        int mtu;
        int socket_id;
        int buf_size;
        int max_packet_len;
        enum dpdk_dev_type type;
        enum netdev_flags flags;
        int link_reset_cnt;

        /* Device arguments for dpdk ports. */
        char *devargs;

        /* Identifier used to distinguish vhost devices from each other. */
        char *vhost_id;

        struct dpdk_tx_queue *tx_q;
        struct rte_eth_link link;
    );

    PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
        struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
        struct dpdk_mp *dpdk_mp;

        /* virtio identifier for vhost devices */
        ovsrcu_index vid;

        /* True if vHost device is 'up' and has been reconfigured at least
         * once */
        bool vhost_reconfigured;

        atomic_uint8_t vhost_tx_retries_max;
        /* 2 pad bytes here. */
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        struct netdev up;
        struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);

        /* QoS configuration and lock for the device */
        OVSRCU_TYPE(struct qos_conf *) qos_conf;

        /* Ingress Policer */
        OVSRCU_TYPE(struct ingress_policer *) ingress_policer;
        uint32_t policer_rate;
        uint32_t policer_burst;

        /* Array of vhost rxq states, see vring_state_changed. */
        bool *vhost_rxq_enabled;
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        struct netdev_stats stats;
        struct netdev_dpdk_sw_stats *sw_stats;
        /* Protects stats */
        rte_spinlock_t stats_lock;
        /* 36 pad bytes here. */
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* The following properties cannot be changed when a device is running,
         * so we remember the request and update them next time
         * netdev_dpdk*_reconfigure() is called */
        int requested_mtu;
        int requested_n_txq;
        int requested_n_rxq;
        int requested_rxq_size;
        int requested_txq_size;

        /* Number of rx/tx descriptors for physical devices */
        int rxq_size;
        int txq_size;

        /* Socket ID detected when vHost device is brought up */
        int requested_socket_id;

        /* Denotes whether vHost port is client/server mode */
        uint64_t vhost_driver_flags;

        /* DPDK-ETH Flow control */
        struct rte_eth_fc_conf fc_conf;

        /* DPDK-ETH hardware offload features,
         * from the enum set 'dpdk_hw_ol_features' */
        uint32_t hw_ol_features;

        /* Properties for link state change detection mode.
         * If lsc_interrupt_mode is set to false, poll mode is used,
         * otherwise interrupt mode is used. */
        bool requested_lsc_interrupt_mode;
        bool lsc_interrupt_mode;
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* Names of all XSTATS counters */
        struct rte_eth_xstat_name *rte_xstats_names;
        int rte_xstats_names_size;
        int rte_xstats_ids_size;
        uint64_t *rte_xstats_ids;
    );
};
struct netdev_rxq_dpdk {
    struct netdev_rxq up;
    dpdk_port_t port_id;
};
static void netdev_dpdk_destruct(struct netdev *netdev);
static void netdev_dpdk_vhost_destruct(struct netdev *netdev);

static int netdev_dpdk_get_sw_custom_stats(const struct netdev *,
                                           struct netdev_custom_stats *);
static void netdev_dpdk_clear_xstats(struct netdev_dpdk *dev);

int netdev_dpdk_get_vid(const struct netdev_dpdk *dev);

struct ingress_policer *
netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);
static bool
is_dpdk_class(const struct netdev_class *class)
{
    return class->destruct == netdev_dpdk_destruct
           || class->destruct == netdev_dpdk_vhost_destruct;
}
/* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
 * aligned at 1k or less. If a declared mbuf size is not a multiple of this
 * value, insufficient buffers are allocated to accommodate the packet in its
 * entirety. Furthermore, certain drivers need to ensure that there is also
 * sufficient space in the Rx buffer to accommodate two VLAN tags (for QinQ
 * frames). If the RX buffer is too small, then the driver enables scatter RX
 * behaviour, which reduces performance. To prevent this, use a buffer size
 * that is closest to 'mtu', but which satisfies the aforementioned criteria.
 */
static uint32_t
dpdk_buf_size(int mtu)
{
    return ROUND_UP(MTU_TO_MAX_FRAME_LEN(mtu), NETDEV_DPDK_MBUF_ALIGN)
           + RTE_PKTMBUF_HEADROOM;
}
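
/* Worked example (illustrative, assuming the usual RTE_PKTMBUF_HEADROOM of
 * 128 bytes): dpdk_buf_size(1500) = ROUND_UP(1526, 1024) + 128
 * = 2048 + 128 = 2176 bytes. */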
/* Allocates an area of 'sz' bytes from DPDK. The memory is zero'ed.
 *
 * Unlike xmalloc(), this function can return NULL on failure. */
static void *
dpdk_rte_mzalloc(size_t sz)
{
    return rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);
}
void
free_dpdk_buf(struct dp_packet *p)
{
    struct rte_mbuf *pkt = (struct rte_mbuf *) p;

    rte_pktmbuf_free(pkt);
}
static void
ovs_rte_pktmbuf_init(struct rte_mempool *mp OVS_UNUSED,
                     void *opaque_arg OVS_UNUSED,
                     void *_p,
                     unsigned i OVS_UNUSED)
{
    struct rte_mbuf *pkt = _p;

    dp_packet_init_dpdk((struct dp_packet *) pkt);
}
static int
dpdk_mp_full(const struct rte_mempool *mp) OVS_REQUIRES(dpdk_mp_mutex)
{
    /* At this point we want to know if all the mbufs are back
     * in the mempool. rte_mempool_full() is not atomic but it's
     * the best available and as we are no longer requesting mbufs
     * from the mempool, it means mbufs will not move from
     * 'mempool ring' --> 'mempool cache'. In rte_mempool_full()
     * the ring is counted before caches, so we won't get false
     * positives in this use case and we handle false negatives.
     *
     * If future implementations of rte_mempool_full() were to change
     * it could be possible for a false positive. Even that would
     * likely be ok, as there are additional checks during mempool
     * freeing but it would make things racy.
     */
    return rte_mempool_full(mp);
}
/* Free unused mempools. */
static void
dpdk_mp_sweep(void) OVS_REQUIRES(dpdk_mp_mutex)
{
    struct dpdk_mp *dmp, *next;

    LIST_FOR_EACH_SAFE (dmp, next, list_node, &dpdk_mp_list) {
        if (!dmp->refcount && dpdk_mp_full(dmp->mp)) {
            VLOG_DBG("Freeing mempool \"%s\"", dmp->mp->name);
            ovs_list_remove(&dmp->list_node);
            rte_mempool_free(dmp->mp);
            rte_free(dmp);
        }
    }
}
/* Calculating the required number of mbufs differs depending on the
 * mempool model being used. Check if per port memory is in use before
 * calculating. */
static uint32_t
dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
{
    uint32_t n_mbufs;

    if (!per_port_mp) {
        /* Shared memory is being used.
         * XXX: this is a really rough method of provisioning memory.
         * It's impossible to determine what the exact memory requirements are
         * when the number of ports and rxqs that utilize a particular mempool
         * can change dynamically at runtime. For now, use this rough
         * heuristic.
         */
        if (mtu >= ETHER_MTU) {
            n_mbufs = MAX_NB_MBUF;
        } else {
            n_mbufs = MIN_NB_MBUF;
        }
    } else {
        /* Per port memory is being used.
         * XXX: rough estimation of number of mbufs required for this port:
         * <packets required to fill the device rxqs>
         * + <packets that could be stuck on other ports txqs>
         * + <packets in the pmd threads>
         * + <additional memory for corner cases>
         */
        n_mbufs = dev->requested_n_rxq * dev->requested_rxq_size
                  + dev->requested_n_txq * dev->requested_txq_size
                  + MIN(RTE_MAX_LCORE, dev->requested_n_rxq) * NETDEV_MAX_BURST
                  + MIN_NB_MBUF;
    }

    return n_mbufs;
}
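
/* Worked example of the per-port estimate above (illustrative): with 2 rxqs
 * and 2 txqs of 2048 descriptors each, it yields 2 * 2048 + 2 * 2048
 * + 2 * NETDEV_MAX_BURST + MIN_NB_MBUF = 4096 + 4096 + 64 + 16384
 * = 24640 mbufs. */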
static struct dpdk_mp *
dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
{
    char mp_name[RTE_MEMPOOL_NAMESIZE];
    const char *netdev_name = netdev_get_name(&dev->up);
    int socket_id = dev->requested_socket_id;
    uint32_t n_mbufs = 0;
    uint32_t mbuf_size = 0;
    uint32_t aligned_mbuf_size = 0;
    uint32_t mbuf_priv_data_len = 0;
    uint32_t pkt_size = 0;
    uint32_t hash = hash_string(netdev_name, 0);
    struct dpdk_mp *dmp = NULL;
    int ret;

    dmp = dpdk_rte_mzalloc(sizeof *dmp);
    if (!dmp) {
        return NULL;
    }
    dmp->socket_id = socket_id;
    dmp->mtu = mtu;
    dmp->refcount = 1;

    /* Get the size of each mbuf, based on the MTU */
    mbuf_size = MTU_TO_FRAME_LEN(mtu);

    n_mbufs = dpdk_calculate_mbufs(dev, mtu, per_port_mp);

    do {
        /* Full DPDK memory pool name must be unique and cannot be
         * longer than RTE_MEMPOOL_NAMESIZE. Note that for the shared
         * mempool case this can result in one device using a mempool
         * which references a different device in its name. However as
         * mempool names are hashed, the device name will not be readable
         * so this is not an issue for tasks such as debugging.
         */
        ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE,
                       "ovs%08x%02d%05d%07u",
                       hash, socket_id, mtu, n_mbufs);
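        /* Illustrative example of the generated name: hash 0x1a2b3c4d on
         * socket 0 with mtu 1500 and 16384 mbufs produces
         * "ovs1a2b3c4d00015000016384" (25 characters), which fits within
         * RTE_MEMPOOL_NAMESIZE. */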
        if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) {
            VLOG_DBG("snprintf returned %d. "
                     "Failed to generate a mempool name for \"%s\". "
                     "Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.",
                     ret, netdev_name, hash, socket_id, mtu, n_mbufs);
            break;
        }

        VLOG_DBG("Port %s: Requesting a mempool of %u mbufs of size %u "
                 "on socket %d for %d Rx and %d Tx queues, "
                 "cache line size of %u",
                 netdev_name, n_mbufs, mbuf_size, socket_id,
                 dev->requested_n_rxq, dev->requested_n_txq,
                 RTE_CACHE_LINE_SIZE);

        /* The size of the mbuf's private area (i.e. area that holds OvS'
         * dp_packet data) */
        mbuf_priv_data_len = sizeof(struct dp_packet) -
                                 sizeof(struct rte_mbuf);
        /* The size of the entire dp_packet. */
        pkt_size = sizeof(struct dp_packet) + mbuf_size;
        /* mbuf size, rounded up to cacheline size. */
        aligned_mbuf_size = ROUND_UP(pkt_size, RTE_CACHE_LINE_SIZE);
        /* If there is a size discrepancy, add padding to mbuf_priv_data_len.
         * This maintains mbuf size cache alignment, while also honoring RX
         * buffer alignment in the data portion of the mbuf. If this adjustment
         * is not made, there is a possibility later on that for an element of
         * the mempool, buf, buf->data_len < (buf->buf_len - buf->data_off).
         * This is problematic in the case of multi-segment mbufs, particularly
         * when an mbuf segment needs to be resized (when pushing or popping a
         * VLAN header, for example).
         */
        mbuf_priv_data_len += (aligned_mbuf_size - pkt_size);

        dmp->mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ,
                                          mbuf_priv_data_len,
                                          mbuf_size + RTE_PKTMBUF_HEADROOM,
                                          socket_id);

        if (dmp->mp) {
            VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
                     mp_name, n_mbufs);
            /* rte_pktmbuf_pool_create has done some initialization of the
             * rte_mbuf part of each dp_packet, while ovs_rte_pktmbuf_init
             * initializes some OVS specific fields of dp_packet.
             */
            rte_mempool_obj_iter(dmp->mp, ovs_rte_pktmbuf_init, NULL);
            return dmp;
        } else if (rte_errno == EEXIST) {
            /* A mempool with the same name already exists. We just
             * retrieve its pointer to be returned to the caller. */
            dmp->mp = rte_mempool_lookup(mp_name);
            /* As the mempool create returned EEXIST we can expect the
             * lookup has returned a valid pointer. If for some reason
             * that's not the case we keep track of it. */
            VLOG_DBG("A mempool with name \"%s\" already exists at %p.",
                     mp_name, dmp->mp);
            return dmp;
        } else {
            VLOG_DBG("Failed to create mempool \"%s\" with a request of "
                     "%u mbufs, retrying with %u mbufs",
                     mp_name, n_mbufs, n_mbufs / 2);
        }
    } while (!dmp->mp && rte_errno == ENOMEM && (n_mbufs /= 2) >= MIN_NB_MBUF);

    VLOG_ERR("Failed to create mempool \"%s\" with a request of %u mbufs",
             mp_name, n_mbufs);
    rte_free(dmp);

    return NULL;
}
static struct dpdk_mp *
dpdk_mp_get(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
{
    struct dpdk_mp *dmp, *next;
    bool reuse = false;

    ovs_mutex_lock(&dpdk_mp_mutex);
    /* Check if shared memory is being used, if so check existing mempools
     * to see if reuse is possible. */
    if (!per_port_mp) {
        LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
            if (dmp->socket_id == dev->requested_socket_id
                && dmp->mtu == mtu) {
                VLOG_DBG("Reusing mempool \"%s\"", dmp->mp->name);
                dmp->refcount++;
                reuse = true;
                break;
            }
        }
    }
    /* Sweep mempools after reuse or before create. */
    dpdk_mp_sweep();

    if (!reuse) {
        dmp = dpdk_mp_create(dev, mtu, per_port_mp);
        if (dmp) {
            /* Shared memory will hit the reuse case above so will not
             * request a mempool that already exists, but we need to check
             * for the EEXIST case for the per port memory case. Compare the
             * mempool returned by dmp to each entry in dpdk_mp_list. If a
             * match is found, free dmp as a new entry is not required, set
             * dmp to point to the existing entry and increment the refcount
             * to avoid being freed at a later stage.
             */
            if (per_port_mp && rte_errno == EEXIST) {
                LIST_FOR_EACH (next, list_node, &dpdk_mp_list) {
                    if (dmp->mp == next->mp) {
                        rte_free(dmp);
                        dmp = next;
                        dmp->refcount++;
                    }
                }
            } else {
                ovs_list_push_back(&dpdk_mp_list, &dmp->list_node);
            }
        }
    }

    ovs_mutex_unlock(&dpdk_mp_mutex);

    return dmp;
}
/* Decrement reference to a mempool. */
static void
dpdk_mp_put(struct dpdk_mp *dmp)
{
    if (!dmp) {
        return;
    }

    ovs_mutex_lock(&dpdk_mp_mutex);
    ovs_assert(dmp->refcount);
    dmp->refcount--;
    ovs_mutex_unlock(&dpdk_mp_mutex);
}
/* Depending on the memory model being used this function tries to
 * identify and reuse an existing mempool or tries to allocate a new
 * mempool on requested_socket_id with mbuf size corresponding to the
 * requested_mtu. On success, a new configuration will be applied.
 * On error, device will be left unchanged. */
static int
netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
    struct dpdk_mp *dmp;
    int ret = 0;
    bool per_port_mp = dpdk_per_port_memory();

    /* With shared memory we do not need to configure a mempool if the MTU
     * and socket ID have not changed, the previous configuration is still
     * valid so return 0 */
    if (!per_port_mp && dev->mtu == dev->requested_mtu
        && dev->socket_id == dev->requested_socket_id) {
        return ret;
    }

    dmp = dpdk_mp_get(dev, FRAME_LEN_TO_MTU(buf_size), per_port_mp);
    if (!dmp) {
        VLOG_ERR("Failed to create memory pool for netdev "
                 "%s, with MTU %d on socket %d: %s\n",
                 dev->up.name, dev->requested_mtu, dev->requested_socket_id,
                 rte_strerror(rte_errno));
        return rte_errno;
    } else {
        /* Check for any pre-existing dpdk_mp for the device before accessing
         * the associated mempool.
         */
        if (dev->dpdk_mp != NULL) {
            /* A new MTU was requested, decrement the reference count for the
             * device's current dpdk_mp. This is required even if a pointer to
             * same dpdk_mp is returned by dpdk_mp_get. The refcount for dmp
             * has already been incremented by dpdk_mp_get at this stage so it
             * must be decremented to keep an accurate refcount for the
             * dpdk_mp.
             */
            dpdk_mp_put(dev->dpdk_mp);
        }
        dev->dpdk_mp = dmp;
        dev->mtu = dev->requested_mtu;
        dev->socket_id = dev->requested_socket_id;
        dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
    }

    return ret;
}
static void
check_link_status(struct netdev_dpdk *dev)
{
    struct rte_eth_link link;

    rte_eth_link_get_nowait(dev->port_id, &link);

    if (dev->link.link_status != link.link_status) {
        netdev_change_seq_changed(&dev->up);

        dev->link_reset_cnt++;
        dev->link = link;
        if (dev->link.link_status) {
            VLOG_DBG_RL(&rl,
                        "Port "DPDK_PORT_ID_FMT" Link Up - speed %u Mbps - %s",
                        dev->port_id, (unsigned) dev->link.link_speed,
                        (dev->link.link_duplex == ETH_LINK_FULL_DUPLEX)
                        ? "full-duplex" : "half-duplex");
        } else {
            VLOG_DBG_RL(&rl, "Port "DPDK_PORT_ID_FMT" Link Down",
                        dev->port_id);
        }
    }
}
static void *
dpdk_watchdog(void *dummy OVS_UNUSED)
{
    struct netdev_dpdk *dev;

    pthread_detach(pthread_self());

    for (;;) {
        ovs_mutex_lock(&dpdk_mutex);
        LIST_FOR_EACH (dev, list_node, &dpdk_list) {
            ovs_mutex_lock(&dev->mutex);
            if (dev->type == DPDK_DEV_ETH) {
                check_link_status(dev);
            }
            ovs_mutex_unlock(&dev->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);
        xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
    }

    return NULL;
}
static int
dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
{
    int diag = 0;
    int i;
    struct rte_eth_conf conf = port_conf;
    struct rte_eth_dev_info info;
    uint16_t conf_mtu;

    rte_eth_dev_info_get(dev->port_id, &info);

    /* As of DPDK 17.11.1 a few PMDs require to explicitly enable
     * scatter to support jumbo RX.
     * Setting scatter for the device is done after checking for
     * scatter support in the device capabilities. */
    if (dev->mtu > ETHER_MTU) {
        if (dev->hw_ol_features & NETDEV_RX_HW_SCATTER) {
            conf.rxmode.offloads |= DEV_RX_OFFLOAD_SCATTER;
        }
    }

    conf.intr_conf.lsc = dev->lsc_interrupt_mode;

    if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
        conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
    }

    if (!(dev->hw_ol_features & NETDEV_RX_HW_CRC_STRIP)
        && info.rx_offload_capa & DEV_RX_OFFLOAD_KEEP_CRC) {
        conf.rxmode.offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
    }

    /* Limit configured rss hash functions to only those supported
     * by the eth device. */
    conf.rx_adv_conf.rss_conf.rss_hf &= info.flow_type_rss_offloads;

    /* A device may report more queues than it makes available (this has
     * been observed for Intel xl710, which reserves some of them for
     * SRIOV): rte_eth_*_queue_setup will fail if a queue is not
     * available. When this happens we can retry the configuration
     * and request fewer queues */
    while (n_rxq && n_txq) {
        if (diag) {
            VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
        }

        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
        if (diag) {
            VLOG_WARN("Interface %s eth_dev setup error %s\n",
                      dev->up.name, rte_strerror(-diag));
            break;
        }

        diag = rte_eth_dev_set_mtu(dev->port_id, dev->mtu);
        if (diag) {
            /* A device may not support rte_eth_dev_set_mtu, in this case
             * flag a warning to the user and include the device's configured
             * MTU value that will be used instead. */
            if (-ENOTSUP == diag) {
                rte_eth_dev_get_mtu(dev->port_id, &conf_mtu);
                VLOG_WARN("Interface %s does not support MTU configuration, "
                          "max packet size supported is %"PRIu16".",
                          dev->up.name, conf_mtu);
            } else {
                VLOG_ERR("Interface %s MTU (%d) setup error: %s",
                         dev->up.name, dev->mtu, rte_strerror(-diag));
                break;
            }
        }

        for (i = 0; i < n_txq; i++) {
            diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
                                          dev->socket_id, NULL);
            if (diag) {
                VLOG_INFO("Interface %s unable to setup txq(%d): %s",
                          dev->up.name, i, rte_strerror(-diag));
                break;
            }
        }

        if (i != n_txq) {
            /* Retry with fewer tx queues */
            n_txq = i;
            continue;
        }

        for (i = 0; i < n_rxq; i++) {
            diag = rte_eth_rx_queue_setup(dev->port_id, i, dev->rxq_size,
                                          dev->socket_id, NULL,
                                          dev->dpdk_mp->mp);
            if (diag) {
                VLOG_INFO("Interface %s unable to setup rxq(%d): %s",
                          dev->up.name, i, rte_strerror(-diag));
                break;
            }
        }

        if (i != n_rxq) {
            /* Retry with fewer rx queues */
            n_rxq = i;
            continue;
        }

        dev->up.n_rxq = n_rxq;
        dev->up.n_txq = n_txq;

        return 0;
    }

    return diag;
}
static void
dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex)
{
    if (rte_eth_dev_flow_ctrl_set(dev->port_id, &dev->fc_conf)) {
        VLOG_WARN("Failed to enable flow control on device "DPDK_PORT_ID_FMT,
                  dev->port_id);
    }
}
static int
dpdk_eth_dev_init(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    struct rte_pktmbuf_pool_private *mbp_priv;
    struct rte_eth_dev_info info;
    struct ether_addr eth_addr;
    int diag;
    int n_rxq, n_txq;
    uint32_t rx_chksm_offload_capa = DEV_RX_OFFLOAD_UDP_CKSUM |
                                     DEV_RX_OFFLOAD_TCP_CKSUM |
                                     DEV_RX_OFFLOAD_IPV4_CKSUM;

    rte_eth_dev_info_get(dev->port_id, &info);

    if (strstr(info.driver_name, "vf") != NULL) {
        VLOG_INFO("Virtual function detected, HW_CRC_STRIP will be enabled");
        dev->hw_ol_features |= NETDEV_RX_HW_CRC_STRIP;
    } else {
        dev->hw_ol_features &= ~NETDEV_RX_HW_CRC_STRIP;
    }

    if ((info.rx_offload_capa & rx_chksm_offload_capa) !=
            rx_chksm_offload_capa) {
        VLOG_WARN("Rx checksum offload is not supported on port "
                  DPDK_PORT_ID_FMT, dev->port_id);
        dev->hw_ol_features &= ~NETDEV_RX_CHECKSUM_OFFLOAD;
    } else {
        dev->hw_ol_features |= NETDEV_RX_CHECKSUM_OFFLOAD;
    }

    if (info.rx_offload_capa & DEV_RX_OFFLOAD_SCATTER) {
        dev->hw_ol_features |= NETDEV_RX_HW_SCATTER;
    } else {
        /* Do not warn on lack of scatter support */
        dev->hw_ol_features &= ~NETDEV_RX_HW_SCATTER;
    }

    n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
    n_txq = MIN(info.max_tx_queues, dev->up.n_txq);

    diag = dpdk_eth_dev_port_config(dev, n_rxq, n_txq);
    if (diag) {
        VLOG_ERR("Interface %s(rxq:%d txq:%d lsc interrupt mode:%s) "
                 "configure error: %s",
                 dev->up.name, n_rxq, n_txq,
                 dev->lsc_interrupt_mode ? "true" : "false",
                 rte_strerror(-diag));
        return -diag;
    }

    diag = rte_eth_dev_start(dev->port_id);
    if (diag) {
        VLOG_ERR("Interface %s start error: %s", dev->up.name,
                 rte_strerror(-diag));
        return -diag;
    }
    dev->started = true;

    rte_eth_promiscuous_enable(dev->port_id);
    rte_eth_allmulticast_enable(dev->port_id);

    memset(&eth_addr, 0x0, sizeof(eth_addr));
    rte_eth_macaddr_get(dev->port_id, &eth_addr);
    VLOG_INFO_RL(&rl, "Port "DPDK_PORT_ID_FMT": "ETH_ADDR_FMT,
                 dev->port_id, ETH_ADDR_BYTES_ARGS(eth_addr.addr_bytes));

    memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
    rte_eth_link_get_nowait(dev->port_id, &dev->link);

    mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
    dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;

    return 0;
}
static struct netdev_dpdk *
netdev_dpdk_cast(const struct netdev *netdev)
{
    return CONTAINER_OF(netdev, struct netdev_dpdk, up);
}

static struct netdev *
netdev_dpdk_alloc(void)
{
    struct netdev_dpdk *dev;

    dev = dpdk_rte_mzalloc(sizeof *dev);
    if (dev) {
        return &dev->up;
    }

    return NULL;
}
static struct dpdk_tx_queue *
netdev_dpdk_alloc_txq(unsigned int n_txqs)
{
    struct dpdk_tx_queue *txqs;
    unsigned i;

    txqs = dpdk_rte_mzalloc(n_txqs * sizeof *txqs);
    if (txqs) {
        for (i = 0; i < n_txqs; i++) {
            /* Initialize map for vhost devices. */
            txqs[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
            rte_spinlock_init(&txqs[i].tx_lock);
        }
    }

    return txqs;
}
static int
common_construct(struct netdev *netdev, dpdk_port_t port_no,
                 enum dpdk_dev_type type, int socket_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_init(&dev->mutex);

    rte_spinlock_init(&dev->stats_lock);

    /* If the 'sid' is negative, it means that the kernel fails
     * to obtain the pci numa info. In that situation, always
     * use 'SOCKET0'. */
    dev->socket_id = socket_id < 0 ? SOCKET0 : socket_id;
    dev->requested_socket_id = dev->socket_id;
    dev->port_id = port_no;
    dev->type = type;
    dev->requested_mtu = ETHER_MTU;
    dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
    dev->requested_lsc_interrupt_mode = 0;
    ovsrcu_index_init(&dev->vid, -1);
    dev->vhost_reconfigured = false;
    dev->attached = false;

    ovsrcu_init(&dev->qos_conf, NULL);

    ovsrcu_init(&dev->ingress_policer, NULL);
    dev->policer_rate = 0;
    dev->policer_burst = 0;

    dev->requested_n_rxq = NR_QUEUE;
    dev->requested_n_txq = NR_QUEUE;
    dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
    dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;

    /* Initialize the flow control to NULL */
    memset(&dev->fc_conf, 0, sizeof dev->fc_conf);

    /* Initialize the hardware offload flags to 0 */
    dev->hw_ol_features = 0;

    dev->flags = NETDEV_UP | NETDEV_PROMISC;

    ovs_list_push_back(&dpdk_list, &dev->list_node);

    netdev_request_reconfigure(netdev);

    dev->rte_xstats_names = NULL;
    dev->rte_xstats_names_size = 0;

    dev->rte_xstats_ids = NULL;
    dev->rte_xstats_ids_size = 0;

    dev->sw_stats = xzalloc(sizeof *dev->sw_stats);
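    /* Note (illustrative): tx retries are a vhost-only counter; UINT64_MAX
     * below marks the counter as unsupported on other port types so that it
     * can be skipped when custom software stats are reported. */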
    dev->sw_stats->tx_retries = (dev->type == DPDK_DEV_VHOST) ? 0 : UINT64_MAX;

    return 0;
}
/* dev_name must be the prefix followed by a positive decimal number.
 * (no leading + or - signs are allowed) */
static bool
dpdk_dev_parse_name(const char dev_name[], const char prefix[],
                    unsigned int *port_no)
{
    const char *cport;

    if (strncmp(dev_name, prefix, strlen(prefix))) {
        return false;
    }

    cport = dev_name + strlen(prefix);

    if (str_to_uint(cport, 10, port_no)) {
        return true;
    } else {
        return false;
    }
}
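
/* Illustrative example: with prefix "dpdkr", "dpdkr7" parses to
 * *port_no == 7, while "dpdkr-7" and "dpdkr7a" are rejected. */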
/* Get the number of OVS interfaces which have the same DPDK
 * rte device (e.g. same pci bus address).
 * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
 */
static int
netdev_dpdk_get_num_ports(struct rte_device *device)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *dev;
    int count = 0;

    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        if (rte_eth_devices[dev->port_id].device == device
            && rte_eth_devices[dev->port_id].state != RTE_ETH_DEV_UNUSED) {
            count++;
        }
    }

    return count;
}
static int
vhost_common_construct(struct netdev *netdev)
    OVS_REQUIRES(dpdk_mutex)
{
    int socket_id = rte_lcore_to_socket_id(rte_get_master_lcore());
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    dev->vhost_rxq_enabled = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM *
                                              sizeof *dev->vhost_rxq_enabled);
    if (!dev->vhost_rxq_enabled) {
        return ENOMEM;
    }
    dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
    if (!dev->tx_q) {
        rte_free(dev->vhost_rxq_enabled);
        return ENOMEM;
    }

    atomic_init(&dev->vhost_tx_retries_max, VHOST_ENQ_RETRY_DEF);

    return common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
                            DPDK_DEV_VHOST, socket_id);
}
static int
netdev_dpdk_vhost_construct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const char *name = netdev->name;
    int err;

    /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
     * the file system. '/' or '\' would traverse directories, so they're not
     * acceptable in 'name'. */
    if (strchr(name, '/') || strchr(name, '\\')) {
        VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
                 "A valid name must not include '/' or '\\'",
                 name);
        return EINVAL;
    }

    ovs_mutex_lock(&dpdk_mutex);
    /* Take the name of the vhost-user port and append it to the location
     * where the socket is to be created, then register the socket.
     */
    dev->vhost_id = xasprintf("%s/%s", dpdk_get_vhost_sock_dir(), name);

    dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
    err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags);
    if (err) {
        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
                 dev->vhost_id);
        goto out;
    } else {
        fatal_signal_add_file_to_unlink(dev->vhost_id);
        VLOG_INFO("Socket %s created for vhost-user port %s\n",
                  dev->vhost_id, name);
    }

    err = rte_vhost_driver_callback_register(dev->vhost_id,
                                             &virtio_net_device_ops);
    if (err) {
        VLOG_ERR("rte_vhost_driver_callback_register failed for vhost user "
                 "port: %s\n", name);
        goto out;
    }

    err = rte_vhost_driver_disable_features(dev->vhost_id,
                                            1ULL << VIRTIO_NET_F_HOST_TSO4
                                            | 1ULL << VIRTIO_NET_F_HOST_TSO6
                                            | 1ULL << VIRTIO_NET_F_CSUM);
    if (err) {
        VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
                 "port: %s\n", name);
        goto out;
    }

    err = rte_vhost_driver_start(dev->vhost_id);
    if (err) {
        VLOG_ERR("rte_vhost_driver_start failed for vhost user "
                 "port: %s\n", name);
        goto out;
    }

    err = vhost_common_construct(netdev);
    if (err) {
        VLOG_ERR("vhost_common_construct failed for vhost user "
                 "port: %s\n", name);
    }

out:
    if (err) {
        free(dev->vhost_id);
        dev->vhost_id = NULL;
    }

    ovs_mutex_unlock(&dpdk_mutex);
    VLOG_WARN_ONCE("dpdkvhostuser ports are considered deprecated; "
                   "please migrate to dpdkvhostuserclient ports.");

    return err;
}
static int
netdev_dpdk_vhost_client_construct(struct netdev *netdev)
{
    int err;

    ovs_mutex_lock(&dpdk_mutex);
    err = vhost_common_construct(netdev);
    if (err) {
        VLOG_ERR("vhost_common_construct failed for vhost user client "
                 "port: %s\n", netdev->name);
    }
    ovs_mutex_unlock(&dpdk_mutex);

    return err;
}
static int
netdev_dpdk_construct(struct netdev *netdev)
{
    int err;

    ovs_mutex_lock(&dpdk_mutex);
    err = common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
                           DPDK_DEV_ETH, SOCKET0);
    ovs_mutex_unlock(&dpdk_mutex);

    return err;
}
static void
common_destruct(struct netdev_dpdk *dev)
    OVS_REQUIRES(dpdk_mutex)
    OVS_EXCLUDED(dev->mutex)
{
    rte_free(dev->tx_q);
    dpdk_mp_put(dev->dpdk_mp);

    ovs_list_remove(&dev->list_node);
    free(ovsrcu_get_protected(struct ingress_policer *,
                              &dev->ingress_policer));
    free(dev->sw_stats);
    ovs_mutex_destroy(&dev->mutex);
}
static void
netdev_dpdk_destruct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_device *rte_dev;
    struct rte_eth_dev *eth_dev;
    bool remove_on_close;

    ovs_mutex_lock(&dpdk_mutex);

    rte_eth_dev_stop(dev->port_id);
    dev->started = false;

    if (dev->attached) {
        /* Retrieve eth device data before closing it.
         * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
         */
        eth_dev = &rte_eth_devices[dev->port_id];
        remove_on_close =
            (eth_dev->data->dev_flags & RTE_ETH_DEV_CLOSE_REMOVE);
        rte_dev = eth_dev->device;

        /* Remove the eth device. */
        rte_eth_dev_close(dev->port_id);

        /* Remove this rte device and all its eth devices if flag
         * RTE_ETH_DEV_CLOSE_REMOVE is not supported (which means representors
         * are not supported), or if all the eth devices belonging to the rte
         * device are closed.
         */
        if (!remove_on_close || !netdev_dpdk_get_num_ports(rte_dev)) {
            int ret = rte_dev_remove(rte_dev);

            if (ret < 0) {
                VLOG_ERR("Device '%s' can not be detached: %s.",
                         dev->devargs, rte_strerror(-ret));
            } else {
                /* Device was closed and detached. */
                VLOG_INFO("Device '%s' has been removed and detached",
                          dev->devargs);
            }
        } else {
            /* Device was only closed. rte_dev_remove() was not called. */
            VLOG_INFO("Device '%s' has been removed", dev->devargs);
        }
    }

    netdev_dpdk_clear_xstats(dev);
    free(dev->devargs);
    common_destruct(dev);

    ovs_mutex_unlock(&dpdk_mutex);
}
/* rte_vhost_driver_unregister() can call back destroy_device(), which will
 * try to acquire 'dpdk_mutex' and possibly 'dev->mutex'. To avoid a
 * deadlock, none of the mutexes must be held while calling this function. */
static int
dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
                             char *vhost_id)
    OVS_EXCLUDED(dpdk_mutex)
    OVS_EXCLUDED(dev->mutex)
{
    return rte_vhost_driver_unregister(vhost_id);
}
static void
netdev_dpdk_vhost_destruct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    char *vhost_id;

    ovs_mutex_lock(&dpdk_mutex);

    /* Guest becomes an orphan if still attached. */
    if (netdev_dpdk_get_vid(dev) >= 0
        && !(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
        VLOG_ERR("Removing port '%s' while vhost device still attached.",
                 netdev->name);
        VLOG_ERR("To restore connectivity after re-adding of port, VM on "
                 "socket '%s' must be restarted.", dev->vhost_id);
    }

    vhost_id = dev->vhost_id;
    dev->vhost_id = NULL;
    rte_free(dev->vhost_rxq_enabled);

    common_destruct(dev);

    ovs_mutex_unlock(&dpdk_mutex);

    if (!vhost_id) {
        goto out;
    }

    if (dpdk_vhost_driver_unregister(dev, vhost_id)) {
        VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
                 netdev->name, vhost_id);
    } else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
        /* OVS server mode - remove this socket from list for deletion */
        fatal_signal_remove_file_to_unlink(vhost_id);
    }
out:
    free(vhost_id);
}
static void
netdev_dpdk_dealloc(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    rte_free(dev);
}
static void
netdev_dpdk_clear_xstats(struct netdev_dpdk *dev)
{
    /* If statistics are already allocated, we have to
     * reconfigure, as port_id could have been changed. */
    if (dev->rte_xstats_names) {
        free(dev->rte_xstats_names);
        dev->rte_xstats_names = NULL;
        dev->rte_xstats_names_size = 0;
    }
    if (dev->rte_xstats_ids) {
        free(dev->rte_xstats_ids);
        dev->rte_xstats_ids = NULL;
        dev->rte_xstats_ids_size = 0;
    }
}
static const char *
netdev_dpdk_get_xstat_name(struct netdev_dpdk *dev, uint64_t id)
{
    if (id >= dev->rte_xstats_names_size) {
        return "UNKNOWN";
    }
    return dev->rte_xstats_names[id].name;
}
static void
netdev_dpdk_configure_xstats(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int rte_xstats_len;
    bool ret;
    struct rte_eth_xstat *rte_xstats;
    uint64_t id;
    int xstats_no;
    const char *name;

    /* Retrieving all XSTATS names. If something will go wrong
     * or amount of counters will be equal 0, rte_xstats_names
     * buffer will be marked as NULL, and any further xstats
     * query won't be performed (e.g. during netdev_dpdk_get_stats
     * execution). */

    ret = false;

    if (dev->rte_xstats_names == NULL || dev->rte_xstats_ids == NULL) {
        dev->rte_xstats_names_size =
                rte_eth_xstats_get_names(dev->port_id, NULL, 0);

        if (dev->rte_xstats_names_size < 0) {
            VLOG_WARN("Cannot get XSTATS for port: "DPDK_PORT_ID_FMT,
                      dev->port_id);
            dev->rte_xstats_names_size = 0;
        } else {
            /* Reserve memory for xstats names and values */
            dev->rte_xstats_names = xcalloc(dev->rte_xstats_names_size,
                                            sizeof *dev->rte_xstats_names);

            if (dev->rte_xstats_names) {
                /* Retrieve xstats names */
                rte_xstats_len =
                        rte_eth_xstats_get_names(dev->port_id,
                                                 dev->rte_xstats_names,
                                                 dev->rte_xstats_names_size);
                if (rte_xstats_len < 0) {
                    VLOG_WARN("Cannot get XSTATS names for port: "
                              DPDK_PORT_ID_FMT, dev->port_id);
                    goto out;
                } else if (rte_xstats_len != dev->rte_xstats_names_size) {
                    VLOG_WARN("XSTATS size doesn't match for port: "
                              DPDK_PORT_ID_FMT, dev->port_id);
                    goto out;
                }

                dev->rte_xstats_ids = xcalloc(dev->rte_xstats_names_size,
                                              sizeof(uint64_t));

                /* We have to calculate number of counters */
                rte_xstats = xmalloc(rte_xstats_len * sizeof *rte_xstats);
                memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);

                /* Retrieve xstats values */
                if (rte_eth_xstats_get(dev->port_id, rte_xstats,
                                       rte_xstats_len) > 0) {
                    dev->rte_xstats_ids_size = 0;
                    xstats_no = 0;
                    for (uint32_t i = 0; i < rte_xstats_len; i++) {
                        id = rte_xstats[i].id;
                        name = netdev_dpdk_get_xstat_name(dev, id);
                        /* We need to filter out everything except
                         * dropped, error and management counters */
                        if (string_ends_with(name, "_errors") ||
                            strstr(name, "_management_") ||
                            string_ends_with(name, "_dropped")) {

                            dev->rte_xstats_ids[xstats_no] = id;
                            xstats_no++;
                        }
                    }
                    dev->rte_xstats_ids_size = xstats_no;
                    ret = true;
                } else {
                    VLOG_WARN("Can't get XSTATS IDs for port: "
                              DPDK_PORT_ID_FMT, dev->port_id);
                }

                free(rte_xstats);
            }
        }
    } else {
        /* Already configured */
        ret = true;
    }

out:
    if (!ret) {
        netdev_dpdk_clear_xstats(dev);
    }
}
static int
netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    smap_add_format(args, "requested_rx_queues", "%d", dev->requested_n_rxq);
    smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq);
    smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq);
    smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq);
    smap_add_format(args, "mtu", "%d", dev->mtu);

    if (dev->type == DPDK_DEV_ETH) {
        smap_add_format(args, "requested_rxq_descriptors", "%d",
                        dev->requested_rxq_size);
        smap_add_format(args, "configured_rxq_descriptors", "%d",
                        dev->rxq_size);
        smap_add_format(args, "requested_txq_descriptors", "%d",
                        dev->requested_txq_size);
        smap_add_format(args, "configured_txq_descriptors", "%d",
                        dev->txq_size);
        if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
            smap_add(args, "rx_csum_offload", "true");
        } else {
            smap_add(args, "rx_csum_offload", "false");
        }
        smap_add(args, "lsc_interrupt_mode",
                 dev->lsc_interrupt_mode ? "true" : "false");
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
static struct netdev_dpdk *
netdev_dpdk_lookup_by_port_id(dpdk_port_t port_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *dev;

    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        if (dev->port_id == port_id) {
            return dev;
        }
    }

    return NULL;
}
static dpdk_port_t
netdev_dpdk_get_port_by_mac(const char *mac_str)
{
    dpdk_port_t port_id;
    struct eth_addr mac, port_mac;

    if (!eth_addr_from_string(mac_str, &mac)) {
        VLOG_ERR("invalid mac: %s", mac_str);
        return DPDK_ETH_PORT_ID_INVALID;
    }

    RTE_ETH_FOREACH_DEV (port_id) {
        struct ether_addr ea;

        rte_eth_macaddr_get(port_id, &ea);
        memcpy(port_mac.ea, ea.addr_bytes, ETH_ADDR_LEN);
        if (eth_addr_equals(mac, port_mac)) {
            return port_id;
        }
    }

    return DPDK_ETH_PORT_ID_INVALID;
}
/* Return the first DPDK port id matching the devargs pattern. */
static dpdk_port_t
netdev_dpdk_get_port_by_devargs(const char *devargs)
    OVS_REQUIRES(dpdk_mutex)
{
    dpdk_port_t port_id;
    struct rte_dev_iterator iterator;

    RTE_ETH_FOREACH_MATCHING_DEV (port_id, devargs, &iterator) {
        /* If a break is done - must call rte_eth_iterator_cleanup. */
        rte_eth_iterator_cleanup(&iterator);
        break;
    }

    return port_id;
}
/*
 * Normally, a PCI id (optionally followed by a representor number)
 * is enough for identifying a specific DPDK port.
 * However, for some NICs having multiple ports sharing the same PCI
 * id, using the PCI id alone won't work.
 *
 * To fix that, here one more method is introduced: "class=eth,mac=$MAC".
 *
 * Note that the compatibility is fully kept: users can still use the
 * PCI id for adding ports (when it's enough for them).
 */
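
/* Illustrative usage of both addressing methods (hypothetical addresses):
 *   ovs-vsctl set Interface p0 options:dpdk-devargs=0000:05:00.0
 *   ovs-vsctl set Interface p0 \
 *       options:dpdk-devargs="class=eth,mac=00:11:22:33:44:55"
 */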
static dpdk_port_t
netdev_dpdk_process_devargs(struct netdev_dpdk *dev,
                            const char *devargs, char **errp)
    OVS_REQUIRES(dpdk_mutex)
{
    dpdk_port_t new_port_id;

    if (strncmp(devargs, "class=eth,mac=", 14) == 0) {
        new_port_id = netdev_dpdk_get_port_by_mac(&devargs[14]);
    } else {
        new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
        if (!rte_eth_dev_is_valid_port(new_port_id)) {
            /* Device not found in DPDK, attempt to attach it */
            if (rte_dev_probe(devargs)) {
                new_port_id = DPDK_ETH_PORT_ID_INVALID;
            } else {
                new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
                if (rte_eth_dev_is_valid_port(new_port_id)) {
                    /* Attach successful */
                    dev->attached = true;
                    VLOG_INFO("Device '%s' attached to DPDK", devargs);
                } else {
                    /* Attach unsuccessful */
                    new_port_id = DPDK_ETH_PORT_ID_INVALID;
                }
            }
        }
    }

    if (new_port_id == DPDK_ETH_PORT_ID_INVALID) {
        VLOG_WARN_BUF(errp, "Error attaching device '%s' to DPDK", devargs);
    }

    return new_port_id;
}
static void
dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args)
    OVS_REQUIRES(dev->mutex)
{
    int new_n_rxq;

    new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
    if (new_n_rxq != dev->requested_n_rxq) {
        dev->requested_n_rxq = new_n_rxq;
        netdev_request_reconfigure(&dev->up);
    }
}
static void
dpdk_process_queue_size(struct netdev *netdev, const struct smap *args,
                        const char *flag, int default_size, int *new_size)
{
    int queue_size = smap_get_int(args, flag, default_size);

    if (queue_size <= 0 || queue_size > NIC_PORT_MAX_Q_SIZE
            || !is_pow2(queue_size)) {
        queue_size = default_size;
    }

    if (queue_size != *new_size) {
        *new_size = queue_size;
        netdev_request_reconfigure(netdev);
    }
}
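
/* Illustrative example: a requested n_rxq_desc of 1000 (not a power of two)
 * or 8192 (above NIC_PORT_MAX_Q_SIZE) both fall back to the 2048 default. */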
static int
netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
                       char **errp)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    bool rx_fc_en, tx_fc_en, autoneg, lsc_interrupt_mode;
    bool flow_control_requested = true;
    enum rte_eth_fc_mode fc_mode;
    static const enum rte_eth_fc_mode fc_mode_set[2][2] = {
        {RTE_FC_NONE,     RTE_FC_TX_PAUSE},
        {RTE_FC_RX_PAUSE, RTE_FC_FULL    }
    };
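    /* The table above is indexed as fc_mode_set[tx_fc_en][rx_fc_en], as used
     * further below; e.g. enabling both rx-flow-ctrl and tx-flow-ctrl selects
     * RTE_FC_FULL, while leaving both off selects RTE_FC_NONE. */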
    const char *new_devargs;
    int err = 0;

    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);

    dpdk_set_rxq_config(dev, args);

    dpdk_process_queue_size(netdev, args, "n_rxq_desc",
                            NIC_PORT_DEFAULT_RXQ_SIZE,
                            &dev->requested_rxq_size);
    dpdk_process_queue_size(netdev, args, "n_txq_desc",
                            NIC_PORT_DEFAULT_TXQ_SIZE,
                            &dev->requested_txq_size);

    new_devargs = smap_get(args, "dpdk-devargs");

    if (dev->devargs && new_devargs && strcmp(new_devargs, dev->devargs)) {
        /* The user requested a new device. If we return error, the caller
         * will delete this netdev and try to recreate it. */
        err = EAGAIN;
        goto out;
    }

    /* dpdk-devargs is required for device configuration */
    if (new_devargs && new_devargs[0]) {
        /* Don't process dpdk-devargs if value is unchanged and port id
         * is valid */
        if (!(dev->devargs && !strcmp(dev->devargs, new_devargs)
              && rte_eth_dev_is_valid_port(dev->port_id))) {
            dpdk_port_t new_port_id = netdev_dpdk_process_devargs(dev,
                                                                  new_devargs,
                                                                  errp);
            if (!rte_eth_dev_is_valid_port(new_port_id)) {
                err = EINVAL;
            } else if (new_port_id == dev->port_id) {
                /* Already configured, do not reconfigure again */
                err = 0;
            } else {
                struct netdev_dpdk *dup_dev;

                dup_dev = netdev_dpdk_lookup_by_port_id(new_port_id);
                if (dup_dev) {
                    VLOG_WARN_BUF(errp, "'%s' is trying to use device '%s' "
                                        "which is already in use by '%s'",
                                  netdev_get_name(netdev), new_devargs,
                                  netdev_get_name(&dup_dev->up));
                    err = EADDRINUSE;
                } else {
                    int sid = rte_eth_dev_socket_id(new_port_id);

                    dev->requested_socket_id = sid < 0 ? SOCKET0 : sid;
                    dev->devargs = xstrdup(new_devargs);
                    dev->port_id = new_port_id;
                    netdev_request_reconfigure(&dev->up);
                    netdev_dpdk_clear_xstats(dev);
                    err = 0;
                }
            }
        }
    } else {
        VLOG_WARN_BUF(errp, "'%s' is missing 'options:dpdk-devargs'. "
                            "The old 'dpdk<port_id>' names are not supported",
                      netdev_get_name(netdev));
        err = EINVAL;
    }

    if (err) {
        goto out;
    }

    lsc_interrupt_mode = smap_get_bool(args, "dpdk-lsc-interrupt", false);
    if (dev->requested_lsc_interrupt_mode != lsc_interrupt_mode) {
        dev->requested_lsc_interrupt_mode = lsc_interrupt_mode;
        netdev_request_reconfigure(netdev);
    }

    rx_fc_en = smap_get_bool(args, "rx-flow-ctrl", false);
    tx_fc_en = smap_get_bool(args, "tx-flow-ctrl", false);
    autoneg = smap_get_bool(args, "flow-ctrl-autoneg", false);

    fc_mode = fc_mode_set[tx_fc_en][rx_fc_en];

    if (!smap_get(args, "rx-flow-ctrl") && !smap_get(args, "tx-flow-ctrl")
        && !smap_get(args, "flow-ctrl-autoneg")) {
        /* FIXME: User didn't ask for flow control configuration.
         * For now we'll not print a warning if flow control is not
         * supported by the DPDK port. */
        flow_control_requested = false;
    }

    /* Get the Flow control configuration. */
    err = -rte_eth_dev_flow_ctrl_get(dev->port_id, &dev->fc_conf);
    if (err) {
        if (err == ENOTSUP) {
            if (flow_control_requested) {
                VLOG_WARN("%s: Flow control is not supported.",
                          netdev_get_name(netdev));
            }
            err = 0; /* Not fatal. */
        } else {
            VLOG_WARN("%s: Cannot get flow control parameters: %s",
                      netdev_get_name(netdev), rte_strerror(err));
        }
        goto out;
    }

    if (dev->fc_conf.mode != fc_mode || autoneg != dev->fc_conf.autoneg) {
        dev->fc_conf.mode = fc_mode;
        dev->fc_conf.autoneg = autoneg;
        dpdk_eth_flow_ctrl_setup(dev);
    }

out:
    ovs_mutex_unlock(&dev->mutex);
    ovs_mutex_unlock(&dpdk_mutex);

    return err;
}
static int
netdev_dpdk_ring_set_config(struct netdev *netdev, const struct smap *args,
                            char **errp OVS_UNUSED)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    dpdk_set_rxq_config(dev, args);
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
static int
netdev_dpdk_vhost_client_set_config(struct netdev *netdev,
                                    const struct smap *args,
                                    char **errp OVS_UNUSED)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const char *path;
    int max_tx_retries, cur_max_tx_retries;

    ovs_mutex_lock(&dev->mutex);
    if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
        path = smap_get(args, "vhost-server-path");
        if (!nullable_string_is_equal(path, dev->vhost_id)) {
            free(dev->vhost_id);
            dev->vhost_id = nullable_xstrdup(path);
            /* check zero copy configuration */
            if (smap_get_bool(args, "dq-zero-copy", false)) {
                dev->vhost_driver_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
            } else {
                dev->vhost_driver_flags &= ~RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
            }
            netdev_request_reconfigure(netdev);
        }
    }

    max_tx_retries = smap_get_int(args, "tx-retries-max",
                                  VHOST_ENQ_RETRY_DEF);
    if (max_tx_retries < VHOST_ENQ_RETRY_MIN
        || max_tx_retries > VHOST_ENQ_RETRY_MAX) {
        max_tx_retries = VHOST_ENQ_RETRY_DEF;
    }
    atomic_read_relaxed(&dev->vhost_tx_retries_max, &cur_max_tx_retries);
    if (max_tx_retries != cur_max_tx_retries) {
        atomic_store_relaxed(&dev->vhost_tx_retries_max, max_tx_retries);
        VLOG_INFO("Max Tx retries for vhost device '%s' set to %d",
                  netdev_get_name(netdev), max_tx_retries);
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
static int
netdev_dpdk_get_numa_id(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    return dev->socket_id;
}
/* Sets the number of tx queues for the dpdk interface. */
static int
netdev_dpdk_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    if (dev->requested_n_txq == n_txq) {
        goto out;
    }

    dev->requested_n_txq = n_txq;
    netdev_request_reconfigure(netdev);

out:
    ovs_mutex_unlock(&dev->mutex);
    return 0;
}
static struct netdev_rxq *
netdev_dpdk_rxq_alloc(void)
{
    struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);

    if (rx) {
        return &rx->up;
    }

    return NULL;
}
static struct netdev_rxq_dpdk *
netdev_rxq_dpdk_cast(const struct netdev_rxq *rxq)
{
    return CONTAINER_OF(rxq, struct netdev_rxq_dpdk, up);
}
static int
netdev_dpdk_rxq_construct(struct netdev_rxq *rxq)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);

    ovs_mutex_lock(&dev->mutex);
    rx->port_id = dev->port_id;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
static void
netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq OVS_UNUSED)
{
}

static void
netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);

    rte_free(rx);
}
/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes ownership of
 * 'pkts', even in case of failure.
 *
 * Returns the number of packets that weren't transmitted. */
static inline int
netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
                         struct rte_mbuf **pkts, int cnt)
{
    uint32_t nb_tx = 0;

    while (nb_tx != cnt) {
        uint32_t ret;

        ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx, cnt - nb_tx);
        if (!ret) {
            break;
        }

        nb_tx += ret;
    }

    if (OVS_UNLIKELY(nb_tx != cnt)) {
        /* Free buffers, which we couldn't transmit, one at a time (each
         * packet could come from a different mempool) */
        int i;

        for (i = nb_tx; i < cnt; i++) {
            rte_pktmbuf_free(pkts[i]);
        }
    }

    return cnt - nb_tx;
}
static inline bool
netdev_dpdk_policer_pkt_handle(struct rte_meter_srtcm *meter,
                               struct rte_meter_srtcm_profile *profile,
                               struct rte_mbuf *pkt, uint64_t time)
{
    uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct ether_hdr);

    return rte_meter_srtcm_color_blind_check(meter, profile, time, pkt_len) ==
                                             e_RTE_METER_GREEN;
}
static int
netdev_dpdk_policer_run(struct rte_meter_srtcm *meter,
                        struct rte_meter_srtcm_profile *profile,
                        struct rte_mbuf **pkts, int pkt_cnt,
                        bool should_steal)
{
    int i = 0;
    int cnt = 0;
    struct rte_mbuf *pkt = NULL;
    uint64_t current_time = rte_rdtsc();

    for (i = 0; i < pkt_cnt; i++) {
        pkt = pkts[i];
        /* Handle current packet */
        if (netdev_dpdk_policer_pkt_handle(meter, profile,
                                           pkt, current_time)) {
            if (cnt != i) {
                pkts[cnt] = pkt;
            }
            cnt++;
        } else {
            if (should_steal) {
                rte_pktmbuf_free(pkt);
            }
        }
    }

    return cnt;
}
*policer
, struct rte_mbuf
**pkts
,
2127 int pkt_cnt
, bool should_steal
)
2131 rte_spinlock_lock(&policer
->policer_lock
);
2132 cnt
= netdev_dpdk_policer_run(&policer
->in_policer
, &policer
->in_prof
,
2133 pkts
, pkt_cnt
, should_steal
);
2134 rte_spinlock_unlock(&policer
->policer_lock
);
static bool
is_vhost_running(struct netdev_dpdk *dev)
{
    return (netdev_dpdk_get_vid(dev) >= 0 && dev->vhost_reconfigured);
}

static void
netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
                                          unsigned int packet_size)
{
    /* Hard-coded search for the size bucket. */
    if (packet_size < 256) {
        if (packet_size >= 128) {
            stats->rx_128_to_255_packets++;
        } else if (packet_size <= 64) {
            stats->rx_1_to_64_packets++;
        } else {
            stats->rx_65_to_127_packets++;
        }
    } else {
        if (packet_size >= 1523) {
            stats->rx_1523_to_max_packets++;
        } else if (packet_size >= 1024) {
            stats->rx_1024_to_1522_packets++;
        } else if (packet_size < 512) {
            stats->rx_256_to_511_packets++;
        } else {
            stats->rx_512_to_1023_packets++;
        }
    }
}

static void
netdev_dpdk_vhost_update_rx_counters(struct netdev_dpdk *dev,
                                     struct dp_packet **packets, int count,
                                     int qos_drops)
{
    struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
    struct netdev_stats *stats = &dev->stats;
    struct dp_packet *packet;
    unsigned int packet_size;
    int i;

    stats->rx_packets += count;
    stats->rx_dropped += qos_drops;
    for (i = 0; i < count; i++) {
        packet = packets[i];
        packet_size = dp_packet_size(packet);

        if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) {
            /* This only protects the following multicast counting from
             * too short packets, but it does not stop the packet from
             * further processing. */
            stats->rx_errors++;
            stats->rx_length_errors++;
            continue;
        }

        netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);

        struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);
        if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {
            stats->multicast++;
        }

        stats->rx_bytes += packet_size;
    }

    sw_stats->rx_qos_drops += qos_drops;
}

/*
 * The receive path for the vhost port is the TX path out from guest.
 */
static int
netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
                           struct dp_packet_batch *batch, int *qfill)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
    struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
    uint16_t nb_rx = 0;
    uint16_t qos_drops = 0;
    int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
    int vid = netdev_dpdk_get_vid(dev);

    if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
                     || !(dev->flags & NETDEV_UP))) {
        return EAGAIN;
    }

    nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp,
                                    (struct rte_mbuf **) batch->packets,
                                    NETDEV_MAX_BURST);
    if (!nb_rx) {
        return EAGAIN;
    }

    if (qfill) {
        if (nb_rx == NETDEV_MAX_BURST) {
            /* The DPDK API returns a uint32_t which often has invalid bits in
             * the upper 16-bits. Need to restrict the value to uint16_t. */
            *qfill = rte_vhost_rx_queue_count(vid, qid) & UINT16_MAX;
        } else {
            *qfill = 0;
        }
    }

    if (policer) {
        qos_drops = nb_rx;
        nb_rx = ingress_policer_run(policer,
                                    (struct rte_mbuf **) batch->packets,
                                    nb_rx, true);
        qos_drops -= nb_rx;
    }

    rte_spinlock_lock(&dev->stats_lock);
    netdev_dpdk_vhost_update_rx_counters(dev, batch->packets,
                                         nb_rx, qos_drops);
    rte_spinlock_unlock(&dev->stats_lock);

    batch->count = nb_rx;
    dp_packet_batch_init_packet_fields(batch);

    return 0;
}

static bool
netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);

    return dev->vhost_rxq_enabled[rxq->queue_id];
}

static int
netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
                     int *qfill)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
    struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
    int nb_rx;
    int dropped = 0;

    if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
        return EAGAIN;
    }

    nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
                             (struct rte_mbuf **) batch->packets,
                             NETDEV_MAX_BURST);
    if (!nb_rx) {
        return EAGAIN;
    }

    if (policer) {
        dropped = nb_rx;
        nb_rx = ingress_policer_run(policer,
                                    (struct rte_mbuf **) batch->packets,
                                    nb_rx, true);
        dropped -= nb_rx;
    }

    /* Update stats to reflect dropped packets */
    if (OVS_UNLIKELY(dropped)) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.rx_dropped += dropped;
        dev->sw_stats->rx_qos_drops += dropped;
        rte_spinlock_unlock(&dev->stats_lock);
    }

    batch->count = nb_rx;
    dp_packet_batch_init_packet_fields(batch);

    if (qfill) {
        if (nb_rx == NETDEV_MAX_BURST) {
            *qfill = rte_eth_rx_queue_count(rx->port_id, rxq->queue_id);
        } else {
            *qfill = 0;
        }
    }

    return 0;
}

static int
netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
                    int cnt, bool should_steal)
{
    struct qos_conf *qos_conf = ovsrcu_get(struct qos_conf *, &dev->qos_conf);

    if (qos_conf) {
        rte_spinlock_lock(&qos_conf->lock);
        cnt = qos_conf->ops->qos_run(qos_conf, pkts, cnt, should_steal);
        rte_spinlock_unlock(&qos_conf->lock);
    }

    return cnt;
}

static int
netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
                              int pkt_cnt)
{
    int i = 0;
    int cnt = 0;
    struct rte_mbuf *pkt;

    for (i = 0; i < pkt_cnt; i++) {
        pkt = pkts[i];
        if (OVS_UNLIKELY(pkt->pkt_len > dev->max_packet_len)) {
            VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " max_packet_len %d",
                         dev->up.name, pkt->pkt_len, dev->max_packet_len);
            rte_pktmbuf_free(pkt);
            continue;
        }

        if (OVS_UNLIKELY(i != cnt)) {
            pkts[cnt] = pkt;
        }
        cnt++;
    }

    return cnt;
}

static void
netdev_dpdk_vhost_update_tx_counters(struct netdev_dpdk *dev,
                                     struct dp_packet **packets,
                                     int attempted,
                                     struct netdev_dpdk_sw_stats *sw_stats_add)
{
    struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
    int dropped = sw_stats_add->tx_mtu_exceeded_drops +
                  sw_stats_add->tx_qos_drops +
                  sw_stats_add->tx_failure_drops;
    struct netdev_stats *stats = &dev->stats;
    int sent = attempted - dropped;
    int i;

    stats->tx_packets += sent;
    stats->tx_dropped += dropped;

    for (i = 0; i < sent; i++) {
        stats->tx_bytes += dp_packet_size(packets[i]);
    }

    sw_stats->tx_retries += sw_stats_add->tx_retries;
    sw_stats->tx_failure_drops += sw_stats_add->tx_failure_drops;
    sw_stats->tx_mtu_exceeded_drops += sw_stats_add->tx_mtu_exceeded_drops;
    sw_stats->tx_qos_drops += sw_stats_add->tx_qos_drops;
}

static void
__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
                         struct dp_packet **pkts, int cnt)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
    struct netdev_dpdk_sw_stats sw_stats_add;
    unsigned int n_packets_to_free = cnt;
    unsigned int total_packets = cnt;
    int i, retries = 0;
    int max_retries = VHOST_ENQ_RETRY_MIN;
    int vid = netdev_dpdk_get_vid(dev);

    qid = dev->tx_q[qid % netdev->n_txq].map;

    if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0
                     || !(dev->flags & NETDEV_UP))) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped += cnt;
        rte_spinlock_unlock(&dev->stats_lock);
        goto out;
    }

    if (OVS_UNLIKELY(!rte_spinlock_trylock(&dev->tx_q[qid].tx_lock))) {
        COVERAGE_INC(vhost_tx_contention);
        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
    }

    cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
    sw_stats_add.tx_mtu_exceeded_drops = total_packets - cnt;

    /* Check if QoS has been configured for the netdev. */
    sw_stats_add.tx_qos_drops = cnt;
    cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt, true);
    sw_stats_add.tx_qos_drops -= cnt;

    n_packets_to_free = cnt;

    do {
        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
        unsigned int tx_pkts;

        tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt);
        if (OVS_LIKELY(tx_pkts)) {
            /* Packets have been sent.*/
            cnt -= tx_pkts;
            /* Prepare for possible retry.*/
            cur_pkts = &cur_pkts[tx_pkts];
            if (OVS_UNLIKELY(cnt && !retries)) {
                /*
                 * Read max retries as there are packets not sent
                 * and no retries have already occurred.
                 */
                atomic_read_relaxed(&dev->vhost_tx_retries_max, &max_retries);
            }
        } else {
            /* No packets sent - do not retry.*/
            break;
        }
    } while (cnt && (retries++ < max_retries));

    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

    sw_stats_add.tx_failure_drops = cnt;
    sw_stats_add.tx_retries = MIN(retries, max_retries);

    rte_spinlock_lock(&dev->stats_lock);
    netdev_dpdk_vhost_update_tx_counters(dev, pkts, total_packets,
                                         &sw_stats_add);
    rte_spinlock_unlock(&dev->stats_lock);

out:
    for (i = 0; i < n_packets_to_free; i++) {
        dp_packet_delete(pkts[i]);
    }
}

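/* Lock contention on the vhost tx queue is tracked by the
 * 'vhost_tx_contention' coverage counter above; it can be inspected at
 * runtime with, e.g.:
 *
 *   ovs-appctl coverage/show
 */
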
/* Tx function.  Transmits packets, first copying them into freshly
 * allocated DPDK mbufs since they do not come from DPDK memory. */
static void
dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    const size_t batch_cnt = dp_packet_batch_size(batch);
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t PKT_ARRAY_SIZE = batch_cnt;
#else
    /* Sparse or MSVC doesn't like variable length array. */
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_mbuf *pkts[PKT_ARRAY_SIZE];
    struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
    uint32_t cnt = batch_cnt;
    uint32_t dropped = 0;
    uint32_t tx_failure = 0;
    uint32_t mtu_drops = 0;
    uint32_t qos_drops = 0;
    uint32_t txcnt = 0;

    if (dev->type != DPDK_DEV_VHOST) {
        /* Check if QoS has been configured for this netdev. */
        cnt = netdev_dpdk_qos_run(dev, (struct rte_mbuf **) batch->packets,
                                  batch_cnt, false);
        qos_drops = batch_cnt - cnt;
    }

    for (uint32_t i = 0; i < cnt; i++) {
        struct dp_packet *packet = batch->packets[i];
        uint32_t size = dp_packet_size(packet);

        if (OVS_UNLIKELY(size > dev->max_packet_len)) {
            VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d",
                         size, dev->max_packet_len);
            mtu_drops++;
            continue;
        }

        pkts[txcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
        if (OVS_UNLIKELY(!pkts[txcnt])) {
            dropped = cnt - i;
            break;
        }

        /* We have to do a copy for now */
        memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *),
               dp_packet_data(packet), size);
        dp_packet_set_size((struct dp_packet *)pkts[txcnt], size);

        txcnt++;
    }

    if (OVS_LIKELY(txcnt)) {
        if (dev->type == DPDK_DEV_VHOST) {
            __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) pkts,
                                     txcnt);
        } else {
            tx_failure = netdev_dpdk_eth_tx_burst(dev, qid, pkts, txcnt);
        }
    }

    dropped += qos_drops + mtu_drops + tx_failure;
    if (OVS_UNLIKELY(dropped)) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped += dropped;
        sw_stats->tx_failure_drops += tx_failure;
        sw_stats->tx_mtu_exceeded_drops += mtu_drops;
        sw_stats->tx_qos_drops += qos_drops;
        rte_spinlock_unlock(&dev->stats_lock);
    }
}

static int
netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
                       struct dp_packet_batch *batch,
                       bool concurrent_txq OVS_UNUSED)
{

    if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
        dpdk_do_tx_copy(netdev, qid, batch);
        dp_packet_delete_batch(batch, true);
    } else {
        __netdev_dpdk_vhost_send(netdev, qid, batch->packets,
                                 dp_packet_batch_size(batch));
    }
    return 0;
}

static inline void
netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
                   struct dp_packet_batch *batch,
                   bool concurrent_txq)
{
    if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
        dp_packet_delete_batch(batch, true);
        return;
    }

    if (OVS_UNLIKELY(concurrent_txq)) {
        qid = qid % dev->up.n_txq;
        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
    }

    if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
        struct netdev *netdev = &dev->up;

        dpdk_do_tx_copy(netdev, qid, batch);
        dp_packet_delete_batch(batch, true);
    } else {
        struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
        int tx_cnt, dropped;
        int tx_failure, mtu_drops, qos_drops;
        int batch_cnt = dp_packet_batch_size(batch);
        struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;

        tx_cnt = netdev_dpdk_filter_packet_len(dev, pkts, batch_cnt);
        mtu_drops = batch_cnt - tx_cnt;
        qos_drops = tx_cnt;
        tx_cnt = netdev_dpdk_qos_run(dev, pkts, tx_cnt, true);
        qos_drops -= tx_cnt;

        tx_failure = netdev_dpdk_eth_tx_burst(dev, qid, pkts, tx_cnt);

        dropped = tx_failure + mtu_drops + qos_drops;
        if (OVS_UNLIKELY(dropped)) {
            rte_spinlock_lock(&dev->stats_lock);
            dev->stats.tx_dropped += dropped;
            sw_stats->tx_failure_drops += tx_failure;
            sw_stats->tx_mtu_exceeded_drops += mtu_drops;
            sw_stats->tx_qos_drops += qos_drops;
            rte_spinlock_unlock(&dev->stats_lock);
        }
    }

    if (OVS_UNLIKELY(concurrent_txq)) {
        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
    }
}

static int
netdev_dpdk_eth_send(struct netdev *netdev, int qid,
                     struct dp_packet_batch *batch, bool concurrent_txq)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
    return 0;
}

static int
netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    if (!eth_addr_equals(dev->hwaddr, mac)) {
        dev->hwaddr = mac;
        netdev_change_seq_changed(netdev);
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    *mac = dev->hwaddr;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    *mtup = dev->mtu;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    /* XXX: Ensure that the overall frame length of the requested MTU does not
     * surpass the NETDEV_DPDK_MAX_PKT_LEN. DPDK device drivers differ in how
     * the L2 frame length is calculated for a given MTU when
     * rte_eth_dev_set_mtu(mtu) is called e.g. i40e driver includes 2 x vlan
     * headers, the em driver includes 1 x vlan header, the ixgbe driver does
     * not include vlan headers. As such we should use
     * MTU_TO_MAX_FRAME_LEN(mtu) which includes an additional 2 x vlan headers
     * (8 bytes) for comparison. This avoids a failure later with
     * rte_eth_dev_set_mtu(). This approach should be used until DPDK provides
     * a method to retrieve the upper bound MTU for a given device.
     */
    if (MTU_TO_MAX_FRAME_LEN(mtu) > NETDEV_DPDK_MAX_PKT_LEN
        || mtu < ETHER_MIN_MTU) {
        VLOG_WARN("%s: unsupported MTU %d\n", dev->up.name, mtu);
        return EINVAL;
    }

    ovs_mutex_lock(&dev->mutex);
    if (dev->requested_mtu != mtu) {
        dev->requested_mtu = mtu;
        netdev_request_reconfigure(netdev);
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);

static int
netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
                            struct netdev_stats *stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    rte_spinlock_lock(&dev->stats_lock);
    /* Supported Stats */
    stats->rx_packets = dev->stats.rx_packets;
    stats->tx_packets = dev->stats.tx_packets;
    stats->rx_dropped = dev->stats.rx_dropped;
    stats->tx_dropped = dev->stats.tx_dropped;
    stats->multicast = dev->stats.multicast;
    stats->rx_bytes = dev->stats.rx_bytes;
    stats->tx_bytes = dev->stats.tx_bytes;
    stats->rx_errors = dev->stats.rx_errors;
    stats->rx_length_errors = dev->stats.rx_length_errors;

    stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
    stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;
    stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;
    stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;
    stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;
    stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;
    stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;

    rte_spinlock_unlock(&dev->stats_lock);

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static void
netdev_dpdk_convert_xstats(struct netdev_stats *stats,
                           const struct rte_eth_xstat *xstats,
                           const struct rte_eth_xstat_name *names,
                           const unsigned int size)
{
/* DPDK XSTATS Counter names definition. */
#define DPDK_XSTATS \
    DPDK_XSTAT(multicast,               "rx_multicast_packets"            ) \
    DPDK_XSTAT(tx_multicast_packets,    "tx_multicast_packets"            ) \
    DPDK_XSTAT(rx_broadcast_packets,    "rx_broadcast_packets"            ) \
    DPDK_XSTAT(tx_broadcast_packets,    "tx_broadcast_packets"            ) \
    DPDK_XSTAT(rx_undersized_errors,    "rx_undersized_errors"            ) \
    DPDK_XSTAT(rx_oversize_errors,      "rx_oversize_errors"              ) \
    DPDK_XSTAT(rx_fragmented_errors,    "rx_fragmented_errors"            ) \
    DPDK_XSTAT(rx_jabber_errors,        "rx_jabber_errors"                ) \
    DPDK_XSTAT(rx_1_to_64_packets,      "rx_size_64_packets"              ) \
    DPDK_XSTAT(rx_65_to_127_packets,    "rx_size_65_to_127_packets"       ) \
    DPDK_XSTAT(rx_128_to_255_packets,   "rx_size_128_to_255_packets"      ) \
    DPDK_XSTAT(rx_256_to_511_packets,   "rx_size_256_to_511_packets"      ) \
    DPDK_XSTAT(rx_512_to_1023_packets,  "rx_size_512_to_1023_packets"     ) \
    DPDK_XSTAT(rx_1024_to_1522_packets, "rx_size_1024_to_1522_packets"    ) \
    DPDK_XSTAT(rx_1523_to_max_packets,  "rx_size_1523_to_max_packets"     ) \
    DPDK_XSTAT(tx_1_to_64_packets,      "tx_size_64_packets"              ) \
    DPDK_XSTAT(tx_65_to_127_packets,    "tx_size_65_to_127_packets"       ) \
    DPDK_XSTAT(tx_128_to_255_packets,   "tx_size_128_to_255_packets"      ) \
    DPDK_XSTAT(tx_256_to_511_packets,   "tx_size_256_to_511_packets"      ) \
    DPDK_XSTAT(tx_512_to_1023_packets,  "tx_size_512_to_1023_packets"     ) \
    DPDK_XSTAT(tx_1024_to_1522_packets, "tx_size_1024_to_1522_packets"    ) \
    DPDK_XSTAT(tx_1523_to_max_packets,  "tx_size_1523_to_max_packets"     )

    for (unsigned int i = 0; i < size; i++) {
#define DPDK_XSTAT(MEMBER, NAME)                 \
        if (strcmp(NAME, names[i].name) == 0) {  \
            stats->MEMBER = xstats[i].value;     \
            continue;                            \
        }
        DPDK_XSTATS;
    }
#undef DPDK_XSTAT
#undef DPDK_XSTATS
}

static int
netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_stats rte_stats;
    bool gg;

    netdev_dpdk_get_carrier(netdev, &gg);
    ovs_mutex_lock(&dev->mutex);

    struct rte_eth_xstat *rte_xstats = NULL;
    struct rte_eth_xstat_name *rte_xstats_names = NULL;
    int rte_xstats_len, rte_xstats_new_len, rte_xstats_ret;

    if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
        VLOG_ERR("Can't get ETH statistics for port: "DPDK_PORT_ID_FMT,
                 dev->port_id);
        ovs_mutex_unlock(&dev->mutex);
        return EPROTO;
    }

    /* Get length of statistics */
    rte_xstats_len = rte_eth_xstats_get_names(dev->port_id, NULL, 0);
    if (rte_xstats_len < 0) {
        VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
                  dev->port_id);
        goto out;
    }

    /* Reserve memory for xstats names and values */
    rte_xstats_names = xcalloc(rte_xstats_len, sizeof *rte_xstats_names);
    rte_xstats = xcalloc(rte_xstats_len, sizeof *rte_xstats);

    /* Retrieve xstats names */
    rte_xstats_new_len = rte_eth_xstats_get_names(dev->port_id,
                                                  rte_xstats_names,
                                                  rte_xstats_len);
    if (rte_xstats_new_len != rte_xstats_len) {
        VLOG_WARN("Cannot get XSTATS names for port: "DPDK_PORT_ID_FMT,
                  dev->port_id);
        goto out;
    }

    /* Retrieve xstats values */
    memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);
    rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
                                        rte_xstats_len);
    if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
        netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_names,
                                   rte_xstats_len);
    } else {
        VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
                  dev->port_id);
    }

out:
    free(rte_xstats);
    free(rte_xstats_names);

    stats->rx_packets = rte_stats.ipackets;
    stats->tx_packets = rte_stats.opackets;
    stats->rx_bytes = rte_stats.ibytes;
    stats->tx_bytes = rte_stats.obytes;
    stats->rx_errors = rte_stats.ierrors;
    stats->tx_errors = rte_stats.oerrors;

    rte_spinlock_lock(&dev->stats_lock);
    stats->tx_dropped = dev->stats.tx_dropped;
    stats->rx_dropped = dev->stats.rx_dropped;
    rte_spinlock_unlock(&dev->stats_lock);

    /* These are the available DPDK counters for packets not received due to
     * local resource constraints in DPDK and NIC respectively. */
    stats->rx_dropped += rte_stats.rx_nombuf + rte_stats.imissed;
    stats->rx_missed_errors = rte_stats.imissed;

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_custom_stats(const struct netdev *netdev,
                             struct netdev_custom_stats *custom_stats)
{
    uint32_t i;
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int rte_xstats_ret, sw_stats_size;

    netdev_dpdk_get_sw_custom_stats(netdev, custom_stats);

    ovs_mutex_lock(&dev->mutex);

    if (netdev_dpdk_configure_xstats(dev)) {
        uint64_t *values = xcalloc(dev->rte_xstats_ids_size,
                                   sizeof(uint64_t));

        rte_xstats_ret =
                rte_eth_xstats_get_by_id(dev->port_id, dev->rte_xstats_ids,
                                         values, dev->rte_xstats_ids_size);

        if (rte_xstats_ret > 0 &&
            rte_xstats_ret <= dev->rte_xstats_ids_size) {

            sw_stats_size = custom_stats->size;
            custom_stats->size += rte_xstats_ret;
            custom_stats->counters = xrealloc(custom_stats->counters,
                                              custom_stats->size *
                                              sizeof *custom_stats->counters);

            for (i = 0; i < rte_xstats_ret; i++) {
                ovs_strlcpy(custom_stats->counters[sw_stats_size + i].name,
                            netdev_dpdk_get_xstat_name(dev,
                                                       dev->rte_xstats_ids[i]),
                            NETDEV_CUSTOM_STATS_NAME_SIZE);
                custom_stats->counters[sw_stats_size + i].value = values[i];
            }
        } else {
            VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
                      dev->port_id);
            /* Let's clear statistics cache, so it will be
             * reconfigured */
            netdev_dpdk_clear_xstats(dev);
        }

        free(values);
    }

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_sw_custom_stats(const struct netdev *netdev,
                                struct netdev_custom_stats *custom_stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int i, n;

#define SW_CSTATS                    \
    SW_CSTAT(tx_retries)             \
    SW_CSTAT(tx_failure_drops)       \
    SW_CSTAT(tx_mtu_exceeded_drops)  \
    SW_CSTAT(tx_qos_drops)           \
    SW_CSTAT(rx_qos_drops)

#define SW_CSTAT(NAME) + 1
    custom_stats->size = SW_CSTATS;
#undef SW_CSTAT

    custom_stats->counters = xcalloc(custom_stats->size,
                                     sizeof *custom_stats->counters);

    ovs_mutex_lock(&dev->mutex);

    rte_spinlock_lock(&dev->stats_lock);
    i = 0;
#define SW_CSTAT(NAME) \
    custom_stats->counters[i++].value = dev->sw_stats->NAME;
    SW_CSTATS;
#undef SW_CSTAT
    rte_spinlock_unlock(&dev->stats_lock);

    ovs_mutex_unlock(&dev->mutex);

    i = 0;
    n = 0;
#define SW_CSTAT(NAME) \
    if (custom_stats->counters[i].value != UINT64_MAX) {                   \
        ovs_strlcpy(custom_stats->counters[n].name,                        \
                    "ovs_"#NAME, NETDEV_CUSTOM_STATS_NAME_SIZE);           \
        custom_stats->counters[n].value = custom_stats->counters[i].value; \
        n++;                                                               \
    }                                                                      \
    i++;
    SW_CSTATS;
#undef SW_CSTAT
#undef SW_CSTATS

    custom_stats->size = n;
    return 0;
}

static int
netdev_dpdk_get_features(const struct netdev *netdev,
                         enum netdev_features *current,
                         enum netdev_features *advertised,
                         enum netdev_features *supported,
                         enum netdev_features *peer)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_link link;
    uint32_t feature = 0;

    ovs_mutex_lock(&dev->mutex);
    link = dev->link;
    ovs_mutex_unlock(&dev->mutex);

    /* Match against OpenFlow defined link speed values. */
    if (link.link_duplex == ETH_LINK_FULL_DUPLEX) {
        switch (link.link_speed) {
        case ETH_SPEED_NUM_10M:
            feature |= NETDEV_F_10MB_FD;
            break;
        case ETH_SPEED_NUM_100M:
            feature |= NETDEV_F_100MB_FD;
            break;
        case ETH_SPEED_NUM_1G:
            feature |= NETDEV_F_1GB_FD;
            break;
        case ETH_SPEED_NUM_10G:
            feature |= NETDEV_F_10GB_FD;
            break;
        case ETH_SPEED_NUM_40G:
            feature |= NETDEV_F_40GB_FD;
            break;
        case ETH_SPEED_NUM_100G:
            feature |= NETDEV_F_100GB_FD;
            break;
        default:
            feature |= NETDEV_F_OTHER;
        }
    } else if (link.link_duplex == ETH_LINK_HALF_DUPLEX) {
        switch (link.link_speed) {
        case ETH_SPEED_NUM_10M:
            feature |= NETDEV_F_10MB_HD;
            break;
        case ETH_SPEED_NUM_100M:
            feature |= NETDEV_F_100MB_HD;
            break;
        case ETH_SPEED_NUM_1G:
            feature |= NETDEV_F_1GB_HD;
            break;
        default:
            feature |= NETDEV_F_OTHER;
        }
    }

    if (link.link_autoneg) {
        feature |= NETDEV_F_AUTONEG;
    }

    *current = feature;
    *advertised = *supported = *peer = 0;

    return 0;
}

static struct ingress_policer *
netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
{
    struct ingress_policer *policer = NULL;
    uint64_t rate_bytes;
    uint64_t burst_bytes;
    int err = 0;

    policer = xmalloc(sizeof *policer);
    rte_spinlock_init(&policer->policer_lock);

    /* rte_meter requires bytes so convert kbits rate and burst to bytes. */
    rate_bytes = rate * 1000ULL / 8;
    burst_bytes = burst * 1000ULL / 8;

    policer->app_srtcm_params.cir = rate_bytes;
    policer->app_srtcm_params.cbs = burst_bytes;
    policer->app_srtcm_params.ebs = 0;
    err = rte_meter_srtcm_profile_config(&policer->in_prof,
                                         &policer->app_srtcm_params);
    if (!err) {
        err = rte_meter_srtcm_config(&policer->in_policer,
                                     &policer->in_prof);
    }
    if (err) {
        VLOG_ERR("Could not create rte meter for ingress policer");
        free(policer);
        return NULL;
    }

    return policer;
}

static int
netdev_dpdk_set_policing(struct netdev *netdev, uint32_t policer_rate,
                         uint32_t policer_burst)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct ingress_policer *policer;

    /* Force to 0 if no rate specified,
     * default to 8000 kbits if burst is 0,
     * else stick with user-specified value.
     */
    policer_burst = (!policer_rate ? 0
                     : !policer_burst ? 8000
                     : policer_burst);

    ovs_mutex_lock(&dev->mutex);

    policer = ovsrcu_get_protected(struct ingress_policer *,
                                   &dev->ingress_policer);

    if (dev->policer_rate == policer_rate &&
        dev->policer_burst == policer_burst) {
        /* Assume that settings haven't changed since we last set them. */
        ovs_mutex_unlock(&dev->mutex);
        return 0;
    }

    /* Destroy any existing ingress policer for the device if one exists */
    if (policer) {
        ovsrcu_postpone(free, policer);
    }

    if (policer_rate != 0) {
        policer = netdev_dpdk_policer_construct(policer_rate, policer_burst);
    } else {
        policer = NULL;
    }
    ovsrcu_set(&dev->ingress_policer, policer);
    dev->policer_rate = policer_rate;
    dev->policer_burst = policer_burst;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

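/* Ingress policing is typically configured from the database, e.g.:
 *
 *   ovs-vsctl set Interface dpdk0 ingress_policing_rate=10000
 *   ovs-vsctl set Interface dpdk0 ingress_policing_burst=1000
 *
 * with both values in kbits, matching the kbits-to-bytes conversion done
 * in netdev_dpdk_policer_construct() above. */
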
static int
netdev_dpdk_get_ifindex(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    /* Calculate hash from the netdev name. Ensure that ifindex is a 24-bit
     * positive integer to meet RFC 2863 recommendations.
     */
    int ifindex = hash_string(netdev->name, 0) % 0xfffffe + 1;
    ovs_mutex_unlock(&dev->mutex);

    return ifindex;
}

static int
netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    check_link_status(dev);
    *carrier = dev->link.link_status;

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    if (is_vhost_running(dev)) {
        *carrier = 1;
    } else {
        *carrier = 0;
    }

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static long long int
netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    long long int carrier_resets;

    ovs_mutex_lock(&dev->mutex);
    carrier_resets = dev->link_reset_cnt;
    ovs_mutex_unlock(&dev->mutex);

    return carrier_resets;
}

static int
netdev_dpdk_set_miimon(struct netdev *netdev OVS_UNUSED,
                       long long int interval OVS_UNUSED)
{
    return 0;
}

static int
netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
                           enum netdev_flags off, enum netdev_flags on,
                           enum netdev_flags *old_flagsp)
    OVS_REQUIRES(dev->mutex)
{
    if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
        return EINVAL;
    }

    *old_flagsp = dev->flags;
    dev->flags |= on;
    dev->flags &= ~off;

    if (dev->flags == *old_flagsp) {
        return 0;
    }

    if (dev->type == DPDK_DEV_ETH) {

        if ((dev->flags ^ *old_flagsp) & NETDEV_UP) {
            int err;

            if (dev->flags & NETDEV_UP) {
                err = rte_eth_dev_set_link_up(dev->port_id);
            } else {
                err = rte_eth_dev_set_link_down(dev->port_id);
            }
            if (err == -ENOTSUP) {
                VLOG_INFO("Interface %s does not support link state "
                          "configuration", netdev_get_name(&dev->up));
            } else if (err < 0) {
                VLOG_ERR("Interface %s link change error: %s",
                         netdev_get_name(&dev->up), rte_strerror(-err));
                dev->flags = *old_flagsp;
                return -err;
            }
        }

        if (dev->flags & NETDEV_PROMISC) {
            rte_eth_promiscuous_enable(dev->port_id);
        }

        netdev_change_seq_changed(&dev->up);
    } else {
        /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is
         * running then change netdev's change_seq to trigger link state
         * update. */

        if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
            && is_vhost_running(dev)) {
            netdev_change_seq_changed(&dev->up);

            /* Clear statistics if device is getting up. */
            if (NETDEV_UP & on) {
                rte_spinlock_lock(&dev->stats_lock);
                memset(&dev->stats, 0, sizeof dev->stats);
                rte_spinlock_unlock(&dev->stats_lock);
            }
        }
    }

    return 0;
}

static int
netdev_dpdk_update_flags(struct netdev *netdev,
                         enum netdev_flags off, enum netdev_flags on,
                         enum netdev_flags *old_flagsp)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int error;

    ovs_mutex_lock(&dev->mutex);
    error = netdev_dpdk_update_flags__(dev, off, on, old_flagsp);
    ovs_mutex_unlock(&dev->mutex);

    return error;
}

static int
netdev_dpdk_vhost_user_get_status(const struct netdev *netdev,
                                  struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    bool client_mode = dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT;
    smap_add_format(args, "mode", "%s", client_mode ? "client" : "server");

    int vid = netdev_dpdk_get_vid(dev);
    if (vid < 0) {
        smap_add_format(args, "status", "disconnected");
        ovs_mutex_unlock(&dev->mutex);
        return 0;
    } else {
        smap_add_format(args, "status", "connected");
    }

    char socket_name[PATH_MAX];
    if (!rte_vhost_get_ifname(vid, socket_name, PATH_MAX)) {
        smap_add_format(args, "socket", "%s", socket_name);
    }

    uint64_t features;
    if (!rte_vhost_get_negotiated_features(vid, &features)) {
        smap_add_format(args, "features", "0x%016"PRIx64, features);
    }

    uint16_t mtu;
    if (!rte_vhost_get_mtu(vid, &mtu)) {
        smap_add_format(args, "mtu", "%d", mtu);
    }

    int numa = rte_vhost_get_numa_node(vid);
    if (numa >= 0) {
        smap_add_format(args, "numa", "%d", numa);
    }

    uint16_t vring_num = rte_vhost_get_vring_num(vid);
    if (vring_num) {
        smap_add_format(args, "num_of_vrings", "%d", vring_num);
    }

    for (int i = 0; i < vring_num; i++) {
        struct rte_vhost_vring vring;

        rte_vhost_get_vhost_vring(vid, i, &vring);
        smap_add_nocopy(args, xasprintf("vring_%d_size", i),
                        xasprintf("%d", vring.size));
    }

    ovs_mutex_unlock(&dev->mutex);
    return 0;
}

/*
 * Convert a given uint32_t link speed defined in DPDK to a string
 * equivalent.
 */
static const char *
netdev_dpdk_link_speed_to_str__(uint32_t link_speed)
{
    switch (link_speed) {
    case ETH_SPEED_NUM_10M:    return "10Mbps";
    case ETH_SPEED_NUM_100M:   return "100Mbps";
    case ETH_SPEED_NUM_1G:     return "1Gbps";
    case ETH_SPEED_NUM_2_5G:   return "2.5Gbps";
    case ETH_SPEED_NUM_5G:     return "5Gbps";
    case ETH_SPEED_NUM_10G:    return "10Gbps";
    case ETH_SPEED_NUM_20G:    return "20Gbps";
    case ETH_SPEED_NUM_25G:    return "25Gbps";
    case ETH_SPEED_NUM_40G:    return "40Gbps";
    case ETH_SPEED_NUM_50G:    return "50Gbps";
    case ETH_SPEED_NUM_56G:    return "56Gbps";
    case ETH_SPEED_NUM_100G:   return "100Gbps";
    default:                   return "Not Defined";
    }
}

static int
netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_dev_info dev_info;
    uint32_t link_speed;

    if (!rte_eth_dev_is_valid_port(dev->port_id)) {
        return ENODEV;
    }

    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);
    rte_eth_dev_info_get(dev->port_id, &dev_info);
    link_speed = dev->link.link_speed;
    ovs_mutex_unlock(&dev->mutex);
    const struct rte_bus *bus;
    const struct rte_pci_device *pci_dev;
    uint16_t vendor_id = PCI_ANY_ID;
    uint16_t device_id = PCI_ANY_ID;
    bus = rte_bus_find_by_device(dev_info.device);
    if (bus && !strcmp(bus->name, "pci")) {
        pci_dev = RTE_DEV_TO_PCI(dev_info.device);
        if (pci_dev) {
            vendor_id = pci_dev->id.vendor_id;
            device_id = pci_dev->id.device_id;
        }
    }
    ovs_mutex_unlock(&dpdk_mutex);

    smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id);
    smap_add_format(args, "numa_id", "%d",
                    rte_eth_dev_socket_id(dev->port_id));
    smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
    smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
    smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
    smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
    smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
    smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
    smap_add_format(args, "max_hash_mac_addrs", "%u",
                    dev_info.max_hash_mac_addrs);
    smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
    smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);

    /* Querying the DPDK library for iftype may be done in future, pending
     * support; cf. RFC 3635 Section 3.2.4. */
    enum { IF_TYPE_ETHERNETCSMACD = 6 };

    smap_add_format(args, "if_type", "%"PRIu32, IF_TYPE_ETHERNETCSMACD);
    smap_add_format(args, "if_descr", "%s %s", rte_version(),
                    dev_info.driver_name);
    smap_add_format(args, "pci-vendor_id", "0x%x", vendor_id);
    smap_add_format(args, "pci-device_id", "0x%x", device_id);

    /* Not all link speeds are defined in the OpenFlow specs e.g. 25 Gbps.
     * In that case the speed will not be reported as part of the usual
     * call to get_features(). Get the link speed of the device and add it
     * to the device status in an easy to read string format.
     */
    smap_add(args, "link_speed",
             netdev_dpdk_link_speed_to_str__(link_speed));

    return 0;
}

static void
netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
    OVS_REQUIRES(dev->mutex)
{
    enum netdev_flags old_flags;

    if (admin_state) {
        netdev_dpdk_update_flags__(dev, 0, NETDEV_UP, &old_flags);
    } else {
        netdev_dpdk_update_flags__(dev, NETDEV_UP, 0, &old_flags);
    }
}

static void
netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
                            const char *argv[], void *aux OVS_UNUSED)
{
    bool up;

    if (!strcasecmp(argv[argc - 1], "up")) {
        up = true;
    } else if ( !strcasecmp(argv[argc - 1], "down")) {
        up = false;
    } else {
        unixctl_command_reply_error(conn, "Invalid Admin State");
        return;
    }

    if (argc > 2) {
        struct netdev *netdev = netdev_from_name(argv[1]);

        if (netdev && is_dpdk_class(netdev->netdev_class)) {
            struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

            ovs_mutex_lock(&dev->mutex);
            netdev_dpdk_set_admin_state__(dev, up);
            ovs_mutex_unlock(&dev->mutex);

            netdev_close(netdev);
        } else {
            unixctl_command_reply_error(conn, "Not a DPDK Interface");
            netdev_close(netdev);
            return;
        }
    } else {
        struct netdev_dpdk *dev;

        ovs_mutex_lock(&dpdk_mutex);
        LIST_FOR_EACH (dev, list_node, &dpdk_list) {
            ovs_mutex_lock(&dev->mutex);
            netdev_dpdk_set_admin_state__(dev, up);
            ovs_mutex_unlock(&dev->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);
    }
    unixctl_command_reply(conn, "OK");
}

static void
netdev_dpdk_detach(struct unixctl_conn *conn, int argc OVS_UNUSED,
                   const char *argv[], void *aux OVS_UNUSED)
{
    char *response;
    dpdk_port_t port_id;
    struct netdev_dpdk *dev;
    struct rte_device *rte_dev;
    struct ds used_interfaces = DS_EMPTY_INITIALIZER;
    bool used = false;

    ovs_mutex_lock(&dpdk_mutex);

    port_id = netdev_dpdk_get_port_by_devargs(argv[1]);
    if (!rte_eth_dev_is_valid_port(port_id)) {
        response = xasprintf("Device '%s' not found in DPDK", argv[1]);
        goto error;
    }

    rte_dev = rte_eth_devices[port_id].device;
    ds_put_format(&used_interfaces,
                  "Device '%s' is being used by the following interfaces:",
                  argv[1]);

    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        /* FIXME: avoid direct access to DPDK array rte_eth_devices. */
        if (rte_eth_devices[dev->port_id].device == rte_dev
            && rte_eth_devices[dev->port_id].state != RTE_ETH_DEV_UNUSED) {
            used = true;
            ds_put_format(&used_interfaces, " %s",
                          netdev_get_name(&dev->up));
        }
    }

    if (used) {
        ds_put_cstr(&used_interfaces, ". Remove them before detaching.");
        response = ds_steal_cstr(&used_interfaces);
        ds_destroy(&used_interfaces);
        goto error;
    }
    ds_destroy(&used_interfaces);

    rte_eth_dev_close(port_id);
    if (rte_dev_remove(rte_dev) < 0) {
        response = xasprintf("Device '%s' can not be detached", argv[1]);
        goto error;
    }

    response = xasprintf("All devices shared with device '%s' "
                         "have been detached", argv[1]);

    ovs_mutex_unlock(&dpdk_mutex);
    unixctl_command_reply(conn, response);
    free(response);
    return;

error:
    ovs_mutex_unlock(&dpdk_mutex);
    unixctl_command_reply_error(conn, response);
    free(response);
}

static void
netdev_dpdk_get_mempool_info(struct unixctl_conn *conn,
                             int argc, const char *argv[],
                             void *aux OVS_UNUSED)
{
    size_t size;
    FILE *stream;
    char *response = NULL;
    struct netdev *netdev = NULL;

    if (argc == 2) {
        netdev = netdev_from_name(argv[1]);
        if (!netdev || !is_dpdk_class(netdev->netdev_class)) {
            unixctl_command_reply_error(conn, "Not a DPDK Interface");
            goto out;
        }
    }

    stream = open_memstream(&response, &size);
    if (!stream) {
        response = xasprintf("Unable to open memstream: %s.",
                             ovs_strerror(errno));
        unixctl_command_reply_error(conn, response);
        goto out;
    }

    if (netdev) {
        struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

        ovs_mutex_lock(&dev->mutex);
        ovs_mutex_lock(&dpdk_mp_mutex);

        rte_mempool_dump(stream, dev->dpdk_mp->mp);

        ovs_mutex_unlock(&dpdk_mp_mutex);
        ovs_mutex_unlock(&dev->mutex);
    } else {
        ovs_mutex_lock(&dpdk_mp_mutex);
        rte_mempool_list_dump(stream);
        ovs_mutex_unlock(&dpdk_mp_mutex);
    }

    fclose(stream);

    unixctl_command_reply(conn, response);
out:
    free(response);
    netdev_close(netdev);
}

/*
 * Set virtqueue flags so that we do not receive interrupts.
 */
static void
set_irq_status(int vid)
{
    uint32_t i;

    for (i = 0; i < rte_vhost_get_vring_num(vid); i++) {
        rte_vhost_enable_guest_notification(vid, i, 0);
    }
}

/*
 * Fixes mapping for vhost-user tx queues. Must be called after each
 * enabling/disabling of queues and n_txq modifications.
 */
static void
netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int *enabled_queues, n_enabled = 0;
    int i, k, total_txqs = dev->up.n_txq;

    enabled_queues = xcalloc(total_txqs, sizeof *enabled_queues);

    for (i = 0; i < total_txqs; i++) {
        /* Enabled queues always mapped to themselves. */
        if (dev->tx_q[i].map == i) {
            enabled_queues[n_enabled++] = i;
        }
    }

    if (n_enabled == 0 && total_txqs != 0) {
        enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
        n_enabled = 1;
    }

    k = 0;
    for (i = 0; i < total_txqs; i++) {
        if (dev->tx_q[i].map != i) {
            dev->tx_q[i].map = enabled_queues[k];
            k = (k + 1) % n_enabled;
        }
    }

    if (VLOG_IS_DBG_ENABLED()) {
        struct ds mapping = DS_EMPTY_INITIALIZER;

        ds_put_format(&mapping, "TX queue mapping for port '%s':\n",
                      netdev_get_name(&dev->up));
        for (i = 0; i < total_txqs; i++) {
            ds_put_format(&mapping, "%2d --> %2d\n", i, dev->tx_q[i].map);
        }

        VLOG_DBG("%s", ds_cstr(&mapping));
        ds_destroy(&mapping);
    }

    free(enabled_queues);
}

/*
 * A new virtio-net device is added to a vhost port.
 */
static int
new_device(int vid)
{
    struct netdev_dpdk *dev;
    bool exists = false;
    int newnode = 0;
    char ifname[IF_NAME_SZ];

    rte_vhost_get_ifname(vid, ifname, sizeof ifname);

    ovs_mutex_lock(&dpdk_mutex);
    /* Add device to the vhost port with the same name as that passed down. */
    LIST_FOR_EACH(dev, list_node, &dpdk_list) {
        ovs_mutex_lock(&dev->mutex);
        if (nullable_string_is_equal(ifname, dev->vhost_id)) {
            uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM;

            /* Get NUMA information */
            newnode = rte_vhost_get_numa_node(vid);
            if (newnode == -1) {
#ifdef VHOST_NUMA
                VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
                          ifname);
#endif
                newnode = dev->socket_id;
            }

            if (dev->requested_n_txq < qp_num
                || dev->requested_n_rxq < qp_num
                || dev->requested_socket_id != newnode) {
                dev->requested_socket_id = newnode;
                dev->requested_n_rxq = qp_num;
                dev->requested_n_txq = qp_num;
                netdev_request_reconfigure(&dev->up);
            } else {
                /* Reconfiguration not required. */
                dev->vhost_reconfigured = true;
            }

            ovsrcu_index_set(&dev->vid, vid);
            exists = true;

            /* Disable notifications. */
            set_irq_status(vid);
            netdev_change_seq_changed(&dev->up);
            ovs_mutex_unlock(&dev->mutex);
            break;
        }
        ovs_mutex_unlock(&dev->mutex);
    }
    ovs_mutex_unlock(&dpdk_mutex);

    if (!exists) {
        VLOG_INFO("vHost Device '%s' can't be added - name not found", ifname);

        return -1;
    }

    VLOG_INFO("vHost Device '%s' has been added on numa node %i",
              ifname, newnode);

    return 0;
}

/* Clears mapping for all available queues of vhost interface. */
static void
netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int i;

    for (i = 0; i < dev->up.n_txq; i++) {
        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
    }
}

/*
 * Remove a virtio-net device from the specific vhost port.  Use dev->remove
 * flag to stop any more packets from being sent or received to/from a VM and
 * ensure all currently queued packets have been sent/received before removing
 * the device.
 */
static void
destroy_device(int vid)
{
    struct netdev_dpdk *dev;
    bool exists = false;
    char ifname[IF_NAME_SZ];

    rte_vhost_get_ifname(vid, ifname, sizeof ifname);

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        if (netdev_dpdk_get_vid(dev) == vid) {

            ovs_mutex_lock(&dev->mutex);
            dev->vhost_reconfigured = false;
            ovsrcu_index_set(&dev->vid, -1);
            memset(dev->vhost_rxq_enabled, 0,
                   dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled);
            netdev_dpdk_txq_map_clear(dev);

            netdev_change_seq_changed(&dev->up);
            ovs_mutex_unlock(&dev->mutex);
            exists = true;
            break;
        }
    }

    ovs_mutex_unlock(&dpdk_mutex);

    if (exists) {
        /*
         * Wait for other threads to quiesce after setting the 'virtio_dev'
         * to NULL, before returning.
         */
        ovsrcu_synchronize();
        /*
         * As call to ovsrcu_synchronize() will end the quiescent state,
         * put thread back into quiescent state before returning.
         */
        ovsrcu_quiesce_start();
        VLOG_INFO("vHost Device '%s' has been removed", ifname);
    } else {
        VLOG_INFO("vHost Device '%s' not found", ifname);
    }
}

static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
    struct netdev_dpdk *dev;
    bool exists = false;
    int qid = queue_id / VIRTIO_QNUM;
    bool is_rx = (queue_id % VIRTIO_QNUM) == VIRTIO_TXQ;
    char ifname[IF_NAME_SZ];

    rte_vhost_get_ifname(vid, ifname, sizeof ifname);

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        ovs_mutex_lock(&dev->mutex);
        if (nullable_string_is_equal(ifname, dev->vhost_id)) {
            if (is_rx) {
                bool old_state = dev->vhost_rxq_enabled[qid];

                dev->vhost_rxq_enabled[qid] = enable != 0;
                if (old_state != dev->vhost_rxq_enabled[qid]) {
                    netdev_change_seq_changed(&dev->up);
                }
            } else {
                if (enable) {
                    dev->tx_q[qid].map = qid;
                } else {
                    dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
                }
                netdev_dpdk_remap_txqs(dev);
            }
            exists = true;
            ovs_mutex_unlock(&dev->mutex);
            break;
        }
        ovs_mutex_unlock(&dev->mutex);
    }
    ovs_mutex_unlock(&dpdk_mutex);

    if (exists) {
        VLOG_INFO("State of queue %d ( %s_qid %d ) of vhost device '%s' "
                  "changed to \'%s\'", queue_id, is_rx == true ? "rx" : "tx",
                  qid, ifname, (enable == 1) ? "enabled" : "disabled");
    } else {
        VLOG_INFO("vHost Device '%s' not found", ifname);
        return -1;
    }

    return 0;
}

static void
destroy_connection(int vid)
{
    struct netdev_dpdk *dev;
    char ifname[IF_NAME_SZ];
    bool exists = false;

    rte_vhost_get_ifname(vid, ifname, sizeof ifname);

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        ovs_mutex_lock(&dev->mutex);
        if (nullable_string_is_equal(ifname, dev->vhost_id)) {
            uint32_t qp_num = NR_QUEUE;

            if (netdev_dpdk_get_vid(dev) >= 0) {
                VLOG_ERR("Connection on socket '%s' destroyed while vhost "
                         "device still attached.", dev->vhost_id);
            }

            /* Restore the number of queue pairs to default. */
            if (dev->requested_n_txq != qp_num
                || dev->requested_n_rxq != qp_num) {
                dev->requested_n_rxq = qp_num;
                dev->requested_n_txq = qp_num;
                netdev_request_reconfigure(&dev->up);
            }
            ovs_mutex_unlock(&dev->mutex);
            exists = true;
            break;
        }
        ovs_mutex_unlock(&dev->mutex);
    }
    ovs_mutex_unlock(&dpdk_mutex);

    if (exists) {
        VLOG_INFO("vHost Device '%s' connection has been destroyed", ifname);
    } else {
        VLOG_INFO("vHost Device '%s' not found", ifname);
    }
}

/*
 * Retrieve the DPDK virtio device ID (vid) associated with a vhostuser
 * or vhostuserclient netdev.
 *
 * Returns a value greater or equal to zero for a valid vid or '-1' if
 * there is no valid vid associated. A vid of '-1' must not be used in
 * rte_vhost_ API calls.
 *
 * Once obtained and validated, a vid can be used by a PMD for multiple
 * subsequent rte_vhost API calls until the PMD quiesces. A PMD should
 * not fetch the vid again for each of a series of API calls.
 */
int
netdev_dpdk_get_vid(const struct netdev_dpdk *dev)
{
    return ovsrcu_index_get(&dev->vid);
}

struct ingress_policer *
netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
{
    return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
}

static int
netdev_dpdk_class_init(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;

    /* This function can be called for different classes.  The initialization
     * needs to be done only once */
    if (ovsthread_once_start(&once)) {
        ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
        unixctl_command_register("netdev-dpdk/set-admin-state",
                                 "[netdev] up|down", 1, 2,
                                 netdev_dpdk_set_admin_state, NULL);

        unixctl_command_register("netdev-dpdk/detach",
                                 "pci address of device", 1, 1,
                                 netdev_dpdk_detach, NULL);

        unixctl_command_register("netdev-dpdk/get-mempool-info",
                                 "[netdev]", 0, 1,
                                 netdev_dpdk_get_mempool_info, NULL);

        ovsthread_once_done(&once);
    }

    return 0;
}

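/* The commands registered above are driven through ovs-appctl, e.g.:
 *
 *   ovs-appctl netdev-dpdk/set-admin-state dpdk0 down
 *   ovs-appctl netdev-dpdk/detach 0000:01:00.0
 *   ovs-appctl netdev-dpdk/get-mempool-info dpdk0
 *
 * (The port names and the PCI address are illustrative only.) */
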
static int
dpdk_ring_create(const char dev_name[], unsigned int port_no,
                 dpdk_port_t *eth_port_id)
{
    struct dpdk_ring *ring_pair;
    char *ring_name;
    int port_id;

    ring_pair = dpdk_rte_mzalloc(sizeof *ring_pair);
    if (!ring_pair) {
        return ENOMEM;
    }

    /* XXX: Add support for multiqueue ring. */
    ring_name = xasprintf("%s_tx", dev_name);

    /* Create single producer tx ring, netdev does explicit locking. */
    ring_pair->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
                                          RING_F_SP_ENQ);
    free(ring_name);
    if (ring_pair->cring_tx == NULL) {
        rte_free(ring_pair);
        return ENOMEM;
    }

    ring_name = xasprintf("%s_rx", dev_name);

    /* Create single consumer rx ring, netdev does explicit locking. */
    ring_pair->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
                                          RING_F_SC_DEQ);
    free(ring_name);
    if (ring_pair->cring_rx == NULL) {
        rte_free(ring_pair);
        return ENOMEM;
    }

    port_id = rte_eth_from_rings(dev_name, &ring_pair->cring_rx, 1,
                                 &ring_pair->cring_tx, 1, SOCKET0);

    if (port_id < 0) {
        rte_free(ring_pair);
        return ENODEV;
    }

    ring_pair->user_port_id = port_no;
    ring_pair->eth_port_id = port_id;
    *eth_port_id = port_id;

    ovs_list_push_back(&dpdk_ring_list, &ring_pair->list_node);

    return 0;
}

static int
dpdk_ring_open(const char dev_name[], dpdk_port_t *eth_port_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct dpdk_ring *ring_pair;
    unsigned int port_no;
    int err = 0;

    /* Names always start with "dpdkr" */
    err = dpdk_dev_parse_name(dev_name, "dpdkr", &port_no);
    if (err) {
        return err;
    }

    /* Look through our list to find the device */
    LIST_FOR_EACH (ring_pair, list_node, &dpdk_ring_list) {
        if (ring_pair->user_port_id == port_no) {
            VLOG_INFO("Found dpdk ring device %s:", dev_name);
            /* Really all that is needed */
            *eth_port_id = ring_pair->eth_port_id;
            return 0;
        }
    }
    /* Need to create the device rings */
    return dpdk_ring_create(dev_name, port_no, eth_port_id);
}

static int
netdev_dpdk_ring_send(struct netdev *netdev, int qid,
                      struct dp_packet_batch *batch, bool concurrent_txq)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct dp_packet *packet;

    /* When using 'dpdkr' and sending to a DPDK ring, we want to ensure that
     * the offload fields are clear.  This is because the same mbuf may be
     * modified by the consumer of the ring and return into the datapath
     * without recalculating the RSS hash or revalidating the checksums. */
    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        dp_packet_reset_offload(packet);
    }

    netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
    return 0;
}

static int
netdev_dpdk_ring_construct(struct netdev *netdev)
{
    dpdk_port_t port_no = 0;
    int err = 0;

    VLOG_WARN_ONCE("dpdkr a.k.a. ring ports are considered deprecated.  "
                   "Please migrate to virtio-based interfaces, e.g. "
                   "dpdkvhostuserclient ports, net_virtio_user DPDK vdev.");

    ovs_mutex_lock(&dpdk_mutex);

    err = dpdk_ring_open(netdev->name, &port_no);
    if (err) {
        goto unlock_dpdk;
    }

    err = common_construct(netdev, port_no, DPDK_DEV_ETH,
                           rte_eth_dev_socket_id(port_no));

unlock_dpdk:
    ovs_mutex_unlock(&dpdk_mutex);
    return err;
}

/*
 * Initialize QoS configuration operations.
 */
static void
qos_conf_init(struct qos_conf *conf, const struct dpdk_qos_ops *ops)
{
    conf->ops = ops;
    rte_spinlock_init(&conf->lock);
}

/*
 * Search existing QoS operations in qos_ops and compare each set of
 * operations qos_name to name.  Return a dpdk_qos_ops pointer to a match,
 * else return NULL.
 */
static const struct dpdk_qos_ops *
qos_lookup_name(const char *name)
{
    const struct dpdk_qos_ops *const *opsp;

    for (opsp = qos_confs; *opsp != NULL; opsp++) {
        const struct dpdk_qos_ops *ops = *opsp;
        if (!strcmp(name, ops->qos_name)) {
            return ops;
        }
    }
    return NULL;
}

static int
netdev_dpdk_get_qos_types(const struct netdev *netdev OVS_UNUSED,
                          struct sset *types)
{
    const struct dpdk_qos_ops *const *opsp;

    for (opsp = qos_confs; *opsp != NULL; opsp++) {
        const struct dpdk_qos_ops *ops = *opsp;
        if (ops->qos_construct && ops->qos_name[0] != '\0') {
            sset_add(types, ops->qos_name);
        }
    }

    return 0;
}

static int
netdev_dpdk_get_qos(const struct netdev *netdev,
                    const char **typep, struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct qos_conf *qos_conf;
    int error = 0;

    ovs_mutex_lock(&dev->mutex);
    qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
    if (qos_conf) {
        *typep = qos_conf->ops->qos_name;
        error = (qos_conf->ops->qos_get
                 ? qos_conf->ops->qos_get(qos_conf, details): 0);
    } else {
        /* No QoS configuration set, return an empty string */
        *typep = "";
    }
    ovs_mutex_unlock(&dev->mutex);

    return error;
}

static int
netdev_dpdk_set_qos(struct netdev *netdev, const char *type,
                    const struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const struct dpdk_qos_ops *new_ops = NULL;
    struct qos_conf *qos_conf, *new_qos_conf = NULL;
    int error = 0;

    ovs_mutex_lock(&dev->mutex);

    qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);

    new_ops = qos_lookup_name(type);

    if (!new_ops || !new_ops->qos_construct) {
        new_qos_conf = NULL;
        if (type && type[0]) {
            error = EOPNOTSUPP;
        }
    } else if (qos_conf && qos_conf->ops == new_ops
               && qos_conf->ops->qos_is_equal(qos_conf, details)) {
        new_qos_conf = qos_conf;
    } else {
        error = new_ops->qos_construct(details, &new_qos_conf);
    }

    if (error) {
        VLOG_ERR("Failed to set QoS type %s on port %s: %s",
                 type, netdev->name, rte_strerror(error));
    }

    if (new_qos_conf != qos_conf) {
        ovsrcu_set(&dev->qos_conf, new_qos_conf);
        if (qos_conf) {
            ovsrcu_postpone(qos_conf->ops->qos_destruct, qos_conf);
        }
    }

    ovs_mutex_unlock(&dev->mutex);

    return error;
}

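/* For reference, an egress-policer QoS configuration is typically applied
 * with something like:
 *
 *   ovs-vsctl set port dpdk0 qos=@qos -- --id=@qos create qos \
 *       type=egress-policer other-config:cir=46000000 \
 *       other-config:cbs=2048
 *
 * (The port name and values are illustrative only.) */
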
/* egress-policer details */

struct egress_policer {
    struct qos_conf qos_conf;
    struct rte_meter_srtcm_params app_srtcm_params;
    struct rte_meter_srtcm egress_meter;
    struct rte_meter_srtcm_profile egress_prof;
};

static void
egress_policer_details_to_param(const struct smap *details,
                                struct rte_meter_srtcm_params *params)
{
    memset(params, 0, sizeof *params);
    params->cir = smap_get_ullong(details, "cir", 0);
    params->cbs = smap_get_ullong(details, "cbs", 0);
    params->ebs = 0;
}

static int
egress_policer_qos_construct(const struct smap *details,
                             struct qos_conf **conf)
{
    struct egress_policer *policer;
    int err = 0;

    policer = xmalloc(sizeof *policer);
    qos_conf_init(&policer->qos_conf, &egress_policer_ops);
    egress_policer_details_to_param(details, &policer->app_srtcm_params);
    err = rte_meter_srtcm_profile_config(&policer->egress_prof,
                                         &policer->app_srtcm_params);
    if (!err) {
        err = rte_meter_srtcm_config(&policer->egress_meter,
                                     &policer->egress_prof);
    }

    if (!err) {
        *conf = &policer->qos_conf;
    } else {
        VLOG_ERR("Could not create rte meter for egress policer");
        free(policer);
        *conf = NULL;
        err = -err;
    }

    return err;
}

static void
egress_policer_qos_destruct(struct qos_conf *conf)
{
    struct egress_policer *policer = CONTAINER_OF(conf, struct egress_policer,
                                                  qos_conf);
    free(policer);
}

static int
egress_policer_qos_get(const struct qos_conf *conf, struct smap *details)
{
    struct egress_policer *policer =
        CONTAINER_OF(conf, struct egress_policer, qos_conf);

    smap_add_format(details, "cir", "%"PRIu64, policer->app_srtcm_params.cir);
    smap_add_format(details, "cbs", "%"PRIu64, policer->app_srtcm_params.cbs);

    return 0;
}

static bool
egress_policer_qos_is_equal(const struct qos_conf *conf,
                            const struct smap *details)
{
    struct egress_policer *policer =
        CONTAINER_OF(conf, struct egress_policer, qos_conf);
    struct rte_meter_srtcm_params params;

    egress_policer_details_to_param(details, &params);

    return !memcmp(&params, &policer->app_srtcm_params, sizeof params);
}

static int
egress_policer_run(struct qos_conf *conf, struct rte_mbuf **pkts, int pkt_cnt,
                   bool should_steal)
{
    int cnt = 0;
    struct egress_policer *policer =
        CONTAINER_OF(conf, struct egress_policer, qos_conf);

    cnt = netdev_dpdk_policer_run(&policer->egress_meter,
                                  &policer->egress_prof, pkts,
                                  pkt_cnt, should_steal);

    return cnt;
}

static const struct dpdk_qos_ops egress_policer_ops = {
    "egress-policer",    /* qos_name */
    egress_policer_qos_construct,
    egress_policer_qos_destruct,
    egress_policer_qos_get,
    egress_policer_qos_is_equal,
    egress_policer_run
};

static int
netdev_dpdk_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int err = 0;

    ovs_mutex_lock(&dev->mutex);

    if (netdev->n_txq == dev->requested_n_txq
        && netdev->n_rxq == dev->requested_n_rxq
        && dev->mtu == dev->requested_mtu
        && dev->lsc_interrupt_mode == dev->requested_lsc_interrupt_mode
        && dev->rxq_size == dev->requested_rxq_size
        && dev->txq_size == dev->requested_txq_size
        && dev->socket_id == dev->requested_socket_id
        && dev->started) {
        /* Reconfiguration is unnecessary */
        goto out;
    }

    rte_eth_dev_stop(dev->port_id);
    dev->started = false;

    err = netdev_dpdk_mempool_configure(dev);
    if (err && err != EEXIST) {
        goto out;
    }

    dev->lsc_interrupt_mode = dev->requested_lsc_interrupt_mode;

    netdev->n_txq = dev->requested_n_txq;
    netdev->n_rxq = dev->requested_n_rxq;

    dev->rxq_size = dev->requested_rxq_size;
    dev->txq_size = dev->requested_txq_size;

    rte_free(dev->tx_q);
    err = dpdk_eth_dev_init(dev);
    dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
    if (!dev->tx_q) {
        err = ENOMEM;
    }

    netdev_change_seq_changed(netdev);

out:
    ovs_mutex_unlock(&dev->mutex);
    return err;
}

static int
dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int err;

    dev->up.n_txq = dev->requested_n_txq;
    dev->up.n_rxq = dev->requested_n_rxq;

    /* Always keep RX queue 0 enabled for implementations that won't
     * report vring states. */
    dev->vhost_rxq_enabled[0] = true;

    /* Enable TX queue 0 by default if it wasn't disabled. */
    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
        dev->tx_q[0].map = 0;
    }

    netdev_dpdk_remap_txqs(dev);

    err = netdev_dpdk_mempool_configure(dev);
    if (!err) {
        /* A new mempool was created or re-used. */
        netdev_change_seq_changed(&dev->up);
    } else if (err != EEXIST) {
        return err;
    }
    if (netdev_dpdk_get_vid(dev) >= 0) {
        if (dev->vhost_reconfigured == false) {
            dev->vhost_reconfigured = true;
            /* Carrier status may need updating. */
            netdev_change_seq_changed(&dev->up);
        }
    }

    return 0;
}

static int
netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int err;

    ovs_mutex_lock(&dev->mutex);
    err = dpdk_vhost_reconfigure_helper(dev);
    ovs_mutex_unlock(&dev->mutex);

    return err;
}

static int
netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int err;
    uint64_t vhost_flags = 0;
    bool zc_enabled;

    ovs_mutex_lock(&dev->mutex);

    /* Configure vHost client mode if requested and if the following criteria
     * are met:
     *  1. Device hasn't been registered yet.
     *  2. A path has been specified.
     */
    if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && dev->vhost_id) {
        /* Register client-mode device. */
        vhost_flags |= RTE_VHOST_USER_CLIENT;

        /* Enable IOMMU support, if explicitly requested. */
        if (dpdk_vhost_iommu_enabled()) {
            vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
        }

        /* Enable POSTCOPY support, if explicitly requested. */
        if (dpdk_vhost_postcopy_enabled()) {
            vhost_flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
        }

        zc_enabled = dev->vhost_driver_flags
                     & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
        /* Enable zero copy flag, if requested. */
        if (zc_enabled) {
            vhost_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
        }

        err = rte_vhost_driver_register(dev->vhost_id, vhost_flags);
        if (err) {
            VLOG_ERR("vhost-user device setup failure for device %s\n",
                     dev->vhost_id);
            goto unlock;
        } else {
            /* Configuration successful. */
            dev->vhost_driver_flags |= vhost_flags;
            VLOG_INFO("vHost User device '%s' created in 'client' mode, "
                      "using client socket '%s'",
                      dev->up.name, dev->vhost_id);
            if (zc_enabled) {
                VLOG_INFO("Zero copy enabled for vHost port %s",
                          dev->up.name);
            }
        }

        err = rte_vhost_driver_callback_register(dev->vhost_id,
                                                 &virtio_net_device_ops);
        if (err) {
            VLOG_ERR("rte_vhost_driver_callback_register failed for "
                     "vhost user client port: %s\n", dev->up.name);
            goto unlock;
        }

        err = rte_vhost_driver_disable_features(dev->vhost_id,
                                    1ULL << VIRTIO_NET_F_HOST_TSO4
                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6
                                    | 1ULL << VIRTIO_NET_F_CSUM);
        if (err) {
            VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
                     "client port: %s\n", dev->up.name);
            goto unlock;
        }

        err = rte_vhost_driver_start(dev->vhost_id);
        if (err) {
            VLOG_ERR("rte_vhost_driver_start failed for vhost user "
                     "client port: %s\n", dev->up.name);
            goto unlock;
        }
    }

    err = dpdk_vhost_reconfigure_helper(dev);

unlock:
    ovs_mutex_unlock(&dev->mutex);

    return err;
}

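/* Illustrative only (names and paths are hypothetical): a port that takes
 * the registration path above is typically created with:
 *
 *   ovs-vsctl add-port br0 vhost-client-1 \
 *       -- set Interface vhost-client-1 type=dpdkvhostuserclient \
 *          options:vhost-server-path=/tmp/sock0
 *
 * where 'vhost-server-path' supplies the 'dev->vhost_id' passed to
 * rte_vhost_driver_register(). */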
bool
netdev_dpdk_flow_api_supported(struct netdev *netdev)
{
    struct netdev_dpdk *dev;
    bool ret = false;

    if (!is_dpdk_class(netdev->netdev_class)) {
        goto out;
    }

    dev = netdev_dpdk_cast(netdev);
    ovs_mutex_lock(&dev->mutex);
    if (dev->type == DPDK_DEV_ETH) {
        /* TODO: Check if we are able to offload some minimal flow. */
        ret = true;
    }
    ovs_mutex_unlock(&dev->mutex);
out:
    return ret;
}

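/* Thin, mutex-protected wrappers around the DPDK rte_flow API, used by the
 * flow offload layer to destroy and create hardware flow rules. */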
int
netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
                             struct rte_flow *rte_flow,
                             struct rte_flow_error *error)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int ret;

    ovs_mutex_lock(&dev->mutex);
    ret = rte_flow_destroy(dev->port_id, rte_flow, error);
    ovs_mutex_unlock(&dev->mutex);
    return ret;
}

struct rte_flow *
netdev_dpdk_rte_flow_create(struct netdev *netdev,
                            const struct rte_flow_attr *attr,
                            const struct rte_flow_item *items,
                            const struct rte_flow_action *actions,
                            struct rte_flow_error *error)
{
    struct rte_flow *flow;
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    flow = rte_flow_create(dev->port_id, attr, items, actions, error);
    ovs_mutex_unlock(&dev->mutex);
    return flow;
}

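/* Minimal caller-side sketch (illustrative, not part of this file): match
 * all ingress Ethernet traffic and drop it in hardware.
 *
 *   struct rte_flow_attr attr = { .ingress = 1 };
 *   struct rte_flow_item pattern[] = {
 *       { .type = RTE_FLOW_ITEM_TYPE_ETH },
 *       { .type = RTE_FLOW_ITEM_TYPE_END },
 *   };
 *   struct rte_flow_action actions[] = {
 *       { .type = RTE_FLOW_ACTION_TYPE_DROP },
 *       { .type = RTE_FLOW_ACTION_TYPE_END },
 *   };
 *   struct rte_flow_error error;
 *   struct rte_flow *flow = netdev_dpdk_rte_flow_create(netdev, &attr,
 *                                                       pattern, actions,
 *                                                       &error);
 *   if (!flow) {
 *       VLOG_ERR("rte_flow creation failed: %s", error.message);
 *   }
 */
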
#define NETDEV_DPDK_CLASS_COMMON                            \
    .is_pmd = true,                                         \
    .alloc = netdev_dpdk_alloc,                             \
    .dealloc = netdev_dpdk_dealloc,                         \
    .get_config = netdev_dpdk_get_config,                   \
    .get_numa_id = netdev_dpdk_get_numa_id,                 \
    .set_etheraddr = netdev_dpdk_set_etheraddr,             \
    .get_etheraddr = netdev_dpdk_get_etheraddr,             \
    .get_mtu = netdev_dpdk_get_mtu,                         \
    .set_mtu = netdev_dpdk_set_mtu,                         \
    .get_ifindex = netdev_dpdk_get_ifindex,                 \
    .get_carrier_resets = netdev_dpdk_get_carrier_resets,   \
    .set_miimon_interval = netdev_dpdk_set_miimon,          \
    .set_policing = netdev_dpdk_set_policing,               \
    .get_qos_types = netdev_dpdk_get_qos_types,             \
    .get_qos = netdev_dpdk_get_qos,                         \
    .set_qos = netdev_dpdk_set_qos,                         \
    .update_flags = netdev_dpdk_update_flags,               \
    .rxq_alloc = netdev_dpdk_rxq_alloc,                     \
    .rxq_construct = netdev_dpdk_rxq_construct,             \
    .rxq_destruct = netdev_dpdk_rxq_destruct,               \
    .rxq_dealloc = netdev_dpdk_rxq_dealloc

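/* NETDEV_DPDK_CLASS_BASE extends NETDEV_DPDK_CLASS_COMMON with callbacks
 * shared by the physical ("dpdk") and ring ("dpdkr") classes; the vhost
 * classes below use NETDEV_DPDK_CLASS_COMMON directly and supply their own
 * rx/tx, stats and reconfigure callbacks. */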
#define NETDEV_DPDK_CLASS_BASE                          \
    NETDEV_DPDK_CLASS_COMMON,                           \
    .init = netdev_dpdk_class_init,                     \
    .destruct = netdev_dpdk_destruct,                   \
    .set_tx_multiq = netdev_dpdk_set_tx_multiq,         \
    .get_carrier = netdev_dpdk_get_carrier,             \
    .get_stats = netdev_dpdk_get_stats,                 \
    .get_custom_stats = netdev_dpdk_get_custom_stats,   \
    .get_features = netdev_dpdk_get_features,           \
    .get_status = netdev_dpdk_get_status,               \
    .reconfigure = netdev_dpdk_reconfigure,             \
    .rxq_recv = netdev_dpdk_rxq_recv

static const struct netdev_class dpdk_class = {
    .type = "dpdk",
    NETDEV_DPDK_CLASS_BASE,
    .construct = netdev_dpdk_construct,
    .set_config = netdev_dpdk_set_config,
    .send = netdev_dpdk_eth_send,
};

static const struct netdev_class dpdk_ring_class = {
    .type = "dpdkr",
    NETDEV_DPDK_CLASS_BASE,
    .construct = netdev_dpdk_ring_construct,
    .set_config = netdev_dpdk_ring_set_config,
    .send = netdev_dpdk_ring_send,
};

static const struct netdev_class dpdk_vhost_class = {
    .type = "dpdkvhostuser",
    NETDEV_DPDK_CLASS_COMMON,
    .construct = netdev_dpdk_vhost_construct,
    .destruct = netdev_dpdk_vhost_destruct,
    .send = netdev_dpdk_vhost_send,
    .get_carrier = netdev_dpdk_vhost_get_carrier,
    .get_stats = netdev_dpdk_vhost_get_stats,
    .get_custom_stats = netdev_dpdk_get_sw_custom_stats,
    .get_status = netdev_dpdk_vhost_user_get_status,
    .reconfigure = netdev_dpdk_vhost_reconfigure,
    .rxq_recv = netdev_dpdk_vhost_rxq_recv,
    .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
};

static const struct netdev_class dpdk_vhost_client_class = {
    .type = "dpdkvhostuserclient",
    NETDEV_DPDK_CLASS_COMMON,
    .construct = netdev_dpdk_vhost_client_construct,
    .destruct = netdev_dpdk_vhost_destruct,
    .set_config = netdev_dpdk_vhost_client_set_config,
    .send = netdev_dpdk_vhost_send,
    .get_carrier = netdev_dpdk_vhost_get_carrier,
    .get_stats = netdev_dpdk_vhost_get_stats,
    .get_custom_stats = netdev_dpdk_get_sw_custom_stats,
    .get_status = netdev_dpdk_vhost_user_get_status,
    .reconfigure = netdev_dpdk_vhost_client_reconfigure,
    .rxq_recv = netdev_dpdk_vhost_rxq_recv,
    .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
};

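/* Registers all DPDK netdev providers with the netdev layer so that ports
 * of these types can be created from the database. */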
void
netdev_dpdk_register(void)
{
    netdev_register_provider(&dpdk_class);
    netdev_register_provider(&dpdk_ring_class);
    netdev_register_provider(&dpdk_vhost_class);
    netdev_register_provider(&dpdk_vhost_client_class);
}