2 * Copyright (c) 2014, 2015, 2016 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
30 #include <sys/types.h>
36 #include "dp-packet.h"
37 #include "dpif-netdev.h"
38 #include "fatal-signal.h"
39 #include "netdev-dpdk.h"
40 #include "netdev-provider.h"
41 #include "netdev-vport.h"
43 #include "openvswitch/dynamic-string.h"
44 #include "openvswitch/list.h"
45 #include "openvswitch/ofp-print.h"
46 #include "openvswitch/vlog.h"
48 #include "ovs-thread.h"
51 #include "openvswitch/shash.h"
54 #include "unaligned.h"
58 #include "rte_config.h"
60 #include "rte_meter.h"
61 #include "rte_virtio_net.h"
63 VLOG_DEFINE_THIS_MODULE(dpdk
);
64 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
66 #define DPDK_PORT_WATCHDOG_INTERVAL 5
68 #define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
69 #define OVS_VPORT_DPDK "ovs_dpdk"
72 * need to reserve tons of extra space in the mbufs so we can align the
73 * DMA addresses to 4KB.
74 * The minimum mbuf size is limited to avoid scatter behaviour and drop in
75 * performance for standard Ethernet MTU.
77 #define ETHER_HDR_MAX_LEN (ETHER_HDR_LEN + ETHER_CRC_LEN + (2 * VLAN_HEADER_LEN))
78 #define MTU_TO_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
79 #define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN)
80 #define FRAME_LEN_TO_MTU(frame_len) ((frame_len)- ETHER_HDR_LEN - ETHER_CRC_LEN)
81 #define MBUF_SIZE(mtu) ( MTU_TO_MAX_FRAME_LEN(mtu) \
82 + sizeof(struct dp_packet) \
83 + RTE_PKTMBUF_HEADROOM)
84 #define NETDEV_DPDK_MBUF_ALIGN 1024
86 /* Max and min number of packets in the mempool. OVS tries to allocate a
87 * mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
88 * enough hugepages) we keep halving the number until the allocation succeeds
89 * or we reach MIN_NB_MBUF */
91 #define MAX_NB_MBUF (4096 * 64)
92 #define MIN_NB_MBUF (4096 * 4)
93 #define MP_CACHE_SZ RTE_MEMPOOL_CACHE_MAX_SIZE
95 /* MAX_NB_MBUF can be divided by 2 many times, until MIN_NB_MBUF */
96 BUILD_ASSERT_DECL(MAX_NB_MBUF
% ROUND_DOWN_POW2(MAX_NB_MBUF
/MIN_NB_MBUF
) == 0);
98 /* The smallest possible NB_MBUF that we're going to try should be a multiple
99 * of MP_CACHE_SZ. This is advised by DPDK documentation. */
100 BUILD_ASSERT_DECL((MAX_NB_MBUF
/ ROUND_DOWN_POW2(MAX_NB_MBUF
/MIN_NB_MBUF
))
104 * DPDK XSTATS Counter names definition
106 #define XSTAT_RX_64_PACKETS "rx_size_64_packets"
107 #define XSTAT_RX_65_TO_127_PACKETS "rx_size_65_to_127_packets"
108 #define XSTAT_RX_128_TO_255_PACKETS "rx_size_128_to_255_packets"
109 #define XSTAT_RX_256_TO_511_PACKETS "rx_size_256_to_511_packets"
110 #define XSTAT_RX_512_TO_1023_PACKETS "rx_size_512_to_1023_packets"
111 #define XSTAT_RX_1024_TO_1522_PACKETS "rx_size_1024_to_1522_packets"
112 #define XSTAT_RX_1523_TO_MAX_PACKETS "rx_size_1523_to_max_packets"
114 #define XSTAT_TX_64_PACKETS "tx_size_64_packets"
115 #define XSTAT_TX_65_TO_127_PACKETS "tx_size_65_to_127_packets"
116 #define XSTAT_TX_128_TO_255_PACKETS "tx_size_128_to_255_packets"
117 #define XSTAT_TX_256_TO_511_PACKETS "tx_size_256_to_511_packets"
118 #define XSTAT_TX_512_TO_1023_PACKETS "tx_size_512_to_1023_packets"
119 #define XSTAT_TX_1024_TO_1522_PACKETS "tx_size_1024_to_1522_packets"
120 #define XSTAT_TX_1523_TO_MAX_PACKETS "tx_size_1523_to_max_packets"
122 #define XSTAT_TX_MULTICAST_PACKETS "tx_multicast_packets"
123 #define XSTAT_RX_BROADCAST_PACKETS "rx_broadcast_packets"
124 #define XSTAT_TX_BROADCAST_PACKETS "tx_broadcast_packets"
125 #define XSTAT_RX_UNDERSIZED_ERRORS "rx_undersized_errors"
126 #define XSTAT_RX_OVERSIZE_ERRORS "rx_oversize_errors"
127 #define XSTAT_RX_FRAGMENTED_ERRORS "rx_fragmented_errors"
128 #define XSTAT_RX_JABBER_ERRORS "rx_jabber_errors"
132 #define NIC_PORT_RX_Q_SIZE 2048 /* Size of Physical NIC RX Queue, Max (n+32<=4096)*/
133 #define NIC_PORT_TX_Q_SIZE 2048 /* Size of Physical NIC TX Queue, Max (n+32<=4096)*/
135 #define OVS_VHOST_MAX_QUEUE_NUM 1024 /* Maximum number of vHost TX queues. */
136 #define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
137 #define OVS_VHOST_QUEUE_DISABLED (-2) /* Queue was disabled by guest and not
138 * yet mapped to another queue. */
141 static char *cuse_dev_name
= NULL
; /* Character device cuse_dev_name. */
143 static char *vhost_sock_dir
= NULL
; /* Location of vhost-user sockets */
145 #define VHOST_ENQ_RETRY_NUM 8
147 static const struct rte_eth_conf port_conf
= {
149 .mq_mode
= ETH_MQ_RX_RSS
,
151 .header_split
= 0, /* Header Split disabled */
152 .hw_ip_checksum
= 0, /* IP checksum offload disabled */
153 .hw_vlan_filter
= 0, /* VLAN filtering disabled */
154 .jumbo_frame
= 0, /* Jumbo Frame Support disabled */
160 .rss_hf
= ETH_RSS_IP
| ETH_RSS_UDP
| ETH_RSS_TCP
,
164 .mq_mode
= ETH_MQ_TX_NONE
,
168 enum { DPDK_RING_SIZE
= 256 };
169 BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE
));
170 enum { DRAIN_TSC
= 200000ULL };
177 static int rte_eal_init_ret
= ENODEV
;
179 static struct ovs_mutex dpdk_mutex
= OVS_MUTEX_INITIALIZER
;
181 /* Quality of Service */
183 /* An instance of a QoS configuration. Always associated with a particular
186 * Each QoS implementation subclasses this with whatever additional data it
190 const struct dpdk_qos_ops
*ops
;
193 /* A particular implementation of dpdk QoS operations.
195 * The functions below return 0 if successful or a positive errno value on
196 * failure, except where otherwise noted. All of them must be provided, except
197 * where otherwise noted.
199 struct dpdk_qos_ops
{
201 /* Name of the QoS type */
202 const char *qos_name
;
204 /* Called to construct the QoS implementation on 'netdev'. The
205 * implementation should make the appropriate calls to configure QoS
206 * according to 'details'. The implementation may assume that any current
207 * QoS configuration already installed should be destroyed before
208 * constructing the new configuration.
210 * The contents of 'details' should be documented as valid for 'ovs_name'
211 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
212 * (which is built as ovs-vswitchd.conf.db(8)).
214 * This function must return 0 if and only if it sets 'netdev->qos_conf'
215 * to an initialized 'struct qos_conf'.
217 * For all QoS implementations it should always be non-null.
219 int (*qos_construct
)(struct netdev
*netdev
, const struct smap
*details
);
221 /* Destroys the data structures allocated by the implementation as part of
224 * For all QoS implementations it should always be non-null.
226 void (*qos_destruct
)(struct netdev
*netdev
, struct qos_conf
*conf
);
228 /* Retrieves details of 'netdev->qos_conf' configuration into 'details'.
230 * The contents of 'details' should be documented as valid for 'ovs_name'
231 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
232 * (which is built as ovs-vswitchd.conf.db(8)).
234 int (*qos_get
)(const struct netdev
*netdev
, struct smap
*details
);
236 /* Reconfigures 'netdev->qos_conf' according to 'details', performing any
237 * required calls to complete the reconfiguration.
239 * The contents of 'details' should be documented as valid for 'ovs_name'
240 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
241 * (which is built as ovs-vswitchd.conf.db(8)).
243 * This function may be null if 'qos_conf' is not configurable.
245 int (*qos_set
)(struct netdev
*netdev
, const struct smap
*details
);
247 /* Modify an array of rte_mbufs. The modification is specific to
248 * each qos implementation.
250 * The function should take and array of mbufs and an int representing
251 * the current number of mbufs present in the array.
253 * After the function has performed a qos modification to the array of
254 * mbufs it returns an int representing the number of mbufs now present in
255 * the array. This value is can then be passed to the port send function
256 * along with the modified array for transmission.
258 * For all QoS implementations it should always be non-null.
260 int (*qos_run
)(struct netdev
*netdev
, struct rte_mbuf
**pkts
,
264 /* dpdk_qos_ops for each type of user space QoS implementation */
265 static const struct dpdk_qos_ops egress_policer_ops
;
268 * Array of dpdk_qos_ops, contains pointer to all supported QoS
271 static const struct dpdk_qos_ops
*const qos_confs
[] = {
276 /* Contains all 'struct dpdk_dev's. */
277 static struct ovs_list dpdk_list
OVS_GUARDED_BY(dpdk_mutex
)
278 = OVS_LIST_INITIALIZER(&dpdk_list
);
280 static struct ovs_list dpdk_mp_list
OVS_GUARDED_BY(dpdk_mutex
)
281 = OVS_LIST_INITIALIZER(&dpdk_mp_list
);
283 /* This mutex must be used by non pmd threads when allocating or freeing
284 * mbufs through mempools. */
285 static struct ovs_mutex nonpmd_mempool_mutex
= OVS_MUTEX_INITIALIZER
;
288 struct rte_mempool
*mp
;
292 struct ovs_list list_node
OVS_GUARDED_BY(dpdk_mutex
);
295 /* There should be one 'struct dpdk_tx_queue' created for
297 struct dpdk_tx_queue
{
298 rte_spinlock_t tx_lock
; /* Protects the members and the NIC queue
299 * from concurrent access. It is used only
300 * if the queue is shared among different
301 * pmd threads (see 'txq_needs_locking'). */
302 int map
; /* Mapping of configured vhost-user queues
303 * to enabled by guest. */
306 /* dpdk has no way to remove dpdk ring ethernet devices
307 so we have to keep them around once they've been created
310 static struct ovs_list dpdk_ring_list
OVS_GUARDED_BY(dpdk_mutex
)
311 = OVS_LIST_INITIALIZER(&dpdk_ring_list
);
314 /* For the client rings */
315 struct rte_ring
*cring_tx
;
316 struct rte_ring
*cring_rx
;
317 unsigned int user_port_id
; /* User given port no, parsed from port name */
318 int eth_port_id
; /* ethernet device port id */
319 struct ovs_list list_node
OVS_GUARDED_BY(dpdk_mutex
);
322 struct ingress_policer
{
323 struct rte_meter_srtcm_params app_srtcm_params
;
324 struct rte_meter_srtcm in_policer
;
325 rte_spinlock_t policer_lock
;
332 enum dpdk_dev_type type
;
334 struct dpdk_tx_queue
*tx_q
;
336 struct ovs_mutex mutex
OVS_ACQ_AFTER(dpdk_mutex
);
338 struct dpdk_mp
*dpdk_mp
;
342 struct netdev_stats stats
;
344 rte_spinlock_t stats_lock
;
346 struct eth_addr hwaddr
;
347 enum netdev_flags flags
;
349 struct rte_eth_link link
;
352 /* Caller of netdev_send() might want to use more txqs than the device has.
353 * For physical NICs, if the 'requested_n_txq' less or equal to 'up.n_txq',
354 * 'txq_needs_locking' is false, otherwise it is true and we will take a
355 * spinlock on transmission. For vhost devices, 'requested_n_txq' is
357 bool txq_needs_locking
;
359 /* virtio-net structure for vhost device */
360 OVSRCU_TYPE(struct virtio_net
*) virtio_dev
;
362 /* Identifier used to distinguish vhost devices from each other */
363 char vhost_id
[PATH_MAX
];
366 struct ovs_list list_node
OVS_GUARDED_BY(dpdk_mutex
);
368 /* QoS configuration and lock for the device */
369 struct qos_conf
*qos_conf
;
370 rte_spinlock_t qos_lock
;
372 /* The following properties cannot be changed when a device is running,
373 * so we remember the request and update them next time
374 * netdev_dpdk*_reconfigure() is called */
378 /* Socket ID detected when vHost device is brought up */
379 int requested_socket_id
;
381 /* Ingress Policer */
382 OVSRCU_TYPE(struct ingress_policer
*) ingress_policer
;
383 uint32_t policer_rate
;
384 uint32_t policer_burst
;
387 struct netdev_rxq_dpdk
{
388 struct netdev_rxq up
;
392 static bool dpdk_thread_is_pmd(void);
394 static int netdev_dpdk_construct(struct netdev
*);
396 struct virtio_net
* netdev_dpdk_get_virtio(const struct netdev_dpdk
*dev
);
398 struct ingress_policer
*
399 netdev_dpdk_get_ingress_policer(const struct netdev_dpdk
*dev
);
402 is_dpdk_class(const struct netdev_class
*class)
404 return class->construct
== netdev_dpdk_construct
;
407 /* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
408 * aligned at 1k or less. If a declared mbuf size is not a multiple of this
409 * value, insufficient buffers are allocated to accomodate the packet in its
410 * entirety. Furthermore, certain drivers need to ensure that there is also
411 * sufficient space in the Rx buffer to accommodate two VLAN tags (for QinQ
412 * frames). If the RX buffer is too small, then the driver enables scatter RX
413 * behaviour, which reduces performance. To prevent this, use a buffer size that
414 * is closest to 'mtu', but which satisfies the aforementioned criteria.
417 dpdk_buf_size(int mtu
)
419 return ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu
) + RTE_PKTMBUF_HEADROOM
),
420 NETDEV_DPDK_MBUF_ALIGN
);
423 /* XXX: use dpdk malloc for entire OVS. in fact huge page should be used
424 * for all other segments data, bss and text. */
427 dpdk_rte_mzalloc(size_t sz
)
431 ptr
= rte_zmalloc(OVS_VPORT_DPDK
, sz
, OVS_CACHE_LINE_SIZE
);
438 /* XXX this function should be called only by pmd threads (or by non pmd
439 * threads holding the nonpmd_mempool_mutex) */
441 free_dpdk_buf(struct dp_packet
*p
)
443 struct rte_mbuf
*pkt
= (struct rte_mbuf
*) p
;
445 rte_pktmbuf_free(pkt
);
449 ovs_rte_pktmbuf_init(struct rte_mempool
*mp
,
450 void *opaque_arg OVS_UNUSED
,
452 unsigned i OVS_UNUSED
)
454 struct rte_mbuf
*m
= _m
;
456 rte_pktmbuf_init(mp
, opaque_arg
, _m
, i
);
458 dp_packet_init_dpdk((struct dp_packet
*) m
, m
->buf_len
);
461 static struct dpdk_mp
*
462 dpdk_mp_get(int socket_id
, int mtu
) OVS_REQUIRES(dpdk_mutex
)
464 struct dpdk_mp
*dmp
= NULL
;
465 char mp_name
[RTE_MEMPOOL_NAMESIZE
];
467 struct rte_pktmbuf_pool_private mbp_priv
;
469 LIST_FOR_EACH (dmp
, list_node
, &dpdk_mp_list
) {
470 if (dmp
->socket_id
== socket_id
&& dmp
->mtu
== mtu
) {
476 dmp
= dpdk_rte_mzalloc(sizeof *dmp
);
477 dmp
->socket_id
= socket_id
;
480 mbp_priv
.mbuf_data_room_size
= MBUF_SIZE(mtu
) - sizeof(struct dp_packet
);
481 mbp_priv
.mbuf_priv_size
= sizeof (struct dp_packet
) -
482 sizeof (struct rte_mbuf
);
484 mp_size
= MAX_NB_MBUF
;
486 if (snprintf(mp_name
, RTE_MEMPOOL_NAMESIZE
, "ovs_mp_%d_%d_%u",
487 dmp
->mtu
, dmp
->socket_id
, mp_size
) < 0) {
491 dmp
->mp
= rte_mempool_create(mp_name
, mp_size
, MBUF_SIZE(mtu
),
493 sizeof(struct rte_pktmbuf_pool_private
),
494 rte_pktmbuf_pool_init
, &mbp_priv
,
495 ovs_rte_pktmbuf_init
, NULL
,
497 } while (!dmp
->mp
&& rte_errno
== ENOMEM
&& (mp_size
/= 2) >= MIN_NB_MBUF
);
499 if (dmp
->mp
== NULL
) {
502 VLOG_DBG("Allocated \"%s\" mempool with %u mbufs", mp_name
, mp_size
);
505 ovs_list_push_back(&dpdk_mp_list
, &dmp
->list_node
);
510 dpdk_mp_put(struct dpdk_mp
*dmp
)
518 ovs_assert(dmp
->refcount
>= 0);
521 /* I could not find any API to destroy mp. */
522 if (dmp
->refcount
== 0) {
523 list_delete(dmp
->list_node
);
524 /* destroy mp-pool. */
530 check_link_status(struct netdev_dpdk
*dev
)
532 struct rte_eth_link link
;
534 rte_eth_link_get_nowait(dev
->port_id
, &link
);
536 if (dev
->link
.link_status
!= link
.link_status
) {
537 netdev_change_seq_changed(&dev
->up
);
539 dev
->link_reset_cnt
++;
541 if (dev
->link
.link_status
) {
542 VLOG_DBG_RL(&rl
, "Port %d Link Up - speed %u Mbps - %s",
543 dev
->port_id
, (unsigned)dev
->link
.link_speed
,
544 (dev
->link
.link_duplex
== ETH_LINK_FULL_DUPLEX
) ?
545 ("full-duplex") : ("half-duplex"));
547 VLOG_DBG_RL(&rl
, "Port %d Link Down", dev
->port_id
);
553 dpdk_watchdog(void *dummy OVS_UNUSED
)
555 struct netdev_dpdk
*dev
;
557 pthread_detach(pthread_self());
560 ovs_mutex_lock(&dpdk_mutex
);
561 LIST_FOR_EACH (dev
, list_node
, &dpdk_list
) {
562 ovs_mutex_lock(&dev
->mutex
);
563 if (dev
->type
== DPDK_DEV_ETH
) {
564 check_link_status(dev
);
566 ovs_mutex_unlock(&dev
->mutex
);
568 ovs_mutex_unlock(&dpdk_mutex
);
569 xsleep(DPDK_PORT_WATCHDOG_INTERVAL
);
576 dpdk_eth_dev_queue_setup(struct netdev_dpdk
*dev
, int n_rxq
, int n_txq
)
581 /* A device may report more queues than it makes available (this has
582 * been observed for Intel xl710, which reserves some of them for
583 * SRIOV): rte_eth_*_queue_setup will fail if a queue is not
584 * available. When this happens we can retry the configuration
585 * and request less queues */
586 while (n_rxq
&& n_txq
) {
588 VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq
, n_txq
);
591 diag
= rte_eth_dev_configure(dev
->port_id
, n_rxq
, n_txq
, &port_conf
);
596 for (i
= 0; i
< n_txq
; i
++) {
597 diag
= rte_eth_tx_queue_setup(dev
->port_id
, i
, NIC_PORT_TX_Q_SIZE
,
598 dev
->socket_id
, NULL
);
600 VLOG_INFO("Interface %s txq(%d) setup error: %s",
601 dev
->up
.name
, i
, rte_strerror(-diag
));
607 /* Retry with less tx queues */
612 for (i
= 0; i
< n_rxq
; i
++) {
613 diag
= rte_eth_rx_queue_setup(dev
->port_id
, i
, NIC_PORT_RX_Q_SIZE
,
614 dev
->socket_id
, NULL
,
617 VLOG_INFO("Interface %s rxq(%d) setup error: %s",
618 dev
->up
.name
, i
, rte_strerror(-diag
));
624 /* Retry with less rx queues */
629 dev
->up
.n_rxq
= n_rxq
;
630 dev
->up
.n_txq
= n_txq
;
640 dpdk_eth_dev_init(struct netdev_dpdk
*dev
) OVS_REQUIRES(dpdk_mutex
)
642 struct rte_pktmbuf_pool_private
*mbp_priv
;
643 struct rte_eth_dev_info info
;
644 struct ether_addr eth_addr
;
648 if (dev
->port_id
< 0 || dev
->port_id
>= rte_eth_dev_count()) {
652 rte_eth_dev_info_get(dev
->port_id
, &info
);
654 n_rxq
= MIN(info
.max_rx_queues
, dev
->up
.n_rxq
);
655 n_txq
= MIN(info
.max_tx_queues
, dev
->up
.n_txq
);
657 diag
= dpdk_eth_dev_queue_setup(dev
, n_rxq
, n_txq
);
659 VLOG_ERR("Interface %s(rxq:%d txq:%d) configure error: %s",
660 dev
->up
.name
, n_rxq
, n_txq
, rte_strerror(-diag
));
664 diag
= rte_eth_dev_start(dev
->port_id
);
666 VLOG_ERR("Interface %s start error: %s", dev
->up
.name
,
667 rte_strerror(-diag
));
671 rte_eth_promiscuous_enable(dev
->port_id
);
672 rte_eth_allmulticast_enable(dev
->port_id
);
674 memset(ð_addr
, 0x0, sizeof(eth_addr
));
675 rte_eth_macaddr_get(dev
->port_id
, ð_addr
);
676 VLOG_INFO_RL(&rl
, "Port %d: "ETH_ADDR_FMT
"",
677 dev
->port_id
, ETH_ADDR_BYTES_ARGS(eth_addr
.addr_bytes
));
679 memcpy(dev
->hwaddr
.ea
, eth_addr
.addr_bytes
, ETH_ADDR_LEN
);
680 rte_eth_link_get_nowait(dev
->port_id
, &dev
->link
);
682 mbp_priv
= rte_mempool_get_priv(dev
->dpdk_mp
->mp
);
683 dev
->buf_size
= mbp_priv
->mbuf_data_room_size
- RTE_PKTMBUF_HEADROOM
;
685 dev
->flags
= NETDEV_UP
| NETDEV_PROMISC
;
689 static struct netdev_dpdk
*
690 netdev_dpdk_cast(const struct netdev
*netdev
)
692 return CONTAINER_OF(netdev
, struct netdev_dpdk
, up
);
695 static struct netdev
*
696 netdev_dpdk_alloc(void)
698 struct netdev_dpdk
*dev
;
700 if (!rte_eal_init_ret
) { /* Only after successful initialization */
701 dev
= dpdk_rte_mzalloc(sizeof *dev
);
710 netdev_dpdk_alloc_txq(struct netdev_dpdk
*dev
, unsigned int n_txqs
)
714 dev
->tx_q
= dpdk_rte_mzalloc(n_txqs
* sizeof *dev
->tx_q
);
715 for (i
= 0; i
< n_txqs
; i
++) {
716 /* Initialize map for vhost devices. */
717 dev
->tx_q
[i
].map
= OVS_VHOST_QUEUE_MAP_UNKNOWN
;
718 rte_spinlock_init(&dev
->tx_q
[i
].tx_lock
);
723 netdev_dpdk_init(struct netdev
*netdev
, unsigned int port_no
,
724 enum dpdk_dev_type type
)
725 OVS_REQUIRES(dpdk_mutex
)
727 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
732 ovs_mutex_init(&dev
->mutex
);
733 ovs_mutex_lock(&dev
->mutex
);
735 rte_spinlock_init(&dev
->stats_lock
);
737 /* If the 'sid' is negative, it means that the kernel fails
738 * to obtain the pci numa info. In that situation, always
740 if (type
== DPDK_DEV_ETH
) {
741 sid
= rte_eth_dev_socket_id(port_no
);
743 sid
= rte_lcore_to_socket_id(rte_get_master_lcore());
746 dev
->socket_id
= sid
< 0 ? SOCKET0
: sid
;
747 dev
->requested_socket_id
= dev
->socket_id
;
748 dev
->port_id
= port_no
;
751 dev
->mtu
= ETHER_MTU
;
752 dev
->max_packet_len
= MTU_TO_FRAME_LEN(dev
->mtu
);
754 buf_size
= dpdk_buf_size(dev
->mtu
);
755 dev
->dpdk_mp
= dpdk_mp_get(dev
->socket_id
, FRAME_LEN_TO_MTU(buf_size
));
761 /* Initialise QoS configuration to NULL and qos lock to unlocked */
762 dev
->qos_conf
= NULL
;
763 rte_spinlock_init(&dev
->qos_lock
);
765 /* Initialise rcu pointer for ingress policer to NULL */
766 ovsrcu_init(&dev
->ingress_policer
, NULL
);
767 dev
->policer_rate
= 0;
768 dev
->policer_burst
= 0;
770 netdev
->n_rxq
= NR_QUEUE
;
771 netdev
->n_txq
= NR_QUEUE
;
772 dev
->requested_n_rxq
= netdev
->n_rxq
;
773 dev
->requested_n_txq
= netdev
->n_txq
;
775 if (type
== DPDK_DEV_ETH
) {
776 err
= dpdk_eth_dev_init(dev
);
780 netdev_dpdk_alloc_txq(dev
, netdev
->n_txq
);
781 dev
->txq_needs_locking
= netdev
->n_txq
< dev
->requested_n_txq
;
783 netdev_dpdk_alloc_txq(dev
, OVS_VHOST_MAX_QUEUE_NUM
);
784 dev
->txq_needs_locking
= true;
785 /* Enable DPDK_DEV_VHOST device and set promiscuous mode flag. */
786 dev
->flags
= NETDEV_UP
| NETDEV_PROMISC
;
789 ovs_list_push_back(&dpdk_list
, &dev
->list_node
);
792 ovs_mutex_unlock(&dev
->mutex
);
796 /* dev_name must be the prefix followed by a positive decimal number.
797 * (no leading + or - signs are allowed) */
799 dpdk_dev_parse_name(const char dev_name
[], const char prefix
[],
800 unsigned int *port_no
)
804 if (strncmp(dev_name
, prefix
, strlen(prefix
))) {
808 cport
= dev_name
+ strlen(prefix
);
810 if (str_to_uint(cport
, 10, port_no
)) {
818 vhost_construct_helper(struct netdev
*netdev
) OVS_REQUIRES(dpdk_mutex
)
820 if (rte_eal_init_ret
) {
821 return rte_eal_init_ret
;
824 return netdev_dpdk_init(netdev
, -1, DPDK_DEV_VHOST
);
828 netdev_dpdk_vhost_cuse_construct(struct netdev
*netdev
)
830 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
833 if (rte_eal_init_ret
) {
834 return rte_eal_init_ret
;
837 ovs_mutex_lock(&dpdk_mutex
);
838 strncpy(dev
->vhost_id
, netdev
->name
, sizeof(dev
->vhost_id
));
839 err
= vhost_construct_helper(netdev
);
840 ovs_mutex_unlock(&dpdk_mutex
);
845 netdev_dpdk_vhost_user_construct(struct netdev
*netdev
)
847 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
848 const char *name
= netdev
->name
;
851 /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
852 * the file system. '/' or '\' would traverse directories, so they're not
853 * acceptable in 'name'. */
854 if (strchr(name
, '/') || strchr(name
, '\\')) {
855 VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
856 "A valid name must not include '/' or '\\'",
861 if (rte_eal_init_ret
) {
862 return rte_eal_init_ret
;
865 ovs_mutex_lock(&dpdk_mutex
);
866 /* Take the name of the vhost-user port and append it to the location where
867 * the socket is to be created, then register the socket.
869 snprintf(dev
->vhost_id
, sizeof(dev
->vhost_id
), "%s/%s",
870 vhost_sock_dir
, name
);
872 err
= rte_vhost_driver_register(dev
->vhost_id
);
874 VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
877 fatal_signal_add_file_to_unlink(dev
->vhost_id
);
878 VLOG_INFO("Socket %s created for vhost-user port %s\n",
879 dev
->vhost_id
, name
);
880 err
= vhost_construct_helper(netdev
);
883 ovs_mutex_unlock(&dpdk_mutex
);
888 netdev_dpdk_construct(struct netdev
*netdev
)
890 unsigned int port_no
;
893 if (rte_eal_init_ret
) {
894 return rte_eal_init_ret
;
897 /* Names always start with "dpdk" */
898 err
= dpdk_dev_parse_name(netdev
->name
, "dpdk", &port_no
);
903 ovs_mutex_lock(&dpdk_mutex
);
904 err
= netdev_dpdk_init(netdev
, port_no
, DPDK_DEV_ETH
);
905 ovs_mutex_unlock(&dpdk_mutex
);
910 netdev_dpdk_destruct(struct netdev
*netdev
)
912 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
914 ovs_mutex_lock(&dev
->mutex
);
915 rte_eth_dev_stop(dev
->port_id
);
916 free(ovsrcu_get_protected(struct ingress_policer
*,
917 &dev
->ingress_policer
));
918 ovs_mutex_unlock(&dev
->mutex
);
920 ovs_mutex_lock(&dpdk_mutex
);
922 ovs_list_remove(&dev
->list_node
);
923 dpdk_mp_put(dev
->dpdk_mp
);
924 ovs_mutex_unlock(&dpdk_mutex
);
928 netdev_dpdk_vhost_destruct(struct netdev
*netdev
)
930 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
932 /* Guest becomes an orphan if still attached. */
933 if (netdev_dpdk_get_virtio(dev
) != NULL
) {
934 VLOG_ERR("Removing port '%s' while vhost device still attached.",
936 VLOG_ERR("To restore connectivity after re-adding of port, VM on socket"
937 " '%s' must be restarted.",
941 if (rte_vhost_driver_unregister(dev
->vhost_id
)) {
942 VLOG_ERR("Unable to remove vhost-user socket %s", dev
->vhost_id
);
944 fatal_signal_remove_file_to_unlink(dev
->vhost_id
);
947 ovs_mutex_lock(&dev
->mutex
);
948 free(ovsrcu_get_protected(struct ingress_policer
*,
949 &dev
->ingress_policer
));
950 ovs_mutex_unlock(&dev
->mutex
);
952 ovs_mutex_lock(&dpdk_mutex
);
954 ovs_list_remove(&dev
->list_node
);
955 dpdk_mp_put(dev
->dpdk_mp
);
956 ovs_mutex_unlock(&dpdk_mutex
);
960 netdev_dpdk_dealloc(struct netdev
*netdev
)
962 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
968 netdev_dpdk_get_config(const struct netdev
*netdev
, struct smap
*args
)
970 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
972 ovs_mutex_lock(&dev
->mutex
);
974 smap_add_format(args
, "requested_rx_queues", "%d", dev
->requested_n_rxq
);
975 smap_add_format(args
, "configured_rx_queues", "%d", netdev
->n_rxq
);
976 smap_add_format(args
, "requested_tx_queues", "%d", dev
->requested_n_txq
);
977 smap_add_format(args
, "configured_tx_queues", "%d", netdev
->n_txq
);
978 ovs_mutex_unlock(&dev
->mutex
);
984 netdev_dpdk_set_config(struct netdev
*netdev
, const struct smap
*args
)
986 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
989 ovs_mutex_lock(&dev
->mutex
);
990 new_n_rxq
= MAX(smap_get_int(args
, "n_rxq", dev
->requested_n_rxq
), 1);
991 if (new_n_rxq
!= dev
->requested_n_rxq
) {
992 dev
->requested_n_rxq
= new_n_rxq
;
993 netdev_request_reconfigure(netdev
);
995 ovs_mutex_unlock(&dev
->mutex
);
1001 netdev_dpdk_get_numa_id(const struct netdev
*netdev
)
1003 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1005 return dev
->socket_id
;
1008 /* Sets the number of tx queues for the dpdk interface. */
1010 netdev_dpdk_set_tx_multiq(struct netdev
*netdev
, unsigned int n_txq
)
1012 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1014 ovs_mutex_lock(&dev
->mutex
);
1016 if (dev
->requested_n_txq
== n_txq
) {
1020 dev
->requested_n_txq
= n_txq
;
1021 netdev_request_reconfigure(netdev
);
1024 ovs_mutex_unlock(&dev
->mutex
);
1028 static struct netdev_rxq
*
1029 netdev_dpdk_rxq_alloc(void)
1031 struct netdev_rxq_dpdk
*rx
= dpdk_rte_mzalloc(sizeof *rx
);
1036 static struct netdev_rxq_dpdk
*
1037 netdev_rxq_dpdk_cast(const struct netdev_rxq
*rxq
)
1039 return CONTAINER_OF(rxq
, struct netdev_rxq_dpdk
, up
);
1043 netdev_dpdk_rxq_construct(struct netdev_rxq
*rxq
)
1045 struct netdev_rxq_dpdk
*rx
= netdev_rxq_dpdk_cast(rxq
);
1046 struct netdev_dpdk
*dev
= netdev_dpdk_cast(rxq
->netdev
);
1048 ovs_mutex_lock(&dev
->mutex
);
1049 rx
->port_id
= dev
->port_id
;
1050 ovs_mutex_unlock(&dev
->mutex
);
1056 netdev_dpdk_rxq_destruct(struct netdev_rxq
*rxq OVS_UNUSED
)
1061 netdev_dpdk_rxq_dealloc(struct netdev_rxq
*rxq
)
1063 struct netdev_rxq_dpdk
*rx
= netdev_rxq_dpdk_cast(rxq
);
1069 netdev_dpdk_eth_tx_burst(struct netdev_dpdk
*dev
, int qid
,
1070 struct rte_mbuf
**pkts
, int cnt
)
1074 while (nb_tx
!= cnt
) {
1077 ret
= rte_eth_tx_burst(dev
->port_id
, qid
, pkts
+ nb_tx
, cnt
- nb_tx
);
1085 if (OVS_UNLIKELY(nb_tx
!= cnt
)) {
1086 /* free buffers, which we couldn't transmit, one at a time (each
1087 * packet could come from a different mempool) */
1090 for (i
= nb_tx
; i
< cnt
; i
++) {
1091 rte_pktmbuf_free(pkts
[i
]);
1093 rte_spinlock_lock(&dev
->stats_lock
);
1094 dev
->stats
.tx_dropped
+= cnt
- nb_tx
;
1095 rte_spinlock_unlock(&dev
->stats_lock
);
1100 netdev_dpdk_policer_pkt_handle(struct rte_meter_srtcm
*meter
,
1101 struct rte_mbuf
*pkt
, uint64_t time
)
1103 uint32_t pkt_len
= rte_pktmbuf_pkt_len(pkt
) - sizeof(struct ether_hdr
);
1105 return rte_meter_srtcm_color_blind_check(meter
, time
, pkt_len
) ==
1110 netdev_dpdk_policer_run(struct rte_meter_srtcm
*meter
,
1111 struct rte_mbuf
**pkts
, int pkt_cnt
)
1115 struct rte_mbuf
*pkt
= NULL
;
1116 uint64_t current_time
= rte_rdtsc();
1118 for (i
= 0; i
< pkt_cnt
; i
++) {
1120 /* Handle current packet */
1121 if (netdev_dpdk_policer_pkt_handle(meter
, pkt
, current_time
)) {
1127 rte_pktmbuf_free(pkt
);
1135 ingress_policer_run(struct ingress_policer
*policer
, struct rte_mbuf
**pkts
,
1140 rte_spinlock_lock(&policer
->policer_lock
);
1141 cnt
= netdev_dpdk_policer_run(&policer
->in_policer
, pkts
, pkt_cnt
);
1142 rte_spinlock_unlock(&policer
->policer_lock
);
1148 is_vhost_running(struct virtio_net
*virtio_dev
)
1150 return (virtio_dev
!= NULL
&& (virtio_dev
->flags
& VIRTIO_DEV_RUNNING
));
1154 netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats
*stats
,
1155 unsigned int packet_size
)
1157 /* Hard-coded search for the size bucket. */
1158 if (packet_size
< 256) {
1159 if (packet_size
>= 128) {
1160 stats
->rx_128_to_255_packets
++;
1161 } else if (packet_size
<= 64) {
1162 stats
->rx_1_to_64_packets
++;
1164 stats
->rx_65_to_127_packets
++;
1167 if (packet_size
>= 1523) {
1168 stats
->rx_1523_to_max_packets
++;
1169 } else if (packet_size
>= 1024) {
1170 stats
->rx_1024_to_1522_packets
++;
1171 } else if (packet_size
< 512) {
1172 stats
->rx_256_to_511_packets
++;
1174 stats
->rx_512_to_1023_packets
++;
1180 netdev_dpdk_vhost_update_rx_counters(struct netdev_stats
*stats
,
1181 struct dp_packet
**packets
, int count
,
1185 unsigned int packet_size
;
1186 struct dp_packet
*packet
;
1188 stats
->rx_packets
+= count
;
1189 stats
->rx_dropped
+= dropped
;
1190 for (i
= 0; i
< count
; i
++) {
1191 packet
= packets
[i
];
1192 packet_size
= dp_packet_size(packet
);
1194 if (OVS_UNLIKELY(packet_size
< ETH_HEADER_LEN
)) {
1195 /* This only protects the following multicast counting from
1196 * too short packets, but it does not stop the packet from
1197 * further processing. */
1199 stats
->rx_length_errors
++;
1203 netdev_dpdk_vhost_update_rx_size_counters(stats
, packet_size
);
1205 struct eth_header
*eh
= (struct eth_header
*) dp_packet_data(packet
);
1206 if (OVS_UNLIKELY(eth_addr_is_multicast(eh
->eth_dst
))) {
1210 stats
->rx_bytes
+= packet_size
;
1215 * The receive path for the vhost port is the TX path out from guest.
1218 netdev_dpdk_vhost_rxq_recv(struct netdev_rxq
*rxq
,
1219 struct dp_packet_batch
*batch
)
1221 struct netdev_dpdk
*dev
= netdev_dpdk_cast(rxq
->netdev
);
1222 struct virtio_net
*virtio_dev
= netdev_dpdk_get_virtio(dev
);
1223 int qid
= rxq
->queue_id
;
1224 struct ingress_policer
*policer
= netdev_dpdk_get_ingress_policer(dev
);
1226 uint16_t dropped
= 0;
1228 if (OVS_UNLIKELY(!is_vhost_running(virtio_dev
)
1229 || !(dev
->flags
& NETDEV_UP
))) {
1233 nb_rx
= rte_vhost_dequeue_burst(virtio_dev
, qid
* VIRTIO_QNUM
+ VIRTIO_TXQ
,
1235 (struct rte_mbuf
**) batch
->packets
,
1243 nb_rx
= ingress_policer_run(policer
,
1244 (struct rte_mbuf
**) batch
->packets
,
1249 rte_spinlock_lock(&dev
->stats_lock
);
1250 netdev_dpdk_vhost_update_rx_counters(&dev
->stats
, batch
->packets
,
1252 rte_spinlock_unlock(&dev
->stats_lock
);
1254 batch
->count
= (int) nb_rx
;
1259 netdev_dpdk_rxq_recv(struct netdev_rxq
*rxq
, struct dp_packet_batch
*batch
)
1261 struct netdev_rxq_dpdk
*rx
= netdev_rxq_dpdk_cast(rxq
);
1262 struct netdev_dpdk
*dev
= netdev_dpdk_cast(rxq
->netdev
);
1263 struct ingress_policer
*policer
= netdev_dpdk_get_ingress_policer(dev
);
1267 nb_rx
= rte_eth_rx_burst(rx
->port_id
, rxq
->queue_id
,
1268 (struct rte_mbuf
**) batch
->packets
,
1276 nb_rx
= ingress_policer_run(policer
,
1277 (struct rte_mbuf
**)batch
->packets
,
1282 /* Update stats to reflect dropped packets */
1283 if (OVS_UNLIKELY(dropped
)) {
1284 rte_spinlock_lock(&dev
->stats_lock
);
1285 dev
->stats
.rx_dropped
+= dropped
;
1286 rte_spinlock_unlock(&dev
->stats_lock
);
1289 batch
->count
= nb_rx
;
1295 netdev_dpdk_qos_run__(struct netdev_dpdk
*dev
, struct rte_mbuf
**pkts
,
1298 struct netdev
*netdev
= &dev
->up
;
1300 if (dev
->qos_conf
!= NULL
) {
1301 rte_spinlock_lock(&dev
->qos_lock
);
1302 if (dev
->qos_conf
!= NULL
) {
1303 cnt
= dev
->qos_conf
->ops
->qos_run(netdev
, pkts
, cnt
);
1305 rte_spinlock_unlock(&dev
->qos_lock
);
1312 netdev_dpdk_vhost_update_tx_counters(struct netdev_stats
*stats
,
1313 struct dp_packet
**packets
,
1318 int sent
= attempted
- dropped
;
1320 stats
->tx_packets
+= sent
;
1321 stats
->tx_dropped
+= dropped
;
1323 for (i
= 0; i
< sent
; i
++) {
1324 stats
->tx_bytes
+= dp_packet_size(packets
[i
]);
1329 __netdev_dpdk_vhost_send(struct netdev
*netdev
, int qid
,
1330 struct dp_packet
**pkts
, int cnt
,
1333 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1334 struct virtio_net
*virtio_dev
= netdev_dpdk_get_virtio(dev
);
1335 struct rte_mbuf
**cur_pkts
= (struct rte_mbuf
**) pkts
;
1336 unsigned int total_pkts
= cnt
;
1337 unsigned int qos_pkts
= cnt
;
1340 qid
= dev
->tx_q
[qid
% netdev
->n_txq
].map
;
1342 if (OVS_UNLIKELY(!is_vhost_running(virtio_dev
) || qid
< 0
1343 || !(dev
->flags
& NETDEV_UP
))) {
1344 rte_spinlock_lock(&dev
->stats_lock
);
1345 dev
->stats
.tx_dropped
+= cnt
;
1346 rte_spinlock_unlock(&dev
->stats_lock
);
1350 rte_spinlock_lock(&dev
->tx_q
[qid
].tx_lock
);
1352 /* Check has QoS has been configured for the netdev */
1353 cnt
= netdev_dpdk_qos_run__(dev
, cur_pkts
, cnt
);
1357 int vhost_qid
= qid
* VIRTIO_QNUM
+ VIRTIO_RXQ
;
1358 unsigned int tx_pkts
;
1360 tx_pkts
= rte_vhost_enqueue_burst(virtio_dev
, vhost_qid
,
1362 if (OVS_LIKELY(tx_pkts
)) {
1363 /* Packets have been sent.*/
1365 /* Prepare for possible retry.*/
1366 cur_pkts
= &cur_pkts
[tx_pkts
];
1368 /* No packets sent - do not retry.*/
1371 } while (cnt
&& (retries
++ < VHOST_ENQ_RETRY_NUM
));
1373 rte_spinlock_unlock(&dev
->tx_q
[qid
].tx_lock
);
1375 rte_spinlock_lock(&dev
->stats_lock
);
1377 netdev_dpdk_vhost_update_tx_counters(&dev
->stats
, pkts
, total_pkts
, cnt
);
1378 rte_spinlock_unlock(&dev
->stats_lock
);
1384 for (i
= 0; i
< total_pkts
; i
++) {
1385 dp_packet_delete(pkts
[i
]);
1390 /* Tx function. Transmit packets indefinitely */
1392 dpdk_do_tx_copy(struct netdev
*netdev
, int qid
, struct dp_packet_batch
*batch
)
1393 OVS_NO_THREAD_SAFETY_ANALYSIS
1395 #if !defined(__CHECKER__) && !defined(_WIN32)
1396 const size_t PKT_ARRAY_SIZE
= batch
->count
;
1398 /* Sparse or MSVC doesn't like variable length array. */
1399 enum { PKT_ARRAY_SIZE
= NETDEV_MAX_BURST
};
1401 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1402 struct rte_mbuf
*mbufs
[PKT_ARRAY_SIZE
];
1407 /* If we are on a non pmd thread we have to use the mempool mutex, because
1408 * every non pmd thread shares the same mempool cache */
1410 if (!dpdk_thread_is_pmd()) {
1411 ovs_mutex_lock(&nonpmd_mempool_mutex
);
1414 for (i
= 0; i
< batch
->count
; i
++) {
1415 int size
= dp_packet_size(batch
->packets
[i
]);
1417 if (OVS_UNLIKELY(size
> dev
->max_packet_len
)) {
1418 VLOG_WARN_RL(&rl
, "Too big size %d max_packet_len %d",
1419 (int)size
, dev
->max_packet_len
);
1425 mbufs
[newcnt
] = rte_pktmbuf_alloc(dev
->dpdk_mp
->mp
);
1427 if (!mbufs
[newcnt
]) {
1428 dropped
+= batch
->count
- i
;
1432 /* Cut the size so only the truncated size is copied. */
1433 size
-= dp_packet_get_cutlen(batch
->packets
[i
]);
1434 dp_packet_reset_cutlen(batch
->packets
[i
]);
1436 /* We have to do a copy for now */
1437 memcpy(rte_pktmbuf_mtod(mbufs
[newcnt
], void *),
1438 dp_packet_data(batch
->packets
[i
]), size
);
1440 rte_pktmbuf_data_len(mbufs
[newcnt
]) = size
;
1441 rte_pktmbuf_pkt_len(mbufs
[newcnt
]) = size
;
1446 if (dev
->type
== DPDK_DEV_VHOST
) {
1447 __netdev_dpdk_vhost_send(netdev
, qid
, (struct dp_packet
**) mbufs
,
1450 unsigned int qos_pkts
= newcnt
;
1452 /* Check if QoS has been configured for this netdev. */
1453 newcnt
= netdev_dpdk_qos_run__(dev
, mbufs
, newcnt
);
1455 dropped
+= qos_pkts
- newcnt
;
1456 netdev_dpdk_eth_tx_burst(dev
, qid
, mbufs
, newcnt
);
1459 if (OVS_UNLIKELY(dropped
)) {
1460 rte_spinlock_lock(&dev
->stats_lock
);
1461 dev
->stats
.tx_dropped
+= dropped
;
1462 rte_spinlock_unlock(&dev
->stats_lock
);
1465 if (!dpdk_thread_is_pmd()) {
1466 ovs_mutex_unlock(&nonpmd_mempool_mutex
);
1471 netdev_dpdk_vhost_send(struct netdev
*netdev
, int qid
,
1472 struct dp_packet_batch
*batch
,
1476 if (OVS_UNLIKELY(batch
->packets
[0]->source
!= DPBUF_DPDK
)) {
1477 dpdk_do_tx_copy(netdev
, qid
, batch
);
1478 dp_packet_delete_batch(batch
, may_steal
);
1480 dp_packet_batch_apply_cutlen(batch
);
1481 __netdev_dpdk_vhost_send(netdev
, qid
, batch
->packets
, batch
->count
,
1488 netdev_dpdk_send__(struct netdev_dpdk
*dev
, int qid
,
1489 struct dp_packet_batch
*batch
, bool may_steal
)
1491 if (OVS_UNLIKELY(dev
->txq_needs_locking
)) {
1492 qid
= qid
% dev
->up
.n_txq
;
1493 rte_spinlock_lock(&dev
->tx_q
[qid
].tx_lock
);
1496 if (OVS_UNLIKELY(!may_steal
||
1497 batch
->packets
[0]->source
!= DPBUF_DPDK
)) {
1498 struct netdev
*netdev
= &dev
->up
;
1500 dpdk_do_tx_copy(netdev
, qid
, batch
);
1501 dp_packet_delete_batch(batch
, may_steal
);
1503 int next_tx_idx
= 0;
1505 unsigned int qos_pkts
= 0;
1506 unsigned int temp_cnt
= 0;
1507 int cnt
= batch
->count
;
1509 for (int i
= 0; i
< cnt
; i
++) {
1510 int size
= dp_packet_size(batch
->packets
[i
]);
1512 size
-= dp_packet_get_cutlen(batch
->packets
[i
]);
1513 dp_packet_set_size(batch
->packets
[i
], size
);
1515 if (OVS_UNLIKELY(size
> dev
->max_packet_len
)) {
1516 if (next_tx_idx
!= i
) {
1517 temp_cnt
= i
- next_tx_idx
;
1518 qos_pkts
= temp_cnt
;
1520 temp_cnt
= netdev_dpdk_qos_run__(dev
,
1521 (struct rte_mbuf
**)batch
->packets
,
1523 dropped
+= qos_pkts
- temp_cnt
;
1524 netdev_dpdk_eth_tx_burst(dev
, qid
,
1525 (struct rte_mbuf
**)&batch
->packets
[next_tx_idx
],
1530 VLOG_WARN_RL(&rl
, "Too big size %d max_packet_len %d",
1531 (int)size
, dev
->max_packet_len
);
1533 dp_packet_delete(batch
->packets
[i
]);
1535 next_tx_idx
= i
+ 1;
1538 if (next_tx_idx
!= cnt
) {
1542 cnt
= netdev_dpdk_qos_run__(dev
,
1543 (struct rte_mbuf
**)batch
->packets
, cnt
);
1544 dropped
+= qos_pkts
- cnt
;
1545 netdev_dpdk_eth_tx_burst(dev
, qid
,
1546 (struct rte_mbuf
**)&batch
->packets
[next_tx_idx
],
1550 if (OVS_UNLIKELY(dropped
)) {
1551 rte_spinlock_lock(&dev
->stats_lock
);
1552 dev
->stats
.tx_dropped
+= dropped
;
1553 rte_spinlock_unlock(&dev
->stats_lock
);
1557 if (OVS_UNLIKELY(dev
->txq_needs_locking
)) {
1558 rte_spinlock_unlock(&dev
->tx_q
[qid
].tx_lock
);
1563 netdev_dpdk_eth_send(struct netdev
*netdev
, int qid
,
1564 struct dp_packet_batch
*batch
, bool may_steal
)
1566 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1568 netdev_dpdk_send__(dev
, qid
, batch
, may_steal
);
1573 netdev_dpdk_set_etheraddr(struct netdev
*netdev
, const struct eth_addr mac
)
1575 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1577 ovs_mutex_lock(&dev
->mutex
);
1578 if (!eth_addr_equals(dev
->hwaddr
, mac
)) {
1580 netdev_change_seq_changed(netdev
);
1582 ovs_mutex_unlock(&dev
->mutex
);
1588 netdev_dpdk_get_etheraddr(const struct netdev
*netdev
, struct eth_addr
*mac
)
1590 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1592 ovs_mutex_lock(&dev
->mutex
);
1594 ovs_mutex_unlock(&dev
->mutex
);
1600 netdev_dpdk_get_mtu(const struct netdev
*netdev
, int *mtup
)
1602 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1604 ovs_mutex_lock(&dev
->mutex
);
1606 ovs_mutex_unlock(&dev
->mutex
);
1612 netdev_dpdk_set_mtu(const struct netdev
*netdev
, int mtu
)
1614 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1615 int old_mtu
, err
, dpdk_mtu
;
1616 struct dpdk_mp
*old_mp
;
1620 ovs_mutex_lock(&dpdk_mutex
);
1621 ovs_mutex_lock(&dev
->mutex
);
1622 if (dev
->mtu
== mtu
) {
1627 buf_size
= dpdk_buf_size(mtu
);
1628 dpdk_mtu
= FRAME_LEN_TO_MTU(buf_size
);
1630 mp
= dpdk_mp_get(dev
->socket_id
, dpdk_mtu
);
1636 rte_eth_dev_stop(dev
->port_id
);
1639 old_mp
= dev
->dpdk_mp
;
1642 dev
->max_packet_len
= MTU_TO_FRAME_LEN(dev
->mtu
);
1644 err
= dpdk_eth_dev_init(dev
);
1648 dev
->dpdk_mp
= old_mp
;
1649 dev
->max_packet_len
= MTU_TO_FRAME_LEN(dev
->mtu
);
1650 dpdk_eth_dev_init(dev
);
1654 dpdk_mp_put(old_mp
);
1655 netdev_change_seq_changed(netdev
);
1657 ovs_mutex_unlock(&dev
->mutex
);
1658 ovs_mutex_unlock(&dpdk_mutex
);
1663 netdev_dpdk_get_carrier(const struct netdev
*netdev
, bool *carrier
);
1666 netdev_dpdk_vhost_get_stats(const struct netdev
*netdev
,
1667 struct netdev_stats
*stats
)
1669 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1671 ovs_mutex_lock(&dev
->mutex
);
1673 rte_spinlock_lock(&dev
->stats_lock
);
1674 /* Supported Stats */
1675 stats
->rx_packets
+= dev
->stats
.rx_packets
;
1676 stats
->tx_packets
+= dev
->stats
.tx_packets
;
1677 stats
->rx_dropped
= dev
->stats
.rx_dropped
;
1678 stats
->tx_dropped
+= dev
->stats
.tx_dropped
;
1679 stats
->multicast
= dev
->stats
.multicast
;
1680 stats
->rx_bytes
= dev
->stats
.rx_bytes
;
1681 stats
->tx_bytes
= dev
->stats
.tx_bytes
;
1682 stats
->rx_errors
= dev
->stats
.rx_errors
;
1683 stats
->rx_length_errors
= dev
->stats
.rx_length_errors
;
1685 stats
->rx_1_to_64_packets
= dev
->stats
.rx_1_to_64_packets
;
1686 stats
->rx_65_to_127_packets
= dev
->stats
.rx_65_to_127_packets
;
1687 stats
->rx_128_to_255_packets
= dev
->stats
.rx_128_to_255_packets
;
1688 stats
->rx_256_to_511_packets
= dev
->stats
.rx_256_to_511_packets
;
1689 stats
->rx_512_to_1023_packets
= dev
->stats
.rx_512_to_1023_packets
;
1690 stats
->rx_1024_to_1522_packets
= dev
->stats
.rx_1024_to_1522_packets
;
1691 stats
->rx_1523_to_max_packets
= dev
->stats
.rx_1523_to_max_packets
;
1693 rte_spinlock_unlock(&dev
->stats_lock
);
1695 ovs_mutex_unlock(&dev
->mutex
);
1701 netdev_dpdk_convert_xstats(struct netdev_stats
*stats
,
1702 const struct rte_eth_xstats
*xstats
,
1703 const unsigned int size
)
1705 /* XXX Current implementation is simple search through an array
1706 * to find hardcoded counter names. In future DPDK release (TBD)
1707 * XSTATS API will change so each counter will be represented by
1708 * unique ID instead of String. */
1710 for (unsigned int i
= 0; i
< size
; i
++) {
1711 if (strcmp(XSTAT_RX_64_PACKETS
, xstats
[i
].name
) == 0) {
1712 stats
->rx_1_to_64_packets
= xstats
[i
].value
;
1713 } else if (strcmp(XSTAT_RX_65_TO_127_PACKETS
, xstats
[i
].name
) == 0) {
1714 stats
->rx_65_to_127_packets
= xstats
[i
].value
;
1715 } else if (strcmp(XSTAT_RX_128_TO_255_PACKETS
, xstats
[i
].name
) == 0) {
1716 stats
->rx_128_to_255_packets
= xstats
[i
].value
;
1717 } else if (strcmp(XSTAT_RX_256_TO_511_PACKETS
, xstats
[i
].name
) == 0) {
1718 stats
->rx_256_to_511_packets
= xstats
[i
].value
;
1719 } else if (strcmp(XSTAT_RX_512_TO_1023_PACKETS
,
1720 xstats
[i
].name
) == 0) {
1721 stats
->rx_512_to_1023_packets
= xstats
[i
].value
;
1722 } else if (strcmp(XSTAT_RX_1024_TO_1522_PACKETS
,
1723 xstats
[i
].name
) == 0) {
1724 stats
->rx_1024_to_1522_packets
= xstats
[i
].value
;
1725 } else if (strcmp(XSTAT_RX_1523_TO_MAX_PACKETS
,
1726 xstats
[i
].name
) == 0) {
1727 stats
->rx_1523_to_max_packets
= xstats
[i
].value
;
1728 } else if (strcmp(XSTAT_TX_64_PACKETS
, xstats
[i
].name
) == 0) {
1729 stats
->tx_1_to_64_packets
= xstats
[i
].value
;
1730 } else if (strcmp(XSTAT_TX_65_TO_127_PACKETS
, xstats
[i
].name
) == 0) {
1731 stats
->tx_65_to_127_packets
= xstats
[i
].value
;
1732 } else if (strcmp(XSTAT_TX_128_TO_255_PACKETS
, xstats
[i
].name
) == 0) {
1733 stats
->tx_128_to_255_packets
= xstats
[i
].value
;
1734 } else if (strcmp(XSTAT_TX_256_TO_511_PACKETS
, xstats
[i
].name
) == 0) {
1735 stats
->tx_256_to_511_packets
= xstats
[i
].value
;
1736 } else if (strcmp(XSTAT_TX_512_TO_1023_PACKETS
,
1737 xstats
[i
].name
) == 0) {
1738 stats
->tx_512_to_1023_packets
= xstats
[i
].value
;
1739 } else if (strcmp(XSTAT_TX_1024_TO_1522_PACKETS
,
1740 xstats
[i
].name
) == 0) {
1741 stats
->tx_1024_to_1522_packets
= xstats
[i
].value
;
1742 } else if (strcmp(XSTAT_TX_1523_TO_MAX_PACKETS
,
1743 xstats
[i
].name
) == 0) {
1744 stats
->tx_1523_to_max_packets
= xstats
[i
].value
;
1745 } else if (strcmp(XSTAT_TX_MULTICAST_PACKETS
, xstats
[i
].name
) == 0) {
1746 stats
->tx_multicast_packets
= xstats
[i
].value
;
1747 } else if (strcmp(XSTAT_RX_BROADCAST_PACKETS
, xstats
[i
].name
) == 0) {
1748 stats
->rx_broadcast_packets
= xstats
[i
].value
;
1749 } else if (strcmp(XSTAT_TX_BROADCAST_PACKETS
, xstats
[i
].name
) == 0) {
1750 stats
->tx_broadcast_packets
= xstats
[i
].value
;
1751 } else if (strcmp(XSTAT_RX_UNDERSIZED_ERRORS
, xstats
[i
].name
) == 0) {
1752 stats
->rx_undersized_errors
= xstats
[i
].value
;
1753 } else if (strcmp(XSTAT_RX_FRAGMENTED_ERRORS
, xstats
[i
].name
) == 0) {
1754 stats
->rx_fragmented_errors
= xstats
[i
].value
;
1755 } else if (strcmp(XSTAT_RX_JABBER_ERRORS
, xstats
[i
].name
) == 0) {
1756 stats
->rx_jabber_errors
= xstats
[i
].value
;
1762 netdev_dpdk_get_stats(const struct netdev
*netdev
, struct netdev_stats
*stats
)
1764 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1765 struct rte_eth_stats rte_stats
;
1768 netdev_dpdk_get_carrier(netdev
, &gg
);
1769 ovs_mutex_lock(&dev
->mutex
);
1771 struct rte_eth_xstats
*rte_xstats
;
1772 int rte_xstats_len
, rte_xstats_ret
;
1774 if (rte_eth_stats_get(dev
->port_id
, &rte_stats
)) {
1775 VLOG_ERR("Can't get ETH statistics for port: %i.", dev
->port_id
);
1776 ovs_mutex_unlock(&dev
->mutex
);
1780 rte_xstats_len
= rte_eth_xstats_get(dev
->port_id
, NULL
, 0);
1781 if (rte_xstats_len
> 0) {
1782 rte_xstats
= dpdk_rte_mzalloc(sizeof(*rte_xstats
) * rte_xstats_len
);
1783 memset(rte_xstats
, 0xff, sizeof(*rte_xstats
) * rte_xstats_len
);
1784 rte_xstats_ret
= rte_eth_xstats_get(dev
->port_id
, rte_xstats
,
1786 if (rte_xstats_ret
> 0 && rte_xstats_ret
<= rte_xstats_len
) {
1787 netdev_dpdk_convert_xstats(stats
, rte_xstats
, rte_xstats_ret
);
1789 rte_free(rte_xstats
);
1791 VLOG_WARN("Can't get XSTATS counters for port: %i.", dev
->port_id
);
1794 stats
->rx_packets
= rte_stats
.ipackets
;
1795 stats
->tx_packets
= rte_stats
.opackets
;
1796 stats
->rx_bytes
= rte_stats
.ibytes
;
1797 stats
->tx_bytes
= rte_stats
.obytes
;
1798 /* DPDK counts imissed as errors, but count them here as dropped instead */
1799 stats
->rx_errors
= rte_stats
.ierrors
- rte_stats
.imissed
;
1800 stats
->tx_errors
= rte_stats
.oerrors
;
1801 stats
->multicast
= rte_stats
.imcasts
;
1803 rte_spinlock_lock(&dev
->stats_lock
);
1804 stats
->tx_dropped
= dev
->stats
.tx_dropped
;
1805 stats
->rx_dropped
= dev
->stats
.rx_dropped
;
1806 rte_spinlock_unlock(&dev
->stats_lock
);
1808 /* These are the available DPDK counters for packets not received due to
1809 * local resource constraints in DPDK and NIC respectively. */
1810 stats
->rx_dropped
+= rte_stats
.rx_nombuf
+ rte_stats
.imissed
;
1811 stats
->rx_missed_errors
= rte_stats
.imissed
;
1813 ovs_mutex_unlock(&dev
->mutex
);
1819 netdev_dpdk_get_features(const struct netdev
*netdev
,
1820 enum netdev_features
*current
,
1821 enum netdev_features
*advertised OVS_UNUSED
,
1822 enum netdev_features
*supported OVS_UNUSED
,
1823 enum netdev_features
*peer OVS_UNUSED
)
1825 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1826 struct rte_eth_link link
;
1828 ovs_mutex_lock(&dev
->mutex
);
1830 ovs_mutex_unlock(&dev
->mutex
);
1832 if (link
.link_duplex
== ETH_LINK_HALF_DUPLEX
) {
1833 if (link
.link_speed
== ETH_SPEED_NUM_10M
) {
1834 *current
= NETDEV_F_10MB_HD
;
1836 if (link
.link_speed
== ETH_SPEED_NUM_100M
) {
1837 *current
= NETDEV_F_100MB_HD
;
1839 if (link
.link_speed
== ETH_SPEED_NUM_1G
) {
1840 *current
= NETDEV_F_1GB_HD
;
1842 } else if (link
.link_duplex
== ETH_LINK_FULL_DUPLEX
) {
1843 if (link
.link_speed
== ETH_SPEED_NUM_10M
) {
1844 *current
= NETDEV_F_10MB_FD
;
1846 if (link
.link_speed
== ETH_SPEED_NUM_100M
) {
1847 *current
= NETDEV_F_100MB_FD
;
1849 if (link
.link_speed
== ETH_SPEED_NUM_1G
) {
1850 *current
= NETDEV_F_1GB_FD
;
1852 if (link
.link_speed
== ETH_SPEED_NUM_10G
) {
1853 *current
= NETDEV_F_10GB_FD
;
1857 if (link
.link_autoneg
) {
1858 *current
|= NETDEV_F_AUTONEG
;
1864 static struct ingress_policer
*
1865 netdev_dpdk_policer_construct(uint32_t rate
, uint32_t burst
)
1867 struct ingress_policer
*policer
= NULL
;
1868 uint64_t rate_bytes
;
1869 uint64_t burst_bytes
;
1872 policer
= xmalloc(sizeof *policer
);
1873 rte_spinlock_init(&policer
->policer_lock
);
1875 /* rte_meter requires bytes so convert kbits rate and burst to bytes. */
1876 rate_bytes
= rate
* 1000/8;
1877 burst_bytes
= burst
* 1000/8;
1879 policer
->app_srtcm_params
.cir
= rate_bytes
;
1880 policer
->app_srtcm_params
.cbs
= burst_bytes
;
1881 policer
->app_srtcm_params
.ebs
= 0;
1882 err
= rte_meter_srtcm_config(&policer
->in_policer
,
1883 &policer
->app_srtcm_params
);
1885 VLOG_ERR("Could not create rte meter for ingress policer");
1893 netdev_dpdk_set_policing(struct netdev
* netdev
, uint32_t policer_rate
,
1894 uint32_t policer_burst
)
1896 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1897 struct ingress_policer
*policer
;
1899 /* Force to 0 if no rate specified,
1900 * default to 8000 kbits if burst is 0,
1901 * else stick with user-specified value.
1903 policer_burst
= (!policer_rate
? 0
1904 : !policer_burst
? 8000
1907 ovs_mutex_lock(&dev
->mutex
);
1909 policer
= ovsrcu_get_protected(struct ingress_policer
*,
1910 &dev
->ingress_policer
);
1912 if (dev
->policer_rate
== policer_rate
&&
1913 dev
->policer_burst
== policer_burst
) {
1914 /* Assume that settings haven't changed since we last set them. */
1915 ovs_mutex_unlock(&dev
->mutex
);
1919 /* Destroy any existing ingress policer for the device if one exists */
1921 ovsrcu_postpone(free
, policer
);
1924 if (policer_rate
!= 0) {
1925 policer
= netdev_dpdk_policer_construct(policer_rate
, policer_burst
);
1929 ovsrcu_set(&dev
->ingress_policer
, policer
);
1930 dev
->policer_rate
= policer_rate
;
1931 dev
->policer_burst
= policer_burst
;
1932 ovs_mutex_unlock(&dev
->mutex
);
1938 netdev_dpdk_get_ifindex(const struct netdev
*netdev
)
1940 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1943 ovs_mutex_lock(&dev
->mutex
);
1944 ifindex
= dev
->port_id
;
1945 ovs_mutex_unlock(&dev
->mutex
);
1951 netdev_dpdk_get_carrier(const struct netdev
*netdev
, bool *carrier
)
1953 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1955 ovs_mutex_lock(&dev
->mutex
);
1956 check_link_status(dev
);
1957 *carrier
= dev
->link
.link_status
;
1959 ovs_mutex_unlock(&dev
->mutex
);
1965 netdev_dpdk_vhost_get_carrier(const struct netdev
*netdev
, bool *carrier
)
1967 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1968 struct virtio_net
*virtio_dev
= netdev_dpdk_get_virtio(dev
);
1970 ovs_mutex_lock(&dev
->mutex
);
1972 if (is_vhost_running(virtio_dev
)) {
1978 ovs_mutex_unlock(&dev
->mutex
);
1983 static long long int
1984 netdev_dpdk_get_carrier_resets(const struct netdev
*netdev
)
1986 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
1987 long long int carrier_resets
;
1989 ovs_mutex_lock(&dev
->mutex
);
1990 carrier_resets
= dev
->link_reset_cnt
;
1991 ovs_mutex_unlock(&dev
->mutex
);
1993 return carrier_resets
;
1997 netdev_dpdk_set_miimon(struct netdev
*netdev OVS_UNUSED
,
1998 long long int interval OVS_UNUSED
)
2004 netdev_dpdk_update_flags__(struct netdev_dpdk
*dev
,
2005 enum netdev_flags off
, enum netdev_flags on
,
2006 enum netdev_flags
*old_flagsp
)
2007 OVS_REQUIRES(dev
->mutex
)
2011 if ((off
| on
) & ~(NETDEV_UP
| NETDEV_PROMISC
)) {
2015 *old_flagsp
= dev
->flags
;
2019 if (dev
->flags
== *old_flagsp
) {
2023 if (dev
->type
== DPDK_DEV_ETH
) {
2024 if (dev
->flags
& NETDEV_UP
) {
2025 err
= rte_eth_dev_start(dev
->port_id
);
2030 if (dev
->flags
& NETDEV_PROMISC
) {
2031 rte_eth_promiscuous_enable(dev
->port_id
);
2034 if (!(dev
->flags
& NETDEV_UP
)) {
2035 rte_eth_dev_stop(dev
->port_id
);
2038 /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is
2039 * running then change netdev's change_seq to trigger link state
2041 struct virtio_net
*virtio_dev
= netdev_dpdk_get_virtio(dev
);
2043 if ((NETDEV_UP
& ((*old_flagsp
^ on
) | (*old_flagsp
^ off
)))
2044 && is_vhost_running(virtio_dev
)) {
2045 netdev_change_seq_changed(&dev
->up
);
2047 /* Clear statistics if device is getting up. */
2048 if (NETDEV_UP
& on
) {
2049 rte_spinlock_lock(&dev
->stats_lock
);
2050 memset(&dev
->stats
, 0, sizeof(dev
->stats
));
2051 rte_spinlock_unlock(&dev
->stats_lock
);
2060 netdev_dpdk_update_flags(struct netdev
*netdev
,
2061 enum netdev_flags off
, enum netdev_flags on
,
2062 enum netdev_flags
*old_flagsp
)
2064 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
2067 ovs_mutex_lock(&dev
->mutex
);
2068 error
= netdev_dpdk_update_flags__(dev
, off
, on
, old_flagsp
);
2069 ovs_mutex_unlock(&dev
->mutex
);
2075 netdev_dpdk_get_status(const struct netdev
*netdev
, struct smap
*args
)
2077 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
2078 struct rte_eth_dev_info dev_info
;
2080 if (dev
->port_id
< 0)
2083 ovs_mutex_lock(&dev
->mutex
);
2084 rte_eth_dev_info_get(dev
->port_id
, &dev_info
);
2085 ovs_mutex_unlock(&dev
->mutex
);
2087 smap_add_format(args
, "driver_name", "%s", dev_info
.driver_name
);
2089 smap_add_format(args
, "port_no", "%d", dev
->port_id
);
2090 smap_add_format(args
, "numa_id", "%d", rte_eth_dev_socket_id(dev
->port_id
));
2091 smap_add_format(args
, "driver_name", "%s", dev_info
.driver_name
);
2092 smap_add_format(args
, "min_rx_bufsize", "%u", dev_info
.min_rx_bufsize
);
2093 smap_add_format(args
, "max_rx_pktlen", "%u", dev
->max_packet_len
);
2094 smap_add_format(args
, "max_rx_queues", "%u", dev_info
.max_rx_queues
);
2095 smap_add_format(args
, "max_tx_queues", "%u", dev_info
.max_tx_queues
);
2096 smap_add_format(args
, "max_mac_addrs", "%u", dev_info
.max_mac_addrs
);
2097 smap_add_format(args
, "max_hash_mac_addrs", "%u", dev_info
.max_hash_mac_addrs
);
2098 smap_add_format(args
, "max_vfs", "%u", dev_info
.max_vfs
);
2099 smap_add_format(args
, "max_vmdq_pools", "%u", dev_info
.max_vmdq_pools
);
2101 if (dev_info
.pci_dev
) {
2102 smap_add_format(args
, "pci-vendor_id", "0x%u",
2103 dev_info
.pci_dev
->id
.vendor_id
);
2104 smap_add_format(args
, "pci-device_id", "0x%x",
2105 dev_info
.pci_dev
->id
.device_id
);
2112 netdev_dpdk_set_admin_state__(struct netdev_dpdk
*dev
, bool admin_state
)
2113 OVS_REQUIRES(dev
->mutex
)
2115 enum netdev_flags old_flags
;
2118 netdev_dpdk_update_flags__(dev
, 0, NETDEV_UP
, &old_flags
);
2120 netdev_dpdk_update_flags__(dev
, NETDEV_UP
, 0, &old_flags
);
2125 netdev_dpdk_set_admin_state(struct unixctl_conn
*conn
, int argc
,
2126 const char *argv
[], void *aux OVS_UNUSED
)
2130 if (!strcasecmp(argv
[argc
- 1], "up")) {
2132 } else if ( !strcasecmp(argv
[argc
- 1], "down")) {
2135 unixctl_command_reply_error(conn
, "Invalid Admin State");
2140 struct netdev
*netdev
= netdev_from_name(argv
[1]);
2141 if (netdev
&& is_dpdk_class(netdev
->netdev_class
)) {
2142 struct netdev_dpdk
*dpdk_dev
= netdev_dpdk_cast(netdev
);
2144 ovs_mutex_lock(&dpdk_dev
->mutex
);
2145 netdev_dpdk_set_admin_state__(dpdk_dev
, up
);
2146 ovs_mutex_unlock(&dpdk_dev
->mutex
);
2148 netdev_close(netdev
);
2150 unixctl_command_reply_error(conn
, "Not a DPDK Interface");
2151 netdev_close(netdev
);
2155 struct netdev_dpdk
*netdev
;
2157 ovs_mutex_lock(&dpdk_mutex
);
2158 LIST_FOR_EACH (netdev
, list_node
, &dpdk_list
) {
2159 ovs_mutex_lock(&netdev
->mutex
);
2160 netdev_dpdk_set_admin_state__(netdev
, up
);
2161 ovs_mutex_unlock(&netdev
->mutex
);
2163 ovs_mutex_unlock(&dpdk_mutex
);
2165 unixctl_command_reply(conn
, "OK");
2169 * Set virtqueue flags so that we do not receive interrupts.
2172 set_irq_status(struct virtio_net
*virtio_dev
)
2177 for (i
= 0; i
< virtio_dev
->virt_qp_nb
; i
++) {
2178 idx
= i
* VIRTIO_QNUM
;
2179 rte_vhost_enable_guest_notification(virtio_dev
, idx
+ VIRTIO_RXQ
, 0);
2180 rte_vhost_enable_guest_notification(virtio_dev
, idx
+ VIRTIO_TXQ
, 0);

/*
 * Fixes mapping for vhost-user tx queues. Must be called after each
 * enabling/disabling of queues and n_txq modifications.
 */
static void
netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int *enabled_queues, n_enabled = 0;
    int i, k, total_txqs = dev->up.n_txq;

    enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof *enabled_queues);

    for (i = 0; i < total_txqs; i++) {
        /* Enabled queues always mapped to themselves. */
        if (dev->tx_q[i].map == i) {
            enabled_queues[n_enabled++] = i;
        }
    }

    if (n_enabled == 0 && total_txqs != 0) {
        enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
        n_enabled = 1;
    }

    k = 0;
    for (i = 0; i < total_txqs; i++) {
        if (dev->tx_q[i].map != i) {
            dev->tx_q[i].map = enabled_queues[k];
            k = (k + 1) % n_enabled;
        }
    }

    VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);
    for (i = 0; i < total_txqs; i++) {
        VLOG_DBG("%2d --> %2d", i, dev->tx_q[i].map);
    }

    rte_free(enabled_queues);
}
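
/* Worked example (illustrative, not part of the original source): with
 * total_txqs == 4 and only queues 0 and 2 enabled by the guest, the
 * enabled list is {0, 2} and the round-robin pass rewrites the map to:
 *
 *     0 --> 0   (enabled, maps to itself)
 *     1 --> 0   (disabled, borrows queue 0)
 *     2 --> 2   (enabled, maps to itself)
 *     3 --> 2   (disabled, borrows queue 2)
 *
 * so every OVS tx qid still lands on a queue the guest actually polls. */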

/*
 * A new virtio-net device is added to a vhost port.
 */
static int
new_device(struct virtio_net *virtio_dev)
{
    struct netdev_dpdk *dev;
    bool exists = false;
    int newnode = 0;
    long err = 0;

    ovs_mutex_lock(&dpdk_mutex);
    /* Add device to the vhost port with the same name as that passed down. */
    LIST_FOR_EACH(dev, list_node, &dpdk_list) {
        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
            uint32_t qp_num = virtio_dev->virt_qp_nb;

            ovs_mutex_lock(&dev->mutex);
            /* Get NUMA information */
            err = get_mempolicy(&newnode, NULL, 0, virtio_dev,
                                MPOL_F_NODE | MPOL_F_ADDR);
            if (err) {
                VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
                          virtio_dev->ifname);
                newnode = dev->socket_id;
            }

            dev->requested_socket_id = newnode;
            dev->requested_n_rxq = qp_num;
            dev->requested_n_txq = qp_num;
            netdev_request_reconfigure(&dev->up);

            ovsrcu_set(&dev->virtio_dev, virtio_dev);
            exists = true;

            /* Disable notifications. */
            set_irq_status(virtio_dev);
            netdev_change_seq_changed(&dev->up);
            ovs_mutex_unlock(&dev->mutex);
            break;
        }
    }
    ovs_mutex_unlock(&dpdk_mutex);

    if (!exists) {
        VLOG_INFO("vHost Device '%s' %"PRIu64" can't be added - name not "
                  "found", virtio_dev->ifname, virtio_dev->device_fh);

        return -1;
    }

    VLOG_INFO("vHost Device '%s' %"PRIu64" has been added on numa node %i",
              virtio_dev->ifname, virtio_dev->device_fh, newnode);
    return 0;
}
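
/* Note on the NUMA lookup above (summary, not from the original source):
 * get_mempolicy() with MPOL_F_NODE | MPOL_F_ADDR reports the NUMA node
 * backing the page that 'virtio_dev' itself lives on, a reasonable proxy
 * for where the guest's virtio rings are mapped; requested_socket_id then
 * steers the mempool to that node on the next reconfigure. */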

/* Clears mapping for all available queues of vhost interface. */
static void
netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int i;

    for (i = 0; i < dev->up.n_txq; i++) {
        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
    }
}

/*
 * Remove a virtio-net device from the specific vhost port. Use dev->remove
 * flag to stop any more packets from being sent or received to/from a VM and
 * ensure all currently queued packets have been sent/received before removing
 * the device.
 */
static void
destroy_device(volatile struct virtio_net *virtio_dev)
{
    struct netdev_dpdk *dev;
    bool exists = false;

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        if (netdev_dpdk_get_virtio(dev) == virtio_dev) {

            ovs_mutex_lock(&dev->mutex);
            virtio_dev->flags &= ~VIRTIO_DEV_RUNNING;
            ovsrcu_set(&dev->virtio_dev, NULL);
            /* Clear tx/rx queue settings. */
            netdev_dpdk_txq_map_clear(dev);
            dev->requested_n_rxq = NR_QUEUE;
            dev->requested_n_txq = NR_QUEUE;
            netdev_request_reconfigure(&dev->up);

            netdev_change_seq_changed(&dev->up);
            ovs_mutex_unlock(&dev->mutex);
            exists = true;
            break;
        }
    }

    ovs_mutex_unlock(&dpdk_mutex);

    if (exists) {
        /*
         * Wait for other threads to quiesce after setting the 'virtio_dev'
         * to NULL, before returning.
         */
        ovsrcu_synchronize();
        /*
         * As call to ovsrcu_synchronize() will end the quiescent state,
         * put thread back into quiescent state before returning.
         */
        ovsrcu_quiesce_start();
        VLOG_INFO("vHost Device '%s' %"PRIu64" has been removed",
                  virtio_dev->ifname, virtio_dev->device_fh);
    } else {
        VLOG_INFO("vHost Device '%s' %"PRIu64" not found", virtio_dev->ifname,
                  virtio_dev->device_fh);
    }
}

static int
vring_state_changed(struct virtio_net *virtio_dev, uint16_t queue_id,
                    int enable)
{
    struct netdev_dpdk *dev;
    bool exists = false;
    int qid = queue_id / VIRTIO_QNUM;

    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
        return 0;
    }

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
            ovs_mutex_lock(&dev->mutex);
            if (enable) {
                dev->tx_q[qid].map = qid;
            } else {
                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
            }
            netdev_dpdk_remap_txqs(dev);
            exists = true;
            ovs_mutex_unlock(&dev->mutex);
            break;
        }
    }
    ovs_mutex_unlock(&dpdk_mutex);

    if (exists) {
        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' %"
                  PRIu64" changed to \'%s\'", queue_id, qid,
                  virtio_dev->ifname, virtio_dev->device_fh,
                  (enable == 1) ? "enabled" : "disabled");
    } else {
        VLOG_INFO("vHost Device '%s' %"PRIu64" not found", virtio_dev->ifname,
                  virtio_dev->device_fh);
        return -1;
    }

    return 0;
}

struct virtio_net *
netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
{
    return ovsrcu_get(struct virtio_net *, &dev->virtio_dev);
}

struct ingress_policer *
netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
{
    return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
}
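
/* Usage sketch (illustrative only): readers fetch the virtio handle via
 * RCU and may use it only until their next quiescent point, e.g.:
 *
 *     struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
 *     if (virtio_dev && (virtio_dev->flags & VIRTIO_DEV_RUNNING)) {
 *         ... burst rx/tx against virtio_dev ...
 *     }
 *
 * destroy_device() NULLs the pointer and then calls ovsrcu_synchronize(),
 * so the device is never torn down while such a reader still holds it. */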

/*
 * These callbacks allow virtio-net devices to be added to vhost ports when
 * configuration has been fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops =
{
    .new_device = new_device,
    .destroy_device = destroy_device,
    .vring_state_changed = vring_state_changed
};

static void *
start_vhost_loop(void *dummy OVS_UNUSED)
{
    pthread_detach(pthread_self());
    /* Put the cuse thread into quiescent state. */
    ovsrcu_quiesce_start();
    rte_vhost_driver_session_start();
    return NULL;
}

static int
dpdk_vhost_class_init(void)
{
    rte_vhost_driver_callback_register(&virtio_net_device_ops);
    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
                              | 1ULL << VIRTIO_NET_F_HOST_TSO6
                              | 1ULL << VIRTIO_NET_F_CSUM);

    ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
    return 0;
}

static int
dpdk_vhost_cuse_class_init(void)
{
    return 0;
}

static int
dpdk_vhost_user_class_init(void)
{
    return 0;
}

static void
dpdk_common_init(void)
{
    unixctl_command_register("netdev-dpdk/set-admin-state",
                             "[netdev] up|down", 1, 2,
                             netdev_dpdk_set_admin_state, NULL);
}
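
/* Example invocation (illustrative; assumes a DPDK port named "dpdk0"):
 *
 *     ovs-appctl netdev-dpdk/set-admin-state dpdk0 down
 *
 * With the optional [netdev] argument omitted, the requested state is
 * applied to every DPDK port:
 *
 *     ovs-appctl netdev-dpdk/set-admin-state up
 */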

static int
dpdk_ring_create(const char dev_name[], unsigned int port_no,
                 unsigned int *eth_port_id)
{
    struct dpdk_ring *ivshmem;
    char ring_name[RTE_RING_NAMESIZE];
    int err;

    ivshmem = dpdk_rte_mzalloc(sizeof *ivshmem);
    if (ivshmem == NULL) {
        return ENOMEM;
    }

    /* XXX: Add support for multiqueue ring. */
    err = snprintf(ring_name, sizeof(ring_name), "%s_tx", dev_name);
    if (err < 0) {
        return -err;
    }

    /* Create single producer tx ring, netdev does explicit locking. */
    ivshmem->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
                                        RING_F_SP_ENQ);
    if (ivshmem->cring_tx == NULL) {
        rte_free(ivshmem);
        return ENOMEM;
    }

    err = snprintf(ring_name, sizeof(ring_name), "%s_rx", dev_name);
    if (err < 0) {
        return -err;
    }

    /* Create single consumer rx ring, netdev does explicit locking. */
    ivshmem->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
                                        RING_F_SC_DEQ);
    if (ivshmem->cring_rx == NULL) {
        rte_free(ivshmem);
        return ENOMEM;
    }

    err = rte_eth_from_rings(dev_name, &ivshmem->cring_rx, 1,
                             &ivshmem->cring_tx, 1, SOCKET0);

    if (err < 0) {
        rte_free(ivshmem);
        return ENODEV;
    }

    ivshmem->user_port_id = port_no;
    ivshmem->eth_port_id = rte_eth_dev_count() - 1;
    ovs_list_push_back(&dpdk_ring_list, &ivshmem->list_node);

    *eth_port_id = ivshmem->eth_port_id;
    return 0;
}

static int
dpdk_ring_open(const char dev_name[], unsigned int *eth_port_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct dpdk_ring *ivshmem;
    unsigned int port_no;
    int err = 0;

    /* Names always start with "dpdkr". */
    err = dpdk_dev_parse_name(dev_name, "dpdkr", &port_no);
    if (err) {
        return err;
    }

    /* Look through our list to find the device. */
    LIST_FOR_EACH (ivshmem, list_node, &dpdk_ring_list) {
        if (ivshmem->user_port_id == port_no) {
            VLOG_INFO("Found dpdk ring device %s:", dev_name);
            *eth_port_id = ivshmem->eth_port_id; /* Really all that is needed. */
            return 0;
        }
    }
    /* Need to create the device rings. */
    return dpdk_ring_create(dev_name, port_no, eth_port_id);
}
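
/* Example (illustrative): adding a port of type dpdkr, e.g.
 *
 *     ovs-vsctl add-port br0 dpdkr0 -- set Interface dpdkr0 type=dpdkr
 *
 * parses "dpdkr0" to ring number 0; the first open creates the
 * "dpdkr0_tx" and "dpdkr0_rx" rings, and later opens find the entry in
 * dpdk_ring_list. */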

static int
netdev_dpdk_ring_send(struct netdev *netdev, int qid,
                      struct dp_packet_batch *batch, bool may_steal)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    unsigned i;

    /* When using 'dpdkr' and sending to a DPDK ring, we want to ensure that
     * the rss hash field is clear. This is because the same mbuf may be
     * modified by the consumer of the ring and returned into the datapath
     * without recalculating the RSS hash. */
    for (i = 0; i < batch->count; i++) {
        dp_packet_rss_invalidate(batch->packets[i]);
    }

    netdev_dpdk_send__(dev, qid, batch, may_steal);
    return 0;
}

static int
netdev_dpdk_ring_construct(struct netdev *netdev)
{
    unsigned int port_no = 0;
    int err = 0;

    if (rte_eal_init_ret) {
        return rte_eal_init_ret;
    }

    ovs_mutex_lock(&dpdk_mutex);

    err = dpdk_ring_open(netdev->name, &port_no);
    if (err) {
        goto unlock_dpdk;
    }

    err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_ETH);

unlock_dpdk:
    ovs_mutex_unlock(&dpdk_mutex);
    return err;
}

/*
 * Initialize QoS configuration operations.
 */
static void
qos_conf_init(struct qos_conf *conf, const struct dpdk_qos_ops *ops)
{
    conf->ops = ops;
}

/*
 * Search existing QoS operations in qos_ops and compare each set of
 * operations' qos_name to 'name'. Return a dpdk_qos_ops pointer to a match,
 * or NULL if no match is found.
 */
static const struct dpdk_qos_ops *
qos_lookup_name(const char *name)
{
    const struct dpdk_qos_ops *const *opsp;

    for (opsp = qos_confs; *opsp != NULL; opsp++) {
        const struct dpdk_qos_ops *ops = *opsp;
        if (!strcmp(name, ops->qos_name)) {
            return ops;
        }
    }
    return NULL;
}

/*
 * Call qos_destruct to clean up items associated with the netdev's
 * qos_conf. Set the netdev's qos_conf to NULL.
 */
static void
qos_delete_conf(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    rte_spinlock_lock(&dev->qos_lock);
    if (dev->qos_conf) {
        if (dev->qos_conf->ops->qos_destruct) {
            dev->qos_conf->ops->qos_destruct(netdev, dev->qos_conf);
        }
        dev->qos_conf = NULL;
    }
    rte_spinlock_unlock(&dev->qos_lock);
}

static int
netdev_dpdk_get_qos_types(const struct netdev *netdev OVS_UNUSED,
                          struct sset *types)
{
    const struct dpdk_qos_ops *const *opsp;

    for (opsp = qos_confs; *opsp != NULL; opsp++) {
        const struct dpdk_qos_ops *ops = *opsp;
        if (ops->qos_construct && ops->qos_name[0] != '\0') {
            sset_add(types, ops->qos_name);
        }
    }
    return 0;
}

static int
netdev_dpdk_get_qos(const struct netdev *netdev,
                    const char **typep, struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int error = 0;

    ovs_mutex_lock(&dev->mutex);
    if (dev->qos_conf) {
        *typep = dev->qos_conf->ops->qos_name;
        error = (dev->qos_conf->ops->qos_get
                 ? dev->qos_conf->ops->qos_get(netdev, details) : 0);
    }
    ovs_mutex_unlock(&dev->mutex);

    return error;
}

static int
netdev_dpdk_set_qos(struct netdev *netdev,
                    const char *type, const struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const struct dpdk_qos_ops *new_ops = NULL;
    int error = 0;

    /* If type is empty or unsupported then the current QoS configuration
     * for the dpdk-netdev can be destroyed. */
    new_ops = qos_lookup_name(type);

    if (type[0] == '\0' || !new_ops || !new_ops->qos_construct) {
        qos_delete_conf(netdev);
        return EOPNOTSUPP;
    }

    ovs_mutex_lock(&dev->mutex);

    if (dev->qos_conf) {
        if (new_ops == dev->qos_conf->ops) {
            error = new_ops->qos_set ? new_ops->qos_set(netdev, details) : 0;
        } else {
            /* Delete existing QoS configuration. */
            qos_delete_conf(netdev);
            ovs_assert(dev->qos_conf == NULL);

            /* Install new QoS configuration. */
            error = new_ops->qos_construct(netdev, details);
            ovs_assert((error == 0) == (dev->qos_conf != NULL));
        }
    } else {
        error = new_ops->qos_construct(netdev, details);
        ovs_assert((error == 0) == (dev->qos_conf != NULL));
    }

    ovs_mutex_unlock(&dev->mutex);
    return error;
}
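
/* To summarize netdev_dpdk_set_qos(): an empty or unknown 'type' tears the
 * current configuration down; the same type as the installed one is updated
 * in place via qos_set; a different type replaces the old configuration
 * with a freshly constructed one. */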

/* egress-policer details */

struct egress_policer {
    struct qos_conf qos_conf;
    struct rte_meter_srtcm_params app_srtcm_params;
    struct rte_meter_srtcm egress_meter;
};

static struct egress_policer *
egress_policer_get__(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    return CONTAINER_OF(dev->qos_conf, struct egress_policer, qos_conf);
}

static int
egress_policer_qos_construct(struct netdev *netdev,
                             const struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct egress_policer *policer;
    const char *cir_s;
    const char *cbs_s;
    int err = 0;

    rte_spinlock_lock(&dev->qos_lock);
    policer = xmalloc(sizeof *policer);
    qos_conf_init(&policer->qos_conf, &egress_policer_ops);
    dev->qos_conf = &policer->qos_conf;
    cir_s = smap_get(details, "cir");
    cbs_s = smap_get(details, "cbs");
    policer->app_srtcm_params.cir = cir_s ? strtoull(cir_s, NULL, 10) : 0;
    policer->app_srtcm_params.cbs = cbs_s ? strtoull(cbs_s, NULL, 10) : 0;
    policer->app_srtcm_params.ebs = 0;
    err = rte_meter_srtcm_config(&policer->egress_meter,
                                 &policer->app_srtcm_params);
    rte_spinlock_unlock(&dev->qos_lock);

    return err;
}

static void
egress_policer_qos_destruct(struct netdev *netdev OVS_UNUSED,
                            struct qos_conf *conf)
{
    struct egress_policer *policer = CONTAINER_OF(conf, struct egress_policer,
                                                  qos_conf);
    free(policer);
}

static int
egress_policer_qos_get(const struct netdev *netdev, struct smap *details)
{
    struct egress_policer *policer = egress_policer_get__(netdev);
    smap_add_format(details, "cir", "%llu",
                    1ULL * policer->app_srtcm_params.cir);
    smap_add_format(details, "cbs", "%llu",
                    1ULL * policer->app_srtcm_params.cbs);
    return 0;
}

static int
egress_policer_qos_set(struct netdev *netdev, const struct smap *details)
{
    struct egress_policer *policer;
    const char *cir_s;
    const char *cbs_s;
    int err = 0;

    policer = egress_policer_get__(netdev);
    cir_s = smap_get(details, "cir");
    cbs_s = smap_get(details, "cbs");
    policer->app_srtcm_params.cir = cir_s ? strtoull(cir_s, NULL, 10) : 0;
    policer->app_srtcm_params.cbs = cbs_s ? strtoull(cbs_s, NULL, 10) : 0;
    policer->app_srtcm_params.ebs = 0;
    err = rte_meter_srtcm_config(&policer->egress_meter,
                                 &policer->app_srtcm_params);

    return err;
}

static int
egress_policer_run(struct netdev *netdev, struct rte_mbuf **pkts, int pkt_cnt)
{
    int cnt = 0;
    struct egress_policer *policer = egress_policer_get__(netdev);

    cnt = netdev_dpdk_policer_run(&policer->egress_meter, pkts, pkt_cnt);

    return cnt;
}

static const struct dpdk_qos_ops egress_policer_ops = {
    "egress-policer",    /* qos_name */
    egress_policer_qos_construct,
    egress_policer_qos_destruct,
    egress_policer_qos_get,
    egress_policer_qos_set,
    egress_policer_run
};
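
/* Example configuration (illustrative; the port name and values are
 * arbitrary). 'cir' is in bytes/sec and 'cbs' in bytes, per the srTCM
 * meter, and both land in the smap handed to
 * egress_policer_qos_construct() above:
 *
 *     ovs-vsctl set port vhost-user0 qos=@newqos -- \
 *         --id=@newqos create qos type=egress-policer \
 *         other-config:cir=46000 other-config:cbs=2048
 */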

static int
netdev_dpdk_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int err = 0;

    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);

    if (netdev->n_txq == dev->requested_n_txq
        && netdev->n_rxq == dev->requested_n_rxq) {
        /* Reconfiguration is unnecessary */
        goto out;
    }

    rte_eth_dev_stop(dev->port_id);

    netdev->n_txq = dev->requested_n_txq;
    netdev->n_rxq = dev->requested_n_rxq;

    rte_free(dev->tx_q);
    err = dpdk_eth_dev_init(dev);
    netdev_dpdk_alloc_txq(dev, netdev->n_txq);

    dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;

out:
    ovs_mutex_unlock(&dev->mutex);
    ovs_mutex_unlock(&dpdk_mutex);

    return err;
}
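
/* Note (summary, not from the original source): requested_n_rxq/n_txq are
 * only staged by the callers above; the datapath is expected to notice the
 * netdev_request_reconfigure() sequence change, stop polling the port and
 * then invoke this function, so queues are never resized under an active
 * PMD thread. */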

static int
netdev_dpdk_vhost_user_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
    int err = 0;

    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);

    netdev->n_txq = dev->requested_n_txq;
    netdev->n_rxq = dev->requested_n_rxq;

    /* Enable TX queue 0 by default if it wasn't disabled. */
    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
        dev->tx_q[0].map = 0;
    }

    netdev_dpdk_remap_txqs(dev);

    if (dev->requested_socket_id != dev->socket_id) {
        dev->socket_id = dev->requested_socket_id;
        /* Change mempool to new NUMA node. */
        dpdk_mp_put(dev->dpdk_mp);
        dev->dpdk_mp = dpdk_mp_get(dev->socket_id, dev->mtu);
        if (!dev->dpdk_mp) {
            err = ENOMEM;
        }
    }

    if (virtio_dev) {
        virtio_dev->flags |= VIRTIO_DEV_RUNNING;
    }

    ovs_mutex_unlock(&dev->mutex);
    ovs_mutex_unlock(&dpdk_mutex);

    return err;
}

static int
netdev_dpdk_vhost_cuse_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);

    netdev->n_txq = dev->requested_n_txq;
    netdev->n_rxq = 1;

    ovs_mutex_unlock(&dev->mutex);
    ovs_mutex_unlock(&dpdk_mutex);

    return 0;
}

#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
                          SET_CONFIG, SET_TX_MULTIQ, SEND,    \
                          GET_CARRIER, GET_STATS,             \
                          GET_FEATURES, GET_STATUS,           \
                          RECONFIGURE, RXQ_RECV)              \
{                                                             \
    NAME,                                                     \
    true,                       /* is_pmd */                  \
    INIT,                       /* init */                    \
    NULL,                       /* netdev_dpdk_run */         \
    NULL,                       /* netdev_dpdk_wait */        \
                                                              \
    netdev_dpdk_alloc,                                        \
    CONSTRUCT,                                                \
    DESTRUCT,                                                 \
    netdev_dpdk_dealloc,                                      \
    netdev_dpdk_get_config,                                   \
    SET_CONFIG,                                               \
    NULL,                       /* get_tunnel_config */      \
    NULL,                       /* build header */            \
    NULL,                       /* push header */             \
    NULL,                       /* pop header */              \
    netdev_dpdk_get_numa_id,    /* get_numa_id */             \
    SET_TX_MULTIQ,                                            \
                                                              \
    SEND,                       /* send */                    \
    NULL,                       /* send_wait */               \
                                                              \
    netdev_dpdk_set_etheraddr,                                \
    netdev_dpdk_get_etheraddr,                                \
    netdev_dpdk_get_mtu,                                      \
    netdev_dpdk_set_mtu,                                      \
    netdev_dpdk_get_ifindex,                                  \
    GET_CARRIER,                                              \
    netdev_dpdk_get_carrier_resets,                           \
    netdev_dpdk_set_miimon,                                   \
    GET_STATS,                                                \
    GET_FEATURES,                                             \
    NULL,                       /* set_advertisements */      \
                                                              \
    netdev_dpdk_set_policing,                                 \
    netdev_dpdk_get_qos_types,                                \
    NULL,                       /* get_qos_capabilities */    \
    netdev_dpdk_get_qos,                                      \
    netdev_dpdk_set_qos,                                      \
    NULL,                       /* get_queue */               \
    NULL,                       /* set_queue */               \
    NULL,                       /* delete_queue */            \
    NULL,                       /* get_queue_stats */         \
    NULL,                       /* queue_dump_start */        \
    NULL,                       /* queue_dump_next */         \
    NULL,                       /* queue_dump_done */         \
    NULL,                       /* dump_queue_stats */        \
                                                              \
    NULL,                       /* set_in4 */                 \
    NULL,                       /* get_addr_list */           \
    NULL,                       /* add_router */              \
    NULL,                       /* get_next_hop */            \
    GET_STATUS,                                               \
    NULL,                       /* arp_lookup */              \
                                                              \
    netdev_dpdk_update_flags,                                 \
    RECONFIGURE,                                              \
                                                              \
    netdev_dpdk_rxq_alloc,                                    \
    netdev_dpdk_rxq_construct,                                \
    netdev_dpdk_rxq_destruct,                                 \
    netdev_dpdk_rxq_dealloc,                                  \
    RXQ_RECV,                                                 \
    NULL,                       /* rx_wait */                 \
    NULL,                       /* rxq_drain */               \
}

static int
process_vhost_flags(char *flag, char *default_val, int size,
                    const struct smap *ovs_other_config,
                    char **new_val)
{
    const char *val;
    int changed = 0;

    val = smap_get(ovs_other_config, flag);

    /* Depending on which version of vhost is in use, process the
     * vhost-specific flag if it is provided, otherwise fall back to the
     * default value.
     */
    if (val && (strlen(val) <= size)) {
        changed = 1;
        *new_val = xstrdup(val);
        VLOG_INFO("User-provided %s in use: %s", flag, *new_val);
    } else {
        VLOG_INFO("No %s provided - defaulting to %s", flag, default_val);
        *new_val = default_val;
    }

    return changed;
}

static char **
grow_argv(char ***argv, size_t cur_siz, size_t grow_by)
{
    return xrealloc(*argv, sizeof(char *) * (cur_siz + grow_by));
}

static void
dpdk_option_extend(char ***argv, int argc, const char *option,
                   const char *value)
{
    char **newargv = grow_argv(argv, argc, 2);
    *argv = newargv;
    newargv[argc] = xstrdup(option);
    newargv[argc+1] = xstrdup(value);
}

static char **
move_argv(char ***argv, size_t cur_size, char **src_argv, size_t src_argc)
{
    char **newargv = grow_argv(argv, cur_size, src_argc);
    while (src_argc--) {
        newargv[cur_size+src_argc] = src_argv[src_argc];
        src_argv[src_argc] = NULL;
    }
    return newargv;
}

static int
extra_dpdk_args(const char *ovs_extra_config, char ***argv, int argc)
{
    int ret = argc;
    char *release_tok = xstrdup(ovs_extra_config);
    char *tok = release_tok, *endptr = NULL;

    for (tok = strtok_r(release_tok, " ", &endptr); tok != NULL;
         tok = strtok_r(NULL, " ", &endptr)) {
        char **newarg = grow_argv(argv, ret, 1);
        *argv = newarg;
        newarg[ret++] = xstrdup(tok);
    }

    free(release_tok);
    return ret;
}
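
/* Example (illustrative): other_config:dpdk-extra="-n 4 --no-huge" is
 * split on spaces into the three tokens "-n", "4" and "--no-huge", each
 * appended to the EAL argv being built. */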

static bool
argv_contains(char **argv_haystack, const size_t argc_haystack,
              const char *needle)
{
    for (size_t i = 0; i < argc_haystack; ++i) {
        if (!strcmp(argv_haystack[i], needle)) {
            return true;
        }
    }
    return false;
}

static int
construct_dpdk_options(const struct smap *ovs_other_config,
                       char ***argv, const int initial_size,
                       char **extra_args, const size_t extra_argc)
{
    struct dpdk_options_map {
        const char *ovs_configuration;
        const char *dpdk_option;
        bool default_enabled;
        const char *default_value;
    } opts[] = {
        {"dpdk-lcore-mask", "-c", false, NULL},
        {"dpdk-hugepage-dir", "--huge-dir", false, NULL},
    };

    int i, ret = initial_size;

    /* First, construct from the flat-options (non-mutex). */
    for (i = 0; i < ARRAY_SIZE(opts); ++i) {
        const char *lookup = smap_get(ovs_other_config,
                                      opts[i].ovs_configuration);
        if (!lookup && opts[i].default_enabled) {
            lookup = opts[i].default_value;
        }

        if (lookup) {
            if (!argv_contains(extra_args, extra_argc, opts[i].dpdk_option)) {
                dpdk_option_extend(argv, ret, opts[i].dpdk_option, lookup);
                ret += 2;
            } else {
                VLOG_WARN("Ignoring database defined option '%s' due to "
                          "dpdk_extras config", opts[i].dpdk_option);
            }
        }
    }

    return ret;
}

#define MAX_DPDK_EXCL_OPTS 10

static int
construct_dpdk_mutex_options(const struct smap *ovs_other_config,
                             char ***argv, const int initial_size,
                             char **extra_args, const size_t extra_argc)
{
    struct dpdk_exclusive_options_map {
        const char *category;
        const char *ovs_dpdk_options[MAX_DPDK_EXCL_OPTS];
        const char *eal_dpdk_options[MAX_DPDK_EXCL_OPTS];
        const char *default_value;
        int default_option;
    } excl_opts[] = {
        {"memory type",
         {"dpdk-alloc-mem", "dpdk-socket-mem", NULL,},
         {"-m",             "--socket-mem",    NULL,},
         "1024,0", 1
        },
    };

    int i, ret = initial_size;
    for (i = 0; i < ARRAY_SIZE(excl_opts); ++i) {
        int found_opts = 0, scan, found_pos = -1;
        const char *found_value;
        struct dpdk_exclusive_options_map *popt = &excl_opts[i];

        for (scan = 0; scan < MAX_DPDK_EXCL_OPTS
                 && popt->ovs_dpdk_options[scan]; ++scan) {
            const char *lookup = smap_get(ovs_other_config,
                                          popt->ovs_dpdk_options[scan]);
            if (lookup && strlen(lookup)) {
                found_opts++;
                found_pos = scan;
                found_value = lookup;
            }
        }

        if (!found_opts) {
            if (popt->default_option) {
                found_pos = popt->default_option;
                found_value = popt->default_value;
            } else {
                continue;
            }
        }

        if (found_opts > 1) {
            VLOG_ERR("Multiple defined options for %s. Please check your"
                     " database settings and reconfigure if necessary.",
                     popt->category);
        }

        if (!argv_contains(extra_args, extra_argc,
                           popt->eal_dpdk_options[found_pos])) {
            dpdk_option_extend(argv, ret, popt->eal_dpdk_options[found_pos],
                               found_value);
            ret += 2;
        } else {
            VLOG_WARN("Ignoring database defined option '%s' due to "
                      "dpdk_extras config", popt->eal_dpdk_options[found_pos]);
        }
    }

    return ret;
}

static int
get_dpdk_args(const struct smap *ovs_other_config, char ***argv,
              int argc)
{
    const char *extra_configuration;
    char **extra_args = NULL;
    int i;
    size_t extra_argc = 0;

    extra_configuration = smap_get(ovs_other_config, "dpdk-extra");
    if (extra_configuration) {
        extra_argc = extra_dpdk_args(extra_configuration, &extra_args, 0);
    }

    i = construct_dpdk_options(ovs_other_config, argv, argc, extra_args,
                               extra_argc);
    i = construct_dpdk_mutex_options(ovs_other_config, argv, i, extra_args,
                                     extra_argc);

    if (extra_configuration) {
        *argv = move_argv(argv, i, extra_args, extra_argc);
    }

    return i + extra_argc;
}

static char **dpdk_argv;
static int dpdk_argc;

static void
deferred_argv_release(void)
{
    int result;
    for (result = 0; result < dpdk_argc; ++result) {
        free(dpdk_argv[result]);
    }

    free(dpdk_argv);
}

static void
dpdk_init__(const struct smap *ovs_other_config)
{
    char **argv = NULL;
    int result;
    int argc, argc_tmp;
    bool auto_determine = true;
    int err = 0;
    cpu_set_t cpuset;
#ifndef VHOST_CUSE
    char *sock_dir_subcomponent;
#endif

    if (!smap_get_bool(ovs_other_config, "dpdk-init", false)) {
        VLOG_INFO("DPDK Disabled - to change this requires a restart.\n");
        return;
    }

    VLOG_INFO("DPDK Enabled, initializing");

#ifdef VHOST_CUSE
    if (process_vhost_flags("cuse-dev-name", xstrdup("vhost-net"),
                            PATH_MAX, ovs_other_config, &cuse_dev_name)) {
#else
    if (process_vhost_flags("vhost-sock-dir", xstrdup(ovs_rundir()),
                            NAME_MAX, ovs_other_config,
                            &sock_dir_subcomponent)) {
        struct stat s;
        if (!strstr(sock_dir_subcomponent, "..")) {
            vhost_sock_dir = xasprintf("%s/%s", ovs_rundir(),
                                       sock_dir_subcomponent);

            err = stat(vhost_sock_dir, &s);
            if (err) {
                VLOG_ERR("vhost-user sock directory '%s' does not exist.",
                         vhost_sock_dir);
            }
        } else {
            vhost_sock_dir = xstrdup(ovs_rundir());
            VLOG_ERR("vhost-user sock directory request '%s/%s' has invalid "
                     "characters '..' - using %s instead.",
                     ovs_rundir(), sock_dir_subcomponent, ovs_rundir());
        }
        free(sock_dir_subcomponent);
    } else {
        vhost_sock_dir = sock_dir_subcomponent;
#endif
    }

    argv = grow_argv(&argv, 0, 1);
    argc = 1;
    argv[0] = xstrdup(ovs_get_program_name());
    argc_tmp = get_dpdk_args(ovs_other_config, &argv, argc);

    while (argc_tmp != argc) {
        if (!strcmp("-c", argv[argc]) || !strcmp("-l", argv[argc])) {
            auto_determine = false;
            break;
        }
        argc++;
    }
    argc = argc_tmp;

    /**
     * NOTE: This is an unsophisticated mechanism for determining the DPDK
     * lcore for the DPDK Master.
     */
    if (auto_determine) {
        int i;
        /* Get the main thread affinity */
        CPU_ZERO(&cpuset);
        err = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t),
                                     &cpuset);
        if (!err) {
            for (i = 0; i < CPU_SETSIZE; i++) {
                if (CPU_ISSET(i, &cpuset)) {
                    argv = grow_argv(&argv, argc, 2);
                    argv[argc++] = xstrdup("-c");
                    argv[argc++] = xasprintf("0x%08llX", (1ULL<<i));
                    i = CPU_SETSIZE;
                }
            }
        } else {
            VLOG_ERR("Thread getaffinity error %d. Using core 0x1", err);
            /* User did not set dpdk-lcore-mask and unable to get current
             * thread affinity - default to core 0x1 */
            argv = grow_argv(&argv, argc, 2);
            argv[argc++] = xstrdup("-c");
            argv[argc++] = xasprintf("0x%X", 1);
        }
    }

    argv = grow_argv(&argv, argc, 1);
    argv[argc] = NULL;

    optind = 1;

    if (VLOG_IS_INFO_ENABLED()) {
        struct ds eal_args;
        int opt;
        ds_init(&eal_args);
        ds_put_cstr(&eal_args, "EAL ARGS:");
        for (opt = 0; opt < argc; ++opt) {
            ds_put_cstr(&eal_args, " ");
            ds_put_cstr(&eal_args, argv[opt]);
        }
        VLOG_INFO("%s", ds_cstr_ro(&eal_args));
        ds_destroy(&eal_args);
    }

    /* Make sure things are initialized ... */
    result = rte_eal_init(argc, argv);
    if (result < 0) {
        ovs_abort(result, "Cannot init EAL");
    }

    /* Set the main thread affinity back to pre rte_eal_init() value */
    if (auto_determine && !err) {
        err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t),
                                     &cpuset);
        if (err) {
            VLOG_ERR("Thread setaffinity error %d", err);
        }
    }

    dpdk_argv = argv;
    dpdk_argc = argc;

    atexit(deferred_argv_release);

    rte_memzone_dump(stdout);
    rte_eal_init_ret = 0;

    /* We are called from the main thread here */
    RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;

    ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);

#ifdef VHOST_CUSE
    /* Register CUSE device to handle IOCTLs.
     * Unless otherwise specified, cuse_dev_name is set to vhost-net.
     */
    err = rte_vhost_driver_register(cuse_dev_name);

    if (err != 0) {
        VLOG_ERR("CUSE device setup failure.");
        return;
    }
#endif

    dpdk_vhost_class_init();

    /* Finally, register the dpdk classes */
    netdev_dpdk_register();
}
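
/* For reference, a typical resulting log line (illustrative values; the
 * lcore mask comes from the main thread's affinity and the socket-mem
 * default from construct_dpdk_mutex_options()):
 *
 *     EAL ARGS: ovs-vswitchd -c 0x00000001 --socket-mem 1024,0
 */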

void
dpdk_init(const struct smap *ovs_other_config)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;

    if (ovs_other_config && ovsthread_once_start(&once)) {
        dpdk_init__(ovs_other_config);
        ovsthread_once_done(&once);
    }
}

static const struct netdev_class dpdk_class =
    NETDEV_DPDK_CLASS(
        "dpdk",
        NULL,
        netdev_dpdk_construct,
        netdev_dpdk_destruct,
        netdev_dpdk_set_config,
        netdev_dpdk_set_tx_multiq,
        netdev_dpdk_eth_send,
        netdev_dpdk_get_carrier,
        netdev_dpdk_get_stats,
        netdev_dpdk_get_features,
        netdev_dpdk_get_status,
        netdev_dpdk_reconfigure,
        netdev_dpdk_rxq_recv);

static const struct netdev_class dpdk_ring_class =
    NETDEV_DPDK_CLASS(
        "dpdkr",
        NULL,
        netdev_dpdk_ring_construct,
        netdev_dpdk_destruct,
        netdev_dpdk_set_config,
        netdev_dpdk_set_tx_multiq,
        netdev_dpdk_ring_send,
        netdev_dpdk_get_carrier,
        netdev_dpdk_get_stats,
        netdev_dpdk_get_features,
        netdev_dpdk_get_status,
        netdev_dpdk_reconfigure,
        netdev_dpdk_rxq_recv);

static const struct netdev_class OVS_UNUSED dpdk_vhost_cuse_class =
    NETDEV_DPDK_CLASS(
        "dpdkvhostcuse",
        dpdk_vhost_cuse_class_init,
        netdev_dpdk_vhost_cuse_construct,
        netdev_dpdk_vhost_destruct,
        NULL,
        NULL,
        netdev_dpdk_vhost_send,
        netdev_dpdk_vhost_get_carrier,
        netdev_dpdk_vhost_get_stats,
        NULL,
        NULL,
        netdev_dpdk_vhost_cuse_reconfigure,
        netdev_dpdk_vhost_rxq_recv);

static const struct netdev_class OVS_UNUSED dpdk_vhost_user_class =
    NETDEV_DPDK_CLASS(
        "dpdkvhostuser",
        dpdk_vhost_user_class_init,
        netdev_dpdk_vhost_user_construct,
        netdev_dpdk_vhost_destruct,
        NULL,
        NULL,
        netdev_dpdk_vhost_send,
        netdev_dpdk_vhost_get_carrier,
        netdev_dpdk_vhost_get_stats,
        NULL,
        NULL,
        netdev_dpdk_vhost_user_reconfigure,
        netdev_dpdk_vhost_rxq_recv);

void
netdev_dpdk_register(void)
{
    dpdk_common_init();
    netdev_register_provider(&dpdk_class);
    netdev_register_provider(&dpdk_ring_class);
#ifdef VHOST_CUSE
    netdev_register_provider(&dpdk_vhost_cuse_class);
#else
    netdev_register_provider(&dpdk_vhost_user_class);
#endif
}

void
dpdk_set_lcore_id(unsigned cpu)
{
    /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
    ovs_assert(cpu != NON_PMD_CORE_ID);
    RTE_PER_LCORE(_lcore_id) = cpu;
}

bool
dpdk_thread_is_pmd(void)
{
    return rte_lcore_id() != NON_PMD_CORE_ID;
}