/*
 * Copyright (c) 2014 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
31 #include "dpif-netdev.h"
33 #include "netdev-dpdk.h"
34 #include "netdev-provider.h"
35 #include "netdev-vport.h"
37 #include "ofp-print.h"
39 #include "ovs-thread.h"
44 #include "unaligned.h"
49 VLOG_DEFINE_THIS_MODULE(dpdk
);
50 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
/* Seconds between link-state polls by the watchdog thread. */
#define DPDK_PORT_WATCHDOG_INTERVAL 5

#define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
#define OVS_VPORT_DPDK "ovs_dpdk"

/*
 * need to reserve tons of extra space in the mbufs so we can align the
 * DMA addresses to 4KB.
 */
#define MTU_TO_MAX_LEN(mtu)  ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
#define MBUF_SIZE(mtu)       (MTU_TO_MAX_LEN(mtu) + (512) + \
                             sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/* TODO: mempool size should be based on system resources. */
#define NB_MBUF              (4096 * 64)
#define MP_CACHE_SZ          (256 * 2)

/* Tx queue reserved for threads that are not PMD threads. */
#define NON_PMD_THREAD_TX_QUEUE 0

/* TODO: Needs per NIC value for these constants. */
#define RX_PTHRESH 32 /* Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 32 /* Default values of RX host threshold reg. */
#define RX_WTHRESH 16 /* Default values of RX write-back threshold reg. */

#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
#define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
#define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
82 static const struct rte_eth_conf port_conf
= {
84 .mq_mode
= ETH_MQ_RX_RSS
,
86 .header_split
= 0, /* Header Split disabled */
87 .hw_ip_checksum
= 0, /* IP checksum offload disabled */
88 .hw_vlan_filter
= 0, /* VLAN filtering disabled */
89 .jumbo_frame
= 0, /* Jumbo Frame Support disabled */
95 .rss_hf
= ETH_RSS_IPV4_TCP
| ETH_RSS_IPV4
| ETH_RSS_IPV6
,
99 .mq_mode
= ETH_MQ_TX_NONE
,
103 static const struct rte_eth_rxconf rx_conf
= {
105 .pthresh
= RX_PTHRESH
,
106 .hthresh
= RX_HTHRESH
,
107 .wthresh
= RX_WTHRESH
,
111 static const struct rte_eth_txconf tx_conf
= {
113 .pthresh
= TX_PTHRESH
,
114 .hthresh
= TX_HTHRESH
,
115 .wthresh
= TX_WTHRESH
,
121 enum { MAX_RX_QUEUE_LEN
= 64 };
122 enum { MAX_TX_QUEUE_LEN
= 64 };
123 enum { DRAIN_TSC
= 200000ULL };
125 static int rte_eal_init_ret
= ENODEV
;
127 static struct ovs_mutex dpdk_mutex
= OVS_MUTEX_INITIALIZER
;
129 /* Contains all 'struct dpdk_dev's. */
130 static struct list dpdk_list
OVS_GUARDED_BY(dpdk_mutex
)
131 = LIST_INITIALIZER(&dpdk_list
);
133 static struct list dpdk_mp_list
OVS_GUARDED_BY(dpdk_mutex
)
134 = LIST_INITIALIZER(&dpdk_mp_list
);
137 struct rte_mempool
*mp
;
141 struct list list_node
OVS_GUARDED_BY(dpdk_mutex
);
144 struct dpdk_tx_queue
{
145 rte_spinlock_t tx_lock
;
148 struct rte_mbuf
*burst_pkts
[MAX_TX_QUEUE_LEN
];
156 struct dpdk_tx_queue tx_q
[NR_QUEUE
];
158 struct ovs_mutex mutex
OVS_ACQ_AFTER(dpdk_mutex
);
160 struct dpdk_mp
*dpdk_mp
;
164 struct netdev_stats stats_offset
;
165 struct netdev_stats stats
;
167 uint8_t hwaddr
[ETH_ADDR_LEN
];
168 enum netdev_flags flags
;
170 struct rte_eth_link link
;
174 struct list list_node
OVS_GUARDED_BY(dpdk_mutex
);
177 struct netdev_rxq_dpdk
{
178 struct netdev_rxq up
;
/* Forward declaration: needed by is_dpdk_class() below. */
static int netdev_dpdk_construct(struct netdev *);
185 is_dpdk_class(const struct netdev_class
*class)
187 return class->construct
== netdev_dpdk_construct
;
190 /* TODO: use dpdk malloc for entire OVS. infact huge page shld be used
191 * for all other sengments data, bss and text. */
194 dpdk_rte_mzalloc(size_t sz
)
198 ptr
= rte_zmalloc(OVS_VPORT_DPDK
, sz
, OVS_CACHE_LINE_SIZE
);
206 free_dpdk_buf(struct ofpbuf
*b
)
208 struct rte_mbuf
*pkt
= (struct rte_mbuf
*) b
->dpdk_buf
;
210 rte_mempool_put(pkt
->pool
, pkt
);
214 __rte_pktmbuf_init(struct rte_mempool
*mp
,
215 void *opaque_arg OVS_UNUSED
,
217 unsigned i OVS_UNUSED
)
219 struct rte_mbuf
*m
= _m
;
220 uint32_t buf_len
= mp
->elt_size
- sizeof(struct ofpbuf
);
222 RTE_MBUF_ASSERT(mp
->elt_size
>= sizeof(struct ofpbuf
));
224 memset(m
, 0, mp
->elt_size
);
226 /* start of buffer is just after mbuf structure */
227 m
->buf_addr
= (char *)m
+ sizeof(struct ofpbuf
);
228 m
->buf_physaddr
= rte_mempool_virt2phy(mp
, m
) +
229 sizeof(struct ofpbuf
);
230 m
->buf_len
= (uint16_t)buf_len
;
232 /* keep some headroom between start of buffer and data */
233 m
->pkt
.data
= (char*) m
->buf_addr
+ RTE_MIN(RTE_PKTMBUF_HEADROOM
, m
->buf_len
);
235 /* init some constant fields */
236 m
->type
= RTE_MBUF_PKT
;
239 m
->pkt
.in_port
= 0xff;
243 ovs_rte_pktmbuf_init(struct rte_mempool
*mp
,
244 void *opaque_arg OVS_UNUSED
,
246 unsigned i OVS_UNUSED
)
248 struct rte_mbuf
*m
= _m
;
250 __rte_pktmbuf_init(mp
, opaque_arg
, _m
, i
);
252 ofpbuf_init_dpdk((struct ofpbuf
*) m
, m
->buf_len
);
255 static struct dpdk_mp
*
256 dpdk_mp_get(int socket_id
, int mtu
) OVS_REQUIRES(dpdk_mutex
)
258 struct dpdk_mp
*dmp
= NULL
;
259 char mp_name
[RTE_MEMPOOL_NAMESIZE
];
261 LIST_FOR_EACH (dmp
, list_node
, &dpdk_mp_list
) {
262 if (dmp
->socket_id
== socket_id
&& dmp
->mtu
== mtu
) {
268 dmp
= dpdk_rte_mzalloc(sizeof *dmp
);
269 dmp
->socket_id
= socket_id
;
273 snprintf(mp_name
, RTE_MEMPOOL_NAMESIZE
, "ovs_mp_%d", dmp
->mtu
);
274 dmp
->mp
= rte_mempool_create(mp_name
, NB_MBUF
, MBUF_SIZE(mtu
),
276 sizeof(struct rte_pktmbuf_pool_private
),
277 rte_pktmbuf_pool_init
, NULL
,
278 ovs_rte_pktmbuf_init
, NULL
,
281 if (dmp
->mp
== NULL
) {
285 list_push_back(&dpdk_mp_list
, &dmp
->list_node
);
290 dpdk_mp_put(struct dpdk_mp
*dmp
)
298 ovs_assert(dmp
->refcount
>= 0);
301 /* I could not find any API to destroy mp. */
302 if (dmp
->refcount
== 0) {
303 list_delete(dmp
->list_node
);
304 /* destroy mp-pool. */
310 check_link_status(struct netdev_dpdk
*dev
)
312 struct rte_eth_link link
;
314 rte_eth_link_get_nowait(dev
->port_id
, &link
);
316 if (dev
->link
.link_status
!= link
.link_status
) {
317 netdev_change_seq_changed(&dev
->up
);
319 dev
->link_reset_cnt
++;
321 if (dev
->link
.link_status
) {
322 VLOG_DBG_RL(&rl
, "Port %d Link Up - speed %u Mbps - %s",
323 dev
->port_id
, (unsigned)dev
->link
.link_speed
,
324 (dev
->link
.link_duplex
== ETH_LINK_FULL_DUPLEX
) ?
325 ("full-duplex") : ("half-duplex"));
327 VLOG_DBG_RL(&rl
, "Port %d Link Down", dev
->port_id
);
333 dpdk_watchdog(void *dummy OVS_UNUSED
)
335 struct netdev_dpdk
*dev
;
337 pthread_detach(pthread_self());
340 ovs_mutex_lock(&dpdk_mutex
);
341 LIST_FOR_EACH (dev
, list_node
, &dpdk_list
) {
342 ovs_mutex_lock(&dev
->mutex
);
343 check_link_status(dev
);
344 ovs_mutex_unlock(&dev
->mutex
);
346 ovs_mutex_unlock(&dpdk_mutex
);
347 xsleep(DPDK_PORT_WATCHDOG_INTERVAL
);
354 dpdk_eth_dev_init(struct netdev_dpdk
*dev
) OVS_REQUIRES(dpdk_mutex
)
356 struct rte_pktmbuf_pool_private
*mbp_priv
;
357 struct ether_addr eth_addr
;
361 if (dev
->port_id
< 0 || dev
->port_id
>= rte_eth_dev_count()) {
365 diag
= rte_eth_dev_configure(dev
->port_id
, NR_QUEUE
, NR_QUEUE
, &port_conf
);
367 VLOG_ERR("eth dev config error %d",diag
);
371 for (i
= 0; i
< NR_QUEUE
; i
++) {
372 diag
= rte_eth_tx_queue_setup(dev
->port_id
, i
, MAX_TX_QUEUE_LEN
,
373 dev
->socket_id
, &tx_conf
);
375 VLOG_ERR("eth dev tx queue setup error %d",diag
);
380 for (i
= 0; i
< NR_QUEUE
; i
++) {
381 diag
= rte_eth_rx_queue_setup(dev
->port_id
, i
, MAX_RX_QUEUE_LEN
,
383 &rx_conf
, dev
->dpdk_mp
->mp
);
385 VLOG_ERR("eth dev rx queue setup error %d",diag
);
390 diag
= rte_eth_dev_start(dev
->port_id
);
392 VLOG_ERR("eth dev start error %d",diag
);
396 rte_eth_promiscuous_enable(dev
->port_id
);
397 rte_eth_allmulticast_enable(dev
->port_id
);
399 memset(ð_addr
, 0x0, sizeof(eth_addr
));
400 rte_eth_macaddr_get(dev
->port_id
, ð_addr
);
401 VLOG_INFO_RL(&rl
, "Port %d: "ETH_ADDR_FMT
"",
402 dev
->port_id
, ETH_ADDR_ARGS(eth_addr
.addr_bytes
));
404 memcpy(dev
->hwaddr
, eth_addr
.addr_bytes
, ETH_ADDR_LEN
);
405 rte_eth_link_get_nowait(dev
->port_id
, &dev
->link
);
407 mbp_priv
= rte_mempool_get_priv(dev
->dpdk_mp
->mp
);
408 dev
->buf_size
= mbp_priv
->mbuf_data_room_size
- RTE_PKTMBUF_HEADROOM
;
410 dev
->flags
= NETDEV_UP
| NETDEV_PROMISC
;
414 static struct netdev_dpdk
*
415 netdev_dpdk_cast(const struct netdev
*netdev
)
417 return CONTAINER_OF(netdev
, struct netdev_dpdk
, up
);
420 static struct netdev
*
421 netdev_dpdk_alloc(void)
423 struct netdev_dpdk
*netdev
= dpdk_rte_mzalloc(sizeof *netdev
);
428 netdev_dpdk_construct(struct netdev
*netdev_
)
430 struct netdev_dpdk
*netdev
= netdev_dpdk_cast(netdev_
);
431 unsigned int port_no
;
436 if (rte_eal_init_ret
) {
437 return rte_eal_init_ret
;
440 ovs_mutex_lock(&dpdk_mutex
);
441 cport
= netdev_
->name
+ 4; /* Names always start with "dpdk" */
443 if (strncmp(netdev_
->name
, "dpdk", 4)) {
448 port_no
= strtol(cport
, 0, 0); /* string must be null terminated */
450 for (i
= 0; i
< NR_QUEUE
; i
++) {
451 rte_spinlock_init(&netdev
->tx_q
[i
].tx_lock
);
454 ovs_mutex_init(&netdev
->mutex
);
456 ovs_mutex_lock(&netdev
->mutex
);
459 netdev
->mtu
= ETHER_MTU
;
460 netdev
->max_packet_len
= MTU_TO_MAX_LEN(netdev
->mtu
);
462 /* TODO: need to discover device node at run time. */
463 netdev
->socket_id
= SOCKET0
;
464 netdev
->port_id
= port_no
;
466 netdev
->dpdk_mp
= dpdk_mp_get(netdev
->socket_id
, netdev
->mtu
);
467 if (!netdev
->dpdk_mp
) {
472 err
= dpdk_eth_dev_init(netdev
);
476 netdev_
->n_rxq
= NR_QUEUE
;
478 list_push_back(&dpdk_list
, &netdev
->list_node
);
481 ovs_mutex_unlock(&netdev
->mutex
);
483 ovs_mutex_unlock(&dpdk_mutex
);
488 netdev_dpdk_destruct(struct netdev
*netdev_
)
490 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev_
);
492 ovs_mutex_lock(&dev
->mutex
);
493 rte_eth_dev_stop(dev
->port_id
);
494 ovs_mutex_unlock(&dev
->mutex
);
496 ovs_mutex_lock(&dpdk_mutex
);
497 list_remove(&dev
->list_node
);
498 dpdk_mp_put(dev
->dpdk_mp
);
499 ovs_mutex_unlock(&dpdk_mutex
);
501 ovs_mutex_destroy(&dev
->mutex
);
/* netdev 'dealloc' callback: frees the memory from dpdk_rte_mzalloc(). */
static void
netdev_dpdk_dealloc(struct netdev *netdev_)
{
    struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);

    rte_free(netdev);
}
513 netdev_dpdk_get_config(const struct netdev
*netdev_
, struct smap
*args
)
515 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev_
);
517 ovs_mutex_lock(&dev
->mutex
);
519 /* TODO: Allow to configure number of queues. */
520 smap_add_format(args
, "configured_rx_queues", "%u", netdev_
->n_rxq
);
521 smap_add_format(args
, "configured_tx_queues", "%u", netdev_
->n_rxq
);
522 ovs_mutex_unlock(&dev
->mutex
);
527 static struct netdev_rxq
*
528 netdev_dpdk_rxq_alloc(void)
530 struct netdev_rxq_dpdk
*rx
= dpdk_rte_mzalloc(sizeof *rx
);
535 static struct netdev_rxq_dpdk
*
536 netdev_rxq_dpdk_cast(const struct netdev_rxq
*rx
)
538 return CONTAINER_OF(rx
, struct netdev_rxq_dpdk
, up
);
542 netdev_dpdk_rxq_construct(struct netdev_rxq
*rxq_
)
544 struct netdev_rxq_dpdk
*rx
= netdev_rxq_dpdk_cast(rxq_
);
545 struct netdev_dpdk
*netdev
= netdev_dpdk_cast(rx
->up
.netdev
);
547 ovs_mutex_lock(&netdev
->mutex
);
548 rx
->port_id
= netdev
->port_id
;
549 ovs_mutex_unlock(&netdev
->mutex
);
555 netdev_dpdk_rxq_destruct(struct netdev_rxq
*rxq_ OVS_UNUSED
)
/* rxq 'dealloc' callback: frees the wrapper allocated in rxq_alloc. */
static void
netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq_);

    rte_free(rx);
}
568 dpdk_queue_flush(struct netdev_dpdk
*dev
, int qid
)
570 struct dpdk_tx_queue
*txq
= &dev
->tx_q
[qid
];
573 if (txq
->count
== 0) {
576 rte_spinlock_lock(&txq
->tx_lock
);
577 nb_tx
= rte_eth_tx_burst(dev
->port_id
, qid
, txq
->burst_pkts
, txq
->count
);
578 if (nb_tx
!= txq
->count
) {
579 /* free buffers if we couldn't transmit packets */
580 rte_mempool_put_bulk(dev
->dpdk_mp
->mp
,
581 (void **) &txq
->burst_pkts
[nb_tx
],
582 (txq
->count
- nb_tx
));
585 rte_spinlock_unlock(&txq
->tx_lock
);
589 netdev_dpdk_rxq_recv(struct netdev_rxq
*rxq_
, struct ofpbuf
**packets
, int *c
)
591 struct netdev_rxq_dpdk
*rx
= netdev_rxq_dpdk_cast(rxq_
);
592 struct netdev
*netdev
= rx
->up
.netdev
;
593 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
596 dpdk_queue_flush(dev
, rxq_
->queue_id
);
598 nb_rx
= rte_eth_rx_burst(rx
->port_id
, rxq_
->queue_id
,
599 (struct rte_mbuf
**) packets
,
600 MIN((int)NETDEV_MAX_RX_BATCH
,
601 (int)MAX_RX_QUEUE_LEN
));
612 dpdk_queue_pkt(struct netdev_dpdk
*dev
, int qid
,
613 struct rte_mbuf
*pkt
)
615 struct dpdk_tx_queue
*txq
= &dev
->tx_q
[qid
];
620 rte_spinlock_lock(&txq
->tx_lock
);
621 txq
->burst_pkts
[txq
->count
++] = pkt
;
622 if (txq
->count
== MAX_TX_QUEUE_LEN
) {
625 cur_tsc
= rte_get_timer_cycles();
626 if (txq
->count
== 1) {
629 diff_tsc
= cur_tsc
- txq
->tsc
;
630 if (diff_tsc
>= DRAIN_TSC
) {
633 rte_spinlock_unlock(&txq
->tx_lock
);
637 nb_tx
= rte_eth_tx_burst(dev
->port_id
, qid
, txq
->burst_pkts
, txq
->count
);
638 if (nb_tx
!= txq
->count
) {
639 /* free buffers if we couldn't transmit packets */
640 rte_mempool_put_bulk(dev
->dpdk_mp
->mp
,
641 (void **) &txq
->burst_pkts
[nb_tx
],
642 (txq
->count
- nb_tx
));
645 rte_spinlock_unlock(&txq
->tx_lock
);
648 /* Tx function. Transmit packets indefinitely */
650 dpdk_do_tx_copy(struct netdev
*netdev
, char *buf
, int size
)
652 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
653 struct rte_mbuf
*pkt
;
655 pkt
= rte_pktmbuf_alloc(dev
->dpdk_mp
->mp
);
657 ovs_mutex_lock(&dev
->mutex
);
658 dev
->stats
.tx_dropped
++;
659 ovs_mutex_unlock(&dev
->mutex
);
663 /* We have to do a copy for now */
664 memcpy(pkt
->pkt
.data
, buf
, size
);
666 rte_pktmbuf_data_len(pkt
) = size
;
667 rte_pktmbuf_pkt_len(pkt
) = size
;
669 dpdk_queue_pkt(dev
, NON_PMD_THREAD_TX_QUEUE
, pkt
);
670 dpdk_queue_flush(dev
, NON_PMD_THREAD_TX_QUEUE
);
674 netdev_dpdk_send(struct netdev
*netdev
,
675 struct ofpbuf
*ofpbuf
, bool may_steal
)
677 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
680 if (ofpbuf_size(ofpbuf
) > dev
->max_packet_len
) {
681 VLOG_WARN_RL(&rl
, "Too big size %d max_packet_len %d",
682 (int)ofpbuf_size(ofpbuf
) , dev
->max_packet_len
);
684 ovs_mutex_lock(&dev
->mutex
);
685 dev
->stats
.tx_dropped
++;
686 ovs_mutex_unlock(&dev
->mutex
);
692 if (!may_steal
|| ofpbuf
->source
!= OFPBUF_DPDK
) {
693 dpdk_do_tx_copy(netdev
, (char *) ofpbuf_data(ofpbuf
), ofpbuf_size(ofpbuf
));
696 ofpbuf_delete(ofpbuf
);
701 qid
= rte_lcore_id() % NR_QUEUE
;
703 dpdk_queue_pkt(dev
, qid
, (struct rte_mbuf
*)ofpbuf
);
713 netdev_dpdk_set_etheraddr(struct netdev
*netdev
,
714 const uint8_t mac
[ETH_ADDR_LEN
])
716 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
718 ovs_mutex_lock(&dev
->mutex
);
719 if (!eth_addr_equals(dev
->hwaddr
, mac
)) {
720 memcpy(dev
->hwaddr
, mac
, ETH_ADDR_LEN
);
721 netdev_change_seq_changed(netdev
);
723 ovs_mutex_unlock(&dev
->mutex
);
729 netdev_dpdk_get_etheraddr(const struct netdev
*netdev
,
730 uint8_t mac
[ETH_ADDR_LEN
])
732 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
734 ovs_mutex_lock(&dev
->mutex
);
735 memcpy(mac
, dev
->hwaddr
, ETH_ADDR_LEN
);
736 ovs_mutex_unlock(&dev
->mutex
);
742 netdev_dpdk_get_mtu(const struct netdev
*netdev
, int *mtup
)
744 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
746 ovs_mutex_lock(&dev
->mutex
);
748 ovs_mutex_unlock(&dev
->mutex
);
754 netdev_dpdk_set_mtu(const struct netdev
*netdev
, int mtu
)
756 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
758 struct dpdk_mp
*old_mp
;
761 ovs_mutex_lock(&dpdk_mutex
);
762 ovs_mutex_lock(&dev
->mutex
);
763 if (dev
->mtu
== mtu
) {
768 mp
= dpdk_mp_get(dev
->socket_id
, dev
->mtu
);
774 rte_eth_dev_stop(dev
->port_id
);
777 old_mp
= dev
->dpdk_mp
;
780 dev
->max_packet_len
= MTU_TO_MAX_LEN(dev
->mtu
);
782 err
= dpdk_eth_dev_init(dev
);
787 dev
->dpdk_mp
= old_mp
;
788 dev
->max_packet_len
= MTU_TO_MAX_LEN(dev
->mtu
);
789 dpdk_eth_dev_init(dev
);
794 netdev_change_seq_changed(netdev
);
796 ovs_mutex_unlock(&dev
->mutex
);
797 ovs_mutex_unlock(&dpdk_mutex
);
/* Forward declaration: get_stats() below refreshes the carrier first. */
static int
netdev_dpdk_get_carrier(const struct netdev *netdev_, bool *carrier);
805 netdev_dpdk_get_stats(const struct netdev
*netdev
, struct netdev_stats
*stats
)
807 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
808 struct rte_eth_stats rte_stats
;
811 netdev_dpdk_get_carrier(netdev
, &gg
);
812 ovs_mutex_lock(&dev
->mutex
);
813 rte_eth_stats_get(dev
->port_id
, &rte_stats
);
815 *stats
= dev
->stats_offset
;
817 stats
->rx_packets
+= rte_stats
.ipackets
;
818 stats
->tx_packets
+= rte_stats
.opackets
;
819 stats
->rx_bytes
+= rte_stats
.ibytes
;
820 stats
->tx_bytes
+= rte_stats
.obytes
;
821 stats
->rx_errors
+= rte_stats
.ierrors
;
822 stats
->tx_errors
+= rte_stats
.oerrors
;
823 stats
->multicast
+= rte_stats
.imcasts
;
825 stats
->tx_dropped
+= dev
->stats
.tx_dropped
;
826 ovs_mutex_unlock(&dev
->mutex
);
832 netdev_dpdk_set_stats(struct netdev
*netdev
, const struct netdev_stats
*stats
)
834 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
836 ovs_mutex_lock(&dev
->mutex
);
837 dev
->stats_offset
= *stats
;
838 ovs_mutex_unlock(&dev
->mutex
);
844 netdev_dpdk_get_features(const struct netdev
*netdev_
,
845 enum netdev_features
*current
,
846 enum netdev_features
*advertised OVS_UNUSED
,
847 enum netdev_features
*supported OVS_UNUSED
,
848 enum netdev_features
*peer OVS_UNUSED
)
850 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev_
);
851 struct rte_eth_link link
;
853 ovs_mutex_lock(&dev
->mutex
);
855 ovs_mutex_unlock(&dev
->mutex
);
857 if (link
.link_duplex
== ETH_LINK_AUTONEG_DUPLEX
) {
858 if (link
.link_speed
== ETH_LINK_SPEED_AUTONEG
) {
859 *current
= NETDEV_F_AUTONEG
;
861 } else if (link
.link_duplex
== ETH_LINK_HALF_DUPLEX
) {
862 if (link
.link_speed
== ETH_LINK_SPEED_10
) {
863 *current
= NETDEV_F_10MB_HD
;
865 if (link
.link_speed
== ETH_LINK_SPEED_100
) {
866 *current
= NETDEV_F_100MB_HD
;
868 if (link
.link_speed
== ETH_LINK_SPEED_1000
) {
869 *current
= NETDEV_F_1GB_HD
;
871 } else if (link
.link_duplex
== ETH_LINK_FULL_DUPLEX
) {
872 if (link
.link_speed
== ETH_LINK_SPEED_10
) {
873 *current
= NETDEV_F_10MB_FD
;
875 if (link
.link_speed
== ETH_LINK_SPEED_100
) {
876 *current
= NETDEV_F_100MB_FD
;
878 if (link
.link_speed
== ETH_LINK_SPEED_1000
) {
879 *current
= NETDEV_F_1GB_FD
;
881 if (link
.link_speed
== ETH_LINK_SPEED_10000
) {
882 *current
= NETDEV_F_10GB_FD
;
890 netdev_dpdk_get_ifindex(const struct netdev
*netdev
)
892 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev
);
895 ovs_mutex_lock(&dev
->mutex
);
896 ifindex
= dev
->port_id
;
897 ovs_mutex_unlock(&dev
->mutex
);
903 netdev_dpdk_get_carrier(const struct netdev
*netdev_
, bool *carrier
)
905 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev_
);
907 ovs_mutex_lock(&dev
->mutex
);
908 check_link_status(dev
);
909 *carrier
= dev
->link
.link_status
;
910 ovs_mutex_unlock(&dev
->mutex
);
916 netdev_dpdk_get_carrier_resets(const struct netdev
*netdev_
)
918 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev_
);
919 long long int carrier_resets
;
921 ovs_mutex_lock(&dev
->mutex
);
922 carrier_resets
= dev
->link_reset_cnt
;
923 ovs_mutex_unlock(&dev
->mutex
);
925 return carrier_resets
;
929 netdev_dpdk_set_miimon(struct netdev
*netdev_ OVS_UNUSED
,
930 long long int interval OVS_UNUSED
)
936 netdev_dpdk_update_flags__(struct netdev_dpdk
*dev
,
937 enum netdev_flags off
, enum netdev_flags on
,
938 enum netdev_flags
*old_flagsp
)
939 OVS_REQUIRES(dev
->mutex
)
943 if ((off
| on
) & ~(NETDEV_UP
| NETDEV_PROMISC
)) {
947 *old_flagsp
= dev
->flags
;
951 if (dev
->flags
== *old_flagsp
) {
955 if (dev
->flags
& NETDEV_UP
) {
956 err
= rte_eth_dev_start(dev
->port_id
);
961 if (dev
->flags
& NETDEV_PROMISC
) {
962 rte_eth_promiscuous_enable(dev
->port_id
);
965 if (!(dev
->flags
& NETDEV_UP
)) {
966 rte_eth_dev_stop(dev
->port_id
);
973 netdev_dpdk_update_flags(struct netdev
*netdev_
,
974 enum netdev_flags off
, enum netdev_flags on
,
975 enum netdev_flags
*old_flagsp
)
977 struct netdev_dpdk
*netdev
= netdev_dpdk_cast(netdev_
);
980 ovs_mutex_lock(&netdev
->mutex
);
981 error
= netdev_dpdk_update_flags__(netdev
, off
, on
, old_flagsp
);
982 ovs_mutex_unlock(&netdev
->mutex
);
988 netdev_dpdk_get_status(const struct netdev
*netdev_
, struct smap
*args
)
990 struct netdev_dpdk
*dev
= netdev_dpdk_cast(netdev_
);
991 struct rte_eth_dev_info dev_info
;
993 if (dev
->port_id
<= 0)
996 ovs_mutex_lock(&dev
->mutex
);
997 rte_eth_dev_info_get(dev
->port_id
, &dev_info
);
998 ovs_mutex_unlock(&dev
->mutex
);
1000 smap_add_format(args
, "driver_name", "%s", dev_info
.driver_name
);
1002 smap_add_format(args
, "numa_id", "%d", rte_eth_dev_socket_id(dev
->port_id
));
1003 smap_add_format(args
, "driver_name", "%s", dev_info
.driver_name
);
1004 smap_add_format(args
, "min_rx_bufsize", "%u", dev_info
.min_rx_bufsize
);
1005 smap_add_format(args
, "max_rx_pktlen", "%u", dev_info
.max_rx_pktlen
);
1006 smap_add_format(args
, "max_rx_queues", "%u", dev_info
.max_rx_queues
);
1007 smap_add_format(args
, "max_tx_queues", "%u", dev_info
.max_tx_queues
);
1008 smap_add_format(args
, "max_mac_addrs", "%u", dev_info
.max_mac_addrs
);
1009 smap_add_format(args
, "max_hash_mac_addrs", "%u", dev_info
.max_hash_mac_addrs
);
1010 smap_add_format(args
, "max_vfs", "%u", dev_info
.max_vfs
);
1011 smap_add_format(args
, "max_vmdq_pools", "%u", dev_info
.max_vmdq_pools
);
1013 smap_add_format(args
, "pci-vendor_id", "0x%u", dev_info
.pci_dev
->id
.vendor_id
);
1014 smap_add_format(args
, "pci-device_id", "0x%x", dev_info
.pci_dev
->id
.device_id
);
1020 netdev_dpdk_set_admin_state__(struct netdev_dpdk
*dev
, bool admin_state
)
1021 OVS_REQUIRES(dev
->mutex
)
1023 enum netdev_flags old_flags
;
1026 netdev_dpdk_update_flags__(dev
, 0, NETDEV_UP
, &old_flags
);
1028 netdev_dpdk_update_flags__(dev
, NETDEV_UP
, 0, &old_flags
);
1033 netdev_dpdk_set_admin_state(struct unixctl_conn
*conn
, int argc
,
1034 const char *argv
[], void *aux OVS_UNUSED
)
1038 if (!strcasecmp(argv
[argc
- 1], "up")) {
1040 } else if ( !strcasecmp(argv
[argc
- 1], "down")) {
1043 unixctl_command_reply_error(conn
, "Invalid Admin State");
1048 struct netdev
*netdev
= netdev_from_name(argv
[1]);
1049 if (netdev
&& is_dpdk_class(netdev
->netdev_class
)) {
1050 struct netdev_dpdk
*dpdk_dev
= netdev_dpdk_cast(netdev
);
1052 ovs_mutex_lock(&dpdk_dev
->mutex
);
1053 netdev_dpdk_set_admin_state__(dpdk_dev
, up
);
1054 ovs_mutex_unlock(&dpdk_dev
->mutex
);
1056 netdev_close(netdev
);
1058 unixctl_command_reply_error(conn
, "Not a DPDK Interface");
1059 netdev_close(netdev
);
1063 struct netdev_dpdk
*netdev
;
1065 ovs_mutex_lock(&dpdk_mutex
);
1066 LIST_FOR_EACH (netdev
, list_node
, &dpdk_list
) {
1067 ovs_mutex_lock(&netdev
->mutex
);
1068 netdev_dpdk_set_admin_state__(netdev
, up
);
1069 ovs_mutex_unlock(&netdev
->mutex
);
1071 ovs_mutex_unlock(&dpdk_mutex
);
1073 unixctl_command_reply(conn
, "OK");
1077 dpdk_class_init(void)
1081 if (rte_eal_init_ret
) {
1085 result
= rte_pmd_init_all();
1087 VLOG_ERR("Cannot init PMD");
1091 result
= rte_eal_pci_probe();
1093 VLOG_ERR("Cannot probe PCI");
1097 if (rte_eth_dev_count() < 1) {
1098 VLOG_ERR("No Ethernet devices found. Try assigning ports to UIO.");
1101 VLOG_INFO("Ethernet Device Count: %d", (int)rte_eth_dev_count());
1103 list_init(&dpdk_list
);
1104 list_init(&dpdk_mp_list
);
1106 unixctl_command_register("netdev-dpdk/set-admin-state",
1107 "[netdev] up|down", 1, 2,
1108 netdev_dpdk_set_admin_state
, NULL
);
1110 ovs_thread_create("dpdk_watchdog", dpdk_watchdog
, NULL
);
1114 static struct netdev_class netdev_dpdk_class
= {
1116 dpdk_class_init
, /* init */
1117 NULL
, /* netdev_dpdk_run */
1118 NULL
, /* netdev_dpdk_wait */
1121 netdev_dpdk_construct
,
1122 netdev_dpdk_destruct
,
1123 netdev_dpdk_dealloc
,
1124 netdev_dpdk_get_config
,
1125 NULL
, /* netdev_dpdk_set_config */
1126 NULL
, /* get_tunnel_config */
1128 netdev_dpdk_send
, /* send */
1129 NULL
, /* send_wait */
1131 netdev_dpdk_set_etheraddr
,
1132 netdev_dpdk_get_etheraddr
,
1133 netdev_dpdk_get_mtu
,
1134 netdev_dpdk_set_mtu
,
1135 netdev_dpdk_get_ifindex
,
1136 netdev_dpdk_get_carrier
,
1137 netdev_dpdk_get_carrier_resets
,
1138 netdev_dpdk_set_miimon
,
1139 netdev_dpdk_get_stats
,
1140 netdev_dpdk_set_stats
,
1141 netdev_dpdk_get_features
,
1142 NULL
, /* set_advertisements */
1144 NULL
, /* set_policing */
1145 NULL
, /* get_qos_types */
1146 NULL
, /* get_qos_capabilities */
1149 NULL
, /* get_queue */
1150 NULL
, /* set_queue */
1151 NULL
, /* delete_queue */
1152 NULL
, /* get_queue_stats */
1153 NULL
, /* queue_dump_start */
1154 NULL
, /* queue_dump_next */
1155 NULL
, /* queue_dump_done */
1156 NULL
, /* dump_queue_stats */
1161 NULL
, /* add_router */
1162 NULL
, /* get_next_hop */
1163 netdev_dpdk_get_status
,
1164 NULL
, /* arp_lookup */
1166 netdev_dpdk_update_flags
,
1168 netdev_dpdk_rxq_alloc
,
1169 netdev_dpdk_rxq_construct
,
1170 netdev_dpdk_rxq_destruct
,
1171 netdev_dpdk_rxq_dealloc
,
1172 netdev_dpdk_rxq_recv
,
1173 NULL
, /* rxq_wait */
1174 NULL
, /* rxq_drain */
1178 dpdk_init(int argc
, char **argv
)
1182 if (argc
< 2 || strcmp(argv
[1], "--dpdk"))
1185 /* Make sure program name passed to rte_eal_init() is vswitchd. */
1191 /* Make sure things are initialized ... */
1192 result
= rte_eal_init(argc
, argv
);
1194 ovs_abort(result
, "Cannot init EAL\n");
1197 rte_eal_init_ret
= 0;
1200 argv
[result
] = argv
[0];
1206 netdev_dpdk_register(void)
1208 netdev_register_provider(&netdev_dpdk_class
);
1212 pmd_thread_setaffinity_cpu(int cpu
)
1218 CPU_SET(cpu
, &cpuset
);
1219 err
= pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t
), &cpuset
);
1221 VLOG_ERR("Thread affinity error %d",err
);
1224 RTE_PER_LCORE(_lcore_id
) = cpu
;