1/*
2 * Copyright (c) 2014 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18
19#include <string.h>
20#include <signal.h>
21#include <stdlib.h>
22#include <pthread.h>
23#include <config.h>
24#include <errno.h>
25#include <sched.h>
26#include <stdlib.h>
27#include <unistd.h>
7d1ced01 28#include <sys/stat.h>
8a9562d2 29#include <stdio.h>
30#include <sys/types.h>
31#include <sys/stat.h>
8a9562d2 32
7d1ced01 33#include "dirs.h"
e14deea0 34#include "dp-packet.h"
35#include "dpif-netdev.h"
36#include "list.h"
37#include "netdev-dpdk.h"
38#include "netdev-provider.h"
39#include "netdev-vport.h"
40#include "odp-util.h"
41#include "ofp-print.h"
94143fc4 42#include "ovs-numa.h"
43#include "ovs-thread.h"
44#include "ovs-rcu.h"
45#include "packets.h"
46#include "shash.h"
47#include "sset.h"
48#include "unaligned.h"
49#include "timeval.h"
50#include "unixctl.h"
e6211adc 51#include "openvswitch/vlog.h"
8a9562d2 52
53#include "rte_config.h"
54#include "rte_mbuf.h"
58397e6c 55#include "rte_virtio_net.h"
b8e57534 56
57VLOG_DEFINE_THIS_MODULE(dpdk);
58static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
59
60#define DPDK_PORT_WATCHDOG_INTERVAL 5
61
62#define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
63#define OVS_VPORT_DPDK "ovs_dpdk"
64
 65/*
 66 * We need to reserve extra space in the mbufs so that we can align the
 67 * DMA addresses to 4KB.
 68 */
69
70#define MTU_TO_MAX_LEN(mtu) ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
71#define MBUF_SIZE(mtu) (MTU_TO_MAX_LEN(mtu) + (512) + \
72 sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
73
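/* Worked example (illustrative only, not in the original file): assuming
 * DPDK's usual 14 byte Ethernet header and 4 byte CRC, a 1500 byte MTU maps
 * to a 1518 byte maximum frame; MBUF_SIZE(1500) then adds the 512 bytes of
 * alignment slack, the rte_mbuf header and RTE_PKTMBUF_HEADROOM on top. */
#if 0
BUILD_ASSERT_DECL(MTU_TO_MAX_LEN(1500) == 1518);
#endif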
74/* Max and min number of packets in the mempool. OVS tries to allocate a
75 * mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
76 * enough hugepages) we keep halving the number until the allocation succeeds
77 * or we reach MIN_NB_MBUF */
78
79#define MAX_NB_MBUF (4096 * 64)
80#define MIN_NB_MBUF (4096 * 4)
81#define MP_CACHE_SZ RTE_MEMPOOL_CACHE_MAX_SIZE
82
83/* MAX_NB_MBUF can be divided by 2 many times, until MIN_NB_MBUF */
84BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF) == 0);
85
86/* The smallest possible NB_MBUF that we're going to try should be a multiple
87 * of MP_CACHE_SZ. This is advised by DPDK documentation. */
88BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))
89 % MP_CACHE_SZ == 0);
90
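/* For example (illustrative only): MAX_NB_MBUF is 4096 * 64 = 262144, so the
 * allocation loop in dpdk_mp_get() retries with 131072, 65536 and 32768 mbufs
 * before bottoming out at MIN_NB_MBUF (4096 * 4 = 16384). */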
91#define SOCKET0 0
92
 93#define NIC_PORT_RX_Q_SIZE 2048 /* Size of Physical NIC RX Queue, Max (n+32<=4096) */
 94#define NIC_PORT_TX_Q_SIZE 2048 /* Size of Physical NIC TX Queue, Max (n+32<=4096) */
95
96char *cuse_dev_name = NULL; /* Character device cuse_dev_name. */
97char *vhost_sock_dir = NULL; /* Location of vhost-user sockets */
58397e6c 98
99/*
 100 * Maximum amount of time in microseconds to try and enqueue to vhost.
101 */
102#define VHOST_ENQ_RETRY_USECS 100
103
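/* Illustrative arithmetic (assumes a 2 GHz timer, which is not stated in the
 * original file): __netdev_dpdk_vhost_send() converts this budget to timer
 * cycles as VHOST_ENQ_RETRY_USECS * rte_get_timer_hz() / 1E6, i.e. roughly
 * 100 * 2e9 / 1e6 = 200000 cycles before it gives up on a stalled virtqueue. */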
8a9562d2 104static const struct rte_eth_conf port_conf = {
105 .rxmode = {
106 .mq_mode = ETH_MQ_RX_RSS,
107 .split_hdr_size = 0,
108 .header_split = 0, /* Header Split disabled */
109 .hw_ip_checksum = 0, /* IP checksum offload disabled */
110 .hw_vlan_filter = 0, /* VLAN filtering disabled */
111 .jumbo_frame = 0, /* Jumbo Frame Support disabled */
112 .hw_strip_crc = 0,
113 },
114 .rx_adv_conf = {
115 .rss_conf = {
116 .rss_key = NULL,
543342a4 117 .rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
8a9562d2 118 },
119 },
120 .txmode = {
121 .mq_mode = ETH_MQ_TX_NONE,
122 },
123};
124
3a100265 125enum { MAX_TX_QUEUE_LEN = 384 };
126enum { DPDK_RING_SIZE = 256 };
127BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
128enum { DRAIN_TSC = 200000ULL };
129
130enum dpdk_dev_type {
131 DPDK_DEV_ETH = 0,
7d1ced01 132 DPDK_DEV_VHOST = 1,
133};
134
135static int rte_eal_init_ret = ENODEV;
136
137static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;
138
139/* Contains all 'struct dpdk_dev's. */
ca6ba700 140static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
55951e15 141 = OVS_LIST_INITIALIZER(&dpdk_list);
8a9562d2 142
ca6ba700 143static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mutex)
55951e15 144 = OVS_LIST_INITIALIZER(&dpdk_mp_list);
8a9562d2 145
 146/* This mutex must be used by non-pmd threads when allocating or freeing
 147 * mbufs through mempools. Since dpdk_queue_pkts() and dpdk_queue_flush() may
 148 * use mempools, a non-pmd thread should hold this mutex while calling them. */
bce01e3a 149static struct ovs_mutex nonpmd_mempool_mutex = OVS_MUTEX_INITIALIZER;
db73f716 150
151struct dpdk_mp {
152 struct rte_mempool *mp;
153 int mtu;
154 int socket_id;
155 int refcount;
ca6ba700 156 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
157};
158
159/* There should be one 'struct dpdk_tx_queue' created for
160 * each cpu core. */
8a9562d2 161struct dpdk_tx_queue {
 162 bool flush_tx; /* Set to true to flush the queue every time
 163 * packets are queued. */
8a9562d2 164 int count;
165 rte_spinlock_t tx_lock; /* Protects the members and the NIC queue
166 * from concurrent access. It is used only
167 * if the queue is shared among different
168 * pmd threads (see 'txq_needs_locking'). */
169 uint64_t tsc;
170 struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
171};
172
95fb793a 173/* DPDK has no way to remove DPDK ring ethernet devices,
 174 * so we have to keep them around once they've been created. */
 175
176
ca6ba700 177static struct ovs_list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
55951e15 178 = OVS_LIST_INITIALIZER(&dpdk_ring_list);
95fb793a 179
180struct dpdk_ring {
181 /* For the client rings */
182 struct rte_ring *cring_tx;
183 struct rte_ring *cring_rx;
184 int user_port_id; /* User given port no, parsed from port name */
185 int eth_port_id; /* ethernet device port id */
ca6ba700 186 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
95fb793a 187};
188
189struct netdev_dpdk {
190 struct netdev up;
191 int port_id;
192 int max_packet_len;
58397e6c 193 enum dpdk_dev_type type;
8a9562d2 194
5a034064 195 struct dpdk_tx_queue *tx_q;
196
197 struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
198
199 struct dpdk_mp *dpdk_mp;
200 int mtu;
201 int socket_id;
202 int buf_size;
8a9562d2 203 struct netdev_stats stats;
204 /* Protects stats */
205 rte_spinlock_t stats_lock;
206
207 uint8_t hwaddr[ETH_ADDR_LEN];
208 enum netdev_flags flags;
209
210 struct rte_eth_link link;
211 int link_reset_cnt;
212
213 /* The user might request more txqs than the NIC has. We remap those
214 * ('up.n_txq') on these ('real_n_txq').
215 * If the numbers match, 'txq_needs_locking' is false, otherwise it is
216 * true and we will take a spinlock on transmission */
217 int real_n_txq;
218 bool txq_needs_locking;
219
220 /* Spinlock for vhost transmission. Other DPDK devices use spinlocks in
221 * dpdk_tx_queue */
222 rte_spinlock_t vhost_tx_lock;
223
224 /* virtio-net structure for vhost device */
225 OVSRCU_TYPE(struct virtio_net *) virtio_dev;
226
227 /* Identifier used to distinguish vhost devices from each other */
228 char vhost_id[PATH_MAX];
229
8a9562d2 230 /* In dpdk_list. */
ca6ba700 231 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
232};
233
234struct netdev_rxq_dpdk {
235 struct netdev_rxq up;
236 int port_id;
237};
238
239static bool thread_is_pmd(void);
240
241static int netdev_dpdk_construct(struct netdev *);
242
243struct virtio_net * netdev_dpdk_get_virtio(const struct netdev_dpdk *dev);
244
245static bool
246is_dpdk_class(const struct netdev_class *class)
247{
248 return class->construct == netdev_dpdk_construct;
249}
250
 251/* XXX: use dpdk malloc for the entire OVS. In fact, huge pages should be
 252 * used for all the other segments: data, bss and text. */
253
254static void *
255dpdk_rte_mzalloc(size_t sz)
256{
257 void *ptr;
258
259 ptr = rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);
260 if (ptr == NULL) {
261 out_of_memory();
262 }
263 return ptr;
264}
265
266/* XXX this function should be called only by pmd threads (or by non pmd
267 * threads holding the nonpmd_mempool_mutex) */
8a9562d2 268void
e14deea0 269free_dpdk_buf(struct dp_packet *p)
8a9562d2 270{
db73f716 271 struct rte_mbuf *pkt = (struct rte_mbuf *) p;
8a9562d2 272
db73f716 273 rte_pktmbuf_free_seg(pkt);
274}
275
276static void
277__rte_pktmbuf_init(struct rte_mempool *mp,
278 void *opaque_arg OVS_UNUSED,
279 void *_m,
280 unsigned i OVS_UNUSED)
281{
282 struct rte_mbuf *m = _m;
e14deea0 283 uint32_t buf_len = mp->elt_size - sizeof(struct dp_packet);
b3cd9f9d 284
e14deea0 285 RTE_MBUF_ASSERT(mp->elt_size >= sizeof(struct dp_packet));
286
287 memset(m, 0, mp->elt_size);
288
289 /* start of buffer is just after mbuf structure */
e14deea0 290 m->buf_addr = (char *)m + sizeof(struct dp_packet);
b3cd9f9d 291 m->buf_physaddr = rte_mempool_virt2phy(mp, m) +
e14deea0 292 sizeof(struct dp_packet);
293 m->buf_len = (uint16_t)buf_len;
294
295 /* keep some headroom between start of buffer and data */
b8e57534 296 m->data_off = RTE_MIN(RTE_PKTMBUF_HEADROOM, m->buf_len);
297
298 /* init some constant fields */
b3cd9f9d 299 m->pool = mp;
300 m->nb_segs = 1;
301 m->port = 0xff;
302}
303
304static void
305ovs_rte_pktmbuf_init(struct rte_mempool *mp,
306 void *opaque_arg OVS_UNUSED,
307 void *_m,
308 unsigned i OVS_UNUSED)
309{
310 struct rte_mbuf *m = _m;
311
312 __rte_pktmbuf_init(mp, opaque_arg, _m, i);
313
cf62fa4c 314 dp_packet_init_dpdk((struct dp_packet *) m, m->buf_len);
315}
316
317static struct dpdk_mp *
318dpdk_mp_get(int socket_id, int mtu) OVS_REQUIRES(dpdk_mutex)
319{
320 struct dpdk_mp *dmp = NULL;
321 char mp_name[RTE_MEMPOOL_NAMESIZE];
da79ce2b 322 unsigned mp_size;
323
324 LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
325 if (dmp->socket_id == socket_id && dmp->mtu == mtu) {
326 dmp->refcount++;
327 return dmp;
328 }
329 }
330
331 dmp = dpdk_rte_mzalloc(sizeof *dmp);
332 dmp->socket_id = socket_id;
333 dmp->mtu = mtu;
334 dmp->refcount = 1;
335
336 mp_size = MAX_NB_MBUF;
337 do {
338 if (snprintf(mp_name, RTE_MEMPOOL_NAMESIZE, "ovs_mp_%d_%d_%u",
339 dmp->mtu, dmp->socket_id, mp_size) < 0) {
340 return NULL;
341 }
95fb793a 342
343 dmp->mp = rte_mempool_create(mp_name, mp_size, MBUF_SIZE(mtu),
344 MP_CACHE_SZ,
345 sizeof(struct rte_pktmbuf_pool_private),
346 rte_pktmbuf_pool_init, NULL,
347 ovs_rte_pktmbuf_init, NULL,
348 socket_id, 0);
349 } while (!dmp->mp && rte_errno == ENOMEM && (mp_size /= 2) >= MIN_NB_MBUF);
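    /* Illustrative example (not in the original source): a mempool for an MTU
     * of 1500 on NUMA socket 0 that succeeds on the first try is named
     * "ovs_mp_1500_0_262144"; every halving retry bakes the smaller mbuf
     * count into the name instead. */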
350
351 if (dmp->mp == NULL) {
352 return NULL;
353 } else {
354 VLOG_DBG("Allocated \"%s\" mempool with %u mbufs", mp_name, mp_size );
355 }
356
357 list_push_back(&dpdk_mp_list, &dmp->list_node);
358 return dmp;
359}
360
361static void
362dpdk_mp_put(struct dpdk_mp *dmp)
363{
364
365 if (!dmp) {
366 return;
367 }
368
369 dmp->refcount--;
370 ovs_assert(dmp->refcount >= 0);
371
372#if 0
373 /* I could not find any API to destroy mp. */
374 if (dmp->refcount == 0) {
375 list_delete(dmp->list_node);
376 /* destroy mp-pool. */
377 }
378#endif
379}
380
381static void
382check_link_status(struct netdev_dpdk *dev)
383{
384 struct rte_eth_link link;
385
386 rte_eth_link_get_nowait(dev->port_id, &link);
387
388 if (dev->link.link_status != link.link_status) {
3e912ffc 389 netdev_change_seq_changed(&dev->up);
8a9562d2
PS
390
391 dev->link_reset_cnt++;
392 dev->link = link;
393 if (dev->link.link_status) {
394 VLOG_DBG_RL(&rl, "Port %d Link Up - speed %u Mbps - %s",
395 dev->port_id, (unsigned)dev->link.link_speed,
396 (dev->link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
397 ("full-duplex") : ("half-duplex"));
398 } else {
399 VLOG_DBG_RL(&rl, "Port %d Link Down", dev->port_id);
400 }
401 }
402}
403
404static void *
405dpdk_watchdog(void *dummy OVS_UNUSED)
406{
407 struct netdev_dpdk *dev;
408
409 pthread_detach(pthread_self());
410
411 for (;;) {
412 ovs_mutex_lock(&dpdk_mutex);
413 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
414 ovs_mutex_lock(&dev->mutex);
415 check_link_status(dev);
416 ovs_mutex_unlock(&dev->mutex);
417 }
418 ovs_mutex_unlock(&dpdk_mutex);
419 xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
420 }
421
422 return NULL;
423}
424
425static int
426dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
427{
428 struct rte_pktmbuf_pool_private *mbp_priv;
a0cb2d66 429 struct rte_eth_dev_info info;
8a9562d2
PS
430 struct ether_addr eth_addr;
431 int diag;
432 int i;
433
434 if (dev->port_id < 0 || dev->port_id >= rte_eth_dev_count()) {
95fb793a 435 return ENODEV;
8a9562d2
PS
436 }
437
a0cb2d66
DDP
438 rte_eth_dev_info_get(dev->port_id, &info);
439 dev->up.n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
440 dev->real_n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
441
442 diag = rte_eth_dev_configure(dev->port_id, dev->up.n_rxq, dev->real_n_txq,
5496878c 443 &port_conf);
8a9562d2 444 if (diag) {
a0cb2d66
DDP
445 VLOG_ERR("eth dev config error %d. rxq:%d txq:%d", diag, dev->up.n_rxq,
446 dev->real_n_txq);
95fb793a 447 return -diag;
8a9562d2
PS
448 }
449
a0cb2d66 450 for (i = 0; i < dev->real_n_txq; i++) {
79f5354c 451 diag = rte_eth_tx_queue_setup(dev->port_id, i, NIC_PORT_TX_Q_SIZE,
9154f798 452 dev->socket_id, NULL);
8a9562d2
PS
453 if (diag) {
454 VLOG_ERR("eth dev tx queue setup error %d",diag);
95fb793a 455 return -diag;
8a9562d2
PS
456 }
457 }
458
5496878c 459 for (i = 0; i < dev->up.n_rxq; i++) {
79f5354c 460 diag = rte_eth_rx_queue_setup(dev->port_id, i, NIC_PORT_RX_Q_SIZE,
d221ffa1 461 dev->socket_id,
9154f798 462 NULL, dev->dpdk_mp->mp);
8a9562d2
PS
463 if (diag) {
464 VLOG_ERR("eth dev rx queue setup error %d",diag);
95fb793a 465 return -diag;
8a9562d2
PS
466 }
467 }
468
469 diag = rte_eth_dev_start(dev->port_id);
470 if (diag) {
471 VLOG_ERR("eth dev start error %d",diag);
95fb793a 472 return -diag;
8a9562d2
PS
473 }
474
475 rte_eth_promiscuous_enable(dev->port_id);
476 rte_eth_allmulticast_enable(dev->port_id);
477
478 memset(&eth_addr, 0x0, sizeof(eth_addr));
479 rte_eth_macaddr_get(dev->port_id, &eth_addr);
480 VLOG_INFO_RL(&rl, "Port %d: "ETH_ADDR_FMT"",
481 dev->port_id, ETH_ADDR_ARGS(eth_addr.addr_bytes));
482
483 memcpy(dev->hwaddr, eth_addr.addr_bytes, ETH_ADDR_LEN);
484 rte_eth_link_get_nowait(dev->port_id, &dev->link);
485
486 mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
487 dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;
488
489 dev->flags = NETDEV_UP | NETDEV_PROMISC;
490 return 0;
491}
492
493static struct netdev_dpdk *
494netdev_dpdk_cast(const struct netdev *netdev)
495{
496 return CONTAINER_OF(netdev, struct netdev_dpdk, up);
497}
498
499static struct netdev *
500netdev_dpdk_alloc(void)
501{
502 struct netdev_dpdk *netdev = dpdk_rte_mzalloc(sizeof *netdev);
503 return &netdev->up;
504}
505
5a034064 506static void
91968eb0 507netdev_dpdk_alloc_txq(struct netdev_dpdk *netdev, unsigned int n_txqs)
5a034064 508{
bd5131ba 509 unsigned i;
510
511 netdev->tx_q = dpdk_rte_mzalloc(n_txqs * sizeof *netdev->tx_q);
512 for (i = 0; i < n_txqs; i++) {
ba0358a1 513 int numa_id = ovs_numa_get_numa_id(i);
94143fc4 514
515 if (!netdev->txq_needs_locking) {
516 /* Each index is considered as a cpu core id, since there should
517 * be one tx queue for each cpu core. If the corresponding core
 518 * is not on the same numa node as 'netdev', flag
519 * 'flush_tx'. */
520 netdev->tx_q[i].flush_tx = netdev->socket_id == numa_id;
521 } else {
522 /* Queues are shared among CPUs. Always flush */
523 netdev->tx_q[i].flush_tx = true;
524 }
525 rte_spinlock_init(&netdev->tx_q[i].tx_lock);
526 }
527}
528
8a9562d2 529static int
58397e6c
KT
530netdev_dpdk_init(struct netdev *netdev_, unsigned int port_no,
531 enum dpdk_dev_type type)
5a034064 532 OVS_REQUIRES(dpdk_mutex)
8a9562d2
PS
533{
534 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
1b7a04e0 535 int sid;
95fb793a 536 int err = 0;
8a9562d2 537
95fb793a 538 ovs_mutex_init(&netdev->mutex);
95fb793a 539 ovs_mutex_lock(&netdev->mutex);
8a9562d2 540
45d947c4
DDP
541 rte_spinlock_init(&netdev->stats_lock);
542
1b7a04e0
AW
543 /* If the 'sid' is negative, it means that the kernel fails
544 * to obtain the pci numa info. In that situation, always
545 * use 'SOCKET0'. */
58397e6c
KT
546 if (type == DPDK_DEV_ETH) {
547 sid = rte_eth_dev_socket_id(port_no);
548 } else {
549 sid = rte_lcore_to_socket_id(rte_get_master_lcore());
550 }
551
1b7a04e0 552 netdev->socket_id = sid < 0 ? SOCKET0 : sid;
95fb793a 553 netdev->port_id = port_no;
58397e6c 554 netdev->type = type;
8a9562d2 555 netdev->flags = 0;
8a9562d2
PS
556 netdev->mtu = ETHER_MTU;
557 netdev->max_packet_len = MTU_TO_MAX_LEN(netdev->mtu);
558
8a9562d2
PS
559 netdev->dpdk_mp = dpdk_mp_get(netdev->socket_id, netdev->mtu);
560 if (!netdev->dpdk_mp) {
561 err = ENOMEM;
95fb793a 562 goto unlock;
8a9562d2
PS
563 }
564
5496878c
AW
565 netdev_->n_txq = NR_QUEUE;
566 netdev_->n_rxq = NR_QUEUE;
a0cb2d66 567 netdev->real_n_txq = NR_QUEUE;
58397e6c
KT
568
569 if (type == DPDK_DEV_ETH) {
1b99bb05
MG
570 netdev_dpdk_alloc_txq(netdev, NR_QUEUE);
571 err = dpdk_eth_dev_init(netdev);
572 if (err) {
573 goto unlock;
574 }
8a9562d2 575 }
8a9562d2
PS
576
577 list_push_back(&dpdk_list, &netdev->list_node);
578
95fb793a 579unlock:
5a034064
AW
580 if (err) {
581 rte_free(netdev->tx_q);
582 }
8a9562d2 583 ovs_mutex_unlock(&netdev->mutex);
95fb793a 584 return err;
585}
586
587static int
588dpdk_dev_parse_name(const char dev_name[], const char prefix[],
589 unsigned int *port_no)
590{
591 const char *cport;
592
593 if (strncmp(dev_name, prefix, strlen(prefix))) {
594 return ENODEV;
595 }
596
597 cport = dev_name + strlen(prefix);
bce01e3a 598 *port_no = strtol(cport, NULL, 0); /* string must be null terminated */
95fb793a 599 return 0;
600}
601
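/* Usage sketch (illustrative only, not part of the original file): parsing a
 * netdev name against its class prefix. */
#if 0
static void
dpdk_dev_parse_name_example(void)
{
    unsigned int port_no;

    /* "dpdk7" matches the "dpdk" prefix and yields user port number 7. */
    ovs_assert(!dpdk_dev_parse_name("dpdk7", "dpdk", &port_no));
    ovs_assert(port_no == 7);

    /* A name that does not start with the prefix is rejected. */
    ovs_assert(dpdk_dev_parse_name("eth0", "dpdk", &port_no) == ENODEV);
}
#endif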
58397e6c 602static int
7d1ced01 603vhost_construct_helper(struct netdev *netdev_)
58397e6c 604{
a0cb2d66 605 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
606
607 if (rte_eal_init_ret) {
608 return rte_eal_init_ret;
609 }
610
611 rte_spinlock_init(&netdev->vhost_tx_lock);
612 return netdev_dpdk_init(netdev_, -1, DPDK_DEV_VHOST);
613}
614
615static int
616netdev_dpdk_vhost_cuse_construct(struct netdev *netdev_)
617{
618 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
619 int err;
620
58397e6c 621 ovs_mutex_lock(&dpdk_mutex);
622 strncpy(netdev->vhost_id, netdev->up.name, sizeof(netdev->vhost_id));
623 err = vhost_construct_helper(netdev_);
58397e6c 624 ovs_mutex_unlock(&dpdk_mutex);
625 return err;
626}
58397e6c 627
628static int
629netdev_dpdk_vhost_user_construct(struct netdev *netdev_)
630{
631 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
632 int err;
a0cb2d66 633
634 ovs_mutex_lock(&dpdk_mutex);
635 /* Take the name of the vhost-user port and append it to the location where
636 * the socket is to be created, then register the socket.
637 */
638 snprintf(netdev->vhost_id, sizeof(netdev->vhost_id), "%s/%s",
639 vhost_sock_dir, netdev_->name);
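    /* For example (illustrative assumption): with vhost_sock_dir set to
     * "/usr/local/var/run/openvswitch" and a port named "vhost-user-1", the
     * socket path built above is
     * "/usr/local/var/run/openvswitch/vhost-user-1". */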
640 err = rte_vhost_driver_register(netdev->vhost_id);
641 if (err) {
642 VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
643 netdev->vhost_id);
644 }
645 VLOG_INFO("Socket %s created for vhost-user port %s\n", netdev->vhost_id, netdev_->name);
646 err = vhost_construct_helper(netdev_);
647 ovs_mutex_unlock(&dpdk_mutex);
648 return err;
649}
650
95fb793a 651static int
652netdev_dpdk_construct(struct netdev *netdev)
653{
654 unsigned int port_no;
655 int err;
656
657 if (rte_eal_init_ret) {
658 return rte_eal_init_ret;
659 }
660
661 /* Names always start with "dpdk" */
662 err = dpdk_dev_parse_name(netdev->name, "dpdk", &port_no);
663 if (err) {
664 return err;
665 }
666
667 ovs_mutex_lock(&dpdk_mutex);
58397e6c 668 err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_ETH);
8a9562d2
PS
669 ovs_mutex_unlock(&dpdk_mutex);
670 return err;
671}
672
673static void
674netdev_dpdk_destruct(struct netdev *netdev_)
675{
676 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
677
678 ovs_mutex_lock(&dev->mutex);
679 rte_eth_dev_stop(dev->port_id);
680 ovs_mutex_unlock(&dev->mutex);
681
682 ovs_mutex_lock(&dpdk_mutex);
5a034064 683 rte_free(dev->tx_q);
8a9562d2
PS
684 list_remove(&dev->list_node);
685 dpdk_mp_put(dev->dpdk_mp);
686 ovs_mutex_unlock(&dpdk_mutex);
58397e6c 687}
8a9562d2 688
58397e6c
KT
689static void
690netdev_dpdk_vhost_destruct(struct netdev *netdev_)
691{
692 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
693
694 /* Can't remove a port while a guest is attached to it. */
695 if (netdev_dpdk_get_virtio(dev) != NULL) {
696 VLOG_ERR("Can not remove port, vhost device still attached");
697 return;
698 }
699
700 ovs_mutex_lock(&dpdk_mutex);
701 list_remove(&dev->list_node);
702 dpdk_mp_put(dev->dpdk_mp);
703 ovs_mutex_unlock(&dpdk_mutex);
8a9562d2
PS
704}
705
706static void
707netdev_dpdk_dealloc(struct netdev *netdev_)
708{
709 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
710
711 rte_free(netdev);
712}
713
714static int
715netdev_dpdk_get_config(const struct netdev *netdev_, struct smap *args)
716{
717 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
718
719 ovs_mutex_lock(&dev->mutex);
720
47659062 721 smap_add_format(args, "configured_rx_queues", "%d", netdev_->n_rxq);
a0cb2d66
DDP
722 smap_add_format(args, "requested_tx_queues", "%d", netdev_->n_txq);
723 smap_add_format(args, "configured_tx_queues", "%d", dev->real_n_txq);
8a9562d2
PS
724 ovs_mutex_unlock(&dev->mutex);
725
726 return 0;
727}
728
7dec44fe
AW
729static int
730netdev_dpdk_get_numa_id(const struct netdev *netdev_)
731{
732 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
733
734 return netdev->socket_id;
735}
736
5496878c
AW
737/* Sets the number of tx queues and rx queues for the dpdk interface.
738 * If the configuration fails, do not try restoring its old configuration
739 * and just returns the error. */
740static int
741netdev_dpdk_set_multiq(struct netdev *netdev_, unsigned int n_txq,
742 unsigned int n_rxq)
743{
744 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
745 int err = 0;
746
747 if (netdev->up.n_txq == n_txq && netdev->up.n_rxq == n_rxq) {
748 return err;
749 }
750
b7ccaf67 751 ovs_mutex_lock(&dpdk_mutex);
5496878c 752 ovs_mutex_lock(&netdev->mutex);
91968eb0 753
5496878c 754 rte_eth_dev_stop(netdev->port_id);
91968eb0 755
5496878c
AW
756 netdev->up.n_txq = n_txq;
757 netdev->up.n_rxq = n_rxq;
58397e6c 758
91968eb0 759 rte_free(netdev->tx_q);
5496878c 760 err = dpdk_eth_dev_init(netdev);
a0cb2d66
DDP
761 netdev_dpdk_alloc_txq(netdev, netdev->real_n_txq);
762
763 netdev->txq_needs_locking = netdev->real_n_txq != netdev->up.n_txq;
91968eb0 764
5496878c 765 ovs_mutex_unlock(&netdev->mutex);
b7ccaf67 766 ovs_mutex_unlock(&dpdk_mutex);
5496878c
AW
767
768 return err;
769}
770
58397e6c
KT
771static int
772netdev_dpdk_vhost_set_multiq(struct netdev *netdev_, unsigned int n_txq,
a0cb2d66 773 unsigned int n_rxq)
58397e6c
KT
774{
775 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
776 int err = 0;
777
778 if (netdev->up.n_txq == n_txq && netdev->up.n_rxq == n_rxq) {
779 return err;
780 }
781
782 ovs_mutex_lock(&dpdk_mutex);
783 ovs_mutex_lock(&netdev->mutex);
784
785 netdev->up.n_txq = n_txq;
a0cb2d66
DDP
786 netdev->real_n_txq = 1;
787 netdev->up.n_rxq = 1;
58397e6c
KT
788
789 ovs_mutex_unlock(&netdev->mutex);
790 ovs_mutex_unlock(&dpdk_mutex);
791
792 return err;
793}
794
8a9562d2
PS
795static struct netdev_rxq *
796netdev_dpdk_rxq_alloc(void)
797{
798 struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);
799
800 return &rx->up;
801}
802
803static struct netdev_rxq_dpdk *
804netdev_rxq_dpdk_cast(const struct netdev_rxq *rx)
805{
806 return CONTAINER_OF(rx, struct netdev_rxq_dpdk, up);
807}
808
809static int
810netdev_dpdk_rxq_construct(struct netdev_rxq *rxq_)
811{
812 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq_);
813 struct netdev_dpdk *netdev = netdev_dpdk_cast(rx->up.netdev);
814
815 ovs_mutex_lock(&netdev->mutex);
816 rx->port_id = netdev->port_id;
817 ovs_mutex_unlock(&netdev->mutex);
818
819 return 0;
820}
821
822static void
823netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq_ OVS_UNUSED)
824{
825}
826
827static void
828netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq_)
829{
830 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq_);
831
832 rte_free(rx);
833}
834
b170db2a
RW
835static inline void
836dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
8a9562d2
PS
837{
838 struct dpdk_tx_queue *txq = &dev->tx_q[qid];
1304f1f8
DDP
839 uint32_t nb_tx = 0;
840
841 while (nb_tx != txq->count) {
842 uint32_t ret;
843
844 ret = rte_eth_tx_burst(dev->port_id, qid, txq->burst_pkts + nb_tx,
845 txq->count - nb_tx);
846 if (!ret) {
847 break;
848 }
849
850 nb_tx += ret;
851 }
8a9562d2 852
b170db2a 853 if (OVS_UNLIKELY(nb_tx != txq->count)) {
db73f716
DDP
854 /* free buffers, which we couldn't transmit, one at a time (each
855 * packet could come from a different mempool) */
856 int i;
857
858 for (i = nb_tx; i < txq->count; i++) {
859 rte_pktmbuf_free_seg(txq->burst_pkts[i]);
860 }
45d947c4 861 rte_spinlock_lock(&dev->stats_lock);
1304f1f8 862 dev->stats.tx_dropped += txq->count-nb_tx;
45d947c4 863 rte_spinlock_unlock(&dev->stats_lock);
8a9562d2 864 }
1304f1f8 865
8a9562d2 866 txq->count = 0;
844f2d74 867 txq->tsc = rte_get_timer_cycles();
b170db2a
RW
868}
869
870static inline void
871dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
872{
873 struct dpdk_tx_queue *txq = &dev->tx_q[qid];
874
875 if (txq->count == 0) {
876 return;
877 }
b170db2a 878 dpdk_queue_flush__(dev, qid);
8a9562d2
PS
879}
880
58397e6c
KT
881static bool
882is_vhost_running(struct virtio_net *dev)
883{
884 return (dev != NULL && (dev->flags & VIRTIO_DEV_RUNNING));
885}
886
887/*
888 * The receive path for the vhost port is the TX path out from guest.
889 */
890static int
891netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq_,
892 struct dp_packet **packets, int *c)
893{
894 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq_);
895 struct netdev *netdev = rx->up.netdev;
896 struct netdev_dpdk *vhost_dev = netdev_dpdk_cast(netdev);
897 struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(vhost_dev);
898 int qid = 1;
899 uint16_t nb_rx = 0;
900
901 if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) {
902 return EAGAIN;
903 }
904
905 nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid,
906 vhost_dev->dpdk_mp->mp,
907 (struct rte_mbuf **)packets,
cd159f1a 908 NETDEV_MAX_BURST);
58397e6c
KT
909 if (!nb_rx) {
910 return EAGAIN;
911 }
912
45d947c4 913 rte_spinlock_lock(&vhost_dev->stats_lock);
58397e6c 914 vhost_dev->stats.rx_packets += (uint64_t)nb_rx;
45d947c4
DDP
915 rte_spinlock_unlock(&vhost_dev->stats_lock);
916
58397e6c
KT
917 *c = (int) nb_rx;
918 return 0;
919}
920
8a9562d2 921static int
e14deea0 922netdev_dpdk_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
91088554 923 int *c)
8a9562d2
PS
924{
925 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq_);
926 struct netdev *netdev = rx->up.netdev;
927 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 928 int nb_rx;
8a9562d2 929
930 /* There is only one tx queue for this core. Do not flush other
 931 * queues. */
932 if (rxq_->queue_id == rte_lcore_id()) {
933 dpdk_queue_flush(dev, rxq_->queue_id);
934 }
935
936 nb_rx = rte_eth_rx_burst(rx->port_id, rxq_->queue_id,
7d08d53e 937 (struct rte_mbuf **) packets,
cd159f1a 938 NETDEV_MAX_BURST);
8a9562d2
PS
939 if (!nb_rx) {
940 return EAGAIN;
941 }
942
8a9562d2
PS
943 *c = nb_rx;
944
945 return 0;
946}
947
58397e6c
KT
948static void
949__netdev_dpdk_vhost_send(struct netdev *netdev, struct dp_packet **pkts,
950 int cnt, bool may_steal)
951{
952 struct netdev_dpdk *vhost_dev = netdev_dpdk_cast(netdev);
953 struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(vhost_dev);
95e9881f
KT
954 struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
955 unsigned int total_pkts = cnt;
956 uint64_t start = 0;
957
958 if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) {
45d947c4 959 rte_spinlock_lock(&vhost_dev->stats_lock);
1b99bb05 960 vhost_dev->stats.tx_dropped+= cnt;
45d947c4 961 rte_spinlock_unlock(&vhost_dev->stats_lock);
1b99bb05 962 goto out;
963 }
964
 965 /* There is a single vHost TX queue, so we need to lock it for TX. */
a0cb2d66 966 rte_spinlock_lock(&vhost_dev->vhost_tx_lock);
58397e6c 967
968 do {
969 unsigned int tx_pkts;
970
971 tx_pkts = rte_vhost_enqueue_burst(virtio_dev, VIRTIO_RXQ,
972 cur_pkts, cnt);
973 if (OVS_LIKELY(tx_pkts)) {
974 /* Packets have been sent.*/
975 cnt -= tx_pkts;
976 /* Prepare for possible next iteration.*/
977 cur_pkts = &cur_pkts[tx_pkts];
978 } else {
979 uint64_t timeout = VHOST_ENQ_RETRY_USECS * rte_get_timer_hz() / 1E6;
980 unsigned int expired = 0;
981
982 if (!start) {
983 start = rte_get_timer_cycles();
984 }
985
986 /*
987 * Unable to enqueue packets to vhost interface.
988 * Check available entries before retrying.
989 */
990 while (!rte_vring_available_entries(virtio_dev, VIRTIO_RXQ)) {
991 if (OVS_UNLIKELY((rte_get_timer_cycles() - start) > timeout)) {
992 expired = 1;
993 break;
994 }
995 }
996 if (expired) {
997 /* break out of main loop. */
998 break;
999 }
1000 }
1001 } while (cnt);
a0cb2d66 1002 rte_spinlock_unlock(&vhost_dev->vhost_tx_lock);
95e9881f 1003
45d947c4 1004 rte_spinlock_lock(&vhost_dev->stats_lock);
95e9881f
KT
1005 vhost_dev->stats.tx_packets += (total_pkts - cnt);
1006 vhost_dev->stats.tx_dropped += cnt;
45d947c4 1007 rte_spinlock_unlock(&vhost_dev->stats_lock);
58397e6c
KT
1008
1009out:
1010 if (may_steal) {
95e9881f
KT
1011 int i;
1012
1013 for (i = 0; i < total_pkts; i++) {
1b99bb05
MG
1014 dp_packet_delete(pkts[i]);
1015 }
58397e6c
KT
1016 }
1017}
1018
8a9562d2 1019inline static void
f4fd623c
DDP
1020dpdk_queue_pkts(struct netdev_dpdk *dev, int qid,
1021 struct rte_mbuf **pkts, int cnt)
8a9562d2
PS
1022{
1023 struct dpdk_tx_queue *txq = &dev->tx_q[qid];
1024 uint64_t diff_tsc;
8a9562d2 1025
f4fd623c
DDP
1026 int i = 0;
1027
f4fd623c
DDP
1028 while (i < cnt) {
1029 int freeslots = MAX_TX_QUEUE_LEN - txq->count;
1030 int tocopy = MIN(freeslots, cnt-i);
8a9562d2 1031
f4fd623c
DDP
1032 memcpy(&txq->burst_pkts[txq->count], &pkts[i],
1033 tocopy * sizeof (struct rte_mbuf *));
1034
1035 txq->count += tocopy;
1036 i += tocopy;
1037
94143fc4 1038 if (txq->count == MAX_TX_QUEUE_LEN || txq->flush_tx) {
b170db2a 1039 dpdk_queue_flush__(dev, qid);
f4fd623c 1040 }
844f2d74 1041 diff_tsc = rte_get_timer_cycles() - txq->tsc;
f4fd623c 1042 if (diff_tsc >= DRAIN_TSC) {
b170db2a 1043 dpdk_queue_flush__(dev, qid);
f4fd623c 1044 }
8a9562d2 1045 }
8a9562d2
PS
1046}
1047
1048/* Tx function. Transmit packets indefinitely */
1049static void
58397e6c 1050dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
2654cc33 1051 int cnt)
db73f716 1052 OVS_NO_THREAD_SAFETY_ANALYSIS
8a9562d2 1053{
bce01e3a
EJ
1054#if !defined(__CHECKER__) && !defined(_WIN32)
1055 const size_t PKT_ARRAY_SIZE = cnt;
1056#else
1057 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 1058 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
bce01e3a 1059#endif
8a9562d2 1060 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
bce01e3a 1061 struct rte_mbuf *mbufs[PKT_ARRAY_SIZE];
175cf4de
RW
1062 int dropped = 0;
1063 int newcnt = 0;
1064 int i;
8a9562d2 1065
db73f716
DDP
1066 /* If we are on a non pmd thread we have to use the mempool mutex, because
1067 * every non pmd thread shares the same mempool cache */
1068
1069 if (!thread_is_pmd()) {
1070 ovs_mutex_lock(&nonpmd_mempool_mutex);
1071 }
1072
f4fd623c 1073 for (i = 0; i < cnt; i++) {
cf62fa4c 1074 int size = dp_packet_size(pkts[i]);
95fb793a 1075
f98d7864 1076 if (OVS_UNLIKELY(size > dev->max_packet_len)) {
f4fd623c
DDP
1077 VLOG_WARN_RL(&rl, "Too big size %d max_packet_len %d",
1078 (int)size , dev->max_packet_len);
1079
175cf4de 1080 dropped++;
f4fd623c
DDP
1081 continue;
1082 }
8a9562d2 1083
f4fd623c 1084 mbufs[newcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
8a9562d2 1085
f4fd623c 1086 if (!mbufs[newcnt]) {
175cf4de
RW
1087 dropped += cnt - i;
1088 break;
f4fd623c
DDP
1089 }
1090
1091 /* We have to do a copy for now */
b8e57534 1092 memcpy(rte_pktmbuf_mtod(mbufs[newcnt], void *), dp_packet_data(pkts[i]), size);
f4fd623c
DDP
1093
1094 rte_pktmbuf_data_len(mbufs[newcnt]) = size;
1095 rte_pktmbuf_pkt_len(mbufs[newcnt]) = size;
1096
1097 newcnt++;
1098 }
8a9562d2 1099
f98d7864 1100 if (OVS_UNLIKELY(dropped)) {
45d947c4 1101 rte_spinlock_lock(&dev->stats_lock);
175cf4de 1102 dev->stats.tx_dropped += dropped;
45d947c4 1103 rte_spinlock_unlock(&dev->stats_lock);
175cf4de
RW
1104 }
1105
58397e6c
KT
1106 if (dev->type == DPDK_DEV_VHOST) {
1107 __netdev_dpdk_vhost_send(netdev, (struct dp_packet **) mbufs, newcnt, true);
1108 } else {
1109 dpdk_queue_pkts(dev, qid, mbufs, newcnt);
1110 dpdk_queue_flush(dev, qid);
1111 }
db73f716
DDP
1112
1113 if (!thread_is_pmd()) {
1114 ovs_mutex_unlock(&nonpmd_mempool_mutex);
1115 }
8a9562d2
PS
1116}
1117
58397e6c
KT
1118static int
1119netdev_dpdk_vhost_send(struct netdev *netdev, int qid OVS_UNUSED, struct dp_packet **pkts,
1120 int cnt, bool may_steal)
1121{
1122 if (OVS_UNLIKELY(pkts[0]->source != DPBUF_DPDK)) {
1123 int i;
1124
1125 dpdk_do_tx_copy(netdev, qid, pkts, cnt);
1126 if (may_steal) {
1127 for (i = 0; i < cnt; i++) {
1128 dp_packet_delete(pkts[i]);
1129 }
1130 }
1131 } else {
1132 __netdev_dpdk_vhost_send(netdev, pkts, cnt, may_steal);
1133 }
1134 return 0;
1135}
1136
7251515e
DV
1137static inline void
1138netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
e14deea0 1139 struct dp_packet **pkts, int cnt, bool may_steal)
8a9562d2 1140{
f4fd623c 1141 int i;
8a9562d2 1142
1143 if (OVS_UNLIKELY(dev->txq_needs_locking)) {
1144 qid = qid % dev->real_n_txq;
1145 rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
1146 }
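    /* Illustrative example (not in the original source): with 13 pmd threads
     * (up.n_txq == 13) but only 4 hardware tx queues (real_n_txq == 4), a
     * packet handed in on qid 9 is folded onto hardware queue 9 % 4 == 1 and
     * serialised by that queue's tx_lock. */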
1147
7251515e 1148 if (OVS_UNLIKELY(!may_steal ||
cf62fa4c 1149 pkts[0]->source != DPBUF_DPDK)) {
1150 struct netdev *netdev = &dev->up;
1151
2654cc33 1152 dpdk_do_tx_copy(netdev, qid, pkts, cnt);
b3cd9f9d
PS
1153
1154 if (may_steal) {
f4fd623c 1155 for (i = 0; i < cnt; i++) {
e14deea0 1156 dp_packet_delete(pkts[i]);
f4fd623c 1157 }
b3cd9f9d 1158 }
8a9562d2 1159 } else {
f4fd623c
DDP
1160 int next_tx_idx = 0;
1161 int dropped = 0;
8a9562d2 1162
f4fd623c 1163 for (i = 0; i < cnt; i++) {
cf62fa4c 1164 int size = dp_packet_size(pkts[i]);
1b99bb05 1165
f4fd623c
DDP
1166 if (OVS_UNLIKELY(size > dev->max_packet_len)) {
1167 if (next_tx_idx != i) {
1168 dpdk_queue_pkts(dev, qid,
1169 (struct rte_mbuf **)&pkts[next_tx_idx],
1170 i-next_tx_idx);
1ebfe1ac 1171 }
f4fd623c 1172
1ebfe1ac
DDP
1173 VLOG_WARN_RL(&rl, "Too big size %d max_packet_len %d",
1174 (int)size , dev->max_packet_len);
f4fd623c 1175
e14deea0 1176 dp_packet_delete(pkts[i]);
1ebfe1ac 1177 dropped++;
f4fd623c
DDP
1178 next_tx_idx = i + 1;
1179 }
1180 }
1181 if (next_tx_idx != cnt) {
1182 dpdk_queue_pkts(dev, qid,
1183 (struct rte_mbuf **)&pkts[next_tx_idx],
1184 cnt-next_tx_idx);
1185 }
8a9562d2 1186
f4fd623c 1187 if (OVS_UNLIKELY(dropped)) {
45d947c4 1188 rte_spinlock_lock(&dev->stats_lock);
f4fd623c 1189 dev->stats.tx_dropped += dropped;
45d947c4 1190 rte_spinlock_unlock(&dev->stats_lock);
f4fd623c 1191 }
8a9562d2 1192 }
a0cb2d66
DDP
1193
1194 if (OVS_UNLIKELY(dev->txq_needs_locking)) {
1195 rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
1196 }
7251515e
DV
1197}
1198
1199static int
1200netdev_dpdk_eth_send(struct netdev *netdev, int qid,
e14deea0 1201 struct dp_packet **pkts, int cnt, bool may_steal)
7251515e
DV
1202{
1203 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 1204
7251515e
DV
1205 netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
1206 return 0;
8a9562d2
PS
1207}
1208
1209static int
1210netdev_dpdk_set_etheraddr(struct netdev *netdev,
1211 const uint8_t mac[ETH_ADDR_LEN])
1212{
1213 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1214
1215 ovs_mutex_lock(&dev->mutex);
1216 if (!eth_addr_equals(dev->hwaddr, mac)) {
1217 memcpy(dev->hwaddr, mac, ETH_ADDR_LEN);
045c0d1a 1218 netdev_change_seq_changed(netdev);
8a9562d2
PS
1219 }
1220 ovs_mutex_unlock(&dev->mutex);
1221
1222 return 0;
1223}
1224
1225static int
1226netdev_dpdk_get_etheraddr(const struct netdev *netdev,
1227 uint8_t mac[ETH_ADDR_LEN])
1228{
1229 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1230
1231 ovs_mutex_lock(&dev->mutex);
1232 memcpy(mac, dev->hwaddr, ETH_ADDR_LEN);
1233 ovs_mutex_unlock(&dev->mutex);
1234
1235 return 0;
1236}
1237
1238static int
1239netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
1240{
1241 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1242
1243 ovs_mutex_lock(&dev->mutex);
1244 *mtup = dev->mtu;
1245 ovs_mutex_unlock(&dev->mutex);
1246
1247 return 0;
1248}
1249
1250static int
1251netdev_dpdk_set_mtu(const struct netdev *netdev, int mtu)
1252{
1253 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1254 int old_mtu, err;
1255 struct dpdk_mp *old_mp;
1256 struct dpdk_mp *mp;
1257
1258 ovs_mutex_lock(&dpdk_mutex);
1259 ovs_mutex_lock(&dev->mutex);
1260 if (dev->mtu == mtu) {
1261 err = 0;
1262 goto out;
1263 }
1264
1265 mp = dpdk_mp_get(dev->socket_id, dev->mtu);
1266 if (!mp) {
1267 err = ENOMEM;
1268 goto out;
1269 }
1270
1271 rte_eth_dev_stop(dev->port_id);
1272
1273 old_mtu = dev->mtu;
1274 old_mp = dev->dpdk_mp;
1275 dev->dpdk_mp = mp;
1276 dev->mtu = mtu;
1277 dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
1278
1279 err = dpdk_eth_dev_init(dev);
1280 if (err) {
8a9562d2
PS
1281 dpdk_mp_put(mp);
1282 dev->mtu = old_mtu;
1283 dev->dpdk_mp = old_mp;
1284 dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
1285 dpdk_eth_dev_init(dev);
1286 goto out;
1287 }
1288
1289 dpdk_mp_put(old_mp);
045c0d1a 1290 netdev_change_seq_changed(netdev);
8a9562d2
PS
1291out:
1292 ovs_mutex_unlock(&dev->mutex);
1293 ovs_mutex_unlock(&dpdk_mutex);
1294 return err;
1295}
1296
1297static int
1298netdev_dpdk_get_carrier(const struct netdev *netdev_, bool *carrier);
1299
58397e6c
KT
1300static int
1301netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
1302 struct netdev_stats *stats)
1303{
1304 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1305
1306 ovs_mutex_lock(&dev->mutex);
1307 memset(stats, 0, sizeof(*stats));
1308 /* Unsupported Stats */
1309 stats->rx_errors = UINT64_MAX;
1310 stats->tx_errors = UINT64_MAX;
1311 stats->multicast = UINT64_MAX;
1312 stats->collisions = UINT64_MAX;
1313 stats->rx_crc_errors = UINT64_MAX;
1314 stats->rx_fifo_errors = UINT64_MAX;
1315 stats->rx_frame_errors = UINT64_MAX;
1316 stats->rx_length_errors = UINT64_MAX;
1317 stats->rx_missed_errors = UINT64_MAX;
1318 stats->rx_over_errors = UINT64_MAX;
1319 stats->tx_aborted_errors = UINT64_MAX;
1320 stats->tx_carrier_errors = UINT64_MAX;
1321 stats->tx_errors = UINT64_MAX;
1322 stats->tx_fifo_errors = UINT64_MAX;
1323 stats->tx_heartbeat_errors = UINT64_MAX;
1324 stats->tx_window_errors = UINT64_MAX;
1325 stats->rx_bytes += UINT64_MAX;
1326 stats->rx_dropped += UINT64_MAX;
1327 stats->tx_bytes += UINT64_MAX;
1328
45d947c4 1329 rte_spinlock_lock(&dev->stats_lock);
58397e6c
KT
1330 /* Supported Stats */
1331 stats->rx_packets += dev->stats.rx_packets;
1332 stats->tx_packets += dev->stats.tx_packets;
1333 stats->tx_dropped += dev->stats.tx_dropped;
45d947c4 1334 rte_spinlock_unlock(&dev->stats_lock);
58397e6c
KT
1335 ovs_mutex_unlock(&dev->mutex);
1336
1337 return 0;
1338}
1339
8a9562d2
PS
1340static int
1341netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
1342{
1343 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1344 struct rte_eth_stats rte_stats;
1345 bool gg;
1346
1347 netdev_dpdk_get_carrier(netdev, &gg);
1348 ovs_mutex_lock(&dev->mutex);
1349 rte_eth_stats_get(dev->port_id, &rte_stats);
1350
2f9dd77f 1351 memset(stats, 0, sizeof(*stats));
8a9562d2 1352
2f9dd77f
PS
1353 stats->rx_packets = rte_stats.ipackets;
1354 stats->tx_packets = rte_stats.opackets;
1355 stats->rx_bytes = rte_stats.ibytes;
1356 stats->tx_bytes = rte_stats.obytes;
1357 stats->rx_errors = rte_stats.ierrors;
1358 stats->tx_errors = rte_stats.oerrors;
1359 stats->multicast = rte_stats.imcasts;
8a9562d2 1360
45d947c4 1361 rte_spinlock_lock(&dev->stats_lock);
2f9dd77f 1362 stats->tx_dropped = dev->stats.tx_dropped;
45d947c4 1363 rte_spinlock_unlock(&dev->stats_lock);
8a9562d2
PS
1364 ovs_mutex_unlock(&dev->mutex);
1365
1366 return 0;
1367}
1368
1369static int
1370netdev_dpdk_get_features(const struct netdev *netdev_,
1371 enum netdev_features *current,
1372 enum netdev_features *advertised OVS_UNUSED,
1373 enum netdev_features *supported OVS_UNUSED,
1374 enum netdev_features *peer OVS_UNUSED)
1375{
1376 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
1377 struct rte_eth_link link;
1378
1379 ovs_mutex_lock(&dev->mutex);
1380 link = dev->link;
1381 ovs_mutex_unlock(&dev->mutex);
1382
1383 if (link.link_duplex == ETH_LINK_AUTONEG_DUPLEX) {
1384 if (link.link_speed == ETH_LINK_SPEED_AUTONEG) {
1385 *current = NETDEV_F_AUTONEG;
1386 }
1387 } else if (link.link_duplex == ETH_LINK_HALF_DUPLEX) {
1388 if (link.link_speed == ETH_LINK_SPEED_10) {
1389 *current = NETDEV_F_10MB_HD;
1390 }
1391 if (link.link_speed == ETH_LINK_SPEED_100) {
1392 *current = NETDEV_F_100MB_HD;
1393 }
1394 if (link.link_speed == ETH_LINK_SPEED_1000) {
1395 *current = NETDEV_F_1GB_HD;
1396 }
1397 } else if (link.link_duplex == ETH_LINK_FULL_DUPLEX) {
1398 if (link.link_speed == ETH_LINK_SPEED_10) {
1399 *current = NETDEV_F_10MB_FD;
1400 }
1401 if (link.link_speed == ETH_LINK_SPEED_100) {
1402 *current = NETDEV_F_100MB_FD;
1403 }
1404 if (link.link_speed == ETH_LINK_SPEED_1000) {
1405 *current = NETDEV_F_1GB_FD;
1406 }
1407 if (link.link_speed == ETH_LINK_SPEED_10000) {
1408 *current = NETDEV_F_10GB_FD;
1409 }
1410 }
1411
1412 return 0;
1413}
1414
1415static int
1416netdev_dpdk_get_ifindex(const struct netdev *netdev)
1417{
1418 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1419 int ifindex;
1420
1421 ovs_mutex_lock(&dev->mutex);
1422 ifindex = dev->port_id;
1423 ovs_mutex_unlock(&dev->mutex);
1424
1425 return ifindex;
1426}
1427
1428static int
1429netdev_dpdk_get_carrier(const struct netdev *netdev_, bool *carrier)
1430{
1431 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
1432
1433 ovs_mutex_lock(&dev->mutex);
1434 check_link_status(dev);
1435 *carrier = dev->link.link_status;
58397e6c
KT
1436
1437 ovs_mutex_unlock(&dev->mutex);
1438
1439 return 0;
1440}
1441
1442static int
1443netdev_dpdk_vhost_get_carrier(const struct netdev *netdev_, bool *carrier)
1444{
1445 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
1446 struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
1447
1448 ovs_mutex_lock(&dev->mutex);
1449
1450 if (is_vhost_running(virtio_dev)) {
1451 *carrier = 1;
1452 } else {
1453 *carrier = 0;
1454 }
1455
8a9562d2
PS
1456 ovs_mutex_unlock(&dev->mutex);
1457
1458 return 0;
1459}
1460
1461static long long int
1462netdev_dpdk_get_carrier_resets(const struct netdev *netdev_)
1463{
1464 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
1465 long long int carrier_resets;
1466
1467 ovs_mutex_lock(&dev->mutex);
1468 carrier_resets = dev->link_reset_cnt;
1469 ovs_mutex_unlock(&dev->mutex);
1470
1471 return carrier_resets;
1472}
1473
1474static int
1475netdev_dpdk_set_miimon(struct netdev *netdev_ OVS_UNUSED,
1476 long long int interval OVS_UNUSED)
1477{
ee32150e 1478 return EOPNOTSUPP;
8a9562d2
PS
1479}
1480
1481static int
1482netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
1483 enum netdev_flags off, enum netdev_flags on,
95fb793a 1484 enum netdev_flags *old_flagsp) OVS_REQUIRES(dev->mutex)
8a9562d2
PS
1485{
1486 int err;
1487
1488 if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
1489 return EINVAL;
1490 }
1491
1492 *old_flagsp = dev->flags;
1493 dev->flags |= on;
1494 dev->flags &= ~off;
1495
1496 if (dev->flags == *old_flagsp) {
1497 return 0;
1498 }
1499
58397e6c
KT
1500 if (dev->type == DPDK_DEV_ETH) {
1501 if (dev->flags & NETDEV_UP) {
1502 err = rte_eth_dev_start(dev->port_id);
1503 if (err)
1504 return -err;
1505 }
8a9562d2 1506
58397e6c
KT
1507 if (dev->flags & NETDEV_PROMISC) {
1508 rte_eth_promiscuous_enable(dev->port_id);
1509 }
8a9562d2 1510
58397e6c
KT
1511 if (!(dev->flags & NETDEV_UP)) {
1512 rte_eth_dev_stop(dev->port_id);
1513 }
8a9562d2
PS
1514 }
1515
1516 return 0;
1517}
1518
1519static int
1520netdev_dpdk_update_flags(struct netdev *netdev_,
1521 enum netdev_flags off, enum netdev_flags on,
1522 enum netdev_flags *old_flagsp)
1523{
1524 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
1525 int error;
1526
1527 ovs_mutex_lock(&netdev->mutex);
1528 error = netdev_dpdk_update_flags__(netdev, off, on, old_flagsp);
1529 ovs_mutex_unlock(&netdev->mutex);
1530
1531 return error;
1532}
1533
1534static int
1535netdev_dpdk_get_status(const struct netdev *netdev_, struct smap *args)
1536{
1537 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
1538 struct rte_eth_dev_info dev_info;
1539
e0a801c7 1540 if (dev->port_id < 0)
8a9562d2
PS
1541 return ENODEV;
1542
1543 ovs_mutex_lock(&dev->mutex);
1544 rte_eth_dev_info_get(dev->port_id, &dev_info);
1545 ovs_mutex_unlock(&dev->mutex);
1546
1547 smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
1548
95fb793a 1549 smap_add_format(args, "port_no", "%d", dev->port_id);
8a9562d2
PS
1550 smap_add_format(args, "numa_id", "%d", rte_eth_dev_socket_id(dev->port_id));
1551 smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
1552 smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
1553 smap_add_format(args, "max_rx_pktlen", "%u", dev_info.max_rx_pktlen);
1554 smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
1555 smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
1556 smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
1557 smap_add_format(args, "max_hash_mac_addrs", "%u", dev_info.max_hash_mac_addrs);
1558 smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
1559 smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);
1560
 1561 smap_add_format(args, "pci-vendor_id", "0x%x", dev_info.pci_dev->id.vendor_id);
1562 smap_add_format(args, "pci-device_id", "0x%x", dev_info.pci_dev->id.device_id);
1563
1564 return 0;
1565}
1566
1567static void
1568netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
1569 OVS_REQUIRES(dev->mutex)
1570{
1571 enum netdev_flags old_flags;
1572
1573 if (admin_state) {
1574 netdev_dpdk_update_flags__(dev, 0, NETDEV_UP, &old_flags);
1575 } else {
1576 netdev_dpdk_update_flags__(dev, NETDEV_UP, 0, &old_flags);
1577 }
1578}
1579
1580static void
1581netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
1582 const char *argv[], void *aux OVS_UNUSED)
1583{
1584 bool up;
1585
1586 if (!strcasecmp(argv[argc - 1], "up")) {
1587 up = true;
1588 } else if ( !strcasecmp(argv[argc - 1], "down")) {
1589 up = false;
1590 } else {
1591 unixctl_command_reply_error(conn, "Invalid Admin State");
1592 return;
1593 }
1594
1595 if (argc > 2) {
1596 struct netdev *netdev = netdev_from_name(argv[1]);
1597 if (netdev && is_dpdk_class(netdev->netdev_class)) {
1598 struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);
1599
1600 ovs_mutex_lock(&dpdk_dev->mutex);
1601 netdev_dpdk_set_admin_state__(dpdk_dev, up);
1602 ovs_mutex_unlock(&dpdk_dev->mutex);
1603
1604 netdev_close(netdev);
1605 } else {
1606 unixctl_command_reply_error(conn, "Not a DPDK Interface");
1607 netdev_close(netdev);
1608 return;
1609 }
1610 } else {
1611 struct netdev_dpdk *netdev;
1612
1613 ovs_mutex_lock(&dpdk_mutex);
1614 LIST_FOR_EACH (netdev, list_node, &dpdk_list) {
1615 ovs_mutex_lock(&netdev->mutex);
1616 netdev_dpdk_set_admin_state__(netdev, up);
1617 ovs_mutex_unlock(&netdev->mutex);
1618 }
1619 ovs_mutex_unlock(&dpdk_mutex);
1620 }
1621 unixctl_command_reply(conn, "OK");
1622}
1623
58397e6c
KT
1624/*
1625 * Set virtqueue flags so that we do not receive interrupts.
1626 */
1627static void
1628set_irq_status(struct virtio_net *dev)
1629{
1630 dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
1631 dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
1632}
1633
1634/*
1635 * A new virtio-net device is added to a vhost port.
1636 */
1637static int
1638new_device(struct virtio_net *dev)
1639{
1640 struct netdev_dpdk *netdev;
1641 bool exists = false;
1642
1643 ovs_mutex_lock(&dpdk_mutex);
1644 /* Add device to the vhost port with the same name as that passed down. */
1645 LIST_FOR_EACH(netdev, list_node, &dpdk_list) {
7d1ced01 1646 if (strncmp(dev->ifname, netdev->vhost_id, IF_NAME_SZ) == 0) {
58397e6c
KT
1647 ovs_mutex_lock(&netdev->mutex);
1648 ovsrcu_set(&netdev->virtio_dev, dev);
1649 ovs_mutex_unlock(&netdev->mutex);
1650 exists = true;
1651 dev->flags |= VIRTIO_DEV_RUNNING;
1652 /* Disable notifications. */
1653 set_irq_status(dev);
1654 break;
1655 }
1656 }
1657 ovs_mutex_unlock(&dpdk_mutex);
1658
1659 if (!exists) {
1660 VLOG_INFO("vHost Device '%s' (%ld) can't be added - name not found",
1661 dev->ifname, dev->device_fh);
1662
1663 return -1;
1664 }
1665
1666 VLOG_INFO("vHost Device '%s' (%ld) has been added",
1667 dev->ifname, dev->device_fh);
1668 return 0;
1669}
1670
1671/*
1672 * Remove a virtio-net device from the specific vhost port. Use dev->remove
1673 * flag to stop any more packets from being sent or received to/from a VM and
1674 * ensure all currently queued packets have been sent/received before removing
1675 * the device.
1676 */
1677static void
1678destroy_device(volatile struct virtio_net *dev)
1679{
1680 struct netdev_dpdk *vhost_dev;
1681
1682 ovs_mutex_lock(&dpdk_mutex);
1683 LIST_FOR_EACH (vhost_dev, list_node, &dpdk_list) {
1684 if (netdev_dpdk_get_virtio(vhost_dev) == dev) {
1685
1686 ovs_mutex_lock(&vhost_dev->mutex);
1687 dev->flags &= ~VIRTIO_DEV_RUNNING;
1688 ovsrcu_set(&vhost_dev->virtio_dev, NULL);
1689 ovs_mutex_unlock(&vhost_dev->mutex);
1690
1691 /*
1692 * Wait for other threads to quiesce before
1693 * setting the virtio_dev to NULL.
1694 */
1695 ovsrcu_synchronize();
618f44f7
KT
1696 /*
1697 * As call to ovsrcu_synchronize() will end the quiescent state,
1698 * put thread back into quiescent state before returning.
1699 */
1700 ovsrcu_quiesce_start();
58397e6c
KT
1701 }
1702 }
1703 ovs_mutex_unlock(&dpdk_mutex);
1704
1705 VLOG_INFO("vHost Device '%s' (%ld) has been removed",
1706 dev->ifname, dev->device_fh);
1707}
1708
1709struct virtio_net *
1710netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
1711{
1712 return ovsrcu_get(struct virtio_net *, &dev->virtio_dev);
1713}
1714
1715/*
1716 * These callbacks allow virtio-net devices to be added to vhost ports when
1717 * configuration has been fully complete.
1718 */
bce01e3a 1719static const struct virtio_net_device_ops virtio_net_device_ops =
58397e6c
KT
1720{
1721 .new_device = new_device,
1722 .destroy_device = destroy_device,
1723};
1724
1725static void *
7d1ced01 1726start_vhost_loop(void *dummy OVS_UNUSED)
58397e6c
KT
1727{
1728 pthread_detach(pthread_self());
618f44f7
KT
1729 /* Put the cuse thread into quiescent state. */
1730 ovsrcu_quiesce_start();
58397e6c
KT
1731 rte_vhost_driver_session_start();
1732 return NULL;
1733}
1734
1735static int
1736dpdk_vhost_class_init(void)
7d1ced01
CL
1737{
1738 rte_vhost_driver_callback_register(&virtio_net_device_ops);
1739 ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
1740 return 0;
1741}
1742
1743static int
1744dpdk_vhost_cuse_class_init(void)
58397e6c 1745{
58397e6c
KT
1746 int err = -1;
1747
58397e6c
KT
1748
1749 /* Register CUSE device to handle IOCTLs.
1750 * Unless otherwise specified on the vswitchd command line, cuse_dev_name
1751 * is set to vhost-net.
1752 */
1753 err = rte_vhost_driver_register(cuse_dev_name);
1754
1755 if (err != 0) {
1756 VLOG_ERR("CUSE device setup failure.");
1757 return -1;
1758 }
1759
7d1ced01
CL
1760 dpdk_vhost_class_init();
1761 return 0;
1762}
1763
1764static int
1765dpdk_vhost_user_class_init(void)
1766{
1767 dpdk_vhost_class_init();
618f44f7 1768 return 0;
58397e6c
KT
1769}
1770
033e9df2
DDP
1771static void
1772dpdk_common_init(void)
1773{
1774 unixctl_command_register("netdev-dpdk/set-admin-state",
1775 "[netdev] up|down", 1, 2,
1776 netdev_dpdk_set_admin_state, NULL);
1777
1778 ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
1779}
1780
95fb793a 1781/* Client Rings */
1782
95fb793a 1783static int
1784dpdk_ring_create(const char dev_name[], unsigned int port_no,
1785 unsigned int *eth_port_id)
1786{
1787 struct dpdk_ring *ivshmem;
1788 char ring_name[10];
1789 int err;
1790
1791 ivshmem = dpdk_rte_mzalloc(sizeof *ivshmem);
1792 if (ivshmem == NULL) {
1793 return ENOMEM;
1794 }
1795
 7251515e 1796 /* XXX: Add support for multiqueue ring. */
95fb793a 1797 err = snprintf(ring_name, 10, "%s_tx", dev_name);
1798 if (err < 0) {
1799 return -err;
1800 }
1801
1802 /* Create single consumer/producer rings, netdev does explicit locking. */
1803 ivshmem->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
1804 RING_F_SP_ENQ | RING_F_SC_DEQ);
95fb793a 1805 if (ivshmem->cring_tx == NULL) {
1806 rte_free(ivshmem);
1807 return ENOMEM;
1808 }
1809
1810 err = snprintf(ring_name, 10, "%s_rx", dev_name);
1811 if (err < 0) {
1812 return -err;
1813 }
1814
7251515e
DV
1815 /* Create single consumer/producer rings, netdev does explicit locking. */
1816 ivshmem->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
1817 RING_F_SP_ENQ | RING_F_SC_DEQ);
95fb793a 1818 if (ivshmem->cring_rx == NULL) {
1819 rte_free(ivshmem);
1820 return ENOMEM;
1821 }
1822
d7310583
DDP
1823 err = rte_eth_from_rings(dev_name, &ivshmem->cring_rx, 1,
1824 &ivshmem->cring_tx, 1, SOCKET0);
1825
95fb793a 1826 if (err < 0) {
1827 rte_free(ivshmem);
1828 return ENODEV;
1829 }
1830
1831 ivshmem->user_port_id = port_no;
1832 ivshmem->eth_port_id = rte_eth_dev_count() - 1;
1833 list_push_back(&dpdk_ring_list, &ivshmem->list_node);
1834
1835 *eth_port_id = ivshmem->eth_port_id;
1836 return 0;
1837}
1838
1839static int
1840dpdk_ring_open(const char dev_name[], unsigned int *eth_port_id) OVS_REQUIRES(dpdk_mutex)
1841{
1842 struct dpdk_ring *ivshmem;
1843 unsigned int port_no;
1844 int err = 0;
1845
1846 /* Names always start with "dpdkr" */
1847 err = dpdk_dev_parse_name(dev_name, "dpdkr", &port_no);
1848 if (err) {
1849 return err;
1850 }
1851
1852 /* look through our list to find the device */
1853 LIST_FOR_EACH (ivshmem, list_node, &dpdk_ring_list) {
1854 if (ivshmem->user_port_id == port_no) {
58397e6c 1855 VLOG_INFO("Found dpdk ring device %s:", dev_name);
95fb793a 1856 *eth_port_id = ivshmem->eth_port_id; /* really all that is needed */
1857 return 0;
1858 }
1859 }
1860 /* Need to create the device rings */
1861 return dpdk_ring_create(dev_name, port_no, eth_port_id);
1862}
1863
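/* Example (illustrative only): opening a port named "dpdkr0" parses user port
 * number 0, and dpdk_ring_create() backs it with the single-producer /
 * single-consumer rings "dpdkr0_tx" and "dpdkr0_rx", plus an eth device built
 * over them by rte_eth_from_rings(). */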
7251515e 1864static int
a0cb2d66 1865netdev_dpdk_ring_send(struct netdev *netdev_, int qid,
e14deea0 1866 struct dp_packet **pkts, int cnt, bool may_steal)
7251515e 1867{
a0cb2d66 1868 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
1869 unsigned i;
1870
1871 /* When using 'dpdkr' and sending to a DPDK ring, we want to ensure that the
1872 * rss hash field is clear. This is because the same mbuf may be modified by
 1873 * the consumer of the ring and returned to the datapath without recalculating
1874 * the RSS hash. */
1875 for (i = 0; i < cnt; i++) {
1876 dp_packet_set_rss_hash(pkts[i], 0);
1877 }
7251515e 1878
a0cb2d66 1879 netdev_dpdk_send__(netdev, qid, pkts, cnt, may_steal);
1880 return 0;
1881}
1882
95fb793a 1883static int
1884netdev_dpdk_ring_construct(struct netdev *netdev)
1885{
1886 unsigned int port_no = 0;
1887 int err = 0;
1888
1889 if (rte_eal_init_ret) {
1890 return rte_eal_init_ret;
1891 }
1892
1893 ovs_mutex_lock(&dpdk_mutex);
1894
1895 err = dpdk_ring_open(netdev->name, &port_no);
1896 if (err) {
1897 goto unlock_dpdk;
1898 }
1899
58397e6c 1900 err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_ETH);
95fb793a 1901
1902unlock_dpdk:
1903 ovs_mutex_unlock(&dpdk_mutex);
1904 return err;
1905}
1906
58397e6c
KT
1907#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT, MULTIQ, SEND, \
1908 GET_CARRIER, GET_STATS, GET_FEATURES, GET_STATUS, RXQ_RECV) \
95fb793a 1909{ \
1910 NAME, \
1911 INIT, /* init */ \
1912 NULL, /* netdev_dpdk_run */ \
1913 NULL, /* netdev_dpdk_wait */ \
1914 \
1915 netdev_dpdk_alloc, \
1916 CONSTRUCT, \
58397e6c 1917 DESTRUCT, \
95fb793a 1918 netdev_dpdk_dealloc, \
1919 netdev_dpdk_get_config, \
1920 NULL, /* netdev_dpdk_set_config */ \
1921 NULL, /* get_tunnel_config */ \
58397e6c
KT
1922 NULL, /* build header */ \
1923 NULL, /* push header */ \
1924 NULL, /* pop header */ \
7dec44fe 1925 netdev_dpdk_get_numa_id, /* get_numa_id */ \
5496878c 1926 MULTIQ, /* set_multiq */ \
95fb793a 1927 \
7251515e 1928 SEND, /* send */ \
95fb793a 1929 NULL, /* send_wait */ \
1930 \
1931 netdev_dpdk_set_etheraddr, \
1932 netdev_dpdk_get_etheraddr, \
1933 netdev_dpdk_get_mtu, \
1934 netdev_dpdk_set_mtu, \
1935 netdev_dpdk_get_ifindex, \
58397e6c 1936 GET_CARRIER, \
95fb793a 1937 netdev_dpdk_get_carrier_resets, \
1938 netdev_dpdk_set_miimon, \
58397e6c
KT
1939 GET_STATS, \
1940 GET_FEATURES, \
95fb793a 1941 NULL, /* set_advertisements */ \
1942 \
1943 NULL, /* set_policing */ \
1944 NULL, /* get_qos_types */ \
1945 NULL, /* get_qos_capabilities */ \
1946 NULL, /* get_qos */ \
1947 NULL, /* set_qos */ \
1948 NULL, /* get_queue */ \
1949 NULL, /* set_queue */ \
1950 NULL, /* delete_queue */ \
1951 NULL, /* get_queue_stats */ \
1952 NULL, /* queue_dump_start */ \
1953 NULL, /* queue_dump_next */ \
1954 NULL, /* queue_dump_done */ \
1955 NULL, /* dump_queue_stats */ \
1956 \
1957 NULL, /* get_in4 */ \
1958 NULL, /* set_in4 */ \
1959 NULL, /* get_in6 */ \
1960 NULL, /* add_router */ \
1961 NULL, /* get_next_hop */ \
58397e6c 1962 GET_STATUS, \
95fb793a 1963 NULL, /* arp_lookup */ \
1964 \
1965 netdev_dpdk_update_flags, \
1966 \
1967 netdev_dpdk_rxq_alloc, \
1968 netdev_dpdk_rxq_construct, \
1969 netdev_dpdk_rxq_destruct, \
1970 netdev_dpdk_rxq_dealloc, \
58397e6c 1971 RXQ_RECV, \
95fb793a 1972 NULL, /* rx_wait */ \
1973 NULL, /* rxq_drain */ \
1974}
8a9562d2 1975
7d1ced01
CL
1976static int
1977process_vhost_flags(char *flag, char *default_val, int size,
1978 char **argv, char **new_val)
1979{
1980 int changed = 0;
1981
1982 /* Depending on which version of vhost is in use, process the vhost-specific
1983 * flag if it is provided on the vswitchd command line, otherwise resort to
1984 * a default value.
1985 *
 1986     * For vhost-user: Process "-vhost_sock_dir" to set the custom location of
 1987     * the vhost-user socket(s).
 1988     * For vhost-cuse: Process "-cuse_dev_name" to set the custom name of the
 1989     * vhost-cuse character device.
1990 */
1991 if (!strcmp(argv[1], flag) && (strlen(argv[2]) <= size)) {
1992 changed = 1;
1993 *new_val = strdup(argv[2]);
1994 VLOG_INFO("User-provided %s in use: %s", flag, *new_val);
1995 } else {
1996 VLOG_INFO("No %s provided - defaulting to %s", flag, default_val);
1997 *new_val = default_val;
1998 }
1999
2000 return changed;
2001}
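
/*
 * Usage sketch (assumed example, not taken from this file): with vhost-user
 * enabled, the flag handled above would typically be passed on the vswitchd
 * command line right after "--dpdk", e.g.:
 *
 *     ovs-vswitchd --dpdk -vhost_sock_dir /some/custom/dir \
 *         -c 0x1 -n 4 --socket-mem 1024 -- unix:$DB_SOCK ...
 *
 * The directory shown is hypothetical; when the flag is omitted the code
 * falls back to the default reported in the log message above.
 */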
2002
8a9562d2
PS
2003int
2004dpdk_init(int argc, char **argv)
2005{
2006 int result;
58397e6c
KT
2007 int base = 0;
 2008    char *program_name = argv[0];
8a9562d2 2009
9441caf3 2010    if (argc < 2 || strcmp(argv[1], "--dpdk")) {
8a9562d2
PS
 2011        return 0;
     }
2012
58397e6c 2013    /* Remove the --dpdk argument from the argument list. */
8a9562d2
PS
2014 argc--;
2015 argv++;
2016
7d1ced01
CL
2017#ifdef VHOST_CUSE
 2018    if (process_vhost_flags("-cuse_dev_name", strdup("vhost-net"),
 2019                            NAME_MAX, argv, &cuse_dev_name)) {
2020#else
 2021    if (process_vhost_flags("-vhost_sock_dir", strdup(ovs_rundir()),
 2022                            PATH_MAX, argv, &vhost_sock_dir)) {
2023 struct stat s;
2024 int err;
58397e6c 2025
7d1ced01
CL
2026 err = stat(vhost_sock_dir, &s);
2027 if (err) {
2028 VLOG_ERR("vHostUser socket DIR '%s' does not exist.",
2029 vhost_sock_dir);
2030 return err;
2031 }
2032#endif
2033 /* Remove the vhost flag configuration parameters from the argument
58397e6c
KT
2034 * list, so that the correct elements are passed to the DPDK
 2035     * initialization function.
2036 */
2037 argc -= 2;
7d1ced01 2038 argv += 2; /* Increment by two to bypass the vhost flag arguments */
58397e6c 2039 base = 2;
58397e6c
KT
2040 }
2041
 2042    /* Keep the program name argument as this is needed for the call to
 2043     * rte_eal_init().
 2044     */
 2045    argv[0] = program_name;
2046
8a9562d2
PS
2047 /* Make sure things are initialized ... */
2048 result = rte_eal_init(argc, argv);
451450fa 2049 if (result < 0) {
58397e6c 2050 ovs_abort(result, "Cannot init EAL");
451450fa 2051 }
8a9562d2 2052
d7310583 2053 rte_memzone_dump(stdout);
8a9562d2
PS
2054 rte_eal_init_ret = 0;
2055
451450fa 2056 if (argc > result) {
9441caf3 2057 argv[result] = argv[0];
451450fa 2058 }
9441caf3 2059
db73f716 2060 /* We are called from the main thread here */
d5c199ea 2061 RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
db73f716 2062
58397e6c 2063 return result + 1 + base;
8a9562d2
PS
2064}
2065
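/*
 * Caller-side sketch (an assumption about how the return value is meant to
 * be consumed, not code from this file): dpdk_init() returns the number of
 * command-line arguments it swallowed, so a caller can skip past the DPDK
 * section before parsing its own options, roughly:
 *
 *     int ret = dpdk_init(argc, argv);
 *     argc -= ret;
 *     argv += ret;
 *
 * A return of 0 means no "--dpdk" section was present and argv is untouched.
 */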
bce01e3a 2066static const struct netdev_class dpdk_class =
95fb793a 2067 NETDEV_DPDK_CLASS(
2068 "dpdk",
b8e57534 2069 NULL,
5496878c 2070 netdev_dpdk_construct,
58397e6c 2071 netdev_dpdk_destruct,
7251515e 2072 netdev_dpdk_set_multiq,
58397e6c
KT
2073 netdev_dpdk_eth_send,
2074 netdev_dpdk_get_carrier,
2075 netdev_dpdk_get_stats,
2076 netdev_dpdk_get_features,
2077 netdev_dpdk_get_status,
2078 netdev_dpdk_rxq_recv);
95fb793a 2079
bce01e3a 2080static const struct netdev_class dpdk_ring_class =
95fb793a 2081 NETDEV_DPDK_CLASS(
2082 "dpdkr",
033e9df2 2083 NULL,
5496878c 2084 netdev_dpdk_ring_construct,
58397e6c 2085 netdev_dpdk_destruct,
a0cb2d66 2086 netdev_dpdk_set_multiq,
58397e6c
KT
2087 netdev_dpdk_ring_send,
2088 netdev_dpdk_get_carrier,
2089 netdev_dpdk_get_stats,
2090 netdev_dpdk_get_features,
2091 netdev_dpdk_get_status,
2092 netdev_dpdk_rxq_recv);
2093
7d1ced01 2094static const struct netdev_class dpdk_vhost_cuse_class =
58397e6c 2095 NETDEV_DPDK_CLASS(
7d1ced01
CL
2096 "dpdkvhostcuse",
2097 dpdk_vhost_cuse_class_init,
2098 netdev_dpdk_vhost_cuse_construct,
2099 netdev_dpdk_vhost_destruct,
2100 netdev_dpdk_vhost_set_multiq,
2101 netdev_dpdk_vhost_send,
2102 netdev_dpdk_vhost_get_carrier,
2103 netdev_dpdk_vhost_get_stats,
2104 NULL,
2105 NULL,
2106 netdev_dpdk_vhost_rxq_recv);
2107
2108const struct netdev_class dpdk_vhost_user_class =
2109 NETDEV_DPDK_CLASS(
2110 "dpdkvhostuser",
2111 dpdk_vhost_user_class_init,
2112 netdev_dpdk_vhost_user_construct,
58397e6c
KT
2113 netdev_dpdk_vhost_destruct,
2114 netdev_dpdk_vhost_set_multiq,
2115 netdev_dpdk_vhost_send,
2116 netdev_dpdk_vhost_get_carrier,
2117 netdev_dpdk_vhost_get_stats,
2118 NULL,
7251515e 2119 NULL,
58397e6c 2120 netdev_dpdk_vhost_rxq_recv);
95fb793a 2121
8a9562d2
PS
2122void
2123netdev_dpdk_register(void)
2124{
95fb793a 2125 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
2126
033e9df2
DDP
2127 if (rte_eal_init_ret) {
2128 return;
2129 }
2130
95fb793a 2131 if (ovsthread_once_start(&once)) {
033e9df2 2132 dpdk_common_init();
95fb793a 2133 netdev_register_provider(&dpdk_class);
2134 netdev_register_provider(&dpdk_ring_class);
7d1ced01
CL
2135#ifdef VHOST_CUSE
2136 netdev_register_provider(&dpdk_vhost_cuse_class);
2137#else
2138 netdev_register_provider(&dpdk_vhost_user_class);
2139#endif
95fb793a 2140 ovsthread_once_done(&once);
2141 }
8a9562d2 2142}
8617afff
PS
2143
2144int
bd5131ba 2145pmd_thread_setaffinity_cpu(unsigned cpu)
8617afff
PS
2146{
2147 cpu_set_t cpuset;
2148 int err;
2149
2150 CPU_ZERO(&cpuset);
2151 CPU_SET(cpu, &cpuset);
2152 err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
2153 if (err) {
 2154        VLOG_ERR("Thread affinity error %d", err);
2155 return err;
2156 }
abb5943d
AW
 2157    /* NON_PMD_CORE_ID is reserved for use by non-PMD threads. */
2158 ovs_assert(cpu != NON_PMD_CORE_ID);
65f13b50 2159 RTE_PER_LCORE(_lcore_id) = cpu;
8617afff
PS
2160
2161 return 0;
2162}
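
/*
 * Usage sketch (illustrative, not from this file): a PMD thread pins itself
 * to its assigned core before entering its polling loop, e.g.:
 *
 *     if (!pmd_thread_setaffinity_cpu(core_id)) {
 *         for (;;) {
 *             ... poll the rx queues assigned to this thread ...
 *         }
 *     }
 *
 * 'core_id' is whatever core the datapath assigned to the thread; it must
 * not be NON_PMD_CORE_ID, which is reserved for non-PMD threads.
 */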
db73f716 2163
db73f716
DDP
2164static bool
2165thread_is_pmd(void)
2166{
abb5943d 2167 return rte_lcore_id() != NON_PMD_CORE_ID;
db73f716 2168}