/*
 * Copyright (c) 2014, 2015, 2016 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <config.h>

#include <string.h>
#include <signal.h>
#include <stdlib.h>
#include <pthread.h>
#include <errno.h>
#include <sched.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>

#include "dirs.h"
#include "dp-packet.h"
#include "dpif-netdev.h"
#include "fatal-signal.h"
#include "list.h"
#include "netdev-dpdk.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "odp-util.h"
#include "ofp-print.h"
#include "ovs-numa.h"
#include "ovs-thread.h"
#include "ovs-rcu.h"
#include "packets.h"
#include "shash.h"
#include "sset.h"
#include "unaligned.h"
#include "timeval.h"
#include "unixctl.h"
#include "openvswitch/vlog.h"

#include "rte_config.h"
#include "rte_mbuf.h"
#include "rte_virtio_net.h"
8a9562d2
PS
58VLOG_DEFINE_THIS_MODULE(dpdk);
59static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
60
61#define DPDK_PORT_WATCHDOG_INTERVAL 5
62
63#define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
64#define OVS_VPORT_DPDK "ovs_dpdk"
65
/*
 * We need to reserve tons of extra space in the mbufs so we can align the
 * DMA addresses to 4KB.
 * The minimum mbuf size is limited to avoid scatter behaviour and a drop in
 * performance for standard Ethernet MTU.
 */
#define ETHER_HDR_MAX_LEN (ETHER_HDR_LEN + ETHER_CRC_LEN + (2 * VLAN_HEADER_LEN))
#define MTU_TO_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
#define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN)
#define FRAME_LEN_TO_MTU(frame_len) ((frame_len) - ETHER_HDR_LEN - ETHER_CRC_LEN)
#define MBUF_SIZE(mtu) (MTU_TO_MAX_FRAME_LEN(mtu) \
                        + sizeof(struct dp_packet) \
                        + RTE_PKTMBUF_HEADROOM)
#define NETDEV_DPDK_MBUF_ALIGN 1024
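
/* Worked example, assuming the usual ETHER_HDR_LEN of 14, ETHER_CRC_LEN of 4
 * and VLAN_HEADER_LEN of 4: ETHER_HDR_MAX_LEN is 14 + 4 + 2*4 = 26 bytes, so
 * for mtu = 1500, MTU_TO_FRAME_LEN(1500) = 1518 and
 * MTU_TO_MAX_FRAME_LEN(1500) = 1526.  MBUF_SIZE() additionally reserves room
 * for the struct dp_packet metadata and the mbuf headroom. */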
8a9562d2 80
/* Max and min number of packets in the mempool. OVS tries to allocate a
 * mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
 * enough hugepages) we keep halving the number until the allocation succeeds
 * or we reach MIN_NB_MBUF. */
85
86#define MAX_NB_MBUF (4096 * 64)
87#define MIN_NB_MBUF (4096 * 4)
88#define MP_CACHE_SZ RTE_MEMPOOL_CACHE_MAX_SIZE
89
90/* MAX_NB_MBUF can be divided by 2 many times, until MIN_NB_MBUF */
91BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF) == 0);
92
93/* The smallest possible NB_MBUF that we're going to try should be a multiple
94 * of MP_CACHE_SZ. This is advised by DPDK documentation. */
95BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))
96 % MP_CACHE_SZ == 0);
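
/* For illustration: with the values above, MAX_NB_MBUF is 4096 * 64 = 262144
 * and MIN_NB_MBUF is 4096 * 4 = 16384, so dpdk_mp_get() tries 262144, 131072,
 * 65536, 32768 and finally 16384 mbufs before giving up; the asserts above
 * guarantee that the smallest attempt is still a multiple of MP_CACHE_SZ. */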
97
8a9562d2
PS
98#define SOCKET0 0
99
79f5354c
PM
100#define NIC_PORT_RX_Q_SIZE 2048 /* Size of Physical NIC RX Queue, Max (n+32<=4096)*/
101#define NIC_PORT_TX_Q_SIZE 2048 /* Size of Physical NIC TX Queue, Max (n+32<=4096)*/
102
585a5bea
IM
103#define OVS_VHOST_MAX_QUEUE_NUM 1024 /* Maximum number of vHost TX queues. */
104
static char *cuse_dev_name = NULL; /* Name of vhost-cuse character device. */
static char *vhost_sock_dir = NULL; /* Location of vhost-user sockets. */
58397e6c 107
/*
 * Maximum amount of time in microseconds to try to enqueue to vhost.
 */
#define VHOST_ENQ_RETRY_USECS 100
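
/* For example, with a (hypothetical) 2.4 GHz timer the retry loop below
 * converts this into VHOST_ENQ_RETRY_USECS * rte_get_timer_hz() / 1E6, i.e.
 * roughly 240,000 TSC cycles of polling for free descriptors before the
 * remaining packets are counted as dropped. */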
112
8a9562d2 113static const struct rte_eth_conf port_conf = {
a28ddd11
DDP
114 .rxmode = {
115 .mq_mode = ETH_MQ_RX_RSS,
116 .split_hdr_size = 0,
117 .header_split = 0, /* Header Split disabled */
118 .hw_ip_checksum = 0, /* IP checksum offload disabled */
119 .hw_vlan_filter = 0, /* VLAN filtering disabled */
120 .jumbo_frame = 0, /* Jumbo Frame Support disabled */
121 .hw_strip_crc = 0,
122 },
123 .rx_adv_conf = {
124 .rss_conf = {
125 .rss_key = NULL,
543342a4 126 .rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
8a9562d2 127 },
a28ddd11
DDP
128 },
129 .txmode = {
130 .mq_mode = ETH_MQ_TX_NONE,
131 },
8a9562d2
PS
132};
133
3a100265 134enum { MAX_TX_QUEUE_LEN = 384 };
58f7c37b
DDP
135enum { DPDK_RING_SIZE = 256 };
136BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
8a9562d2
PS
137enum { DRAIN_TSC = 200000ULL };
138
58397e6c
KT
139enum dpdk_dev_type {
140 DPDK_DEV_ETH = 0,
7d1ced01 141 DPDK_DEV_VHOST = 1,
58397e6c
KT
142};
143
8a9562d2
PS
144static int rte_eal_init_ret = ENODEV;
145
146static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;
147
148/* Contains all 'struct dpdk_dev's. */
ca6ba700 149static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
55951e15 150 = OVS_LIST_INITIALIZER(&dpdk_list);
8a9562d2 151
ca6ba700 152static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mutex)
55951e15 153 = OVS_LIST_INITIALIZER(&dpdk_mp_list);
8a9562d2 154
/* This mutex must be used by non-pmd threads when allocating or freeing
 * mbufs through mempools. Since dpdk_queue_pkts() and dpdk_queue_flush() may
 * use mempools, a non-pmd thread should hold this mutex while calling them. */
bce01e3a 158static struct ovs_mutex nonpmd_mempool_mutex = OVS_MUTEX_INITIALIZER;
db73f716 159
8a9562d2
PS
160struct dpdk_mp {
161 struct rte_mempool *mp;
162 int mtu;
163 int socket_id;
164 int refcount;
ca6ba700 165 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
8a9562d2
PS
166};
167
5a034064
AW
168/* There should be one 'struct dpdk_tx_queue' created for
169 * each cpu core. */
8a9562d2 170struct dpdk_tx_queue {
94143fc4
AW
    bool flush_tx;                 /* Set to true to flush the queue every
                                    * time packets are queued. */
8a9562d2 173 int count;
a0cb2d66
DDP
174 rte_spinlock_t tx_lock; /* Protects the members and the NIC queue
175 * from concurrent access. It is used only
176 * if the queue is shared among different
177 * pmd threads (see 'txq_needs_locking'). */
585a5bea
IM
178 int map; /* Mapping of configured vhost-user queues
179 * to enabled by guest. */
8a9562d2
PS
180 uint64_t tsc;
181 struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
182};
183
/* DPDK has no way to remove DPDK ring ethernet devices,
 * so we have to keep them around once they've been created. */
187
ca6ba700 188static struct ovs_list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
55951e15 189 = OVS_LIST_INITIALIZER(&dpdk_ring_list);
95fb793a 190
191struct dpdk_ring {
192 /* For the client rings */
193 struct rte_ring *cring_tx;
194 struct rte_ring *cring_rx;
b83a2df1 195 unsigned int user_port_id; /* User given port no, parsed from port name */
95fb793a 196 int eth_port_id; /* ethernet device port id */
ca6ba700 197 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
95fb793a 198};
199
8a9562d2
PS
200struct netdev_dpdk {
201 struct netdev up;
202 int port_id;
203 int max_packet_len;
58397e6c 204 enum dpdk_dev_type type;
8a9562d2 205
5a034064 206 struct dpdk_tx_queue *tx_q;
8a9562d2
PS
207
208 struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
209
210 struct dpdk_mp *dpdk_mp;
211 int mtu;
212 int socket_id;
213 int buf_size;
8a9562d2 214 struct netdev_stats stats;
45d947c4
DDP
215 /* Protects stats */
216 rte_spinlock_t stats_lock;
8a9562d2 217
74ff3298 218 struct eth_addr hwaddr;
8a9562d2
PS
219 enum netdev_flags flags;
220
221 struct rte_eth_link link;
222 int link_reset_cnt;
223
a0cb2d66
DDP
    /* The user might request more txqs than the NIC has. We remap those
     * ('up.n_txq') onto these ('real_n_txq').
     * If the numbers match, 'txq_needs_locking' is false; otherwise it is
     * true and we take a spinlock on transmission. */
228 int real_n_txq;
4573fbd3 229 int real_n_rxq;
a0cb2d66
DDP
230 bool txq_needs_locking;
231
58397e6c
KT
232 /* virtio-net structure for vhost device */
233 OVSRCU_TYPE(struct virtio_net *) virtio_dev;
234
7d1ced01
CL
235 /* Identifier used to distinguish vhost devices from each other */
236 char vhost_id[PATH_MAX];
237
8a9562d2 238 /* In dpdk_list. */
ca6ba700 239 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
8a9562d2
PS
240};
241
242struct netdev_rxq_dpdk {
243 struct netdev_rxq up;
244 int port_id;
245};
246
5f17de68 247static bool dpdk_thread_is_pmd(void);
db73f716 248
8a9562d2
PS
249static int netdev_dpdk_construct(struct netdev *);
250
58397e6c
KT
251struct virtio_net * netdev_dpdk_get_virtio(const struct netdev_dpdk *dev);
252
8a9562d2
PS
253static bool
254is_dpdk_class(const struct netdev_class *class)
255{
256 return class->construct == netdev_dpdk_construct;
257}
258
4be4d22c
MK
/* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
 * aligned at 1k or less. If a declared mbuf size is not a multiple of this
 * value, insufficient buffers are allocated to accommodate the packet in its
 * entirety. Furthermore, certain drivers need to ensure that there is also
 * sufficient space in the Rx buffer to accommodate two VLAN tags (for QinQ
 * frames). If the RX buffer is too small, then the driver enables scatter RX
 * behaviour, which reduces performance. To prevent this, use a buffer size
 * that is closest to 'mtu', but which satisfies the aforementioned criteria.
 */
268static uint32_t
269dpdk_buf_size(int mtu)
270{
271 return ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) + RTE_PKTMBUF_HEADROOM),
272 NETDEV_DPDK_MBUF_ALIGN);
273}
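
/* Illustrative example, assuming the common RTE_PKTMBUF_HEADROOM of 128
 * bytes: for mtu = 1500, MTU_TO_MAX_FRAME_LEN(1500) is 1526, adding the
 * headroom gives 1654, and rounding up to NETDEV_DPDK_MBUF_ALIGN yields a
 * 2048 byte buffer. */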
274
58397e6c
KT
/* XXX: use dpdk malloc for entire OVS. In fact, hugepages should be used
 * for all the other segments: data, bss and text. */
8a9562d2
PS
277
278static void *
279dpdk_rte_mzalloc(size_t sz)
280{
281 void *ptr;
282
283 ptr = rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);
284 if (ptr == NULL) {
285 out_of_memory();
286 }
287 return ptr;
288}
289
db73f716
DDP
/* XXX this function should be called only by pmd threads (or by non-pmd
 * threads holding the nonpmd_mempool_mutex). */
8a9562d2 292void
e14deea0 293free_dpdk_buf(struct dp_packet *p)
8a9562d2 294{
db73f716 295 struct rte_mbuf *pkt = (struct rte_mbuf *) p;
8a9562d2 296
db73f716 297 rte_pktmbuf_free_seg(pkt);
8a9562d2
PS
298}
299
b3cd9f9d
PS
300static void
301ovs_rte_pktmbuf_init(struct rte_mempool *mp,
302 void *opaque_arg OVS_UNUSED,
303 void *_m,
304 unsigned i OVS_UNUSED)
305{
306 struct rte_mbuf *m = _m;
307
4be4d22c 308 rte_pktmbuf_init(mp, opaque_arg, _m, i);
b3cd9f9d 309
cf62fa4c 310 dp_packet_init_dpdk((struct dp_packet *) m, m->buf_len);
b3cd9f9d
PS
311}
312
8a9562d2
PS
313static struct dpdk_mp *
314dpdk_mp_get(int socket_id, int mtu) OVS_REQUIRES(dpdk_mutex)
315{
316 struct dpdk_mp *dmp = NULL;
317 char mp_name[RTE_MEMPOOL_NAMESIZE];
da79ce2b 318 unsigned mp_size;
4be4d22c 319 struct rte_pktmbuf_pool_private mbp_priv;
8a9562d2
PS
320
321 LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
322 if (dmp->socket_id == socket_id && dmp->mtu == mtu) {
323 dmp->refcount++;
324 return dmp;
325 }
326 }
327
328 dmp = dpdk_rte_mzalloc(sizeof *dmp);
329 dmp->socket_id = socket_id;
330 dmp->mtu = mtu;
331 dmp->refcount = 1;
4be4d22c
MK
332 mbp_priv.mbuf_data_room_size = MBUF_SIZE(mtu) - sizeof(struct dp_packet);
333 mbp_priv.mbuf_priv_size = sizeof (struct dp_packet) - sizeof (struct rte_mbuf);
8a9562d2 334
da79ce2b
DDP
335 mp_size = MAX_NB_MBUF;
336 do {
337 if (snprintf(mp_name, RTE_MEMPOOL_NAMESIZE, "ovs_mp_%d_%d_%u",
338 dmp->mtu, dmp->socket_id, mp_size) < 0) {
339 return NULL;
340 }
95fb793a 341
da79ce2b
DDP
342 dmp->mp = rte_mempool_create(mp_name, mp_size, MBUF_SIZE(mtu),
343 MP_CACHE_SZ,
344 sizeof(struct rte_pktmbuf_pool_private),
4be4d22c 345 rte_pktmbuf_pool_init, &mbp_priv,
da79ce2b
DDP
346 ovs_rte_pktmbuf_init, NULL,
347 socket_id, 0);
348 } while (!dmp->mp && rte_errno == ENOMEM && (mp_size /= 2) >= MIN_NB_MBUF);
8a9562d2
PS
349
350 if (dmp->mp == NULL) {
351 return NULL;
da79ce2b
DDP
352 } else {
353 VLOG_DBG("Allocated \"%s\" mempool with %u mbufs", mp_name, mp_size );
8a9562d2
PS
354 }
355
356 list_push_back(&dpdk_mp_list, &dmp->list_node);
357 return dmp;
358}
359
360static void
361dpdk_mp_put(struct dpdk_mp *dmp)
362{
363
364 if (!dmp) {
365 return;
366 }
367
368 dmp->refcount--;
369 ovs_assert(dmp->refcount >= 0);
370
371#if 0
372 /* I could not find any API to destroy mp. */
373 if (dmp->refcount == 0) {
374 list_delete(dmp->list_node);
375 /* destroy mp-pool. */
376 }
377#endif
378}
379
380static void
381check_link_status(struct netdev_dpdk *dev)
382{
383 struct rte_eth_link link;
384
385 rte_eth_link_get_nowait(dev->port_id, &link);
386
387 if (dev->link.link_status != link.link_status) {
3e912ffc 388 netdev_change_seq_changed(&dev->up);
8a9562d2
PS
389
390 dev->link_reset_cnt++;
391 dev->link = link;
392 if (dev->link.link_status) {
393 VLOG_DBG_RL(&rl, "Port %d Link Up - speed %u Mbps - %s",
394 dev->port_id, (unsigned)dev->link.link_speed,
395 (dev->link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
396 ("full-duplex") : ("half-duplex"));
397 } else {
398 VLOG_DBG_RL(&rl, "Port %d Link Down", dev->port_id);
399 }
400 }
401}
402
403static void *
404dpdk_watchdog(void *dummy OVS_UNUSED)
405{
406 struct netdev_dpdk *dev;
407
408 pthread_detach(pthread_self());
409
410 for (;;) {
411 ovs_mutex_lock(&dpdk_mutex);
412 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
413 ovs_mutex_lock(&dev->mutex);
414 check_link_status(dev);
415 ovs_mutex_unlock(&dev->mutex);
416 }
417 ovs_mutex_unlock(&dpdk_mutex);
418 xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
419 }
420
421 return NULL;
422}
423
b98d7669
DDP
424static int
425dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
426{
427 int diag = 0;
428 int i;
429
    /* A device may report more queues than it makes available (this has
     * been observed for Intel xl710, which reserves some of them for
     * SRIOV): rte_eth_*_queue_setup will fail if a queue is not
     * available. When this happens we can retry the configuration
     * and request fewer queues. */
435 while (n_rxq && n_txq) {
436 if (diag) {
437 VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
438 }
439
440 diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &port_conf);
441 if (diag) {
442 break;
443 }
444
445 for (i = 0; i < n_txq; i++) {
446 diag = rte_eth_tx_queue_setup(dev->port_id, i, NIC_PORT_TX_Q_SIZE,
447 dev->socket_id, NULL);
448 if (diag) {
449 VLOG_INFO("Interface %s txq(%d) setup error: %s",
450 dev->up.name, i, rte_strerror(-diag));
451 break;
452 }
453 }
454
455 if (i != n_txq) {
            /* Retry with fewer tx queues. */
457 n_txq = i;
458 continue;
459 }
460
461 for (i = 0; i < n_rxq; i++) {
462 diag = rte_eth_rx_queue_setup(dev->port_id, i, NIC_PORT_RX_Q_SIZE,
463 dev->socket_id, NULL,
464 dev->dpdk_mp->mp);
465 if (diag) {
466 VLOG_INFO("Interface %s rxq(%d) setup error: %s",
467 dev->up.name, i, rte_strerror(-diag));
468 break;
469 }
470 }
471
472 if (i != n_rxq) {
            /* Retry with fewer rx queues. */
474 n_rxq = i;
475 continue;
476 }
477
478 dev->up.n_rxq = n_rxq;
479 dev->real_n_txq = n_txq;
480
481 return 0;
482 }
483
484 return diag;
485}
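
/* Hypothetical example of the retry logic above: if a NIC reports 64 tx
 * queues but only 60 can actually be set up, the tx loop stops at i == 60,
 * n_txq is lowered to 60 and the whole configuration is retried with the
 * reduced count until both queue loops complete or rte_eth_dev_configure()
 * itself fails. */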
486
487
8a9562d2
PS
488static int
489dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
490{
491 struct rte_pktmbuf_pool_private *mbp_priv;
a0cb2d66 492 struct rte_eth_dev_info info;
8a9562d2
PS
493 struct ether_addr eth_addr;
494 int diag;
b98d7669 495 int n_rxq, n_txq;
8a9562d2
PS
496
497 if (dev->port_id < 0 || dev->port_id >= rte_eth_dev_count()) {
95fb793a 498 return ENODEV;
8a9562d2
PS
499 }
500
a0cb2d66 501 rte_eth_dev_info_get(dev->port_id, &info);
a0cb2d66 502
b98d7669
DDP
503 n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
504 n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
505
506 diag = dpdk_eth_dev_queue_setup(dev, n_rxq, n_txq);
8a9562d2 507 if (diag) {
b98d7669
DDP
508 VLOG_ERR("Interface %s(rxq:%d txq:%d) configure error: %s",
509 dev->up.name, n_rxq, n_txq, rte_strerror(-diag));
95fb793a 510 return -diag;
8a9562d2
PS
511 }
512
8a9562d2
PS
513 diag = rte_eth_dev_start(dev->port_id);
514 if (diag) {
b98d7669
DDP
515 VLOG_ERR("Interface %s start error: %s", dev->up.name,
516 rte_strerror(-diag));
95fb793a 517 return -diag;
8a9562d2
PS
518 }
519
520 rte_eth_promiscuous_enable(dev->port_id);
521 rte_eth_allmulticast_enable(dev->port_id);
522
523 memset(&eth_addr, 0x0, sizeof(eth_addr));
524 rte_eth_macaddr_get(dev->port_id, &eth_addr);
525 VLOG_INFO_RL(&rl, "Port %d: "ETH_ADDR_FMT"",
ca92d173 526 dev->port_id, ETH_ADDR_BYTES_ARGS(eth_addr.addr_bytes));
8a9562d2 527
ca92d173 528 memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
8a9562d2
PS
529 rte_eth_link_get_nowait(dev->port_id, &dev->link);
530
531 mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
532 dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;
533
534 dev->flags = NETDEV_UP | NETDEV_PROMISC;
535 return 0;
536}
537
538static struct netdev_dpdk *
539netdev_dpdk_cast(const struct netdev *netdev)
540{
541 return CONTAINER_OF(netdev, struct netdev_dpdk, up);
542}
543
544static struct netdev *
545netdev_dpdk_alloc(void)
546{
547 struct netdev_dpdk *netdev = dpdk_rte_mzalloc(sizeof *netdev);
548 return &netdev->up;
549}
550
5a034064 551static void
91968eb0 552netdev_dpdk_alloc_txq(struct netdev_dpdk *netdev, unsigned int n_txqs)
5a034064 553{
bd5131ba 554 unsigned i;
5a034064
AW
555
556 netdev->tx_q = dpdk_rte_mzalloc(n_txqs * sizeof *netdev->tx_q);
557 for (i = 0; i < n_txqs; i++) {
ba0358a1 558 int numa_id = ovs_numa_get_numa_id(i);
94143fc4 559
a0cb2d66
DDP
560 if (!netdev->txq_needs_locking) {
            /* Each index is considered as a cpu core id, since there should
             * be one tx queue for each cpu core. If the corresponding core
             * is not on the same numa node as 'netdev', set 'flush_tx'. */
565 netdev->tx_q[i].flush_tx = netdev->socket_id == numa_id;
566 } else {
567 /* Queues are shared among CPUs. Always flush */
568 netdev->tx_q[i].flush_tx = true;
569 }
585a5bea
IM
570
571 /* Initialize map for vhost devices. */
572 netdev->tx_q[i].map = -1;
a0cb2d66 573 rte_spinlock_init(&netdev->tx_q[i].tx_lock);
5a034064
AW
574 }
575}
576
8a9562d2 577static int
58397e6c
KT
578netdev_dpdk_init(struct netdev *netdev_, unsigned int port_no,
579 enum dpdk_dev_type type)
5a034064 580 OVS_REQUIRES(dpdk_mutex)
8a9562d2
PS
581{
582 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
1b7a04e0 583 int sid;
95fb793a 584 int err = 0;
4be4d22c 585 uint32_t buf_size;
8a9562d2 586
95fb793a 587 ovs_mutex_init(&netdev->mutex);
95fb793a 588 ovs_mutex_lock(&netdev->mutex);
8a9562d2 589
45d947c4
DDP
590 rte_spinlock_init(&netdev->stats_lock);
591
1b7a04e0
AW
    /* If 'sid' is negative, it means that the kernel failed to obtain the
     * pci numa info. In that situation, always use 'SOCKET0'. */
58397e6c
KT
595 if (type == DPDK_DEV_ETH) {
596 sid = rte_eth_dev_socket_id(port_no);
597 } else {
598 sid = rte_lcore_to_socket_id(rte_get_master_lcore());
599 }
600
1b7a04e0 601 netdev->socket_id = sid < 0 ? SOCKET0 : sid;
95fb793a 602 netdev->port_id = port_no;
58397e6c 603 netdev->type = type;
8a9562d2 604 netdev->flags = 0;
8a9562d2 605 netdev->mtu = ETHER_MTU;
4be4d22c 606 netdev->max_packet_len = MTU_TO_FRAME_LEN(netdev->mtu);
8a9562d2 607
4be4d22c
MK
608 buf_size = dpdk_buf_size(netdev->mtu);
609 netdev->dpdk_mp = dpdk_mp_get(netdev->socket_id, FRAME_LEN_TO_MTU(buf_size));
8a9562d2
PS
610 if (!netdev->dpdk_mp) {
611 err = ENOMEM;
95fb793a 612 goto unlock;
8a9562d2
PS
613 }
614
5496878c
AW
615 netdev_->n_txq = NR_QUEUE;
616 netdev_->n_rxq = NR_QUEUE;
a14b8947 617 netdev_->requested_n_rxq = NR_QUEUE;
a0cb2d66 618 netdev->real_n_txq = NR_QUEUE;
58397e6c
KT
619
620 if (type == DPDK_DEV_ETH) {
1b99bb05
MG
621 netdev_dpdk_alloc_txq(netdev, NR_QUEUE);
622 err = dpdk_eth_dev_init(netdev);
623 if (err) {
624 goto unlock;
625 }
585a5bea
IM
626 } else {
627 netdev_dpdk_alloc_txq(netdev, OVS_VHOST_MAX_QUEUE_NUM);
8a9562d2 628 }
8a9562d2
PS
629
630 list_push_back(&dpdk_list, &netdev->list_node);
631
95fb793a 632unlock:
5a034064
AW
633 if (err) {
634 rte_free(netdev->tx_q);
635 }
8a9562d2 636 ovs_mutex_unlock(&netdev->mutex);
95fb793a 637 return err;
638}
639
b83a2df1
MV
/* 'dev_name' must be the prefix followed by a positive decimal number
 * (no leading + or - signs are allowed). */
95fb793a 642static int
643dpdk_dev_parse_name(const char dev_name[], const char prefix[],
644 unsigned int *port_no)
645{
646 const char *cport;
647
648 if (strncmp(dev_name, prefix, strlen(prefix))) {
649 return ENODEV;
650 }
651
652 cport = dev_name + strlen(prefix);
b83a2df1
MV
653
654 if (str_to_uint(cport, 10, port_no)) {
655 return 0;
656 } else {
657 return ENODEV;
658 }
95fb793a 659}
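
/* Usage sketch (illustrative): dpdk_dev_parse_name("dpdk7", "dpdk", &port_no)
 * returns 0 and sets port_no to 7, while a name with a different prefix or a
 * non-numeric suffix such as "dpdkx" yields ENODEV. */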
660
58397e6c 661static int
bd8baf47 662vhost_construct_helper(struct netdev *netdev_) OVS_REQUIRES(dpdk_mutex)
58397e6c 663{
58397e6c
KT
664 if (rte_eal_init_ret) {
665 return rte_eal_init_ret;
666 }
667
7d1ced01
CL
668 return netdev_dpdk_init(netdev_, -1, DPDK_DEV_VHOST);
669}
670
671static int
672netdev_dpdk_vhost_cuse_construct(struct netdev *netdev_)
673{
674 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
675 int err;
676
58397e6c 677 ovs_mutex_lock(&dpdk_mutex);
7d1ced01
CL
678 strncpy(netdev->vhost_id, netdev->up.name, sizeof(netdev->vhost_id));
679 err = vhost_construct_helper(netdev_);
58397e6c 680 ovs_mutex_unlock(&dpdk_mutex);
7d1ced01
CL
681 return err;
682}
58397e6c 683
7d1ced01
CL
684static int
685netdev_dpdk_vhost_user_construct(struct netdev *netdev_)
686{
687 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
1af27e8a 688 const char *name = netdev_->name;
7d1ced01 689 int err;
a0cb2d66 690
1af27e8a
DDP
691 /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
692 * the file system. '/' or '\' would traverse directories, so they're not
693 * acceptable in 'name'. */
694 if (strchr(name, '/') || strchr(name, '\\')) {
695 VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
696 "A valid name must not include '/' or '\\'",
697 name);
698 return EINVAL;
699 }
700
7d1ced01
CL
701 ovs_mutex_lock(&dpdk_mutex);
702 /* Take the name of the vhost-user port and append it to the location where
703 * the socket is to be created, then register the socket.
704 */
705 snprintf(netdev->vhost_id, sizeof(netdev->vhost_id), "%s/%s",
1af27e8a
DDP
706 vhost_sock_dir, name);
707
7d1ced01
CL
708 err = rte_vhost_driver_register(netdev->vhost_id);
709 if (err) {
710 VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
711 netdev->vhost_id);
e5c0f5a4
IM
712 } else {
713 fatal_signal_add_file_to_unlink(netdev->vhost_id);
1a64eb93 714 VLOG_INFO("Socket %s created for vhost-user port %s\n",
1af27e8a 715 netdev->vhost_id, name);
1a64eb93 716 err = vhost_construct_helper(netdev_);
7d1ced01 717 }
e5c0f5a4 718
7d1ced01 719 ovs_mutex_unlock(&dpdk_mutex);
58397e6c
KT
720 return err;
721}
722
95fb793a 723static int
724netdev_dpdk_construct(struct netdev *netdev)
725{
726 unsigned int port_no;
727 int err;
728
729 if (rte_eal_init_ret) {
730 return rte_eal_init_ret;
731 }
732
733 /* Names always start with "dpdk" */
734 err = dpdk_dev_parse_name(netdev->name, "dpdk", &port_no);
735 if (err) {
736 return err;
737 }
738
739 ovs_mutex_lock(&dpdk_mutex);
58397e6c 740 err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_ETH);
8a9562d2
PS
741 ovs_mutex_unlock(&dpdk_mutex);
742 return err;
743}
744
745static void
746netdev_dpdk_destruct(struct netdev *netdev_)
747{
748 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
749
750 ovs_mutex_lock(&dev->mutex);
751 rte_eth_dev_stop(dev->port_id);
752 ovs_mutex_unlock(&dev->mutex);
753
754 ovs_mutex_lock(&dpdk_mutex);
5a034064 755 rte_free(dev->tx_q);
8a9562d2
PS
756 list_remove(&dev->list_node);
757 dpdk_mp_put(dev->dpdk_mp);
758 ovs_mutex_unlock(&dpdk_mutex);
58397e6c 759}
8a9562d2 760
58397e6c
KT
761static void
762netdev_dpdk_vhost_destruct(struct netdev *netdev_)
763{
764 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
765
766 /* Can't remove a port while a guest is attached to it. */
767 if (netdev_dpdk_get_virtio(dev) != NULL) {
768 VLOG_ERR("Can not remove port, vhost device still attached");
769 return;
770 }
771
e04f7e4f
CL
772 if (rte_vhost_driver_unregister(dev->vhost_id)) {
773 VLOG_ERR("Unable to remove vhost-user socket %s", dev->vhost_id);
e5c0f5a4
IM
774 } else {
775 fatal_signal_remove_file_to_unlink(dev->vhost_id);
e04f7e4f
CL
776 }
777
58397e6c
KT
778 ovs_mutex_lock(&dpdk_mutex);
779 list_remove(&dev->list_node);
780 dpdk_mp_put(dev->dpdk_mp);
781 ovs_mutex_unlock(&dpdk_mutex);
8a9562d2
PS
782}
783
784static void
785netdev_dpdk_dealloc(struct netdev *netdev_)
786{
787 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
788
789 rte_free(netdev);
790}
791
792static int
a14b8947 793netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
8a9562d2 794{
a14b8947 795 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
796
797 ovs_mutex_lock(&dev->mutex);
798
a14b8947
IM
799 smap_add_format(args, "requested_rx_queues", "%d", netdev->requested_n_rxq);
800 smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq);
801 smap_add_format(args, "requested_tx_queues", "%d", netdev->n_txq);
a0cb2d66 802 smap_add_format(args, "configured_tx_queues", "%d", dev->real_n_txq);
8a9562d2
PS
803 ovs_mutex_unlock(&dev->mutex);
804
805 return 0;
806}
807
a14b8947
IM
808static int
809netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args)
810{
811 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
812
813 ovs_mutex_lock(&dev->mutex);
814 netdev->requested_n_rxq = MAX(smap_get_int(args, "n_rxq",
815 netdev->requested_n_rxq), 1);
816 netdev_change_seq_changed(netdev);
817 ovs_mutex_unlock(&dev->mutex);
818
819 return 0;
820}
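
/* Usage note (illustrative): the "n_rxq" value handled above normally comes
 * from the database, e.g. something like
 *     ovs-vsctl set Interface dpdk0 options:n_rxq=4
 * and values below 1 are clamped to 1 by the MAX() above. */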
821
7dec44fe
AW
822static int
823netdev_dpdk_get_numa_id(const struct netdev *netdev_)
824{
825 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
826
827 return netdev->socket_id;
828}
829
5496878c
AW
/* Sets the number of tx queues and rx queues for the dpdk interface.
 * If the configuration fails, do not try to restore its old configuration;
 * just return the error. */
833static int
834netdev_dpdk_set_multiq(struct netdev *netdev_, unsigned int n_txq,
835 unsigned int n_rxq)
836{
837 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
838 int err = 0;
4b8df037 839 int old_rxq, old_txq;
5496878c
AW
840
841 if (netdev->up.n_txq == n_txq && netdev->up.n_rxq == n_rxq) {
842 return err;
843 }
844
b7ccaf67 845 ovs_mutex_lock(&dpdk_mutex);
5496878c 846 ovs_mutex_lock(&netdev->mutex);
91968eb0 847
5496878c 848 rte_eth_dev_stop(netdev->port_id);
91968eb0 849
4b8df037
DDP
850 old_txq = netdev->up.n_txq;
851 old_rxq = netdev->up.n_rxq;
5496878c
AW
852 netdev->up.n_txq = n_txq;
853 netdev->up.n_rxq = n_rxq;
58397e6c 854
91968eb0 855 rte_free(netdev->tx_q);
5496878c 856 err = dpdk_eth_dev_init(netdev);
a0cb2d66 857 netdev_dpdk_alloc_txq(netdev, netdev->real_n_txq);
4b8df037
DDP
858 if (err) {
859 /* If there has been an error, it means that the requested queues
860 * have not been created. Restore the old numbers. */
861 netdev->up.n_txq = old_txq;
862 netdev->up.n_rxq = old_rxq;
863 }
a0cb2d66
DDP
864
865 netdev->txq_needs_locking = netdev->real_n_txq != netdev->up.n_txq;
91968eb0 866
5496878c 867 ovs_mutex_unlock(&netdev->mutex);
b7ccaf67 868 ovs_mutex_unlock(&dpdk_mutex);
5496878c
AW
869
870 return err;
871}
872
58397e6c 873static int
4573fbd3 874netdev_dpdk_vhost_cuse_set_multiq(struct netdev *netdev_, unsigned int n_txq,
a0cb2d66 875 unsigned int n_rxq)
58397e6c
KT
876{
877 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
878 int err = 0;
879
880 if (netdev->up.n_txq == n_txq && netdev->up.n_rxq == n_rxq) {
881 return err;
882 }
883
884 ovs_mutex_lock(&dpdk_mutex);
885 ovs_mutex_lock(&netdev->mutex);
886
887 netdev->up.n_txq = n_txq;
a0cb2d66
DDP
888 netdev->real_n_txq = 1;
889 netdev->up.n_rxq = 1;
4573fbd3
FL
890 netdev->txq_needs_locking = netdev->real_n_txq != netdev->up.n_txq;
891
892 ovs_mutex_unlock(&netdev->mutex);
893 ovs_mutex_unlock(&dpdk_mutex);
894
895 return err;
896}
897
898static int
899netdev_dpdk_vhost_set_multiq(struct netdev *netdev_, unsigned int n_txq,
900 unsigned int n_rxq)
901{
902 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
903 int err = 0;
904
905 if (netdev->up.n_txq == n_txq && netdev->up.n_rxq == n_rxq) {
906 return err;
907 }
908
909 ovs_mutex_lock(&dpdk_mutex);
910 ovs_mutex_lock(&netdev->mutex);
911
4573fbd3
FL
912 netdev->up.n_txq = n_txq;
913 netdev->up.n_rxq = n_rxq;
58397e6c
KT
914
915 ovs_mutex_unlock(&netdev->mutex);
916 ovs_mutex_unlock(&dpdk_mutex);
917
918 return err;
919}
920
8a9562d2
PS
921static struct netdev_rxq *
922netdev_dpdk_rxq_alloc(void)
923{
924 struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);
925
926 return &rx->up;
927}
928
929static struct netdev_rxq_dpdk *
930netdev_rxq_dpdk_cast(const struct netdev_rxq *rx)
931{
932 return CONTAINER_OF(rx, struct netdev_rxq_dpdk, up);
933}
934
935static int
936netdev_dpdk_rxq_construct(struct netdev_rxq *rxq_)
937{
938 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq_);
939 struct netdev_dpdk *netdev = netdev_dpdk_cast(rx->up.netdev);
940
941 ovs_mutex_lock(&netdev->mutex);
942 rx->port_id = netdev->port_id;
943 ovs_mutex_unlock(&netdev->mutex);
944
945 return 0;
946}
947
948static void
949netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq_ OVS_UNUSED)
950{
951}
952
953static void
954netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq_)
955{
956 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq_);
957
958 rte_free(rx);
959}
960
b170db2a
RW
961static inline void
962dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
8a9562d2
PS
963{
964 struct dpdk_tx_queue *txq = &dev->tx_q[qid];
1304f1f8
DDP
965 uint32_t nb_tx = 0;
966
967 while (nb_tx != txq->count) {
968 uint32_t ret;
969
970 ret = rte_eth_tx_burst(dev->port_id, qid, txq->burst_pkts + nb_tx,
971 txq->count - nb_tx);
972 if (!ret) {
973 break;
974 }
975
976 nb_tx += ret;
977 }
8a9562d2 978
b170db2a 979 if (OVS_UNLIKELY(nb_tx != txq->count)) {
db73f716
DDP
980 /* free buffers, which we couldn't transmit, one at a time (each
981 * packet could come from a different mempool) */
982 int i;
983
984 for (i = nb_tx; i < txq->count; i++) {
985 rte_pktmbuf_free_seg(txq->burst_pkts[i]);
986 }
45d947c4 987 rte_spinlock_lock(&dev->stats_lock);
1304f1f8 988 dev->stats.tx_dropped += txq->count-nb_tx;
45d947c4 989 rte_spinlock_unlock(&dev->stats_lock);
8a9562d2 990 }
1304f1f8 991
8a9562d2 992 txq->count = 0;
844f2d74 993 txq->tsc = rte_get_timer_cycles();
b170db2a
RW
994}
995
996static inline void
997dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
998{
999 struct dpdk_tx_queue *txq = &dev->tx_q[qid];
1000
1001 if (txq->count == 0) {
1002 return;
1003 }
b170db2a 1004 dpdk_queue_flush__(dev, qid);
8a9562d2
PS
1005}
1006
58397e6c
KT
1007static bool
1008is_vhost_running(struct virtio_net *dev)
1009{
1010 return (dev != NULL && (dev->flags & VIRTIO_DEV_RUNNING));
1011}
1012
9e3ddd45
TP
1013static inline void
1014netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
1015 struct dp_packet **packets, int count)
1016{
1017 int i;
1018 struct dp_packet *packet;
1019
1020 stats->rx_packets += count;
1021 for (i = 0; i < count; i++) {
1022 packet = packets[i];
1023
1024 if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
1025 /* This only protects the following multicast counting from
1026 * too short packets, but it does not stop the packet from
1027 * further processing. */
1028 stats->rx_errors++;
1029 stats->rx_length_errors++;
1030 continue;
1031 }
1032
1033 struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);
1034 if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {
1035 stats->multicast++;
1036 }
1037
1038 stats->rx_bytes += dp_packet_size(packet);
1039 }
1040}
1041
58397e6c
KT
1042/*
1043 * The receive path for the vhost port is the TX path out from guest.
1044 */
1045static int
1046netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq_,
1047 struct dp_packet **packets, int *c)
1048{
1049 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq_);
1050 struct netdev *netdev = rx->up.netdev;
1051 struct netdev_dpdk *vhost_dev = netdev_dpdk_cast(netdev);
1052 struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(vhost_dev);
4573fbd3 1053 int qid = rxq_->queue_id;
58397e6c
KT
1054 uint16_t nb_rx = 0;
1055
1056 if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) {
1057 return EAGAIN;
1058 }
1059
4573fbd3
FL
1060 if (rxq_->queue_id >= vhost_dev->real_n_rxq) {
1061 return EOPNOTSUPP;
1062 }
1063
1064 nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid * VIRTIO_QNUM + VIRTIO_TXQ,
58397e6c
KT
1065 vhost_dev->dpdk_mp->mp,
1066 (struct rte_mbuf **)packets,
cd159f1a 1067 NETDEV_MAX_BURST);
58397e6c
KT
1068 if (!nb_rx) {
1069 return EAGAIN;
1070 }
1071
45d947c4 1072 rte_spinlock_lock(&vhost_dev->stats_lock);
9e3ddd45 1073 netdev_dpdk_vhost_update_rx_counters(&vhost_dev->stats, packets, nb_rx);
45d947c4
DDP
1074 rte_spinlock_unlock(&vhost_dev->stats_lock);
1075
58397e6c
KT
1076 *c = (int) nb_rx;
1077 return 0;
1078}
1079
8a9562d2 1080static int
e14deea0 1081netdev_dpdk_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets,
91088554 1082 int *c)
8a9562d2
PS
1083{
1084 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq_);
1085 struct netdev *netdev = rx->up.netdev;
1086 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 1087 int nb_rx;
8a9562d2 1088
    /* There is only one tx queue for this core.  Do not flush other
     * queues.
     * Do not flush a tx queue which is shared among CPUs,
     * since it is always flushed. */
1093 if (rxq_->queue_id == rte_lcore_id() &&
1094 OVS_LIKELY(!dev->txq_needs_locking)) {
5496878c
AW
1095 dpdk_queue_flush(dev, rxq_->queue_id);
1096 }
8a9562d2
PS
1097
1098 nb_rx = rte_eth_rx_burst(rx->port_id, rxq_->queue_id,
7d08d53e 1099 (struct rte_mbuf **) packets,
cd159f1a 1100 NETDEV_MAX_BURST);
8a9562d2
PS
1101 if (!nb_rx) {
1102 return EAGAIN;
1103 }
1104
8a9562d2
PS
1105 *c = nb_rx;
1106
1107 return 0;
1108}
1109
9e3ddd45
TP
1110static inline void
1111netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
1112 struct dp_packet **packets,
1113 int attempted,
1114 int dropped)
1115{
1116 int i;
1117 int sent = attempted - dropped;
1118
1119 stats->tx_packets += sent;
1120 stats->tx_dropped += dropped;
1121
1122 for (i = 0; i < sent; i++) {
1123 stats->tx_bytes += dp_packet_size(packets[i]);
1124 }
1125}
1126
58397e6c 1127static void
4573fbd3
FL
1128__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
1129 struct dp_packet **pkts, int cnt,
1130 bool may_steal)
58397e6c
KT
1131{
1132 struct netdev_dpdk *vhost_dev = netdev_dpdk_cast(netdev);
1133 struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(vhost_dev);
95e9881f
KT
1134 struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
1135 unsigned int total_pkts = cnt;
1136 uint64_t start = 0;
58397e6c 1137
585a5bea
IM
1138 qid = vhost_dev->tx_q[qid % vhost_dev->real_n_txq].map;
1139
1140 if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid == -1)) {
45d947c4 1141 rte_spinlock_lock(&vhost_dev->stats_lock);
1b99bb05 1142 vhost_dev->stats.tx_dropped+= cnt;
45d947c4 1143 rte_spinlock_unlock(&vhost_dev->stats_lock);
1b99bb05 1144 goto out;
58397e6c
KT
1145 }
1146
585a5bea 1147 rte_spinlock_lock(&vhost_dev->tx_q[qid].tx_lock);
58397e6c 1148
95e9881f 1149 do {
4573fbd3 1150 int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
95e9881f
KT
1151 unsigned int tx_pkts;
1152
4573fbd3 1153 tx_pkts = rte_vhost_enqueue_burst(virtio_dev, vhost_qid,
95e9881f
KT
1154 cur_pkts, cnt);
1155 if (OVS_LIKELY(tx_pkts)) {
1156 /* Packets have been sent.*/
1157 cnt -= tx_pkts;
1158 /* Prepare for possible next iteration.*/
1159 cur_pkts = &cur_pkts[tx_pkts];
1160 } else {
1161 uint64_t timeout = VHOST_ENQ_RETRY_USECS * rte_get_timer_hz() / 1E6;
1162 unsigned int expired = 0;
1163
1164 if (!start) {
1165 start = rte_get_timer_cycles();
1166 }
1167
1168 /*
1169 * Unable to enqueue packets to vhost interface.
1170 * Check available entries before retrying.
1171 */
4573fbd3 1172 while (!rte_vring_available_entries(virtio_dev, vhost_qid)) {
95e9881f
KT
1173 if (OVS_UNLIKELY((rte_get_timer_cycles() - start) > timeout)) {
1174 expired = 1;
1175 break;
1176 }
1177 }
1178 if (expired) {
1179 /* break out of main loop. */
1180 break;
1181 }
1182 }
1183 } while (cnt);
4573fbd3 1184
585a5bea 1185 rte_spinlock_unlock(&vhost_dev->tx_q[qid].tx_lock);
95e9881f 1186
45d947c4 1187 rte_spinlock_lock(&vhost_dev->stats_lock);
9e3ddd45
TP
1188 netdev_dpdk_vhost_update_tx_counters(&vhost_dev->stats, pkts, total_pkts,
1189 cnt);
45d947c4 1190 rte_spinlock_unlock(&vhost_dev->stats_lock);
58397e6c
KT
1191
1192out:
1193 if (may_steal) {
95e9881f
KT
1194 int i;
1195
1196 for (i = 0; i < total_pkts; i++) {
1b99bb05
MG
1197 dp_packet_delete(pkts[i]);
1198 }
58397e6c
KT
1199 }
1200}
1201
8a9562d2 1202inline static void
f4fd623c
DDP
1203dpdk_queue_pkts(struct netdev_dpdk *dev, int qid,
1204 struct rte_mbuf **pkts, int cnt)
8a9562d2
PS
1205{
1206 struct dpdk_tx_queue *txq = &dev->tx_q[qid];
1207 uint64_t diff_tsc;
8a9562d2 1208
f4fd623c
DDP
1209 int i = 0;
1210
f4fd623c
DDP
1211 while (i < cnt) {
1212 int freeslots = MAX_TX_QUEUE_LEN - txq->count;
1213 int tocopy = MIN(freeslots, cnt-i);
8a9562d2 1214
f4fd623c
DDP
1215 memcpy(&txq->burst_pkts[txq->count], &pkts[i],
1216 tocopy * sizeof (struct rte_mbuf *));
1217
1218 txq->count += tocopy;
1219 i += tocopy;
1220
94143fc4 1221 if (txq->count == MAX_TX_QUEUE_LEN || txq->flush_tx) {
b170db2a 1222 dpdk_queue_flush__(dev, qid);
f4fd623c 1223 }
844f2d74 1224 diff_tsc = rte_get_timer_cycles() - txq->tsc;
f4fd623c 1225 if (diff_tsc >= DRAIN_TSC) {
b170db2a 1226 dpdk_queue_flush__(dev, qid);
f4fd623c 1227 }
8a9562d2 1228 }
8a9562d2
PS
1229}
1230
1231/* Tx function. Transmit packets indefinitely */
1232static void
58397e6c 1233dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
2654cc33 1234 int cnt)
db73f716 1235 OVS_NO_THREAD_SAFETY_ANALYSIS
8a9562d2 1236{
bce01e3a
EJ
1237#if !defined(__CHECKER__) && !defined(_WIN32)
1238 const size_t PKT_ARRAY_SIZE = cnt;
1239#else
1240 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 1241 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
bce01e3a 1242#endif
8a9562d2 1243 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
bce01e3a 1244 struct rte_mbuf *mbufs[PKT_ARRAY_SIZE];
175cf4de
RW
1245 int dropped = 0;
1246 int newcnt = 0;
1247 int i;
8a9562d2 1248
db73f716
DDP
1249 /* If we are on a non pmd thread we have to use the mempool mutex, because
1250 * every non pmd thread shares the same mempool cache */
1251
5f17de68 1252 if (!dpdk_thread_is_pmd()) {
db73f716
DDP
1253 ovs_mutex_lock(&nonpmd_mempool_mutex);
1254 }
1255
f4fd623c 1256 for (i = 0; i < cnt; i++) {
cf62fa4c 1257 int size = dp_packet_size(pkts[i]);
95fb793a 1258
f98d7864 1259 if (OVS_UNLIKELY(size > dev->max_packet_len)) {
f4fd623c
DDP
1260 VLOG_WARN_RL(&rl, "Too big size %d max_packet_len %d",
1261 (int)size , dev->max_packet_len);
1262
175cf4de 1263 dropped++;
f4fd623c
DDP
1264 continue;
1265 }
8a9562d2 1266
f4fd623c 1267 mbufs[newcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
8a9562d2 1268
f4fd623c 1269 if (!mbufs[newcnt]) {
175cf4de
RW
1270 dropped += cnt - i;
1271 break;
f4fd623c
DDP
1272 }
1273
1274 /* We have to do a copy for now */
b8e57534 1275 memcpy(rte_pktmbuf_mtod(mbufs[newcnt], void *), dp_packet_data(pkts[i]), size);
f4fd623c
DDP
1276
1277 rte_pktmbuf_data_len(mbufs[newcnt]) = size;
1278 rte_pktmbuf_pkt_len(mbufs[newcnt]) = size;
1279
1280 newcnt++;
1281 }
8a9562d2 1282
f98d7864 1283 if (OVS_UNLIKELY(dropped)) {
45d947c4 1284 rte_spinlock_lock(&dev->stats_lock);
175cf4de 1285 dev->stats.tx_dropped += dropped;
45d947c4 1286 rte_spinlock_unlock(&dev->stats_lock);
175cf4de
RW
1287 }
1288
58397e6c 1289 if (dev->type == DPDK_DEV_VHOST) {
4573fbd3 1290 __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) mbufs, newcnt, true);
58397e6c
KT
1291 } else {
1292 dpdk_queue_pkts(dev, qid, mbufs, newcnt);
1293 dpdk_queue_flush(dev, qid);
1294 }
db73f716 1295
5f17de68 1296 if (!dpdk_thread_is_pmd()) {
db73f716
DDP
1297 ovs_mutex_unlock(&nonpmd_mempool_mutex);
1298 }
8a9562d2
PS
1299}
1300
58397e6c 1301static int
4573fbd3 1302netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet **pkts,
58397e6c
KT
1303 int cnt, bool may_steal)
1304{
1305 if (OVS_UNLIKELY(pkts[0]->source != DPBUF_DPDK)) {
1306 int i;
1307
1308 dpdk_do_tx_copy(netdev, qid, pkts, cnt);
1309 if (may_steal) {
1310 for (i = 0; i < cnt; i++) {
1311 dp_packet_delete(pkts[i]);
1312 }
1313 }
1314 } else {
4573fbd3 1315 __netdev_dpdk_vhost_send(netdev, qid, pkts, cnt, may_steal);
58397e6c
KT
1316 }
1317 return 0;
1318}
1319
7251515e
DV
1320static inline void
1321netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
e14deea0 1322 struct dp_packet **pkts, int cnt, bool may_steal)
8a9562d2 1323{
f4fd623c 1324 int i;
8a9562d2 1325
a0cb2d66
DDP
1326 if (OVS_UNLIKELY(dev->txq_needs_locking)) {
1327 qid = qid % dev->real_n_txq;
1328 rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
1329 }
1330
7251515e 1331 if (OVS_UNLIKELY(!may_steal ||
cf62fa4c 1332 pkts[0]->source != DPBUF_DPDK)) {
7251515e
DV
1333 struct netdev *netdev = &dev->up;
1334
2654cc33 1335 dpdk_do_tx_copy(netdev, qid, pkts, cnt);
b3cd9f9d
PS
1336
1337 if (may_steal) {
f4fd623c 1338 for (i = 0; i < cnt; i++) {
e14deea0 1339 dp_packet_delete(pkts[i]);
f4fd623c 1340 }
b3cd9f9d 1341 }
8a9562d2 1342 } else {
f4fd623c
DDP
1343 int next_tx_idx = 0;
1344 int dropped = 0;
8a9562d2 1345
f4fd623c 1346 for (i = 0; i < cnt; i++) {
cf62fa4c 1347 int size = dp_packet_size(pkts[i]);
1b99bb05 1348
f4fd623c
DDP
1349 if (OVS_UNLIKELY(size > dev->max_packet_len)) {
1350 if (next_tx_idx != i) {
1351 dpdk_queue_pkts(dev, qid,
1352 (struct rte_mbuf **)&pkts[next_tx_idx],
1353 i-next_tx_idx);
1ebfe1ac 1354 }
f4fd623c 1355
1ebfe1ac
DDP
1356 VLOG_WARN_RL(&rl, "Too big size %d max_packet_len %d",
1357 (int)size , dev->max_packet_len);
f4fd623c 1358
e14deea0 1359 dp_packet_delete(pkts[i]);
1ebfe1ac 1360 dropped++;
f4fd623c
DDP
1361 next_tx_idx = i + 1;
1362 }
1363 }
1364 if (next_tx_idx != cnt) {
1365 dpdk_queue_pkts(dev, qid,
1366 (struct rte_mbuf **)&pkts[next_tx_idx],
1367 cnt-next_tx_idx);
1368 }
8a9562d2 1369
f4fd623c 1370 if (OVS_UNLIKELY(dropped)) {
45d947c4 1371 rte_spinlock_lock(&dev->stats_lock);
f4fd623c 1372 dev->stats.tx_dropped += dropped;
45d947c4 1373 rte_spinlock_unlock(&dev->stats_lock);
f4fd623c 1374 }
8a9562d2 1375 }
a0cb2d66
DDP
1376
1377 if (OVS_UNLIKELY(dev->txq_needs_locking)) {
1378 rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
1379 }
7251515e
DV
1380}
1381
1382static int
1383netdev_dpdk_eth_send(struct netdev *netdev, int qid,
e14deea0 1384 struct dp_packet **pkts, int cnt, bool may_steal)
7251515e
DV
1385{
1386 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 1387
7251515e
DV
1388 netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
1389 return 0;
8a9562d2
PS
1390}
1391
1392static int
74ff3298 1393netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
8a9562d2
PS
1394{
1395 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1396
1397 ovs_mutex_lock(&dev->mutex);
1398 if (!eth_addr_equals(dev->hwaddr, mac)) {
74ff3298 1399 dev->hwaddr = mac;
045c0d1a 1400 netdev_change_seq_changed(netdev);
8a9562d2
PS
1401 }
1402 ovs_mutex_unlock(&dev->mutex);
1403
1404 return 0;
1405}
1406
1407static int
74ff3298 1408netdev_dpdk_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
8a9562d2
PS
1409{
1410 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1411
1412 ovs_mutex_lock(&dev->mutex);
74ff3298 1413 *mac = dev->hwaddr;
8a9562d2
PS
1414 ovs_mutex_unlock(&dev->mutex);
1415
1416 return 0;
1417}
1418
1419static int
1420netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
1421{
1422 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1423
1424 ovs_mutex_lock(&dev->mutex);
1425 *mtup = dev->mtu;
1426 ovs_mutex_unlock(&dev->mutex);
1427
1428 return 0;
1429}
1430
1431static int
1432netdev_dpdk_set_mtu(const struct netdev *netdev, int mtu)
1433{
1434 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4be4d22c 1435 int old_mtu, err, dpdk_mtu;
8a9562d2
PS
1436 struct dpdk_mp *old_mp;
1437 struct dpdk_mp *mp;
4be4d22c 1438 uint32_t buf_size;
8a9562d2
PS
1439
1440 ovs_mutex_lock(&dpdk_mutex);
1441 ovs_mutex_lock(&dev->mutex);
1442 if (dev->mtu == mtu) {
1443 err = 0;
1444 goto out;
1445 }
1446
4be4d22c
MK
1447 buf_size = dpdk_buf_size(mtu);
1448 dpdk_mtu = FRAME_LEN_TO_MTU(buf_size);
1449
1450 mp = dpdk_mp_get(dev->socket_id, dpdk_mtu);
8a9562d2
PS
1451 if (!mp) {
1452 err = ENOMEM;
1453 goto out;
1454 }
1455
1456 rte_eth_dev_stop(dev->port_id);
1457
1458 old_mtu = dev->mtu;
1459 old_mp = dev->dpdk_mp;
1460 dev->dpdk_mp = mp;
1461 dev->mtu = mtu;
4be4d22c 1462 dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
8a9562d2
PS
1463
1464 err = dpdk_eth_dev_init(dev);
1465 if (err) {
8a9562d2
PS
1466 dpdk_mp_put(mp);
1467 dev->mtu = old_mtu;
1468 dev->dpdk_mp = old_mp;
4be4d22c 1469 dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
8a9562d2
PS
1470 dpdk_eth_dev_init(dev);
1471 goto out;
1472 }
1473
1474 dpdk_mp_put(old_mp);
045c0d1a 1475 netdev_change_seq_changed(netdev);
8a9562d2
PS
1476out:
1477 ovs_mutex_unlock(&dev->mutex);
1478 ovs_mutex_unlock(&dpdk_mutex);
1479 return err;
1480}
1481
1482static int
1483netdev_dpdk_get_carrier(const struct netdev *netdev_, bool *carrier);
1484
58397e6c
KT
1485static int
1486netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
1487 struct netdev_stats *stats)
1488{
1489 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1490
1491 ovs_mutex_lock(&dev->mutex);
1492 memset(stats, 0, sizeof(*stats));
1493 /* Unsupported Stats */
58397e6c
KT
1494 stats->collisions = UINT64_MAX;
1495 stats->rx_crc_errors = UINT64_MAX;
1496 stats->rx_fifo_errors = UINT64_MAX;
1497 stats->rx_frame_errors = UINT64_MAX;
58397e6c
KT
1498 stats->rx_missed_errors = UINT64_MAX;
1499 stats->rx_over_errors = UINT64_MAX;
1500 stats->tx_aborted_errors = UINT64_MAX;
1501 stats->tx_carrier_errors = UINT64_MAX;
1502 stats->tx_errors = UINT64_MAX;
1503 stats->tx_fifo_errors = UINT64_MAX;
1504 stats->tx_heartbeat_errors = UINT64_MAX;
1505 stats->tx_window_errors = UINT64_MAX;
58397e6c 1506 stats->rx_dropped += UINT64_MAX;
58397e6c 1507
45d947c4 1508 rte_spinlock_lock(&dev->stats_lock);
58397e6c
KT
1509 /* Supported Stats */
1510 stats->rx_packets += dev->stats.rx_packets;
1511 stats->tx_packets += dev->stats.tx_packets;
1512 stats->tx_dropped += dev->stats.tx_dropped;
9e3ddd45
TP
1513 stats->multicast = dev->stats.multicast;
1514 stats->rx_bytes = dev->stats.rx_bytes;
1515 stats->tx_bytes = dev->stats.tx_bytes;
1516 stats->rx_errors = dev->stats.rx_errors;
1517 stats->rx_length_errors = dev->stats.rx_length_errors;
45d947c4 1518 rte_spinlock_unlock(&dev->stats_lock);
9e3ddd45 1519
58397e6c
KT
1520 ovs_mutex_unlock(&dev->mutex);
1521
1522 return 0;
1523}
1524
8a9562d2
PS
1525static int
1526netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
1527{
1528 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1529 struct rte_eth_stats rte_stats;
1530 bool gg;
1531
1532 netdev_dpdk_get_carrier(netdev, &gg);
1533 ovs_mutex_lock(&dev->mutex);
1534 rte_eth_stats_get(dev->port_id, &rte_stats);
1535
2f9dd77f 1536 memset(stats, 0, sizeof(*stats));
8a9562d2 1537
2f9dd77f
PS
1538 stats->rx_packets = rte_stats.ipackets;
1539 stats->tx_packets = rte_stats.opackets;
1540 stats->rx_bytes = rte_stats.ibytes;
1541 stats->tx_bytes = rte_stats.obytes;
9e3ddd45
TP
    /* DPDK counts imissed as errors, but we count them here as dropped
     * instead. */
1543 stats->rx_errors = rte_stats.ierrors - rte_stats.imissed;
2f9dd77f
PS
1544 stats->tx_errors = rte_stats.oerrors;
1545 stats->multicast = rte_stats.imcasts;
8a9562d2 1546
45d947c4 1547 rte_spinlock_lock(&dev->stats_lock);
2f9dd77f 1548 stats->tx_dropped = dev->stats.tx_dropped;
45d947c4 1549 rte_spinlock_unlock(&dev->stats_lock);
9e3ddd45
TP
1550
1551 /* These are the available DPDK counters for packets not received due to
1552 * local resource constraints in DPDK and NIC respectively. */
1553 stats->rx_dropped = rte_stats.rx_nombuf + rte_stats.imissed;
1554 stats->collisions = UINT64_MAX;
1555
02ab4b1a 1556 stats->rx_length_errors = UINT64_MAX;
9e3ddd45 1557 stats->rx_over_errors = UINT64_MAX;
02ab4b1a 1558 stats->rx_crc_errors = UINT64_MAX;
9e3ddd45
TP
1559 stats->rx_frame_errors = UINT64_MAX;
1560 stats->rx_fifo_errors = UINT64_MAX;
1561 stats->rx_missed_errors = rte_stats.imissed;
1562
1563 stats->tx_aborted_errors = UINT64_MAX;
1564 stats->tx_carrier_errors = UINT64_MAX;
1565 stats->tx_fifo_errors = UINT64_MAX;
1566 stats->tx_heartbeat_errors = UINT64_MAX;
1567 stats->tx_window_errors = UINT64_MAX;
1568
8a9562d2
PS
1569 ovs_mutex_unlock(&dev->mutex);
1570
1571 return 0;
1572}
1573
1574static int
1575netdev_dpdk_get_features(const struct netdev *netdev_,
1576 enum netdev_features *current,
1577 enum netdev_features *advertised OVS_UNUSED,
1578 enum netdev_features *supported OVS_UNUSED,
1579 enum netdev_features *peer OVS_UNUSED)
1580{
1581 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
1582 struct rte_eth_link link;
1583
1584 ovs_mutex_lock(&dev->mutex);
1585 link = dev->link;
1586 ovs_mutex_unlock(&dev->mutex);
1587
1588 if (link.link_duplex == ETH_LINK_AUTONEG_DUPLEX) {
1589 if (link.link_speed == ETH_LINK_SPEED_AUTONEG) {
1590 *current = NETDEV_F_AUTONEG;
1591 }
1592 } else if (link.link_duplex == ETH_LINK_HALF_DUPLEX) {
1593 if (link.link_speed == ETH_LINK_SPEED_10) {
1594 *current = NETDEV_F_10MB_HD;
1595 }
1596 if (link.link_speed == ETH_LINK_SPEED_100) {
1597 *current = NETDEV_F_100MB_HD;
1598 }
1599 if (link.link_speed == ETH_LINK_SPEED_1000) {
1600 *current = NETDEV_F_1GB_HD;
1601 }
1602 } else if (link.link_duplex == ETH_LINK_FULL_DUPLEX) {
1603 if (link.link_speed == ETH_LINK_SPEED_10) {
1604 *current = NETDEV_F_10MB_FD;
1605 }
1606 if (link.link_speed == ETH_LINK_SPEED_100) {
1607 *current = NETDEV_F_100MB_FD;
1608 }
1609 if (link.link_speed == ETH_LINK_SPEED_1000) {
1610 *current = NETDEV_F_1GB_FD;
1611 }
1612 if (link.link_speed == ETH_LINK_SPEED_10000) {
1613 *current = NETDEV_F_10GB_FD;
1614 }
1615 }
1616
1617 return 0;
1618}
1619
1620static int
1621netdev_dpdk_get_ifindex(const struct netdev *netdev)
1622{
1623 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1624 int ifindex;
1625
1626 ovs_mutex_lock(&dev->mutex);
1627 ifindex = dev->port_id;
1628 ovs_mutex_unlock(&dev->mutex);
1629
1630 return ifindex;
1631}
1632
1633static int
1634netdev_dpdk_get_carrier(const struct netdev *netdev_, bool *carrier)
1635{
1636 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
1637
1638 ovs_mutex_lock(&dev->mutex);
1639 check_link_status(dev);
1640 *carrier = dev->link.link_status;
58397e6c
KT
1641
1642 ovs_mutex_unlock(&dev->mutex);
1643
1644 return 0;
1645}
1646
1647static int
1648netdev_dpdk_vhost_get_carrier(const struct netdev *netdev_, bool *carrier)
1649{
1650 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
1651 struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
1652
1653 ovs_mutex_lock(&dev->mutex);
1654
1655 if (is_vhost_running(virtio_dev)) {
1656 *carrier = 1;
1657 } else {
1658 *carrier = 0;
1659 }
1660
8a9562d2
PS
1661 ovs_mutex_unlock(&dev->mutex);
1662
1663 return 0;
1664}
1665
1666static long long int
1667netdev_dpdk_get_carrier_resets(const struct netdev *netdev_)
1668{
1669 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
1670 long long int carrier_resets;
1671
1672 ovs_mutex_lock(&dev->mutex);
1673 carrier_resets = dev->link_reset_cnt;
1674 ovs_mutex_unlock(&dev->mutex);
1675
1676 return carrier_resets;
1677}
1678
1679static int
1680netdev_dpdk_set_miimon(struct netdev *netdev_ OVS_UNUSED,
1681 long long int interval OVS_UNUSED)
1682{
ee32150e 1683 return EOPNOTSUPP;
8a9562d2
PS
1684}
1685
1686static int
1687netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
1688 enum netdev_flags off, enum netdev_flags on,
95fb793a 1689 enum netdev_flags *old_flagsp) OVS_REQUIRES(dev->mutex)
8a9562d2
PS
1690{
1691 int err;
1692
1693 if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
1694 return EINVAL;
1695 }
1696
1697 *old_flagsp = dev->flags;
1698 dev->flags |= on;
1699 dev->flags &= ~off;
1700
1701 if (dev->flags == *old_flagsp) {
1702 return 0;
1703 }
1704
58397e6c
KT
1705 if (dev->type == DPDK_DEV_ETH) {
1706 if (dev->flags & NETDEV_UP) {
1707 err = rte_eth_dev_start(dev->port_id);
1708 if (err)
1709 return -err;
1710 }
8a9562d2 1711
58397e6c
KT
1712 if (dev->flags & NETDEV_PROMISC) {
1713 rte_eth_promiscuous_enable(dev->port_id);
1714 }
8a9562d2 1715
58397e6c
KT
1716 if (!(dev->flags & NETDEV_UP)) {
1717 rte_eth_dev_stop(dev->port_id);
1718 }
8a9562d2
PS
1719 }
1720
1721 return 0;
1722}
1723
1724static int
1725netdev_dpdk_update_flags(struct netdev *netdev_,
1726 enum netdev_flags off, enum netdev_flags on,
1727 enum netdev_flags *old_flagsp)
1728{
1729 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
1730 int error;
1731
1732 ovs_mutex_lock(&netdev->mutex);
1733 error = netdev_dpdk_update_flags__(netdev, off, on, old_flagsp);
1734 ovs_mutex_unlock(&netdev->mutex);
1735
1736 return error;
1737}
1738
1739static int
1740netdev_dpdk_get_status(const struct netdev *netdev_, struct smap *args)
1741{
1742 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
1743 struct rte_eth_dev_info dev_info;
1744
e0a801c7 1745 if (dev->port_id < 0)
8a9562d2
PS
1746 return ENODEV;
1747
1748 ovs_mutex_lock(&dev->mutex);
1749 rte_eth_dev_info_get(dev->port_id, &dev_info);
1750 ovs_mutex_unlock(&dev->mutex);
1751
1752 smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
1753
95fb793a 1754 smap_add_format(args, "port_no", "%d", dev->port_id);
8a9562d2
PS
1755 smap_add_format(args, "numa_id", "%d", rte_eth_dev_socket_id(dev->port_id));
1756 smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
1757 smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
4be4d22c 1758 smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
8a9562d2
PS
1759 smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
1760 smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
1761 smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
1762 smap_add_format(args, "max_hash_mac_addrs", "%u", dev_info.max_hash_mac_addrs);
1763 smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
1764 smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);
1765
39c2baa9 1766 if (dev_info.pci_dev) {
1767 smap_add_format(args, "pci-vendor_id", "0x%u",
1768 dev_info.pci_dev->id.vendor_id);
1769 smap_add_format(args, "pci-device_id", "0x%x",
1770 dev_info.pci_dev->id.device_id);
1771 }
8a9562d2
PS
1772
1773 return 0;
1774}
1775
1776static void
1777netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
1778 OVS_REQUIRES(dev->mutex)
1779{
1780 enum netdev_flags old_flags;
1781
1782 if (admin_state) {
1783 netdev_dpdk_update_flags__(dev, 0, NETDEV_UP, &old_flags);
1784 } else {
1785 netdev_dpdk_update_flags__(dev, NETDEV_UP, 0, &old_flags);
1786 }
1787}
1788
1789static void
1790netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
1791 const char *argv[], void *aux OVS_UNUSED)
1792{
1793 bool up;
1794
1795 if (!strcasecmp(argv[argc - 1], "up")) {
1796 up = true;
1797 } else if ( !strcasecmp(argv[argc - 1], "down")) {
1798 up = false;
1799 } else {
1800 unixctl_command_reply_error(conn, "Invalid Admin State");
1801 return;
1802 }
1803
1804 if (argc > 2) {
1805 struct netdev *netdev = netdev_from_name(argv[1]);
1806 if (netdev && is_dpdk_class(netdev->netdev_class)) {
1807 struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);
1808
1809 ovs_mutex_lock(&dpdk_dev->mutex);
1810 netdev_dpdk_set_admin_state__(dpdk_dev, up);
1811 ovs_mutex_unlock(&dpdk_dev->mutex);
1812
1813 netdev_close(netdev);
1814 } else {
1815 unixctl_command_reply_error(conn, "Not a DPDK Interface");
1816 netdev_close(netdev);
1817 return;
1818 }
1819 } else {
1820 struct netdev_dpdk *netdev;
1821
1822 ovs_mutex_lock(&dpdk_mutex);
1823 LIST_FOR_EACH (netdev, list_node, &dpdk_list) {
1824 ovs_mutex_lock(&netdev->mutex);
1825 netdev_dpdk_set_admin_state__(netdev, up);
1826 ovs_mutex_unlock(&netdev->mutex);
1827 }
1828 ovs_mutex_unlock(&dpdk_mutex);
1829 }
1830 unixctl_command_reply(conn, "OK");
1831}
1832
1833/*
1834 * Set virtqueue flags so that we do not receive interrupts.
1835 */
1836static void
1837set_irq_status(struct virtio_net *dev)
1838{
1839 uint32_t i;
1840 uint64_t idx;
1841
1842 for (i = 0; i < dev->virt_qp_nb; i++) {
1843 idx = i * VIRTIO_QNUM;
1844 rte_vhost_enable_guest_notification(dev, idx + VIRTIO_RXQ, 0);
1845 rte_vhost_enable_guest_notification(dev, idx + VIRTIO_TXQ, 0);
1846 }
1847}
1848
1849/*
1850 * Fixes mapping for vhost-user tx queues. Must be called after each
1851 * enabling/disabling of queues and real_n_txq modifications.
1852 */
1853static void
1854netdev_dpdk_remap_txqs(struct netdev_dpdk *netdev)
1855 OVS_REQUIRES(netdev->mutex)
1856{
1857 int *enabled_queues, n_enabled = 0;
1858 int i, k, total_txqs = netdev->real_n_txq;
1859
1860 enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof *enabled_queues);
1861
1862 for (i = 0; i < total_txqs; i++) {
1863 /* Enabled queues are always mapped to themselves. */
1864 if (netdev->tx_q[i].map == i) {
1865 enabled_queues[n_enabled++] = i;
1866 }
1867 }
1868
1869 if (n_enabled == 0 && total_txqs != 0) {
1870 enabled_queues[0] = -1;
1871 n_enabled = 1;
1872 }
1873
1874 k = 0;
1875 for (i = 0; i < total_txqs; i++) {
1876 if (netdev->tx_q[i].map != i) {
1877 netdev->tx_q[i].map = enabled_queues[k];
1878 k = (k + 1) % n_enabled;
1879 }
1880 }
1881
1882 VLOG_DBG("TX queue mapping for %s\n", netdev->vhost_id);
1883 for (i = 0; i < total_txqs; i++) {
1884 VLOG_DBG("%2d --> %2d", i, netdev->tx_q[i].map);
1885 }
1886
1887 rte_free(enabled_queues);
1888}
1889
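/* Adjusts the port's real rx/tx queue counts to the number of virtqueue pairs
 * negotiated by the guest.  Fails if the guest requests more queue pairs than
 * the rx queues configured on the OVS side. */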
1890static int
1891netdev_dpdk_vhost_set_queues(struct netdev_dpdk *netdev, struct virtio_net *dev)
585a5bea 1892 OVS_REQUIRES(netdev->mutex)
1893{
1894 uint32_t qp_num;
1895
1896 qp_num = dev->virt_qp_nb;
1897 if (qp_num > netdev->up.n_rxq) {
1898 VLOG_ERR("vHost Device '%s' %"PRIu64" can't be added - "
1899 "too many queues %d > %d", dev->ifname, dev->device_fh,
1900 qp_num, netdev->up.n_rxq);
1901 return -1;
1902 }
1903
1904 netdev->real_n_rxq = qp_num;
1905 netdev->real_n_txq = qp_num;
1906 netdev->txq_needs_locking = true;
1907
1908 netdev_dpdk_remap_txqs(netdev);
1909
1910 return 0;
1911}
1912
1913/*
1914 * A new virtio-net device is added to a vhost port.
1915 */
1916static int
1917new_device(struct virtio_net *dev)
1918{
1919 struct netdev_dpdk *netdev;
1920 bool exists = false;
1921
1922 ovs_mutex_lock(&dpdk_mutex);
1923 /* Add device to the vhost port with the same name as that passed down. */
1924 LIST_FOR_EACH(netdev, list_node, &dpdk_list) {
7d1ced01 1925 if (strncmp(dev->ifname, netdev->vhost_id, IF_NAME_SZ) == 0) {
58397e6c 1926 ovs_mutex_lock(&netdev->mutex);
1927 if (netdev_dpdk_vhost_set_queues(netdev, dev)) {
1928 ovs_mutex_unlock(&netdev->mutex);
1929 ovs_mutex_unlock(&dpdk_mutex);
1930 return -1;
1931 }
58397e6c 1932 ovsrcu_set(&netdev->virtio_dev, dev);
1933 exists = true;
1934 dev->flags |= VIRTIO_DEV_RUNNING;
1935 /* Disable notifications. */
1936 set_irq_status(dev);
4573fbd3 1937 ovs_mutex_unlock(&netdev->mutex);
1938 break;
1939 }
1940 }
1941 ovs_mutex_unlock(&dpdk_mutex);
1942
1943 if (!exists) {
1944 VLOG_INFO("vHost Device '%s' %"PRIu64" can't be added - name not "
1945 "found", dev->ifname, dev->device_fh);
1946
1947 return -1;
1948 }
1949
1950 VLOG_INFO("vHost Device '%s' %"PRIu64" has been added", dev->ifname,
1951 dev->device_fh);
1952 return 0;
1953}
1954
1955/*
1956 * Remove a virtio-net device from the specific vhost port.  Clearing the
1957 * VIRTIO_DEV_RUNNING flag stops any more packets from being sent or received
1958 * to/from the VM, and the RCU synchronization below ensures all currently
1959 * queued packets have been sent/received before the device is removed.
1960 */
1961static void
1962destroy_device(volatile struct virtio_net *dev)
1963{
1964 struct netdev_dpdk *vhost_dev;
afee281f 1965 bool exists = false;
1966
1967 ovs_mutex_lock(&dpdk_mutex);
1968 LIST_FOR_EACH (vhost_dev, list_node, &dpdk_list) {
1969 if (netdev_dpdk_get_virtio(vhost_dev) == dev) {
1970
1971 ovs_mutex_lock(&vhost_dev->mutex);
1972 dev->flags &= ~VIRTIO_DEV_RUNNING;
1973 ovsrcu_set(&vhost_dev->virtio_dev, NULL);
afee281f 1974 exists = true;
58397e6c 1975 ovs_mutex_unlock(&vhost_dev->mutex);
afee281f 1976 break;
1977 }
1978 }
afee281f 1979
1980 ovs_mutex_unlock(&dpdk_mutex);
1981
1982 if (exists == true) {
1983 /*
1984 * Wait for other threads to quiesce after setting the 'virtio_dev'
1985 * to NULL, before returning.
1986 */
1987 ovsrcu_synchronize();
1988 /*
1989 * As call to ovsrcu_synchronize() will end the quiescent state,
1990 * put thread back into quiescent state before returning.
1991 */
1992 ovsrcu_quiesce_start();
1993 VLOG_INFO("vHost Device '%s' %"PRIu64" has been removed", dev->ifname,
1994 dev->device_fh);
1995 } else {
1996 VLOG_INFO("vHost Device '%s' %"PRIu64" not found", dev->ifname,
1997 dev->device_fh);
1998 }
1999
2000}
2001
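/* vhost callback invoked when the guest enables or disables a single
 * virtqueue.  Only guest rx queues (VIRTIO_RXQ) matter here, because those
 * are the queues OVS transmits into; the tx queue mapping is rebuilt so that
 * disabled queues are skipped. */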
2002static int
2003vring_state_changed(struct virtio_net *dev, uint16_t queue_id, int enable)
2004{
2005 struct netdev_dpdk *vhost_dev;
2006 bool exists = false;
2007 int qid = queue_id / VIRTIO_QNUM;
2008
2009 if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
2010 return 0;
2011 }
2012
2013 ovs_mutex_lock(&dpdk_mutex);
2014 LIST_FOR_EACH (vhost_dev, list_node, &dpdk_list) {
2015 if (strncmp(dev->ifname, vhost_dev->vhost_id, IF_NAME_SZ) == 0) {
2016 ovs_mutex_lock(&vhost_dev->mutex);
2017 if (enable) {
2018 vhost_dev->tx_q[qid].map = qid;
2019 } else {
2020 vhost_dev->tx_q[qid].map = -1;
2021 }
2022 netdev_dpdk_remap_txqs(vhost_dev);
2023 exists = true;
2024 ovs_mutex_unlock(&vhost_dev->mutex);
2025 break;
2026 }
2027 }
2028 ovs_mutex_unlock(&dpdk_mutex);
2029
2030 if (exists) {
2031 VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' %"
2032 PRIu64" changed to \'%s\'", queue_id, qid, dev->ifname,
2033 dev->device_fh, (enable == 1) ? "enabled" : "disabled");
2034 } else {
2035 VLOG_INFO("vHost Device '%s' %"PRIu64" not found", dev->ifname,
2036 dev->device_fh);
2037 return -1;
2038 }
2039
2040 return 0;
2041}
2042
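/* Returns the virtio_net device currently attached to 'dev', or NULL when no
 * guest is connected.  The pointer is RCU-protected, so callers must not be
 * in a quiescent state while they use it. */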
2043struct virtio_net *
2044netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
2045{
2046 return ovsrcu_get(struct virtio_net *, &dev->virtio_dev);
2047}
2048
2049/*
2050 * These callbacks allow virtio-net devices to be added to vhost ports when
2051 * configuration has been fully completed.
2052 */
bce01e3a 2053static const struct virtio_net_device_ops virtio_net_device_ops =
2054{
2055 .new_device = new_device,
2056 .destroy_device = destroy_device,
585a5bea 2057 .vring_state_changed = vring_state_changed
2058};
2059
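/* Body of the dedicated vhost thread: it only runs the DPDK vhost session
 * loop, which dispatches the callbacks registered above. */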
2060static void *
7d1ced01 2061start_vhost_loop(void *dummy OVS_UNUSED)
2062{
2063 pthread_detach(pthread_self());
2064 /* Put the vhost thread into quiescent state. */
2065 ovsrcu_quiesce_start();
2066 rte_vhost_driver_session_start();
2067 return NULL;
2068}
2069
2070static int
2071dpdk_vhost_class_init(void)
2072{
2073 rte_vhost_driver_callback_register(&virtio_net_device_ops);
2074 ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
2075 return 0;
2076}
2077
2078static int
2079dpdk_vhost_cuse_class_init(void)
58397e6c 2080{
2081 int err = -1;
2082
2083
2084 /* Register CUSE device to handle IOCTLs.
2085 * Unless otherwise specified on the vswitchd command line, cuse_dev_name
2086 * is set to vhost-net.
2087 */
2088 err = rte_vhost_driver_register(cuse_dev_name);
2089
2090 if (err != 0) {
2091 VLOG_ERR("CUSE device setup failure.");
2092 return -1;
2093 }
2094
2095 dpdk_vhost_class_init();
2096 return 0;
2097}
2098
2099static int
2100dpdk_vhost_user_class_init(void)
2101{
2102 dpdk_vhost_class_init();
618f44f7 2103 return 0;
2104}
2105
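/* Initialization common to all DPDK netdev classes: registers the
 * "netdev-dpdk/set-admin-state" appctl command and starts the 'dpdk_watchdog'
 * helper thread. */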
2106static void
2107dpdk_common_init(void)
2108{
2109 unixctl_command_register("netdev-dpdk/set-admin-state",
2110 "[netdev] up|down", 1, 2,
2111 netdev_dpdk_set_admin_state, NULL);
2112
2113 ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
2114}
2115
95fb793a 2116/* Client Rings */
2117
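/* Creates the rte_rings backing a 'dpdkr' (client ring) port and wraps them
 * in an eth device with rte_eth_from_rings().  'port_no' is the number parsed
 * from the "dpdkr<N>" name; the resulting DPDK port id is returned in
 * '*eth_port_id'. */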
95fb793a 2118static int
2119dpdk_ring_create(const char dev_name[], unsigned int port_no,
2120 unsigned int *eth_port_id)
2121{
2122 struct dpdk_ring *ivshmem;
ca7e7bee 2123 char ring_name[RTE_RING_NAMESIZE];
95fb793a 2124 int err;
2125
2126 ivshmem = dpdk_rte_mzalloc(sizeof *ivshmem);
2127 if (ivshmem == NULL) {
2128 return ENOMEM;
2129 }
2130
7251515e 2131 /* XXX: Add support for multiqueue rings. */
ca7e7bee 2132 err = snprintf(ring_name, sizeof(ring_name), "%s_tx", dev_name);
95fb793a 2133 if (err < 0) {
2134 return -err;
2135 }
2136
8f0a76c9 2137 /* Create single producer tx ring, netdev does explicit locking. */
7251515e 2138 ivshmem->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
8f0a76c9 2139 RING_F_SP_ENQ);
95fb793a 2140 if (ivshmem->cring_tx == NULL) {
2141 rte_free(ivshmem);
2142 return ENOMEM;
2143 }
2144
ca7e7bee 2145 err = snprintf(ring_name, sizeof(ring_name), "%s_rx", dev_name);
95fb793a 2146 if (err < 0) {
2147 return -err;
2148 }
2149
8f0a76c9 2150 /* Create single consumer rx ring, netdev does explicit locking. */
7251515e 2151 ivshmem->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
8f0a76c9 2152 RING_F_SC_DEQ);
95fb793a 2153 if (ivshmem->cring_rx == NULL) {
2154 rte_free(ivshmem);
2155 return ENOMEM;
2156 }
2157
2158 err = rte_eth_from_rings(dev_name, &ivshmem->cring_rx, 1,
2159 &ivshmem->cring_tx, 1, SOCKET0);
2160
95fb793a 2161 if (err < 0) {
2162 rte_free(ivshmem);
2163 return ENODEV;
2164 }
2165
2166 ivshmem->user_port_id = port_no;
2167 ivshmem->eth_port_id = rte_eth_dev_count() - 1;
2168 list_push_back(&dpdk_ring_list, &ivshmem->list_node);
2169
2170 *eth_port_id = ivshmem->eth_port_id;
2171 return 0;
2172}
2173
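/* Resolves the "dpdkr<N>" device named 'dev_name' to a DPDK port id in
 * '*eth_port_id', creating the backing rings on first use. */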
2174static int
2175dpdk_ring_open(const char dev_name[], unsigned int *eth_port_id) OVS_REQUIRES(dpdk_mutex)
2176{
2177 struct dpdk_ring *ivshmem;
2178 unsigned int port_no;
2179 int err = 0;
2180
2181 /* Names always start with "dpdkr" */
2182 err = dpdk_dev_parse_name(dev_name, "dpdkr", &port_no);
2183 if (err) {
2184 return err;
2185 }
2186
2187 /* look through our list to find the device */
2188 LIST_FOR_EACH (ivshmem, list_node, &dpdk_ring_list) {
2189 if (ivshmem->user_port_id == port_no) {
58397e6c 2190 VLOG_INFO("Found dpdk ring device %s:", dev_name);
95fb793a 2191 *eth_port_id = ivshmem->eth_port_id; /* really all that is needed */
2192 return 0;
2193 }
2194 }
2195 /* Need to create the device rings */
2196 return dpdk_ring_create(dev_name, port_no, eth_port_id);
2197}
2198
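/* netdev 'send' callback for 'dpdkr' ports.  Identical to the normal DPDK
 * send path except that each packet's RSS hash is invalidated first, for the
 * reason explained in the comment below. */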
7251515e 2199static int
a0cb2d66 2200netdev_dpdk_ring_send(struct netdev *netdev_, int qid,
e14deea0 2201 struct dp_packet **pkts, int cnt, bool may_steal)
7251515e 2202{
a0cb2d66 2203 struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
2204 unsigned i;
2205
2206 /* When using 'dpdkr' and sending to a DPDK ring, we want to ensure that the
2207 * RSS hash field is clear.  This is because the same mbuf may be modified by
2208 * the consumer of the ring and returned into the datapath without the RSS
2209 * hash being recalculated. */
2210 for (i = 0; i < cnt; i++) {
f2f44f5d 2211 dp_packet_rss_invalidate(pkts[i]);
1b99bb05 2212 }
7251515e 2213
a0cb2d66 2214 netdev_dpdk_send__(netdev, qid, pkts, cnt, may_steal);
2215 return 0;
2216}
2217
95fb793a 2218static int
2219netdev_dpdk_ring_construct(struct netdev *netdev)
2220{
2221 unsigned int port_no = 0;
2222 int err = 0;
2223
2224 if (rte_eal_init_ret) {
2225 return rte_eal_init_ret;
2226 }
2227
2228 ovs_mutex_lock(&dpdk_mutex);
2229
2230 err = dpdk_ring_open(netdev->name, &port_no);
2231 if (err) {
2232 goto unlock_dpdk;
2233 }
2234
58397e6c 2235 err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_ETH);
95fb793a 2236
2237unlock_dpdk:
2238 ovs_mutex_unlock(&dpdk_mutex);
2239 return err;
2240}
2241
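/* Template used to instantiate the netdev_class structures below.  Each DPDK
 * flavour ('dpdk', 'dpdkr', 'dpdkvhostcuse', 'dpdkvhostuser') supplies its
 * own init, construct/destruct, multiq, send, carrier, stats, features,
 * status and rxq_recv callbacks; all other operations are shared. */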
2242#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT, MULTIQ, SEND, \
2243 GET_CARRIER, GET_STATS, GET_FEATURES, GET_STATUS, RXQ_RECV) \
95fb793a 2244{ \
2245 NAME, \
2246 INIT, /* init */ \
2247 NULL, /* netdev_dpdk_run */ \
2248 NULL, /* netdev_dpdk_wait */ \
2249 \
2250 netdev_dpdk_alloc, \
2251 CONSTRUCT, \
58397e6c 2252 DESTRUCT, \
95fb793a 2253 netdev_dpdk_dealloc, \
2254 netdev_dpdk_get_config, \
a14b8947 2255 netdev_dpdk_set_config, \
95fb793a 2256 NULL, /* get_tunnel_config */ \
2257 NULL, /* build header */ \
2258 NULL, /* push header */ \
2259 NULL, /* pop header */ \
7dec44fe 2260 netdev_dpdk_get_numa_id, /* get_numa_id */ \
5496878c 2261 MULTIQ, /* set_multiq */ \
95fb793a 2262 \
7251515e 2263 SEND, /* send */ \
95fb793a 2264 NULL, /* send_wait */ \
2265 \
2266 netdev_dpdk_set_etheraddr, \
2267 netdev_dpdk_get_etheraddr, \
2268 netdev_dpdk_get_mtu, \
2269 netdev_dpdk_set_mtu, \
2270 netdev_dpdk_get_ifindex, \
58397e6c 2271 GET_CARRIER, \
95fb793a 2272 netdev_dpdk_get_carrier_resets, \
2273 netdev_dpdk_set_miimon, \
2274 GET_STATS, \
2275 GET_FEATURES, \
95fb793a 2276 NULL, /* set_advertisements */ \
2277 \
2278 NULL, /* set_policing */ \
2279 NULL, /* get_qos_types */ \
2280 NULL, /* get_qos_capabilities */ \
2281 NULL, /* get_qos */ \
2282 NULL, /* set_qos */ \
2283 NULL, /* get_queue */ \
2284 NULL, /* set_queue */ \
2285 NULL, /* delete_queue */ \
2286 NULL, /* get_queue_stats */ \
2287 NULL, /* queue_dump_start */ \
2288 NULL, /* queue_dump_next */ \
2289 NULL, /* queue_dump_done */ \
2290 NULL, /* dump_queue_stats */ \
2291 \
2292 NULL, /* get_in4 */ \
2293 NULL, /* set_in4 */ \
2294 NULL, /* get_in6 */ \
2295 NULL, /* add_router */ \
2296 NULL, /* get_next_hop */ \
58397e6c 2297 GET_STATUS, \
95fb793a 2298 NULL, /* arp_lookup */ \
2299 \
2300 netdev_dpdk_update_flags, \
2301 \
2302 netdev_dpdk_rxq_alloc, \
2303 netdev_dpdk_rxq_construct, \
2304 netdev_dpdk_rxq_destruct, \
2305 netdev_dpdk_rxq_dealloc, \
58397e6c 2306 RXQ_RECV, \
95fb793a 2307 NULL, /* rx_wait */ \
2308 NULL, /* rxq_drain */ \
2309}
8a9562d2 2310
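/* Checks whether argv[1] carries the vhost-specific 'flag'.  If it does and
 * the value fits within 'size', '*new_val' is set to a copy of that value and
 * 1 is returned; otherwise '*new_val' falls back to 'default_val' and 0 is
 * returned. */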
2311static int
2312process_vhost_flags(char *flag, char *default_val, int size,
2313 char **argv, char **new_val)
2314{
2315 int changed = 0;
2316
2317 /* Depending on which version of vhost is in use, process the vhost-specific
2318 * flag if it is provided on the vswitchd command line, otherwise resort to
2319 * a default value.
2320 *
3cdb27d3 2321 * For vhost-user: Process "-vhost_sock_dir" to set the custom location of
7d1ced01 2322 * the vhost-user socket(s).
3cdb27d3 2323 * For vhost-cuse: Process "-cuse_dev_name" to set the custom name of the
2324 * vhost-cuse character device.
2325 */
2326 if (!strcmp(argv[1], flag) && (strlen(argv[2]) <= size)) {
2327 changed = 1;
0d4af148 2328 *new_val = xstrdup(argv[2]);
2329 VLOG_INFO("User-provided %s in use: %s", flag, *new_val);
2330 } else {
2331 VLOG_INFO("No %s provided - defaulting to %s", flag, default_val);
2332 *new_val = default_val;
2333 }
2334
2335 return changed;
2336}
2337
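/* Entry point for DPDK-related command line processing.  The arguments that
 * follow "--dpdk" are handed to rte_eal_init(), and the return value tells
 * the caller how many argv entries were consumed.  A typical vswitchd
 * invocation looks roughly like the following (the EAL and database arguments
 * shown are illustrative only):
 *
 *     ovs-vswitchd --dpdk -c 0x1 -n 4 --socket-mem 1024 -- \
 *         unix:/usr/local/var/run/openvswitch/db.sock --pidfile
 */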
2338int
2339dpdk_init(int argc, char **argv)
2340{
2341 int result;
2342 int base = 0;
2343 char *program_name = argv[0];
8a9562d2 2344
9441caf3 2345 if (argc < 2 || strcmp(argv[1], "--dpdk"))
2346 return 0;
2347
58397e6c 2348 /* Remove the --dpdk argument from the argument list. */
2349 argc--;
2350 argv++;
2351
2352 /* Reject --user option */
2353 int i;
2354 for (i = 0; i < argc; i++) {
2355 if (!strcmp(argv[i], "--user")) {
2356 VLOG_ERR("Can not mix --dpdk and --user options, aborting.");
2357 }
2358 }
2359
7d1ced01 2360#ifdef VHOST_CUSE
0d4af148 2361 if (process_vhost_flags("-cuse_dev_name", xstrdup("vhost-net"),
2362 PATH_MAX, argv, &cuse_dev_name)) {
2363#else
0d4af148 2364 if (process_vhost_flags("-vhost_sock_dir", xstrdup(ovs_rundir()),
2365 NAME_MAX, argv, &vhost_sock_dir)) {
2366 struct stat s;
2367 int err;
58397e6c 2368
2369 err = stat(vhost_sock_dir, &s);
2370 if (err) {
2371 VLOG_ERR("vHostUser socket DIR '%s' does not exist.",
2372 vhost_sock_dir);
2373 return err;
2374 }
2375#endif
2376 /* Remove the vhost flag configuration parameters from the argument
2377 * list, so that the correct elements are passed to the DPDK
2378 * initialization function
2379 */
2380 argc -= 2;
7d1ced01 2381 argv += 2; /* Increment by two to bypass the vhost flag arguments */
58397e6c 2382 base = 2;
2383 }
2384
2385 /* Keep the program name argument as this is needed for the call to
2386 * rte_eal_init().
2387 */
2388 argv[0] = program_name;
2389
2390 /* Make sure things are initialized ... */
2391 result = rte_eal_init(argc, argv);
451450fa 2392 if (result < 0) {
58397e6c 2393 ovs_abort(result, "Cannot init EAL");
451450fa 2394 }
8a9562d2 2395
d7310583 2396 rte_memzone_dump(stdout);
2397 rte_eal_init_ret = 0;
2398
451450fa 2399 if (argc > result) {
9441caf3 2400 argv[result] = argv[0];
451450fa 2401 }
9441caf3 2402
db73f716 2403 /* We are called from the main thread here */
d5c199ea 2404 RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
db73f716 2405
58397e6c 2406 return result + 1 + base;
2407}
2408
bce01e3a 2409static const struct netdev_class dpdk_class =
95fb793a 2410 NETDEV_DPDK_CLASS(
2411 "dpdk",
b8e57534 2412 NULL,
5496878c 2413 netdev_dpdk_construct,
58397e6c 2414 netdev_dpdk_destruct,
7251515e 2415 netdev_dpdk_set_multiq,
2416 netdev_dpdk_eth_send,
2417 netdev_dpdk_get_carrier,
2418 netdev_dpdk_get_stats,
2419 netdev_dpdk_get_features,
2420 netdev_dpdk_get_status,
2421 netdev_dpdk_rxq_recv);
95fb793a 2422
bce01e3a 2423static const struct netdev_class dpdk_ring_class =
95fb793a 2424 NETDEV_DPDK_CLASS(
2425 "dpdkr",
033e9df2 2426 NULL,
5496878c 2427 netdev_dpdk_ring_construct,
58397e6c 2428 netdev_dpdk_destruct,
a0cb2d66 2429 netdev_dpdk_set_multiq,
2430 netdev_dpdk_ring_send,
2431 netdev_dpdk_get_carrier,
2432 netdev_dpdk_get_stats,
2433 netdev_dpdk_get_features,
2434 netdev_dpdk_get_status,
2435 netdev_dpdk_rxq_recv);
2436
bd8baf47 2437static const struct netdev_class OVS_UNUSED dpdk_vhost_cuse_class =
58397e6c 2438 NETDEV_DPDK_CLASS(
2439 "dpdkvhostcuse",
2440 dpdk_vhost_cuse_class_init,
2441 netdev_dpdk_vhost_cuse_construct,
2442 netdev_dpdk_vhost_destruct,
4573fbd3 2443 netdev_dpdk_vhost_cuse_set_multiq,
2444 netdev_dpdk_vhost_send,
2445 netdev_dpdk_vhost_get_carrier,
2446 netdev_dpdk_vhost_get_stats,
2447 NULL,
2448 NULL,
2449 netdev_dpdk_vhost_rxq_recv);
2450
bd8baf47 2451static const struct netdev_class OVS_UNUSED dpdk_vhost_user_class =
2452 NETDEV_DPDK_CLASS(
2453 "dpdkvhostuser",
2454 dpdk_vhost_user_class_init,
2455 netdev_dpdk_vhost_user_construct,
2456 netdev_dpdk_vhost_destruct,
2457 netdev_dpdk_vhost_set_multiq,
2458 netdev_dpdk_vhost_send,
2459 netdev_dpdk_vhost_get_carrier,
2460 netdev_dpdk_vhost_get_stats,
2461 NULL,
7251515e 2462 NULL,
58397e6c 2463 netdev_dpdk_vhost_rxq_recv);
95fb793a 2464
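/* Registers the DPDK netdev providers.  This is a no-op unless rte_eal_init()
 * completed successfully, i.e. unless vswitchd was started with --dpdk. */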
2465void
2466netdev_dpdk_register(void)
2467{
95fb793a 2468 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
2469
2470 if (rte_eal_init_ret) {
2471 return;
2472 }
2473
95fb793a 2474 if (ovsthread_once_start(&once)) {
033e9df2 2475 dpdk_common_init();
95fb793a 2476 netdev_register_provider(&dpdk_class);
2477 netdev_register_provider(&dpdk_ring_class);
2478#ifdef VHOST_CUSE
2479 netdev_register_provider(&dpdk_vhost_cuse_class);
2480#else
2481 netdev_register_provider(&dpdk_vhost_user_class);
2482#endif
95fb793a 2483 ovsthread_once_done(&once);
2484 }
8a9562d2 2485}
2486
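/* Pins the calling PMD thread to 'cpu' and records that core id in DPDK's
 * per-lcore variable so that lcore-aware DPDK facilities work correctly from
 * this thread. */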
2487int
bd5131ba 2488pmd_thread_setaffinity_cpu(unsigned cpu)
2489{
2490 cpu_set_t cpuset;
2491 int err;
2492
2493 CPU_ZERO(&cpuset);
2494 CPU_SET(cpu, &cpuset);
2495 err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
2496 if (err) {
2497 VLOG_ERR("Thread affinity error %d", err);
2498 return err;
2499 }
2500 /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
2501 ovs_assert(cpu != NON_PMD_CORE_ID);
65f13b50 2502 RTE_PER_LCORE(_lcore_id) = cpu;
2503
2504 return 0;
2505}
db73f716 2506
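/* Returns true if the current thread has a real lcore id assigned, i.e. it is
 * a PMD thread rather than a non-PMD (NON_PMD_CORE_ID) thread. */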
db73f716 2507static bool
5f17de68 2508dpdk_thread_is_pmd(void)
db73f716 2509{
abb5943d 2510 return rte_lcore_id() != NON_PMD_CORE_ID;
db73f716 2511}