/*
 * Copyright (c) 2014 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>

#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "dpif-netdev.h"
#include "list.h"
#include "netdev-dpdk.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "odp-util.h"
#include "ofp-print.h"
#include "ofpbuf.h"
#include "ovs-thread.h"
#include "ovs-rcu.h"
#include "packet-dpif.h"
#include "packets.h"
#include "shash.h"
#include "sset.h"
#include "unaligned.h"
#include "timeval.h"
#include "unixctl.h"
#include "vlog.h"

VLOG_DEFINE_THIS_MODULE(dpdk);
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);

#define DPDK_PORT_WATCHDOG_INTERVAL 5

#define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
#define OVS_VPORT_DPDK "ovs_dpdk"

/*
 * We need to reserve extra space in each mbuf so that the packets' DMA
 * addresses can be aligned to a 4KB boundary.
 */

#define MTU_TO_MAX_LEN(mtu) ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
#define MBUF_SIZE(mtu) (MTU_TO_MAX_LEN(mtu) + (512) + \
                        sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
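
/*
 * Worked example: with the standard ETHER_HDR_LEN of 14 and ETHER_CRC_LEN
 * of 4, MTU_TO_MAX_LEN(1500) = 1518 bytes of frame data; MBUF_SIZE() then
 * adds the 512-byte alignment slack plus sizeof(struct rte_mbuf) and
 * RTE_PKTMBUF_HEADROOM on top of that.
 */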

/* XXX: mempool size should be based on system resources. */
#define NB_MBUF (4096 * 64)
#define MP_CACHE_SZ (256 * 2)
#define SOCKET0 0

#define NON_PMD_THREAD_TX_QUEUE 0

#define NIC_PORT_RX_Q_SIZE 2048 /* Size of Physical NIC RX Queue, Max (n+32<=4096). */
#define NIC_PORT_TX_Q_SIZE 2048 /* Size of Physical NIC TX Queue, Max (n+32<=4096). */

/* XXX: Needs per NIC value for these constants. */
#define RX_PTHRESH 32 /* Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 32 /* Default values of RX host threshold reg. */
#define RX_WTHRESH 16 /* Default values of RX write-back threshold reg. */

#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
#define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
#define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */

static const struct rte_eth_conf port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .split_hdr_size = 0,
        .header_split   = 0, /* Header Split disabled */
        .hw_ip_checksum = 0, /* IP checksum offload disabled */
        .hw_vlan_filter = 0, /* VLAN filtering disabled */
        .jumbo_frame    = 0, /* Jumbo Frame Support disabled */
        .hw_strip_crc   = 0,
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = NULL,
            .rss_hf = ETH_RSS_IPV4_TCP | ETH_RSS_IPV4 | ETH_RSS_IPV6
                      | ETH_RSS_IPV4_UDP | ETH_RSS_IPV6_TCP | ETH_RSS_IPV6_UDP,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

static const struct rte_eth_rxconf rx_conf = {
    .rx_thresh = {
        .pthresh = RX_PTHRESH,
        .hthresh = RX_HTHRESH,
        .wthresh = RX_WTHRESH,
    },
};

static const struct rte_eth_txconf tx_conf = {
    .tx_thresh = {
        .pthresh = TX_PTHRESH,
        .hthresh = TX_HTHRESH,
        .wthresh = TX_WTHRESH,
    },
    .tx_free_thresh = 0,
    .tx_rs_thresh = 0,
    .txq_flags = ETH_TXQ_FLAGS_NOMULTSEGS | ETH_TXQ_FLAGS_NOOFFLOADS,
};

enum { MAX_RX_QUEUE_LEN = 192 };
enum { MAX_TX_QUEUE_LEN = 384 };
enum { DPDK_RING_SIZE = 256 };
BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
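/* (DPDK's rte_ring_create() requires a power-of-2 ring size, hence the
 * assertion above.) */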
enum { DRAIN_TSC = 200000ULL };

static int rte_eal_init_ret = ENODEV;

static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dpdk_dev's. */
static struct list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
    = LIST_INITIALIZER(&dpdk_list);

static struct list dpdk_mp_list OVS_GUARDED_BY(dpdk_mutex)
    = LIST_INITIALIZER(&dpdk_mp_list);

/* This mutex must be used by non pmd threads when allocating or freeing
 * mbufs through mempools. Since dpdk_queue_pkts() and dpdk_queue_flush() may
 * use mempools, a non pmd thread should hold this mutex while calling them. */
struct ovs_mutex nonpmd_mempool_mutex = OVS_MUTEX_INITIALIZER;
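
/*
 * A sketch of the expected calling pattern for a non-pmd thread (see
 * dpdk_do_tx_copy() below for the real use):
 *
 *     if (!thread_is_pmd()) {
 *         ovs_mutex_lock(&nonpmd_mempool_mutex);
 *     }
 *     ... allocate/free mbufs, dpdk_queue_pkts(), dpdk_queue_flush() ...
 *     if (!thread_is_pmd()) {
 *         ovs_mutex_unlock(&nonpmd_mempool_mutex);
 *     }
 */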

struct dpdk_mp {
    struct rte_mempool *mp;
    int mtu;
    int socket_id;
    int refcount;
    struct list list_node OVS_GUARDED_BY(dpdk_mutex);
};

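/*
 * Per-queue software transmit buffer: packets accumulate in 'burst_pkts'
 * under 'tx_lock', and 'tsc' records the timer-cycle count of the last
 * flush, which dpdk_queue_pkts() compares against DRAIN_TSC.
 */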
struct dpdk_tx_queue {
    rte_spinlock_t tx_lock;
    int count;
    uint64_t tsc;
    struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
};

/* DPDK has no way to remove DPDK ring Ethernet devices, so we have to keep
 * them around once they have been created. */

static struct list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
    = LIST_INITIALIZER(&dpdk_ring_list);

struct dpdk_ring {
    /* For the client rings */
    struct rte_ring *cring_tx;
    struct rte_ring *cring_rx;
    int user_port_id; /* User given port no, parsed from port name */
    int eth_port_id;  /* Ethernet device port id */
    struct list list_node OVS_GUARDED_BY(dpdk_mutex);
};

struct netdev_dpdk {
    struct netdev up;
    int port_id;
    int max_packet_len;

    struct dpdk_tx_queue tx_q[NR_QUEUE];

    struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);

    struct dpdk_mp *dpdk_mp;
    int mtu;
    int socket_id;
    int buf_size;
    struct netdev_stats stats_offset;
    struct netdev_stats stats;

    uint8_t hwaddr[ETH_ADDR_LEN];
    enum netdev_flags flags;

    struct rte_eth_link link;
    int link_reset_cnt;

    /* In dpdk_list. */
    struct list list_node OVS_GUARDED_BY(dpdk_mutex);
};

struct netdev_rxq_dpdk {
    struct netdev_rxq up;
    int port_id;
};

static bool thread_is_pmd(void);

static int netdev_dpdk_construct(struct netdev *);

static bool
is_dpdk_class(const struct netdev_class *class)
{
    return class->construct == netdev_dpdk_construct;
}

/* XXX: use DPDK malloc for the entire OVS. In fact, huge pages should be
 * used for all the other segments: data, bss and text. */

static void *
dpdk_rte_mzalloc(size_t sz)
{
    void *ptr;

    ptr = rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);
    if (ptr == NULL) {
        out_of_memory();
    }
    return ptr;
}

/* XXX this function should be called only by pmd threads (or by non pmd
 * threads holding the nonpmd_mempool_mutex). */
void
free_dpdk_buf(struct dpif_packet *p)
{
    struct rte_mbuf *pkt = (struct rte_mbuf *) p;

    rte_pktmbuf_free_seg(pkt);
}

static void
__rte_pktmbuf_init(struct rte_mempool *mp,
                   void *opaque_arg OVS_UNUSED,
                   void *_m,
                   unsigned i OVS_UNUSED)
{
    struct rte_mbuf *m = _m;
    uint32_t buf_len = mp->elt_size - sizeof(struct dpif_packet);

    RTE_MBUF_ASSERT(mp->elt_size >= sizeof(struct dpif_packet));

    memset(m, 0, mp->elt_size);

    /* start of buffer is just after mbuf structure */
    m->buf_addr = (char *)m + sizeof(struct dpif_packet);
    m->buf_physaddr = rte_mempool_virt2phy(mp, m) +
                      sizeof(struct dpif_packet);
    m->buf_len = (uint16_t)buf_len;

    /* keep some headroom between start of buffer and data */
    m->pkt.data = (char *) m->buf_addr + RTE_MIN(RTE_PKTMBUF_HEADROOM, m->buf_len);

    /* init some constant fields */
    m->type = RTE_MBUF_PKT;
    m->pool = mp;
    m->pkt.nb_segs = 1;
    m->pkt.in_port = 0xff;
}

static void
ovs_rte_pktmbuf_init(struct rte_mempool *mp,
                     void *opaque_arg OVS_UNUSED,
                     void *_m,
                     unsigned i OVS_UNUSED)
{
    struct rte_mbuf *m = _m;

    __rte_pktmbuf_init(mp, opaque_arg, _m, i);

    ofpbuf_init_dpdk((struct ofpbuf *) m, m->buf_len);
}

static struct dpdk_mp *
dpdk_mp_get(int socket_id, int mtu) OVS_REQUIRES(dpdk_mutex)
{
    struct dpdk_mp *dmp = NULL;
    char mp_name[RTE_MEMPOOL_NAMESIZE];

    LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
        if (dmp->socket_id == socket_id && dmp->mtu == mtu) {
            dmp->refcount++;
            return dmp;
        }
    }

    dmp = dpdk_rte_mzalloc(sizeof *dmp);
    dmp->socket_id = socket_id;
    dmp->mtu = mtu;
    dmp->refcount = 1;

    if (snprintf(mp_name, RTE_MEMPOOL_NAMESIZE, "ovs_mp_%d_%d", dmp->mtu,
                 dmp->socket_id) < 0) {
        rte_free(dmp);
        return NULL;
    }

    dmp->mp = rte_mempool_create(mp_name, NB_MBUF, MBUF_SIZE(mtu),
                                 MP_CACHE_SZ,
                                 sizeof(struct rte_pktmbuf_pool_private),
                                 rte_pktmbuf_pool_init, NULL,
                                 ovs_rte_pktmbuf_init, NULL,
                                 socket_id, 0);

    if (dmp->mp == NULL) {
        rte_free(dmp);
        return NULL;
    }

    list_push_back(&dpdk_mp_list, &dmp->list_node);
    return dmp;
}

static void
dpdk_mp_put(struct dpdk_mp *dmp)
{
    if (!dmp) {
        return;
    }

    dmp->refcount--;
    ovs_assert(dmp->refcount >= 0);

#if 0
    /* I could not find any API to destroy mp. */
    if (dmp->refcount == 0) {
        list_delete(dmp->list_node);
        /* destroy mp-pool. */
    }
#endif
}

static void
check_link_status(struct netdev_dpdk *dev)
{
    struct rte_eth_link link;

    rte_eth_link_get_nowait(dev->port_id, &link);

    if (dev->link.link_status != link.link_status) {
        netdev_change_seq_changed(&dev->up);

        dev->link_reset_cnt++;
        dev->link = link;
        if (dev->link.link_status) {
            VLOG_DBG_RL(&rl, "Port %d Link Up - speed %u Mbps - %s",
                        dev->port_id, (unsigned) dev->link.link_speed,
                        (dev->link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
        } else {
            VLOG_DBG_RL(&rl, "Port %d Link Down", dev->port_id);
        }
    }
}

static void *
dpdk_watchdog(void *dummy OVS_UNUSED)
{
    struct netdev_dpdk *dev;

    pthread_detach(pthread_self());

    for (;;) {
        ovs_mutex_lock(&dpdk_mutex);
        LIST_FOR_EACH (dev, list_node, &dpdk_list) {
            ovs_mutex_lock(&dev->mutex);
            check_link_status(dev);
            ovs_mutex_unlock(&dev->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);
        xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
    }

    return NULL;
}

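/*
 * Brings up a physical port: configures NR_QUEUE rx and tx queues, starts
 * the device, enables promiscuous and allmulticast mode, and caches the MAC
 * address, link state and buffer size in 'dev'.
 */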
static int
dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
{
    struct rte_pktmbuf_pool_private *mbp_priv;
    struct ether_addr eth_addr;
    int diag;
    int i;

    if (dev->port_id < 0 || dev->port_id >= rte_eth_dev_count()) {
        return ENODEV;
    }

    diag = rte_eth_dev_configure(dev->port_id, NR_QUEUE, NR_QUEUE, &port_conf);
    if (diag) {
        VLOG_ERR("eth dev config error %d", diag);
        return -diag;
    }

    for (i = 0; i < NR_QUEUE; i++) {
        diag = rte_eth_tx_queue_setup(dev->port_id, i, NIC_PORT_TX_Q_SIZE,
                                      dev->socket_id, &tx_conf);
        if (diag) {
            VLOG_ERR("eth dev tx queue setup error %d", diag);
            return -diag;
        }
    }

    for (i = 0; i < NR_QUEUE; i++) {
        diag = rte_eth_rx_queue_setup(dev->port_id, i, NIC_PORT_RX_Q_SIZE,
                                      dev->socket_id,
                                      &rx_conf, dev->dpdk_mp->mp);
        if (diag) {
            VLOG_ERR("eth dev rx queue setup error %d", diag);
            return -diag;
        }
    }

    diag = rte_eth_dev_start(dev->port_id);
    if (diag) {
        VLOG_ERR("eth dev start error %d", diag);
        return -diag;
    }

    rte_eth_promiscuous_enable(dev->port_id);
    rte_eth_allmulticast_enable(dev->port_id);

    memset(&eth_addr, 0x0, sizeof(eth_addr));
    rte_eth_macaddr_get(dev->port_id, &eth_addr);
    VLOG_INFO_RL(&rl, "Port %d: "ETH_ADDR_FMT"",
                 dev->port_id, ETH_ADDR_ARGS(eth_addr.addr_bytes));

    memcpy(dev->hwaddr, eth_addr.addr_bytes, ETH_ADDR_LEN);
    rte_eth_link_get_nowait(dev->port_id, &dev->link);

    mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
    dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;

    dev->flags = NETDEV_UP | NETDEV_PROMISC;
    return 0;
}

static struct netdev_dpdk *
netdev_dpdk_cast(const struct netdev *netdev)
{
    return CONTAINER_OF(netdev, struct netdev_dpdk, up);
}

static struct netdev *
netdev_dpdk_alloc(void)
{
    struct netdev_dpdk *netdev = dpdk_rte_mzalloc(sizeof *netdev);
    return &netdev->up;
}

static int
netdev_dpdk_init(struct netdev *netdev_, unsigned int port_no)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
    int err = 0;
    int i;

    ovs_mutex_init(&netdev->mutex);

    ovs_mutex_lock(&netdev->mutex);

    for (i = 0; i < NR_QUEUE; i++) {
        rte_spinlock_init(&netdev->tx_q[i].tx_lock);
    }

    netdev->port_id = port_no;

    netdev->flags = 0;
    netdev->mtu = ETHER_MTU;
    netdev->max_packet_len = MTU_TO_MAX_LEN(netdev->mtu);

    /* XXX: need to discover device node at run time. */
    netdev->socket_id = SOCKET0;

    netdev->dpdk_mp = dpdk_mp_get(netdev->socket_id, netdev->mtu);
    if (!netdev->dpdk_mp) {
        err = ENOMEM;
        goto unlock;
    }

    err = dpdk_eth_dev_init(netdev);
    if (err) {
        goto unlock;
    }
    netdev_->n_txq = NR_QUEUE;
    netdev_->n_rxq = NR_QUEUE;

    list_push_back(&dpdk_list, &netdev->list_node);

unlock:
    ovs_mutex_unlock(&netdev->mutex);
    return err;
}

static int
dpdk_dev_parse_name(const char dev_name[], const char prefix[],
                    unsigned int *port_no)
{
    const char *cport;

    if (strncmp(dev_name, prefix, strlen(prefix))) {
        return ENODEV;
    }

    cport = dev_name + strlen(prefix);
    *port_no = strtol(cport, 0, 0); /* string must be null terminated */
    return 0;
}
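
/*
 * For example, dpdk_dev_parse_name("dpdk5", "dpdk", &port_no) stores 5 in
 * '*port_no'; since strtol() is called with base 0, hex and octal suffixes
 * are accepted as well.
 */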

static int
netdev_dpdk_construct(struct netdev *netdev)
{
    unsigned int port_no;
    int err;

    if (rte_eal_init_ret) {
        return rte_eal_init_ret;
    }

    /* Names always start with "dpdk" */
    err = dpdk_dev_parse_name(netdev->name, "dpdk", &port_no);
    if (err) {
        return err;
    }

    ovs_mutex_lock(&dpdk_mutex);
    err = netdev_dpdk_init(netdev, port_no);
    ovs_mutex_unlock(&dpdk_mutex);
    return err;
}

static void
netdev_dpdk_destruct(struct netdev *netdev_)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);

    ovs_mutex_lock(&dev->mutex);
    rte_eth_dev_stop(dev->port_id);
    ovs_mutex_unlock(&dev->mutex);

    ovs_mutex_lock(&dpdk_mutex);
    list_remove(&dev->list_node);
    dpdk_mp_put(dev->dpdk_mp);
    ovs_mutex_unlock(&dpdk_mutex);

    ovs_mutex_destroy(&dev->mutex);
}

static void
netdev_dpdk_dealloc(struct netdev *netdev_)
{
    struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);

    rte_free(netdev);
}

static int
netdev_dpdk_get_config(const struct netdev *netdev_, struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);

    ovs_mutex_lock(&dev->mutex);

    /* XXX: Allow to configure number of queues. */
    smap_add_format(args, "configured_rx_queues", "%u", netdev_->n_rxq);
    smap_add_format(args, "configured_tx_queues", "%u", netdev_->n_txq);
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_numa_id(const struct netdev *netdev_)
{
    struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);

    return netdev->socket_id;
}

static struct netdev_rxq *
netdev_dpdk_rxq_alloc(void)
{
    struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);

    return &rx->up;
}

static struct netdev_rxq_dpdk *
netdev_rxq_dpdk_cast(const struct netdev_rxq *rx)
{
    return CONTAINER_OF(rx, struct netdev_rxq_dpdk, up);
}

static int
netdev_dpdk_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq_);
    struct netdev_dpdk *netdev = netdev_dpdk_cast(rx->up.netdev);

    ovs_mutex_lock(&netdev->mutex);
    rx->port_id = netdev->port_id;
    ovs_mutex_unlock(&netdev->mutex);

    return 0;
}

static void
netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq_ OVS_UNUSED)
{
}

static void
netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq_);

    rte_free(rx);
}

static inline void
dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
{
    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
    uint32_t nb_tx = 0;

    while (nb_tx != txq->count) {
        uint32_t ret;

        ret = rte_eth_tx_burst(dev->port_id, qid, txq->burst_pkts + nb_tx,
                               txq->count - nb_tx);
        if (!ret) {
            break;
        }

        nb_tx += ret;
    }

    if (OVS_UNLIKELY(nb_tx != txq->count)) {
        /* Free the buffers that we couldn't transmit, one at a time (each
         * packet could come from a different mempool). */
        int i;

        for (i = nb_tx; i < txq->count; i++) {
            rte_pktmbuf_free_seg(txq->burst_pkts[i]);
        }
        ovs_mutex_lock(&dev->mutex);
        dev->stats.tx_dropped += txq->count - nb_tx;
        ovs_mutex_unlock(&dev->mutex);
    }

    txq->count = 0;
    txq->tsc = rte_get_timer_cycles();
}

static inline void
dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
{
    struct dpdk_tx_queue *txq = &dev->tx_q[qid];

    if (txq->count == 0) {
        return;
    }
    rte_spinlock_lock(&txq->tx_lock);
    dpdk_queue_flush__(dev, qid);
    rte_spinlock_unlock(&txq->tx_lock);
}

static int
netdev_dpdk_rxq_recv(struct netdev_rxq *rxq_, struct dpif_packet **packets,
                     int *c)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq_);
    struct netdev *netdev = rx->up.netdev;
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int nb_rx;

    dpdk_queue_flush(dev, rxq_->queue_id);

    nb_rx = rte_eth_rx_burst(rx->port_id, rxq_->queue_id,
                             (struct rte_mbuf **) packets,
                             MIN((int) NETDEV_MAX_RX_BATCH,
                                 (int) MAX_RX_QUEUE_LEN));
    if (!nb_rx) {
        return EAGAIN;
    }

    *c = nb_rx;

    return 0;
}

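/*
 * Enqueues 'pkts' on the software tx queue 'qid'. The queue is drained to
 * the NIC when it fills up (MAX_TX_QUEUE_LEN) or when more than DRAIN_TSC
 * timer cycles have passed since the last flush, so packets are never held
 * back indefinitely.
 */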
inline static void
dpdk_queue_pkts(struct netdev_dpdk *dev, int qid,
                struct rte_mbuf **pkts, int cnt)
{
    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
    uint64_t diff_tsc;

    int i = 0;

    rte_spinlock_lock(&txq->tx_lock);
    while (i < cnt) {
        int freeslots = MAX_TX_QUEUE_LEN - txq->count;
        int tocopy = MIN(freeslots, cnt - i);

        memcpy(&txq->burst_pkts[txq->count], &pkts[i],
               tocopy * sizeof (struct rte_mbuf *));

        txq->count += tocopy;
        i += tocopy;

        if (txq->count == MAX_TX_QUEUE_LEN) {
            dpdk_queue_flush__(dev, qid);
        }
        diff_tsc = rte_get_timer_cycles() - txq->tsc;
        if (diff_tsc >= DRAIN_TSC) {
            dpdk_queue_flush__(dev, qid);
        }
    }
    rte_spinlock_unlock(&txq->tx_lock);
}

/* Tx function. Copies the packets into DPDK mbufs and transmits them. */
static void
dpdk_do_tx_copy(struct netdev *netdev, struct dpif_packet **pkts, int cnt)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_mbuf *mbufs[cnt];
    int dropped = 0;
    int newcnt = 0;
    int i;

    /* If we are on a non pmd thread we have to use the mempool mutex,
     * because every non pmd thread shares the same mempool cache. */

    if (!thread_is_pmd()) {
        ovs_mutex_lock(&nonpmd_mempool_mutex);
    }

    for (i = 0; i < cnt; i++) {
        int size = ofpbuf_size(&pkts[i]->ofpbuf);

        if (OVS_UNLIKELY(size > dev->max_packet_len)) {
            VLOG_WARN_RL(&rl, "Too big size %d max_packet_len %d",
                         (int) size, dev->max_packet_len);

            dropped++;
            continue;
        }

        mbufs[newcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp);

        if (!mbufs[newcnt]) {
            dropped += cnt - i;
            break;
        }

        /* We have to do a copy for now */
        memcpy(mbufs[newcnt]->pkt.data, ofpbuf_data(&pkts[i]->ofpbuf), size);

        rte_pktmbuf_data_len(mbufs[newcnt]) = size;
        rte_pktmbuf_pkt_len(mbufs[newcnt]) = size;

        newcnt++;
    }

    if (OVS_UNLIKELY(dropped)) {
        ovs_mutex_lock(&dev->mutex);
        dev->stats.tx_dropped += dropped;
        ovs_mutex_unlock(&dev->mutex);
    }

    dpdk_queue_pkts(dev, NON_PMD_THREAD_TX_QUEUE, mbufs, newcnt);
    dpdk_queue_flush(dev, NON_PMD_THREAD_TX_QUEUE);

    if (!thread_is_pmd()) {
        ovs_mutex_unlock(&nonpmd_mempool_mutex);
    }
}

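/*
 * Send path. If the packets cannot be stolen, or do not already live in
 * DPDK mbufs (ofpbuf source != OFPBUF_DPDK), they are copied into fresh
 * mbufs by dpdk_do_tx_copy(); otherwise the mbufs are queued directly,
 * dropping any packet larger than 'max_packet_len'.
 */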
static int
netdev_dpdk_send(struct netdev *netdev, int qid, struct dpif_packet **pkts,
                 int cnt, bool may_steal)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int i;

    if (!may_steal || pkts[0]->ofpbuf.source != OFPBUF_DPDK) {
        dpdk_do_tx_copy(netdev, pkts, cnt);

        if (may_steal) {
            for (i = 0; i < cnt; i++) {
                dpif_packet_delete(pkts[i]);
            }
        }
    } else {
        int next_tx_idx = 0;
        int dropped = 0;

        qid = rte_lcore_id() % NR_QUEUE;

        for (i = 0; i < cnt; i++) {
            int size = ofpbuf_size(&pkts[i]->ofpbuf);
            if (OVS_UNLIKELY(size > dev->max_packet_len)) {
                if (next_tx_idx != i) {
                    dpdk_queue_pkts(dev, qid,
                                    (struct rte_mbuf **) &pkts[next_tx_idx],
                                    i - next_tx_idx);
                }

                VLOG_WARN_RL(&rl, "Too big size %d max_packet_len %d",
                             (int) size, dev->max_packet_len);

                dpif_packet_delete(pkts[i]);
                dropped++;
                next_tx_idx = i + 1;
            }
        }
        if (next_tx_idx != cnt) {
            dpdk_queue_pkts(dev, qid,
                            (struct rte_mbuf **) &pkts[next_tx_idx],
                            cnt - next_tx_idx);
        }

        if (OVS_UNLIKELY(dropped)) {
            ovs_mutex_lock(&dev->mutex);
            dev->stats.tx_dropped += dropped;
            ovs_mutex_unlock(&dev->mutex);
        }
    }

    return 0;
}

static int
netdev_dpdk_set_etheraddr(struct netdev *netdev,
                          const uint8_t mac[ETH_ADDR_LEN])
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    if (!eth_addr_equals(dev->hwaddr, mac)) {
        memcpy(dev->hwaddr, mac, ETH_ADDR_LEN);
        netdev_change_seq_changed(netdev);
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_etheraddr(const struct netdev *netdev,
                          uint8_t mac[ETH_ADDR_LEN])
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    memcpy(mac, dev->hwaddr, ETH_ADDR_LEN);
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    *mtup = dev->mtu;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_set_mtu(const struct netdev *netdev, int mtu)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int old_mtu, err;
    struct dpdk_mp *old_mp;
    struct dpdk_mp *mp;

    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);
    if (dev->mtu == mtu) {
        err = 0;
        goto out;
    }

    mp = dpdk_mp_get(dev->socket_id, mtu);
    if (!mp) {
        err = ENOMEM;
        goto out;
    }

    rte_eth_dev_stop(dev->port_id);

    old_mtu = dev->mtu;
    old_mp = dev->dpdk_mp;
    dev->dpdk_mp = mp;
    dev->mtu = mtu;
    dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);

    err = dpdk_eth_dev_init(dev);
    if (err) {
        dpdk_mp_put(mp);
        dev->mtu = old_mtu;
        dev->dpdk_mp = old_mp;
        dev->max_packet_len = MTU_TO_MAX_LEN(dev->mtu);
        dpdk_eth_dev_init(dev);
        goto out;
    }

    dpdk_mp_put(old_mp);
    netdev_change_seq_changed(netdev);
out:
    ovs_mutex_unlock(&dev->mutex);
    ovs_mutex_unlock(&dpdk_mutex);
    return err;
}

static int
netdev_dpdk_get_carrier(const struct netdev *netdev_, bool *carrier);

static int
netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_stats rte_stats;
    bool gg;

    netdev_dpdk_get_carrier(netdev, &gg);
    ovs_mutex_lock(&dev->mutex);
    rte_eth_stats_get(dev->port_id, &rte_stats);

    *stats = dev->stats_offset;

    stats->rx_packets += rte_stats.ipackets;
    stats->tx_packets += rte_stats.opackets;
    stats->rx_bytes += rte_stats.ibytes;
    stats->tx_bytes += rte_stats.obytes;
    stats->rx_errors += rte_stats.ierrors;
    stats->tx_errors += rte_stats.oerrors;
    stats->multicast += rte_stats.imcasts;

    stats->tx_dropped += dev->stats.tx_dropped;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_set_stats(struct netdev *netdev, const struct netdev_stats *stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    dev->stats_offset = *stats;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_features(const struct netdev *netdev_,
                         enum netdev_features *current,
                         enum netdev_features *advertised OVS_UNUSED,
                         enum netdev_features *supported OVS_UNUSED,
                         enum netdev_features *peer OVS_UNUSED)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
    struct rte_eth_link link;

    ovs_mutex_lock(&dev->mutex);
    link = dev->link;
    ovs_mutex_unlock(&dev->mutex);

    if (link.link_duplex == ETH_LINK_AUTONEG_DUPLEX) {
        if (link.link_speed == ETH_LINK_SPEED_AUTONEG) {
            *current = NETDEV_F_AUTONEG;
        }
    } else if (link.link_duplex == ETH_LINK_HALF_DUPLEX) {
        if (link.link_speed == ETH_LINK_SPEED_10) {
            *current = NETDEV_F_10MB_HD;
        }
        if (link.link_speed == ETH_LINK_SPEED_100) {
            *current = NETDEV_F_100MB_HD;
        }
        if (link.link_speed == ETH_LINK_SPEED_1000) {
            *current = NETDEV_F_1GB_HD;
        }
    } else if (link.link_duplex == ETH_LINK_FULL_DUPLEX) {
        if (link.link_speed == ETH_LINK_SPEED_10) {
            *current = NETDEV_F_10MB_FD;
        }
        if (link.link_speed == ETH_LINK_SPEED_100) {
            *current = NETDEV_F_100MB_FD;
        }
        if (link.link_speed == ETH_LINK_SPEED_1000) {
            *current = NETDEV_F_1GB_FD;
        }
        if (link.link_speed == ETH_LINK_SPEED_10000) {
            *current = NETDEV_F_10GB_FD;
        }
    }

    return 0;
}

static int
netdev_dpdk_get_ifindex(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int ifindex;

    ovs_mutex_lock(&dev->mutex);
    ifindex = dev->port_id;
    ovs_mutex_unlock(&dev->mutex);

    return ifindex;
}

static int
netdev_dpdk_get_carrier(const struct netdev *netdev_, bool *carrier)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);

    ovs_mutex_lock(&dev->mutex);
    check_link_status(dev);
    *carrier = dev->link.link_status;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static long long int
netdev_dpdk_get_carrier_resets(const struct netdev *netdev_)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
    long long int carrier_resets;

    ovs_mutex_lock(&dev->mutex);
    carrier_resets = dev->link_reset_cnt;
    ovs_mutex_unlock(&dev->mutex);

    return carrier_resets;
}

static int
netdev_dpdk_set_miimon(struct netdev *netdev_ OVS_UNUSED,
                       long long int interval OVS_UNUSED)
{
    return 0;
}

static int
netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
                           enum netdev_flags off, enum netdev_flags on,
                           enum netdev_flags *old_flagsp)
    OVS_REQUIRES(dev->mutex)
{
    int err;

    if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
        return EINVAL;
    }

    *old_flagsp = dev->flags;
    dev->flags |= on;
    dev->flags &= ~off;

    if (dev->flags == *old_flagsp) {
        return 0;
    }

    if (dev->flags & NETDEV_UP) {
        err = rte_eth_dev_start(dev->port_id);
        if (err) {
            return -err;
        }
    }

    if (dev->flags & NETDEV_PROMISC) {
        rte_eth_promiscuous_enable(dev->port_id);
    }

    if (!(dev->flags & NETDEV_UP)) {
        rte_eth_dev_stop(dev->port_id);
    }

    return 0;
}

static int
netdev_dpdk_update_flags(struct netdev *netdev_,
                         enum netdev_flags off, enum netdev_flags on,
                         enum netdev_flags *old_flagsp)
{
    struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = netdev_dpdk_update_flags__(netdev, off, on, old_flagsp);
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}

static int
netdev_dpdk_get_status(const struct netdev *netdev_, struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
    struct rte_eth_dev_info dev_info;

    if (dev->port_id < 0) {
        return ENODEV;
    }

    ovs_mutex_lock(&dev->mutex);
    rte_eth_dev_info_get(dev->port_id, &dev_info);
    ovs_mutex_unlock(&dev->mutex);

    smap_add_format(args, "driver_name", "%s", dev_info.driver_name);

    smap_add_format(args, "port_no", "%d", dev->port_id);
    smap_add_format(args, "numa_id", "%d", rte_eth_dev_socket_id(dev->port_id));
    smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
    smap_add_format(args, "max_rx_pktlen", "%u", dev_info.max_rx_pktlen);
    smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
    smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
    smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
    smap_add_format(args, "max_hash_mac_addrs", "%u", dev_info.max_hash_mac_addrs);
    smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
    smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);

    smap_add_format(args, "pci-vendor_id", "0x%x", dev_info.pci_dev->id.vendor_id);
    smap_add_format(args, "pci-device_id", "0x%x", dev_info.pci_dev->id.device_id);

    return 0;
}

static void
netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
    OVS_REQUIRES(dev->mutex)
{
    enum netdev_flags old_flags;

    if (admin_state) {
        netdev_dpdk_update_flags__(dev, 0, NETDEV_UP, &old_flags);
    } else {
        netdev_dpdk_update_flags__(dev, NETDEV_UP, 0, &old_flags);
    }
}

static void
netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
                            const char *argv[], void *aux OVS_UNUSED)
{
    bool up;

    if (!strcasecmp(argv[argc - 1], "up")) {
        up = true;
    } else if (!strcasecmp(argv[argc - 1], "down")) {
        up = false;
    } else {
        unixctl_command_reply_error(conn, "Invalid Admin State");
        return;
    }

    if (argc > 2) {
        struct netdev *netdev = netdev_from_name(argv[1]);
        if (netdev && is_dpdk_class(netdev->netdev_class)) {
            struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);

            ovs_mutex_lock(&dpdk_dev->mutex);
            netdev_dpdk_set_admin_state__(dpdk_dev, up);
            ovs_mutex_unlock(&dpdk_dev->mutex);

            netdev_close(netdev);
        } else {
            unixctl_command_reply_error(conn, "Not a DPDK Interface");
            netdev_close(netdev);
            return;
        }
    } else {
        struct netdev_dpdk *netdev;

        ovs_mutex_lock(&dpdk_mutex);
        LIST_FOR_EACH (netdev, list_node, &dpdk_list) {
            ovs_mutex_lock(&netdev->mutex);
            netdev_dpdk_set_admin_state__(netdev, up);
            ovs_mutex_unlock(&netdev->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);
    }
    unixctl_command_reply(conn, "OK");
}

static void
dpdk_common_init(void)
{
    unixctl_command_register("netdev-dpdk/set-admin-state",
                             "[netdev] up|down", 1, 2,
                             netdev_dpdk_set_admin_state, NULL);

    ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
}

static int
dpdk_class_init(void)
{
    int result;

    result = rte_eal_pci_probe();
    if (result) {
        VLOG_ERR("Cannot probe PCI");
        return -result;
    }

    VLOG_INFO("Ethernet Device Count: %d", (int) rte_eth_dev_count());

    return 0;
}

/* Client Rings */

static int
dpdk_ring_create(const char dev_name[], unsigned int port_no,
                 unsigned int *eth_port_id)
{
    struct dpdk_ring *ivshmem;
    char ring_name[10];
    int err;

    ivshmem = dpdk_rte_mzalloc(sizeof *ivshmem);
    if (ivshmem == NULL) {
        return ENOMEM;
    }

    err = snprintf(ring_name, 10, "%s_tx", dev_name);
    if (err < 0) {
        rte_free(ivshmem);
        return -err;
    }

    ivshmem->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0, 0);
    if (ivshmem->cring_tx == NULL) {
        rte_free(ivshmem);
        return ENOMEM;
    }

    err = snprintf(ring_name, 10, "%s_rx", dev_name);
    if (err < 0) {
        rte_free(ivshmem);
        return -err;
    }

    ivshmem->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0, 0);
    if (ivshmem->cring_rx == NULL) {
        rte_free(ivshmem);
        return ENOMEM;
    }

    err = rte_eth_from_rings(dev_name, &ivshmem->cring_rx, 1,
                             &ivshmem->cring_tx, 1, SOCKET0);

    if (err < 0) {
        rte_free(ivshmem);
        return ENODEV;
    }

    ivshmem->user_port_id = port_no;
    ivshmem->eth_port_id = rte_eth_dev_count() - 1;
    list_push_back(&dpdk_ring_list, &ivshmem->list_node);

    *eth_port_id = ivshmem->eth_port_id;
    return 0;
}

static int
dpdk_ring_open(const char dev_name[], unsigned int *eth_port_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct dpdk_ring *ivshmem;
    unsigned int port_no;
    int err = 0;

    /* Names always start with "dpdkr" */
    err = dpdk_dev_parse_name(dev_name, "dpdkr", &port_no);
    if (err) {
        return err;
    }

    /* Look through our list to find the device. */
    LIST_FOR_EACH (ivshmem, list_node, &dpdk_ring_list) {
        if (ivshmem->user_port_id == port_no) {
            VLOG_INFO("Found dpdk ring device %s:\n", dev_name);
            *eth_port_id = ivshmem->eth_port_id; /* really all that is needed */
            return 0;
        }
    }
    /* Need to create the device rings */
    return dpdk_ring_create(dev_name, port_no, eth_port_id);
}

static int
netdev_dpdk_ring_construct(struct netdev *netdev)
{
    unsigned int port_no = 0;
    int err = 0;

    if (rte_eal_init_ret) {
        return rte_eal_init_ret;
    }

    ovs_mutex_lock(&dpdk_mutex);

    err = dpdk_ring_open(netdev->name, &port_no);
    if (err) {
        goto unlock_dpdk;
    }

    err = netdev_dpdk_init(netdev, port_no);

unlock_dpdk:
    ovs_mutex_unlock(&dpdk_mutex);
    return err;
}

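/*
 * The "dpdk" (physical port) and "dpdkr" (client ring) classes below share
 * every callback except their init and construct functions, so this macro
 * fills in the common netdev_class fields.
 */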
#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT)             \
{                                                            \
    NAME,                                                    \
    INIT,                       /* init */                   \
    NULL,                       /* netdev_dpdk_run */        \
    NULL,                       /* netdev_dpdk_wait */       \
                                                             \
    netdev_dpdk_alloc,                                       \
    CONSTRUCT,                                               \
    netdev_dpdk_destruct,                                    \
    netdev_dpdk_dealloc,                                     \
    netdev_dpdk_get_config,                                  \
    NULL,                       /* netdev_dpdk_set_config */ \
    NULL,                       /* get_tunnel_config */      \
    netdev_dpdk_get_numa_id,    /* get_numa_id */            \
                                                             \
    netdev_dpdk_send,           /* send */                   \
    NULL,                       /* send_wait */              \
                                                             \
    netdev_dpdk_set_etheraddr,                               \
    netdev_dpdk_get_etheraddr,                               \
    netdev_dpdk_get_mtu,                                     \
    netdev_dpdk_set_mtu,                                     \
    netdev_dpdk_get_ifindex,                                 \
    netdev_dpdk_get_carrier,                                 \
    netdev_dpdk_get_carrier_resets,                          \
    netdev_dpdk_set_miimon,                                  \
    netdev_dpdk_get_stats,                                   \
    netdev_dpdk_set_stats,                                   \
    netdev_dpdk_get_features,                                \
    NULL,                       /* set_advertisements */     \
                                                             \
    NULL,                       /* set_policing */           \
    NULL,                       /* get_qos_types */          \
    NULL,                       /* get_qos_capabilities */   \
    NULL,                       /* get_qos */                \
    NULL,                       /* set_qos */                \
    NULL,                       /* get_queue */              \
    NULL,                       /* set_queue */              \
    NULL,                       /* delete_queue */           \
    NULL,                       /* get_queue_stats */        \
    NULL,                       /* queue_dump_start */       \
    NULL,                       /* queue_dump_next */        \
    NULL,                       /* queue_dump_done */        \
    NULL,                       /* dump_queue_stats */       \
                                                             \
    NULL,                       /* get_in4 */                \
    NULL,                       /* set_in4 */                \
    NULL,                       /* get_in6 */                \
    NULL,                       /* add_router */             \
    NULL,                       /* get_next_hop */           \
    netdev_dpdk_get_status,                                  \
    NULL,                       /* arp_lookup */             \
                                                             \
    netdev_dpdk_update_flags,                                \
                                                             \
    netdev_dpdk_rxq_alloc,                                   \
    netdev_dpdk_rxq_construct,                               \
    netdev_dpdk_rxq_destruct,                                \
    netdev_dpdk_rxq_dealloc,                                 \
    netdev_dpdk_rxq_recv,                                    \
    NULL,                       /* rx_wait */                \
    NULL,                       /* rxq_drain */              \
}

int
dpdk_init(int argc, char **argv)
{
    int result;

    if (argc < 2 || strcmp(argv[1], "--dpdk")) {
        return 0;
    }

    /* Make sure program name passed to rte_eal_init() is vswitchd. */
    argv[1] = argv[0];

    argc--;
    argv++;

    /* Make sure things are initialized ... */
    result = rte_eal_init(argc, argv);
    if (result < 0) {
        ovs_abort(result, "Cannot init EAL\n");
    }

    rte_memzone_dump(stdout);
    rte_eal_init_ret = 0;

    if (argc > result) {
        argv[result] = argv[0];
    }

    /* We are called from the main thread here */
    thread_set_nonpmd();

    return result + 1;
}

const struct netdev_class dpdk_class =
    NETDEV_DPDK_CLASS(
        "dpdk",
        dpdk_class_init,
        netdev_dpdk_construct);

const struct netdev_class dpdk_ring_class =
    NETDEV_DPDK_CLASS(
        "dpdkr",
        NULL,
        netdev_dpdk_ring_construct);

void
netdev_dpdk_register(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;

    if (rte_eal_init_ret) {
        return;
    }

    if (ovsthread_once_start(&once)) {
        dpdk_common_init();
        netdev_register_provider(&dpdk_class);
        netdev_register_provider(&dpdk_ring_class);
        ovsthread_once_done(&once);
    }
}

int
pmd_thread_setaffinity_cpu(int cpu)
{
    cpu_set_t cpuset;
    int err;

    CPU_ZERO(&cpuset);
    CPU_SET(cpu, &cpuset);
    err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
    if (err) {
        VLOG_ERR("Thread affinity error %d", err);
        return err;
    }
    /* lcore_id 0 is reserved for use by non pmd threads. */
    RTE_PER_LCORE(_lcore_id) = cpu + 1;

    return 0;
}
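
/*
 * With the scheme above, the pmd thread pinned to CPU 'n' runs as DPDK
 * lcore 'n + 1' (e.g. the pmd thread on CPU 0 becomes lcore 1), leaving
 * lcore 0 free for the non pmd threads set up by thread_set_nonpmd().
 */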

void
thread_set_nonpmd(void)
{
    /* We cannot have RTE_MAX_LCORE pmd threads, because lcore_id 0 is
     * reserved for non pmd threads. */
    BUILD_ASSERT(NR_PMD_THREADS < RTE_MAX_LCORE);
    /* We have to use 0 to allow non pmd threads to perform certain DPDK
     * operations, like rte_eth_dev_configure(). */
    RTE_PER_LCORE(_lcore_id) = 0;
}

static bool
thread_is_pmd(void)
{
    return rte_lcore_id() != 0;
}