netdev-dpdk: add DPDK pdump capability
1 /*
2 * Copyright (c) 2014, 2015, 2016 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include <string.h>
20 #include <signal.h>
21 #include <stdlib.h>
22 #include <pthread.h>
24 #include <errno.h>
25 #include <sched.h>
27 #include <unistd.h>
28 #include <sys/stat.h>
29 #include <stdio.h>
30 #include <sys/types.h>
32 #include <getopt.h>
33
34 #include "dirs.h"
35 #include "dp-packet.h"
36 #include "dpif-netdev.h"
37 #include "fatal-signal.h"
38 #include "netdev-dpdk.h"
39 #include "netdev-provider.h"
40 #include "netdev-vport.h"
41 #include "odp-util.h"
42 #include "openvswitch/dynamic-string.h"
43 #include "openvswitch/list.h"
44 #include "openvswitch/ofp-print.h"
45 #include "openvswitch/vlog.h"
46 #include "ovs-numa.h"
47 #include "ovs-thread.h"
48 #include "ovs-rcu.h"
49 #include "packets.h"
50 #include "openvswitch/shash.h"
51 #include "smap.h"
52 #include "sset.h"
53 #include "unaligned.h"
54 #include "timeval.h"
55 #include "unixctl.h"
56
57 #include "rte_config.h"
58 #include "rte_mbuf.h"
59 #include "rte_meter.h"
60 #ifdef DPDK_PDUMP
61 #include "rte_pdump.h"
62 #endif
63 #include "rte_virtio_net.h"
64
65 VLOG_DEFINE_THIS_MODULE(dpdk);
66 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
67
68 #define DPDK_PORT_WATCHDOG_INTERVAL 5
69
70 #define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
71 #define OVS_VPORT_DPDK "ovs_dpdk"
72
73 /*
74  * We need to reserve extra space in the mbufs so that we can align the
75  * DMA addresses to 4KB.
76  * The minimum mbuf size is bounded below to avoid RX scatter behaviour and
77  * a performance drop for the standard Ethernet MTU.
78 */
79 #define ETHER_HDR_MAX_LEN (ETHER_HDR_LEN + ETHER_CRC_LEN + (2 * VLAN_HEADER_LEN))
80 #define MTU_TO_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
81 #define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN)
82 #define FRAME_LEN_TO_MTU(frame_len) ((frame_len)- ETHER_HDR_LEN - ETHER_CRC_LEN)
83 #define MBUF_SIZE(mtu) ( MTU_TO_MAX_FRAME_LEN(mtu) \
84 + sizeof(struct dp_packet) \
85 + RTE_PKTMBUF_HEADROOM)
86 #define NETDEV_DPDK_MBUF_ALIGN 1024
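/*
 * Worked example (illustrative only, assuming the usual DPDK/OVS values
 * ETHER_HDR_LEN = 14, ETHER_CRC_LEN = 4 and VLAN_HEADER_LEN = 4):
 *
 *   ETHER_HDR_MAX_LEN          = 14 + 4 + 2 * 4 = 26
 *   MTU_TO_FRAME_LEN(1500)     = 1500 + 14 + 4  = 1518
 *   MTU_TO_MAX_FRAME_LEN(1500) = 1500 + 26      = 1526
 *   FRAME_LEN_TO_MTU(1518)     = 1518 - 14 - 4  = 1500
 *
 * MBUF_SIZE(1500) then adds sizeof(struct dp_packet) and
 * RTE_PKTMBUF_HEADROOM on top of the 1526-byte maximum frame.
 */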
87
88 /* Max and min number of packets in the mempool. OVS tries to allocate a
89 * mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
90 * enough hugepages) we keep halving the number until the allocation succeeds
91 * or we reach MIN_NB_MBUF */
92
93 #define MAX_NB_MBUF (4096 * 64)
94 #define MIN_NB_MBUF (4096 * 4)
95 #define MP_CACHE_SZ RTE_MEMPOOL_CACHE_MAX_SIZE
96
97  /* MAX_NB_MBUF can be halved repeatedly until it reaches MIN_NB_MBUF. */
98 BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF) == 0);
99
100 /* The smallest possible NB_MBUF that we're going to try should be a multiple
101 * of MP_CACHE_SZ. This is advised by DPDK documentation. */
102 BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))
103 % MP_CACHE_SZ == 0);
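/*
 * Example of the assertions above with the current constants:
 * MAX_NB_MBUF = 4096 * 64 = 262144 and MIN_NB_MBUF = 4096 * 4 = 16384, so
 * ROUND_DOWN_POW2(262144 / 16384) = 16 and 262144 % 16 == 0.  The smallest
 * pool we will try is 262144 / 16 = 16384 mbufs, which must be (and is) a
 * multiple of MP_CACHE_SZ (RTE_MEMPOOL_CACHE_MAX_SIZE, typically 512).
 */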
104
105 /*
106 * DPDK XSTATS Counter names definition
107 */
108 #define XSTAT_RX_64_PACKETS "rx_size_64_packets"
109 #define XSTAT_RX_65_TO_127_PACKETS "rx_size_65_to_127_packets"
110 #define XSTAT_RX_128_TO_255_PACKETS "rx_size_128_to_255_packets"
111 #define XSTAT_RX_256_TO_511_PACKETS "rx_size_256_to_511_packets"
112 #define XSTAT_RX_512_TO_1023_PACKETS "rx_size_512_to_1023_packets"
113 #define XSTAT_RX_1024_TO_1522_PACKETS "rx_size_1024_to_1522_packets"
114 #define XSTAT_RX_1523_TO_MAX_PACKETS "rx_size_1523_to_max_packets"
115
116 #define XSTAT_TX_64_PACKETS "tx_size_64_packets"
117 #define XSTAT_TX_65_TO_127_PACKETS "tx_size_65_to_127_packets"
118 #define XSTAT_TX_128_TO_255_PACKETS "tx_size_128_to_255_packets"
119 #define XSTAT_TX_256_TO_511_PACKETS "tx_size_256_to_511_packets"
120 #define XSTAT_TX_512_TO_1023_PACKETS "tx_size_512_to_1023_packets"
121 #define XSTAT_TX_1024_TO_1522_PACKETS "tx_size_1024_to_1522_packets"
122 #define XSTAT_TX_1523_TO_MAX_PACKETS "tx_size_1523_to_max_packets"
123
124 #define XSTAT_TX_MULTICAST_PACKETS "tx_multicast_packets"
125 #define XSTAT_RX_BROADCAST_PACKETS "rx_broadcast_packets"
126 #define XSTAT_TX_BROADCAST_PACKETS "tx_broadcast_packets"
127 #define XSTAT_RX_UNDERSIZED_ERRORS "rx_undersized_errors"
128 #define XSTAT_RX_OVERSIZE_ERRORS "rx_oversize_errors"
129 #define XSTAT_RX_FRAGMENTED_ERRORS "rx_fragmented_errors"
130 #define XSTAT_RX_JABBER_ERRORS "rx_jabber_errors"
131
132 #define SOCKET0 0
133
134 #define NIC_PORT_RX_Q_SIZE 2048  /* Size of physical NIC RX queue; max (n+32 <= 4096). */
135 #define NIC_PORT_TX_Q_SIZE 2048  /* Size of physical NIC TX queue; max (n+32 <= 4096). */
136
137 #define OVS_VHOST_MAX_QUEUE_NUM 1024 /* Maximum number of vHost TX queues. */
138 #define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
139 #define OVS_VHOST_QUEUE_DISABLED (-2) /* Queue was disabled by guest and not
140 * yet mapped to another queue. */
141
142 #ifdef VHOST_CUSE
143 static char *cuse_dev_name = NULL; /* Character device cuse_dev_name. */
144 #endif
145 static char *vhost_sock_dir = NULL; /* Location of vhost-user sockets */
146
147 #define VHOST_ENQ_RETRY_NUM 8
148 #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
149
150 static const struct rte_eth_conf port_conf = {
151 .rxmode = {
152 .mq_mode = ETH_MQ_RX_RSS,
153 .split_hdr_size = 0,
154 .header_split = 0, /* Header Split disabled */
155 .hw_ip_checksum = 0, /* IP checksum offload disabled */
156 .hw_vlan_filter = 0, /* VLAN filtering disabled */
157 .jumbo_frame = 0, /* Jumbo Frame Support disabled */
158 .hw_strip_crc = 0,
159 },
160 .rx_adv_conf = {
161 .rss_conf = {
162 .rss_key = NULL,
163 .rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
164 },
165 },
166 .txmode = {
167 .mq_mode = ETH_MQ_TX_NONE,
168 },
169 };
170
171 enum { DPDK_RING_SIZE = 256 };
172 BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
173 enum { DRAIN_TSC = 200000ULL };
174
175 enum dpdk_dev_type {
176 DPDK_DEV_ETH = 0,
177 DPDK_DEV_VHOST = 1,
178 };
179
180 static int rte_eal_init_ret = ENODEV;
181
182 static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;
183
184 /* Quality of Service */
185
186 /* An instance of a QoS configuration. Always associated with a particular
187 * network device.
188 *
189 * Each QoS implementation subclasses this with whatever additional data it
190 * needs.
191 */
192 struct qos_conf {
193 const struct dpdk_qos_ops *ops;
194 };
195
196 /* A particular implementation of dpdk QoS operations.
197 *
198 * The functions below return 0 if successful or a positive errno value on
199 * failure, except where otherwise noted. All of them must be provided, except
200 * where otherwise noted.
201 */
202 struct dpdk_qos_ops {
203
204 /* Name of the QoS type */
205 const char *qos_name;
206
207 /* Called to construct the QoS implementation on 'netdev'. The
208 * implementation should make the appropriate calls to configure QoS
209 * according to 'details'. The implementation may assume that any current
210 * QoS configuration already installed should be destroyed before
211 * constructing the new configuration.
212 *
213 * The contents of 'details' should be documented as valid for 'ovs_name'
214 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
215 * (which is built as ovs-vswitchd.conf.db(8)).
216 *
217 * This function must return 0 if and only if it sets 'netdev->qos_conf'
218 * to an initialized 'struct qos_conf'.
219 *
220 * For all QoS implementations it should always be non-null.
221 */
222 int (*qos_construct)(struct netdev *netdev, const struct smap *details);
223
224 /* Destroys the data structures allocated by the implementation as part of
225  * 'qos_conf'.
226 *
227 * For all QoS implementations it should always be non-null.
228 */
229 void (*qos_destruct)(struct netdev *netdev, struct qos_conf *conf);
230
231 /* Retrieves details of 'netdev->qos_conf' configuration into 'details'.
232 *
233 * The contents of 'details' should be documented as valid for 'ovs_name'
234 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
235 * (which is built as ovs-vswitchd.conf.db(8)).
236 */
237 int (*qos_get)(const struct netdev *netdev, struct smap *details);
238
239 /* Reconfigures 'netdev->qos_conf' according to 'details', performing any
240 * required calls to complete the reconfiguration.
241 *
242 * The contents of 'details' should be documented as valid for 'ovs_name'
243 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
244 * (which is built as ovs-vswitchd.conf.db(8)).
245 *
246 * This function may be null if 'qos_conf' is not configurable.
247 */
248 int (*qos_set)(struct netdev *netdev, const struct smap *details);
249
250 /* Modify an array of rte_mbufs. The modification is specific to
251 * each qos implementation.
252 *
253  * The function should take an array of mbufs and an int representing
254 * the current number of mbufs present in the array.
255 *
256 * After the function has performed a qos modification to the array of
257 * mbufs it returns an int representing the number of mbufs now present in
258  * the array. This value can then be passed to the port send function
259 * along with the modified array for transmission.
260 *
261 * For all QoS implementations it should always be non-null.
262 */
263 int (*qos_run)(struct netdev *netdev, struct rte_mbuf **pkts,
264 int pkt_cnt);
265 };
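/*
 * Illustrative sketch (not part of this file): the simplest conforming
 * 'qos_run' callback is a pass-through that keeps every mbuf and returns
 * the unchanged count:
 *
 *     static int
 *     noop_qos_run(struct netdev *netdev OVS_UNUSED,
 *                  struct rte_mbuf **pkts OVS_UNUSED, int pkt_cnt)
 *     {
 *         return pkt_cnt;
 *     }
 *
 * A real implementation, such as the egress policer, frees the mbufs it
 * drops with rte_pktmbuf_free(), compacts the surviving mbufs to the front
 * of 'pkts' and returns the new count.
 */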
266
267 /* dpdk_qos_ops for each type of user space QoS implementation */
268 static const struct dpdk_qos_ops egress_policer_ops;
269
270 /*
271 * Array of dpdk_qos_ops, contains pointer to all supported QoS
272 * operations.
273 */
274 static const struct dpdk_qos_ops *const qos_confs[] = {
275 &egress_policer_ops,
276 NULL
277 };
278
279 /* Contains all 'struct dpdk_dev's. */
280 static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
281 = OVS_LIST_INITIALIZER(&dpdk_list);
282
283 static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mutex)
284 = OVS_LIST_INITIALIZER(&dpdk_mp_list);
285
286 /* This mutex must be used by non pmd threads when allocating or freeing
287 * mbufs through mempools. */
288 static struct ovs_mutex nonpmd_mempool_mutex = OVS_MUTEX_INITIALIZER;
289
290 struct dpdk_mp {
291 struct rte_mempool *mp;
292 int mtu;
293 int socket_id;
294 int refcount;
295 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
296 };
297
298 /* There should be one 'struct dpdk_tx_queue' created for
299 * each cpu core. */
300 struct dpdk_tx_queue {
301 rte_spinlock_t tx_lock; /* Protects the members and the NIC queue
302 * from concurrent access. It is used only
303 * if the queue is shared among different
304 * pmd threads (see 'concurrent_txq'). */
305 int map; /* Mapping of configured vhost-user queues
306 * to enabled by guest. */
307 };
308
309 /* DPDK has no way to remove DPDK ring Ethernet devices,
310    so we have to keep them around once they've been created.
311  */
312
313 static struct ovs_list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
314 = OVS_LIST_INITIALIZER(&dpdk_ring_list);
315
316 struct dpdk_ring {
317 /* For the client rings */
318 struct rte_ring *cring_tx;
319 struct rte_ring *cring_rx;
320 unsigned int user_port_id; /* User given port no, parsed from port name */
321 int eth_port_id; /* ethernet device port id */
322 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
323 };
324
325 struct ingress_policer {
326 struct rte_meter_srtcm_params app_srtcm_params;
327 struct rte_meter_srtcm in_policer;
328 rte_spinlock_t policer_lock;
329 };
330
331 struct netdev_dpdk {
332 struct netdev up;
333 int port_id;
334 int max_packet_len;
335 enum dpdk_dev_type type;
336
337 struct dpdk_tx_queue *tx_q;
338
339 struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
340
341 struct dpdk_mp *dpdk_mp;
342 int mtu;
343 int socket_id;
344 int buf_size;
345 struct netdev_stats stats;
346 /* Protects stats */
347 rte_spinlock_t stats_lock;
348
349 struct eth_addr hwaddr;
350 enum netdev_flags flags;
351
352 struct rte_eth_link link;
353 int link_reset_cnt;
354
355 /* virtio identifier for vhost devices */
356 ovsrcu_index vid;
357
358 /* True if vHost device is 'up' and has been reconfigured at least once */
359 bool vhost_reconfigured;
360
361 /* Identifier used to distinguish vhost devices from each other. It does
362 * not change during the lifetime of a struct netdev_dpdk. It can be read
363 * without holding any mutex. */
364 const char vhost_id[PATH_MAX];
365
366 /* In dpdk_list. */
367 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
368
369 /* QoS configuration and lock for the device */
370 struct qos_conf *qos_conf;
371 rte_spinlock_t qos_lock;
372
373 /* The following properties cannot be changed when a device is running,
374 * so we remember the request and update them next time
375 * netdev_dpdk*_reconfigure() is called */
376 int requested_n_txq;
377 int requested_n_rxq;
378
379 /* Socket ID detected when vHost device is brought up */
380 int requested_socket_id;
381
382 /* Ingress Policer */
383 OVSRCU_TYPE(struct ingress_policer *) ingress_policer;
384 uint32_t policer_rate;
385 uint32_t policer_burst;
386
387 /* DPDK-ETH Flow control */
388 struct rte_eth_fc_conf fc_conf;
389 };
390
391 struct netdev_rxq_dpdk {
392 struct netdev_rxq up;
393 int port_id;
394 };
395
396 static bool dpdk_thread_is_pmd(void);
397
398 static int netdev_dpdk_construct(struct netdev *);
399
400 int netdev_dpdk_get_vid(const struct netdev_dpdk *dev);
401
402 struct ingress_policer *
403 netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);
404
405 static bool
406 is_dpdk_class(const struct netdev_class *class)
407 {
408 return class->construct == netdev_dpdk_construct;
409 }
410
411 /* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
412 * aligned at 1k or less. If a declared mbuf size is not a multiple of this
413  * value, insufficient buffers are allocated to accommodate the packet in its
414 * entirety. Furthermore, certain drivers need to ensure that there is also
415 * sufficient space in the Rx buffer to accommodate two VLAN tags (for QinQ
416 * frames). If the RX buffer is too small, then the driver enables scatter RX
417 * behaviour, which reduces performance. To prevent this, use a buffer size that
418 * is closest to 'mtu', but which satisfies the aforementioned criteria.
419 */
420 static uint32_t
421 dpdk_buf_size(int mtu)
422 {
423 return ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) + RTE_PKTMBUF_HEADROOM),
424 NETDEV_DPDK_MBUF_ALIGN);
425 }
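/*
 * Worked example (illustrative, assuming the default RTE_PKTMBUF_HEADROOM
 * of 128 bytes): for mtu = 1500, MTU_TO_MAX_FRAME_LEN(1500) = 1526, so
 * dpdk_buf_size(1500) = ROUND_UP(1526 + 128, 1024) = 2048.  The adjusted
 * mtu handed to dpdk_mp_get() in netdev_dpdk_init() is then
 * FRAME_LEN_TO_MTU(2048) = 2030.
 */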
426
427 /* XXX: use DPDK malloc for the entire OVS. In fact, huge pages should be
428  * used for all the other segments: data, bss and text. */
429
430 static void *
431 dpdk_rte_mzalloc(size_t sz)
432 {
433 void *ptr;
434
435 ptr = rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);
436 if (ptr == NULL) {
437 out_of_memory();
438 }
439 return ptr;
440 }
441
442 /* XXX this function should be called only by pmd threads (or by non pmd
443 * threads holding the nonpmd_mempool_mutex) */
444 void
445 free_dpdk_buf(struct dp_packet *p)
446 {
447 struct rte_mbuf *pkt = (struct rte_mbuf *) p;
448
449 rte_pktmbuf_free(pkt);
450 }
451
452 static void
453 ovs_rte_pktmbuf_init(struct rte_mempool *mp,
454 void *opaque_arg OVS_UNUSED,
455 void *_m,
456 unsigned i OVS_UNUSED)
457 {
458 struct rte_mbuf *m = _m;
459
460 rte_pktmbuf_init(mp, opaque_arg, _m, i);
461
462 dp_packet_init_dpdk((struct dp_packet *) m, m->buf_len);
463 }
464
465 static struct dpdk_mp *
466 dpdk_mp_get(int socket_id, int mtu) OVS_REQUIRES(dpdk_mutex)
467 {
468 struct dpdk_mp *dmp = NULL;
469 char mp_name[RTE_MEMPOOL_NAMESIZE];
470 unsigned mp_size;
471 struct rte_pktmbuf_pool_private mbp_priv;
472
473 LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
474 if (dmp->socket_id == socket_id && dmp->mtu == mtu) {
475 dmp->refcount++;
476 return dmp;
477 }
478 }
479
480 dmp = dpdk_rte_mzalloc(sizeof *dmp);
481 dmp->socket_id = socket_id;
482 dmp->mtu = mtu;
483 dmp->refcount = 1;
484 mbp_priv.mbuf_data_room_size = MBUF_SIZE(mtu) - sizeof(struct dp_packet);
485 mbp_priv.mbuf_priv_size = sizeof (struct dp_packet) -
486 sizeof (struct rte_mbuf);
487
488 mp_size = MAX_NB_MBUF;
489 do {
490 if (snprintf(mp_name, RTE_MEMPOOL_NAMESIZE, "ovs_mp_%d_%d_%u",
491 dmp->mtu, dmp->socket_id, mp_size) < 0) {
492 return NULL;
493 }
494
495 dmp->mp = rte_mempool_create(mp_name, mp_size, MBUF_SIZE(mtu),
496 MP_CACHE_SZ,
497 sizeof(struct rte_pktmbuf_pool_private),
498 rte_pktmbuf_pool_init, &mbp_priv,
499 ovs_rte_pktmbuf_init, NULL,
500 socket_id, 0);
501 } while (!dmp->mp && rte_errno == ENOMEM && (mp_size /= 2) >= MIN_NB_MBUF);
502
503 if (dmp->mp == NULL) {
504 return NULL;
505 } else {
506 VLOG_DBG("Allocated \"%s\" mempool with %u mbufs", mp_name, mp_size );
507 }
508
509 ovs_list_push_back(&dpdk_mp_list, &dmp->list_node);
510 return dmp;
511 }
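/*
 * Usage example (illustrative): dpdk_mp_get(0, 2030) first looks for an
 * existing pool with that socket/MTU pair and otherwise tries to create
 * "ovs_mp_2030_0_262144" with MAX_NB_MBUF (262144) mbufs.  If that fails
 * with ENOMEM, it retries with 131072, 65536, 32768 and finally 16384
 * mbufs before giving up.
 */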
512
513 static void
514 dpdk_mp_put(struct dpdk_mp *dmp) OVS_REQUIRES(dpdk_mutex)
515 {
516 if (!dmp) {
517 return;
518 }
519
520 ovs_assert(dmp->refcount);
521
522 if (!--dmp->refcount) {
523 ovs_list_remove(&dmp->list_node);
524 rte_mempool_free(dmp->mp);
525 }
526 }
527
528 static void
529 check_link_status(struct netdev_dpdk *dev)
530 {
531 struct rte_eth_link link;
532
533 rte_eth_link_get_nowait(dev->port_id, &link);
534
535 if (dev->link.link_status != link.link_status) {
536 netdev_change_seq_changed(&dev->up);
537
538 dev->link_reset_cnt++;
539 dev->link = link;
540 if (dev->link.link_status) {
541 VLOG_DBG_RL(&rl, "Port %d Link Up - speed %u Mbps - %s",
542 dev->port_id, (unsigned)dev->link.link_speed,
543 (dev->link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
544 ("full-duplex") : ("half-duplex"));
545 } else {
546 VLOG_DBG_RL(&rl, "Port %d Link Down", dev->port_id);
547 }
548 }
549 }
550
551 static void *
552 dpdk_watchdog(void *dummy OVS_UNUSED)
553 {
554 struct netdev_dpdk *dev;
555
556 pthread_detach(pthread_self());
557
558 for (;;) {
559 ovs_mutex_lock(&dpdk_mutex);
560 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
561 ovs_mutex_lock(&dev->mutex);
562 if (dev->type == DPDK_DEV_ETH) {
563 check_link_status(dev);
564 }
565 ovs_mutex_unlock(&dev->mutex);
566 }
567 ovs_mutex_unlock(&dpdk_mutex);
568 xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
569 }
570
571 return NULL;
572 }
573
574 static int
575 dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
576 {
577 int diag = 0;
578 int i;
579
580 /* A device may report more queues than it makes available (this has
581 * been observed for Intel xl710, which reserves some of them for
582 * SRIOV): rte_eth_*_queue_setup will fail if a queue is not
583 * available. When this happens we can retry the configuration
584  * and request fewer queues. */
585 while (n_rxq && n_txq) {
586 if (diag) {
587 VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
588 }
589
590 diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &port_conf);
591 if (diag) {
592 break;
593 }
594
595 for (i = 0; i < n_txq; i++) {
596 diag = rte_eth_tx_queue_setup(dev->port_id, i, NIC_PORT_TX_Q_SIZE,
597 dev->socket_id, NULL);
598 if (diag) {
599 VLOG_INFO("Interface %s txq(%d) setup error: %s",
600 dev->up.name, i, rte_strerror(-diag));
601 break;
602 }
603 }
604
605 if (i != n_txq) {
606 /* Retry with fewer tx queues. */
607 n_txq = i;
608 continue;
609 }
610
611 for (i = 0; i < n_rxq; i++) {
612 diag = rte_eth_rx_queue_setup(dev->port_id, i, NIC_PORT_RX_Q_SIZE,
613 dev->socket_id, NULL,
614 dev->dpdk_mp->mp);
615 if (diag) {
616 VLOG_INFO("Interface %s rxq(%d) setup error: %s",
617 dev->up.name, i, rte_strerror(-diag));
618 break;
619 }
620 }
621
622 if (i != n_rxq) {
623 /* Retry with fewer rx queues. */
624 n_rxq = i;
625 continue;
626 }
627
628 dev->up.n_rxq = n_rxq;
629 dev->up.n_txq = n_txq;
630
631 return 0;
632 }
633
634 return diag;
635 }
636
637 static void
638 dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex)
639 {
640 if (rte_eth_dev_flow_ctrl_set(dev->port_id, &dev->fc_conf)) {
641 VLOG_WARN("Failed to enable flow control on device %d", dev->port_id);
642 }
643 }
644
645 static int
646 dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
647 {
648 struct rte_pktmbuf_pool_private *mbp_priv;
649 struct rte_eth_dev_info info;
650 struct ether_addr eth_addr;
651 int diag;
652 int n_rxq, n_txq;
653
654 if (dev->port_id < 0 || dev->port_id >= rte_eth_dev_count()) {
655 return ENODEV;
656 }
657
658 rte_eth_dev_info_get(dev->port_id, &info);
659
660 n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
661 n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
662
663 diag = dpdk_eth_dev_queue_setup(dev, n_rxq, n_txq);
664 if (diag) {
665 VLOG_ERR("Interface %s(rxq:%d txq:%d) configure error: %s",
666 dev->up.name, n_rxq, n_txq, rte_strerror(-diag));
667 return -diag;
668 }
669
670 diag = rte_eth_dev_start(dev->port_id);
671 if (diag) {
672 VLOG_ERR("Interface %s start error: %s", dev->up.name,
673 rte_strerror(-diag));
674 return -diag;
675 }
676
677 rte_eth_promiscuous_enable(dev->port_id);
678 rte_eth_allmulticast_enable(dev->port_id);
679
680 memset(&eth_addr, 0x0, sizeof(eth_addr));
681 rte_eth_macaddr_get(dev->port_id, &eth_addr);
682 VLOG_INFO_RL(&rl, "Port %d: "ETH_ADDR_FMT"",
683 dev->port_id, ETH_ADDR_BYTES_ARGS(eth_addr.addr_bytes));
684
685 memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
686 rte_eth_link_get_nowait(dev->port_id, &dev->link);
687
688 mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
689 dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;
690
691 dev->flags = NETDEV_UP | NETDEV_PROMISC;
692
693 /* Get the Flow control configuration for DPDK-ETH */
694 diag = rte_eth_dev_flow_ctrl_get(dev->port_id, &dev->fc_conf);
695 if (diag) {
696 VLOG_DBG("cannot get flow control parameters on port=%d, err=%d",
697 dev->port_id, diag);
698 }
699
700 return 0;
701 }
702
703 static struct netdev_dpdk *
704 netdev_dpdk_cast(const struct netdev *netdev)
705 {
706 return CONTAINER_OF(netdev, struct netdev_dpdk, up);
707 }
708
709 static struct netdev *
710 netdev_dpdk_alloc(void)
711 {
712 struct netdev_dpdk *dev;
713
714 if (!rte_eal_init_ret) { /* Only after successful initialization */
715 dev = dpdk_rte_mzalloc(sizeof *dev);
716 if (dev) {
717 return &dev->up;
718 }
719 }
720 return NULL;
721 }
722
723 static void
724 netdev_dpdk_alloc_txq(struct netdev_dpdk *dev, unsigned int n_txqs)
725 {
726 unsigned i;
727
728 dev->tx_q = dpdk_rte_mzalloc(n_txqs * sizeof *dev->tx_q);
729 for (i = 0; i < n_txqs; i++) {
730 /* Initialize map for vhost devices. */
731 dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
732 rte_spinlock_init(&dev->tx_q[i].tx_lock);
733 }
734 }
735
736 static int
737 netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
738 enum dpdk_dev_type type)
739 OVS_REQUIRES(dpdk_mutex)
740 {
741 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
742 int sid;
743 int err = 0;
744 uint32_t buf_size;
745
746 ovs_mutex_init(&dev->mutex);
747 ovs_mutex_lock(&dev->mutex);
748
749 rte_spinlock_init(&dev->stats_lock);
750
751 /* If the 'sid' is negative, it means that the kernel failed
752  * to obtain the PCI NUMA info. In that situation, always
753 * use 'SOCKET0'. */
754 if (type == DPDK_DEV_ETH) {
755 sid = rte_eth_dev_socket_id(port_no);
756 } else {
757 sid = rte_lcore_to_socket_id(rte_get_master_lcore());
758 }
759
760 dev->socket_id = sid < 0 ? SOCKET0 : sid;
761 dev->requested_socket_id = dev->socket_id;
762 dev->port_id = port_no;
763 dev->type = type;
764 dev->flags = 0;
765 dev->mtu = ETHER_MTU;
766 dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
767 ovsrcu_index_init(&dev->vid, -1);
768 dev->vhost_reconfigured = false;
769
770 buf_size = dpdk_buf_size(dev->mtu);
771 dev->dpdk_mp = dpdk_mp_get(dev->socket_id, FRAME_LEN_TO_MTU(buf_size));
772 if (!dev->dpdk_mp) {
773 err = ENOMEM;
774 goto unlock;
775 }
776
777 /* Initialise QoS configuration to NULL and qos lock to unlocked */
778 dev->qos_conf = NULL;
779 rte_spinlock_init(&dev->qos_lock);
780
781 /* Initialise rcu pointer for ingress policer to NULL */
782 ovsrcu_init(&dev->ingress_policer, NULL);
783 dev->policer_rate = 0;
784 dev->policer_burst = 0;
785
786 netdev->n_rxq = NR_QUEUE;
787 netdev->n_txq = NR_QUEUE;
788 dev->requested_n_rxq = netdev->n_rxq;
789 dev->requested_n_txq = netdev->n_txq;
790
791 /* Initialize the flow control to NULL */
792 memset(&dev->fc_conf, 0, sizeof dev->fc_conf);
793 if (type == DPDK_DEV_ETH) {
794 err = dpdk_eth_dev_init(dev);
795 if (err) {
796 goto unlock;
797 }
798 netdev_dpdk_alloc_txq(dev, netdev->n_txq);
799 } else {
800 netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);
801 /* Enable DPDK_DEV_VHOST device and set promiscuous mode flag. */
802 dev->flags = NETDEV_UP | NETDEV_PROMISC;
803 }
804
805 ovs_list_push_back(&dpdk_list, &dev->list_node);
806
807 unlock:
808 ovs_mutex_unlock(&dev->mutex);
809 return err;
810 }
811
812 /* dev_name must be the prefix followed by an unsigned decimal number.
813 * (no leading + or - signs are allowed) */
814 static int
815 dpdk_dev_parse_name(const char dev_name[], const char prefix[],
816 unsigned int *port_no)
817 {
818 const char *cport;
819
820 if (strncmp(dev_name, prefix, strlen(prefix))) {
821 return ENODEV;
822 }
823
824 cport = dev_name + strlen(prefix);
825
826 if (str_to_uint(cport, 10, port_no)) {
827 return 0;
828 } else {
829 return ENODEV;
830 }
831 }
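/*
 * Examples (illustrative): dpdk_dev_parse_name("dpdk7", "dpdk", &port_no)
 * returns 0 and sets port_no to 7; "eth0" does not match the prefix and
 * "dpdkx" has no trailing number, so both return ENODEV.
 */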
832
833 static int
834 vhost_construct_helper(struct netdev *netdev) OVS_REQUIRES(dpdk_mutex)
835 {
836 if (rte_eal_init_ret) {
837 return rte_eal_init_ret;
838 }
839
840 return netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST);
841 }
842
843 static int
844 netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)
845 {
846 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
847 int err;
848
849 if (rte_eal_init_ret) {
850 return rte_eal_init_ret;
851 }
852
853 ovs_mutex_lock(&dpdk_mutex);
854 strncpy(CONST_CAST(char *, dev->vhost_id), netdev->name,
855 sizeof dev->vhost_id);
856 err = vhost_construct_helper(netdev);
857 ovs_mutex_unlock(&dpdk_mutex);
858 return err;
859 }
860
861 static int
862 netdev_dpdk_vhost_user_construct(struct netdev *netdev)
863 {
864 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
865 const char *name = netdev->name;
866 int err;
867 uint64_t flags = 0;
868
869 /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
870 * the file system. '/' or '\' would traverse directories, so they're not
871 * acceptable in 'name'. */
872 if (strchr(name, '/') || strchr(name, '\\')) {
873 VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
874 "A valid name must not include '/' or '\\'",
875 name);
876 return EINVAL;
877 }
878
879 if (rte_eal_init_ret) {
880 return rte_eal_init_ret;
881 }
882
883 ovs_mutex_lock(&dpdk_mutex);
884 /* Take the name of the vhost-user port and append it to the location where
885 * the socket is to be created, then register the socket.
886 */
887 snprintf(CONST_CAST(char *, dev->vhost_id), sizeof dev->vhost_id, "%s/%s",
888 vhost_sock_dir, name);
889
890 err = rte_vhost_driver_register(dev->vhost_id, flags);
891 if (err) {
892 VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
893 dev->vhost_id);
894 } else {
895 fatal_signal_add_file_to_unlink(dev->vhost_id);
896 VLOG_INFO("Socket %s created for vhost-user port %s\n",
897 dev->vhost_id, name);
898 err = vhost_construct_helper(netdev);
899 }
900
901 ovs_mutex_unlock(&dpdk_mutex);
902 return err;
903 }
904
905 static int
906 netdev_dpdk_construct(struct netdev *netdev)
907 {
908 unsigned int port_no;
909 int err;
910
911 if (rte_eal_init_ret) {
912 return rte_eal_init_ret;
913 }
914
915 /* Names always start with "dpdk" */
916 err = dpdk_dev_parse_name(netdev->name, "dpdk", &port_no);
917 if (err) {
918 return err;
919 }
920
921 ovs_mutex_lock(&dpdk_mutex);
922 err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_ETH);
923 ovs_mutex_unlock(&dpdk_mutex);
924 return err;
925 }
926
927 static void
928 netdev_dpdk_destruct(struct netdev *netdev)
929 {
930 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
931
932 ovs_mutex_lock(&dpdk_mutex);
933 ovs_mutex_lock(&dev->mutex);
934
935 rte_eth_dev_stop(dev->port_id);
936 free(ovsrcu_get_protected(struct ingress_policer *,
937 &dev->ingress_policer));
938
939 rte_free(dev->tx_q);
940 ovs_list_remove(&dev->list_node);
941 dpdk_mp_put(dev->dpdk_mp);
942
943 ovs_mutex_unlock(&dev->mutex);
944 ovs_mutex_unlock(&dpdk_mutex);
945 }
946
947 /* rte_vhost_driver_unregister() can call back destroy_device(), which will
948 * try to acquire 'dpdk_mutex' and possibly 'dev->mutex'. To avoid a
949 * deadlock, none of the mutexes must be held while calling this function. */
950 static int
951 dpdk_vhost_driver_unregister(struct netdev_dpdk *dev)
952 OVS_EXCLUDED(dpdk_mutex)
953 OVS_EXCLUDED(dev->mutex)
954 {
955 return rte_vhost_driver_unregister(dev->vhost_id);
956 }
957
958 static void
959 netdev_dpdk_vhost_destruct(struct netdev *netdev)
960 {
961 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
962
963 ovs_mutex_lock(&dpdk_mutex);
964 ovs_mutex_lock(&dev->mutex);
965
966 /* Guest becomes an orphan if still attached. */
967 if (netdev_dpdk_get_vid(dev) >= 0) {
968 VLOG_ERR("Removing port '%s' while vhost device still attached.",
969 netdev->name);
970 VLOG_ERR("To restore connectivity after re-adding of port, VM on socket"
971 " '%s' must be restarted.",
972 dev->vhost_id);
973 }
974
975 free(ovsrcu_get_protected(struct ingress_policer *,
976 &dev->ingress_policer));
977
978 rte_free(dev->tx_q);
979 ovs_list_remove(&dev->list_node);
980 dpdk_mp_put(dev->dpdk_mp);
981
982 ovs_mutex_unlock(&dev->mutex);
983 ovs_mutex_unlock(&dpdk_mutex);
984
985 if (dpdk_vhost_driver_unregister(dev)) {
986 VLOG_ERR("Unable to remove vhost-user socket %s", dev->vhost_id);
987 } else {
988 fatal_signal_remove_file_to_unlink(dev->vhost_id);
989 }
990 }
991
992 static void
993 netdev_dpdk_dealloc(struct netdev *netdev)
994 {
995 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
996
997 rte_free(dev);
998 }
999
1000 static int
1001 netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
1002 {
1003 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1004
1005 ovs_mutex_lock(&dev->mutex);
1006
1007 smap_add_format(args, "requested_rx_queues", "%d", dev->requested_n_rxq);
1008 smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq);
1009 smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq);
1010 smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq);
1011 ovs_mutex_unlock(&dev->mutex);
1012
1013 return 0;
1014 }
1015
1016 static int
1017 netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args)
1018 {
1019 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1020 int new_n_rxq;
1021
1022 ovs_mutex_lock(&dev->mutex);
1023 new_n_rxq = MAX(smap_get_int(args, "n_rxq", dev->requested_n_rxq), 1);
1024 if (new_n_rxq != dev->requested_n_rxq) {
1025 dev->requested_n_rxq = new_n_rxq;
1026 netdev_request_reconfigure(netdev);
1027 }
1028
1029 /* Flow control configuration for DPDK Ethernet ports. */
1030 if (dev->type == DPDK_DEV_ETH) {
1031 bool rx_fc_en = false;
1032 bool tx_fc_en = false;
1033 enum rte_eth_fc_mode fc_mode_set[2][2] =
1034 {{RTE_FC_NONE, RTE_FC_TX_PAUSE},
1035 {RTE_FC_RX_PAUSE, RTE_FC_FULL}
1036 };
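/* The matrix is indexed as fc_mode_set[tx_fc_en][rx_fc_en]:
 *   tx=0, rx=0 -> RTE_FC_NONE       tx=0, rx=1 -> RTE_FC_TX_PAUSE
 *   tx=1, rx=0 -> RTE_FC_RX_PAUSE   tx=1, rx=1 -> RTE_FC_FULL */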
1037 rx_fc_en = smap_get_bool(args, "rx-flow-ctrl", false);
1038 tx_fc_en = smap_get_bool(args, "tx-flow-ctrl", false);
1039 dev->fc_conf.autoneg = smap_get_bool(args, "flow-ctrl-autoneg", false);
1040 dev->fc_conf.mode = fc_mode_set[tx_fc_en][rx_fc_en];
1041
1042 dpdk_eth_flow_ctrl_setup(dev);
1043 }
1044 ovs_mutex_unlock(&dev->mutex);
1045
1046 return 0;
1047 }
1048
1049 static int
1050 netdev_dpdk_get_numa_id(const struct netdev *netdev)
1051 {
1052 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1053
1054 return dev->socket_id;
1055 }
1056
1057 /* Sets the number of tx queues for the dpdk interface. */
1058 static int
1059 netdev_dpdk_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
1060 {
1061 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1062
1063 ovs_mutex_lock(&dev->mutex);
1064
1065 if (dev->requested_n_txq == n_txq) {
1066 goto out;
1067 }
1068
1069 dev->requested_n_txq = n_txq;
1070 netdev_request_reconfigure(netdev);
1071
1072 out:
1073 ovs_mutex_unlock(&dev->mutex);
1074 return 0;
1075 }
1076
1077 static struct netdev_rxq *
1078 netdev_dpdk_rxq_alloc(void)
1079 {
1080 struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);
1081
1082 return &rx->up;
1083 }
1084
1085 static struct netdev_rxq_dpdk *
1086 netdev_rxq_dpdk_cast(const struct netdev_rxq *rxq)
1087 {
1088 return CONTAINER_OF(rxq, struct netdev_rxq_dpdk, up);
1089 }
1090
1091 static int
1092 netdev_dpdk_rxq_construct(struct netdev_rxq *rxq)
1093 {
1094 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
1095 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
1096
1097 ovs_mutex_lock(&dev->mutex);
1098 rx->port_id = dev->port_id;
1099 ovs_mutex_unlock(&dev->mutex);
1100
1101 return 0;
1102 }
1103
1104 static void
1105 netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq OVS_UNUSED)
1106 {
1107 }
1108
1109 static void
1110 netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
1111 {
1112 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
1113
1114 rte_free(rx);
1115 }
1116
1117 static inline void
1118 netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
1119 struct rte_mbuf **pkts, int cnt)
1120 {
1121 uint32_t nb_tx = 0;
1122
1123 while (nb_tx != cnt) {
1124 uint32_t ret;
1125
1126 ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx, cnt - nb_tx);
1127 if (!ret) {
1128 break;
1129 }
1130
1131 nb_tx += ret;
1132 }
1133
1134 if (OVS_UNLIKELY(nb_tx != cnt)) {
1135 /* free buffers, which we couldn't transmit, one at a time (each
1136 * packet could come from a different mempool) */
1137 int i;
1138
1139 for (i = nb_tx; i < cnt; i++) {
1140 rte_pktmbuf_free(pkts[i]);
1141 }
1142 rte_spinlock_lock(&dev->stats_lock);
1143 dev->stats.tx_dropped += cnt - nb_tx;
1144 rte_spinlock_unlock(&dev->stats_lock);
1145 }
1146 }
1147
1148 static inline bool
1149 netdev_dpdk_policer_pkt_handle(struct rte_meter_srtcm *meter,
1150 struct rte_mbuf *pkt, uint64_t time)
1151 {
1152 uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct ether_hdr);
1153
1154 return rte_meter_srtcm_color_blind_check(meter, time, pkt_len) ==
1155 e_RTE_METER_GREEN;
1156 }
1157
1158 static int
1159 netdev_dpdk_policer_run(struct rte_meter_srtcm *meter,
1160 struct rte_mbuf **pkts, int pkt_cnt)
1161 {
1162 int i = 0;
1163 int cnt = 0;
1164 struct rte_mbuf *pkt = NULL;
1165 uint64_t current_time = rte_rdtsc();
1166
1167 for (i = 0; i < pkt_cnt; i++) {
1168 pkt = pkts[i];
1169 /* Handle current packet */
1170 if (netdev_dpdk_policer_pkt_handle(meter, pkt, current_time)) {
1171 if (cnt != i) {
1172 pkts[cnt] = pkt;
1173 }
1174 cnt++;
1175 } else {
1176 rte_pktmbuf_free(pkt);
1177 }
1178 }
1179
1180 return cnt;
1181 }
1182
1183 static int
1184 ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,
1185 int pkt_cnt)
1186 {
1187 int cnt = 0;
1188
1189 rte_spinlock_lock(&policer->policer_lock);
1190 cnt = netdev_dpdk_policer_run(&policer->in_policer, pkts, pkt_cnt);
1191 rte_spinlock_unlock(&policer->policer_lock);
1192
1193 return cnt;
1194 }
1195
1196 static bool
1197 is_vhost_running(struct netdev_dpdk *dev)
1198 {
1199 return (netdev_dpdk_get_vid(dev) >= 0 && dev->vhost_reconfigured);
1200 }
1201
1202 static inline void
1203 netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
1204 unsigned int packet_size)
1205 {
1206 /* Hard-coded search for the size bucket. */
1207 if (packet_size < 256) {
1208 if (packet_size >= 128) {
1209 stats->rx_128_to_255_packets++;
1210 } else if (packet_size <= 64) {
1211 stats->rx_1_to_64_packets++;
1212 } else {
1213 stats->rx_65_to_127_packets++;
1214 }
1215 } else {
1216 if (packet_size >= 1523) {
1217 stats->rx_1523_to_max_packets++;
1218 } else if (packet_size >= 1024) {
1219 stats->rx_1024_to_1522_packets++;
1220 } else if (packet_size < 512) {
1221 stats->rx_256_to_511_packets++;
1222 } else {
1223 stats->rx_512_to_1023_packets++;
1224 }
1225 }
1226 }
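/*
 * Examples of the bucketing above (illustrative): sizes 60 and 64 count as
 * rx_1_to_64_packets, 65 as rx_65_to_127_packets, 1518 as
 * rx_1024_to_1522_packets, and 1523 or more as rx_1523_to_max_packets.
 */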
1227
1228 static inline void
1229 netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
1230 struct dp_packet **packets, int count,
1231 int dropped)
1232 {
1233 int i;
1234 unsigned int packet_size;
1235 struct dp_packet *packet;
1236
1237 stats->rx_packets += count;
1238 stats->rx_dropped += dropped;
1239 for (i = 0; i < count; i++) {
1240 packet = packets[i];
1241 packet_size = dp_packet_size(packet);
1242
1243 if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) {
1244 /* This only protects the following multicast counting from
1245 * too short packets, but it does not stop the packet from
1246 * further processing. */
1247 stats->rx_errors++;
1248 stats->rx_length_errors++;
1249 continue;
1250 }
1251
1252 netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);
1253
1254 struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);
1255 if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {
1256 stats->multicast++;
1257 }
1258
1259 stats->rx_bytes += packet_size;
1260 }
1261 }
1262
1263 /*
1264 * The receive path for the vhost port is the TX path out from guest.
1265 */
1266 static int
1267 netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
1268 struct dp_packet_batch *batch)
1269 {
1270 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
1271 int qid = rxq->queue_id;
1272 struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
1273 uint16_t nb_rx = 0;
1274 uint16_t dropped = 0;
1275
1276 if (OVS_UNLIKELY(!is_vhost_running(dev)
1277 || !(dev->flags & NETDEV_UP))) {
1278 return EAGAIN;
1279 }
1280
1281 nb_rx = rte_vhost_dequeue_burst(netdev_dpdk_get_vid(dev),
1282 qid * VIRTIO_QNUM + VIRTIO_TXQ,
1283 dev->dpdk_mp->mp,
1284 (struct rte_mbuf **) batch->packets,
1285 NETDEV_MAX_BURST);
1286 if (!nb_rx) {
1287 return EAGAIN;
1288 }
1289
1290 if (policer) {
1291 dropped = nb_rx;
1292 nb_rx = ingress_policer_run(policer,
1293 (struct rte_mbuf **) batch->packets,
1294 nb_rx);
1295 dropped -= nb_rx;
1296 }
1297
1298 rte_spinlock_lock(&dev->stats_lock);
1299 netdev_dpdk_vhost_update_rx_counters(&dev->stats, batch->packets,
1300 nb_rx, dropped);
1301 rte_spinlock_unlock(&dev->stats_lock);
1302
1303 batch->count = (int) nb_rx;
1304 return 0;
1305 }
1306
1307 static int
1308 netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch)
1309 {
1310 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
1311 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
1312 struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
1313 int nb_rx;
1314 int dropped = 0;
1315
1316 nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
1317 (struct rte_mbuf **) batch->packets,
1318 NETDEV_MAX_BURST);
1319 if (!nb_rx) {
1320 return EAGAIN;
1321 }
1322
1323 if (policer) {
1324 dropped = nb_rx;
1325 nb_rx = ingress_policer_run(policer,
1326 (struct rte_mbuf **)batch->packets,
1327 nb_rx);
1328 dropped -= nb_rx;
1329 }
1330
1331 /* Update stats to reflect dropped packets */
1332 if (OVS_UNLIKELY(dropped)) {
1333 rte_spinlock_lock(&dev->stats_lock);
1334 dev->stats.rx_dropped += dropped;
1335 rte_spinlock_unlock(&dev->stats_lock);
1336 }
1337
1338 batch->count = nb_rx;
1339
1340 return 0;
1341 }
1342
1343 static inline int
1344 netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
1345 int cnt)
1346 {
1347 struct netdev *netdev = &dev->up;
1348
1349 if (dev->qos_conf != NULL) {
1350 rte_spinlock_lock(&dev->qos_lock);
1351 if (dev->qos_conf != NULL) {
1352 cnt = dev->qos_conf->ops->qos_run(netdev, pkts, cnt);
1353 }
1354 rte_spinlock_unlock(&dev->qos_lock);
1355 }
1356
1357 return cnt;
1358 }
1359
1360 static inline void
1361 netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
1362 struct dp_packet **packets,
1363 int attempted,
1364 int dropped)
1365 {
1366 int i;
1367 int sent = attempted - dropped;
1368
1369 stats->tx_packets += sent;
1370 stats->tx_dropped += dropped;
1371
1372 for (i = 0; i < sent; i++) {
1373 stats->tx_bytes += dp_packet_size(packets[i]);
1374 }
1375 }
1376
1377 static void
1378 __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
1379 struct dp_packet **pkts, int cnt)
1380 {
1381 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1382 struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
1383 unsigned int total_pkts = cnt;
1384 unsigned int qos_pkts = 0;
1385 int i, retries = 0;
1386
1387 qid = dev->tx_q[qid % netdev->n_txq].map;
1388
1389 if (OVS_UNLIKELY(!is_vhost_running(dev) || qid < 0
1390 || !(dev->flags & NETDEV_UP))) {
1391 rte_spinlock_lock(&dev->stats_lock);
1392 dev->stats.tx_dropped+= cnt;
1393 rte_spinlock_unlock(&dev->stats_lock);
1394 goto out;
1395 }
1396
1397 rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
1398
1399     /* Check if QoS has been configured for the netdev. */
1400 cnt = netdev_dpdk_qos_run__(dev, cur_pkts, cnt);
1401 qos_pkts = total_pkts - cnt;
1402
1403 do {
1404 int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
1405 unsigned int tx_pkts;
1406
1407 tx_pkts = rte_vhost_enqueue_burst(netdev_dpdk_get_vid(dev),
1408 vhost_qid, cur_pkts, cnt);
1409 if (OVS_LIKELY(tx_pkts)) {
1410 /* Packets have been sent.*/
1411 cnt -= tx_pkts;
1412 /* Prepare for possible retry.*/
1413 cur_pkts = &cur_pkts[tx_pkts];
1414 } else {
1415 /* No packets sent - do not retry.*/
1416 break;
1417 }
1418 } while (cnt && (retries++ < VHOST_ENQ_RETRY_NUM));
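/*
 * Retry behaviour: after the first rte_vhost_enqueue_burst() call, up to
 * VHOST_ENQ_RETRY_NUM (8) further attempts are made as long as each call
 * enqueues at least one packet; an attempt that enqueues nothing ends the
 * loop, and whatever remains in 'cnt' is accounted as dropped below.
 */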
1419
1420 rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
1421
1422 rte_spinlock_lock(&dev->stats_lock);
1423 cnt += qos_pkts;
1424 netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts, cnt);
1425 rte_spinlock_unlock(&dev->stats_lock);
1426
1427 out:
1428 for (i = 0; i < total_pkts - qos_pkts; i++) {
1429 dp_packet_delete(pkts[i]);
1430 }
1431 }
1432
1433 /* Tx function. Transmit packets indefinitely */
1434 static void
1435 dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
1436 OVS_NO_THREAD_SAFETY_ANALYSIS
1437 {
1438 #if !defined(__CHECKER__) && !defined(_WIN32)
1439 const size_t PKT_ARRAY_SIZE = batch->count;
1440 #else
1441 /* Sparse or MSVC doesn't like variable length array. */
1442 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
1443 #endif
1444 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1445 struct rte_mbuf *mbufs[PKT_ARRAY_SIZE];
1446 int dropped = 0;
1447 int newcnt = 0;
1448 int i;
1449
1450 /* If we are on a non pmd thread we have to use the mempool mutex, because
1451 * every non pmd thread shares the same mempool cache */
1452
1453 if (!dpdk_thread_is_pmd()) {
1454 ovs_mutex_lock(&nonpmd_mempool_mutex);
1455 }
1456
1457 dp_packet_batch_apply_cutlen(batch);
1458
1459 for (i = 0; i < batch->count; i++) {
1460 int size = dp_packet_size(batch->packets[i]);
1461
1462 if (OVS_UNLIKELY(size > dev->max_packet_len)) {
1463 VLOG_WARN_RL(&rl, "Too big size %d max_packet_len %d",
1464 (int)size , dev->max_packet_len);
1465
1466 dropped++;
1467 continue;
1468 }
1469
1470 mbufs[newcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
1471
1472 if (!mbufs[newcnt]) {
1473 dropped += batch->count - i;
1474 break;
1475 }
1476
1477 /* We have to do a copy for now */
1478 memcpy(rte_pktmbuf_mtod(mbufs[newcnt], void *),
1479 dp_packet_data(batch->packets[i]), size);
1480
1481 rte_pktmbuf_data_len(mbufs[newcnt]) = size;
1482 rte_pktmbuf_pkt_len(mbufs[newcnt]) = size;
1483
1484 newcnt++;
1485 }
1486
1487 if (dev->type == DPDK_DEV_VHOST) {
1488 __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) mbufs,
1489 newcnt);
1490 } else {
1491 unsigned int qos_pkts = newcnt;
1492
1493 /* Check if QoS has been configured for this netdev. */
1494 newcnt = netdev_dpdk_qos_run__(dev, mbufs, newcnt);
1495
1496 dropped += qos_pkts - newcnt;
1497 netdev_dpdk_eth_tx_burst(dev, qid, mbufs, newcnt);
1498 }
1499
1500 if (OVS_UNLIKELY(dropped)) {
1501 rte_spinlock_lock(&dev->stats_lock);
1502 dev->stats.tx_dropped += dropped;
1503 rte_spinlock_unlock(&dev->stats_lock);
1504 }
1505
1506 if (!dpdk_thread_is_pmd()) {
1507 ovs_mutex_unlock(&nonpmd_mempool_mutex);
1508 }
1509 }
1510
1511 static int
1512 netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
1513 struct dp_packet_batch *batch,
1514 bool may_steal, bool concurrent_txq OVS_UNUSED)
1515 {
1516
1517 if (OVS_UNLIKELY(!may_steal || batch->packets[0]->source != DPBUF_DPDK)) {
1518 dpdk_do_tx_copy(netdev, qid, batch);
1519 dp_packet_delete_batch(batch, may_steal);
1520 } else {
1521 dp_packet_batch_apply_cutlen(batch);
1522 __netdev_dpdk_vhost_send(netdev, qid, batch->packets, batch->count);
1523 }
1524 return 0;
1525 }
1526
1527 static inline void
1528 netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
1529 struct dp_packet_batch *batch, bool may_steal,
1530 bool concurrent_txq)
1531 {
1532 if (OVS_UNLIKELY(concurrent_txq)) {
1533 qid = qid % dev->up.n_txq;
1534 rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
1535 }
1536
1537 if (OVS_UNLIKELY(!may_steal ||
1538 batch->packets[0]->source != DPBUF_DPDK)) {
1539 struct netdev *netdev = &dev->up;
1540
1541 dpdk_do_tx_copy(netdev, qid, batch);
1542 dp_packet_delete_batch(batch, may_steal);
1543 } else {
1544 int next_tx_idx = 0;
1545 int dropped = 0;
1546 unsigned int qos_pkts = 0;
1547 unsigned int temp_cnt = 0;
1548 int cnt = batch->count;
1549
1550 dp_packet_batch_apply_cutlen(batch);
1551
1552 for (int i = 0; i < cnt; i++) {
1553 int size = dp_packet_size(batch->packets[i]);
1554
1555 if (OVS_UNLIKELY(size > dev->max_packet_len)) {
1556 if (next_tx_idx != i) {
1557 temp_cnt = i - next_tx_idx;
1558 qos_pkts = temp_cnt;
1559
1560 temp_cnt = netdev_dpdk_qos_run__(dev,
1561 (struct rte_mbuf**)batch->packets,
1562 temp_cnt);
1563 dropped += qos_pkts - temp_cnt;
1564 netdev_dpdk_eth_tx_burst(dev, qid,
1565 (struct rte_mbuf **)&batch->packets[next_tx_idx],
1566 temp_cnt);
1567
1568 }
1569
1570 VLOG_WARN_RL(&rl, "Too big size %d max_packet_len %d",
1571 (int)size , dev->max_packet_len);
1572
1573 dp_packet_delete(batch->packets[i]);
1574 dropped++;
1575 next_tx_idx = i + 1;
1576 }
1577 }
1578 if (next_tx_idx != cnt) {
1579 cnt -= next_tx_idx;
1580 qos_pkts = cnt;
1581
1582 cnt = netdev_dpdk_qos_run__(dev,
1583 (struct rte_mbuf**)batch->packets, cnt);
1584 dropped += qos_pkts - cnt;
1585 netdev_dpdk_eth_tx_burst(dev, qid,
1586 (struct rte_mbuf **)&batch->packets[next_tx_idx],
1587 cnt);
1588 }
1589
1590 if (OVS_UNLIKELY(dropped)) {
1591 rte_spinlock_lock(&dev->stats_lock);
1592 dev->stats.tx_dropped += dropped;
1593 rte_spinlock_unlock(&dev->stats_lock);
1594 }
1595 }
1596
1597 if (OVS_UNLIKELY(concurrent_txq)) {
1598 rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
1599 }
1600 }
1601
1602 static int
1603 netdev_dpdk_eth_send(struct netdev *netdev, int qid,
1604 struct dp_packet_batch *batch, bool may_steal,
1605 bool concurrent_txq)
1606 {
1607 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1608
1609 netdev_dpdk_send__(dev, qid, batch, may_steal, concurrent_txq);
1610 return 0;
1611 }
1612
1613 static int
1614 netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
1615 {
1616 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1617
1618 ovs_mutex_lock(&dev->mutex);
1619 if (!eth_addr_equals(dev->hwaddr, mac)) {
1620 dev->hwaddr = mac;
1621 netdev_change_seq_changed(netdev);
1622 }
1623 ovs_mutex_unlock(&dev->mutex);
1624
1625 return 0;
1626 }
1627
1628 static int
1629 netdev_dpdk_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
1630 {
1631 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1632
1633 ovs_mutex_lock(&dev->mutex);
1634 *mac = dev->hwaddr;
1635 ovs_mutex_unlock(&dev->mutex);
1636
1637 return 0;
1638 }
1639
1640 static int
1641 netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
1642 {
1643 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1644
1645 ovs_mutex_lock(&dev->mutex);
1646 *mtup = dev->mtu;
1647 ovs_mutex_unlock(&dev->mutex);
1648
1649 return 0;
1650 }
1651
1652 static int
1653 netdev_dpdk_set_mtu(const struct netdev *netdev, int mtu)
1654 {
1655 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1656 int old_mtu, err, dpdk_mtu;
1657 struct dpdk_mp *old_mp;
1658 struct dpdk_mp *mp;
1659 uint32_t buf_size;
1660
1661 ovs_mutex_lock(&dpdk_mutex);
1662 ovs_mutex_lock(&dev->mutex);
1663 if (dev->mtu == mtu) {
1664 err = 0;
1665 goto out;
1666 }
1667
1668 buf_size = dpdk_buf_size(mtu);
1669 dpdk_mtu = FRAME_LEN_TO_MTU(buf_size);
1670
1671 mp = dpdk_mp_get(dev->socket_id, dpdk_mtu);
1672 if (!mp) {
1673 err = ENOMEM;
1674 goto out;
1675 }
1676
1677 rte_eth_dev_stop(dev->port_id);
1678
1679 old_mtu = dev->mtu;
1680 old_mp = dev->dpdk_mp;
1681 dev->dpdk_mp = mp;
1682 dev->mtu = mtu;
1683 dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
1684
1685 err = dpdk_eth_dev_init(dev);
1686 if (err) {
1687 dpdk_mp_put(mp);
1688 dev->mtu = old_mtu;
1689 dev->dpdk_mp = old_mp;
1690 dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
1691 dpdk_eth_dev_init(dev);
1692 goto out;
1693 }
1694
1695 dpdk_mp_put(old_mp);
1696 netdev_change_seq_changed(netdev);
1697 out:
1698 ovs_mutex_unlock(&dev->mutex);
1699 ovs_mutex_unlock(&dpdk_mutex);
1700 return err;
1701 }
1702
1703 static int
1704 netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
1705
1706 static int
1707 netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
1708 struct netdev_stats *stats)
1709 {
1710 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1711
1712 ovs_mutex_lock(&dev->mutex);
1713
1714 rte_spinlock_lock(&dev->stats_lock);
1715 /* Supported Stats */
1716 stats->rx_packets += dev->stats.rx_packets;
1717 stats->tx_packets += dev->stats.tx_packets;
1718 stats->rx_dropped = dev->stats.rx_dropped;
1719 stats->tx_dropped += dev->stats.tx_dropped;
1720 stats->multicast = dev->stats.multicast;
1721 stats->rx_bytes = dev->stats.rx_bytes;
1722 stats->tx_bytes = dev->stats.tx_bytes;
1723 stats->rx_errors = dev->stats.rx_errors;
1724 stats->rx_length_errors = dev->stats.rx_length_errors;
1725
1726 stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
1727 stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;
1728 stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;
1729 stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;
1730 stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;
1731 stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;
1732 stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;
1733
1734 rte_spinlock_unlock(&dev->stats_lock);
1735
1736 ovs_mutex_unlock(&dev->mutex);
1737
1738 return 0;
1739 }
1740
1741 static void
1742 netdev_dpdk_convert_xstats(struct netdev_stats *stats,
1743 const struct rte_eth_xstat *xstats,
1744 const struct rte_eth_xstat_name *names,
1745 const unsigned int size)
1746 {
1747 for (unsigned int i = 0; i < size; i++) {
1748 if (strcmp(XSTAT_RX_64_PACKETS, names[i].name) == 0) {
1749 stats->rx_1_to_64_packets = xstats[i].value;
1750 } else if (strcmp(XSTAT_RX_65_TO_127_PACKETS, names[i].name) == 0) {
1751 stats->rx_65_to_127_packets = xstats[i].value;
1752 } else if (strcmp(XSTAT_RX_128_TO_255_PACKETS, names[i].name) == 0) {
1753 stats->rx_128_to_255_packets = xstats[i].value;
1754 } else if (strcmp(XSTAT_RX_256_TO_511_PACKETS, names[i].name) == 0) {
1755 stats->rx_256_to_511_packets = xstats[i].value;
1756 } else if (strcmp(XSTAT_RX_512_TO_1023_PACKETS, names[i].name) == 0) {
1757 stats->rx_512_to_1023_packets = xstats[i].value;
1758 } else if (strcmp(XSTAT_RX_1024_TO_1522_PACKETS, names[i].name) == 0) {
1759 stats->rx_1024_to_1522_packets = xstats[i].value;
1760 } else if (strcmp(XSTAT_RX_1523_TO_MAX_PACKETS, names[i].name) == 0) {
1761 stats->rx_1523_to_max_packets = xstats[i].value;
1762 } else if (strcmp(XSTAT_TX_64_PACKETS, names[i].name) == 0) {
1763 stats->tx_1_to_64_packets = xstats[i].value;
1764 } else if (strcmp(XSTAT_TX_65_TO_127_PACKETS, names[i].name) == 0) {
1765 stats->tx_65_to_127_packets = xstats[i].value;
1766 } else if (strcmp(XSTAT_TX_128_TO_255_PACKETS, names[i].name) == 0) {
1767 stats->tx_128_to_255_packets = xstats[i].value;
1768 } else if (strcmp(XSTAT_TX_256_TO_511_PACKETS, names[i].name) == 0) {
1769 stats->tx_256_to_511_packets = xstats[i].value;
1770 } else if (strcmp(XSTAT_TX_512_TO_1023_PACKETS, names[i].name) == 0) {
1771 stats->tx_512_to_1023_packets = xstats[i].value;
1772 } else if (strcmp(XSTAT_TX_1024_TO_1522_PACKETS, names[i].name) == 0) {
1773 stats->tx_1024_to_1522_packets = xstats[i].value;
1774 } else if (strcmp(XSTAT_TX_1523_TO_MAX_PACKETS, names[i].name) == 0) {
1775 stats->tx_1523_to_max_packets = xstats[i].value;
1776 } else if (strcmp(XSTAT_TX_MULTICAST_PACKETS, names[i].name) == 0) {
1777 stats->tx_multicast_packets = xstats[i].value;
1778 } else if (strcmp(XSTAT_RX_BROADCAST_PACKETS, names[i].name) == 0) {
1779 stats->rx_broadcast_packets = xstats[i].value;
1780 } else if (strcmp(XSTAT_TX_BROADCAST_PACKETS, names[i].name) == 0) {
1781 stats->tx_broadcast_packets = xstats[i].value;
1782 } else if (strcmp(XSTAT_RX_UNDERSIZED_ERRORS, names[i].name) == 0) {
1783 stats->rx_undersized_errors = xstats[i].value;
1784 } else if (strcmp(XSTAT_RX_FRAGMENTED_ERRORS, names[i].name) == 0) {
1785 stats->rx_fragmented_errors = xstats[i].value;
1786 } else if (strcmp(XSTAT_RX_JABBER_ERRORS, names[i].name) == 0) {
1787 stats->rx_jabber_errors = xstats[i].value;
1788 }
1789 }
1790 }
1791
1792 static int
1793 netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
1794 {
1795 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1796 struct rte_eth_stats rte_stats;
1797 bool gg;
1798
1799 netdev_dpdk_get_carrier(netdev, &gg);
1800 ovs_mutex_lock(&dev->mutex);
1801
1802 struct rte_eth_xstat *rte_xstats = NULL;
1803 struct rte_eth_xstat_name *rte_xstats_names = NULL;
1804 int rte_xstats_len, rte_xstats_new_len, rte_xstats_ret;
1805
1806 if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
1807 VLOG_ERR("Can't get ETH statistics for port: %i.", dev->port_id);
1808 ovs_mutex_unlock(&dev->mutex);
1809 return EPROTO;
1810 }
1811
1812 /* Get length of statistics */
1813 rte_xstats_len = rte_eth_xstats_get_names(dev->port_id, NULL, 0);
1814 if (rte_xstats_len < 0) {
1815 VLOG_WARN("Cannot get XSTATS values for port: %i", dev->port_id);
1816 goto out;
1817 }
1818 /* Reserve memory for xstats names and values */
1819 rte_xstats_names = xcalloc(rte_xstats_len, sizeof *rte_xstats_names);
1820 rte_xstats = xcalloc(rte_xstats_len, sizeof *rte_xstats);
1821
1822     /* Retrieve xstats names */
1823 rte_xstats_new_len = rte_eth_xstats_get_names(dev->port_id,
1824 rte_xstats_names,
1825 rte_xstats_len);
1826 if (rte_xstats_new_len != rte_xstats_len) {
1827 VLOG_WARN("Cannot get XSTATS names for port: %i.", dev->port_id);
1828 goto out;
1829 }
1830     /* Retrieve xstats values */
1831 memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);
1832 rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
1833 rte_xstats_len);
1834 if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
1835 netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_names,
1836 rte_xstats_len);
1837 } else {
1838 VLOG_WARN("Cannot get XSTATS values for port: %i.", dev->port_id);
1839 }
1840
1841 out:
1842 free(rte_xstats);
1843 free(rte_xstats_names);
1844
1845 stats->rx_packets = rte_stats.ipackets;
1846 stats->tx_packets = rte_stats.opackets;
1847 stats->rx_bytes = rte_stats.ibytes;
1848 stats->tx_bytes = rte_stats.obytes;
1849     /* DPDK counts imissed as errors, but we count them here as dropped instead. */
1850 stats->rx_errors = rte_stats.ierrors - rte_stats.imissed;
1851 stats->tx_errors = rte_stats.oerrors;
1852
1853 rte_spinlock_lock(&dev->stats_lock);
1854 stats->tx_dropped = dev->stats.tx_dropped;
1855 stats->rx_dropped = dev->stats.rx_dropped;
1856 rte_spinlock_unlock(&dev->stats_lock);
1857
1858 /* These are the available DPDK counters for packets not received due to
1859 * local resource constraints in DPDK and NIC respectively. */
1860 stats->rx_dropped += rte_stats.rx_nombuf + rte_stats.imissed;
1861 stats->rx_missed_errors = rte_stats.imissed;
1862
1863 ovs_mutex_unlock(&dev->mutex);
1864
1865 return 0;
1866 }
1867
1868 static int
1869 netdev_dpdk_get_features(const struct netdev *netdev,
1870 enum netdev_features *current,
1871 enum netdev_features *advertised OVS_UNUSED,
1872 enum netdev_features *supported OVS_UNUSED,
1873 enum netdev_features *peer OVS_UNUSED)
1874 {
1875 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1876 struct rte_eth_link link;
1877
1878 ovs_mutex_lock(&dev->mutex);
1879 link = dev->link;
1880 ovs_mutex_unlock(&dev->mutex);
1881
1882 if (link.link_duplex == ETH_LINK_HALF_DUPLEX) {
1883 if (link.link_speed == ETH_SPEED_NUM_10M) {
1884 *current = NETDEV_F_10MB_HD;
1885 }
1886 if (link.link_speed == ETH_SPEED_NUM_100M) {
1887 *current = NETDEV_F_100MB_HD;
1888 }
1889 if (link.link_speed == ETH_SPEED_NUM_1G) {
1890 *current = NETDEV_F_1GB_HD;
1891 }
1892 } else if (link.link_duplex == ETH_LINK_FULL_DUPLEX) {
1893 if (link.link_speed == ETH_SPEED_NUM_10M) {
1894 *current = NETDEV_F_10MB_FD;
1895 }
1896 if (link.link_speed == ETH_SPEED_NUM_100M) {
1897 *current = NETDEV_F_100MB_FD;
1898 }
1899 if (link.link_speed == ETH_SPEED_NUM_1G) {
1900 *current = NETDEV_F_1GB_FD;
1901 }
1902 if (link.link_speed == ETH_SPEED_NUM_10G) {
1903 *current = NETDEV_F_10GB_FD;
1904 }
1905 }
1906
1907 if (link.link_autoneg) {
1908 *current |= NETDEV_F_AUTONEG;
1909 }
1910
1911 return 0;
1912 }
1913
1914 static struct ingress_policer *
1915 netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
1916 {
1917 struct ingress_policer *policer = NULL;
1918 uint64_t rate_bytes;
1919 uint64_t burst_bytes;
1920 int err = 0;
1921
1922 policer = xmalloc(sizeof *policer);
1923 rte_spinlock_init(&policer->policer_lock);
1924
1925 /* rte_meter requires bytes so convert kbits rate and burst to bytes. */
1926 rate_bytes = (uint64_t) rate * 1000 / 8;
1927 burst_bytes = (uint64_t) burst * 1000 / 8;
1928
1929 policer->app_srtcm_params.cir = rate_bytes;
1930 policer->app_srtcm_params.cbs = burst_bytes;
1931 policer->app_srtcm_params.ebs = 0;
1932 err = rte_meter_srtcm_config(&policer->in_policer,
1933 &policer->app_srtcm_params);
1934 if (err) {
1935 VLOG_ERR("Could not create rte meter for ingress policer");
1936 free(policer);
1937 return NULL;
1938 }
1939 return policer;
1940 }
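
/* Worked example for the kbit-to-byte conversion above: a configured rate of
 * 10000 kbit/s becomes 10000 * 1000 / 8 = 1,250,000 bytes/s for the CIR, and a
 * burst of 8000 kbit becomes 1,000,000 bytes for the CBS. */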
1941
1942 static int
1943 netdev_dpdk_set_policing(struct netdev* netdev, uint32_t policer_rate,
1944 uint32_t policer_burst)
1945 {
1946 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1947 struct ingress_policer *policer;
1948
1949 /* Force to 0 if no rate specified,
1950 * default to 8000 kbits if burst is 0,
1951 * else stick with user-specified value.
1952 */
1953 policer_burst = (!policer_rate ? 0
1954 : !policer_burst ? 8000
1955 : policer_burst);
1956
1957 ovs_mutex_lock(&dev->mutex);
1958
1959 policer = ovsrcu_get_protected(struct ingress_policer *,
1960 &dev->ingress_policer);
1961
1962 if (dev->policer_rate == policer_rate &&
1963 dev->policer_burst == policer_burst) {
1964 /* Assume that settings haven't changed since we last set them. */
1965 ovs_mutex_unlock(&dev->mutex);
1966 return 0;
1967 }
1968
1969 /* Destroy any existing ingress policer for the device if one exists */
1970 if (policer) {
1971 ovsrcu_postpone(free, policer);
1972 }
1973
1974 if (policer_rate != 0) {
1975 policer = netdev_dpdk_policer_construct(policer_rate, policer_burst);
1976 } else {
1977 policer = NULL;
1978 }
1979 ovsrcu_set(&dev->ingress_policer, policer);
1980 dev->policer_rate = policer_rate;
1981 dev->policer_burst = policer_burst;
1982 ovs_mutex_unlock(&dev->mutex);
1983
1984 return 0;
1985 }
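
/* Ingress policing is normally driven from the database rather than by calling
 * this function directly.  Illustrative example (port name is hypothetical):
 *
 *     ovs-vsctl set interface dpdk0 ingress_policing_rate=10000 \
 *                                   ingress_policing_burst=1000
 *
 * The rate is in kbit/s and the burst in kbit; a rate of 0 removes the
 * policer, and a zero burst with a non-zero rate defaults to 8000 kbit, as
 * implemented above. */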
1986
1987 static int
1988 netdev_dpdk_get_ifindex(const struct netdev *netdev)
1989 {
1990 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1991 int ifindex;
1992
1993 ovs_mutex_lock(&dev->mutex);
1994 ifindex = dev->port_id;
1995 ovs_mutex_unlock(&dev->mutex);
1996
1997 return ifindex;
1998 }
1999
2000 static int
2001 netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
2002 {
2003 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2004
2005 ovs_mutex_lock(&dev->mutex);
2006 check_link_status(dev);
2007 *carrier = dev->link.link_status;
2008
2009 ovs_mutex_unlock(&dev->mutex);
2010
2011 return 0;
2012 }
2013
2014 static int
2015 netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
2016 {
2017 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2018
2019 ovs_mutex_lock(&dev->mutex);
2020
2021 if (is_vhost_running(dev)) {
2022 *carrier = 1;
2023 } else {
2024 *carrier = 0;
2025 }
2026
2027 ovs_mutex_unlock(&dev->mutex);
2028
2029 return 0;
2030 }
2031
2032 static long long int
2033 netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
2034 {
2035 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2036 long long int carrier_resets;
2037
2038 ovs_mutex_lock(&dev->mutex);
2039 carrier_resets = dev->link_reset_cnt;
2040 ovs_mutex_unlock(&dev->mutex);
2041
2042 return carrier_resets;
2043 }
2044
2045 static int
2046 netdev_dpdk_set_miimon(struct netdev *netdev OVS_UNUSED,
2047 long long int interval OVS_UNUSED)
2048 {
2049 return EOPNOTSUPP;
2050 }
2051
2052 static int
2053 netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
2054 enum netdev_flags off, enum netdev_flags on,
2055 enum netdev_flags *old_flagsp)
2056 OVS_REQUIRES(dev->mutex)
2057 {
2058 int err;
2059
2060 if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
2061 return EINVAL;
2062 }
2063
2064 *old_flagsp = dev->flags;
2065 dev->flags |= on;
2066 dev->flags &= ~off;
2067
2068 if (dev->flags == *old_flagsp) {
2069 return 0;
2070 }
2071
2072 if (dev->type == DPDK_DEV_ETH) {
2073 if (dev->flags & NETDEV_UP) {
2074 err = rte_eth_dev_start(dev->port_id);
2075 if (err) {
2076 return -err;
2077 }
2078 }
2079 if (dev->flags & NETDEV_PROMISC) {
2080 rte_eth_promiscuous_enable(dev->port_id);
2081 }
2082
2083 if (!(dev->flags & NETDEV_UP)) {
2084 rte_eth_dev_stop(dev->port_id);
2085 }
2086 } else {
2087 /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is
2088 * running then change netdev's change_seq to trigger link state
2089 * update. */
2090
2091 if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
2092 && is_vhost_running(dev)) {
2093 netdev_change_seq_changed(&dev->up);
2094
2095 /* Clear statistics if device is getting up. */
2096 if (NETDEV_UP & on) {
2097 rte_spinlock_lock(&dev->stats_lock);
2098 memset(&dev->stats, 0, sizeof(dev->stats));
2099 rte_spinlock_unlock(&dev->stats_lock);
2100 }
2101 }
2102 }
2103
2104 return 0;
2105 }
2106
2107 static int
2108 netdev_dpdk_update_flags(struct netdev *netdev,
2109 enum netdev_flags off, enum netdev_flags on,
2110 enum netdev_flags *old_flagsp)
2111 {
2112 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2113 int error;
2114
2115 ovs_mutex_lock(&dev->mutex);
2116 error = netdev_dpdk_update_flags__(dev, off, on, old_flagsp);
2117 ovs_mutex_unlock(&dev->mutex);
2118
2119 return error;
2120 }
2121
2122 static int
2123 netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
2124 {
2125 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2126 struct rte_eth_dev_info dev_info;
2127
2128 if (dev->port_id < 0) {
2129 return ENODEV;
2130 }
2131 ovs_mutex_lock(&dev->mutex);
2132 rte_eth_dev_info_get(dev->port_id, &dev_info);
2133 ovs_mutex_unlock(&dev->mutex);
2134
2135 smap_add_format(args, "port_no", "%d", dev->port_id);
2136 smap_add_format(args, "numa_id", "%d", rte_eth_dev_socket_id(dev->port_id));
2137 smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
2138 smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
2139 smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
2140 smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
2141 smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
2142 smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
2143 smap_add_format(args, "max_hash_mac_addrs", "%u", dev_info.max_hash_mac_addrs);
2144 smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
2145 smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);
2146
2147 if (dev_info.pci_dev) {
2148 smap_add_format(args, "pci-vendor_id", "0x%x",
2149 dev_info.pci_dev->id.vendor_id);
2150 smap_add_format(args, "pci-device_id", "0x%x",
2151 dev_info.pci_dev->id.device_id);
2152 }
2153
2154 return 0;
2155 }
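
/* The key/value pairs collected above are published in the Interface table's
 * "status" column and can be inspected with, e.g. (interface name is
 * illustrative):
 *
 *     ovs-vsctl get Interface dpdk0 status
 */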
2156
2157 static void
2158 netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
2159 OVS_REQUIRES(dev->mutex)
2160 {
2161 enum netdev_flags old_flags;
2162
2163 if (admin_state) {
2164 netdev_dpdk_update_flags__(dev, 0, NETDEV_UP, &old_flags);
2165 } else {
2166 netdev_dpdk_update_flags__(dev, NETDEV_UP, 0, &old_flags);
2167 }
2168 }
2169
2170 static void
2171 netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
2172 const char *argv[], void *aux OVS_UNUSED)
2173 {
2174 bool up;
2175
2176 if (!strcasecmp(argv[argc - 1], "up")) {
2177 up = true;
2178 } else if (!strcasecmp(argv[argc - 1], "down")) {
2179 up = false;
2180 } else {
2181 unixctl_command_reply_error(conn, "Invalid Admin State");
2182 return;
2183 }
2184
2185 if (argc > 2) {
2186 struct netdev *netdev = netdev_from_name(argv[1]);
2187 if (netdev && is_dpdk_class(netdev->netdev_class)) {
2188 struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);
2189
2190 ovs_mutex_lock(&dpdk_dev->mutex);
2191 netdev_dpdk_set_admin_state__(dpdk_dev, up);
2192 ovs_mutex_unlock(&dpdk_dev->mutex);
2193
2194 netdev_close(netdev);
2195 } else {
2196 unixctl_command_reply_error(conn, "Not a DPDK Interface");
2197 netdev_close(netdev);
2198 return;
2199 }
2200 } else {
2201 struct netdev_dpdk *netdev;
2202
2203 ovs_mutex_lock(&dpdk_mutex);
2204 LIST_FOR_EACH (netdev, list_node, &dpdk_list) {
2205 ovs_mutex_lock(&netdev->mutex);
2206 netdev_dpdk_set_admin_state__(netdev, up);
2207 ovs_mutex_unlock(&netdev->mutex);
2208 }
2209 ovs_mutex_unlock(&dpdk_mutex);
2210 }
2211 unixctl_command_reply(conn, "OK");
2212 }
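
/* This handler backs the "netdev-dpdk/set-admin-state" unixctl command
 * registered in dpdk_common_init() below.  Illustrative invocations:
 *
 *     ovs-appctl netdev-dpdk/set-admin-state dpdk0 down
 *     ovs-appctl netdev-dpdk/set-admin-state up
 *
 * With a netdev name only that port is changed; without one the new state is
 * applied to every DPDK port. */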
2213
2214 /*
2215 * Set virtqueue flags so that we do not receive interrupts.
2216 */
2217 static void
2218 set_irq_status(int vid)
2219 {
2220 uint32_t i;
2221 uint64_t idx;
2222
2223 for (i = 0; i < rte_vhost_get_queue_num(vid); i++) {
2224 idx = i * VIRTIO_QNUM;
2225 rte_vhost_enable_guest_notification(vid, idx + VIRTIO_RXQ, 0);
2226 rte_vhost_enable_guest_notification(vid, idx + VIRTIO_TXQ, 0);
2227 }
2228 }
2229
2230 /*
2231 * Fixes mapping for vhost-user tx queues. Must be called after each
2232 * enabling/disabling of queues and n_txq modifications.
2233 */
2234 static void
2235 netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
2236 OVS_REQUIRES(dev->mutex)
2237 {
2238 int *enabled_queues, n_enabled = 0;
2239 int i, k, total_txqs = dev->up.n_txq;
2240
2241 enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof *enabled_queues);
2242
2243 for (i = 0; i < total_txqs; i++) {
2244 /* Enabled queues are always mapped to themselves. */
2245 if (dev->tx_q[i].map == i) {
2246 enabled_queues[n_enabled++] = i;
2247 }
2248 }
2249
2250 if (n_enabled == 0 && total_txqs != 0) {
2251 enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
2252 n_enabled = 1;
2253 }
2254
2255 k = 0;
2256 for (i = 0; i < total_txqs; i++) {
2257 if (dev->tx_q[i].map != i) {
2258 dev->tx_q[i].map = enabled_queues[k];
2259 k = (k + 1) % n_enabled;
2260 }
2261 }
2262
2263 VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);
2264 for (i = 0; i < total_txqs; i++) {
2265 VLOG_DBG("%2d --> %2d", i, dev->tx_q[i].map);
2266 }
2267
2268 rte_free(enabled_queues);
2269 }
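
/* Worked example of the remapping above: with dev->up.n_txq == 4 and only
 * guest queues 0 and 2 enabled, enabled_queues becomes {0, 2} and the disabled
 * slots are filled round-robin, giving the map 0->0, 1->0, 2->2, 3->2.  If no
 * queue is enabled at all, every slot is marked OVS_VHOST_QUEUE_DISABLED. */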
2270
2271 /*
2272 * A new virtio-net device is added to a vhost port.
2273 */
2274 static int
2275 new_device(int vid)
2276 {
2277 struct netdev_dpdk *dev;
2278 bool exists = false;
2279 int newnode = 0;
2280 char ifname[IF_NAME_SZ];
2281
2282 rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
2283
2284 ovs_mutex_lock(&dpdk_mutex);
2285 /* Add device to the vhost port with the same name as that passed down. */
2286 LIST_FOR_EACH(dev, list_node, &dpdk_list) {
2287 if (strncmp(ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
2288 uint32_t qp_num = rte_vhost_get_queue_num(vid);
2289
2290 ovs_mutex_lock(&dev->mutex);
2291 /* Get NUMA information */
2292 newnode = rte_vhost_get_numa_node(vid);
2293 if (newnode == -1) {
2294 VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
2295 ifname);
2296 newnode = dev->socket_id;
2297 }
2298
2299 if (dev->requested_n_txq != qp_num
2300 || dev->requested_n_rxq != qp_num
2301 || dev->requested_socket_id != newnode) {
2302 dev->requested_socket_id = newnode;
2303 dev->requested_n_rxq = qp_num;
2304 dev->requested_n_txq = qp_num;
2305 netdev_request_reconfigure(&dev->up);
2306 } else {
2307 /* Reconfiguration not required. */
2308 dev->vhost_reconfigured = true;
2309 }
2310
2311 ovsrcu_index_set(&dev->vid, vid);
2312 exists = true;
2313
2314 /* Disable notifications. */
2315 set_irq_status(vid);
2316 netdev_change_seq_changed(&dev->up);
2317 ovs_mutex_unlock(&dev->mutex);
2318 break;
2319 }
2320 }
2321 ovs_mutex_unlock(&dpdk_mutex);
2322
2323 if (!exists) {
2324 VLOG_INFO("vHost Device '%s' can't be added - name not found", ifname);
2325
2326 return -1;
2327 }
2328
2329 VLOG_INFO("vHost Device '%s' has been added on numa node %i",
2330 ifname, newnode);
2331
2332 return 0;
2333 }
2334
2335 /* Clears mapping for all available queues of vhost interface. */
2336 static void
2337 netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
2338 OVS_REQUIRES(dev->mutex)
2339 {
2340 int i;
2341
2342 for (i = 0; i < dev->up.n_txq; i++) {
2343 dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
2344 }
2345 }
2346
2347 /*
2348 * Remove a virtio-net device from the specific vhost port. Use dev->remove
2349 * flag to stop any more packets from being sent or received to/from a VM and
2350 * ensure all currently queued packets have been sent/received before removing
2351 * the device.
2352 */
2353 static void
2354 destroy_device(int vid)
2355 {
2356 struct netdev_dpdk *dev;
2357 bool exists = false;
2358 char ifname[IF_NAME_SZ];
2359
2360 rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
2361
2362 ovs_mutex_lock(&dpdk_mutex);
2363 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
2364 if (netdev_dpdk_get_vid(dev) == vid) {
2365
2366 ovs_mutex_lock(&dev->mutex);
2367 dev->vhost_reconfigured = false;
2368 ovsrcu_index_set(&dev->vid, -1);
2369 netdev_dpdk_txq_map_clear(dev);
2370
2371 netdev_change_seq_changed(&dev->up);
2372 ovs_mutex_unlock(&dev->mutex);
2373 exists = true;
2374 break;
2375 }
2376 }
2377
2378 ovs_mutex_unlock(&dpdk_mutex);
2379
2380 if (exists) {
2381 /*
2382 * Wait for other threads to quiesce after setting the 'virtio_dev'
2383 * to NULL, before returning.
2384 */
2385 ovsrcu_synchronize();
2386 /*
2387 * As call to ovsrcu_synchronize() will end the quiescent state,
2388 * put thread back into quiescent state before returning.
2389 */
2390 ovsrcu_quiesce_start();
2391 VLOG_INFO("vHost Device '%s' has been removed", ifname);
2392 } else {
2393 VLOG_INFO("vHost Device '%s' not found", ifname);
2394 }
2395 }
2396
2397 static int
2398 vring_state_changed(int vid, uint16_t queue_id, int enable)
2399 {
2400 struct netdev_dpdk *dev;
2401 bool exists = false;
2402 int qid = queue_id / VIRTIO_QNUM;
2403 char ifname[IF_NAME_SZ];
2404
2405 rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
2406
2407 if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
2408 return 0;
2409 }
2410
2411 ovs_mutex_lock(&dpdk_mutex);
2412 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
2413 if (strncmp(ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
2414 ovs_mutex_lock(&dev->mutex);
2415 if (enable) {
2416 dev->tx_q[qid].map = qid;
2417 } else {
2418 dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
2419 }
2420 netdev_dpdk_remap_txqs(dev);
2421 exists = true;
2422 ovs_mutex_unlock(&dev->mutex);
2423 break;
2424 }
2425 }
2426 ovs_mutex_unlock(&dpdk_mutex);
2427
2428 if (exists) {
2429 VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s'"
2430 "changed to \'%s\'", queue_id, qid, ifname,
2431 (enable == 1) ? "enabled" : "disabled");
2432 } else {
2433 VLOG_INFO("vHost Device '%s' not found", ifname);
2434 return -1;
2435 }
2436
2437 return 0;
2438 }
2439
2440 int
2441 netdev_dpdk_get_vid(const struct netdev_dpdk *dev)
2442 {
2443 return ovsrcu_index_get(&dev->vid);
2444 }
2445
2446 struct ingress_policer *
2447 netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
2448 {
2449 return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
2450 }
2451
2452 /*
2453 * These callbacks allow virtio-net devices to be added to vhost ports when
2454 * their configuration is fully complete.
2455 */
2456 static const struct virtio_net_device_ops virtio_net_device_ops =
2457 {
2458 .new_device = new_device,
2459 .destroy_device = destroy_device,
2460 .vring_state_changed = vring_state_changed
2461 };
2462
2463 static void *
2464 start_vhost_loop(void *dummy OVS_UNUSED)
2465 {
2466 pthread_detach(pthread_self());
2467 /* Put the cuse thread into quiescent state. */
2468 ovsrcu_quiesce_start();
2469 rte_vhost_driver_session_start();
2470 return NULL;
2471 }
2472
2473 static int
2474 dpdk_vhost_class_init(void)
2475 {
2476 rte_vhost_driver_callback_register(&virtio_net_device_ops);
2477 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
2478 | 1ULL << VIRTIO_NET_F_HOST_TSO6
2479 | 1ULL << VIRTIO_NET_F_CSUM);
2480
2481 ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
2482 return 0;
2483 }
2484
2485 static int
2486 dpdk_vhost_cuse_class_init(void)
2487 {
2488 return 0;
2489 }
2490
2491 static int
2492 dpdk_vhost_user_class_init(void)
2493 {
2494 return 0;
2495 }
2496
2497 static void
2498 dpdk_common_init(void)
2499 {
2500 unixctl_command_register("netdev-dpdk/set-admin-state",
2501 "[netdev] up|down", 1, 2,
2502 netdev_dpdk_set_admin_state, NULL);
2503
2504 }
2505
2506 /* Client Rings */
2507
2508 static int
2509 dpdk_ring_create(const char dev_name[], unsigned int port_no,
2510 unsigned int *eth_port_id)
2511 {
2512 struct dpdk_ring *ivshmem;
2513 char ring_name[RTE_RING_NAMESIZE];
2514 int err;
2515
2516 ivshmem = dpdk_rte_mzalloc(sizeof *ivshmem);
2517 if (ivshmem == NULL) {
2518 return ENOMEM;
2519 }
2520
2521 /* XXX: Add support for multiqueue ring. */
2522 err = snprintf(ring_name, sizeof(ring_name), "%s_tx", dev_name);
2523 if (err < 0) {
2524 rte_free(ivshmem);
2525 return -err;
2526 }
2527 /* Create single producer tx ring, netdev does explicit locking. */
2528 ivshmem->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
2529 RING_F_SP_ENQ);
2530 if (ivshmem->cring_tx == NULL) {
2531 rte_free(ivshmem);
2532 return ENOMEM;
2533 }
2534
2535 err = snprintf(ring_name, sizeof(ring_name), "%s_rx", dev_name);
2536 if (err < 0) {
2537 rte_free(ivshmem);
2538 return -err;
2539 }
2540 /* Create single consumer rx ring, netdev does explicit locking. */
2541 ivshmem->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
2542 RING_F_SC_DEQ);
2543 if (ivshmem->cring_rx == NULL) {
2544 rte_free(ivshmem);
2545 return ENOMEM;
2546 }
2547
2548 err = rte_eth_from_rings(dev_name, &ivshmem->cring_rx, 1,
2549 &ivshmem->cring_tx, 1, SOCKET0);
2550
2551 if (err < 0) {
2552 rte_free(ivshmem);
2553 return ENODEV;
2554 }
2555
2556 ivshmem->user_port_id = port_no;
2557 ivshmem->eth_port_id = rte_eth_dev_count() - 1;
2558 ovs_list_push_back(&dpdk_ring_list, &ivshmem->list_node);
2559
2560 *eth_port_id = ivshmem->eth_port_id;
2561 return 0;
2562 }
2563
2564 static int
2565 dpdk_ring_open(const char dev_name[], unsigned int *eth_port_id)
2566 OVS_REQUIRES(dpdk_mutex)
2567 {
2568 struct dpdk_ring *ivshmem;
2569 unsigned int port_no;
2570 int err = 0;
2571
2572 /* Names always start with "dpdkr" */
2573 err = dpdk_dev_parse_name(dev_name, "dpdkr", &port_no);
2574 if (err) {
2575 return err;
2576 }
2577
2578 /* look through our list to find the device */
2579 LIST_FOR_EACH (ivshmem, list_node, &dpdk_ring_list) {
2580 if (ivshmem->user_port_id == port_no) {
2581 VLOG_INFO("Found dpdk ring device %s:", dev_name);
2582 *eth_port_id = ivshmem->eth_port_id; /* really all that is needed */
2583 return 0;
2584 }
2585 }
2586 /* Need to create the device rings */
2587 return dpdk_ring_create(dev_name, port_no, eth_port_id);
2588 }
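
/* Example of the dpdkr naming scheme handled above (port name illustrative):
 * adding a port named "dpdkr0" parses to user_port_id 0 and, on first open,
 * creates the rings "dpdkr0_tx" and "dpdkr0_rx" plus an eth device backed by
 * them via rte_eth_from_rings(); later opens of the same name just return the
 * cached eth_port_id. */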
2589
2590 static int
2591 netdev_dpdk_ring_send(struct netdev *netdev, int qid,
2592 struct dp_packet_batch *batch, bool may_steal,
2593 bool concurrent_txq)
2594 {
2595 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2596 unsigned i;
2597
2598 /* When using 'dpdkr' and sending to a DPDK ring, we want to ensure that
2599 * the rss hash field is clear.  This is because the same mbuf may be
2600 * modified by the consumer of the ring and returned to the datapath
2601 * without the RSS hash being recalculated. */
2602 for (i = 0; i < batch->count; i++) {
2603 dp_packet_rss_invalidate(batch->packets[i]);
2604 }
2605
2606 netdev_dpdk_send__(dev, qid, batch, may_steal, concurrent_txq);
2607 return 0;
2608 }
2609
2610 static int
2611 netdev_dpdk_ring_construct(struct netdev *netdev)
2612 {
2613 unsigned int port_no = 0;
2614 int err = 0;
2615
2616 if (rte_eal_init_ret) {
2617 return rte_eal_init_ret;
2618 }
2619
2620 ovs_mutex_lock(&dpdk_mutex);
2621
2622 err = dpdk_ring_open(netdev->name, &port_no);
2623 if (err) {
2624 goto unlock_dpdk;
2625 }
2626
2627 err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_ETH);
2628
2629 unlock_dpdk:
2630 ovs_mutex_unlock(&dpdk_mutex);
2631 return err;
2632 }
2633
2634 /* QoS Functions */
2635
2636 /*
2637 * Initialize QoS configuration operations.
2638 */
2639 static void
2640 qos_conf_init(struct qos_conf *conf, const struct dpdk_qos_ops *ops)
2641 {
2642 conf->ops = ops;
2643 }
2644
2645 /*
2646 * Search existing QoS operations in qos_ops and compare each set of
2647 * operations qos_name to name. Return a dpdk_qos_ops pointer to a match,
2648 * else return NULL
2649 */
2650 static const struct dpdk_qos_ops *
2651 qos_lookup_name(const char *name)
2652 {
2653 const struct dpdk_qos_ops *const *opsp;
2654
2655 for (opsp = qos_confs; *opsp != NULL; opsp++) {
2656 const struct dpdk_qos_ops *ops = *opsp;
2657 if (!strcmp(name, ops->qos_name)) {
2658 return ops;
2659 }
2660 }
2661 return NULL;
2662 }
2663
2664 /*
2665 * Call qos_destruct to clean up items associated with the netdevs
2666 * qos_conf. Set netdevs qos_conf to NULL.
2667 */
2668 static void
2669 qos_delete_conf(struct netdev *netdev)
2670 {
2671 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2672
2673 rte_spinlock_lock(&dev->qos_lock);
2674 if (dev->qos_conf) {
2675 if (dev->qos_conf->ops->qos_destruct) {
2676 dev->qos_conf->ops->qos_destruct(netdev, dev->qos_conf);
2677 }
2678 dev->qos_conf = NULL;
2679 }
2680 rte_spinlock_unlock(&dev->qos_lock);
2681 }
2682
2683 static int
2684 netdev_dpdk_get_qos_types(const struct netdev *netdev OVS_UNUSED,
2685 struct sset *types)
2686 {
2687 const struct dpdk_qos_ops *const *opsp;
2688
2689 for (opsp = qos_confs; *opsp != NULL; opsp++) {
2690 const struct dpdk_qos_ops *ops = *opsp;
2691 if (ops->qos_construct && ops->qos_name[0] != '\0') {
2692 sset_add(types, ops->qos_name);
2693 }
2694 }
2695 return 0;
2696 }
2697
2698 static int
2699 netdev_dpdk_get_qos(const struct netdev *netdev,
2700 const char **typep, struct smap *details)
2701 {
2702 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2703 int error = 0;
2704
2705 ovs_mutex_lock(&dev->mutex);
2706 if (dev->qos_conf) {
2707 *typep = dev->qos_conf->ops->qos_name;
2708 error = (dev->qos_conf->ops->qos_get
2709 ? dev->qos_conf->ops->qos_get(netdev, details): 0);
2710 } else {
2711 /* No QoS configuration set, return an empty string */
2712 *typep = "";
2713 }
2714 ovs_mutex_unlock(&dev->mutex);
2715
2716 return error;
2717 }
2718
2719 static int
2720 netdev_dpdk_set_qos(struct netdev *netdev,
2721 const char *type, const struct smap *details)
2722 {
2723 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2724 const struct dpdk_qos_ops *new_ops = NULL;
2725 int error = 0;
2726
2727 /* If type is empty or unsupported then the current QoS configuration
2728 * for the dpdk-netdev can be destroyed */
2729 new_ops = qos_lookup_name(type);
2730
2731 if (type[0] == '\0' || !new_ops || !new_ops->qos_construct) {
2732 qos_delete_conf(netdev);
2733 return EOPNOTSUPP;
2734 }
2735
2736 ovs_mutex_lock(&dev->mutex);
2737
2738 if (dev->qos_conf) {
2739 if (new_ops == dev->qos_conf->ops) {
2740 error = new_ops->qos_set ? new_ops->qos_set(netdev, details) : 0;
2741 } else {
2742 /* Delete existing QoS configuration. */
2743 qos_delete_conf(netdev);
2744 ovs_assert(dev->qos_conf == NULL);
2745
2746 /* Install new QoS configuration. */
2747 error = new_ops->qos_construct(netdev, details);
2748 }
2749 } else {
2750 error = new_ops->qos_construct(netdev, details);
2751 }
2752
2753 ovs_assert((error == 0) == (dev->qos_conf != NULL));
2754 if (error) {
2755 VLOG_ERR("Failed to set QoS type %s on port %s, returned error: %s",
2756 type, netdev->name, rte_strerror(-error));
2757 }
2758
2759 ovs_mutex_unlock(&dev->mutex);
2760 return error;
2761 }
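
/* QoS is normally configured through the database.  An illustrative
 * egress-policer setup (port name and values are hypothetical):
 *
 *     ovs-vsctl set port dpdk0 qos=@np -- --id=@np create qos \
 *         type=egress-policer other-config:cir=46000000 other-config:cbs=2048
 *
 * The "cir" and "cbs" keys are consumed below by egress_policer_qos_construct()
 * and are expressed in bytes/s and bytes respectively, as required by
 * rte_meter_srtcm_config(). */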
2762
2763 /* egress-policer details */
2764
2765 struct egress_policer {
2766 struct qos_conf qos_conf;
2767 struct rte_meter_srtcm_params app_srtcm_params;
2768 struct rte_meter_srtcm egress_meter;
2769 };
2770
2771 static struct egress_policer *
2772 egress_policer_get__(const struct netdev *netdev)
2773 {
2774 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2775 return CONTAINER_OF(dev->qos_conf, struct egress_policer, qos_conf);
2776 }
2777
2778 static int
2779 egress_policer_qos_construct(struct netdev *netdev,
2780 const struct smap *details)
2781 {
2782 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2783 struct egress_policer *policer;
2784 int err = 0;
2785
2786 rte_spinlock_lock(&dev->qos_lock);
2787 policer = xmalloc(sizeof *policer);
2788 qos_conf_init(&policer->qos_conf, &egress_policer_ops);
2789 dev->qos_conf = &policer->qos_conf;
2790 policer->app_srtcm_params.cir = smap_get_ullong(details, "cir", 0);
2791 policer->app_srtcm_params.cbs = smap_get_ullong(details, "cbs", 0);
2792 policer->app_srtcm_params.ebs = 0;
2793 err = rte_meter_srtcm_config(&policer->egress_meter,
2794 &policer->app_srtcm_params);
2795
2796 if (err < 0) {
2797 /* An error occurred during rte_meter creation: destroy the policer
2798 * and set the netdev's QoS configuration to NULL.
2799 */
2800 free(policer);
2801 dev->qos_conf = NULL;
2802 err = -err;
2803 }
2804 rte_spinlock_unlock(&dev->qos_lock);
2805
2806 return err;
2807 }
2808
2809 static void
2810 egress_policer_qos_destruct(struct netdev *netdev OVS_UNUSED,
2811 struct qos_conf *conf)
2812 {
2813 struct egress_policer *policer = CONTAINER_OF(conf, struct egress_policer,
2814 qos_conf);
2815 free(policer);
2816 }
2817
2818 static int
2819 egress_policer_qos_get(const struct netdev *netdev, struct smap *details)
2820 {
2821 struct egress_policer *policer = egress_policer_get__(netdev);
2822 smap_add_format(details, "cir", "%llu",
2823 1ULL * policer->app_srtcm_params.cir);
2824 smap_add_format(details, "cbs", "%llu",
2825 1ULL * policer->app_srtcm_params.cbs);
2826
2827 return 0;
2828 }
2829
2830 static int
2831 egress_policer_qos_set(struct netdev *netdev, const struct smap *details)
2832 {
2833 struct egress_policer *policer;
2834 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2835 int err = 0;
2836
2837 policer = egress_policer_get__(netdev);
2838 rte_spinlock_lock(&dev->qos_lock);
2839 policer->app_srtcm_params.cir = smap_get_ullong(details, "cir", 0);
2840 policer->app_srtcm_params.cbs = smap_get_ullong(details, "cbs", 0);
2841 policer->app_srtcm_params.ebs = 0;
2842 err = rte_meter_srtcm_config(&policer->egress_meter,
2843 &policer->app_srtcm_params);
2844
2845 if (err < 0) {
2846 /* An error occurred during rte_meter creation: destroy the policer
2847 * and set the netdev's QoS configuration to NULL.
2848 */
2849 free(policer);
2850 dev->qos_conf = NULL;
2851 err = -err;
2852 }
2853 rte_spinlock_unlock(&dev->qos_lock);
2854
2855 return err;
2856 }
2857
2858 static int
2859 egress_policer_run(struct netdev *netdev, struct rte_mbuf **pkts, int pkt_cnt)
2860 {
2861 int cnt = 0;
2862 struct egress_policer *policer = egress_policer_get__(netdev);
2863
2864 cnt = netdev_dpdk_policer_run(&policer->egress_meter, pkts, pkt_cnt);
2865
2866 return cnt;
2867 }
2868
2869 static const struct dpdk_qos_ops egress_policer_ops = {
2870 "egress-policer", /* qos_name */
2871 egress_policer_qos_construct,
2872 egress_policer_qos_destruct,
2873 egress_policer_qos_get,
2874 egress_policer_qos_set,
2875 egress_policer_run
2876 };
2877
2878 static int
2879 netdev_dpdk_reconfigure(struct netdev *netdev)
2880 {
2881 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2882 int err = 0;
2883
2884 ovs_mutex_lock(&dpdk_mutex);
2885 ovs_mutex_lock(&dev->mutex);
2886
2887 if (netdev->n_txq == dev->requested_n_txq
2888 && netdev->n_rxq == dev->requested_n_rxq) {
2889 /* Reconfiguration is unnecessary */
2890
2891 goto out;
2892 }
2893
2894 rte_eth_dev_stop(dev->port_id);
2895
2896 netdev->n_txq = dev->requested_n_txq;
2897 netdev->n_rxq = dev->requested_n_rxq;
2898
2899 rte_free(dev->tx_q);
2900 err = dpdk_eth_dev_init(dev);
2901 netdev_dpdk_alloc_txq(dev, netdev->n_txq);
2902
2903 out:
2904
2905 ovs_mutex_unlock(&dev->mutex);
2906 ovs_mutex_unlock(&dpdk_mutex);
2907
2908 return err;
2909 }
2910
2911 static int
2912 netdev_dpdk_vhost_user_reconfigure(struct netdev *netdev)
2913 {
2914 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2915 int err = 0;
2916
2917 ovs_mutex_lock(&dpdk_mutex);
2918 ovs_mutex_lock(&dev->mutex);
2919
2920 netdev->n_txq = dev->requested_n_txq;
2921 netdev->n_rxq = dev->requested_n_rxq;
2922
2923 /* Enable TX queue 0 by default if it wasn't disabled. */
2924 if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
2925 dev->tx_q[0].map = 0;
2926 }
2927
2928 netdev_dpdk_remap_txqs(dev);
2929
2930 if (dev->requested_socket_id != dev->socket_id) {
2931 dev->socket_id = dev->requested_socket_id;
2932 /* Change mempool to new NUMA Node */
2933 dpdk_mp_put(dev->dpdk_mp);
2934 dev->dpdk_mp = dpdk_mp_get(dev->socket_id, dev->mtu);
2935 if (!dev->dpdk_mp) {
2936 err = ENOMEM;
2937 }
2938 }
2939
2940 if (netdev_dpdk_get_vid(dev) >= 0) {
2941 dev->vhost_reconfigured = true;
2942 }
2943
2944 ovs_mutex_unlock(&dev->mutex);
2945 ovs_mutex_unlock(&dpdk_mutex);
2946
2947 return err;
2948 }
2949
2950 static int
2951 netdev_dpdk_vhost_cuse_reconfigure(struct netdev *netdev)
2952 {
2953 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2954
2955 ovs_mutex_lock(&dpdk_mutex);
2956 ovs_mutex_lock(&dev->mutex);
2957
2958 netdev->n_txq = dev->requested_n_txq;
2959 netdev->n_rxq = 1;
2960
2961 ovs_mutex_unlock(&dev->mutex);
2962 ovs_mutex_unlock(&dpdk_mutex);
2963
2964 return 0;
2965 }
2966
2967 #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT, \
2968 SET_CONFIG, SET_TX_MULTIQ, SEND, \
2969 GET_CARRIER, GET_STATS, \
2970 GET_FEATURES, GET_STATUS, \
2971 RECONFIGURE, RXQ_RECV) \
2972 { \
2973 NAME, \
2974 true, /* is_pmd */ \
2975 INIT, /* init */ \
2976 NULL, /* netdev_dpdk_run */ \
2977 NULL, /* netdev_dpdk_wait */ \
2978 \
2979 netdev_dpdk_alloc, \
2980 CONSTRUCT, \
2981 DESTRUCT, \
2982 netdev_dpdk_dealloc, \
2983 netdev_dpdk_get_config, \
2984 SET_CONFIG, \
2985 NULL, /* get_tunnel_config */ \
2986 NULL, /* build header */ \
2987 NULL, /* push header */ \
2988 NULL, /* pop header */ \
2989 netdev_dpdk_get_numa_id, /* get_numa_id */ \
2990 SET_TX_MULTIQ, \
2991 \
2992 SEND, /* send */ \
2993 NULL, /* send_wait */ \
2994 \
2995 netdev_dpdk_set_etheraddr, \
2996 netdev_dpdk_get_etheraddr, \
2997 netdev_dpdk_get_mtu, \
2998 netdev_dpdk_set_mtu, \
2999 netdev_dpdk_get_ifindex, \
3000 GET_CARRIER, \
3001 netdev_dpdk_get_carrier_resets, \
3002 netdev_dpdk_set_miimon, \
3003 GET_STATS, \
3004 GET_FEATURES, \
3005 NULL, /* set_advertisements */ \
3006 \
3007 netdev_dpdk_set_policing, \
3008 netdev_dpdk_get_qos_types, \
3009 NULL, /* get_qos_capabilities */ \
3010 netdev_dpdk_get_qos, \
3011 netdev_dpdk_set_qos, \
3012 NULL, /* get_queue */ \
3013 NULL, /* set_queue */ \
3014 NULL, /* delete_queue */ \
3015 NULL, /* get_queue_stats */ \
3016 NULL, /* queue_dump_start */ \
3017 NULL, /* queue_dump_next */ \
3018 NULL, /* queue_dump_done */ \
3019 NULL, /* dump_queue_stats */ \
3020 \
3021 NULL, /* set_in4 */ \
3022 NULL, /* get_addr_list */ \
3023 NULL, /* add_router */ \
3024 NULL, /* get_next_hop */ \
3025 GET_STATUS, \
3026 NULL, /* arp_lookup */ \
3027 \
3028 netdev_dpdk_update_flags, \
3029 RECONFIGURE, \
3030 \
3031 netdev_dpdk_rxq_alloc, \
3032 netdev_dpdk_rxq_construct, \
3033 netdev_dpdk_rxq_destruct, \
3034 netdev_dpdk_rxq_dealloc, \
3035 RXQ_RECV, \
3036 NULL, /* rx_wait */ \
3037 NULL, /* rxq_drain */ \
3038 }
3039
3040 static int
3041 process_vhost_flags(char *flag, char *default_val, int size,
3042 const struct smap *ovs_other_config,
3043 char **new_val)
3044 {
3045 const char *val;
3046 int changed = 0;
3047
3048 val = smap_get(ovs_other_config, flag);
3049
3050 /* Depending on which version of vhost is in use, process the vhost-specific
3051 * flag if it is provided; otherwise fall back to the default value.
3052 */
3053 if (val && (strlen(val) <= size)) {
3054 changed = 1;
3055 *new_val = xstrdup(val);
3056 VLOG_INFO("User-provided %s in use: %s", flag, *new_val);
3057 } else {
3058 VLOG_INFO("No %s provided - defaulting to %s", flag, default_val);
3059 *new_val = default_val;
3060 }
3061
3062 return changed;
3063 }
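
/* Example (value is hypothetical): with other_config:vhost-sock-dir=my_sockets
 * this helper returns 1 and sets *new_val to "my_sockets", which dpdk_init__()
 * appends to ovs_rundir() (components containing ".." are rejected there).  If
 * the key is absent, the supplied default is used and 0 is returned. */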
3064
3065 static char **
3066 grow_argv(char ***argv, size_t cur_siz, size_t grow_by)
3067 {
3068 return xrealloc(*argv, sizeof(char *) * (cur_siz + grow_by));
3069 }
3070
3071 static void
3072 dpdk_option_extend(char ***argv, int argc, const char *option,
3073 const char *value)
3074 {
3075 char **newargv = grow_argv(argv, argc, 2);
3076 *argv = newargv;
3077 newargv[argc] = xstrdup(option);
3078 newargv[argc+1] = xstrdup(value);
3079 }
3080
3081 static char **
3082 move_argv(char ***argv, size_t cur_size, char **src_argv, size_t src_argc)
3083 {
3084 char **newargv = grow_argv(argv, cur_size, src_argc);
3085 while (src_argc--) {
3086 newargv[cur_size+src_argc] = src_argv[src_argc];
3087 src_argv[src_argc] = NULL;
3088 }
3089 return newargv;
3090 }
3091
3092 static int
3093 extra_dpdk_args(const char *ovs_extra_config, char ***argv, int argc)
3094 {
3095 int ret = argc;
3096 char *release_tok = xstrdup(ovs_extra_config);
3097 char *tok, *endptr = NULL;
3098
3099 for (tok = strtok_r(release_tok, " ", &endptr); tok != NULL;
3100 tok = strtok_r(NULL, " ", &endptr)) {
3101 char **newarg = grow_argv(argv, ret, 1);
3102 *argv = newarg;
3103 newarg[ret++] = xstrdup(tok);
3104 }
3105 free(release_tok);
3106 return ret;
3107 }
3108
3109 static bool
3110 argv_contains(char **argv_haystack, const size_t argc_haystack,
3111 const char *needle)
3112 {
3113 for (size_t i = 0; i < argc_haystack; ++i) {
3114 if (!strcmp(argv_haystack[i], needle))
3115 return true;
3116 }
3117 return false;
3118 }
3119
3120 static int
3121 construct_dpdk_options(const struct smap *ovs_other_config,
3122 char ***argv, const int initial_size,
3123 char **extra_args, const size_t extra_argc)
3124 {
3125 struct dpdk_options_map {
3126 const char *ovs_configuration;
3127 const char *dpdk_option;
3128 bool default_enabled;
3129 const char *default_value;
3130 } opts[] = {
3131 {"dpdk-lcore-mask", "-c", false, NULL},
3132 {"dpdk-hugepage-dir", "--huge-dir", false, NULL},
3133 };
3134
3135 int i, ret = initial_size;
3136
3137 /* First, construct from the flat options (non-mutually-exclusive). */
3138 for (i = 0; i < ARRAY_SIZE(opts); ++i) {
3139 const char *lookup = smap_get(ovs_other_config,
3140 opts[i].ovs_configuration);
3141 if (!lookup && opts[i].default_enabled) {
3142 lookup = opts[i].default_value;
3143 }
3144
3145 if (lookup) {
3146 if (!argv_contains(extra_args, extra_argc, opts[i].dpdk_option)) {
3147 dpdk_option_extend(argv, ret, opts[i].dpdk_option, lookup);
3148 ret += 2;
3149 } else {
3150 VLOG_WARN("Ignoring database defined option '%s' due to "
3151 "dpdk_extras config", opts[i].dpdk_option);
3152 }
3153 }
3154 }
3155
3156 return ret;
3157 }
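
/* Example of the mapping above (values are illustrative): setting
 * other_config:dpdk-lcore-mask=0x2 and other_config:dpdk-hugepage-dir=/dev/hugepages
 * appends "-c 0x2 --huge-dir /dev/hugepages" to the EAL argument list, unless
 * the same EAL option was already supplied through dpdk-extra, in which case
 * the database value is ignored with a warning. */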
3158
3159 #define MAX_DPDK_EXCL_OPTS 10
3160
3161 static int
3162 construct_dpdk_mutex_options(const struct smap *ovs_other_config,
3163 char ***argv, const int initial_size,
3164 char **extra_args, const size_t extra_argc)
3165 {
3166 struct dpdk_exclusive_options_map {
3167 const char *category;
3168 const char *ovs_dpdk_options[MAX_DPDK_EXCL_OPTS];
3169 const char *eal_dpdk_options[MAX_DPDK_EXCL_OPTS];
3170 const char *default_value;
3171 int default_option;
3172 } excl_opts[] = {
3173 {"memory type",
3174 {"dpdk-alloc-mem", "dpdk-socket-mem", NULL,},
3175 {"-m", "--socket-mem", NULL,},
3176 "1024,0", 1
3177 },
3178 };
3179
3180 int i, ret = initial_size;
3181 for (i = 0; i < ARRAY_SIZE(excl_opts); ++i) {
3182 int found_opts = 0, scan, found_pos = -1;
3183 const char *found_value;
3184 struct dpdk_exclusive_options_map *popt = &excl_opts[i];
3185
3186 for (scan = 0; scan < MAX_DPDK_EXCL_OPTS
3187 && popt->ovs_dpdk_options[scan]; ++scan) {
3188 const char *lookup = smap_get(ovs_other_config,
3189 popt->ovs_dpdk_options[scan]);
3190 if (lookup && strlen(lookup)) {
3191 found_opts++;
3192 found_pos = scan;
3193 found_value = lookup;
3194 }
3195 }
3196
3197 if (!found_opts) {
3198 if (popt->default_option) {
3199 found_pos = popt->default_option;
3200 found_value = popt->default_value;
3201 } else {
3202 continue;
3203 }
3204 }
3205
3206 if (found_opts > 1) {
3207 VLOG_ERR("Multiple defined options for %s. Please check your"
3208 " database settings and reconfigure if necessary.",
3209 popt->category);
3210 }
3211
3212 if (!argv_contains(extra_args, extra_argc,
3213 popt->eal_dpdk_options[found_pos])) {
3214 dpdk_option_extend(argv, ret, popt->eal_dpdk_options[found_pos],
3215 found_value);
3216 ret += 2;
3217 } else {
3218 VLOG_WARN("Ignoring database defined option '%s' due to "
3219 "dpdk_extras config", popt->eal_dpdk_options[found_pos]);
3220 }
3221 }
3222
3223 return ret;
3224 }
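
/* Example (values are illustrative): other_config:dpdk-socket-mem=1024,1024
 * becomes "--socket-mem 1024,1024" and other_config:dpdk-alloc-mem=2048
 * becomes "-m 2048".  If neither key is set, the default "--socket-mem 1024,0"
 * is used; if both are set, an error is logged and the dpdk-socket-mem value
 * wins. */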
3225
3226 static int
3227 get_dpdk_args(const struct smap *ovs_other_config, char ***argv,
3228 int argc)
3229 {
3230 const char *extra_configuration;
3231 char **extra_args = NULL;
3232 int i;
3233 size_t extra_argc = 0;
3234
3235 extra_configuration = smap_get(ovs_other_config, "dpdk-extra");
3236 if (extra_configuration) {
3237 extra_argc = extra_dpdk_args(extra_configuration, &extra_args, 0);
3238 }
3239
3240 i = construct_dpdk_options(ovs_other_config, argv, argc, extra_args,
3241 extra_argc);
3242 i = construct_dpdk_mutex_options(ovs_other_config, argv, i, extra_args,
3243 extra_argc);
3244
3245 if (extra_configuration) {
3246 *argv = move_argv(argv, i, extra_args, extra_argc);
3247 }
3248
3249 return i + extra_argc;
3250 }
3251
3252 static char **dpdk_argv;
3253 static int dpdk_argc;
3254
3255 static void
3256 deferred_argv_release(void)
3257 {
3258 int result;
3259 for (result = 0; result < dpdk_argc; ++result) {
3260 free(dpdk_argv[result]);
3261 }
3262
3263 free(dpdk_argv);
3264 }
3265
3266 static void
3267 dpdk_init__(const struct smap *ovs_other_config)
3268 {
3269 char **argv = NULL;
3270 int result;
3271 int argc, argc_tmp;
3272 bool auto_determine = true;
3273 int err = 0;
3274 cpu_set_t cpuset;
3275 #ifndef VHOST_CUSE
3276 char *sock_dir_subcomponent;
3277 #endif
3278
3279 if (!smap_get_bool(ovs_other_config, "dpdk-init", false)) {
3280 VLOG_INFO("DPDK Disabled - to change this requires a restart.\n");
3281 return;
3282 }
3283
3284 VLOG_INFO("DPDK Enabled, initializing");
3285
3286 #ifdef VHOST_CUSE
3287 if (process_vhost_flags("cuse-dev-name", xstrdup("vhost-net"),
3288 PATH_MAX, ovs_other_config, &cuse_dev_name)) {
3289 #else
3290 if (process_vhost_flags("vhost-sock-dir", xstrdup(ovs_rundir()),
3291 NAME_MAX, ovs_other_config,
3292 &sock_dir_subcomponent)) {
3293 struct stat s;
3294 if (!strstr(sock_dir_subcomponent, "..")) {
3295 vhost_sock_dir = xasprintf("%s/%s", ovs_rundir(),
3296 sock_dir_subcomponent);
3297
3298 err = stat(vhost_sock_dir, &s);
3299 if (err) {
3300 VLOG_ERR("vhost-user sock directory '%s' does not exist.",
3301 vhost_sock_dir);
3302 }
3303 } else {
3304 vhost_sock_dir = xstrdup(ovs_rundir());
3305 VLOG_ERR("vhost-user sock directory request '%s/%s' has invalid"
3306 "characters '..' - using %s instead.",
3307 ovs_rundir(), sock_dir_subcomponent, ovs_rundir());
3308 }
3309 free(sock_dir_subcomponent);
3310 } else {
3311 vhost_sock_dir = sock_dir_subcomponent;
3312 #endif
3313 }
3314
3315 argv = grow_argv(&argv, 0, 1);
3316 argc = 1;
3317 argv[0] = xstrdup(ovs_get_program_name());
3318 argc_tmp = get_dpdk_args(ovs_other_config, &argv, argc);
3319
3320 while (argc_tmp != argc) {
3321 if (!strcmp("-c", argv[argc]) || !strcmp("-l", argv[argc])) {
3322 auto_determine = false;
3323 break;
3324 }
3325 argc++;
3326 }
3327 argc = argc_tmp;
3328
3329 /**
3330 * NOTE: This is an unsophisticated mechanism for determining the DPDK
3331 * lcore for the DPDK Master.
3332 */
3333 if (auto_determine) {
3334 int i;
3335 /* Get the main thread affinity */
3336 CPU_ZERO(&cpuset);
3337 err = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t),
3338 &cpuset);
3339 if (!err) {
3340 for (i = 0; i < CPU_SETSIZE; i++) {
3341 if (CPU_ISSET(i, &cpuset)) {
3342 argv = grow_argv(&argv, argc, 2);
3343 argv[argc++] = xstrdup("-c");
3344 argv[argc++] = xasprintf("0x%08llX", (1ULL<<i));
3345 i = CPU_SETSIZE;
3346 }
3347 }
3348 } else {
3349 VLOG_ERR("Thread getaffinity error %d. Using core 0x1", err);
3350 /* The user did not set dpdk-lcore-mask and we were unable to get the
3351 * current thread affinity - default to core 0x1. */
3352 argv = grow_argv(&argv, argc, 2);
3353 argv[argc++] = xstrdup("-c");
3354 argv[argc++] = xasprintf("0x%X", 1);
3355 }
3356 }
3357
3358 argv = grow_argv(&argv, argc, 1);
3359 argv[argc] = NULL;
3360
3361 optind = 1;
3362
3363 if (VLOG_IS_INFO_ENABLED()) {
3364 struct ds eal_args;
3365 int opt;
3366 ds_init(&eal_args);
3367 ds_put_cstr(&eal_args, "EAL ARGS:");
3368 for (opt = 0; opt < argc; ++opt) {
3369 ds_put_cstr(&eal_args, " ");
3370 ds_put_cstr(&eal_args, argv[opt]);
3371 }
3372 VLOG_INFO("%s", ds_cstr_ro(&eal_args));
3373 ds_destroy(&eal_args);
3374 }
3375
3376 /* Make sure things are initialized ... */
3377 result = rte_eal_init(argc, argv);
3378 if (result < 0) {
3379 ovs_abort(result, "Cannot init EAL");
3380 }
3381
3382 /* Set the main thread affinity back to pre rte_eal_init() value */
3383 if (auto_determine && !err) {
3384 err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t),
3385 &cpuset);
3386 if (err) {
3387 VLOG_ERR("Thread setaffinity error %d", err);
3388 }
3389 }
3390
3391 dpdk_argv = argv;
3392 dpdk_argc = argc;
3393
3394 atexit(deferred_argv_release);
3395
3396 rte_memzone_dump(stdout);
3397 rte_eal_init_ret = 0;
3398
3399 /* We are called from the main thread here */
3400 RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
3401
3402 ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
3403
3404 #ifdef VHOST_CUSE
3405 /* Register CUSE device to handle IOCTLs.
3406 * Unless otherwise specified, cuse_dev_name is set to vhost-net.
3407 */
3408 err = rte_vhost_driver_register(cuse_dev_name, 0);
3409
3410 if (err != 0) {
3411 VLOG_ERR("CUSE device setup failure.");
3412 return;
3413 }
3414 #endif
3415
3416 dpdk_vhost_class_init();
3417
3418 #ifdef DPDK_PDUMP
3419 VLOG_INFO("DPDK pdump packet capture enabled");
3420 err = rte_pdump_init(ovs_rundir());
3421 if (err) {
3422 VLOG_INFO("Error initialising DPDK pdump");
3423 rte_pdump_uninit();
3424 } else {
3425 char *server_socket_path;
3426
3427 server_socket_path = xasprintf("%s/%s", ovs_rundir(),
3428 "pdump_server_socket");
3429 fatal_signal_add_file_to_unlink(server_socket_path);
3430 free(server_socket_path);
3431 }
3432 #endif
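
/* With DPDK_PDUMP compiled in, traffic on DPDK ports can be captured from a
 * running ovs-vswitchd with DPDK's pdump secondary process, roughly
 * (illustrative; binary and socket paths depend on the installation):
 *
 *     dpdk-pdump -- --pdump port=0,queue=*,rx-dev=/tmp/pkts.pcap \
 *         --server-socket-path=/usr/local/var/run/openvswitch
 *
 * The server socket path must match the ovs_rundir() passed to
 * rte_pdump_init() above. */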
3433
3434 /* Finally, register the dpdk classes */
3435 netdev_dpdk_register();
3436 }
3437
3438 void
3439 dpdk_init(const struct smap *ovs_other_config)
3440 {
3441 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3442
3443 if (ovs_other_config && ovsthread_once_start(&once)) {
3444 dpdk_init__(ovs_other_config);
3445 ovsthread_once_done(&once);
3446 }
3447 }
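
/* DPDK initialization is driven entirely from the database, e.g.
 * (values are illustrative):
 *
 *     ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-init=true \
 *         other_config:dpdk-lcore-mask=0x1 other_config:dpdk-socket-mem=1024,0
 *
 * dpdk_init__() runs only once, on the first configuration pass, so changing
 * dpdk-init afterwards requires restarting ovs-vswitchd, as the log message
 * above notes. */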
3448
3449 static const struct netdev_class dpdk_class =
3450 NETDEV_DPDK_CLASS(
3451 "dpdk",
3452 NULL,
3453 netdev_dpdk_construct,
3454 netdev_dpdk_destruct,
3455 netdev_dpdk_set_config,
3456 netdev_dpdk_set_tx_multiq,
3457 netdev_dpdk_eth_send,
3458 netdev_dpdk_get_carrier,
3459 netdev_dpdk_get_stats,
3460 netdev_dpdk_get_features,
3461 netdev_dpdk_get_status,
3462 netdev_dpdk_reconfigure,
3463 netdev_dpdk_rxq_recv);
3464
3465 static const struct netdev_class dpdk_ring_class =
3466 NETDEV_DPDK_CLASS(
3467 "dpdkr",
3468 NULL,
3469 netdev_dpdk_ring_construct,
3470 netdev_dpdk_destruct,
3471 netdev_dpdk_set_config,
3472 netdev_dpdk_set_tx_multiq,
3473 netdev_dpdk_ring_send,
3474 netdev_dpdk_get_carrier,
3475 netdev_dpdk_get_stats,
3476 netdev_dpdk_get_features,
3477 netdev_dpdk_get_status,
3478 netdev_dpdk_reconfigure,
3479 netdev_dpdk_rxq_recv);
3480
3481 static const struct netdev_class OVS_UNUSED dpdk_vhost_cuse_class =
3482 NETDEV_DPDK_CLASS(
3483 "dpdkvhostcuse",
3484 dpdk_vhost_cuse_class_init,
3485 netdev_dpdk_vhost_cuse_construct,
3486 netdev_dpdk_vhost_destruct,
3487 NULL,
3488 NULL,
3489 netdev_dpdk_vhost_send,
3490 netdev_dpdk_vhost_get_carrier,
3491 netdev_dpdk_vhost_get_stats,
3492 NULL,
3493 NULL,
3494 netdev_dpdk_vhost_cuse_reconfigure,
3495 netdev_dpdk_vhost_rxq_recv);
3496
3497 static const struct netdev_class OVS_UNUSED dpdk_vhost_user_class =
3498 NETDEV_DPDK_CLASS(
3499 "dpdkvhostuser",
3500 dpdk_vhost_user_class_init,
3501 netdev_dpdk_vhost_user_construct,
3502 netdev_dpdk_vhost_destruct,
3503 NULL,
3504 NULL,
3505 netdev_dpdk_vhost_send,
3506 netdev_dpdk_vhost_get_carrier,
3507 netdev_dpdk_vhost_get_stats,
3508 NULL,
3509 NULL,
3510 netdev_dpdk_vhost_user_reconfigure,
3511 netdev_dpdk_vhost_rxq_recv);
3512
3513 void
3514 netdev_dpdk_register(void)
3515 {
3516 dpdk_common_init();
3517 netdev_register_provider(&dpdk_class);
3518 netdev_register_provider(&dpdk_ring_class);
3519 #ifdef VHOST_CUSE
3520 netdev_register_provider(&dpdk_vhost_cuse_class);
3521 #else
3522 netdev_register_provider(&dpdk_vhost_user_class);
3523 #endif
3524 }
3525
3526 void
3527 dpdk_set_lcore_id(unsigned cpu)
3528 {
3529 /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
3530 ovs_assert(cpu != NON_PMD_CORE_ID);
3531 RTE_PER_LCORE(_lcore_id) = cpu;
3532 }
3533
3534 static bool
3535 dpdk_thread_is_pmd(void)
3536 {
3537 return rte_lcore_id() != NON_PMD_CORE_ID;
3538 }