lib/netdev-dpdk.c
1 /*
2 * Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "netdev-dpdk.h"
19
20 #include <errno.h>
21 #include <signal.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <unistd.h>
25 #include <linux/virtio_net.h>
26 #include <sys/socket.h>
27 #include <linux/if.h>
28
29 #include <rte_bus_pci.h>
30 #include <rte_config.h>
31 #include <rte_cycles.h>
32 #include <rte_errno.h>
33 #include <rte_eth_ring.h>
34 #include <rte_ethdev.h>
35 #include <rte_flow.h>
36 #include <rte_malloc.h>
37 #include <rte_mbuf.h>
38 #include <rte_meter.h>
39 #include <rte_pci.h>
40 #include <rte_version.h>
41 #include <rte_vhost.h>
42
43 #include "cmap.h"
44 #include "coverage.h"
45 #include "dirs.h"
46 #include "dp-packet.h"
47 #include "dpdk.h"
48 #include "dpif-netdev.h"
49 #include "fatal-signal.h"
50 #include "netdev-provider.h"
51 #include "netdev-vport.h"
52 #include "odp-util.h"
53 #include "openvswitch/dynamic-string.h"
54 #include "openvswitch/list.h"
55 #include "openvswitch/match.h"
56 #include "openvswitch/ofp-print.h"
57 #include "openvswitch/shash.h"
58 #include "openvswitch/vlog.h"
59 #include "ovs-numa.h"
60 #include "ovs-rcu.h"
61 #include "ovs-thread.h"
62 #include "packets.h"
63 #include "smap.h"
64 #include "sset.h"
65 #include "timeval.h"
66 #include "unaligned.h"
67 #include "unixctl.h"
68 #include "util.h"
69 #include "uuid.h"
70
71 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
72
73 VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
74 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
75
76 COVERAGE_DEFINE(vhost_tx_contention);
77
78 #define DPDK_PORT_WATCHDOG_INTERVAL 5
79
80 #define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
81 #define OVS_VPORT_DPDK "ovs_dpdk"
82
83 /*
84 * need to reserve tons of extra space in the mbufs so we can align the
85 * DMA addresses to 4KB.
86 * The minimum mbuf size is limited to avoid scatter behaviour and drop in
87 * performance for standard Ethernet MTU.
88 */
89 #define ETHER_HDR_MAX_LEN (ETHER_HDR_LEN + ETHER_CRC_LEN \
90 + (2 * VLAN_HEADER_LEN))
91 #define MTU_TO_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
92 #define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN)
93 #define FRAME_LEN_TO_MTU(frame_len) ((frame_len) \
94 - ETHER_HDR_LEN - ETHER_CRC_LEN)
95 #define NETDEV_DPDK_MBUF_ALIGN 1024
96 #define NETDEV_DPDK_MAX_PKT_LEN 9728
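/* Worked example (assuming the usual 14-byte Ethernet header, 4-byte CRC
 * and 4-byte VLAN tag sizes):
 *
 *   MTU_TO_FRAME_LEN(1500)     = 1500 + 14 + 4           = 1518
 *   MTU_TO_MAX_FRAME_LEN(1500) = 1500 + 14 + 4 + (2 * 4) = 1526
 *   FRAME_LEN_TO_MTU(9728)     = 9728 - 14 - 4           = 9710
 *
 * so NETDEV_DPDK_MAX_PKT_LEN corresponds to a maximum MTU of 9710 bytes. */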
97
98 /* Max and min number of packets in the mempool. OVS tries to allocate a
99 * mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
100 * enough hugepages) we keep halving the number until the allocation succeeds
101 * or we reach MIN_NB_MBUF */
102
103 #define MAX_NB_MBUF (4096 * 64)
104 #define MIN_NB_MBUF (4096 * 4)
105 #define MP_CACHE_SZ RTE_MEMPOOL_CACHE_MAX_SIZE
106
107 /* MAX_NB_MBUF can be divided by 2 many times, until MIN_NB_MBUF */
108 BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF)
109 == 0);
110
111 /* The smallest possible NB_MBUF that we're going to try should be a multiple
112 * of MP_CACHE_SZ. This is advised by DPDK documentation. */
113 BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF))
114 % MP_CACHE_SZ == 0);
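/* With the values above, MAX_NB_MBUF = 262144 and MIN_NB_MBUF = 16384, so the
 * halving sequence tried at allocation time is 262144, 131072, 65536, 32768,
 * 16384. Assuming RTE_MEMPOOL_CACHE_MAX_SIZE is 512 (its value in current
 * DPDK releases), every step, including the smallest, is a multiple of
 * MP_CACHE_SZ, which is what the two assertions above check. */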
115
116 #define SOCKET0 0
117
118 /* Default size of Physical NIC RXQ */
119 #define NIC_PORT_DEFAULT_RXQ_SIZE 2048
120 /* Default size of Physical NIC TXQ */
121 #define NIC_PORT_DEFAULT_TXQ_SIZE 2048
122 /* Maximum size of Physical NIC Queues */
123 #define NIC_PORT_MAX_Q_SIZE 4096
124
125 #define OVS_VHOST_MAX_QUEUE_NUM 1024 /* Maximum number of vHost TX queues. */
126 #define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
127 #define OVS_VHOST_QUEUE_DISABLED (-2) /* Queue was disabled by guest and not
128 * yet mapped to another queue. */
129
130 #define DPDK_ETH_PORT_ID_INVALID RTE_MAX_ETHPORTS
131
132 /* DPDK library uses uint16_t for port_id. */
133 typedef uint16_t dpdk_port_t;
134 #define DPDK_PORT_ID_FMT "%"PRIu16
135
136 /* Minimum number of vhost tx retries; 0 effectively disables retrying. */
137 #define VHOST_ENQ_RETRY_MIN 0
138 /* Maximum number of vhost tx retries. */
139 #define VHOST_ENQ_RETRY_MAX 32
140 /* Legacy default value for vhost tx retries. */
141 #define VHOST_ENQ_RETRY_DEF 8
142
143 #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
144
145 static const struct rte_eth_conf port_conf = {
146 .rxmode = {
147 .mq_mode = ETH_MQ_RX_RSS,
148 .split_hdr_size = 0,
149 .offloads = 0,
150 },
151 .rx_adv_conf = {
152 .rss_conf = {
153 .rss_key = NULL,
154 .rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
155 },
156 },
157 .txmode = {
158 .mq_mode = ETH_MQ_TX_NONE,
159 },
160 };
161
162 /*
163 * These callbacks allow virtio-net devices to be added to vhost ports when
164 * configuration has been fully completed.
165 */
166 static int new_device(int vid);
167 static void destroy_device(int vid);
168 static int vring_state_changed(int vid, uint16_t queue_id, int enable);
169 static void destroy_connection(int vid);
170 static const struct vhost_device_ops virtio_net_device_ops =
171 {
172 .new_device = new_device,
173 .destroy_device = destroy_device,
174 .vring_state_changed = vring_state_changed,
175 .features_changed = NULL,
176 .new_connection = NULL,
177 .destroy_connection = destroy_connection,
178 };
179
180 /* Custom software stats for dpdk ports */
181 struct netdev_dpdk_sw_stats {
182 /* No. of retries when unable to transmit. */
183 uint64_t tx_retries;
184 /* Packet drops when unable to transmit; probably the Tx queue is full. */
185 uint64_t tx_failure_drops;
186 /* Packet length greater than device MTU. */
187 uint64_t tx_mtu_exceeded_drops;
188 /* Packet drops in egress policer processing. */
189 uint64_t tx_qos_drops;
190 /* Packet drops in ingress policer processing. */
191 uint64_t rx_qos_drops;
192 };
193
194 enum { DPDK_RING_SIZE = 256 };
195 BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
196 enum { DRAIN_TSC = 200000ULL };
197
198 enum dpdk_dev_type {
199 DPDK_DEV_ETH = 0,
200 DPDK_DEV_VHOST = 1,
201 };
202
203 /* Quality of Service */
204
205 /* An instance of a QoS configuration. Always associated with a particular
206 * network device.
207 *
208 * Each QoS implementation subclasses this with whatever additional data it
209 * needs.
210 */
211 struct qos_conf {
212 const struct dpdk_qos_ops *ops;
213 rte_spinlock_t lock;
214 };
215
216 /* A particular implementation of dpdk QoS operations.
217 *
218 * The functions below return 0 if successful or a positive errno value on
219 * failure, except where otherwise noted. All of them must be provided, except
220 * where otherwise noted.
221 */
222 struct dpdk_qos_ops {
223
224 /* Name of the QoS type */
225 const char *qos_name;
226
227 /* Called to construct a qos_conf object. The implementation should make
228 * the appropriate calls to configure QoS according to 'details'.
229 *
230 * The contents of 'details' should be documented as valid for 'ovs_name'
231 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
232 * (which is built as ovs-vswitchd.conf.db(8)).
233 *
234 * This function must return 0 if and only if it sets '*conf' to an
235 * initialized 'struct qos_conf'.
236 *
237 * For all QoS implementations it should always be non-null.
238 */
239 int (*qos_construct)(const struct smap *details, struct qos_conf **conf);
240
241 /* Destroys the data structures allocated by the implementation as part of
242 * 'qos_conf'.
243 *
244 * For all QoS implementations it should always be non-null.
245 */
246 void (*qos_destruct)(struct qos_conf *conf);
247
248 /* Retrieves details of 'conf' configuration into 'details'.
249 *
250 * The contents of 'details' should be documented as valid for 'ovs_name'
251 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
252 * (which is built as ovs-vswitchd.conf.db(8)).
253 */
254 int (*qos_get)(const struct qos_conf *conf, struct smap *details);
255
256 /* Returns true if 'conf' is already configured according to 'details'.
257 *
258 * The contents of 'details' should be documented as valid for 'ovs_name'
259 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
260 * (which is built as ovs-vswitchd.conf.db(8)).
261 *
262 * For all QoS implementations it should always be non-null.
263 */
264 bool (*qos_is_equal)(const struct qos_conf *conf,
265 const struct smap *details);
266
267 /* Modify an array of rte_mbufs. The modification is specific to
268 * each qos implementation.
269 *
270 * The function should take an array of mbufs and an int representing
271 * the current number of mbufs present in the array.
272 *
273 * After the function has performed a qos modification to the array of
274 * mbufs it returns an int representing the number of mbufs now present in
275 * the array. This value can then be passed to the port send function
276 * along with the modified array for transmission.
277 *
278 * For all QoS implementations it should always be non-null.
279 */
280 int (*qos_run)(struct qos_conf *qos_conf, struct rte_mbuf **pkts,
281 int pkt_cnt, bool should_steal);
282 };
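/* A minimal sketch of how an implementation plugs into this interface
 * (hypothetical example, not part of OVS; the only in-tree implementation is
 * egress_policer_ops below). A "noop" policer that never drops packets could
 * look roughly like:
 *
 *     static int
 *     noop_construct(const struct smap *details OVS_UNUSED,
 *                    struct qos_conf **conf)
 *     {
 *         struct qos_conf *c = xmalloc(sizeof *c);
 *
 *         c->ops = &noop_ops;              (forward-declared elsewhere)
 *         rte_spinlock_init(&c->lock);
 *         *conf = c;
 *         return 0;
 *     }
 *
 *     static const struct dpdk_qos_ops noop_ops = {
 *         .qos_name = "noop",
 *         .qos_construct = noop_construct,
 *         .qos_destruct = noop_destruct,
 *         .qos_get = noop_get,
 *         .qos_is_equal = noop_is_equal,
 *         .qos_run = noop_run,             (returns pkt_cnt unchanged)
 *     };
 *
 * The new ops pointer would also have to be appended to the qos_confs[]
 * array below. */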
283
284 /* dpdk_qos_ops for each type of user space QoS implementation */
285 static const struct dpdk_qos_ops egress_policer_ops;
286
287 /*
288 * Array of dpdk_qos_ops, contains pointer to all supported QoS
289 * operations.
290 */
291 static const struct dpdk_qos_ops *const qos_confs[] = {
292 &egress_policer_ops,
293 NULL
294 };
295
296 static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;
297
298 /* Contains all 'struct dpdk_dev's. */
299 static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
300 = OVS_LIST_INITIALIZER(&dpdk_list);
301
302 static struct ovs_mutex dpdk_mp_mutex OVS_ACQ_AFTER(dpdk_mutex)
303 = OVS_MUTEX_INITIALIZER;
304
305 /* Contains all 'struct dpdk_mp's. */
306 static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mp_mutex)
307 = OVS_LIST_INITIALIZER(&dpdk_mp_list);
308
309 struct dpdk_mp {
310 struct rte_mempool *mp;
311 int mtu;
312 int socket_id;
313 int refcount;
314 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex);
315 };
316
317 /* There should be one 'struct dpdk_tx_queue' created for
318 * each netdev tx queue. */
319 struct dpdk_tx_queue {
320 /* Padding to make dpdk_tx_queue exactly one cache line long. */
321 PADDED_MEMBERS(CACHE_LINE_SIZE,
322 /* Protects the members and the NIC queue from concurrent access.
323 * It is used only if the queue is shared among different pmd threads
324 * (see 'concurrent_txq'). */
325 rte_spinlock_t tx_lock;
326         /* Mapping of a configured vhost-user queue to the queue enabled by
327          * the guest. */
327 int map;
328 );
329 };
330
331 /* DPDK has no way to remove DPDK ring ethernet devices,
332    so we have to keep them around once they've been created.
333 */
334
335 static struct ovs_list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
336 = OVS_LIST_INITIALIZER(&dpdk_ring_list);
337
338 struct dpdk_ring {
339 /* For the client rings */
340 struct rte_ring *cring_tx;
341 struct rte_ring *cring_rx;
342 unsigned int user_port_id; /* User given port no, parsed from port name */
343 dpdk_port_t eth_port_id; /* ethernet device port id */
344 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
345 };
346
347 struct ingress_policer {
348 struct rte_meter_srtcm_params app_srtcm_params;
349 struct rte_meter_srtcm in_policer;
350 struct rte_meter_srtcm_profile in_prof;
351 rte_spinlock_t policer_lock;
352 };
353
354 enum dpdk_hw_ol_features {
355 NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
356 NETDEV_RX_HW_CRC_STRIP = 1 << 1,
357 NETDEV_RX_HW_SCATTER = 1 << 2
358 };
359
360 /*
361  * To avoid confusion in variable names, the following naming convention
362  * should be used, if possible:
363 *
364 * 'struct netdev' : 'netdev'
365 * 'struct netdev_dpdk' : 'dev'
366 * 'struct netdev_rxq' : 'rxq'
367 * 'struct netdev_rxq_dpdk' : 'rx'
368 *
369 * Example:
370 * struct netdev *netdev = netdev_from_name(name);
371 * struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
372 *
373 * Also, 'netdev' should be used instead of 'dev->up', where 'netdev' was
374 * already defined.
375 */
376
377 struct netdev_dpdk {
378 PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
379 dpdk_port_t port_id;
380
381         /* If true, device was attached by rte_dev_probe(). */
382 bool attached;
383 /* If true, rte_eth_dev_start() was successfully called */
384 bool started;
385 struct eth_addr hwaddr;
386 int mtu;
387 int socket_id;
388 int buf_size;
389 int max_packet_len;
390 enum dpdk_dev_type type;
391 enum netdev_flags flags;
392 int link_reset_cnt;
393 union {
394 /* Device arguments for dpdk ports. */
395 char *devargs;
396 /* Identifier used to distinguish vhost devices from each other. */
397 char *vhost_id;
398 };
399 struct dpdk_tx_queue *tx_q;
400 struct rte_eth_link link;
401 );
402
403 PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
404 struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
405 struct dpdk_mp *dpdk_mp;
406
407 /* virtio identifier for vhost devices */
408 ovsrcu_index vid;
409
410 /* True if vHost device is 'up' and has been reconfigured at least once */
411 bool vhost_reconfigured;
412
413 atomic_uint8_t vhost_tx_retries_max;
414 /* 2 pad bytes here. */
415 );
416
417 PADDED_MEMBERS(CACHE_LINE_SIZE,
418 struct netdev up;
419 /* In dpdk_list. */
420 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
421
422 /* QoS configuration and lock for the device */
423 OVSRCU_TYPE(struct qos_conf *) qos_conf;
424
425 /* Ingress Policer */
426 OVSRCU_TYPE(struct ingress_policer *) ingress_policer;
427 uint32_t policer_rate;
428 uint32_t policer_burst;
429
430 /* Array of vhost rxq states, see vring_state_changed. */
431 bool *vhost_rxq_enabled;
432 );
433
434 PADDED_MEMBERS(CACHE_LINE_SIZE,
435 struct netdev_stats stats;
436 struct netdev_dpdk_sw_stats *sw_stats;
437 /* Protects stats */
438 rte_spinlock_t stats_lock;
439 /* 36 pad bytes here. */
440 );
441
442 PADDED_MEMBERS(CACHE_LINE_SIZE,
443 /* The following properties cannot be changed when a device is running,
444 * so we remember the request and update them next time
445 * netdev_dpdk*_reconfigure() is called */
446 int requested_mtu;
447 int requested_n_txq;
448 int requested_n_rxq;
449 int requested_rxq_size;
450 int requested_txq_size;
451
452 /* Number of rx/tx descriptors for physical devices */
453 int rxq_size;
454 int txq_size;
455
456 /* Socket ID detected when vHost device is brought up */
457 int requested_socket_id;
458
459 /* Denotes whether vHost port is client/server mode */
460 uint64_t vhost_driver_flags;
461
462 /* DPDK-ETH Flow control */
463 struct rte_eth_fc_conf fc_conf;
464
465 /* DPDK-ETH hardware offload features,
466 * from the enum set 'dpdk_hw_ol_features' */
467 uint32_t hw_ol_features;
468
469 /* Properties for link state change detection mode.
470 * If lsc_interrupt_mode is set to false, poll mode is used,
471 * otherwise interrupt mode is used. */
472 bool requested_lsc_interrupt_mode;
473 bool lsc_interrupt_mode;
474 );
475
476 PADDED_MEMBERS(CACHE_LINE_SIZE,
477 /* Names of all XSTATS counters */
478 struct rte_eth_xstat_name *rte_xstats_names;
479 int rte_xstats_names_size;
480 int rte_xstats_ids_size;
481 uint64_t *rte_xstats_ids;
482 );
483 };
484
485 struct netdev_rxq_dpdk {
486 struct netdev_rxq up;
487 dpdk_port_t port_id;
488 };
489
490 static void netdev_dpdk_destruct(struct netdev *netdev);
491 static void netdev_dpdk_vhost_destruct(struct netdev *netdev);
492
493 static int netdev_dpdk_get_sw_custom_stats(const struct netdev *,
494 struct netdev_custom_stats *);
495 static void netdev_dpdk_clear_xstats(struct netdev_dpdk *dev);
496
497 int netdev_dpdk_get_vid(const struct netdev_dpdk *dev);
498
499 struct ingress_policer *
500 netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);
501
502 static bool
503 is_dpdk_class(const struct netdev_class *class)
504 {
505 return class->destruct == netdev_dpdk_destruct
506 || class->destruct == netdev_dpdk_vhost_destruct;
507 }
508
509 /* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
510 * aligned at 1k or less. If a declared mbuf size is not a multiple of this
511  * value, insufficient buffers are allocated to accommodate the packet in its
512 * entirety. Furthermore, certain drivers need to ensure that there is also
513 * sufficient space in the Rx buffer to accommodate two VLAN tags (for QinQ
514 * frames). If the RX buffer is too small, then the driver enables scatter RX
515 * behaviour, which reduces performance. To prevent this, use a buffer size
516 * that is closest to 'mtu', but which satisfies the aforementioned criteria.
517 */
518 static uint32_t
519 dpdk_buf_size(int mtu)
520 {
521 return ROUND_UP(MTU_TO_MAX_FRAME_LEN(mtu), NETDEV_DPDK_MBUF_ALIGN)
522 + RTE_PKTMBUF_HEADROOM;
523 }
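/* For example, with a 1500-byte MTU (and assuming the default
 * RTE_PKTMBUF_HEADROOM of 128 bytes):
 *
 *   MTU_TO_MAX_FRAME_LEN(1500) = 1526
 *   ROUND_UP(1526, 1024)       = 2048
 *   dpdk_buf_size(1500)        = 2048 + 128 = 2176
 */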
524
525 /* Allocates an area of 'sz' bytes from DPDK. The memory is zeroed.
526 *
527 * Unlike xmalloc(), this function can return NULL on failure. */
528 static void *
529 dpdk_rte_mzalloc(size_t sz)
530 {
531 return rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);
532 }
533
534 void
535 free_dpdk_buf(struct dp_packet *p)
536 {
537 struct rte_mbuf *pkt = (struct rte_mbuf *) p;
538
539 rte_pktmbuf_free(pkt);
540 }
541
542 static void
543 ovs_rte_pktmbuf_init(struct rte_mempool *mp OVS_UNUSED,
544 void *opaque_arg OVS_UNUSED,
545 void *_p,
546 unsigned i OVS_UNUSED)
547 {
548 struct rte_mbuf *pkt = _p;
549
550 dp_packet_init_dpdk((struct dp_packet *) pkt);
551 }
552
553 static int
554 dpdk_mp_full(const struct rte_mempool *mp) OVS_REQUIRES(dpdk_mp_mutex)
555 {
556 /* At this point we want to know if all the mbufs are back
557 * in the mempool. rte_mempool_full() is not atomic but it's
558 * the best available and as we are no longer requesting mbufs
559 * from the mempool, it means mbufs will not move from
560 * 'mempool ring' --> 'mempool cache'. In rte_mempool_full()
561 * the ring is counted before caches, so we won't get false
562 * positives in this use case and we handle false negatives.
563 *
564 * If future implementations of rte_mempool_full() were to change
565 * it could be possible for a false positive. Even that would
566 * likely be ok, as there are additional checks during mempool
567      * freeing but it would make things racy.
568 */
569 return rte_mempool_full(mp);
570 }
571
572 /* Free unused mempools. */
573 static void
574 dpdk_mp_sweep(void) OVS_REQUIRES(dpdk_mp_mutex)
575 {
576 struct dpdk_mp *dmp, *next;
577
578 LIST_FOR_EACH_SAFE (dmp, next, list_node, &dpdk_mp_list) {
579 if (!dmp->refcount && dpdk_mp_full(dmp->mp)) {
580 VLOG_DBG("Freeing mempool \"%s\"", dmp->mp->name);
581 ovs_list_remove(&dmp->list_node);
582 rte_mempool_free(dmp->mp);
583 rte_free(dmp);
584 }
585 }
586 }
587
588 /* Calculating the required number of mbufs differs depending on the
589 * mempool model being used. Check if per port memory is in use before
590 * calculating.
591 */
592 static uint32_t
593 dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
594 {
595 uint32_t n_mbufs;
596
597 if (!per_port_mp) {
598         /* Shared memory is being used.
599 * XXX: this is a really rough method of provisioning memory.
600 * It's impossible to determine what the exact memory requirements are
601 * when the number of ports and rxqs that utilize a particular mempool
602 * can change dynamically at runtime. For now, use this rough
603          * heuristic.
604 */
605 if (mtu >= ETHER_MTU) {
606 n_mbufs = MAX_NB_MBUF;
607 } else {
608 n_mbufs = MIN_NB_MBUF;
609 }
610 } else {
611 /* Per port memory is being used.
612 * XXX: rough estimation of number of mbufs required for this port:
613 * <packets required to fill the device rxqs>
614 * + <packets that could be stuck on other ports txqs>
615 * + <packets in the pmd threads>
616 * + <additional memory for corner cases>
617 */
618 n_mbufs = dev->requested_n_rxq * dev->requested_rxq_size
619 + dev->requested_n_txq * dev->requested_txq_size
620 + MIN(RTE_MAX_LCORE, dev->requested_n_rxq) * NETDEV_MAX_BURST
621 + MIN_NB_MBUF;
622 }
623
624 return n_mbufs;
625 }
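/* Illustrative numbers for the per-port case: a port configured with 2 rxqs
 * and 2 txqs of 2048 descriptors each would request roughly
 *
 *   2 * 2048 + 2 * 2048 + MIN(RTE_MAX_LCORE, 2) * NETDEV_MAX_BURST
 *     + MIN_NB_MBUF
 *   = 4096 + 4096 + 64 + 16384 = 24640 mbufs
 *
 * assuming NETDEV_MAX_BURST is 32 (its usual value) and RTE_MAX_LCORE >= 2. */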
626
627 static struct dpdk_mp *
628 dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
629 {
630 char mp_name[RTE_MEMPOOL_NAMESIZE];
631 const char *netdev_name = netdev_get_name(&dev->up);
632 int socket_id = dev->requested_socket_id;
633 uint32_t n_mbufs = 0;
634 uint32_t mbuf_size = 0;
635 uint32_t aligned_mbuf_size = 0;
636 uint32_t mbuf_priv_data_len = 0;
637 uint32_t pkt_size = 0;
638 uint32_t hash = hash_string(netdev_name, 0);
639 struct dpdk_mp *dmp = NULL;
640 int ret;
641
642 dmp = dpdk_rte_mzalloc(sizeof *dmp);
643 if (!dmp) {
644 return NULL;
645 }
646 dmp->socket_id = socket_id;
647 dmp->mtu = mtu;
648 dmp->refcount = 1;
649
650 /* Get the size of each mbuf, based on the MTU */
651 mbuf_size = MTU_TO_FRAME_LEN(mtu);
652
653 n_mbufs = dpdk_calculate_mbufs(dev, mtu, per_port_mp);
654
655 do {
656 /* Full DPDK memory pool name must be unique and cannot be
657 * longer than RTE_MEMPOOL_NAMESIZE. Note that for the shared
658 * mempool case this can result in one device using a mempool
659          * which references a different device in its name. However, as
660 * mempool names are hashed, the device name will not be readable
661 * so this is not an issue for tasks such as debugging.
662 */
663 ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE,
664 "ovs%08x%02d%05d%07u",
665 hash, socket_id, mtu, n_mbufs);
666 if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) {
667 VLOG_DBG("snprintf returned %d. "
668 "Failed to generate a mempool name for \"%s\". "
669 "Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.",
670 ret, netdev_name, hash, socket_id, mtu, n_mbufs);
671 break;
672 }
673
674 VLOG_DBG("Port %s: Requesting a mempool of %u mbufs of size %u "
675 "on socket %d for %d Rx and %d Tx queues, "
676 "cache line size of %u",
677 netdev_name, n_mbufs, mbuf_size, socket_id,
678 dev->requested_n_rxq, dev->requested_n_txq,
679 RTE_CACHE_LINE_SIZE);
680
681 /* The size of the mbuf's private area (i.e. area that holds OvS'
682 * dp_packet data)*/
683 mbuf_priv_data_len = sizeof(struct dp_packet) -
684 sizeof(struct rte_mbuf);
685 /* The size of the entire dp_packet. */
686 pkt_size = sizeof(struct dp_packet) + mbuf_size;
687 /* mbuf size, rounded up to cacheline size. */
688 aligned_mbuf_size = ROUND_UP(pkt_size, RTE_CACHE_LINE_SIZE);
689 /* If there is a size discrepancy, add padding to mbuf_priv_data_len.
690 * This maintains mbuf size cache alignment, while also honoring RX
691 * buffer alignment in the data portion of the mbuf. If this adjustment
692          * is not made, there is a possibility later on that for an element of
693 * the mempool, buf, buf->data_len < (buf->buf_len - buf->data_off).
694 * This is problematic in the case of multi-segment mbufs, particularly
695 * when an mbuf segment needs to be resized (when [push|popp]ing a VLAN
696          * header, for example).
697 */
698 mbuf_priv_data_len += (aligned_mbuf_size - pkt_size);
699
700 dmp->mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ,
701 mbuf_priv_data_len,
702 mbuf_size,
703 socket_id);
704
705 if (dmp->mp) {
706 VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
707 mp_name, n_mbufs);
708 /* rte_pktmbuf_pool_create has done some initialization of the
709 * rte_mbuf part of each dp_packet, while ovs_rte_pktmbuf_init
710 * initializes some OVS specific fields of dp_packet.
711 */
712 rte_mempool_obj_iter(dmp->mp, ovs_rte_pktmbuf_init, NULL);
713 return dmp;
714 } else if (rte_errno == EEXIST) {
715 /* A mempool with the same name already exists. We just
716 * retrieve its pointer to be returned to the caller. */
717 dmp->mp = rte_mempool_lookup(mp_name);
718 /* As the mempool create returned EEXIST we can expect the
719 * lookup has returned a valid pointer. If for some reason
720 * that's not the case we keep track of it. */
721 VLOG_DBG("A mempool with name \"%s\" already exists at %p.",
722 mp_name, dmp->mp);
723 return dmp;
724 } else {
725 VLOG_DBG("Failed to create mempool \"%s\" with a request of "
726 "%u mbufs, retrying with %u mbufs",
727 mp_name, n_mbufs, n_mbufs / 2);
728 }
729 } while (!dmp->mp && rte_errno == ENOMEM && (n_mbufs /= 2) >= MIN_NB_MBUF);
730
731 VLOG_ERR("Failed to create mempool \"%s\" with a request of %u mbufs",
732 mp_name, n_mbufs);
733
734 rte_free(dmp);
735 return NULL;
736 }
737
738 static struct dpdk_mp *
739 dpdk_mp_get(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
740 {
741 struct dpdk_mp *dmp, *next;
742 bool reuse = false;
743
744 ovs_mutex_lock(&dpdk_mp_mutex);
745     /* Check if shared memory is being used; if so, check existing mempools
746 * to see if reuse is possible. */
747 if (!per_port_mp) {
748 LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
749 if (dmp->socket_id == dev->requested_socket_id
750 && dmp->mtu == mtu) {
751 VLOG_DBG("Reusing mempool \"%s\"", dmp->mp->name);
752 dmp->refcount++;
753 reuse = true;
754 break;
755 }
756 }
757 }
758 /* Sweep mempools after reuse or before create. */
759 dpdk_mp_sweep();
760
761 if (!reuse) {
762 dmp = dpdk_mp_create(dev, mtu, per_port_mp);
763 if (dmp) {
764             /* Shared memory will hit the reuse case above, so it will not
765              * request a mempool that already exists, but we need to check
766              * for the EEXIST case with per-port memory. Compare the
767              * mempool returned in dmp to each entry in dpdk_mp_list. If a
768              * match is found, free dmp as a new entry is not required, set
769              * dmp to point to the existing entry, and increment the refcount
770              * to avoid it being freed at a later stage.
771 */
772 if (per_port_mp && rte_errno == EEXIST) {
773 LIST_FOR_EACH (next, list_node, &dpdk_mp_list) {
774 if (dmp->mp == next->mp) {
775 rte_free(dmp);
776 dmp = next;
777 dmp->refcount++;
778 }
779 }
780 } else {
781 ovs_list_push_back(&dpdk_mp_list, &dmp->list_node);
782 }
783 }
784 }
785
786 ovs_mutex_unlock(&dpdk_mp_mutex);
787
788 return dmp;
789 }
790
791 /* Decrement reference to a mempool. */
792 static void
793 dpdk_mp_put(struct dpdk_mp *dmp)
794 {
795 if (!dmp) {
796 return;
797 }
798
799 ovs_mutex_lock(&dpdk_mp_mutex);
800 ovs_assert(dmp->refcount);
801 dmp->refcount--;
802 ovs_mutex_unlock(&dpdk_mp_mutex);
803 }
804
805 /* Depending on the memory model being used this function tries to
806 * identify and reuse an existing mempool or tries to allocate a new
807 * mempool on requested_socket_id with mbuf size corresponding to the
808 * requested_mtu. On success, a new configuration will be applied.
809 * On error, device will be left unchanged. */
810 static int
811 netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
812 OVS_REQUIRES(dev->mutex)
813 {
814 uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
815 struct dpdk_mp *dmp;
816 int ret = 0;
817 bool per_port_mp = dpdk_per_port_memory();
818
819     /* With shared memory we do not need to configure a mempool if the MTU
820      * and socket ID have not changed; the previous configuration is still
821      * valid, so return 0. */
822 if (!per_port_mp && dev->mtu == dev->requested_mtu
823 && dev->socket_id == dev->requested_socket_id) {
824 return ret;
825 }
826
827 dmp = dpdk_mp_get(dev, FRAME_LEN_TO_MTU(buf_size), per_port_mp);
828 if (!dmp) {
829 VLOG_ERR("Failed to create memory pool for netdev "
830 "%s, with MTU %d on socket %d: %s\n",
831 dev->up.name, dev->requested_mtu, dev->requested_socket_id,
832 rte_strerror(rte_errno));
833 ret = rte_errno;
834 } else {
835 /* Check for any pre-existing dpdk_mp for the device before accessing
836 * the associated mempool.
837 */
838 if (dev->dpdk_mp != NULL) {
839             /* A new MTU was requested, so decrement the reference count for the
840              * device's current dpdk_mp. This is required even if a pointer to
841 * same dpdk_mp is returned by dpdk_mp_get. The refcount for dmp
842 * has already been incremented by dpdk_mp_get at this stage so it
843 * must be decremented to keep an accurate refcount for the
844 * dpdk_mp.
845 */
846 dpdk_mp_put(dev->dpdk_mp);
847 }
848 dev->dpdk_mp = dmp;
849 dev->mtu = dev->requested_mtu;
850 dev->socket_id = dev->requested_socket_id;
851 dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
852 }
853
854 return ret;
855 }
856
857 static void
858 check_link_status(struct netdev_dpdk *dev)
859 {
860 struct rte_eth_link link;
861
862 rte_eth_link_get_nowait(dev->port_id, &link);
863
864 if (dev->link.link_status != link.link_status) {
865 netdev_change_seq_changed(&dev->up);
866
867 dev->link_reset_cnt++;
868 dev->link = link;
869 if (dev->link.link_status) {
870 VLOG_DBG_RL(&rl,
871 "Port "DPDK_PORT_ID_FMT" Link Up - speed %u Mbps - %s",
872 dev->port_id, (unsigned) dev->link.link_speed,
873 (dev->link.link_duplex == ETH_LINK_FULL_DUPLEX)
874 ? "full-duplex" : "half-duplex");
875 } else {
876 VLOG_DBG_RL(&rl, "Port "DPDK_PORT_ID_FMT" Link Down",
877 dev->port_id);
878 }
879 }
880 }
881
882 static void *
883 dpdk_watchdog(void *dummy OVS_UNUSED)
884 {
885 struct netdev_dpdk *dev;
886
887 pthread_detach(pthread_self());
888
889 for (;;) {
890 ovs_mutex_lock(&dpdk_mutex);
891 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
892 ovs_mutex_lock(&dev->mutex);
893 if (dev->type == DPDK_DEV_ETH) {
894 check_link_status(dev);
895 }
896 ovs_mutex_unlock(&dev->mutex);
897 }
898 ovs_mutex_unlock(&dpdk_mutex);
899 xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
900 }
901
902 return NULL;
903 }
904
905 static int
906 dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
907 {
908 int diag = 0;
909 int i;
910 struct rte_eth_conf conf = port_conf;
911 struct rte_eth_dev_info info;
912 uint16_t conf_mtu;
913
914 rte_eth_dev_info_get(dev->port_id, &info);
915
916     /* As of DPDK 17.11.1 a few PMDs require scatter to be explicitly
917      * enabled in order to support jumbo RX.
918      * Setting scatter for the device is done after checking for
919      * scatter support in the device capabilities. */
920 if (dev->mtu > ETHER_MTU) {
921 if (dev->hw_ol_features & NETDEV_RX_HW_SCATTER) {
922 conf.rxmode.offloads |= DEV_RX_OFFLOAD_SCATTER;
923 }
924 }
925
926 conf.intr_conf.lsc = dev->lsc_interrupt_mode;
927
928 if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
929 conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
930 }
931
932 if (!(dev->hw_ol_features & NETDEV_RX_HW_CRC_STRIP)
933 && info.rx_offload_capa & DEV_RX_OFFLOAD_KEEP_CRC) {
934 conf.rxmode.offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
935 }
936
937 /* Limit configured rss hash functions to only those supported
938 * by the eth device. */
939 conf.rx_adv_conf.rss_conf.rss_hf &= info.flow_type_rss_offloads;
940
941 /* A device may report more queues than it makes available (this has
942 * been observed for Intel xl710, which reserves some of them for
943 * SRIOV): rte_eth_*_queue_setup will fail if a queue is not
944      * available. When this happens we can retry the configuration
945      * and request fewer queues. */
946 while (n_rxq && n_txq) {
947 if (diag) {
948 VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
949 }
950
951 diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
952 if (diag) {
953 VLOG_WARN("Interface %s eth_dev setup error %s\n",
954 dev->up.name, rte_strerror(-diag));
955 break;
956 }
957
958 diag = rte_eth_dev_set_mtu(dev->port_id, dev->mtu);
959 if (diag) {
960             /* A device may not support rte_eth_dev_set_mtu; in this case
961              * log a warning for the user that includes the device's configured
962              * MTU value that will be used instead. */
963 if (-ENOTSUP == diag) {
964 rte_eth_dev_get_mtu(dev->port_id, &conf_mtu);
965 VLOG_WARN("Interface %s does not support MTU configuration, "
966 "max packet size supported is %"PRIu16".",
967 dev->up.name, conf_mtu);
968 } else {
969 VLOG_ERR("Interface %s MTU (%d) setup error: %s",
970 dev->up.name, dev->mtu, rte_strerror(-diag));
971 break;
972 }
973 }
974
975 for (i = 0; i < n_txq; i++) {
976 diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
977 dev->socket_id, NULL);
978 if (diag) {
979 VLOG_INFO("Interface %s unable to setup txq(%d): %s",
980 dev->up.name, i, rte_strerror(-diag));
981 break;
982 }
983 }
984
985 if (i != n_txq) {
986             /* Retry with fewer tx queues. */
987 n_txq = i;
988 continue;
989 }
990
991 for (i = 0; i < n_rxq; i++) {
992 diag = rte_eth_rx_queue_setup(dev->port_id, i, dev->rxq_size,
993 dev->socket_id, NULL,
994 dev->dpdk_mp->mp);
995 if (diag) {
996 VLOG_INFO("Interface %s unable to setup rxq(%d): %s",
997 dev->up.name, i, rte_strerror(-diag));
998 break;
999 }
1000 }
1001
1002 if (i != n_rxq) {
1003             /* Retry with fewer rx queues. */
1004 n_rxq = i;
1005 continue;
1006 }
1007
1008 dev->up.n_rxq = n_rxq;
1009 dev->up.n_txq = n_txq;
1010
1011 return 0;
1012 }
1013
1014 return diag;
1015 }
1016
1017 static void
1018 dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex)
1019 {
1020 if (rte_eth_dev_flow_ctrl_set(dev->port_id, &dev->fc_conf)) {
1021 VLOG_WARN("Failed to enable flow control on device "DPDK_PORT_ID_FMT,
1022 dev->port_id);
1023 }
1024 }
1025
1026 static int
1027 dpdk_eth_dev_init(struct netdev_dpdk *dev)
1028 OVS_REQUIRES(dev->mutex)
1029 {
1030 struct rte_pktmbuf_pool_private *mbp_priv;
1031 struct rte_eth_dev_info info;
1032 struct ether_addr eth_addr;
1033 int diag;
1034 int n_rxq, n_txq;
1035 uint32_t rx_chksm_offload_capa = DEV_RX_OFFLOAD_UDP_CKSUM |
1036 DEV_RX_OFFLOAD_TCP_CKSUM |
1037 DEV_RX_OFFLOAD_IPV4_CKSUM;
1038
1039 rte_eth_dev_info_get(dev->port_id, &info);
1040
1041 if (strstr(info.driver_name, "vf") != NULL) {
1042 VLOG_INFO("Virtual function detected, HW_CRC_STRIP will be enabled");
1043 dev->hw_ol_features |= NETDEV_RX_HW_CRC_STRIP;
1044 } else {
1045 dev->hw_ol_features &= ~NETDEV_RX_HW_CRC_STRIP;
1046 }
1047
1048 if ((info.rx_offload_capa & rx_chksm_offload_capa) !=
1049 rx_chksm_offload_capa) {
1050 VLOG_WARN("Rx checksum offload is not supported on port "
1051 DPDK_PORT_ID_FMT, dev->port_id);
1052 dev->hw_ol_features &= ~NETDEV_RX_CHECKSUM_OFFLOAD;
1053 } else {
1054 dev->hw_ol_features |= NETDEV_RX_CHECKSUM_OFFLOAD;
1055 }
1056
1057 if (info.rx_offload_capa & DEV_RX_OFFLOAD_SCATTER) {
1058 dev->hw_ol_features |= NETDEV_RX_HW_SCATTER;
1059 } else {
1060 /* Do not warn on lack of scatter support */
1061 dev->hw_ol_features &= ~NETDEV_RX_HW_SCATTER;
1062 }
1063
1064 n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
1065 n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
1066
1067 diag = dpdk_eth_dev_port_config(dev, n_rxq, n_txq);
1068 if (diag) {
1069 VLOG_ERR("Interface %s(rxq:%d txq:%d lsc interrupt mode:%s) "
1070 "configure error: %s",
1071 dev->up.name, n_rxq, n_txq,
1072 dev->lsc_interrupt_mode ? "true" : "false",
1073 rte_strerror(-diag));
1074 return -diag;
1075 }
1076
1077 diag = rte_eth_dev_start(dev->port_id);
1078 if (diag) {
1079 VLOG_ERR("Interface %s start error: %s", dev->up.name,
1080 rte_strerror(-diag));
1081 return -diag;
1082 }
1083 dev->started = true;
1084
1085 rte_eth_promiscuous_enable(dev->port_id);
1086 rte_eth_allmulticast_enable(dev->port_id);
1087
1088 memset(&eth_addr, 0x0, sizeof(eth_addr));
1089 rte_eth_macaddr_get(dev->port_id, &eth_addr);
1090 VLOG_INFO_RL(&rl, "Port "DPDK_PORT_ID_FMT": "ETH_ADDR_FMT,
1091 dev->port_id, ETH_ADDR_BYTES_ARGS(eth_addr.addr_bytes));
1092
1093 memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
1094 rte_eth_link_get_nowait(dev->port_id, &dev->link);
1095
1096 mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
1097 dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;
1098 return 0;
1099 }
1100
1101 static struct netdev_dpdk *
1102 netdev_dpdk_cast(const struct netdev *netdev)
1103 {
1104 return CONTAINER_OF(netdev, struct netdev_dpdk, up);
1105 }
1106
1107 static struct netdev *
1108 netdev_dpdk_alloc(void)
1109 {
1110 struct netdev_dpdk *dev;
1111
1112 dev = dpdk_rte_mzalloc(sizeof *dev);
1113 if (dev) {
1114 return &dev->up;
1115 }
1116
1117 return NULL;
1118 }
1119
1120 static struct dpdk_tx_queue *
1121 netdev_dpdk_alloc_txq(unsigned int n_txqs)
1122 {
1123 struct dpdk_tx_queue *txqs;
1124 unsigned i;
1125
1126 txqs = dpdk_rte_mzalloc(n_txqs * sizeof *txqs);
1127 if (txqs) {
1128 for (i = 0; i < n_txqs; i++) {
1129 /* Initialize map for vhost devices. */
1130 txqs[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
1131 rte_spinlock_init(&txqs[i].tx_lock);
1132 }
1133 }
1134
1135 return txqs;
1136 }
1137
1138 static int
1139 common_construct(struct netdev *netdev, dpdk_port_t port_no,
1140 enum dpdk_dev_type type, int socket_id)
1141 OVS_REQUIRES(dpdk_mutex)
1142 {
1143 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1144
1145 ovs_mutex_init(&dev->mutex);
1146
1147 rte_spinlock_init(&dev->stats_lock);
1148
1149     /* If the 'sid' is negative, it means that the kernel failed
1150      * to obtain the PCI NUMA info. In that situation, always
1151 * use 'SOCKET0'. */
1152 dev->socket_id = socket_id < 0 ? SOCKET0 : socket_id;
1153 dev->requested_socket_id = dev->socket_id;
1154 dev->port_id = port_no;
1155 dev->type = type;
1156 dev->flags = 0;
1157 dev->requested_mtu = ETHER_MTU;
1158 dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
1159 dev->requested_lsc_interrupt_mode = 0;
1160 ovsrcu_index_init(&dev->vid, -1);
1161 dev->vhost_reconfigured = false;
1162 dev->attached = false;
1163
1164 ovsrcu_init(&dev->qos_conf, NULL);
1165
1166 ovsrcu_init(&dev->ingress_policer, NULL);
1167 dev->policer_rate = 0;
1168 dev->policer_burst = 0;
1169
1170 netdev->n_rxq = 0;
1171 netdev->n_txq = 0;
1172 dev->requested_n_rxq = NR_QUEUE;
1173 dev->requested_n_txq = NR_QUEUE;
1174 dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
1175 dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;
1176
1177 /* Initialize the flow control to NULL */
1178 memset(&dev->fc_conf, 0, sizeof dev->fc_conf);
1179
1180     /* Initialize the hardware offload flags to 0. */
1181 dev->hw_ol_features = 0;
1182
1183 dev->flags = NETDEV_UP | NETDEV_PROMISC;
1184
1185 ovs_list_push_back(&dpdk_list, &dev->list_node);
1186
1187 netdev_request_reconfigure(netdev);
1188
1189 dev->rte_xstats_names = NULL;
1190 dev->rte_xstats_names_size = 0;
1191
1192 dev->rte_xstats_ids = NULL;
1193 dev->rte_xstats_ids_size = 0;
1194
1195 dev->sw_stats = xzalloc(sizeof *dev->sw_stats);
1196 dev->sw_stats->tx_retries = (dev->type == DPDK_DEV_VHOST) ? 0 : UINT64_MAX;
1197
1198 return 0;
1199 }
1200
1201 /* dev_name must be the prefix followed by a positive decimal number.
1202 * (no leading + or - signs are allowed) */
1203 static int
1204 dpdk_dev_parse_name(const char dev_name[], const char prefix[],
1205 unsigned int *port_no)
1206 {
1207 const char *cport;
1208
1209 if (strncmp(dev_name, prefix, strlen(prefix))) {
1210 return ENODEV;
1211 }
1212
1213 cport = dev_name + strlen(prefix);
1214
1215 if (str_to_uint(cport, 10, port_no)) {
1216 return 0;
1217 } else {
1218 return ENODEV;
1219 }
1220 }
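/* For example (illustrative):
 *   dpdk_dev_parse_name("dpdkr7", "dpdkr", &port_no) sets port_no to 7 and
 *   returns 0, whereas dpdk_dev_parse_name("dpdkr-7", "dpdkr", &port_no)
 *   and dpdk_dev_parse_name("eth0", "dpdkr", &port_no) both return ENODEV. */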
1221
1222 /* Get the number of OVS interfaces which have the same DPDK
1223 * rte device (e.g. same pci bus address).
1224 * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
1225 */
1226 static int
1227 netdev_dpdk_get_num_ports(struct rte_device *device)
1228 OVS_REQUIRES(dpdk_mutex)
1229 {
1230 struct netdev_dpdk *dev;
1231 int count = 0;
1232
1233 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
1234 if (rte_eth_devices[dev->port_id].device == device
1235 && rte_eth_devices[dev->port_id].state != RTE_ETH_DEV_UNUSED) {
1236 count++;
1237 }
1238 }
1239 return count;
1240 }
1241
1242 static int
1243 vhost_common_construct(struct netdev *netdev)
1244 OVS_REQUIRES(dpdk_mutex)
1245 {
1246 int socket_id = rte_lcore_to_socket_id(rte_get_master_lcore());
1247 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1248
1249 dev->vhost_rxq_enabled = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM *
1250 sizeof *dev->vhost_rxq_enabled);
1251 if (!dev->vhost_rxq_enabled) {
1252 return ENOMEM;
1253 }
1254 dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
1255 if (!dev->tx_q) {
1256 rte_free(dev->vhost_rxq_enabled);
1257 return ENOMEM;
1258 }
1259
1260 atomic_init(&dev->vhost_tx_retries_max, VHOST_ENQ_RETRY_DEF);
1261
1262 return common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
1263 DPDK_DEV_VHOST, socket_id);
1264 }
1265
1266 static int
1267 netdev_dpdk_vhost_construct(struct netdev *netdev)
1268 {
1269 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1270 const char *name = netdev->name;
1271 int err;
1272
1273 /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
1274 * the file system. '/' or '\' would traverse directories, so they're not
1275 * acceptable in 'name'. */
1276 if (strchr(name, '/') || strchr(name, '\\')) {
1277 VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
1278 "A valid name must not include '/' or '\\'",
1279 name);
1280 return EINVAL;
1281 }
1282
1283 ovs_mutex_lock(&dpdk_mutex);
1284 /* Take the name of the vhost-user port and append it to the location where
1285 * the socket is to be created, then register the socket.
1286 */
1287 dev->vhost_id = xasprintf("%s/%s", dpdk_get_vhost_sock_dir(), name);
1288
1289 dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
1290 err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags);
1291 if (err) {
1292 VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
1293 dev->vhost_id);
1294 goto out;
1295 } else {
1296 fatal_signal_add_file_to_unlink(dev->vhost_id);
1297 VLOG_INFO("Socket %s created for vhost-user port %s\n",
1298 dev->vhost_id, name);
1299 }
1300
1301 err = rte_vhost_driver_callback_register(dev->vhost_id,
1302 &virtio_net_device_ops);
1303 if (err) {
1304 VLOG_ERR("rte_vhost_driver_callback_register failed for vhost user "
1305 "port: %s\n", name);
1306 goto out;
1307 }
1308
1309 err = rte_vhost_driver_disable_features(dev->vhost_id,
1310 1ULL << VIRTIO_NET_F_HOST_TSO4
1311 | 1ULL << VIRTIO_NET_F_HOST_TSO6
1312 | 1ULL << VIRTIO_NET_F_CSUM);
1313 if (err) {
1314 VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
1315 "port: %s\n", name);
1316 goto out;
1317 }
1318
1319 err = rte_vhost_driver_start(dev->vhost_id);
1320 if (err) {
1321 VLOG_ERR("rte_vhost_driver_start failed for vhost user "
1322 "port: %s\n", name);
1323 goto out;
1324 }
1325
1326 err = vhost_common_construct(netdev);
1327 if (err) {
1328 VLOG_ERR("vhost_common_construct failed for vhost user "
1329 "port: %s\n", name);
1330 }
1331
1332 out:
1333 if (err) {
1334 free(dev->vhost_id);
1335 dev->vhost_id = NULL;
1336 }
1337
1338 ovs_mutex_unlock(&dpdk_mutex);
1339 VLOG_WARN_ONCE("dpdkvhostuser ports are considered deprecated; "
1340 "please migrate to dpdkvhostuserclient ports.");
1341 return err;
1342 }
1343
1344 static int
1345 netdev_dpdk_vhost_client_construct(struct netdev *netdev)
1346 {
1347 int err;
1348
1349 ovs_mutex_lock(&dpdk_mutex);
1350 err = vhost_common_construct(netdev);
1351 if (err) {
1352 VLOG_ERR("vhost_common_construct failed for vhost user client"
1353 "port: %s\n", netdev->name);
1354 }
1355 ovs_mutex_unlock(&dpdk_mutex);
1356 return err;
1357 }
1358
1359 static int
1360 netdev_dpdk_construct(struct netdev *netdev)
1361 {
1362 int err;
1363
1364 ovs_mutex_lock(&dpdk_mutex);
1365 err = common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
1366 DPDK_DEV_ETH, SOCKET0);
1367 ovs_mutex_unlock(&dpdk_mutex);
1368 return err;
1369 }
1370
1371 static void
1372 common_destruct(struct netdev_dpdk *dev)
1373 OVS_REQUIRES(dpdk_mutex)
1374 OVS_EXCLUDED(dev->mutex)
1375 {
1376 rte_free(dev->tx_q);
1377 dpdk_mp_put(dev->dpdk_mp);
1378
1379 ovs_list_remove(&dev->list_node);
1380 free(ovsrcu_get_protected(struct ingress_policer *,
1381 &dev->ingress_policer));
1382 free(dev->sw_stats);
1383 ovs_mutex_destroy(&dev->mutex);
1384 }
1385
1386 static void
1387 netdev_dpdk_destruct(struct netdev *netdev)
1388 {
1389 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1390 struct rte_device *rte_dev;
1391 struct rte_eth_dev *eth_dev;
1392 bool remove_on_close;
1393
1394 ovs_mutex_lock(&dpdk_mutex);
1395
1396 rte_eth_dev_stop(dev->port_id);
1397 dev->started = false;
1398
1399 if (dev->attached) {
1400 /* Retrieve eth device data before closing it.
1401 * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
1402 */
1403 eth_dev = &rte_eth_devices[dev->port_id];
1404 remove_on_close =
1405 eth_dev->data &&
1406 (eth_dev->data->dev_flags & RTE_ETH_DEV_CLOSE_REMOVE);
1407 rte_dev = eth_dev->device;
1408
1409 /* Remove the eth device. */
1410 rte_eth_dev_close(dev->port_id);
1411
1412 /* Remove this rte device and all its eth devices if flag
1413 * RTE_ETH_DEV_CLOSE_REMOVE is not supported (which means representors
1414 * are not supported), or if all the eth devices belonging to the rte
1415 * device are closed.
1416 */
1417 if (!remove_on_close || !netdev_dpdk_get_num_ports(rte_dev)) {
1418 int ret = rte_dev_remove(rte_dev);
1419
1420 if (ret < 0) {
1421 VLOG_ERR("Device '%s' can not be detached: %s.",
1422 dev->devargs, rte_strerror(-ret));
1423 } else {
1424 /* Device was closed and detached. */
1425 VLOG_INFO("Device '%s' has been removed and detached",
1426 dev->devargs);
1427 }
1428 } else {
1429 /* Device was only closed. rte_dev_remove() was not called. */
1430 VLOG_INFO("Device '%s' has been removed", dev->devargs);
1431 }
1432 }
1433
1434 netdev_dpdk_clear_xstats(dev);
1435 free(dev->devargs);
1436 common_destruct(dev);
1437
1438 ovs_mutex_unlock(&dpdk_mutex);
1439 }
1440
1441 /* rte_vhost_driver_unregister() can call back destroy_device(), which will
1442 * try to acquire 'dpdk_mutex' and possibly 'dev->mutex'. To avoid a
1443 * deadlock, none of the mutexes must be held while calling this function. */
1444 static int
1445 dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
1446 char *vhost_id)
1447 OVS_EXCLUDED(dpdk_mutex)
1448 OVS_EXCLUDED(dev->mutex)
1449 {
1450 return rte_vhost_driver_unregister(vhost_id);
1451 }
1452
1453 static void
1454 netdev_dpdk_vhost_destruct(struct netdev *netdev)
1455 {
1456 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1457 char *vhost_id;
1458
1459 ovs_mutex_lock(&dpdk_mutex);
1460
1461 /* Guest becomes an orphan if still attached. */
1462 if (netdev_dpdk_get_vid(dev) >= 0
1463 && !(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
1464 VLOG_ERR("Removing port '%s' while vhost device still attached.",
1465 netdev->name);
1466 VLOG_ERR("To restore connectivity after re-adding of port, VM on "
1467 "socket '%s' must be restarted.", dev->vhost_id);
1468 }
1469
1470 vhost_id = dev->vhost_id;
1471 dev->vhost_id = NULL;
1472 rte_free(dev->vhost_rxq_enabled);
1473
1474 common_destruct(dev);
1475
1476 ovs_mutex_unlock(&dpdk_mutex);
1477
1478 if (!vhost_id) {
1479 goto out;
1480 }
1481
1482 if (dpdk_vhost_driver_unregister(dev, vhost_id)) {
1483 VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
1484 netdev->name, vhost_id);
1485 } else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
1486 /* OVS server mode - remove this socket from list for deletion */
1487 fatal_signal_remove_file_to_unlink(vhost_id);
1488 }
1489 out:
1490 free(vhost_id);
1491 }
1492
1493 static void
1494 netdev_dpdk_dealloc(struct netdev *netdev)
1495 {
1496 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1497
1498 rte_free(dev);
1499 }
1500
1501 static void
1502 netdev_dpdk_clear_xstats(struct netdev_dpdk *dev)
1503 {
1504 /* If statistics are already allocated, we have to
1505 * reconfigure, as port_id could have been changed. */
1506 if (dev->rte_xstats_names) {
1507 free(dev->rte_xstats_names);
1508 dev->rte_xstats_names = NULL;
1509 dev->rte_xstats_names_size = 0;
1510 }
1511 if (dev->rte_xstats_ids) {
1512 free(dev->rte_xstats_ids);
1513 dev->rte_xstats_ids = NULL;
1514 dev->rte_xstats_ids_size = 0;
1515 }
1516 }
1517
1518 static const char*
1519 netdev_dpdk_get_xstat_name(struct netdev_dpdk *dev, uint64_t id)
1520 {
1521 if (id >= dev->rte_xstats_names_size) {
1522 return "UNKNOWN";
1523 }
1524 return dev->rte_xstats_names[id].name;
1525 }
1526
1527 static bool
1528 netdev_dpdk_configure_xstats(struct netdev_dpdk *dev)
1529 OVS_REQUIRES(dev->mutex)
1530 {
1531 int rte_xstats_len;
1532 bool ret;
1533 struct rte_eth_xstat *rte_xstats;
1534 uint64_t id;
1535 int xstats_no;
1536 const char *name;
1537
1538     /* Retrieve all XSTATS names. If something goes wrong
1539      * or the number of counters is 0, the rte_xstats_names
1540      * buffer will be left NULL, and any further xstats
1541      * query won't be performed (e.g. during netdev_dpdk_get_stats
1542      * execution). */
1543
1544 ret = false;
1545 rte_xstats = NULL;
1546
1547 if (dev->rte_xstats_names == NULL || dev->rte_xstats_ids == NULL) {
1548 dev->rte_xstats_names_size =
1549 rte_eth_xstats_get_names(dev->port_id, NULL, 0);
1550
1551 if (dev->rte_xstats_names_size < 0) {
1552 VLOG_WARN("Cannot get XSTATS for port: "DPDK_PORT_ID_FMT,
1553 dev->port_id);
1554 dev->rte_xstats_names_size = 0;
1555 } else {
1556 /* Reserve memory for xstats names and values */
1557 dev->rte_xstats_names = xcalloc(dev->rte_xstats_names_size,
1558 sizeof *dev->rte_xstats_names);
1559
1560 if (dev->rte_xstats_names) {
1561             /* Retrieve xstats names. */
1562 rte_xstats_len =
1563 rte_eth_xstats_get_names(dev->port_id,
1564 dev->rte_xstats_names,
1565 dev->rte_xstats_names_size);
1566
1567 if (rte_xstats_len < 0) {
1568 VLOG_WARN("Cannot get XSTATS names for port: "
1569 DPDK_PORT_ID_FMT, dev->port_id);
1570 goto out;
1571 } else if (rte_xstats_len != dev->rte_xstats_names_size) {
1572 VLOG_WARN("XSTATS size doesn't match for port: "
1573 DPDK_PORT_ID_FMT, dev->port_id);
1574 goto out;
1575 }
1576
1577 dev->rte_xstats_ids = xcalloc(dev->rte_xstats_names_size,
1578 sizeof(uint64_t));
1579
1580                 /* We have to calculate the number of counters. */
1581 rte_xstats = xmalloc(rte_xstats_len * sizeof *rte_xstats);
1582 memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);
1583
1584                 /* Retrieve xstats values. */
1585 if (rte_eth_xstats_get(dev->port_id, rte_xstats,
1586 rte_xstats_len) > 0) {
1587 dev->rte_xstats_ids_size = 0;
1588 xstats_no = 0;
1589 for (uint32_t i = 0; i < rte_xstats_len; i++) {
1590 id = rte_xstats[i].id;
1591 name = netdev_dpdk_get_xstat_name(dev, id);
1592 /* We need to filter out everything except
1593 * dropped, error and management counters */
1594 if (string_ends_with(name, "_errors") ||
1595 strstr(name, "_management_") ||
1596 string_ends_with(name, "_dropped")) {
1597
1598 dev->rte_xstats_ids[xstats_no] = id;
1599 xstats_no++;
1600 }
1601 }
1602 dev->rte_xstats_ids_size = xstats_no;
1603 ret = true;
1604 } else {
1605 VLOG_WARN("Can't get XSTATS IDs for port: "
1606 DPDK_PORT_ID_FMT, dev->port_id);
1607 }
1608
1609 free(rte_xstats);
1610 }
1611 }
1612 } else {
1613 /* Already configured */
1614 ret = true;
1615 }
1616
1617 out:
1618 if (!ret) {
1619 netdev_dpdk_clear_xstats(dev);
1620 }
1621 return ret;
1622 }
1623
1624 static int
1625 netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
1626 {
1627 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1628
1629 ovs_mutex_lock(&dev->mutex);
1630
1631 smap_add_format(args, "requested_rx_queues", "%d", dev->requested_n_rxq);
1632 smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq);
1633 smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq);
1634 smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq);
1635 smap_add_format(args, "mtu", "%d", dev->mtu);
1636
1637 if (dev->type == DPDK_DEV_ETH) {
1638 smap_add_format(args, "requested_rxq_descriptors", "%d",
1639 dev->requested_rxq_size);
1640 smap_add_format(args, "configured_rxq_descriptors", "%d",
1641 dev->rxq_size);
1642 smap_add_format(args, "requested_txq_descriptors", "%d",
1643 dev->requested_txq_size);
1644 smap_add_format(args, "configured_txq_descriptors", "%d",
1645 dev->txq_size);
1646 if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
1647 smap_add(args, "rx_csum_offload", "true");
1648 } else {
1649 smap_add(args, "rx_csum_offload", "false");
1650 }
1651 smap_add(args, "lsc_interrupt_mode",
1652 dev->lsc_interrupt_mode ? "true" : "false");
1653 }
1654 ovs_mutex_unlock(&dev->mutex);
1655
1656 return 0;
1657 }
1658
1659 static struct netdev_dpdk *
1660 netdev_dpdk_lookup_by_port_id(dpdk_port_t port_id)
1661 OVS_REQUIRES(dpdk_mutex)
1662 {
1663 struct netdev_dpdk *dev;
1664
1665 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
1666 if (dev->port_id == port_id) {
1667 return dev;
1668 }
1669 }
1670
1671 return NULL;
1672 }
1673
1674 static dpdk_port_t
1675 netdev_dpdk_get_port_by_mac(const char *mac_str)
1676 {
1677 dpdk_port_t port_id;
1678 struct eth_addr mac, port_mac;
1679
1680 if (!eth_addr_from_string(mac_str, &mac)) {
1681 VLOG_ERR("invalid mac: %s", mac_str);
1682 return DPDK_ETH_PORT_ID_INVALID;
1683 }
1684
1685 RTE_ETH_FOREACH_DEV (port_id) {
1686 struct ether_addr ea;
1687
1688 rte_eth_macaddr_get(port_id, &ea);
1689 memcpy(port_mac.ea, ea.addr_bytes, ETH_ADDR_LEN);
1690 if (eth_addr_equals(mac, port_mac)) {
1691 return port_id;
1692 }
1693 }
1694
1695 return DPDK_ETH_PORT_ID_INVALID;
1696 }
1697
1698 /* Return the first DPDK port id matching the devargs pattern. */
1699 static dpdk_port_t netdev_dpdk_get_port_by_devargs(const char *devargs)
1700 OVS_REQUIRES(dpdk_mutex)
1701 {
1702 dpdk_port_t port_id;
1703 struct rte_dev_iterator iterator;
1704
1705 RTE_ETH_FOREACH_MATCHING_DEV (port_id, devargs, &iterator) {
1706         /* If we break out of the loop, rte_eth_iterator_cleanup() must be called. */
1707 rte_eth_iterator_cleanup(&iterator);
1708 break;
1709 }
1710
1711 return port_id;
1712 }
1713
1714 /*
1715 * Normally, a PCI id (optionally followed by a representor number)
1716 * is enough for identifying a specific DPDK port.
1717  * However, for some NICs with multiple ports sharing the same PCI
1718  * id, the PCI id alone is not enough to identify a port.
1719  *
1720  * To fix that, one more method is introduced here: "class=eth,mac=$MAC".
1721  *
1722  * Note that compatibility is fully preserved: users can still add
1723  * ports by PCI id when that is sufficient for them.
1724 */
1725 static dpdk_port_t
1726 netdev_dpdk_process_devargs(struct netdev_dpdk *dev,
1727 const char *devargs, char **errp)
1728 OVS_REQUIRES(dpdk_mutex)
1729 {
1730 dpdk_port_t new_port_id;
1731
1732 if (strncmp(devargs, "class=eth,mac=", 14) == 0) {
1733 new_port_id = netdev_dpdk_get_port_by_mac(&devargs[14]);
1734 } else {
1735 new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
1736 if (!rte_eth_dev_is_valid_port(new_port_id)) {
1737 /* Device not found in DPDK, attempt to attach it */
1738 if (rte_dev_probe(devargs)) {
1739 new_port_id = DPDK_ETH_PORT_ID_INVALID;
1740 } else {
1741 new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
1742 if (rte_eth_dev_is_valid_port(new_port_id)) {
1743 /* Attach successful */
1744 dev->attached = true;
1745 VLOG_INFO("Device '%s' attached to DPDK", devargs);
1746 } else {
1747 /* Attach unsuccessful */
1748 new_port_id = DPDK_ETH_PORT_ID_INVALID;
1749 }
1750 }
1751 }
1752 }
1753
1754 if (new_port_id == DPDK_ETH_PORT_ID_INVALID) {
1755 VLOG_WARN_BUF(errp, "Error attaching device '%s' to DPDK", devargs);
1756 }
1757
1758 return new_port_id;
1759 }
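/* Illustrative devargs as supplied via options:dpdk-devargs:
 *
 *   "0000:05:00.0"                      - identify the port by PCI id
 *   "class=eth,mac=00:11:22:33:44:55"   - identify the port by MAC address
 */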
1760
1761 static void
1762 dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args)
1763 OVS_REQUIRES(dev->mutex)
1764 {
1765 int new_n_rxq;
1766
1767 new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
1768 if (new_n_rxq != dev->requested_n_rxq) {
1769 dev->requested_n_rxq = new_n_rxq;
1770 netdev_request_reconfigure(&dev->up);
1771 }
1772 }
1773
1774 static void
1775 dpdk_process_queue_size(struct netdev *netdev, const struct smap *args,
1776 const char *flag, int default_size, int *new_size)
1777 {
1778 int queue_size = smap_get_int(args, flag, default_size);
1779
1780 if (queue_size <= 0 || queue_size > NIC_PORT_MAX_Q_SIZE
1781 || !is_pow2(queue_size)) {
1782 queue_size = default_size;
1783 }
1784
1785 if (queue_size != *new_size) {
1786 *new_size = queue_size;
1787 netdev_request_reconfigure(netdev);
1788 }
1789 }
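/* For example, with the defaults above: a request of n_rxq_desc=1000 is
 * rejected (not a power of two) and silently falls back to
 * NIC_PORT_DEFAULT_RXQ_SIZE (2048), while n_rxq_desc=512 or 4096 is accepted;
 * anything above NIC_PORT_MAX_Q_SIZE (4096) also falls back to the default. */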
1790
1791 static int
1792 netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
1793 char **errp)
1794 {
1795 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1796 bool rx_fc_en, tx_fc_en, autoneg, lsc_interrupt_mode;
1797 bool flow_control_requested = true;
1798 enum rte_eth_fc_mode fc_mode;
1799 static const enum rte_eth_fc_mode fc_mode_set[2][2] = {
1800 {RTE_FC_NONE, RTE_FC_TX_PAUSE},
1801 {RTE_FC_RX_PAUSE, RTE_FC_FULL }
1802 };
1803 const char *new_devargs;
1804 int err = 0;
1805
1806 ovs_mutex_lock(&dpdk_mutex);
1807 ovs_mutex_lock(&dev->mutex);
1808
1809 dpdk_set_rxq_config(dev, args);
1810
1811 dpdk_process_queue_size(netdev, args, "n_rxq_desc",
1812 NIC_PORT_DEFAULT_RXQ_SIZE,
1813 &dev->requested_rxq_size);
1814 dpdk_process_queue_size(netdev, args, "n_txq_desc",
1815 NIC_PORT_DEFAULT_TXQ_SIZE,
1816 &dev->requested_txq_size);
1817
1818 new_devargs = smap_get(args, "dpdk-devargs");
1819
1820 if (dev->devargs && new_devargs && strcmp(new_devargs, dev->devargs)) {
1821 /* The user requested a new device. If we return error, the caller
1822 * will delete this netdev and try to recreate it. */
1823 err = EAGAIN;
1824 goto out;
1825 }
1826
1827 /* dpdk-devargs is required for device configuration */
1828 if (new_devargs && new_devargs[0]) {
1829 /* Don't process dpdk-devargs if value is unchanged and port id
1830 * is valid */
1831 if (!(dev->devargs && !strcmp(dev->devargs, new_devargs)
1832 && rte_eth_dev_is_valid_port(dev->port_id))) {
1833 dpdk_port_t new_port_id = netdev_dpdk_process_devargs(dev,
1834 new_devargs,
1835 errp);
1836 if (!rte_eth_dev_is_valid_port(new_port_id)) {
1837 err = EINVAL;
1838 } else if (new_port_id == dev->port_id) {
1839 /* Already configured, do not reconfigure again */
1840 err = 0;
1841 } else {
1842 struct netdev_dpdk *dup_dev;
1843
1844 dup_dev = netdev_dpdk_lookup_by_port_id(new_port_id);
1845 if (dup_dev) {
1846 VLOG_WARN_BUF(errp, "'%s' is trying to use device '%s' "
1847 "which is already in use by '%s'",
1848 netdev_get_name(netdev), new_devargs,
1849 netdev_get_name(&dup_dev->up));
1850 err = EADDRINUSE;
1851 } else {
1852 int sid = rte_eth_dev_socket_id(new_port_id);
1853
1854 dev->requested_socket_id = sid < 0 ? SOCKET0 : sid;
1855 dev->devargs = xstrdup(new_devargs);
1856 dev->port_id = new_port_id;
1857 netdev_request_reconfigure(&dev->up);
1858 netdev_dpdk_clear_xstats(dev);
1859 err = 0;
1860 }
1861 }
1862 }
1863 } else {
1864 VLOG_WARN_BUF(errp, "'%s' is missing 'options:dpdk-devargs'. "
1865 "The old 'dpdk<port_id>' names are not supported",
1866 netdev_get_name(netdev));
1867 err = EINVAL;
1868 }
1869
1870 if (err) {
1871 goto out;
1872 }
1873
1874 lsc_interrupt_mode = smap_get_bool(args, "dpdk-lsc-interrupt", false);
1875 if (dev->requested_lsc_interrupt_mode != lsc_interrupt_mode) {
1876 dev->requested_lsc_interrupt_mode = lsc_interrupt_mode;
1877 netdev_request_reconfigure(netdev);
1878 }
1879
1880 rx_fc_en = smap_get_bool(args, "rx-flow-ctrl", false);
1881 tx_fc_en = smap_get_bool(args, "tx-flow-ctrl", false);
1882 autoneg = smap_get_bool(args, "flow-ctrl-autoneg", false);
1883
1884 fc_mode = fc_mode_set[tx_fc_en][rx_fc_en];
1885
1886 if (!smap_get(args, "rx-flow-ctrl") && !smap_get(args, "tx-flow-ctrl")
1887 && !smap_get(args, "flow-ctrl-autoneg")) {
1888 /* FIXME: User didn't ask for flow control configuration.
1889 * For now we'll not print a warning if flow control is not
1890 * supported by the DPDK port. */
1891 flow_control_requested = false;
1892 }
1893
1894 /* Get the Flow control configuration. */
1895 err = -rte_eth_dev_flow_ctrl_get(dev->port_id, &dev->fc_conf);
1896 if (err) {
1897 if (err == ENOTSUP) {
1898 if (flow_control_requested) {
1899 VLOG_WARN("%s: Flow control is not supported.",
1900 netdev_get_name(netdev));
1901 }
1902 err = 0; /* Not fatal. */
1903 } else {
1904 VLOG_WARN("%s: Cannot get flow control parameters: %s",
1905 netdev_get_name(netdev), rte_strerror(err));
1906 }
1907 goto out;
1908 }
1909
1910 if (dev->fc_conf.mode != fc_mode || autoneg != dev->fc_conf.autoneg) {
1911 dev->fc_conf.mode = fc_mode;
1912 dev->fc_conf.autoneg = autoneg;
1913 dpdk_eth_flow_ctrl_setup(dev);
1914 }
1915
1916 out:
1917 ovs_mutex_unlock(&dev->mutex);
1918 ovs_mutex_unlock(&dpdk_mutex);
1919
1920 return err;
1921 }
1922
1923 static int
1924 netdev_dpdk_ring_set_config(struct netdev *netdev, const struct smap *args,
1925 char **errp OVS_UNUSED)
1926 {
1927 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1928
1929 ovs_mutex_lock(&dev->mutex);
1930 dpdk_set_rxq_config(dev, args);
1931 ovs_mutex_unlock(&dev->mutex);
1932
1933 return 0;
1934 }
1935
1936 static int
1937 netdev_dpdk_vhost_client_set_config(struct netdev *netdev,
1938 const struct smap *args,
1939 char **errp OVS_UNUSED)
1940 {
1941 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1942 const char *path;
1943 int max_tx_retries, cur_max_tx_retries;
1944
1945 ovs_mutex_lock(&dev->mutex);
1946 if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
1947 path = smap_get(args, "vhost-server-path");
1948 if (!nullable_string_is_equal(path, dev->vhost_id)) {
1949 free(dev->vhost_id);
1950 dev->vhost_id = nullable_xstrdup(path);
1951 /* check zero copy configuration */
1952 if (smap_get_bool(args, "dq-zero-copy", false)) {
1953 dev->vhost_driver_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1954 } else {
1955 dev->vhost_driver_flags &= ~RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1956 }
1957 netdev_request_reconfigure(netdev);
1958 }
1959 }
1960
1961 max_tx_retries = smap_get_int(args, "tx-retries-max",
1962 VHOST_ENQ_RETRY_DEF);
1963 if (max_tx_retries < VHOST_ENQ_RETRY_MIN
1964 || max_tx_retries > VHOST_ENQ_RETRY_MAX) {
1965 max_tx_retries = VHOST_ENQ_RETRY_DEF;
1966 }
1967 atomic_read_relaxed(&dev->vhost_tx_retries_max, &cur_max_tx_retries);
1968 if (max_tx_retries != cur_max_tx_retries) {
1969 atomic_store_relaxed(&dev->vhost_tx_retries_max, max_tx_retries);
1970 VLOG_INFO("Max Tx retries for vhost device '%s' set to %d",
1971 netdev_get_name(netdev), max_tx_retries);
1972 }
1973 ovs_mutex_unlock(&dev->mutex);
1974
1975 return 0;
1976 }
1977
1978 static int
1979 netdev_dpdk_get_numa_id(const struct netdev *netdev)
1980 {
1981 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1982
1983 return dev->socket_id;
1984 }
1985
1986 /* Sets the number of tx queues for the dpdk interface. */
1987 static int
1988 netdev_dpdk_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
1989 {
1990 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1991
1992 ovs_mutex_lock(&dev->mutex);
1993
1994 if (dev->requested_n_txq == n_txq) {
1995 goto out;
1996 }
1997
1998 dev->requested_n_txq = n_txq;
1999 netdev_request_reconfigure(netdev);
2000
2001 out:
2002 ovs_mutex_unlock(&dev->mutex);
2003 return 0;
2004 }
2005
2006 static struct netdev_rxq *
2007 netdev_dpdk_rxq_alloc(void)
2008 {
2009 struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);
2010
2011 if (rx) {
2012 return &rx->up;
2013 }
2014
2015 return NULL;
2016 }
2017
2018 static struct netdev_rxq_dpdk *
2019 netdev_rxq_dpdk_cast(const struct netdev_rxq *rxq)
2020 {
2021 return CONTAINER_OF(rxq, struct netdev_rxq_dpdk, up);
2022 }
2023
2024 static int
2025 netdev_dpdk_rxq_construct(struct netdev_rxq *rxq)
2026 {
2027 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
2028 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
2029
2030 ovs_mutex_lock(&dev->mutex);
2031 rx->port_id = dev->port_id;
2032 ovs_mutex_unlock(&dev->mutex);
2033
2034 return 0;
2035 }
2036
2037 static void
2038 netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq OVS_UNUSED)
2039 {
2040 }
2041
2042 static void
2043 netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
2044 {
2045 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
2046
2047 rte_free(rx);
2048 }
2049
2050 /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes ownership of
2051 * 'pkts', even in case of failure.
2052 *
2053 * Returns the number of packets that weren't transmitted. */
2054 static inline int
2055 netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
2056 struct rte_mbuf **pkts, int cnt)
2057 {
2058 uint32_t nb_tx = 0;
2059
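/* rte_eth_tx_burst() may accept only part of the burst; retry from the first
 * unsent mbuf and give up once the driver accepts nothing (e.g. because the
 * Tx ring stays full). */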
2060 while (nb_tx != cnt) {
2061 uint32_t ret;
2062
2063 ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx, cnt - nb_tx);
2064 if (!ret) {
2065 break;
2066 }
2067
2068 nb_tx += ret;
2069 }
2070
2071 if (OVS_UNLIKELY(nb_tx != cnt)) {
2072 /* Free the buffers that we couldn't transmit, one at a time (each
2073 * packet could come from a different mempool). */
2074 int i;
2075
2076 for (i = nb_tx; i < cnt; i++) {
2077 rte_pktmbuf_free(pkts[i]);
2078 }
2079 }
2080
2081 return cnt - nb_tx;
2082 }
2083
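/* Returns true if 'pkt' is marked green by the color-blind srTCM check,
 * i.e. it conforms to the configured rate.  The Ethernet header is excluded
 * from the metered length. */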
2084 static inline bool
2085 netdev_dpdk_policer_pkt_handle(struct rte_meter_srtcm *meter,
2086 struct rte_meter_srtcm_profile *profile,
2087 struct rte_mbuf *pkt, uint64_t time)
2088 {
2089 uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct ether_hdr);
2090
2091 return rte_meter_srtcm_color_blind_check(meter, profile, time, pkt_len) ==
2092 e_RTE_METER_GREEN;
2093 }
2094
2095 static int
2096 netdev_dpdk_policer_run(struct rte_meter_srtcm *meter,
2097 struct rte_meter_srtcm_profile *profile,
2098 struct rte_mbuf **pkts, int pkt_cnt,
2099 bool should_steal)
2100 {
2101 int i = 0;
2102 int cnt = 0;
2103 struct rte_mbuf *pkt = NULL;
2104 uint64_t current_time = rte_rdtsc();
2105
2106 for (i = 0; i < pkt_cnt; i++) {
2107 pkt = pkts[i];
2108 /* Handle current packet */
2109 if (netdev_dpdk_policer_pkt_handle(meter, profile,
2110 pkt, current_time)) {
2111 if (cnt != i) {
2112 pkts[cnt] = pkt;
2113 }
2114 cnt++;
2115 } else {
2116 if (should_steal) {
2117 rte_pktmbuf_free(pkt);
2118 }
2119 }
2120 }
2121
2122 return cnt;
2123 }
2124
2125 static int
2126 ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,
2127 int pkt_cnt, bool should_steal)
2128 {
2129 int cnt = 0;
2130
2131 rte_spinlock_lock(&policer->policer_lock);
2132 cnt = netdev_dpdk_policer_run(&policer->in_policer, &policer->in_prof,
2133 pkts, pkt_cnt, should_steal);
2134 rte_spinlock_unlock(&policer->policer_lock);
2135
2136 return cnt;
2137 }
2138
2139 static bool
2140 is_vhost_running(struct netdev_dpdk *dev)
2141 {
2142 return (netdev_dpdk_get_vid(dev) >= 0 && dev->vhost_reconfigured);
2143 }
2144
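/* Increments the rx size-histogram counter matching 'packet_size'.  Buckets:
 * 1-64, 65-127, 128-255, 256-511, 512-1023, 1024-1522 and 1523-max bytes. */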
2145 static inline void
2146 netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
2147 unsigned int packet_size)
2148 {
2149 /* Hard-coded search for the size bucket. */
2150 if (packet_size < 256) {
2151 if (packet_size >= 128) {
2152 stats->rx_128_to_255_packets++;
2153 } else if (packet_size <= 64) {
2154 stats->rx_1_to_64_packets++;
2155 } else {
2156 stats->rx_65_to_127_packets++;
2157 }
2158 } else {
2159 if (packet_size >= 1523) {
2160 stats->rx_1523_to_max_packets++;
2161 } else if (packet_size >= 1024) {
2162 stats->rx_1024_to_1522_packets++;
2163 } else if (packet_size < 512) {
2164 stats->rx_256_to_511_packets++;
2165 } else {
2166 stats->rx_512_to_1023_packets++;
2167 }
2168 }
2169 }
2170
2171 static inline void
2172 netdev_dpdk_vhost_update_rx_counters(struct netdev_dpdk *dev,
2173 struct dp_packet **packets, int count,
2174 int qos_drops)
2175 {
2176 struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
2177 struct netdev_stats *stats = &dev->stats;
2178 struct dp_packet *packet;
2179 unsigned int packet_size;
2180 int i;
2181
2182 stats->rx_packets += count;
2183 stats->rx_dropped += qos_drops;
2184 for (i = 0; i < count; i++) {
2185 packet = packets[i];
2186 packet_size = dp_packet_size(packet);
2187
2188 if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) {
2189 /* This only protects the following multicast counting from
2190 * too short packets, but it does not stop the packet from
2191 * further processing. */
2192 stats->rx_errors++;
2193 stats->rx_length_errors++;
2194 continue;
2195 }
2196
2197 netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);
2198
2199 struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);
2200 if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {
2201 stats->multicast++;
2202 }
2203
2204 stats->rx_bytes += packet_size;
2205 }
2206
2207 sw_stats->rx_qos_drops += qos_drops;
2208 }
2209
2210 /*
2211 * The receive path for the vhost port is the TX path out from guest.
2212 */
2213 static int
2214 netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
2215 struct dp_packet_batch *batch, int *qfill)
2216 {
2217 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
2218 struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
2219 uint16_t nb_rx = 0;
2220 uint16_t qos_drops = 0;
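/* Virtio vrings come in pairs per queue: vring index =
 * queue_id * VIRTIO_QNUM + VIRTIO_RXQ/VIRTIO_TXQ.  The host receives by
 * dequeuing from the guest's TX vring, hence VIRTIO_TXQ here. */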
2221 int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
2222 int vid = netdev_dpdk_get_vid(dev);
2223
2224 if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
2225 || !(dev->flags & NETDEV_UP))) {
2226 return EAGAIN;
2227 }
2228
2229 nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp,
2230 (struct rte_mbuf **) batch->packets,
2231 NETDEV_MAX_BURST);
2232 if (!nb_rx) {
2233 return EAGAIN;
2234 }
2235
2236 if (qfill) {
2237 if (nb_rx == NETDEV_MAX_BURST) {
2238 /* The DPDK API returns a uint32_t which often has invalid bits in
2239 * the upper 16-bits. Need to restrict the value to uint16_t. */
2240 *qfill = rte_vhost_rx_queue_count(vid, qid) & UINT16_MAX;
2241 } else {
2242 *qfill = 0;
2243 }
2244 }
2245
2246 if (policer) {
2247 qos_drops = nb_rx;
2248 nb_rx = ingress_policer_run(policer,
2249 (struct rte_mbuf **) batch->packets,
2250 nb_rx, true);
2251 qos_drops -= nb_rx;
2252 }
2253
2254 rte_spinlock_lock(&dev->stats_lock);
2255 netdev_dpdk_vhost_update_rx_counters(dev, batch->packets,
2256 nb_rx, qos_drops);
2257 rte_spinlock_unlock(&dev->stats_lock);
2258
2259 batch->count = nb_rx;
2260 dp_packet_batch_init_packet_fields(batch);
2261
2262 return 0;
2263 }
2264
2265 static bool
2266 netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq)
2267 {
2268 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
2269
2270 return dev->vhost_rxq_enabled[rxq->queue_id];
2271 }
2272
2273 static int
2274 netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
2275 int *qfill)
2276 {
2277 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
2278 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
2279 struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
2280 int nb_rx;
2281 int dropped = 0;
2282
2283 if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
2284 return EAGAIN;
2285 }
2286
2287 nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
2288 (struct rte_mbuf **) batch->packets,
2289 NETDEV_MAX_BURST);
2290 if (!nb_rx) {
2291 return EAGAIN;
2292 }
2293
2294 if (policer) {
2295 dropped = nb_rx;
2296 nb_rx = ingress_policer_run(policer,
2297 (struct rte_mbuf **) batch->packets,
2298 nb_rx, true);
2299 dropped -= nb_rx;
2300 }
2301
2302 /* Update stats to reflect dropped packets */
2303 if (OVS_UNLIKELY(dropped)) {
2304 rte_spinlock_lock(&dev->stats_lock);
2305 dev->stats.rx_dropped += dropped;
2306 dev->sw_stats->rx_qos_drops += dropped;
2307 rte_spinlock_unlock(&dev->stats_lock);
2308 }
2309
2310 batch->count = nb_rx;
2311 dp_packet_batch_init_packet_fields(batch);
2312
2313 if (qfill) {
2314 if (nb_rx == NETDEV_MAX_BURST) {
2315 *qfill = rte_eth_rx_queue_count(rx->port_id, rxq->queue_id);
2316 } else {
2317 *qfill = 0;
2318 }
2319 }
2320
2321 return 0;
2322 }
2323
2324 static inline int
2325 netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
2326 int cnt, bool should_steal)
2327 {
2328 struct qos_conf *qos_conf = ovsrcu_get(struct qos_conf *, &dev->qos_conf);
2329
2330 if (qos_conf) {
2331 rte_spinlock_lock(&qos_conf->lock);
2332 cnt = qos_conf->ops->qos_run(qos_conf, pkts, cnt, should_steal);
2333 rte_spinlock_unlock(&qos_conf->lock);
2334 }
2335
2336 return cnt;
2337 }
2338
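/* Frees any mbuf whose length exceeds the device's max_packet_len and
 * compacts the surviving mbufs to the front of 'pkts'.  Returns the number
 * of packets kept. */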
2339 static int
2340 netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
2341 int pkt_cnt)
2342 {
2343 int i = 0;
2344 int cnt = 0;
2345 struct rte_mbuf *pkt;
2346
2347 for (i = 0; i < pkt_cnt; i++) {
2348 pkt = pkts[i];
2349 if (OVS_UNLIKELY(pkt->pkt_len > dev->max_packet_len)) {
2350 VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " max_packet_len %d",
2351 dev->up.name, pkt->pkt_len, dev->max_packet_len);
2352 rte_pktmbuf_free(pkt);
2353 continue;
2354 }
2355
2356 if (OVS_UNLIKELY(i != cnt)) {
2357 pkts[cnt] = pkt;
2358 }
2359 cnt++;
2360 }
2361
2362 return cnt;
2363 }
2364
2365 static inline void
2366 netdev_dpdk_vhost_update_tx_counters(struct netdev_dpdk *dev,
2367 struct dp_packet **packets,
2368 int attempted,
2369 struct netdev_dpdk_sw_stats *sw_stats_add)
2370 {
2371 struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
2372 int dropped = sw_stats_add->tx_mtu_exceeded_drops +
2373 sw_stats_add->tx_qos_drops +
2374 sw_stats_add->tx_failure_drops;
2375 struct netdev_stats *stats = &dev->stats;
2376 int sent = attempted - dropped;
2377 int i;
2378
2379 stats->tx_packets += sent;
2380 stats->tx_dropped += dropped;
2381
2382 for (i = 0; i < sent; i++) {
2383 stats->tx_bytes += dp_packet_size(packets[i]);
2384 }
2385
2386 sw_stats->tx_retries += sw_stats_add->tx_retries;
2387 sw_stats->tx_failure_drops += sw_stats_add->tx_failure_drops;
2388 sw_stats->tx_mtu_exceeded_drops += sw_stats_add->tx_mtu_exceeded_drops;
2389 sw_stats->tx_qos_drops += sw_stats_add->tx_qos_drops;
2390 }
2391
2392 static void
2393 __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
2394 struct dp_packet **pkts, int cnt)
2395 {
2396 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2397 struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
2398 struct netdev_dpdk_sw_stats sw_stats_add;
2399 unsigned int n_packets_to_free = cnt;
2400 unsigned int total_packets = cnt;
2401 int i, retries = 0;
2402 int max_retries = VHOST_ENQ_RETRY_MIN;
2403 int vid = netdev_dpdk_get_vid(dev);
2404
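/* Map the OVS tx queue id onto a queue enabled by the guest (see
 * netdev_dpdk_remap_txqs()).  A negative map value means no usable queue, in
 * which case the packets are dropped below. */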
2405 qid = dev->tx_q[qid % netdev->n_txq].map;
2406
2407 if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0
2408 || !(dev->flags & NETDEV_UP))) {
2409 rte_spinlock_lock(&dev->stats_lock);
2410 dev->stats.tx_dropped += cnt;
2411 rte_spinlock_unlock(&dev->stats_lock);
2412 goto out;
2413 }
2414
2415 if (OVS_UNLIKELY(!rte_spinlock_trylock(&dev->tx_q[qid].tx_lock))) {
2416 COVERAGE_INC(vhost_tx_contention);
2417 rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
2418 }
2419
2420 cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
2421 sw_stats_add.tx_mtu_exceeded_drops = total_packets - cnt;
2422
2423 /* Check if QoS has been configured for the netdev. */
2424 sw_stats_add.tx_qos_drops = cnt;
2425 cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt, true);
2426 sw_stats_add.tx_qos_drops -= cnt;
2427
2428 n_packets_to_free = cnt;
2429
2430 do {
2431 int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
2432 unsigned int tx_pkts;
2433
2434 tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt);
2435 if (OVS_LIKELY(tx_pkts)) {
2436 /* Packets have been sent.*/
2437 cnt -= tx_pkts;
2438 /* Prepare for possible retry.*/
2439 cur_pkts = &cur_pkts[tx_pkts];
2440 if (OVS_UNLIKELY(cnt && !retries)) {
2441 /*
2442 * Read the max retries count, since there are unsent packets
2443 * and no retries have occurred yet.
2444 */
2445 atomic_read_relaxed(&dev->vhost_tx_retries_max, &max_retries);
2446 }
2447 } else {
2448 /* No packets sent - do not retry.*/
2449 break;
2450 }
2451 } while (cnt && (retries++ < max_retries));
2452
2453 rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
2454
2455 sw_stats_add.tx_failure_drops = cnt;
2456 sw_stats_add.tx_retries = MIN(retries, max_retries);
2457
2458 rte_spinlock_lock(&dev->stats_lock);
2459 netdev_dpdk_vhost_update_tx_counters(dev, pkts, total_packets,
2460 &sw_stats_add);
2461 rte_spinlock_unlock(&dev->stats_lock);
2462
2463 out:
2464 for (i = 0; i < n_packets_to_free; i++) {
2465 dp_packet_delete(pkts[i]);
2466 }
2467 }
2468
2469 /* Tx function that copies packets into DPDK mbufs before sending them. */
2470 static void
2471 dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
2472 OVS_NO_THREAD_SAFETY_ANALYSIS
2473 {
2474 const size_t batch_cnt = dp_packet_batch_size(batch);
2475 #if !defined(__CHECKER__) && !defined(_WIN32)
2476 const size_t PKT_ARRAY_SIZE = batch_cnt;
2477 #else
2478 /* Sparse or MSVC doesn't like variable length array. */
2479 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
2480 #endif
2481 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2482 struct rte_mbuf *pkts[PKT_ARRAY_SIZE];
2483 struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
2484 uint32_t cnt = batch_cnt;
2485 uint32_t dropped = 0;
2486 uint32_t tx_failure = 0;
2487 uint32_t mtu_drops = 0;
2488 uint32_t qos_drops = 0;
2489
2490 if (dev->type != DPDK_DEV_VHOST) {
2491 /* Check if QoS has been configured for this netdev. */
2492 cnt = netdev_dpdk_qos_run(dev, (struct rte_mbuf **) batch->packets,
2493 batch_cnt, false);
2494 qos_drops = batch_cnt - cnt;
2495 }
2496
2497 uint32_t txcnt = 0;
2498
2499 for (uint32_t i = 0; i < cnt; i++) {
2500 struct dp_packet *packet = batch->packets[i];
2501 uint32_t size = dp_packet_size(packet);
2502
2503 if (OVS_UNLIKELY(size > dev->max_packet_len)) {
2504 VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d",
2505 size, dev->max_packet_len);
2506
2507 mtu_drops++;
2508 continue;
2509 }
2510
2511 pkts[txcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
2512 if (OVS_UNLIKELY(!pkts[txcnt])) {
2513 dropped = cnt - i;
2514 break;
2515 }
2516
2517 /* We have to do a copy for now */
2518 memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *),
2519 dp_packet_data(packet), size);
2520 dp_packet_set_size((struct dp_packet *)pkts[txcnt], size);
2521
2522 txcnt++;
2523 }
2524
2525 if (OVS_LIKELY(txcnt)) {
2526 if (dev->type == DPDK_DEV_VHOST) {
2527 __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) pkts,
2528 txcnt);
2529 } else {
2530 tx_failure = netdev_dpdk_eth_tx_burst(dev, qid, pkts, txcnt);
2531 }
2532 }
2533
2534 dropped += qos_drops + mtu_drops + tx_failure;
2535 if (OVS_UNLIKELY(dropped)) {
2536 rte_spinlock_lock(&dev->stats_lock);
2537 dev->stats.tx_dropped += dropped;
2538 sw_stats->tx_failure_drops += tx_failure;
2539 sw_stats->tx_mtu_exceeded_drops += mtu_drops;
2540 sw_stats->tx_qos_drops += qos_drops;
2541 rte_spinlock_unlock(&dev->stats_lock);
2542 }
2543 }
2544
2545 static int
2546 netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
2547 struct dp_packet_batch *batch,
2548 bool concurrent_txq OVS_UNUSED)
2549 {
2550
2551 if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
2552 dpdk_do_tx_copy(netdev, qid, batch);
2553 dp_packet_delete_batch(batch, true);
2554 } else {
2555 __netdev_dpdk_vhost_send(netdev, qid, batch->packets,
2556 dp_packet_batch_size(batch));
2557 }
2558 return 0;
2559 }
2560
2561 static inline void
2562 netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
2563 struct dp_packet_batch *batch,
2564 bool concurrent_txq)
2565 {
2566 if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
2567 dp_packet_delete_batch(batch, true);
2568 return;
2569 }
2570
2571 if (OVS_UNLIKELY(concurrent_txq)) {
2572 qid = qid % dev->up.n_txq;
2573 rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
2574 }
2575
2576 if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
2577 struct netdev *netdev = &dev->up;
2578
2579 dpdk_do_tx_copy(netdev, qid, batch);
2580 dp_packet_delete_batch(batch, true);
2581 } else {
2582 struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
2583 int tx_cnt, dropped;
2584 int tx_failure, mtu_drops, qos_drops;
2585 int batch_cnt = dp_packet_batch_size(batch);
2586 struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;
2587
2588 tx_cnt = netdev_dpdk_filter_packet_len(dev, pkts, batch_cnt);
2589 mtu_drops = batch_cnt - tx_cnt;
2590 qos_drops = tx_cnt;
2591 tx_cnt = netdev_dpdk_qos_run(dev, pkts, tx_cnt, true);
2592 qos_drops -= tx_cnt;
2593
2594 tx_failure = netdev_dpdk_eth_tx_burst(dev, qid, pkts, tx_cnt);
2595
2596 dropped = tx_failure + mtu_drops + qos_drops;
2597 if (OVS_UNLIKELY(dropped)) {
2598 rte_spinlock_lock(&dev->stats_lock);
2599 dev->stats.tx_dropped += dropped;
2600 sw_stats->tx_failure_drops += tx_failure;
2601 sw_stats->tx_mtu_exceeded_drops += mtu_drops;
2602 sw_stats->tx_qos_drops += qos_drops;
2603 rte_spinlock_unlock(&dev->stats_lock);
2604 }
2605 }
2606
2607 if (OVS_UNLIKELY(concurrent_txq)) {
2608 rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
2609 }
2610 }
2611
2612 static int
2613 netdev_dpdk_eth_send(struct netdev *netdev, int qid,
2614 struct dp_packet_batch *batch, bool concurrent_txq)
2615 {
2616 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2617
2618 netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
2619 return 0;
2620 }
2621
2622 static int
2623 netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
2624 {
2625 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2626
2627 ovs_mutex_lock(&dev->mutex);
2628 if (!eth_addr_equals(dev->hwaddr, mac)) {
2629 dev->hwaddr = mac;
2630 netdev_change_seq_changed(netdev);
2631 }
2632 ovs_mutex_unlock(&dev->mutex);
2633
2634 return 0;
2635 }
2636
2637 static int
2638 netdev_dpdk_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
2639 {
2640 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2641
2642 ovs_mutex_lock(&dev->mutex);
2643 *mac = dev->hwaddr;
2644 ovs_mutex_unlock(&dev->mutex);
2645
2646 return 0;
2647 }
2648
2649 static int
2650 netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
2651 {
2652 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2653
2654 ovs_mutex_lock(&dev->mutex);
2655 *mtup = dev->mtu;
2656 ovs_mutex_unlock(&dev->mutex);
2657
2658 return 0;
2659 }
2660
2661 static int
2662 netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
2663 {
2664 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2665
2666 /* XXX: Ensure that the overall frame length of the requested MTU does not
2667 * surpass the NETDEV_DPDK_MAX_PKT_LEN. DPDK device drivers differ in how
2668 * the L2 frame length is calculated for a given MTU when
2669 * rte_eth_dev_set_mtu(mtu) is called e.g. i40e driver includes 2 x vlan
2670 * headers, the em driver includes 1 x vlan header, the ixgbe driver does
2671 * not include vlan headers. As such we should use
2672 * MTU_TO_MAX_FRAME_LEN(mtu) which includes an additional 2 x vlan headers
2673 * (8 bytes) for comparison. This avoids a failure later with
2674 * rte_eth_dev_set_mtu(). This approach should be used until DPDK provides
2675 * a method to retrieve the upper bound MTU for a given device.
2676 */
2677 if (MTU_TO_MAX_FRAME_LEN(mtu) > NETDEV_DPDK_MAX_PKT_LEN
2678 || mtu < ETHER_MIN_MTU) {
2679 VLOG_WARN("%s: unsupported MTU %d\n", dev->up.name, mtu);
2680 return EINVAL;
2681 }
2682
2683 ovs_mutex_lock(&dev->mutex);
2684 if (dev->requested_mtu != mtu) {
2685 dev->requested_mtu = mtu;
2686 netdev_request_reconfigure(netdev);
2687 }
2688 ovs_mutex_unlock(&dev->mutex);
2689
2690 return 0;
2691 }
2692
2693 static int
2694 netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
2695
2696 static int
2697 netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
2698 struct netdev_stats *stats)
2699 {
2700 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2701
2702 ovs_mutex_lock(&dev->mutex);
2703
2704 rte_spinlock_lock(&dev->stats_lock);
2705 /* Supported Stats */
2706 stats->rx_packets = dev->stats.rx_packets;
2707 stats->tx_packets = dev->stats.tx_packets;
2708 stats->rx_dropped = dev->stats.rx_dropped;
2709 stats->tx_dropped = dev->stats.tx_dropped;
2710 stats->multicast = dev->stats.multicast;
2711 stats->rx_bytes = dev->stats.rx_bytes;
2712 stats->tx_bytes = dev->stats.tx_bytes;
2713 stats->rx_errors = dev->stats.rx_errors;
2714 stats->rx_length_errors = dev->stats.rx_length_errors;
2715
2716 stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
2717 stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;
2718 stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;
2719 stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;
2720 stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;
2721 stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;
2722 stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;
2723
2724 rte_spinlock_unlock(&dev->stats_lock);
2725
2726 ovs_mutex_unlock(&dev->mutex);
2727
2728 return 0;
2729 }
2730
2731 static void
2732 netdev_dpdk_convert_xstats(struct netdev_stats *stats,
2733 const struct rte_eth_xstat *xstats,
2734 const struct rte_eth_xstat_name *names,
2735 const unsigned int size)
2736 {
2737 /* DPDK XSTATS Counter names definition. */
2738 #define DPDK_XSTATS \
2739 DPDK_XSTAT(multicast, "rx_multicast_packets" ) \
2740 DPDK_XSTAT(tx_multicast_packets, "tx_multicast_packets" ) \
2741 DPDK_XSTAT(rx_broadcast_packets, "rx_broadcast_packets" ) \
2742 DPDK_XSTAT(tx_broadcast_packets, "tx_broadcast_packets" ) \
2743 DPDK_XSTAT(rx_undersized_errors, "rx_undersized_errors" ) \
2744 DPDK_XSTAT(rx_oversize_errors, "rx_oversize_errors" ) \
2745 DPDK_XSTAT(rx_fragmented_errors, "rx_fragmented_errors" ) \
2746 DPDK_XSTAT(rx_jabber_errors, "rx_jabber_errors" ) \
2747 DPDK_XSTAT(rx_1_to_64_packets, "rx_size_64_packets" ) \
2748 DPDK_XSTAT(rx_65_to_127_packets, "rx_size_65_to_127_packets" ) \
2749 DPDK_XSTAT(rx_128_to_255_packets, "rx_size_128_to_255_packets" ) \
2750 DPDK_XSTAT(rx_256_to_511_packets, "rx_size_256_to_511_packets" ) \
2751 DPDK_XSTAT(rx_512_to_1023_packets, "rx_size_512_to_1023_packets" ) \
2752 DPDK_XSTAT(rx_1024_to_1522_packets, "rx_size_1024_to_1522_packets" ) \
2753 DPDK_XSTAT(rx_1523_to_max_packets, "rx_size_1523_to_max_packets" ) \
2754 DPDK_XSTAT(tx_1_to_64_packets, "tx_size_64_packets" ) \
2755 DPDK_XSTAT(tx_65_to_127_packets, "tx_size_65_to_127_packets" ) \
2756 DPDK_XSTAT(tx_128_to_255_packets, "tx_size_128_to_255_packets" ) \
2757 DPDK_XSTAT(tx_256_to_511_packets, "tx_size_256_to_511_packets" ) \
2758 DPDK_XSTAT(tx_512_to_1023_packets, "tx_size_512_to_1023_packets" ) \
2759 DPDK_XSTAT(tx_1024_to_1522_packets, "tx_size_1024_to_1522_packets" ) \
2760 DPDK_XSTAT(tx_1523_to_max_packets, "tx_size_1523_to_max_packets" )
2761
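/* For each returned xstat, DPDK_XSTATS expands into a chain of strcmp()
 * checks that copies a recognized counter into the matching
 * 'struct netdev_stats' member. */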
2762 for (unsigned int i = 0; i < size; i++) {
2763 #define DPDK_XSTAT(MEMBER, NAME) \
2764 if (strcmp(NAME, names[i].name) == 0) { \
2765 stats->MEMBER = xstats[i].value; \
2766 continue; \
2767 }
2768 DPDK_XSTATS;
2769 #undef DPDK_XSTAT
2770 }
2771 #undef DPDK_XSTATS
2772 }
2773
2774 static int
2775 netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
2776 {
2777 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2778 struct rte_eth_stats rte_stats;
2779 bool gg;
2780
2781 netdev_dpdk_get_carrier(netdev, &gg);
2782 ovs_mutex_lock(&dev->mutex);
2783
2784 struct rte_eth_xstat *rte_xstats = NULL;
2785 struct rte_eth_xstat_name *rte_xstats_names = NULL;
2786 int rte_xstats_len, rte_xstats_new_len, rte_xstats_ret;
2787
2788 if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
2789 VLOG_ERR("Can't get ETH statistics for port: "DPDK_PORT_ID_FMT,
2790 dev->port_id);
2791 ovs_mutex_unlock(&dev->mutex);
2792 return EPROTO;
2793 }
2794
2795 /* Get length of statistics */
2796 rte_xstats_len = rte_eth_xstats_get_names(dev->port_id, NULL, 0);
2797 if (rte_xstats_len < 0) {
2798 VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
2799 dev->port_id);
2800 goto out;
2801 }
2802 /* Reserve memory for xstats names and values */
2803 rte_xstats_names = xcalloc(rte_xstats_len, sizeof *rte_xstats_names);
2804 rte_xstats = xcalloc(rte_xstats_len, sizeof *rte_xstats);
2805
2806 /* Retrieve xstats names. */
2807 rte_xstats_new_len = rte_eth_xstats_get_names(dev->port_id,
2808 rte_xstats_names,
2809 rte_xstats_len);
2810 if (rte_xstats_new_len != rte_xstats_len) {
2811 VLOG_WARN("Cannot get XSTATS names for port: "DPDK_PORT_ID_FMT,
2812 dev->port_id);
2813 goto out;
2814 }
2815 /* Retrieve xstats values. */
2816 memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);
2817 rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
2818 rte_xstats_len);
2819 if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
2820 netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_names,
2821 rte_xstats_len);
2822 } else {
2823 VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
2824 dev->port_id);
2825 }
2826
2827 out:
2828 free(rte_xstats);
2829 free(rte_xstats_names);
2830
2831 stats->rx_packets = rte_stats.ipackets;
2832 stats->tx_packets = rte_stats.opackets;
2833 stats->rx_bytes = rte_stats.ibytes;
2834 stats->tx_bytes = rte_stats.obytes;
2835 stats->rx_errors = rte_stats.ierrors;
2836 stats->tx_errors = rte_stats.oerrors;
2837
2838 rte_spinlock_lock(&dev->stats_lock);
2839 stats->tx_dropped = dev->stats.tx_dropped;
2840 stats->rx_dropped = dev->stats.rx_dropped;
2841 rte_spinlock_unlock(&dev->stats_lock);
2842
2843 /* These are the available DPDK counters for packets not received due to
2844 * local resource constraints in DPDK and NIC respectively. */
2845 stats->rx_dropped += rte_stats.rx_nombuf + rte_stats.imissed;
2846 stats->rx_missed_errors = rte_stats.imissed;
2847
2848 ovs_mutex_unlock(&dev->mutex);
2849
2850 return 0;
2851 }
2852
2853 static int
2854 netdev_dpdk_get_custom_stats(const struct netdev *netdev,
2855 struct netdev_custom_stats *custom_stats)
2856 {
2857
2858 uint32_t i;
2859 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2860 int rte_xstats_ret, sw_stats_size;
2861
2862 netdev_dpdk_get_sw_custom_stats(netdev, custom_stats);
2863
2864 ovs_mutex_lock(&dev->mutex);
2865
2866 if (netdev_dpdk_configure_xstats(dev)) {
2867 uint64_t *values = xcalloc(dev->rte_xstats_ids_size,
2868 sizeof(uint64_t));
2869
2870 rte_xstats_ret =
2871 rte_eth_xstats_get_by_id(dev->port_id, dev->rte_xstats_ids,
2872 values, dev->rte_xstats_ids_size);
2873
2874 if (rte_xstats_ret > 0 &&
2875 rte_xstats_ret <= dev->rte_xstats_ids_size) {
2876
2877 sw_stats_size = custom_stats->size;
2878 custom_stats->size += rte_xstats_ret;
2879 custom_stats->counters = xrealloc(custom_stats->counters,
2880 custom_stats->size *
2881 sizeof *custom_stats->counters);
2882
2883 for (i = 0; i < rte_xstats_ret; i++) {
2884 ovs_strlcpy(custom_stats->counters[sw_stats_size + i].name,
2885 netdev_dpdk_get_xstat_name(dev,
2886 dev->rte_xstats_ids[i]),
2887 NETDEV_CUSTOM_STATS_NAME_SIZE);
2888 custom_stats->counters[sw_stats_size + i].value = values[i];
2889 }
2890 } else {
2891 VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
2892 dev->port_id);
2893 /* Clear the statistics cache so that it will be
2894 * reconfigured. */
2895 netdev_dpdk_clear_xstats(dev);
2896 }
2897
2898 free(values);
2899 }
2900
2901 ovs_mutex_unlock(&dev->mutex);
2902
2903 return 0;
2904 }
2905
2906 static int
2907 netdev_dpdk_get_sw_custom_stats(const struct netdev *netdev,
2908 struct netdev_custom_stats *custom_stats)
2909 {
2910 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2911 int i, n;
2912
2913 #define SW_CSTATS \
2914 SW_CSTAT(tx_retries) \
2915 SW_CSTAT(tx_failure_drops) \
2916 SW_CSTAT(tx_mtu_exceeded_drops) \
2917 SW_CSTAT(tx_qos_drops) \
2918 SW_CSTAT(rx_qos_drops)
2919
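/* Defining SW_CSTAT(NAME) as "+ 1" makes SW_CSTATS expand to the number of
 * software counters, which is assigned to custom_stats->size below. */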
2920 #define SW_CSTAT(NAME) + 1
2921 custom_stats->size = SW_CSTATS;
2922 #undef SW_CSTAT
2923 custom_stats->counters = xcalloc(custom_stats->size,
2924 sizeof *custom_stats->counters);
2925
2926 ovs_mutex_lock(&dev->mutex);
2927
2928 rte_spinlock_lock(&dev->stats_lock);
2929 i = 0;
2930 #define SW_CSTAT(NAME) \
2931 custom_stats->counters[i++].value = dev->sw_stats->NAME;
2932 SW_CSTATS;
2933 #undef SW_CSTAT
2934 rte_spinlock_unlock(&dev->stats_lock);
2935
2936 ovs_mutex_unlock(&dev->mutex);
2937
2938 i = 0;
2939 n = 0;
2940 #define SW_CSTAT(NAME) \
2941 if (custom_stats->counters[i].value != UINT64_MAX) { \
2942 ovs_strlcpy(custom_stats->counters[n].name, \
2943 "ovs_"#NAME, NETDEV_CUSTOM_STATS_NAME_SIZE); \
2944 custom_stats->counters[n].value = custom_stats->counters[i].value; \
2945 n++; \
2946 } \
2947 i++;
2948 SW_CSTATS;
2949 #undef SW_CSTAT
2950
2951 custom_stats->size = n;
2952 return 0;
2953 }
2954
2955 static int
2956 netdev_dpdk_get_features(const struct netdev *netdev,
2957 enum netdev_features *current,
2958 enum netdev_features *advertised,
2959 enum netdev_features *supported,
2960 enum netdev_features *peer)
2961 {
2962 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2963 struct rte_eth_link link;
2964 uint32_t feature = 0;
2965
2966 ovs_mutex_lock(&dev->mutex);
2967 link = dev->link;
2968 ovs_mutex_unlock(&dev->mutex);
2969
2970 /* Match against OpenFlow defined link speed values. */
2971 if (link.link_duplex == ETH_LINK_FULL_DUPLEX) {
2972 switch (link.link_speed) {
2973 case ETH_SPEED_NUM_10M:
2974 feature |= NETDEV_F_10MB_FD;
2975 break;
2976 case ETH_SPEED_NUM_100M:
2977 feature |= NETDEV_F_100MB_FD;
2978 break;
2979 case ETH_SPEED_NUM_1G:
2980 feature |= NETDEV_F_1GB_FD;
2981 break;
2982 case ETH_SPEED_NUM_10G:
2983 feature |= NETDEV_F_10GB_FD;
2984 break;
2985 case ETH_SPEED_NUM_40G:
2986 feature |= NETDEV_F_40GB_FD;
2987 break;
2988 case ETH_SPEED_NUM_100G:
2989 feature |= NETDEV_F_100GB_FD;
2990 break;
2991 default:
2992 feature |= NETDEV_F_OTHER;
2993 }
2994 } else if (link.link_duplex == ETH_LINK_HALF_DUPLEX) {
2995 switch (link.link_speed) {
2996 case ETH_SPEED_NUM_10M:
2997 feature |= NETDEV_F_10MB_HD;
2998 break;
2999 case ETH_SPEED_NUM_100M:
3000 feature |= NETDEV_F_100MB_HD;
3001 break;
3002 case ETH_SPEED_NUM_1G:
3003 feature |= NETDEV_F_1GB_HD;
3004 break;
3005 default:
3006 feature |= NETDEV_F_OTHER;
3007 }
3008 }
3009
3010 if (link.link_autoneg) {
3011 feature |= NETDEV_F_AUTONEG;
3012 }
3013
3014 *current = feature;
3015 *advertised = *supported = *peer = 0;
3016
3017 return 0;
3018 }
3019
3020 static struct ingress_policer *
3021 netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
3022 {
3023 struct ingress_policer *policer = NULL;
3024 uint64_t rate_bytes;
3025 uint64_t burst_bytes;
3026 int err = 0;
3027
3028 policer = xmalloc(sizeof *policer);
3029 rte_spinlock_init(&policer->policer_lock);
3030
3031 /* rte_meter requires bytes so convert kbits rate and burst to bytes. */
3032 rate_bytes = rate * 1000ULL / 8;
3033 burst_bytes = burst * 1000ULL / 8;
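/* E.g. a rate of 10000 kbit/s becomes 10000 * 1000 / 8 = 1250000 bytes/s. */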
3034
3035 policer->app_srtcm_params.cir = rate_bytes;
3036 policer->app_srtcm_params.cbs = burst_bytes;
3037 policer->app_srtcm_params.ebs = 0;
3038 err = rte_meter_srtcm_profile_config(&policer->in_prof,
3039 &policer->app_srtcm_params);
3040 if (!err) {
3041 err = rte_meter_srtcm_config(&policer->in_policer,
3042 &policer->in_prof);
3043 }
3044 if (err) {
3045 VLOG_ERR("Could not create rte meter for ingress policer");
3046 free(policer);
3047 return NULL;
3048 }
3049
3050 return policer;
3051 }
3052
3053 static int
3054 netdev_dpdk_set_policing(struct netdev* netdev, uint32_t policer_rate,
3055 uint32_t policer_burst)
3056 {
3057 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3058 struct ingress_policer *policer;
3059
3060 /* Force to 0 if no rate specified,
3061 * default to 8000 kbits if burst is 0,
3062 * else stick with user-specified value.
3063 */
3064 policer_burst = (!policer_rate ? 0
3065 : !policer_burst ? 8000
3066 : policer_burst);
3067
3068 ovs_mutex_lock(&dev->mutex);
3069
3070 policer = ovsrcu_get_protected(struct ingress_policer *,
3071 &dev->ingress_policer);
3072
3073 if (dev->policer_rate == policer_rate &&
3074 dev->policer_burst == policer_burst) {
3075 /* Assume that settings haven't changed since we last set them. */
3076 ovs_mutex_unlock(&dev->mutex);
3077 return 0;
3078 }
3079
3080 /* Destroy any existing ingress policer for the device if one exists */
3081 if (policer) {
3082 ovsrcu_postpone(free, policer);
3083 }
3084
3085 if (policer_rate != 0) {
3086 policer = netdev_dpdk_policer_construct(policer_rate, policer_burst);
3087 } else {
3088 policer = NULL;
3089 }
3090 ovsrcu_set(&dev->ingress_policer, policer);
3091 dev->policer_rate = policer_rate;
3092 dev->policer_burst = policer_burst;
3093 ovs_mutex_unlock(&dev->mutex);
3094
3095 return 0;
3096 }
3097
3098 static int
3099 netdev_dpdk_get_ifindex(const struct netdev *netdev)
3100 {
3101 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3102
3103 ovs_mutex_lock(&dev->mutex);
3104 /* Calculate hash from the netdev name. Ensure that ifindex is a 24-bit
3105 * positive integer to meet RFC 2863 recommendations.
3106 */
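/* The modulo yields a value in [0, 0xfffffd]; adding 1 keeps the ifindex
 * within [1, 0xfffffe], which fits in 24 bits. */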
3107 int ifindex = hash_string(netdev->name, 0) % 0xfffffe + 1;
3108 ovs_mutex_unlock(&dev->mutex);
3109
3110 return ifindex;
3111 }
3112
3113 static int
3114 netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
3115 {
3116 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3117
3118 ovs_mutex_lock(&dev->mutex);
3119 check_link_status(dev);
3120 *carrier = dev->link.link_status;
3121
3122 ovs_mutex_unlock(&dev->mutex);
3123
3124 return 0;
3125 }
3126
3127 static int
3128 netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
3129 {
3130 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3131
3132 ovs_mutex_lock(&dev->mutex);
3133
3134 if (is_vhost_running(dev)) {
3135 *carrier = 1;
3136 } else {
3137 *carrier = 0;
3138 }
3139
3140 ovs_mutex_unlock(&dev->mutex);
3141
3142 return 0;
3143 }
3144
3145 static long long int
3146 netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
3147 {
3148 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3149 long long int carrier_resets;
3150
3151 ovs_mutex_lock(&dev->mutex);
3152 carrier_resets = dev->link_reset_cnt;
3153 ovs_mutex_unlock(&dev->mutex);
3154
3155 return carrier_resets;
3156 }
3157
3158 static int
3159 netdev_dpdk_set_miimon(struct netdev *netdev OVS_UNUSED,
3160 long long int interval OVS_UNUSED)
3161 {
3162 return EOPNOTSUPP;
3163 }
3164
3165 static int
3166 netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
3167 enum netdev_flags off, enum netdev_flags on,
3168 enum netdev_flags *old_flagsp)
3169 OVS_REQUIRES(dev->mutex)
3170 {
3171 if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
3172 return EINVAL;
3173 }
3174
3175 *old_flagsp = dev->flags;
3176 dev->flags |= on;
3177 dev->flags &= ~off;
3178
3179 if (dev->flags == *old_flagsp) {
3180 return 0;
3181 }
3182
3183 if (dev->type == DPDK_DEV_ETH) {
3184
3185 if ((dev->flags ^ *old_flagsp) & NETDEV_UP) {
3186 int err;
3187
3188 if (dev->flags & NETDEV_UP) {
3189 err = rte_eth_dev_set_link_up(dev->port_id);
3190 } else {
3191 err = rte_eth_dev_set_link_down(dev->port_id);
3192 }
3193 if (err == -ENOTSUP) {
3194 VLOG_INFO("Interface %s does not support link state "
3195 "configuration", netdev_get_name(&dev->up));
3196 } else if (err < 0) {
3197 VLOG_ERR("Interface %s link change error: %s",
3198 netdev_get_name(&dev->up), rte_strerror(-err));
3199 dev->flags = *old_flagsp;
3200 return -err;
3201 }
3202 }
3203
3204 if (dev->flags & NETDEV_PROMISC) {
3205 rte_eth_promiscuous_enable(dev->port_id);
3206 }
3207
3208 netdev_change_seq_changed(&dev->up);
3209 } else {
3210 /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is
3211 * running then change netdev's change_seq to trigger link state
3212 * update. */
3213
3214 if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
3215 && is_vhost_running(dev)) {
3216 netdev_change_seq_changed(&dev->up);
3217
3218 /* Clear statistics if device is getting up. */
3219 if (NETDEV_UP & on) {
3220 rte_spinlock_lock(&dev->stats_lock);
3221 memset(&dev->stats, 0, sizeof dev->stats);
3222 rte_spinlock_unlock(&dev->stats_lock);
3223 }
3224 }
3225 }
3226
3227 return 0;
3228 }
3229
3230 static int
3231 netdev_dpdk_update_flags(struct netdev *netdev,
3232 enum netdev_flags off, enum netdev_flags on,
3233 enum netdev_flags *old_flagsp)
3234 {
3235 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3236 int error;
3237
3238 ovs_mutex_lock(&dev->mutex);
3239 error = netdev_dpdk_update_flags__(dev, off, on, old_flagsp);
3240 ovs_mutex_unlock(&dev->mutex);
3241
3242 return error;
3243 }
3244
3245 static int
3246 netdev_dpdk_vhost_user_get_status(const struct netdev *netdev,
3247 struct smap *args)
3248 {
3249 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3250
3251 ovs_mutex_lock(&dev->mutex);
3252
3253 bool client_mode = dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT;
3254 smap_add_format(args, "mode", "%s", client_mode ? "client" : "server");
3255
3256 int vid = netdev_dpdk_get_vid(dev);
3257 if (vid < 0) {
3258 smap_add_format(args, "status", "disconnected");
3259 ovs_mutex_unlock(&dev->mutex);
3260 return 0;
3261 } else {
3262 smap_add_format(args, "status", "connected");
3263 }
3264
3265 char socket_name[PATH_MAX];
3266 if (!rte_vhost_get_ifname(vid, socket_name, PATH_MAX)) {
3267 smap_add_format(args, "socket", "%s", socket_name);
3268 }
3269
3270 uint64_t features;
3271 if (!rte_vhost_get_negotiated_features(vid, &features)) {
3272 smap_add_format(args, "features", "0x%016"PRIx64, features);
3273 }
3274
3275 uint16_t mtu;
3276 if (!rte_vhost_get_mtu(vid, &mtu)) {
3277 smap_add_format(args, "mtu", "%d", mtu);
3278 }
3279
3280 int numa = rte_vhost_get_numa_node(vid);
3281 if (numa >= 0) {
3282 smap_add_format(args, "numa", "%d", numa);
3283 }
3284
3285 uint16_t vring_num = rte_vhost_get_vring_num(vid);
3286 if (vring_num) {
3287 smap_add_format(args, "num_of_vrings", "%d", vring_num);
3288 }
3289
3290 for (int i = 0; i < vring_num; i++) {
3291 struct rte_vhost_vring vring;
3292
3293 rte_vhost_get_vhost_vring(vid, i, &vring);
3294 smap_add_nocopy(args, xasprintf("vring_%d_size", i),
3295 xasprintf("%d", vring.size));
3296 }
3297
3298 ovs_mutex_unlock(&dev->mutex);
3299 return 0;
3300 }
3301
3302 /*
3303 * Convert a given uint32_t link speed defined in DPDK to a string
3304 * equivalent.
3305 */
3306 static const char *
3307 netdev_dpdk_link_speed_to_str__(uint32_t link_speed)
3308 {
3309 switch (link_speed) {
3310 case ETH_SPEED_NUM_10M: return "10Mbps";
3311 case ETH_SPEED_NUM_100M: return "100Mbps";
3312 case ETH_SPEED_NUM_1G: return "1Gbps";
3313 case ETH_SPEED_NUM_2_5G: return "2.5Gbps";
3314 case ETH_SPEED_NUM_5G: return "5Gbps";
3315 case ETH_SPEED_NUM_10G: return "10Gbps";
3316 case ETH_SPEED_NUM_20G: return "20Gbps";
3317 case ETH_SPEED_NUM_25G: return "25Gbps";
3318 case ETH_SPEED_NUM_40G: return "40Gbps";
3319 case ETH_SPEED_NUM_50G: return "50Gbps";
3320 case ETH_SPEED_NUM_56G: return "56Gbps";
3321 case ETH_SPEED_NUM_100G: return "100Gbps";
3322 default: return "Not Defined";
3323 }
3324 }
3325
3326 static int
3327 netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
3328 {
3329 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3330 struct rte_eth_dev_info dev_info;
3331 uint32_t link_speed;
3332
3333 if (!rte_eth_dev_is_valid_port(dev->port_id)) {
3334 return ENODEV;
3335 }
3336
3337 ovs_mutex_lock(&dpdk_mutex);
3338 ovs_mutex_lock(&dev->mutex);
3339 rte_eth_dev_info_get(dev->port_id, &dev_info);
3340 link_speed = dev->link.link_speed;
3341 ovs_mutex_unlock(&dev->mutex);
3342 const struct rte_bus *bus;
3343 const struct rte_pci_device *pci_dev;
3344 uint16_t vendor_id = PCI_ANY_ID;
3345 uint16_t device_id = PCI_ANY_ID;
3346 bus = rte_bus_find_by_device(dev_info.device);
3347 if (bus && !strcmp(bus->name, "pci")) {
3348 pci_dev = RTE_DEV_TO_PCI(dev_info.device);
3349 if (pci_dev) {
3350 vendor_id = pci_dev->id.vendor_id;
3351 device_id = pci_dev->id.device_id;
3352 }
3353 }
3354 ovs_mutex_unlock(&dpdk_mutex);
3355
3356 smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id);
3357 smap_add_format(args, "numa_id", "%d",
3358 rte_eth_dev_socket_id(dev->port_id));
3359 smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
3360 smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
3361 smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
3362 smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
3363 smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
3364 smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
3365 smap_add_format(args, "max_hash_mac_addrs", "%u",
3366 dev_info.max_hash_mac_addrs);
3367 smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
3368 smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);
3369
3370 /* Querying the DPDK library for iftype may be done in future, pending
3371 * support; cf. RFC 3635 Section 3.2.4. */
3372 enum { IF_TYPE_ETHERNETCSMACD = 6 };
3373
3374 smap_add_format(args, "if_type", "%"PRIu32, IF_TYPE_ETHERNETCSMACD);
3375 smap_add_format(args, "if_descr", "%s %s", rte_version(),
3376 dev_info.driver_name);
3377 smap_add_format(args, "pci-vendor_id", "0x%x", vendor_id);
3378 smap_add_format(args, "pci-device_id", "0x%x", device_id);
3379
3380 /* Not all link speeds are defined in the OpenFlow specs e.g. 25 Gbps.
3381 * In that case the speed will not be reported as part of the usual
3382 * call to get_features(). Get the link speed of the device and add it
3383 * to the device status in an easy to read string format.
3384 */
3385 smap_add(args, "link_speed",
3386 netdev_dpdk_link_speed_to_str__(link_speed));
3387
3388 return 0;
3389 }
3390
3391 static void
3392 netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
3393 OVS_REQUIRES(dev->mutex)
3394 {
3395 enum netdev_flags old_flags;
3396
3397 if (admin_state) {
3398 netdev_dpdk_update_flags__(dev, 0, NETDEV_UP, &old_flags);
3399 } else {
3400 netdev_dpdk_update_flags__(dev, NETDEV_UP, 0, &old_flags);
3401 }
3402 }
3403
3404 static void
3405 netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
3406 const char *argv[], void *aux OVS_UNUSED)
3407 {
3408 bool up;
3409
3410 if (!strcasecmp(argv[argc - 1], "up")) {
3411 up = true;
3412 } else if (!strcasecmp(argv[argc - 1], "down")) {
3413 up = false;
3414 } else {
3415 unixctl_command_reply_error(conn, "Invalid Admin State");
3416 return;
3417 }
3418
3419 if (argc > 2) {
3420 struct netdev *netdev = netdev_from_name(argv[1]);
3421
3422 if (netdev && is_dpdk_class(netdev->netdev_class)) {
3423 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3424
3425 ovs_mutex_lock(&dev->mutex);
3426 netdev_dpdk_set_admin_state__(dev, up);
3427 ovs_mutex_unlock(&dev->mutex);
3428
3429 netdev_close(netdev);
3430 } else {
3431 unixctl_command_reply_error(conn, "Not a DPDK Interface");
3432 netdev_close(netdev);
3433 return;
3434 }
3435 } else {
3436 struct netdev_dpdk *dev;
3437
3438 ovs_mutex_lock(&dpdk_mutex);
3439 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
3440 ovs_mutex_lock(&dev->mutex);
3441 netdev_dpdk_set_admin_state__(dev, up);
3442 ovs_mutex_unlock(&dev->mutex);
3443 }
3444 ovs_mutex_unlock(&dpdk_mutex);
3445 }
3446 unixctl_command_reply(conn, "OK");
3447 }
3448
3449 static void
3450 netdev_dpdk_detach(struct unixctl_conn *conn, int argc OVS_UNUSED,
3451 const char *argv[], void *aux OVS_UNUSED)
3452 {
3453 char *response;
3454 dpdk_port_t port_id;
3455 struct netdev_dpdk *dev;
3456 struct rte_device *rte_dev;
3457 struct ds used_interfaces = DS_EMPTY_INITIALIZER;
3458 bool used = false;
3459
3460 ovs_mutex_lock(&dpdk_mutex);
3461
3462 port_id = netdev_dpdk_get_port_by_devargs(argv[1]);
3463 if (!rte_eth_dev_is_valid_port(port_id)) {
3464 response = xasprintf("Device '%s' not found in DPDK", argv[1]);
3465 goto error;
3466 }
3467
3468 rte_dev = rte_eth_devices[port_id].device;
3469 ds_put_format(&used_interfaces,
3470 "Device '%s' is being used by the following interfaces:",
3471 argv[1]);
3472
3473 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
3474 /* FIXME: avoid direct access to DPDK array rte_eth_devices. */
3475 if (rte_eth_devices[dev->port_id].device == rte_dev
3476 && rte_eth_devices[dev->port_id].state != RTE_ETH_DEV_UNUSED) {
3477 used = true;
3478 ds_put_format(&used_interfaces, " %s",
3479 netdev_get_name(&dev->up));
3480 }
3481 }
3482
3483 if (used) {
3484 ds_put_cstr(&used_interfaces, ". Remove them before detaching.");
3485 response = ds_steal_cstr(&used_interfaces);
3486 ds_destroy(&used_interfaces);
3487 goto error;
3488 }
3489 ds_destroy(&used_interfaces);
3490
3491 rte_eth_dev_close(port_id);
3492 if (rte_dev_remove(rte_dev) < 0) {
3493 response = xasprintf("Device '%s' can not be detached", argv[1]);
3494 goto error;
3495 }
3496
3497 response = xasprintf("All devices shared with device '%s' "
3498 "have been detached", argv[1]);
3499
3500 ovs_mutex_unlock(&dpdk_mutex);
3501 unixctl_command_reply(conn, response);
3502 free(response);
3503 return;
3504
3505 error:
3506 ovs_mutex_unlock(&dpdk_mutex);
3507 unixctl_command_reply_error(conn, response);
3508 free(response);
3509 }
3510
3511 static void
3512 netdev_dpdk_get_mempool_info(struct unixctl_conn *conn,
3513 int argc, const char *argv[],
3514 void *aux OVS_UNUSED)
3515 {
3516 size_t size;
3517 FILE *stream;
3518 char *response = NULL;
3519 struct netdev *netdev = NULL;
3520
3521 if (argc == 2) {
3522 netdev = netdev_from_name(argv[1]);
3523 if (!netdev || !is_dpdk_class(netdev->netdev_class)) {
3524 unixctl_command_reply_error(conn, "Not a DPDK Interface");
3525 goto out;
3526 }
3527 }
3528
3529 stream = open_memstream(&response, &size);
3530 if (!stream) {
3531 response = xasprintf("Unable to open memstream: %s.",
3532 ovs_strerror(errno));
3533 unixctl_command_reply_error(conn, response);
3534 goto out;
3535 }
3536
3537 if (netdev) {
3538 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3539
3540 ovs_mutex_lock(&dev->mutex);
3541 ovs_mutex_lock(&dpdk_mp_mutex);
3542
3543 rte_mempool_dump(stream, dev->dpdk_mp->mp);
3544
3545 ovs_mutex_unlock(&dpdk_mp_mutex);
3546 ovs_mutex_unlock(&dev->mutex);
3547 } else {
3548 ovs_mutex_lock(&dpdk_mp_mutex);
3549 rte_mempool_list_dump(stream);
3550 ovs_mutex_unlock(&dpdk_mp_mutex);
3551 }
3552
3553 fclose(stream);
3554
3555 unixctl_command_reply(conn, response);
3556 out:
3557 free(response);
3558 netdev_close(netdev);
3559 }
3560
3561 /*
3562 * Set virtqueue flags so that we do not receive interrupts.
3563 */
3564 static void
3565 set_irq_status(int vid)
3566 {
3567 uint32_t i;
3568
3569 for (i = 0; i < rte_vhost_get_vring_num(vid); i++) {
3570 rte_vhost_enable_guest_notification(vid, i, 0);
3571 }
3572 }
3573
3574 /*
3575 * Fixes mapping for vhost-user tx queues. Must be called after each
3576 * enabling/disabling of queues and n_txq modifications.
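 *
 * For example, with four tx queues of which only queues 1 and 3 are enabled
 * (i.e. map to themselves), the disabled queues are remapped round-robin,
 * giving 0->1, 1->1, 2->3, 3->3.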
3577 */
3578 static void
3579 netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
3580 OVS_REQUIRES(dev->mutex)
3581 {
3582 int *enabled_queues, n_enabled = 0;
3583 int i, k, total_txqs = dev->up.n_txq;
3584
3585 enabled_queues = xcalloc(total_txqs, sizeof *enabled_queues);
3586
3587 for (i = 0; i < total_txqs; i++) {
3588 /* Enabled queues are always mapped to themselves. */
3589 if (dev->tx_q[i].map == i) {
3590 enabled_queues[n_enabled++] = i;
3591 }
3592 }
3593
3594 if (n_enabled == 0 && total_txqs != 0) {
3595 enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
3596 n_enabled = 1;
3597 }
3598
3599 k = 0;
3600 for (i = 0; i < total_txqs; i++) {
3601 if (dev->tx_q[i].map != i) {
3602 dev->tx_q[i].map = enabled_queues[k];
3603 k = (k + 1) % n_enabled;
3604 }
3605 }
3606
3607 if (VLOG_IS_DBG_ENABLED()) {
3608 struct ds mapping = DS_EMPTY_INITIALIZER;
3609
3610 ds_put_format(&mapping, "TX queue mapping for port '%s':\n",
3611 netdev_get_name(&dev->up));
3612 for (i = 0; i < total_txqs; i++) {
3613 ds_put_format(&mapping, "%2d --> %2d\n", i, dev->tx_q[i].map);
3614 }
3615
3616 VLOG_DBG("%s", ds_cstr(&mapping));
3617 ds_destroy(&mapping);
3618 }
3619
3620 free(enabled_queues);
3621 }
3622
3623 /*
3624 * A new virtio-net device is added to a vhost port.
3625 */
3626 static int
3627 new_device(int vid)
3628 {
3629 struct netdev_dpdk *dev;
3630 bool exists = false;
3631 int newnode = 0;
3632 char ifname[IF_NAME_SZ];
3633
3634 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
3635
3636 ovs_mutex_lock(&dpdk_mutex);
3637 /* Add device to the vhost port with the same name as that passed down. */
3638 LIST_FOR_EACH(dev, list_node, &dpdk_list) {
3639 ovs_mutex_lock(&dev->mutex);
3640 if (nullable_string_is_equal(ifname, dev->vhost_id)) {
3641 uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM;
3642
3643 /* Get NUMA information */
3644 newnode = rte_vhost_get_numa_node(vid);
3645 if (newnode == -1) {
3646 #ifdef VHOST_NUMA
3647 VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
3648 ifname);
3649 #endif
3650 newnode = dev->socket_id;
3651 }
3652
3653 if (dev->requested_n_txq < qp_num
3654 || dev->requested_n_rxq < qp_num
3655 || dev->requested_socket_id != newnode) {
3656 dev->requested_socket_id = newnode;
3657 dev->requested_n_rxq = qp_num;
3658 dev->requested_n_txq = qp_num;
3659 netdev_request_reconfigure(&dev->up);
3660 } else {
3661 /* Reconfiguration not required. */
3662 dev->vhost_reconfigured = true;
3663 }
3664
3665 ovsrcu_index_set(&dev->vid, vid);
3666 exists = true;
3667
3668 /* Disable notifications. */
3669 set_irq_status(vid);
3670 netdev_change_seq_changed(&dev->up);
3671 ovs_mutex_unlock(&dev->mutex);
3672 break;
3673 }
3674 ovs_mutex_unlock(&dev->mutex);
3675 }
3676 ovs_mutex_unlock(&dpdk_mutex);
3677
3678 if (!exists) {
3679 VLOG_INFO("vHost Device '%s' can't be added - name not found", ifname);
3680
3681 return -1;
3682 }
3683
3684 VLOG_INFO("vHost Device '%s' has been added on numa node %i",
3685 ifname, newnode);
3686
3687 return 0;
3688 }
3689
3690 /* Clears mapping for all available queues of vhost interface. */
3691 static void
3692 netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
3693 OVS_REQUIRES(dev->mutex)
3694 {
3695 int i;
3696
3697 for (i = 0; i < dev->up.n_txq; i++) {
3698 dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
3699 }
3700 }
3701
3702 /*
3703 * Remove a virtio-net device from the specific vhost port. Clearing the
3704 * device's vid stops any more packets from being sent to or received from
3705 * the VM, and an RCU synchronization ensures that all threads still using
3706 * the device have quiesced before the removal is reported.
3707 */
3708 static void
3709 destroy_device(int vid)
3710 {
3711 struct netdev_dpdk *dev;
3712 bool exists = false;
3713 char ifname[IF_NAME_SZ];
3714
3715 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
3716
3717 ovs_mutex_lock(&dpdk_mutex);
3718 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
3719 if (netdev_dpdk_get_vid(dev) == vid) {
3720
3721 ovs_mutex_lock(&dev->mutex);
3722 dev->vhost_reconfigured = false;
3723 ovsrcu_index_set(&dev->vid, -1);
3724 memset(dev->vhost_rxq_enabled, 0,
3725 dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled);
3726 netdev_dpdk_txq_map_clear(dev);
3727
3728 netdev_change_seq_changed(&dev->up);
3729 ovs_mutex_unlock(&dev->mutex);
3730 exists = true;
3731 break;
3732 }
3733 }
3734
3735 ovs_mutex_unlock(&dpdk_mutex);
3736
3737 if (exists) {
3738 /*
3739 * Wait for other threads to quiesce after setting the vid to -1,
3740 * before returning.
3741 */
3742 ovsrcu_synchronize();
3743 /*
3744 * As call to ovsrcu_synchronize() will end the quiescent state,
3745 * put thread back into quiescent state before returning.
3746 */
3747 ovsrcu_quiesce_start();
3748 VLOG_INFO("vHost Device '%s' has been removed", ifname);
3749 } else {
3750 VLOG_INFO("vHost Device '%s' not found", ifname);
3751 }
3752 }
3753
3754 static int
3755 vring_state_changed(int vid, uint16_t queue_id, int enable)
3756 {
3757 struct netdev_dpdk *dev;
3758 bool exists = false;
3759 int qid = queue_id / VIRTIO_QNUM;
3760 bool is_rx = (queue_id % VIRTIO_QNUM) == VIRTIO_TXQ;
3761 char ifname[IF_NAME_SZ];
3762
3763 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
3764
3765 ovs_mutex_lock(&dpdk_mutex);
3766 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
3767 ovs_mutex_lock(&dev->mutex);
3768 if (nullable_string_is_equal(ifname, dev->vhost_id)) {
3769 if (is_rx) {
3770 bool old_state = dev->vhost_rxq_enabled[qid];
3771
3772 dev->vhost_rxq_enabled[qid] = enable != 0;
3773 if (old_state != dev->vhost_rxq_enabled[qid]) {
3774 netdev_change_seq_changed(&dev->up);
3775 }
3776 } else {
3777 if (enable) {
3778 dev->tx_q[qid].map = qid;
3779 } else {
3780 dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
3781 }
3782 netdev_dpdk_remap_txqs(dev);
3783 }
3784 exists = true;
3785 ovs_mutex_unlock(&dev->mutex);
3786 break;
3787 }
3788 ovs_mutex_unlock(&dev->mutex);
3789 }
3790 ovs_mutex_unlock(&dpdk_mutex);
3791
3792 if (exists) {
3793 VLOG_INFO("State of queue %d ( %s_qid %d ) of vhost device '%s' "
3794 "changed to \'%s\'", queue_id, is_rx == true ? "rx" : "tx",
3795 qid, ifname, (enable == 1) ? "enabled" : "disabled");
3796 } else {
3797 VLOG_INFO("vHost Device '%s' not found", ifname);
3798 return -1;
3799 }
3800
3801 return 0;
3802 }
3803
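/*
 * For illustration: 'queue_id' indexes individual vrings, so queue_id 3
 * decodes to qid 1 (queue_id / VIRTIO_QNUM) and, since 3 % VIRTIO_QNUM ==
 * VIRTIO_TXQ, refers to the guest's TX vring, which is an RX queue from
 * the host's point of view (is_rx == true above).
 */
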
3804 static void
3805 destroy_connection(int vid)
3806 {
3807 struct netdev_dpdk *dev;
3808 char ifname[IF_NAME_SZ];
3809 bool exists = false;
3810
3811 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
3812
3813 ovs_mutex_lock(&dpdk_mutex);
3814 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
3815 ovs_mutex_lock(&dev->mutex);
3816 if (nullable_string_is_equal(ifname, dev->vhost_id)) {
3817 uint32_t qp_num = NR_QUEUE;
3818
3819 if (netdev_dpdk_get_vid(dev) >= 0) {
3820 VLOG_ERR("Connection on socket '%s' destroyed while vhost "
3821 "device still attached.", dev->vhost_id);
3822 }
3823
3824 /* Restore the number of queue pairs to default. */
3825 if (dev->requested_n_txq != qp_num
3826 || dev->requested_n_rxq != qp_num) {
3827 dev->requested_n_rxq = qp_num;
3828 dev->requested_n_txq = qp_num;
3829 netdev_request_reconfigure(&dev->up);
3830 }
3831 ovs_mutex_unlock(&dev->mutex);
3832 exists = true;
3833 break;
3834 }
3835 ovs_mutex_unlock(&dev->mutex);
3836 }
3837 ovs_mutex_unlock(&dpdk_mutex);
3838
3839 if (exists) {
3840 VLOG_INFO("vHost Device '%s' connection has been destroyed", ifname);
3841 } else {
3842 VLOG_INFO("vHost Device '%s' not found", ifname);
3843 }
3844 }
3845
3846 /*
3847 * Retrieve the DPDK virtio device ID (vid) associated with a vhostuser
3848 * or vhostuserclient netdev.
3849 *
3850 * Returns a value greater than or equal to zero for a valid vid, or '-1'
3851 * if there is no valid vid associated. A vid of '-1' must not be used in
3852 * rte_vhost API calls.
3853 *
3854 * Once obtained and validated, a vid can be used by a PMD for multiple
3855 * subsequent rte_vhost API calls until the PMD quiesces. A PMD should
3856 * not fetch the vid again for each of a series of API calls.
3857 */
3858
3859 int
3860 netdev_dpdk_get_vid(const struct netdev_dpdk *dev)
3861 {
3862 return ovsrcu_index_get(&dev->vid);
3863 }
3864
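/*
 * A minimal usage sketch of the rules above (hypothetical caller): fetch
 * the vid once, validate it, and reuse it for the series of calls.
 *
 *     int vid = netdev_dpdk_get_vid(dev);
 *     if (vid >= 0) {
 *         uint16_t nr_vrings = rte_vhost_get_vring_num(vid);
 *         ...further rte_vhost calls using the same 'vid'...
 *     }
 */
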
3865 struct ingress_policer *
3866 netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
3867 {
3868 return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
3869 }
3870
3871 static int
3872 netdev_dpdk_class_init(void)
3873 {
3874 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3875
3876 /* This function can be called for different classes. The initialization
3877 * needs to be done only once. */
3878 if (ovsthread_once_start(&once)) {
3879 ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
3880 unixctl_command_register("netdev-dpdk/set-admin-state",
3881 "[netdev] up|down", 1, 2,
3882 netdev_dpdk_set_admin_state, NULL);
3883
3884 unixctl_command_register("netdev-dpdk/detach",
3885 "pci address of device", 1, 1,
3886 netdev_dpdk_detach, NULL);
3887
3888 unixctl_command_register("netdev-dpdk/get-mempool-info",
3889 "[netdev]", 0, 1,
3890 netdev_dpdk_get_mempool_info, NULL);
3891
3892 ovsthread_once_done(&once);
3893 }
3894
3895 return 0;
3896 }
3897
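/*
 * The commands registered above are reachable through ovs-appctl, for
 * example (illustrative invocations; the netdev name and PCI address are
 * placeholders):
 *
 *     ovs-appctl netdev-dpdk/set-admin-state dpdk0 down
 *     ovs-appctl netdev-dpdk/get-mempool-info dpdk0
 *     ovs-appctl netdev-dpdk/detach 0000:01:00.0
 */
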
3898 /* Client Rings */
3899
3900 static int
3901 dpdk_ring_create(const char dev_name[], unsigned int port_no,
3902 dpdk_port_t *eth_port_id)
3903 {
3904 struct dpdk_ring *ring_pair;
3905 char *ring_name;
3906 int port_id;
3907
3908 ring_pair = dpdk_rte_mzalloc(sizeof *ring_pair);
3909 if (!ring_pair) {
3910 return ENOMEM;
3911 }
3912
3913 /* XXX: Add support for multiqueue ring. */
3914 ring_name = xasprintf("%s_tx", dev_name);
3915
3916 /* Create single producer tx ring, netdev does explicit locking. */
3917 ring_pair->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
3918 RING_F_SP_ENQ);
3919 free(ring_name);
3920 if (ring_pair->cring_tx == NULL) {
3921 rte_free(ring_pair);
3922 return ENOMEM;
3923 }
3924
3925 ring_name = xasprintf("%s_rx", dev_name);
3926
3927 /* Create single consumer rx ring, netdev does explicit locking. */
3928 ring_pair->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
3929 RING_F_SC_DEQ);
3930 free(ring_name);
3931 if (ring_pair->cring_rx == NULL) {
3932 rte_free(ring_pair);
3933 return ENOMEM;
3934 }
3935
3936 port_id = rte_eth_from_rings(dev_name, &ring_pair->cring_rx, 1,
3937 &ring_pair->cring_tx, 1, SOCKET0);
3938
3939 if (port_id < 0) {
3940 rte_free(ring_pair);
3941 return ENODEV;
3942 }
3943
3944 ring_pair->user_port_id = port_no;
3945 ring_pair->eth_port_id = port_id;
3946 *eth_port_id = port_id;
3947
3948 ovs_list_push_back(&dpdk_ring_list, &ring_pair->list_node);
3949
3950 return 0;
3951 }
3952
3953 static int
3954 dpdk_ring_open(const char dev_name[], dpdk_port_t *eth_port_id)
3955 OVS_REQUIRES(dpdk_mutex)
3956 {
3957 struct dpdk_ring *ring_pair;
3958 unsigned int port_no;
3959 int err = 0;
3960
3961 /* Names always start with "dpdkr" */
3962 err = dpdk_dev_parse_name(dev_name, "dpdkr", &port_no);
3963 if (err) {
3964 return err;
3965 }
3966
3967 /* Look through our list to find the device */
3968 LIST_FOR_EACH (ring_pair, list_node, &dpdk_ring_list) {
3969 if (ring_pair->user_port_id == port_no) {
3970 VLOG_INFO("Found dpdk ring device %s:", dev_name);
3971 /* Really all that is needed */
3972 *eth_port_id = ring_pair->eth_port_id;
3973 return 0;
3974 }
3975 }
3976 /* Need to create the device rings */
3977 return dpdk_ring_create(dev_name, port_no, eth_port_id);
3978 }
3979
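/*
 * For illustration: a port named "dpdkr0" parses to port_no 0; the first
 * open creates the "dpdkr0_tx"/"dpdkr0_rx" rings and the backing ethdev,
 * while later opens of the same name simply return the cached eth_port_id.
 */
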
3980 static int
3981 netdev_dpdk_ring_send(struct netdev *netdev, int qid,
3982 struct dp_packet_batch *batch, bool concurrent_txq)
3983 {
3984 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3985 struct dp_packet *packet;
3986
3987 /* When using 'dpdkr' and sending to a DPDK ring, we want to ensure that
3988 * the offload fields are clear. This is because the same mbuf may be
3989 * modified by the consumer of the ring and returned to the datapath
3990 * without recalculating the RSS hash or revalidating the checksums. */
3991 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
3992 dp_packet_reset_offload(packet);
3993 }
3994
3995 netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
3996 return 0;
3997 }
3998
3999 static int
4000 netdev_dpdk_ring_construct(struct netdev *netdev)
4001 {
4002 dpdk_port_t port_no = 0;
4003 int err = 0;
4004
4005 VLOG_WARN_ONCE("dpdkr a.k.a. ring ports are considered deprecated. "
4006 "Please migrate to virtio-based interfaces, e.g. "
4007 "dpdkvhostuserclient ports, net_virtio_user DPDK vdev.");
4008
4009 ovs_mutex_lock(&dpdk_mutex);
4010
4011 err = dpdk_ring_open(netdev->name, &port_no);
4012 if (err) {
4013 goto unlock_dpdk;
4014 }
4015
4016 err = common_construct(netdev, port_no, DPDK_DEV_ETH,
4017 rte_eth_dev_socket_id(port_no));
4018 unlock_dpdk:
4019 ovs_mutex_unlock(&dpdk_mutex);
4020 return err;
4021 }
4022
4023 /* QoS Functions */
4024
4025 /*
4026 * Initialize QoS configuration operations.
4027 */
4028 static void
4029 qos_conf_init(struct qos_conf *conf, const struct dpdk_qos_ops *ops)
4030 {
4031 conf->ops = ops;
4032 rte_spinlock_init(&conf->lock);
4033 }
4034
4035 /*
4036 * Search the existing QoS operations in 'qos_confs' and compare each
4037 * set's 'qos_name' to 'name'. Returns a pointer to the matching
4038 * dpdk_qos_ops, or NULL if there is no match.
4039 */
4040 static const struct dpdk_qos_ops *
4041 qos_lookup_name(const char *name)
4042 {
4043 const struct dpdk_qos_ops *const *opsp;
4044
4045 for (opsp = qos_confs; *opsp != NULL; opsp++) {
4046 const struct dpdk_qos_ops *ops = *opsp;
4047 if (!strcmp(name, ops->qos_name)) {
4048 return ops;
4049 }
4050 }
4051 return NULL;
4052 }
4053
4054 static int
4055 netdev_dpdk_get_qos_types(const struct netdev *netdev OVS_UNUSED,
4056 struct sset *types)
4057 {
4058 const struct dpdk_qos_ops *const *opsp;
4059
4060 for (opsp = qos_confs; *opsp != NULL; opsp++) {
4061 const struct dpdk_qos_ops *ops = *opsp;
4062 if (ops->qos_construct && ops->qos_name[0] != '\0') {
4063 sset_add(types, ops->qos_name);
4064 }
4065 }
4066 return 0;
4067 }
4068
4069 static int
4070 netdev_dpdk_get_qos(const struct netdev *netdev,
4071 const char **typep, struct smap *details)
4072 {
4073 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4074 struct qos_conf *qos_conf;
4075 int error = 0;
4076
4077 ovs_mutex_lock(&dev->mutex);
4078 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
4079 if (qos_conf) {
4080 *typep = qos_conf->ops->qos_name;
4081 error = (qos_conf->ops->qos_get
4082 ? qos_conf->ops->qos_get(qos_conf, details) : 0);
4083 } else {
4084 /* No QoS configuration set, return an empty string */
4085 *typep = "";
4086 }
4087 ovs_mutex_unlock(&dev->mutex);
4088
4089 return error;
4090 }
4091
4092 static int
4093 netdev_dpdk_set_qos(struct netdev *netdev, const char *type,
4094 const struct smap *details)
4095 {
4096 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4097 const struct dpdk_qos_ops *new_ops = NULL;
4098 struct qos_conf *qos_conf, *new_qos_conf = NULL;
4099 int error = 0;
4100
4101 ovs_mutex_lock(&dev->mutex);
4102
4103 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
4104
4105 new_ops = qos_lookup_name(type);
4106
4107 if (!new_ops || !new_ops->qos_construct) {
4108 new_qos_conf = NULL;
4109 if (type && type[0]) {
4110 error = EOPNOTSUPP;
4111 }
4112 } else if (qos_conf && qos_conf->ops == new_ops
4113 && qos_conf->ops->qos_is_equal(qos_conf, details)) {
4114 new_qos_conf = qos_conf;
4115 } else {
4116 error = new_ops->qos_construct(details, &new_qos_conf);
4117 }
4118
4119 if (error) {
4120 VLOG_ERR("Failed to set QoS type %s on port %s: %s",
4121 type, netdev->name, rte_strerror(error));
4122 }
4123
4124 if (new_qos_conf != qos_conf) {
4125 ovsrcu_set(&dev->qos_conf, new_qos_conf);
4126 if (qos_conf) {
4127 ovsrcu_postpone(qos_conf->ops->qos_destruct, qos_conf);
4128 }
4129 }
4130
4131 ovs_mutex_unlock(&dev->mutex);
4132
4133 return error;
4134 }
4135
4136 /* egress-policer details */
4137
4138 struct egress_policer {
4139 struct qos_conf qos_conf;
4140 struct rte_meter_srtcm_params app_srtcm_params;
4141 struct rte_meter_srtcm egress_meter;
4142 struct rte_meter_srtcm_profile egress_prof;
4143 };
4144
4145 static void
4146 egress_policer_details_to_param(const struct smap *details,
4147 struct rte_meter_srtcm_params *params)
4148 {
4149 memset(params, 0, sizeof *params);
4150 params->cir = smap_get_ullong(details, "cir", 0);
4151 params->cbs = smap_get_ullong(details, "cbs", 0);
4152 params->ebs = 0;
4153 }
4154
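/*
 * The "cir" and "cbs" keys read above come from the QoS record's
 * other_config column.  An illustrative configuration (the port name and
 * parameter values are placeholders):
 *
 *     ovs-vsctl set port vhost-user0 qos=@newqos -- \
 *         --id=@newqos create qos type=egress-policer \
 *         other-config:cir=46000000 other-config:cbs=2048
 */
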
4155 static int
4156 egress_policer_qos_construct(const struct smap *details,
4157 struct qos_conf **conf)
4158 {
4159 struct egress_policer *policer;
4160 int err = 0;
4161
4162 policer = xmalloc(sizeof *policer);
4163 qos_conf_init(&policer->qos_conf, &egress_policer_ops);
4164 egress_policer_details_to_param(details, &policer->app_srtcm_params);
4165 err = rte_meter_srtcm_profile_config(&policer->egress_prof,
4166 &policer->app_srtcm_params);
4167 if (!err) {
4168 err = rte_meter_srtcm_config(&policer->egress_meter,
4169 &policer->egress_prof);
4170 }
4171
4172 if (!err) {
4173 *conf = &policer->qos_conf;
4174 } else {
4175 VLOG_ERR("Could not create rte meter for egress policer");
4176 free(policer);
4177 *conf = NULL;
4178 err = -err;
4179 }
4180
4181 return err;
4182 }
4183
4184 static void
4185 egress_policer_qos_destruct(struct qos_conf *conf)
4186 {
4187 struct egress_policer *policer = CONTAINER_OF(conf, struct egress_policer,
4188 qos_conf);
4189 free(policer);
4190 }
4191
4192 static int
4193 egress_policer_qos_get(const struct qos_conf *conf, struct smap *details)
4194 {
4195 struct egress_policer *policer =
4196 CONTAINER_OF(conf, struct egress_policer, qos_conf);
4197
4198 smap_add_format(details, "cir", "%"PRIu64, policer->app_srtcm_params.cir);
4199 smap_add_format(details, "cbs", "%"PRIu64, policer->app_srtcm_params.cbs);
4200
4201 return 0;
4202 }
4203
4204 static bool
4205 egress_policer_qos_is_equal(const struct qos_conf *conf,
4206 const struct smap *details)
4207 {
4208 struct egress_policer *policer =
4209 CONTAINER_OF(conf, struct egress_policer, qos_conf);
4210 struct rte_meter_srtcm_params params;
4211
4212 egress_policer_details_to_param(details, &params);
4213
4214 return !memcmp(&params, &policer->app_srtcm_params, sizeof params);
4215 }
4216
4217 static int
4218 egress_policer_run(struct qos_conf *conf, struct rte_mbuf **pkts, int pkt_cnt,
4219 bool should_steal)
4220 {
4221 int cnt = 0;
4222 struct egress_policer *policer =
4223 CONTAINER_OF(conf, struct egress_policer, qos_conf);
4224
4225 cnt = netdev_dpdk_policer_run(&policer->egress_meter,
4226 &policer->egress_prof, pkts,
4227 pkt_cnt, should_steal);
4228
4229 return cnt;
4230 }
4231
4232 static const struct dpdk_qos_ops egress_policer_ops = {
4233 "egress-policer", /* qos_name */
4234 egress_policer_qos_construct,
4235 egress_policer_qos_destruct,
4236 egress_policer_qos_get,
4237 egress_policer_qos_is_equal,
4238 egress_policer_run
4239 };
4240
4241 static int
4242 netdev_dpdk_reconfigure(struct netdev *netdev)
4243 {
4244 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4245 int err = 0;
4246
4247 ovs_mutex_lock(&dev->mutex);
4248
4249 if (netdev->n_txq == dev->requested_n_txq
4250 && netdev->n_rxq == dev->requested_n_rxq
4251 && dev->mtu == dev->requested_mtu
4252 && dev->lsc_interrupt_mode == dev->requested_lsc_interrupt_mode
4253 && dev->rxq_size == dev->requested_rxq_size
4254 && dev->txq_size == dev->requested_txq_size
4255 && dev->socket_id == dev->requested_socket_id
4256 && dev->started) {
4257 /* Reconfiguration is unnecessary */
4258
4259 goto out;
4260 }
4261
4262 rte_eth_dev_stop(dev->port_id);
4263 dev->started = false;
4264
4265 err = netdev_dpdk_mempool_configure(dev);
4266 if (err && err != EEXIST) {
4267 goto out;
4268 }
4269
4270 dev->lsc_interrupt_mode = dev->requested_lsc_interrupt_mode;
4271
4272 netdev->n_txq = dev->requested_n_txq;
4273 netdev->n_rxq = dev->requested_n_rxq;
4274
4275 dev->rxq_size = dev->requested_rxq_size;
4276 dev->txq_size = dev->requested_txq_size;
4277
4278 rte_free(dev->tx_q);
4279 err = dpdk_eth_dev_init(dev);
4280 dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
4281 if (!dev->tx_q) {
4282 err = ENOMEM;
4283 }
4284
4285 netdev_change_seq_changed(netdev);
4286
4287 out:
4288 ovs_mutex_unlock(&dev->mutex);
4289 return err;
4290 }
4291
4292 static int
4293 dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
4294 OVS_REQUIRES(dev->mutex)
4295 {
4296 dev->up.n_txq = dev->requested_n_txq;
4297 dev->up.n_rxq = dev->requested_n_rxq;
4298 int err;
4299
4300 /* Always keep RX queue 0 enabled for implementations that won't
4301 * report vring states. */
4302 dev->vhost_rxq_enabled[0] = true;
4303
4304 /* Enable TX queue 0 by default if it wasn't disabled. */
4305 if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
4306 dev->tx_q[0].map = 0;
4307 }
4308
4309 netdev_dpdk_remap_txqs(dev);
4310
4311 err = netdev_dpdk_mempool_configure(dev);
4312 if (!err) {
4313 /* A new mempool was created or re-used. */
4314 netdev_change_seq_changed(&dev->up);
4315 } else if (err != EEXIST) {
4316 return err;
4317 }
4318 if (netdev_dpdk_get_vid(dev) >= 0) {
4319 if (dev->vhost_reconfigured == false) {
4320 dev->vhost_reconfigured = true;
4321 /* Carrier status may need updating. */
4322 netdev_change_seq_changed(&dev->up);
4323 }
4324 }
4325
4326 return 0;
4327 }
4328
4329 static int
4330 netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
4331 {
4332 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4333 int err;
4334
4335 ovs_mutex_lock(&dev->mutex);
4336 err = dpdk_vhost_reconfigure_helper(dev);
4337 ovs_mutex_unlock(&dev->mutex);
4338
4339 return err;
4340 }
4341
4342 static int
4343 netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
4344 {
4345 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4346 int err;
4347 uint64_t vhost_flags = 0;
4348 bool zc_enabled;
4349
4350 ovs_mutex_lock(&dev->mutex);
4351
4352 /* Configure vHost client mode if requested and if the following criteria
4353 * are met:
4354 * 1. Device hasn't been registered yet.
4355 * 2. A path has been specified.
4356 */
4357 if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && dev->vhost_id) {
4358 /* Register client-mode device. */
4359 vhost_flags |= RTE_VHOST_USER_CLIENT;
4360
4361 /* Enable IOMMU support, if explicitly requested. */
4362 if (dpdk_vhost_iommu_enabled()) {
4363 vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
4364 }
4365
4366 /* Enable POSTCOPY support, if explicitly requested. */
4367 if (dpdk_vhost_postcopy_enabled()) {
4368 vhost_flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
4369 }
4370
4371 zc_enabled = dev->vhost_driver_flags
4372 & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
4373 /* Enable zero copy flag, if requested */
4374 if (zc_enabled) {
4375 vhost_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
4376 }
4377
4378 err = rte_vhost_driver_register(dev->vhost_id, vhost_flags);
4379 if (err) {
4380 VLOG_ERR("vhost-user device setup failure for device %s\n",
4381 dev->vhost_id);
4382 goto unlock;
4383 } else {
4384 /* Configuration successful */
4385 dev->vhost_driver_flags |= vhost_flags;
4386 VLOG_INFO("vHost User device '%s' created in 'client' mode, "
4387 "using client socket '%s'",
4388 dev->up.name, dev->vhost_id);
4389 if (zc_enabled) {
4390 VLOG_INFO("Zero copy enabled for vHost port %s", dev->up.name);
4391 }
4392 }
4393
4394 err = rte_vhost_driver_callback_register(dev->vhost_id,
4395 &virtio_net_device_ops);
4396 if (err) {
4397 VLOG_ERR("rte_vhost_driver_callback_register failed for "
4398 "vhost user client port: %s\n", dev->up.name);
4399 goto unlock;
4400 }
4401
4402 err = rte_vhost_driver_disable_features(dev->vhost_id,
4403 1ULL << VIRTIO_NET_F_HOST_TSO4
4404 | 1ULL << VIRTIO_NET_F_HOST_TSO6
4405 | 1ULL << VIRTIO_NET_F_CSUM);
4406 if (err) {
4407 VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
4408 "client port: %s\n", dev->up.name);
4409 goto unlock;
4410 }
4411
4412 err = rte_vhost_driver_start(dev->vhost_id);
4413 if (err) {
4414 VLOG_ERR("rte_vhost_driver_start failed for vhost user "
4415 "client port: %s\n", dev->up.name);
4416 goto unlock;
4417 }
4418 }
4419
4420 err = dpdk_vhost_reconfigure_helper(dev);
4421
4422 unlock:
4423 ovs_mutex_unlock(&dev->mutex);
4424
4425 return err;
4426 }
4427
4428 bool
4429 netdev_dpdk_flow_api_supported(struct netdev *netdev)
4430 {
4431 struct netdev_dpdk *dev;
4432 bool ret = false;
4433
4434 if (!is_dpdk_class(netdev->netdev_class)) {
4435 goto out;
4436 }
4437
4438 dev = netdev_dpdk_cast(netdev);
4439 ovs_mutex_lock(&dev->mutex);
4440 if (dev->type == DPDK_DEV_ETH) {
4441 /* TODO: Check if we are able to offload some minimal flow. */
4442 ret = true;
4443 }
4444 ovs_mutex_unlock(&dev->mutex);
4445 out:
4446 return ret;
4447 }
4448
4449 int
4450 netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
4451 struct rte_flow *rte_flow,
4452 struct rte_flow_error *error)
4453 {
4454 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4455 int ret;
4456
4457 ovs_mutex_lock(&dev->mutex);
4458 ret = rte_flow_destroy(dev->port_id, rte_flow, error);
4459 ovs_mutex_unlock(&dev->mutex);
4460 return ret;
4461 }
4462
4463 struct rte_flow *
4464 netdev_dpdk_rte_flow_create(struct netdev *netdev,
4465 const struct rte_flow_attr *attr,
4466 const struct rte_flow_item *items,
4467 const struct rte_flow_action *actions,
4468 struct rte_flow_error *error)
4469 {
4470 struct rte_flow *flow;
4471 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4472
4473 ovs_mutex_lock(&dev->mutex);
4474 flow = rte_flow_create(dev->port_id, attr, items, actions, error);
4475 ovs_mutex_unlock(&dev->mutex);
4476 return flow;
4477 }
4478
4479 #define NETDEV_DPDK_CLASS_COMMON \
4480 .is_pmd = true, \
4481 .alloc = netdev_dpdk_alloc, \
4482 .dealloc = netdev_dpdk_dealloc, \
4483 .get_config = netdev_dpdk_get_config, \
4484 .get_numa_id = netdev_dpdk_get_numa_id, \
4485 .set_etheraddr = netdev_dpdk_set_etheraddr, \
4486 .get_etheraddr = netdev_dpdk_get_etheraddr, \
4487 .get_mtu = netdev_dpdk_get_mtu, \
4488 .set_mtu = netdev_dpdk_set_mtu, \
4489 .get_ifindex = netdev_dpdk_get_ifindex, \
4490 .get_carrier_resets = netdev_dpdk_get_carrier_resets, \
4491 .set_miimon_interval = netdev_dpdk_set_miimon, \
4492 .set_policing = netdev_dpdk_set_policing, \
4493 .get_qos_types = netdev_dpdk_get_qos_types, \
4494 .get_qos = netdev_dpdk_get_qos, \
4495 .set_qos = netdev_dpdk_set_qos, \
4496 .update_flags = netdev_dpdk_update_flags, \
4497 .rxq_alloc = netdev_dpdk_rxq_alloc, \
4498 .rxq_construct = netdev_dpdk_rxq_construct, \
4499 .rxq_destruct = netdev_dpdk_rxq_destruct, \
4500 .rxq_dealloc = netdev_dpdk_rxq_dealloc
4501
4502 #define NETDEV_DPDK_CLASS_BASE \
4503 NETDEV_DPDK_CLASS_COMMON, \
4504 .init = netdev_dpdk_class_init, \
4505 .destruct = netdev_dpdk_destruct, \
4506 .set_tx_multiq = netdev_dpdk_set_tx_multiq, \
4507 .get_carrier = netdev_dpdk_get_carrier, \
4508 .get_stats = netdev_dpdk_get_stats, \
4509 .get_custom_stats = netdev_dpdk_get_custom_stats, \
4510 .get_features = netdev_dpdk_get_features, \
4511 .get_status = netdev_dpdk_get_status, \
4512 .reconfigure = netdev_dpdk_reconfigure, \
4513 .rxq_recv = netdev_dpdk_rxq_recv
4514
4515 static const struct netdev_class dpdk_class = {
4516 .type = "dpdk",
4517 NETDEV_DPDK_CLASS_BASE,
4518 .construct = netdev_dpdk_construct,
4519 .set_config = netdev_dpdk_set_config,
4520 .send = netdev_dpdk_eth_send,
4521 };
4522
4523 static const struct netdev_class dpdk_ring_class = {
4524 .type = "dpdkr",
4525 NETDEV_DPDK_CLASS_BASE,
4526 .construct = netdev_dpdk_ring_construct,
4527 .set_config = netdev_dpdk_ring_set_config,
4528 .send = netdev_dpdk_ring_send,
4529 };
4530
4531 static const struct netdev_class dpdk_vhost_class = {
4532 .type = "dpdkvhostuser",
4533 NETDEV_DPDK_CLASS_COMMON,
4534 .construct = netdev_dpdk_vhost_construct,
4535 .destruct = netdev_dpdk_vhost_destruct,
4536 .send = netdev_dpdk_vhost_send,
4537 .get_carrier = netdev_dpdk_vhost_get_carrier,
4538 .get_stats = netdev_dpdk_vhost_get_stats,
4539 .get_custom_stats = netdev_dpdk_get_sw_custom_stats,
4540 .get_status = netdev_dpdk_vhost_user_get_status,
4541 .reconfigure = netdev_dpdk_vhost_reconfigure,
4542 .rxq_recv = netdev_dpdk_vhost_rxq_recv,
4543 .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
4544 };
4545
4546 static const struct netdev_class dpdk_vhost_client_class = {
4547 .type = "dpdkvhostuserclient",
4548 NETDEV_DPDK_CLASS_COMMON,
4549 .construct = netdev_dpdk_vhost_client_construct,
4550 .destruct = netdev_dpdk_vhost_destruct,
4551 .set_config = netdev_dpdk_vhost_client_set_config,
4552 .send = netdev_dpdk_vhost_send,
4553 .get_carrier = netdev_dpdk_vhost_get_carrier,
4554 .get_stats = netdev_dpdk_vhost_get_stats,
4555 .get_custom_stats = netdev_dpdk_get_sw_custom_stats,
4556 .get_status = netdev_dpdk_vhost_user_get_status,
4557 .reconfigure = netdev_dpdk_vhost_client_reconfigure,
4558 .rxq_recv = netdev_dpdk_vhost_rxq_recv,
4559 .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
4560 };
4561
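/*
 * Each class above corresponds to an Interface "type" in the database.  An
 * illustrative way to add a physical DPDK port (bridge name, port name and
 * PCI address are placeholders):
 *
 *     ovs-vsctl add-port br0 dpdk-p0 -- set Interface dpdk-p0 type=dpdk \
 *         options:dpdk-devargs=0000:01:00.0
 */
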
4562 void
4563 netdev_dpdk_register(void)
4564 {
4565 netdev_register_provider(&dpdk_class);
4566 netdev_register_provider(&dpdk_ring_class);
4567 netdev_register_provider(&dpdk_vhost_class);
4568 netdev_register_provider(&dpdk_vhost_client_class);
4569 }