lib/netdev-dpdk.c (mirror_ovs.git) - commit: "userspace: Add TCP Segmentation Offload support"
1 /*
2 * Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "netdev-dpdk.h"
19
20 #include <errno.h>
21 #include <signal.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <unistd.h>
25 #include <linux/virtio_net.h>
26 #include <sys/socket.h>
27 #include <linux/if.h>
28
29 /* Include rte_compat.h first to allow experimental APIs needed for the
30 * rte_meter.h rfc4115 functions. Once they are no longer marked as
31 * experimental the #define and rte_compat.h include can be removed.
32 */
33 #define ALLOW_EXPERIMENTAL_API
34 #include <rte_compat.h>
35 #include <rte_bus_pci.h>
36 #include <rte_config.h>
37 #include <rte_cycles.h>
38 #include <rte_errno.h>
39 #include <rte_eth_ring.h>
40 #include <rte_ethdev.h>
41 #include <rte_flow.h>
42 #include <rte_malloc.h>
43 #include <rte_mbuf.h>
44 #include <rte_meter.h>
45 #include <rte_pci.h>
46 #include <rte_version.h>
47 #include <rte_vhost.h>
48
49 #include "cmap.h"
50 #include "coverage.h"
51 #include "dirs.h"
52 #include "dp-packet.h"
53 #include "dpdk.h"
54 #include "dpif-netdev.h"
55 #include "fatal-signal.h"
56 #include "if-notifier.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "odp-util.h"
60 #include "openvswitch/dynamic-string.h"
61 #include "openvswitch/list.h"
62 #include "openvswitch/match.h"
63 #include "openvswitch/ofp-print.h"
64 #include "openvswitch/shash.h"
65 #include "openvswitch/vlog.h"
66 #include "ovs-numa.h"
67 #include "ovs-rcu.h"
68 #include "ovs-thread.h"
69 #include "packets.h"
70 #include "smap.h"
71 #include "sset.h"
72 #include "timeval.h"
73 #include "unaligned.h"
74 #include "unixctl.h"
75 #include "userspace-tso.h"
76 #include "util.h"
77 #include "uuid.h"
78
79 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
80
81 VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
82 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
83
84 COVERAGE_DEFINE(vhost_tx_contention);
85 COVERAGE_DEFINE(vhost_notification);
86
87 #define DPDK_PORT_WATCHDOG_INTERVAL 5
88
89 #define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
90 #define OVS_VPORT_DPDK "ovs_dpdk"
91
92 /*
93 * We need to reserve extra space in the mbufs so that we can align the
94 * DMA addresses to 4KB.
95 * The minimum mbuf size is limited to avoid scatter behaviour and a drop in
96 * performance for the standard Ethernet MTU.
97 */
98 #define ETHER_HDR_MAX_LEN (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN \
99 + (2 * VLAN_HEADER_LEN))
100 #define MTU_TO_FRAME_LEN(mtu) ((mtu) + RTE_ETHER_HDR_LEN + \
101 RTE_ETHER_CRC_LEN)
102 #define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN)
103 #define FRAME_LEN_TO_MTU(frame_len) ((frame_len) \
104 - RTE_ETHER_HDR_LEN - RTE_ETHER_CRC_LEN)
105 #define NETDEV_DPDK_MBUF_ALIGN 1024
106 #define NETDEV_DPDK_MAX_PKT_LEN 9728
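/* As an illustration (assuming the standard values RTE_ETHER_HDR_LEN = 14,
 * RTE_ETHER_CRC_LEN = 4 and VLAN_HEADER_LEN = 4), the macros above work out
 * as follows:
 *
 *   ETHER_HDR_MAX_LEN          = 14 + 4 + 2 * 4 = 26
 *   MTU_TO_FRAME_LEN(1500)     = 1500 + 14 + 4  = 1518
 *   MTU_TO_MAX_FRAME_LEN(1500) = 1500 + 26      = 1526
 *   FRAME_LEN_TO_MTU(9728)     = 9728 - 14 - 4  = 9710
 *
 * so NETDEV_DPDK_MAX_PKT_LEN (9728) corresponds to a maximum MTU of 9710.
 */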
107
108 /* Max and min number of packets in the mempool. OVS tries to allocate a
109 * mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
110 * enough hugepages) we keep halving the number until the allocation succeeds
111 * or we reach MIN_NB_MBUF */
112
113 #define MAX_NB_MBUF (4096 * 64)
114 #define MIN_NB_MBUF (4096 * 4)
115 #define MP_CACHE_SZ RTE_MEMPOOL_CACHE_MAX_SIZE
116
117 /* MAX_NB_MBUF can be halved repeatedly until it reaches MIN_NB_MBUF. */
118 BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF)
119 == 0);
120
121 /* The smallest possible NB_MBUF that we're going to try should be a multiple
122 * of MP_CACHE_SZ. This is advised by DPDK documentation. */
123 BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF))
124 % MP_CACHE_SZ == 0);
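/* For illustration: with the values above, MAX_NB_MBUF = 4096 * 64 = 262144
 * and MIN_NB_MBUF = 4096 * 4 = 16384, so the allocation attempts are
 * 262144 -> 131072 -> 65536 -> 32768 -> 16384.  The asserts hold because
 * ROUND_DOWN_POW2(262144 / 16384) = 16 divides 262144 evenly, and
 * 262144 / 16 = 16384 is a multiple of MP_CACHE_SZ (assuming
 * RTE_MEMPOOL_CACHE_MAX_SIZE is 512, as in current DPDK releases).
 */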
125
126 #define SOCKET0 0
127
128 /* Default size of Physical NIC RXQ */
129 #define NIC_PORT_DEFAULT_RXQ_SIZE 2048
130 /* Default size of Physical NIC TXQ */
131 #define NIC_PORT_DEFAULT_TXQ_SIZE 2048
132 /* Maximum size of Physical NIC Queues */
133 #define NIC_PORT_MAX_Q_SIZE 4096
134
135 #define OVS_VHOST_MAX_QUEUE_NUM 1024 /* Maximum number of vHost TX queues. */
136 #define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
137 #define OVS_VHOST_QUEUE_DISABLED (-2) /* Queue was disabled by guest and not
138 * yet mapped to another queue. */
139
140 #define DPDK_ETH_PORT_ID_INVALID RTE_MAX_ETHPORTS
141
142 /* DPDK library uses uint16_t for port_id. */
143 typedef uint16_t dpdk_port_t;
144 #define DPDK_PORT_ID_FMT "%"PRIu16
145
146 /* Minimum amount of vhost tx retries, effectively a disable. */
147 #define VHOST_ENQ_RETRY_MIN 0
148 /* Maximum amount of vhost tx retries. */
149 #define VHOST_ENQ_RETRY_MAX 32
150 /* Legacy default value for vhost tx retries. */
151 #define VHOST_ENQ_RETRY_DEF 8
152
153 #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
154
155 static const struct rte_eth_conf port_conf = {
156 .rxmode = {
157 .mq_mode = ETH_MQ_RX_RSS,
158 .split_hdr_size = 0,
159 .offloads = 0,
160 },
161 .rx_adv_conf = {
162 .rss_conf = {
163 .rss_key = NULL,
164 .rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
165 },
166 },
167 .txmode = {
168 .mq_mode = ETH_MQ_TX_NONE,
169 },
170 };
171
172 /*
173 * These callbacks allow virtio-net devices to be added to vhost ports when
174 * configuration has been fully completed.
175 */
176 static int new_device(int vid);
177 static void destroy_device(int vid);
178 static int vring_state_changed(int vid, uint16_t queue_id, int enable);
179 static void destroy_connection(int vid);
180 static void vhost_guest_notified(int vid);
181
182 static const struct vhost_device_ops virtio_net_device_ops =
183 {
184 .new_device = new_device,
185 .destroy_device = destroy_device,
186 .vring_state_changed = vring_state_changed,
187 .features_changed = NULL,
188 .new_connection = NULL,
189 .destroy_connection = destroy_connection,
190 .guest_notified = vhost_guest_notified,
191 };
192
193 /* Custom software stats for dpdk ports */
194 struct netdev_dpdk_sw_stats {
195 /* No. of retries when unable to transmit. */
196 uint64_t tx_retries;
197 /* Packet drops when unable to transmit; probably the Tx queue is full. */
198 uint64_t tx_failure_drops;
199 /* Packet length greater than device MTU. */
200 uint64_t tx_mtu_exceeded_drops;
201 /* Packet drops in egress policer processing. */
202 uint64_t tx_qos_drops;
203 /* Packet drops in ingress policer processing. */
204 uint64_t rx_qos_drops;
205 /* Packet drops in HWOL processing. */
206 uint64_t tx_invalid_hwol_drops;
207 };
208
209 enum { DPDK_RING_SIZE = 256 };
210 BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
211 enum { DRAIN_TSC = 200000ULL };
212
213 enum dpdk_dev_type {
214 DPDK_DEV_ETH = 0,
215 DPDK_DEV_VHOST = 1,
216 };
217
218 /* Quality of Service */
219
220 /* An instance of a QoS configuration. Always associated with a particular
221 * network device.
222 *
223 * Each QoS implementation subclasses this with whatever additional data it
224 * needs.
225 */
226 struct qos_conf {
227 const struct dpdk_qos_ops *ops;
228 rte_spinlock_t lock;
229 };
230
231 /* QoS queue information used by the netdev queue dump functions. */
232 struct netdev_dpdk_queue_state {
233 uint32_t *queues;
234 size_t cur_queue;
235 size_t n_queues;
236 };
237
238 /* A particular implementation of dpdk QoS operations.
239 *
240 * The functions below return 0 if successful or a positive errno value on
241 * failure, except where otherwise noted. All of them must be provided, except
242 * where otherwise noted.
243 */
244 struct dpdk_qos_ops {
245
246 /* Name of the QoS type */
247 const char *qos_name;
248
249 /* Called to construct a qos_conf object. The implementation should make
250 * the appropriate calls to configure QoS according to 'details'.
251 *
252 * The contents of 'details' should be documented as valid for 'ovs_name'
253 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
254 * (which is built as ovs-vswitchd.conf.db(8)).
255 *
256 * This function must return 0 if and only if it sets '*conf' to an
257 * initialized 'struct qos_conf'.
258 *
259 * For all QoS implementations it should always be non-null.
260 */
261 int (*qos_construct)(const struct smap *details, struct qos_conf **conf);
262
263 /* Destroys the data structures allocated by the implementation as part of
264 * 'qos_conf'.
265 *
266 * For all QoS implementations it should always be non-null.
267 */
268 void (*qos_destruct)(struct qos_conf *conf);
269
270 /* Retrieves details of 'conf' configuration into 'details'.
271 *
272 * The contents of 'details' should be documented as valid for 'ovs_name'
273 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
274 * (which is built as ovs-vswitchd.conf.db(8)).
275 */
276 int (*qos_get)(const struct qos_conf *conf, struct smap *details);
277
278 /* Returns true if 'conf' is already configured according to 'details'.
279 *
280 * The contents of 'details' should be documented as valid for 'ovs_name'
281 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
282 * (which is built as ovs-vswitchd.conf.db(8)).
283 *
284 * For all QoS implementations it should always be non-null.
285 */
286 bool (*qos_is_equal)(const struct qos_conf *conf,
287 const struct smap *details);
288
289 /* Modify an array of rte_mbufs. The modification is specific to
290 * each qos implementation.
291 *
292 * The function should take an array of mbufs and an int representing
293 * the current number of mbufs present in the array.
294 *
295 * After the function has performed a qos modification to the array of
296 * mbufs it returns an int representing the number of mbufs now present in
297 * the array. This value can then be passed to the port send function
298 * along with the modified array for transmission.
299 *
300 * For all QoS implementations it should always be non-null.
301 */
302 int (*qos_run)(struct qos_conf *qos_conf, struct rte_mbuf **pkts,
303 int pkt_cnt, bool should_steal);
304
305 /* Called to construct a QoS Queue. The implementation should make
306 * the appropriate calls to configure QoS Queue according to 'details'.
307 *
308 * The contents of 'details' should be documented as valid for 'ovs_name'
309 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
310 * (which is built as ovs-vswitchd.conf.db(8)).
311 *
312 * This function must return 0 if and only if it constructs
313 * QoS queue successfully.
314 */
315 int (*qos_queue_construct)(const struct smap *details,
316 uint32_t queue_id, struct qos_conf *conf);
317
318 /* Destroys the QoS Queue. */
319 void (*qos_queue_destruct)(struct qos_conf *conf, uint32_t queue_id);
320
321 /* Retrieves details of QoS Queue configuration into 'details'.
322 *
323 * The contents of 'details' should be documented as valid for 'ovs_name'
324 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
325 * (which is built as ovs-vswitchd.conf.db(8)).
326 */
327 int (*qos_queue_get)(struct smap *details, uint32_t queue_id,
328 const struct qos_conf *conf);
329
330 /* Retrieves statistics of QoS Queue configuration into 'stats'. */
331 int (*qos_queue_get_stats)(const struct qos_conf *conf, uint32_t queue_id,
332 struct netdev_queue_stats *stats);
333
334 /* Set up the 'netdev_dpdk_queue_state' structure used by the dpdk queue
335 * dump functions.
336 */
337 int (*qos_queue_dump_state_init)(const struct qos_conf *conf,
338 struct netdev_dpdk_queue_state *state);
339 };
340
341 /* dpdk_qos_ops for each type of user space QoS implementation. */
342 static const struct dpdk_qos_ops egress_policer_ops;
343 static const struct dpdk_qos_ops trtcm_policer_ops;
344
345 /*
346 * Array of dpdk_qos_ops, contains pointer to all supported QoS
347 * operations.
348 */
349 static const struct dpdk_qos_ops *const qos_confs[] = {
350 &egress_policer_ops,
351 &trtcm_policer_ops,
352 NULL
353 };
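/* Illustrative sketch of how a new QoS implementation would plug into the
 * interface above; the "example_*" names are hypothetical, and a real
 * implementation would also need an entry in the 'qos_confs' array above:
 *
 *   static const struct dpdk_qos_ops example_policer_ops = {
 *       .qos_name = "example-policer",
 *       .qos_construct = example_qos_construct,
 *       .qos_destruct = example_qos_destruct,
 *       .qos_get = example_qos_get,
 *       .qos_is_equal = example_qos_is_equal,
 *       .qos_run = example_qos_run,
 *   };
 */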
354
355 static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;
356
357 /* Contains all 'struct dpdk_dev's. */
358 static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
359 = OVS_LIST_INITIALIZER(&dpdk_list);
360
361 static struct ovs_mutex dpdk_mp_mutex OVS_ACQ_AFTER(dpdk_mutex)
362 = OVS_MUTEX_INITIALIZER;
363
364 /* Contains all 'struct dpdk_mp's. */
365 static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mp_mutex)
366 = OVS_LIST_INITIALIZER(&dpdk_mp_list);
367
368 struct dpdk_mp {
369 struct rte_mempool *mp;
370 int mtu;
371 int socket_id;
372 int refcount;
373 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex);
374 };
375
376 /* There should be one 'struct dpdk_tx_queue' created for
377 * each netdev tx queue. */
378 struct dpdk_tx_queue {
379 /* Padding to make dpdk_tx_queue exactly one cache line long. */
380 PADDED_MEMBERS(CACHE_LINE_SIZE,
381 /* Protects the members and the NIC queue from concurrent access.
382 * It is used only if the queue is shared among different pmd threads
383 * (see 'concurrent_txq'). */
384 rte_spinlock_t tx_lock;
385 /* Mapping of a configured vhost-user queue to the queue enabled by the guest. */
386 int map;
387 );
388 };
389
390 /* DPDK has no way to remove dpdk ring Ethernet devices,
391 * so we have to keep them around once they've been created.
392 */
393
394 static struct ovs_list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
395 = OVS_LIST_INITIALIZER(&dpdk_ring_list);
396
397 struct dpdk_ring {
398 /* For the client rings */
399 struct rte_ring *cring_tx;
400 struct rte_ring *cring_rx;
401 unsigned int user_port_id; /* User given port no, parsed from port name */
402 dpdk_port_t eth_port_id; /* ethernet device port id */
403 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
404 };
405
406 struct ingress_policer {
407 struct rte_meter_srtcm_params app_srtcm_params;
408 struct rte_meter_srtcm in_policer;
409 struct rte_meter_srtcm_profile in_prof;
410 rte_spinlock_t policer_lock;
411 };
412
413 enum dpdk_hw_ol_features {
414 NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
415 NETDEV_RX_HW_CRC_STRIP = 1 << 1,
416 NETDEV_RX_HW_SCATTER = 1 << 2,
417 NETDEV_TX_TSO_OFFLOAD = 1 << 3,
418 };
419
420 /*
421 * To avoid confusion in variable names, the following naming convention
422 * should be used, if possible:
423 *
424 * 'struct netdev' : 'netdev'
425 * 'struct netdev_dpdk' : 'dev'
426 * 'struct netdev_rxq' : 'rxq'
427 * 'struct netdev_rxq_dpdk' : 'rx'
428 *
429 * Example:
430 * struct netdev *netdev = netdev_from_name(name);
431 * struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
432 *
433 * Also, 'netdev' should be used instead of 'dev->up', where 'netdev' was
434 * already defined.
435 */
436
437 struct netdev_dpdk {
438 PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
439 dpdk_port_t port_id;
440
441 /* If true, device was attached by rte_eth_dev_attach(). */
442 bool attached;
443 /* If true, rte_eth_dev_start() was successfully called */
444 bool started;
445 bool reset_needed;
446 /* 1 pad byte here. */
447 struct eth_addr hwaddr;
448 int mtu;
449 int socket_id;
450 int buf_size;
451 int max_packet_len;
452 enum dpdk_dev_type type;
453 enum netdev_flags flags;
454 int link_reset_cnt;
455 union {
456 /* Device arguments for dpdk ports. */
457 char *devargs;
458 /* Identifier used to distinguish vhost devices from each other. */
459 char *vhost_id;
460 };
461 struct dpdk_tx_queue *tx_q;
462 struct rte_eth_link link;
463 );
464
465 PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
466 struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
467 struct dpdk_mp *dpdk_mp;
468
469 /* virtio identifier for vhost devices */
470 ovsrcu_index vid;
471
472 /* True if vHost device is 'up' and has been reconfigured at least once */
473 bool vhost_reconfigured;
474
475 atomic_uint8_t vhost_tx_retries_max;
476 /* 2 pad bytes here. */
477 );
478
479 PADDED_MEMBERS(CACHE_LINE_SIZE,
480 struct netdev up;
481 /* In dpdk_list. */
482 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
483
484 /* QoS configuration and lock for the device */
485 OVSRCU_TYPE(struct qos_conf *) qos_conf;
486
487 /* Ingress Policer */
488 OVSRCU_TYPE(struct ingress_policer *) ingress_policer;
489 uint32_t policer_rate;
490 uint32_t policer_burst;
491
492 /* Array of vhost rxq states, see vring_state_changed. */
493 bool *vhost_rxq_enabled;
494 );
495
496 PADDED_MEMBERS(CACHE_LINE_SIZE,
497 struct netdev_stats stats;
498 struct netdev_dpdk_sw_stats *sw_stats;
499 /* Protects stats */
500 rte_spinlock_t stats_lock;
501 /* 36 pad bytes here. */
502 );
503
504 PADDED_MEMBERS(CACHE_LINE_SIZE,
505 /* The following properties cannot be changed when a device is running,
506 * so we remember the request and update them next time
507 * netdev_dpdk*_reconfigure() is called */
508 int requested_mtu;
509 int requested_n_txq;
510 int requested_n_rxq;
511 int requested_rxq_size;
512 int requested_txq_size;
513
514 /* Number of rx/tx descriptors for physical devices */
515 int rxq_size;
516 int txq_size;
517
518 /* Socket ID detected when vHost device is brought up */
519 int requested_socket_id;
520
521 /* Denotes whether the vHost port is in client or server mode. */
522 uint64_t vhost_driver_flags;
523
524 /* DPDK-ETH Flow control */
525 struct rte_eth_fc_conf fc_conf;
526
527 /* DPDK-ETH hardware offload features,
528 * from the enum set 'dpdk_hw_ol_features' */
529 uint32_t hw_ol_features;
530
531 /* Properties for link state change detection mode.
532 * If lsc_interrupt_mode is set to false, poll mode is used,
533 * otherwise interrupt mode is used. */
534 bool requested_lsc_interrupt_mode;
535 bool lsc_interrupt_mode;
536 );
537
538 PADDED_MEMBERS(CACHE_LINE_SIZE,
539 /* Names of all XSTATS counters */
540 struct rte_eth_xstat_name *rte_xstats_names;
541 int rte_xstats_names_size;
542 int rte_xstats_ids_size;
543 uint64_t *rte_xstats_ids;
544 );
545 };
546
547 struct netdev_rxq_dpdk {
548 struct netdev_rxq up;
549 dpdk_port_t port_id;
550 };
551
552 static void netdev_dpdk_destruct(struct netdev *netdev);
553 static void netdev_dpdk_vhost_destruct(struct netdev *netdev);
554
555 static int netdev_dpdk_get_sw_custom_stats(const struct netdev *,
556 struct netdev_custom_stats *);
557 static void netdev_dpdk_clear_xstats(struct netdev_dpdk *dev);
558
559 int netdev_dpdk_get_vid(const struct netdev_dpdk *dev);
560
561 struct ingress_policer *
562 netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);
563
564 static bool
565 is_dpdk_class(const struct netdev_class *class)
566 {
567 return class->destruct == netdev_dpdk_destruct
568 || class->destruct == netdev_dpdk_vhost_destruct;
569 }
570
571 /* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
572 * aligned at 1k or less. If a declared mbuf size is not a multiple of this
573 * value, insufficient buffers are allocated to accommodate the packet in its
574 * entirety. Furthermore, certain drivers need to ensure that there is also
575 * sufficient space in the Rx buffer to accommodate two VLAN tags (for QinQ
576 * frames). If the RX buffer is too small, then the driver enables scatter RX
577 * behaviour, which reduces performance. To prevent this, use a buffer size
578 * that is closest to 'mtu', but which satisfies the aforementioned criteria.
579 */
580 static uint32_t
581 dpdk_buf_size(int mtu)
582 {
583 return ROUND_UP(MTU_TO_MAX_FRAME_LEN(mtu), NETDEV_DPDK_MBUF_ALIGN)
584 + RTE_PKTMBUF_HEADROOM;
585 }
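/* Worked examples for dpdk_buf_size(), assuming the default
 * RTE_PKTMBUF_HEADROOM of 128 bytes:
 *
 *   mtu = 1500: ROUND_UP(1526, 1024) + 128 = 2048 + 128 = 2176
 *   mtu = 9000: ROUND_UP(9026, 1024) + 128 = 9216 + 128 = 9344
 *
 * i.e. the data room is padded up to the next NETDEV_DPDK_MBUF_ALIGN (1024)
 * boundary before the headroom is added.
 */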
586
587 /* Allocates an area of 'sz' bytes from DPDK. The memory is zeroed.
588 *
589 * Unlike xmalloc(), this function can return NULL on failure. */
590 static void *
591 dpdk_rte_mzalloc(size_t sz)
592 {
593 return rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);
594 }
595
596 void
597 free_dpdk_buf(struct dp_packet *p)
598 {
599 struct rte_mbuf *pkt = (struct rte_mbuf *) p;
600
601 rte_pktmbuf_free(pkt);
602 }
603
604 static void
605 ovs_rte_pktmbuf_init(struct rte_mempool *mp OVS_UNUSED,
606 void *opaque_arg OVS_UNUSED,
607 void *_p,
608 unsigned i OVS_UNUSED)
609 {
610 struct rte_mbuf *pkt = _p;
611
612 dp_packet_init_dpdk((struct dp_packet *) pkt);
613 }
614
615 static int
616 dpdk_mp_full(const struct rte_mempool *mp) OVS_REQUIRES(dpdk_mp_mutex)
617 {
618 /* At this point we want to know if all the mbufs are back
619 * in the mempool. rte_mempool_full() is not atomic but it's
620 * the best available and as we are no longer requesting mbufs
621 * from the mempool, it means mbufs will not move from
622 * 'mempool ring' --> 'mempool cache'. In rte_mempool_full()
623 * the ring is counted before caches, so we won't get false
624 * positives in this use case and we handle false negatives.
625 *
626 * If a future implementation of rte_mempool_full() were to change,
627 * a false positive could become possible. Even that would
628 * likely be OK, as there are additional checks during mempool
629 * freeing, but it would make things racy.
630 */
631 return rte_mempool_full(mp);
632 }
633
634 /* Free unused mempools. */
635 static void
636 dpdk_mp_sweep(void) OVS_REQUIRES(dpdk_mp_mutex)
637 {
638 struct dpdk_mp *dmp, *next;
639
640 LIST_FOR_EACH_SAFE (dmp, next, list_node, &dpdk_mp_list) {
641 if (!dmp->refcount && dpdk_mp_full(dmp->mp)) {
642 VLOG_DBG("Freeing mempool \"%s\"", dmp->mp->name);
643 ovs_list_remove(&dmp->list_node);
644 rte_mempool_free(dmp->mp);
645 rte_free(dmp);
646 }
647 }
648 }
649
650 /* Calculating the required number of mbufs differs depending on the
651 * mempool model being used. Check if per port memory is in use before
652 * calculating.
653 */
654 static uint32_t
655 dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
656 {
657 uint32_t n_mbufs;
658
659 if (!per_port_mp) {
660 /* Shared memory is being used.
661 * XXX: this is a really rough method of provisioning memory.
662 * It's impossible to determine what the exact memory requirements are
663 * when the number of ports and rxqs that utilize a particular mempool
664 * can change dynamically at runtime. For now, use this rough
665 * heuristic.
666 */
667 if (mtu >= RTE_ETHER_MTU) {
668 n_mbufs = MAX_NB_MBUF;
669 } else {
670 n_mbufs = MIN_NB_MBUF;
671 }
672 } else {
673 /* Per port memory is being used.
674 * XXX: rough estimation of number of mbufs required for this port:
675 * <packets required to fill the device rxqs>
676 * + <packets that could be stuck on other ports txqs>
677 * + <packets in the pmd threads>
678 * + <additional memory for corner cases>
679 */
680 n_mbufs = dev->requested_n_rxq * dev->requested_rxq_size
681 + dev->requested_n_txq * dev->requested_txq_size
682 + MIN(RTE_MAX_LCORE, dev->requested_n_rxq) * NETDEV_MAX_BURST
683 + MIN_NB_MBUF;
684 }
685
686 return n_mbufs;
687 }
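/* A worked example of the per-port estimate above, for a hypothetical port
 * with 2 rxqs and 2 txqs of 2048 descriptors each, assuming NETDEV_MAX_BURST
 * is 32 and RTE_MAX_LCORE >= 2:
 *
 *   2 * 2048 + 2 * 2048 + 2 * 32 + 16384 (MIN_NB_MBUF) = 24640 mbufs
 */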
688
689 static struct dpdk_mp *
690 dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
691 {
692 char mp_name[RTE_MEMPOOL_NAMESIZE];
693 const char *netdev_name = netdev_get_name(&dev->up);
694 int socket_id = dev->requested_socket_id;
695 uint32_t n_mbufs = 0;
696 uint32_t mbuf_size = 0;
697 uint32_t aligned_mbuf_size = 0;
698 uint32_t mbuf_priv_data_len = 0;
699 uint32_t pkt_size = 0;
700 uint32_t hash = hash_string(netdev_name, 0);
701 struct dpdk_mp *dmp = NULL;
702 int ret;
703
704 dmp = dpdk_rte_mzalloc(sizeof *dmp);
705 if (!dmp) {
706 return NULL;
707 }
708 dmp->socket_id = socket_id;
709 dmp->mtu = mtu;
710 dmp->refcount = 1;
711
712 /* Get the size of each mbuf, based on the MTU */
713 mbuf_size = MTU_TO_FRAME_LEN(mtu);
714
715 n_mbufs = dpdk_calculate_mbufs(dev, mtu, per_port_mp);
716
717 do {
718 /* Full DPDK memory pool name must be unique and cannot be
719 * longer than RTE_MEMPOOL_NAMESIZE. Note that for the shared
720 * mempool case this can result in one device using a mempool
721 * which references a different device in its name. However as
722 * mempool names are hashed, the device name will not be readable
723 * so this is not an issue for tasks such as debugging.
724 */
725 ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE,
726 "ovs%08x%02d%05d%07u",
727 hash, socket_id, mtu, n_mbufs);
728 if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) {
729 VLOG_DBG("snprintf returned %d. "
730 "Failed to generate a mempool name for \"%s\". "
731 "Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.",
732 ret, netdev_name, hash, socket_id, mtu, n_mbufs);
733 break;
734 }
735
736 VLOG_DBG("Port %s: Requesting a mempool of %u mbufs of size %u "
737 "on socket %d for %d Rx and %d Tx queues, "
738 "cache line size of %u",
739 netdev_name, n_mbufs, mbuf_size, socket_id,
740 dev->requested_n_rxq, dev->requested_n_txq,
741 RTE_CACHE_LINE_SIZE);
742
743 /* The size of the mbuf's private area (i.e. the area that holds OVS'
744 * dp_packet data). */
745 mbuf_priv_data_len = sizeof(struct dp_packet) -
746 sizeof(struct rte_mbuf);
747 /* The size of the entire dp_packet. */
748 pkt_size = sizeof(struct dp_packet) + mbuf_size;
749 /* mbuf size, rounded up to cacheline size. */
750 aligned_mbuf_size = ROUND_UP(pkt_size, RTE_CACHE_LINE_SIZE);
751 /* If there is a size discrepancy, add padding to mbuf_priv_data_len.
752 * This maintains mbuf size cache alignment, while also honoring RX
753 * buffer alignment in the data portion of the mbuf. If this adjustment
754 * is not made, there is a possibility later on that for an element of
755 * the mempool, buf, buf->data_len < (buf->buf_len - buf->data_off).
756 * This is problematic in the case of multi-segment mbufs, particularly
757 * when an mbuf segment needs to be resized (when pushing or popping
758 * a VLAN header, for example).
759 */
760 mbuf_priv_data_len += (aligned_mbuf_size - pkt_size);
761
762 dmp->mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ,
763 mbuf_priv_data_len,
764 mbuf_size,
765 socket_id);
766
767 if (dmp->mp) {
768 VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
769 mp_name, n_mbufs);
770 /* rte_pktmbuf_pool_create has done some initialization of the
771 * rte_mbuf part of each dp_packet, while ovs_rte_pktmbuf_init
772 * initializes some OVS specific fields of dp_packet.
773 */
774 rte_mempool_obj_iter(dmp->mp, ovs_rte_pktmbuf_init, NULL);
775 return dmp;
776 } else if (rte_errno == EEXIST) {
777 /* A mempool with the same name already exists. We just
778 * retrieve its pointer to be returned to the caller. */
779 dmp->mp = rte_mempool_lookup(mp_name);
780 /* As the mempool create returned EEXIST we can expect the
781 * lookup has returned a valid pointer. If for some reason
782 * that's not the case we keep track of it. */
783 VLOG_DBG("A mempool with name \"%s\" already exists at %p.",
784 mp_name, dmp->mp);
785 return dmp;
786 } else {
787 VLOG_DBG("Failed to create mempool \"%s\" with a request of "
788 "%u mbufs, retrying with %u mbufs",
789 mp_name, n_mbufs, n_mbufs / 2);
790 }
791 } while (!dmp->mp && rte_errno == ENOMEM && (n_mbufs /= 2) >= MIN_NB_MBUF);
792
793 VLOG_ERR("Failed to create mempool \"%s\" with a request of %u mbufs",
794 mp_name, n_mbufs);
795
796 rte_free(dmp);
797 return NULL;
798 }
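/* With the "ovs%08x%02d%05d%07u" format used above, a hypothetical port whose
 * name hashes to 0x1a2b3c4d, on socket 0, with an MTU of 1500 and 262144
 * mbufs would get the mempool name "ovs1a2b3c4d00015000262144".  The
 * fixed-width fields keep the generated name deterministic and within
 * RTE_MEMPOOL_NAMESIZE.
 */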
799
800 static struct dpdk_mp *
801 dpdk_mp_get(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
802 {
803 struct dpdk_mp *dmp, *next;
804 bool reuse = false;
805
806 ovs_mutex_lock(&dpdk_mp_mutex);
807 /* Check if shared memory is being used, if so check existing mempools
808 * to see if reuse is possible. */
809 if (!per_port_mp) {
810 LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
811 if (dmp->socket_id == dev->requested_socket_id
812 && dmp->mtu == mtu) {
813 VLOG_DBG("Reusing mempool \"%s\"", dmp->mp->name);
814 dmp->refcount++;
815 reuse = true;
816 break;
817 }
818 }
819 }
820 /* Sweep mempools after reuse or before create. */
821 dpdk_mp_sweep();
822
823 if (!reuse) {
824 dmp = dpdk_mp_create(dev, mtu, per_port_mp);
825 if (dmp) {
826 /* Shared memory will hit the reuse case above, so it will not
827 * request a mempool that already exists, but we need to check
828 * for the EEXIST case when per-port memory is used. Compare the
829 * mempool returned by dmp to each entry in dpdk_mp_list. If a
830 * match is found, free dmp as a new entry is not required, set
831 * dmp to point to the existing entry and increment the refcount
832 * to avoid being freed at a later stage.
833 */
834 if (per_port_mp && rte_errno == EEXIST) {
835 LIST_FOR_EACH (next, list_node, &dpdk_mp_list) {
836 if (dmp->mp == next->mp) {
837 rte_free(dmp);
838 dmp = next;
839 dmp->refcount++;
840 }
841 }
842 } else {
843 ovs_list_push_back(&dpdk_mp_list, &dmp->list_node);
844 }
845 }
846 }
847
848 ovs_mutex_unlock(&dpdk_mp_mutex);
849
850 return dmp;
851 }
852
853 /* Decrement reference to a mempool. */
854 static void
855 dpdk_mp_put(struct dpdk_mp *dmp)
856 {
857 if (!dmp) {
858 return;
859 }
860
861 ovs_mutex_lock(&dpdk_mp_mutex);
862 ovs_assert(dmp->refcount);
863 dmp->refcount--;
864 ovs_mutex_unlock(&dpdk_mp_mutex);
865 }
866
867 /* Depending on the memory model being used this function tries to
868 * identify and reuse an existing mempool or tries to allocate a new
869 * mempool on requested_socket_id with mbuf size corresponding to the
870 * requested_mtu. On success, a new configuration will be applied.
871 * On error, device will be left unchanged. */
872 static int
873 netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
874 OVS_REQUIRES(dev->mutex)
875 {
876 uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
877 struct dpdk_mp *dmp;
878 int ret = 0;
879 bool per_port_mp = dpdk_per_port_memory();
880
881 /* With shared memory we do not need to configure a mempool if the MTU
882 * and socket ID have not changed; the previous configuration is still
883 * valid, so return 0. */
884 if (!per_port_mp && dev->mtu == dev->requested_mtu
885 && dev->socket_id == dev->requested_socket_id) {
886 return ret;
887 }
888
889 dmp = dpdk_mp_get(dev, FRAME_LEN_TO_MTU(buf_size), per_port_mp);
890 if (!dmp) {
891 VLOG_ERR("Failed to create memory pool for netdev "
892 "%s, with MTU %d on socket %d: %s\n",
893 dev->up.name, dev->requested_mtu, dev->requested_socket_id,
894 rte_strerror(rte_errno));
895 ret = rte_errno;
896 } else {
897 /* Check for any pre-existing dpdk_mp for the device before accessing
898 * the associated mempool.
899 */
900 if (dev->dpdk_mp != NULL) {
901 /* A new MTU was requested, decrement the reference count for the
902 * devices current dpdk_mp. This is required even if a pointer to
903 * same dpdk_mp is returned by dpdk_mp_get. The refcount for dmp
904 * has already been incremented by dpdk_mp_get at this stage so it
905 * must be decremented to keep an accurate refcount for the
906 * dpdk_mp.
907 */
908 dpdk_mp_put(dev->dpdk_mp);
909 }
910 dev->dpdk_mp = dmp;
911 dev->mtu = dev->requested_mtu;
912 dev->socket_id = dev->requested_socket_id;
913 dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
914 }
915
916 return ret;
917 }
918
919 static void
920 check_link_status(struct netdev_dpdk *dev)
921 {
922 struct rte_eth_link link;
923
924 rte_eth_link_get_nowait(dev->port_id, &link);
925
926 if (dev->link.link_status != link.link_status) {
927 netdev_change_seq_changed(&dev->up);
928
929 dev->link_reset_cnt++;
930 dev->link = link;
931 if (dev->link.link_status) {
932 VLOG_DBG_RL(&rl,
933 "Port "DPDK_PORT_ID_FMT" Link Up - speed %u Mbps - %s",
934 dev->port_id, (unsigned) dev->link.link_speed,
935 (dev->link.link_duplex == ETH_LINK_FULL_DUPLEX)
936 ? "full-duplex" : "half-duplex");
937 } else {
938 VLOG_DBG_RL(&rl, "Port "DPDK_PORT_ID_FMT" Link Down",
939 dev->port_id);
940 }
941 }
942 }
943
944 static void *
945 dpdk_watchdog(void *dummy OVS_UNUSED)
946 {
947 struct netdev_dpdk *dev;
948
949 pthread_detach(pthread_self());
950
951 for (;;) {
952 ovs_mutex_lock(&dpdk_mutex);
953 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
954 ovs_mutex_lock(&dev->mutex);
955 if (dev->type == DPDK_DEV_ETH) {
956 check_link_status(dev);
957 }
958 ovs_mutex_unlock(&dev->mutex);
959 }
960 ovs_mutex_unlock(&dpdk_mutex);
961 xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
962 }
963
964 return NULL;
965 }
966
967 static int
968 dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
969 {
970 int diag = 0;
971 int i;
972 struct rte_eth_conf conf = port_conf;
973 struct rte_eth_dev_info info;
974 uint16_t conf_mtu;
975
976 rte_eth_dev_info_get(dev->port_id, &info);
977
978 /* As of DPDK 17.11.1 a few PMDs require explicitly enabling
979 * scatter to support jumbo RX.
980 * Scatter is enabled for the device only after checking for
981 * scatter support in the device capabilities. */
982 if (dev->mtu > RTE_ETHER_MTU) {
983 if (dev->hw_ol_features & NETDEV_RX_HW_SCATTER) {
984 conf.rxmode.offloads |= DEV_RX_OFFLOAD_SCATTER;
985 }
986 }
987
988 conf.intr_conf.lsc = dev->lsc_interrupt_mode;
989
990 if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
991 conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
992 }
993
994 if (!(dev->hw_ol_features & NETDEV_RX_HW_CRC_STRIP)
995 && info.rx_offload_capa & DEV_RX_OFFLOAD_KEEP_CRC) {
996 conf.rxmode.offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
997 }
998
999 if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
1000 conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
1001 conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM;
1002 conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
1003 }
1004
1005 /* Limit configured rss hash functions to only those supported
1006 * by the eth device. */
1007 conf.rx_adv_conf.rss_conf.rss_hf &= info.flow_type_rss_offloads;
1008
1009 /* A device may report more queues than it makes available (this has
1010 * been observed for Intel xl710, which reserves some of them for
1011 * SRIOV): rte_eth_*_queue_setup will fail if a queue is not
1012 * available. When this happens we can retry the configuration
1013 * and request fewer queues. */
1014 while (n_rxq && n_txq) {
1015 if (diag) {
1016 VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
1017 }
1018
1019 diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
1020 if (diag) {
1021 VLOG_WARN("Interface %s eth_dev setup error %s\n",
1022 dev->up.name, rte_strerror(-diag));
1023 break;
1024 }
1025
1026 diag = rte_eth_dev_set_mtu(dev->port_id, dev->mtu);
1027 if (diag) {
1028 /* A device may not support rte_eth_dev_set_mtu; in this case
1029 * log a warning for the user and include the device's configured
1030 * MTU value that will be used instead. */
1031 if (-ENOTSUP == diag) {
1032 rte_eth_dev_get_mtu(dev->port_id, &conf_mtu);
1033 VLOG_WARN("Interface %s does not support MTU configuration, "
1034 "max packet size supported is %"PRIu16".",
1035 dev->up.name, conf_mtu);
1036 } else {
1037 VLOG_ERR("Interface %s MTU (%d) setup error: %s",
1038 dev->up.name, dev->mtu, rte_strerror(-diag));
1039 break;
1040 }
1041 }
1042
1043 for (i = 0; i < n_txq; i++) {
1044 diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
1045 dev->socket_id, NULL);
1046 if (diag) {
1047 VLOG_INFO("Interface %s unable to setup txq(%d): %s",
1048 dev->up.name, i, rte_strerror(-diag));
1049 break;
1050 }
1051 }
1052
1053 if (i != n_txq) {
1054 /* Retry with fewer tx queues. */
1055 n_txq = i;
1056 continue;
1057 }
1058
1059 for (i = 0; i < n_rxq; i++) {
1060 diag = rte_eth_rx_queue_setup(dev->port_id, i, dev->rxq_size,
1061 dev->socket_id, NULL,
1062 dev->dpdk_mp->mp);
1063 if (diag) {
1064 VLOG_INFO("Interface %s unable to setup rxq(%d): %s",
1065 dev->up.name, i, rte_strerror(-diag));
1066 break;
1067 }
1068 }
1069
1070 if (i != n_rxq) {
1071 /* Retry with fewer rx queues. */
1072 n_rxq = i;
1073 continue;
1074 }
1075
1076 dev->up.n_rxq = n_rxq;
1077 dev->up.n_txq = n_txq;
1078
1079 return 0;
1080 }
1081
1082 return diag;
1083 }
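/* Example of the retry logic above: if 4 Tx queues are requested but
 * rte_eth_tx_queue_setup() fails at i == 2, the whole device configuration is
 * retried with n_txq == 2 (and likewise for Rx queues), rather than failing
 * the port outright.
 */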
1084
1085 static void
1086 dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex)
1087 {
1088 if (rte_eth_dev_flow_ctrl_set(dev->port_id, &dev->fc_conf)) {
1089 VLOG_WARN("Failed to enable flow control on device "DPDK_PORT_ID_FMT,
1090 dev->port_id);
1091 }
1092 }
1093
1094 static int
1095 dpdk_eth_dev_init(struct netdev_dpdk *dev)
1096 OVS_REQUIRES(dev->mutex)
1097 {
1098 struct rte_pktmbuf_pool_private *mbp_priv;
1099 struct rte_eth_dev_info info;
1100 struct rte_ether_addr eth_addr;
1101 int diag;
1102 int n_rxq, n_txq;
1103 uint32_t rx_chksm_offload_capa = DEV_RX_OFFLOAD_UDP_CKSUM |
1104 DEV_RX_OFFLOAD_TCP_CKSUM |
1105 DEV_RX_OFFLOAD_IPV4_CKSUM;
1106 uint32_t tx_tso_offload_capa = DEV_TX_OFFLOAD_TCP_TSO |
1107 DEV_TX_OFFLOAD_TCP_CKSUM |
1108 DEV_TX_OFFLOAD_IPV4_CKSUM;
1109
1110 rte_eth_dev_info_get(dev->port_id, &info);
1111
1112 if (strstr(info.driver_name, "vf") != NULL) {
1113 VLOG_INFO("Virtual function detected, HW_CRC_STRIP will be enabled");
1114 dev->hw_ol_features |= NETDEV_RX_HW_CRC_STRIP;
1115 } else {
1116 dev->hw_ol_features &= ~NETDEV_RX_HW_CRC_STRIP;
1117 }
1118
1119 if ((info.rx_offload_capa & rx_chksm_offload_capa) !=
1120 rx_chksm_offload_capa) {
1121 VLOG_WARN("Rx checksum offload is not supported on port "
1122 DPDK_PORT_ID_FMT, dev->port_id);
1123 dev->hw_ol_features &= ~NETDEV_RX_CHECKSUM_OFFLOAD;
1124 } else {
1125 dev->hw_ol_features |= NETDEV_RX_CHECKSUM_OFFLOAD;
1126 }
1127
1128 if (info.rx_offload_capa & DEV_RX_OFFLOAD_SCATTER) {
1129 dev->hw_ol_features |= NETDEV_RX_HW_SCATTER;
1130 } else {
1131 /* Do not warn on lack of scatter support */
1132 dev->hw_ol_features &= ~NETDEV_RX_HW_SCATTER;
1133 }
1134
1135 if (info.tx_offload_capa & tx_tso_offload_capa) {
1136 dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
1137 } else {
1138 dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
1139 VLOG_WARN("Tx TSO offload is not supported on %s port "
1140 DPDK_PORT_ID_FMT, netdev_get_name(&dev->up), dev->port_id);
1141 }
1142
1143 n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
1144 n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
1145
1146 diag = dpdk_eth_dev_port_config(dev, n_rxq, n_txq);
1147 if (diag) {
1148 VLOG_ERR("Interface %s(rxq:%d txq:%d lsc interrupt mode:%s) "
1149 "configure error: %s",
1150 dev->up.name, n_rxq, n_txq,
1151 dev->lsc_interrupt_mode ? "true" : "false",
1152 rte_strerror(-diag));
1153 return -diag;
1154 }
1155
1156 diag = rte_eth_dev_start(dev->port_id);
1157 if (diag) {
1158 VLOG_ERR("Interface %s start error: %s", dev->up.name,
1159 rte_strerror(-diag));
1160 return -diag;
1161 }
1162 dev->started = true;
1163
1164 rte_eth_promiscuous_enable(dev->port_id);
1165 rte_eth_allmulticast_enable(dev->port_id);
1166
1167 memset(&eth_addr, 0x0, sizeof(eth_addr));
1168 rte_eth_macaddr_get(dev->port_id, &eth_addr);
1169 VLOG_INFO_RL(&rl, "Port "DPDK_PORT_ID_FMT": "ETH_ADDR_FMT,
1170 dev->port_id, ETH_ADDR_BYTES_ARGS(eth_addr.addr_bytes));
1171
1172 memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
1173 rte_eth_link_get_nowait(dev->port_id, &dev->link);
1174
1175 mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
1176 dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;
1177 return 0;
1178 }
1179
1180 static struct netdev_dpdk *
1181 netdev_dpdk_cast(const struct netdev *netdev)
1182 {
1183 return CONTAINER_OF(netdev, struct netdev_dpdk, up);
1184 }
1185
1186 static struct netdev *
1187 netdev_dpdk_alloc(void)
1188 {
1189 struct netdev_dpdk *dev;
1190
1191 dev = dpdk_rte_mzalloc(sizeof *dev);
1192 if (dev) {
1193 return &dev->up;
1194 }
1195
1196 return NULL;
1197 }
1198
1199 static struct dpdk_tx_queue *
1200 netdev_dpdk_alloc_txq(unsigned int n_txqs)
1201 {
1202 struct dpdk_tx_queue *txqs;
1203 unsigned i;
1204
1205 txqs = dpdk_rte_mzalloc(n_txqs * sizeof *txqs);
1206 if (txqs) {
1207 for (i = 0; i < n_txqs; i++) {
1208 /* Initialize map for vhost devices. */
1209 txqs[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
1210 rte_spinlock_init(&txqs[i].tx_lock);
1211 }
1212 }
1213
1214 return txqs;
1215 }
1216
1217 static int
1218 common_construct(struct netdev *netdev, dpdk_port_t port_no,
1219 enum dpdk_dev_type type, int socket_id)
1220 OVS_REQUIRES(dpdk_mutex)
1221 {
1222 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1223
1224 ovs_mutex_init(&dev->mutex);
1225
1226 rte_spinlock_init(&dev->stats_lock);
1227
1228 /* If 'socket_id' is negative, it means that the kernel failed
1229 * to obtain the PCI NUMA info. In that situation, always
1230 * use 'SOCKET0'. */
1231 dev->socket_id = socket_id < 0 ? SOCKET0 : socket_id;
1232 dev->requested_socket_id = dev->socket_id;
1233 dev->port_id = port_no;
1234 dev->type = type;
1235 dev->flags = 0;
1236 dev->requested_mtu = RTE_ETHER_MTU;
1237 dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
1238 dev->requested_lsc_interrupt_mode = 0;
1239 ovsrcu_index_init(&dev->vid, -1);
1240 dev->vhost_reconfigured = false;
1241 dev->attached = false;
1242 dev->started = false;
1243 dev->reset_needed = false;
1244
1245 ovsrcu_init(&dev->qos_conf, NULL);
1246
1247 ovsrcu_init(&dev->ingress_policer, NULL);
1248 dev->policer_rate = 0;
1249 dev->policer_burst = 0;
1250
1251 netdev->n_rxq = 0;
1252 netdev->n_txq = 0;
1253 dev->requested_n_rxq = NR_QUEUE;
1254 dev->requested_n_txq = NR_QUEUE;
1255 dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
1256 dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;
1257
1258 /* Zero-initialize the flow control configuration. */
1259 memset(&dev->fc_conf, 0, sizeof dev->fc_conf);
1260
1261 /* Initialize the hardware offload flags to 0. */
1262 dev->hw_ol_features = 0;
1263
1264 dev->flags = NETDEV_UP | NETDEV_PROMISC;
1265
1266 ovs_list_push_back(&dpdk_list, &dev->list_node);
1267
1268 netdev_request_reconfigure(netdev);
1269
1270 dev->rte_xstats_names = NULL;
1271 dev->rte_xstats_names_size = 0;
1272
1273 dev->rte_xstats_ids = NULL;
1274 dev->rte_xstats_ids_size = 0;
1275
1276 dev->sw_stats = xzalloc(sizeof *dev->sw_stats);
1277 dev->sw_stats->tx_retries = (dev->type == DPDK_DEV_VHOST) ? 0 : UINT64_MAX;
1278
1279 return 0;
1280 }
1281
1282 /* dev_name must be the prefix followed by a positive decimal number.
1283 * (no leading + or - signs are allowed) */
1284 static int
1285 dpdk_dev_parse_name(const char dev_name[], const char prefix[],
1286 unsigned int *port_no)
1287 {
1288 const char *cport;
1289
1290 if (strncmp(dev_name, prefix, strlen(prefix))) {
1291 return ENODEV;
1292 }
1293
1294 cport = dev_name + strlen(prefix);
1295
1296 if (str_to_uint(cport, 10, port_no)) {
1297 return 0;
1298 } else {
1299 return ENODEV;
1300 }
1301 }
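/* Usage sketch for dpdk_dev_parse_name(), with a hypothetical ring port name:
 *
 *   unsigned int port_no;
 *   dpdk_dev_parse_name("dpdkr7", "dpdkr", &port_no);   returns 0, port_no == 7
 *   dpdk_dev_parse_name("dpdkr", "dpdkr", &port_no);    returns ENODEV (no digits)
 *   dpdk_dev_parse_name("dpdkr-1", "dpdkr", &port_no);  returns ENODEV (sign rejected)
 */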
1302
1303 /* Get the number of OVS interfaces which have the same DPDK
1304 * rte device (e.g. same pci bus address).
1305 * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
1306 */
1307 static int
1308 netdev_dpdk_get_num_ports(struct rte_device *device)
1309 OVS_REQUIRES(dpdk_mutex)
1310 {
1311 struct netdev_dpdk *dev;
1312 int count = 0;
1313
1314 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
1315 if (rte_eth_devices[dev->port_id].device == device
1316 && rte_eth_devices[dev->port_id].state != RTE_ETH_DEV_UNUSED) {
1317 count++;
1318 }
1319 }
1320 return count;
1321 }
1322
1323 static int
1324 vhost_common_construct(struct netdev *netdev)
1325 OVS_REQUIRES(dpdk_mutex)
1326 {
1327 int socket_id = rte_lcore_to_socket_id(rte_get_master_lcore());
1328 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1329
1330 dev->vhost_rxq_enabled = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM *
1331 sizeof *dev->vhost_rxq_enabled);
1332 if (!dev->vhost_rxq_enabled) {
1333 return ENOMEM;
1334 }
1335 dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
1336 if (!dev->tx_q) {
1337 rte_free(dev->vhost_rxq_enabled);
1338 return ENOMEM;
1339 }
1340
1341 atomic_init(&dev->vhost_tx_retries_max, VHOST_ENQ_RETRY_DEF);
1342
1343 return common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
1344 DPDK_DEV_VHOST, socket_id);
1345 }
1346
1347 static int
1348 netdev_dpdk_vhost_construct(struct netdev *netdev)
1349 {
1350 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1351 const char *name = netdev->name;
1352 int err;
1353
1354 /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
1355 * the file system. '/' or '\' would traverse directories, so they're not
1356 * acceptable in 'name'. */
1357 if (strchr(name, '/') || strchr(name, '\\')) {
1358 VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
1359 "A valid name must not include '/' or '\\'",
1360 name);
1361 return EINVAL;
1362 }
1363
1364 ovs_mutex_lock(&dpdk_mutex);
1365 /* Take the name of the vhost-user port and append it to the location where
1366 * the socket is to be created, then register the socket.
1367 */
1368 dev->vhost_id = xasprintf("%s/%s", dpdk_get_vhost_sock_dir(), name);
1369
1370 dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
1371
1372 /* There is no support for multi-segments buffers. */
1373 dev->vhost_driver_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
1374 err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags);
1375 if (err) {
1376 VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
1377 dev->vhost_id);
1378 goto out;
1379 } else {
1380 fatal_signal_add_file_to_unlink(dev->vhost_id);
1381 VLOG_INFO("Socket %s created for vhost-user port %s\n",
1382 dev->vhost_id, name);
1383 }
1384
1385 err = rte_vhost_driver_callback_register(dev->vhost_id,
1386 &virtio_net_device_ops);
1387 if (err) {
1388 VLOG_ERR("rte_vhost_driver_callback_register failed for vhost user "
1389 "port: %s\n", name);
1390 goto out;
1391 }
1392
1393 if (!userspace_tso_enabled()) {
1394 err = rte_vhost_driver_disable_features(dev->vhost_id,
1395 1ULL << VIRTIO_NET_F_HOST_TSO4
1396 | 1ULL << VIRTIO_NET_F_HOST_TSO6
1397 | 1ULL << VIRTIO_NET_F_CSUM);
1398 if (err) {
1399 VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
1400 "port: %s\n", name);
1401 goto out;
1402 }
1403 }
1404
1405 err = rte_vhost_driver_start(dev->vhost_id);
1406 if (err) {
1407 VLOG_ERR("rte_vhost_driver_start failed for vhost user "
1408 "port: %s\n", name);
1409 goto out;
1410 }
1411
1412 err = vhost_common_construct(netdev);
1413 if (err) {
1414 VLOG_ERR("vhost_common_construct failed for vhost user "
1415 "port: %s\n", name);
1416 }
1417
1418 out:
1419 if (err) {
1420 free(dev->vhost_id);
1421 dev->vhost_id = NULL;
1422 }
1423
1424 ovs_mutex_unlock(&dpdk_mutex);
1425 VLOG_WARN_ONCE("dpdkvhostuser ports are considered deprecated; "
1426 "please migrate to dpdkvhostuserclient ports.");
1427 return err;
1428 }
1429
1430 static int
1431 netdev_dpdk_vhost_client_construct(struct netdev *netdev)
1432 {
1433 int err;
1434
1435 ovs_mutex_lock(&dpdk_mutex);
1436 err = vhost_common_construct(netdev);
1437 if (err) {
1438 VLOG_ERR("vhost_common_construct failed for vhost user client"
1439 "port: %s\n", netdev->name);
1440 }
1441 ovs_mutex_unlock(&dpdk_mutex);
1442 return err;
1443 }
1444
1445 static int
1446 netdev_dpdk_construct(struct netdev *netdev)
1447 {
1448 int err;
1449
1450 ovs_mutex_lock(&dpdk_mutex);
1451 err = common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
1452 DPDK_DEV_ETH, SOCKET0);
1453 ovs_mutex_unlock(&dpdk_mutex);
1454 return err;
1455 }
1456
1457 static void
1458 common_destruct(struct netdev_dpdk *dev)
1459 OVS_REQUIRES(dpdk_mutex)
1460 OVS_EXCLUDED(dev->mutex)
1461 {
1462 rte_free(dev->tx_q);
1463 dpdk_mp_put(dev->dpdk_mp);
1464
1465 ovs_list_remove(&dev->list_node);
1466 free(ovsrcu_get_protected(struct ingress_policer *,
1467 &dev->ingress_policer));
1468 free(dev->sw_stats);
1469 ovs_mutex_destroy(&dev->mutex);
1470 }
1471
1472 static void
1473 netdev_dpdk_destruct(struct netdev *netdev)
1474 {
1475 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1476 struct rte_device *rte_dev;
1477 struct rte_eth_dev *eth_dev;
1478 bool remove_on_close;
1479
1480 ovs_mutex_lock(&dpdk_mutex);
1481
1482 rte_eth_dev_stop(dev->port_id);
1483 dev->started = false;
1484
1485 if (dev->attached) {
1486 /* Retrieve eth device data before closing it.
1487 * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
1488 */
1489 eth_dev = &rte_eth_devices[dev->port_id];
1490 remove_on_close =
1491 eth_dev->data &&
1492 (eth_dev->data->dev_flags & RTE_ETH_DEV_CLOSE_REMOVE);
1493 rte_dev = eth_dev->device;
1494
1495 /* Remove the eth device. */
1496 rte_eth_dev_close(dev->port_id);
1497
1498 /* Remove this rte device and all its eth devices if flag
1499 * RTE_ETH_DEV_CLOSE_REMOVE is not supported (which means representors
1500 * are not supported), or if all the eth devices belonging to the rte
1501 * device are closed.
1502 */
1503 if (!remove_on_close || !netdev_dpdk_get_num_ports(rte_dev)) {
1504 int ret = rte_dev_remove(rte_dev);
1505
1506 if (ret < 0) {
1507 VLOG_ERR("Device '%s' can not be detached: %s.",
1508 dev->devargs, rte_strerror(-ret));
1509 } else {
1510 /* Device was closed and detached. */
1511 VLOG_INFO("Device '%s' has been removed and detached",
1512 dev->devargs);
1513 }
1514 } else {
1515 /* Device was only closed. rte_dev_remove() was not called. */
1516 VLOG_INFO("Device '%s' has been removed", dev->devargs);
1517 }
1518 }
1519
1520 netdev_dpdk_clear_xstats(dev);
1521 free(dev->devargs);
1522 common_destruct(dev);
1523
1524 ovs_mutex_unlock(&dpdk_mutex);
1525 }
1526
1527 /* rte_vhost_driver_unregister() can call back destroy_device(), which will
1528 * try to acquire 'dpdk_mutex' and possibly 'dev->mutex'. To avoid a
1529 * deadlock, none of the mutexes must be held while calling this function. */
1530 static int
1531 dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
1532 char *vhost_id)
1533 OVS_EXCLUDED(dpdk_mutex)
1534 OVS_EXCLUDED(dev->mutex)
1535 {
1536 return rte_vhost_driver_unregister(vhost_id);
1537 }
1538
1539 static void
1540 netdev_dpdk_vhost_destruct(struct netdev *netdev)
1541 {
1542 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1543 char *vhost_id;
1544
1545 ovs_mutex_lock(&dpdk_mutex);
1546
1547 /* Guest becomes an orphan if still attached. */
1548 if (netdev_dpdk_get_vid(dev) >= 0
1549 && !(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
1550 VLOG_ERR("Removing port '%s' while vhost device still attached.",
1551 netdev->name);
1552 VLOG_ERR("To restore connectivity after re-adding of port, VM on "
1553 "socket '%s' must be restarted.", dev->vhost_id);
1554 }
1555
1556 vhost_id = dev->vhost_id;
1557 dev->vhost_id = NULL;
1558 rte_free(dev->vhost_rxq_enabled);
1559
1560 common_destruct(dev);
1561
1562 ovs_mutex_unlock(&dpdk_mutex);
1563
1564 if (!vhost_id) {
1565 goto out;
1566 }
1567
1568 if (dpdk_vhost_driver_unregister(dev, vhost_id)) {
1569 VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
1570 netdev->name, vhost_id);
1571 } else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
1572 /* OVS server mode - remove this socket from list for deletion */
1573 fatal_signal_remove_file_to_unlink(vhost_id);
1574 }
1575 out:
1576 free(vhost_id);
1577 }
1578
1579 static void
1580 netdev_dpdk_dealloc(struct netdev *netdev)
1581 {
1582 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1583
1584 rte_free(dev);
1585 }
1586
1587 static void
1588 netdev_dpdk_clear_xstats(struct netdev_dpdk *dev)
1589 {
1590 /* If statistics are already allocated, we have to
1591 * reconfigure, as port_id could have changed. */
1592 if (dev->rte_xstats_names) {
1593 free(dev->rte_xstats_names);
1594 dev->rte_xstats_names = NULL;
1595 dev->rte_xstats_names_size = 0;
1596 }
1597 if (dev->rte_xstats_ids) {
1598 free(dev->rte_xstats_ids);
1599 dev->rte_xstats_ids = NULL;
1600 dev->rte_xstats_ids_size = 0;
1601 }
1602 }
1603
1604 static const char*
1605 netdev_dpdk_get_xstat_name(struct netdev_dpdk *dev, uint64_t id)
1606 {
1607 if (id >= dev->rte_xstats_names_size) {
1608 return "UNKNOWN";
1609 }
1610 return dev->rte_xstats_names[id].name;
1611 }
1612
1613 static bool
1614 netdev_dpdk_configure_xstats(struct netdev_dpdk *dev)
1615 OVS_REQUIRES(dev->mutex)
1616 {
1617 int rte_xstats_len;
1618 bool ret;
1619 struct rte_eth_xstat *rte_xstats;
1620 uint64_t id;
1621 int xstats_no;
1622 const char *name;
1623
1624 /* Retrieve all XSTATS names. If something goes wrong, or the
1625 * number of counters is 0, the rte_xstats_names buffer will be
1626 * set to NULL, and no further xstats queries will be
1627 * performed (e.g. during netdev_dpdk_get_stats
1628 * execution). */
1629
1630 ret = false;
1631 rte_xstats = NULL;
1632
1633 if (dev->rte_xstats_names == NULL || dev->rte_xstats_ids == NULL) {
1634 dev->rte_xstats_names_size =
1635 rte_eth_xstats_get_names(dev->port_id, NULL, 0);
1636
1637 if (dev->rte_xstats_names_size < 0) {
1638 VLOG_WARN("Cannot get XSTATS for port: "DPDK_PORT_ID_FMT,
1639 dev->port_id);
1640 dev->rte_xstats_names_size = 0;
1641 } else {
1642 /* Reserve memory for xstats names and values */
1643 dev->rte_xstats_names = xcalloc(dev->rte_xstats_names_size,
1644 sizeof *dev->rte_xstats_names);
1645
1646 if (dev->rte_xstats_names) {
1647 /* Retrieve xstats names. */
1648 rte_xstats_len =
1649 rte_eth_xstats_get_names(dev->port_id,
1650 dev->rte_xstats_names,
1651 dev->rte_xstats_names_size);
1652
1653 if (rte_xstats_len < 0) {
1654 VLOG_WARN("Cannot get XSTATS names for port: "
1655 DPDK_PORT_ID_FMT, dev->port_id);
1656 goto out;
1657 } else if (rte_xstats_len != dev->rte_xstats_names_size) {
1658 VLOG_WARN("XSTATS size doesn't match for port: "
1659 DPDK_PORT_ID_FMT, dev->port_id);
1660 goto out;
1661 }
1662
1663 dev->rte_xstats_ids = xcalloc(dev->rte_xstats_names_size,
1664 sizeof(uint64_t));
1665
1666 /* We have to calculate the number of counters. */
1667 rte_xstats = xmalloc(rte_xstats_len * sizeof *rte_xstats);
1668 memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);
1669
1670 /* Retrieve xstats values. */
1671 if (rte_eth_xstats_get(dev->port_id, rte_xstats,
1672 rte_xstats_len) > 0) {
1673 dev->rte_xstats_ids_size = 0;
1674 xstats_no = 0;
1675 for (uint32_t i = 0; i < rte_xstats_len; i++) {
1676 id = rte_xstats[i].id;
1677 name = netdev_dpdk_get_xstat_name(dev, id);
1678 /* We need to filter out everything except
1679 * dropped, error and management counters */
1680 if (string_ends_with(name, "_errors") ||
1681 strstr(name, "_management_") ||
1682 string_ends_with(name, "_dropped")) {
1683
1684 dev->rte_xstats_ids[xstats_no] = id;
1685 xstats_no++;
1686 }
1687 }
1688 dev->rte_xstats_ids_size = xstats_no;
1689 ret = true;
1690 } else {
1691 VLOG_WARN("Can't get XSTATS IDs for port: "
1692 DPDK_PORT_ID_FMT, dev->port_id);
1693 }
1694
1695 free(rte_xstats);
1696 }
1697 }
1698 } else {
1699 /* Already configured */
1700 ret = true;
1701 }
1702
1703 out:
1704 if (!ret) {
1705 netdev_dpdk_clear_xstats(dev);
1706 }
1707 return ret;
1708 }
1709
1710 static int
1711 netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
1712 {
1713 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1714
1715 ovs_mutex_lock(&dev->mutex);
1716
1717 smap_add_format(args, "requested_rx_queues", "%d", dev->requested_n_rxq);
1718 smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq);
1719 smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq);
1720 smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq);
1721 smap_add_format(args, "mtu", "%d", dev->mtu);
1722
1723 if (dev->type == DPDK_DEV_ETH) {
1724 smap_add_format(args, "requested_rxq_descriptors", "%d",
1725 dev->requested_rxq_size);
1726 smap_add_format(args, "configured_rxq_descriptors", "%d",
1727 dev->rxq_size);
1728 smap_add_format(args, "requested_txq_descriptors", "%d",
1729 dev->requested_txq_size);
1730 smap_add_format(args, "configured_txq_descriptors", "%d",
1731 dev->txq_size);
1732 if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
1733 smap_add(args, "rx_csum_offload", "true");
1734 } else {
1735 smap_add(args, "rx_csum_offload", "false");
1736 }
1737 if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
1738 smap_add(args, "tx_tso_offload", "true");
1739 } else {
1740 smap_add(args, "tx_tso_offload", "false");
1741 }
1742 smap_add(args, "lsc_interrupt_mode",
1743 dev->lsc_interrupt_mode ? "true" : "false");
1744 }
1745 ovs_mutex_unlock(&dev->mutex);
1746
1747 return 0;
1748 }
1749
1750 static struct netdev_dpdk *
1751 netdev_dpdk_lookup_by_port_id(dpdk_port_t port_id)
1752 OVS_REQUIRES(dpdk_mutex)
1753 {
1754 struct netdev_dpdk *dev;
1755
1756 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
1757 if (dev->port_id == port_id) {
1758 return dev;
1759 }
1760 }
1761
1762 return NULL;
1763 }
1764
1765 static dpdk_port_t
1766 netdev_dpdk_get_port_by_mac(const char *mac_str)
1767 {
1768 dpdk_port_t port_id;
1769 struct eth_addr mac, port_mac;
1770
1771 if (!eth_addr_from_string(mac_str, &mac)) {
1772 VLOG_ERR("invalid mac: %s", mac_str);
1773 return DPDK_ETH_PORT_ID_INVALID;
1774 }
1775
1776 RTE_ETH_FOREACH_DEV (port_id) {
1777 struct rte_ether_addr ea;
1778
1779 rte_eth_macaddr_get(port_id, &ea);
1780 memcpy(port_mac.ea, ea.addr_bytes, ETH_ADDR_LEN);
1781 if (eth_addr_equals(mac, port_mac)) {
1782 return port_id;
1783 }
1784 }
1785
1786 return DPDK_ETH_PORT_ID_INVALID;
1787 }
1788
1789 /* Return the first DPDK port id matching the devargs pattern. */
1790 static dpdk_port_t netdev_dpdk_get_port_by_devargs(const char *devargs)
1791 OVS_REQUIRES(dpdk_mutex)
1792 {
1793 dpdk_port_t port_id;
1794 struct rte_dev_iterator iterator;
1795
1796 RTE_ETH_FOREACH_MATCHING_DEV (port_id, devargs, &iterator) {
1797 /* If the iteration is stopped early, rte_eth_iterator_cleanup() must be called. */
1798 rte_eth_iterator_cleanup(&iterator);
1799 break;
1800 }
1801
1802 return port_id;
1803 }
1804
1805 /*
1806 * Normally, a PCI id (optionally followed by a representor number)
1807 * is enough to identify a specific DPDK port.
1808 * However, some NICs expose multiple ports that share the same PCI
1809 * id, in which case the PCI id alone is not sufficient.
1810 *
1811 * To handle that, one more method is supported: "class=eth,mac=$MAC".
1812 *
1813 * Compatibility is fully preserved: users can still add ports by
1814 * PCI id when that is sufficient for them.
1815 */
1816 static dpdk_port_t
1817 netdev_dpdk_process_devargs(struct netdev_dpdk *dev,
1818 const char *devargs, char **errp)
1819 OVS_REQUIRES(dpdk_mutex)
1820 {
1821 dpdk_port_t new_port_id;
1822
1823 if (strncmp(devargs, "class=eth,mac=", 14) == 0) {
1824 new_port_id = netdev_dpdk_get_port_by_mac(&devargs[14]);
1825 } else {
1826 new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
1827 if (!rte_eth_dev_is_valid_port(new_port_id)) {
1828 /* Device not found in DPDK, attempt to attach it */
1829 if (rte_dev_probe(devargs)) {
1830 new_port_id = DPDK_ETH_PORT_ID_INVALID;
1831 } else {
1832 new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
1833 if (rte_eth_dev_is_valid_port(new_port_id)) {
1834 /* Attach successful */
1835 dev->attached = true;
1836 VLOG_INFO("Device '%s' attached to DPDK", devargs);
1837 } else {
1838 /* Attach unsuccessful */
1839 new_port_id = DPDK_ETH_PORT_ID_INVALID;
1840 }
1841 }
1842 }
1843 }
1844
1845 if (new_port_id == DPDK_ETH_PORT_ID_INVALID) {
1846 VLOG_WARN_BUF(errp, "Error attaching device '%s' to DPDK", devargs);
1847 }
1848
1849 return new_port_id;
1850 }
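/*
 * Illustrative usage sketch (not taken from this file; the interface name
 * "dpdk0" is a placeholder): both devargs forms handled above could be
 * supplied through the database, e.g.:
 *
 *   ovs-vsctl set Interface dpdk0 options:dpdk-devargs=0000:01:00.0
 *   ovs-vsctl set Interface dpdk0 \
 *       options:dpdk-devargs="class=eth,mac=00:11:22:33:44:55"
 *
 * The first form resolves the port by its PCI id (probing it if it is not
 * yet attached); the second resolves it by MAC address via
 * netdev_dpdk_get_port_by_mac().
 */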
1851
1852 static int
1853 dpdk_eth_event_callback(dpdk_port_t port_id, enum rte_eth_event_type type,
1854 void *param OVS_UNUSED, void *ret_param OVS_UNUSED)
1855 {
1856 struct netdev_dpdk *dev;
1857
1858 switch ((int) type) {
1859 case RTE_ETH_EVENT_INTR_RESET:
1860 ovs_mutex_lock(&dpdk_mutex);
1861 dev = netdev_dpdk_lookup_by_port_id(port_id);
1862 if (dev) {
1863 ovs_mutex_lock(&dev->mutex);
1864 dev->reset_needed = true;
1865 netdev_request_reconfigure(&dev->up);
1866 VLOG_DBG_RL(&rl, "%s: Device reset requested.",
1867 netdev_get_name(&dev->up));
1868 ovs_mutex_unlock(&dev->mutex);
1869 }
1870 ovs_mutex_unlock(&dpdk_mutex);
1871 break;
1872
1873 default:
1874 /* Ignore all other types. */
1875 break;
1876 }
1877 return 0;
1878 }
1879
1880 static void
1881 dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args)
1882 OVS_REQUIRES(dev->mutex)
1883 {
1884 int new_n_rxq;
1885
1886 new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
1887 if (new_n_rxq != dev->requested_n_rxq) {
1888 dev->requested_n_rxq = new_n_rxq;
1889 netdev_request_reconfigure(&dev->up);
1890 }
1891 }
1892
1893 static void
1894 dpdk_process_queue_size(struct netdev *netdev, const struct smap *args,
1895 const char *flag, int default_size, int *new_size)
1896 {
1897 int queue_size = smap_get_int(args, flag, default_size);
1898
1899 if (queue_size <= 0 || queue_size > NIC_PORT_MAX_Q_SIZE
1900 || !is_pow2(queue_size)) {
1901 queue_size = default_size;
1902 }
1903
1904 if (queue_size != *new_size) {
1905 *new_size = queue_size;
1906 netdev_request_reconfigure(netdev);
1907 }
1908 }
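/*
 * Worked example (illustrative only): "n_rxq_desc=1024" is accepted as-is
 * because it is a power of two (assuming it does not exceed
 * NIC_PORT_MAX_Q_SIZE), while "n_rxq_desc=1000" or "n_rxq_desc=0" fall back
 * to the supplied default (NIC_PORT_DEFAULT_RXQ_SIZE for the rx case).
 * A reconfiguration is requested only when the validated value actually
 * differs from the currently requested size.
 */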
1909
1910 static int
1911 netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
1912 char **errp)
1913 {
1914 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1915 bool rx_fc_en, tx_fc_en, autoneg, lsc_interrupt_mode;
1916 bool flow_control_requested = true;
1917 enum rte_eth_fc_mode fc_mode;
1918 static const enum rte_eth_fc_mode fc_mode_set[2][2] = {
1919 {RTE_FC_NONE, RTE_FC_TX_PAUSE},
1920 {RTE_FC_RX_PAUSE, RTE_FC_FULL }
1921 };
1922 const char *new_devargs;
1923 int err = 0;
1924
1925 ovs_mutex_lock(&dpdk_mutex);
1926 ovs_mutex_lock(&dev->mutex);
1927
1928 dpdk_set_rxq_config(dev, args);
1929
1930 dpdk_process_queue_size(netdev, args, "n_rxq_desc",
1931 NIC_PORT_DEFAULT_RXQ_SIZE,
1932 &dev->requested_rxq_size);
1933 dpdk_process_queue_size(netdev, args, "n_txq_desc",
1934 NIC_PORT_DEFAULT_TXQ_SIZE,
1935 &dev->requested_txq_size);
1936
1937 new_devargs = smap_get(args, "dpdk-devargs");
1938
1939 if (dev->devargs && new_devargs && strcmp(new_devargs, dev->devargs)) {
1940 /* The user requested a new device. If we return an error, the caller
1941 * will delete this netdev and try to recreate it. */
1942 err = EAGAIN;
1943 goto out;
1944 }
1945
1946 /* dpdk-devargs is required for device configuration */
1947 if (new_devargs && new_devargs[0]) {
1948 /* Don't process dpdk-devargs if value is unchanged and port id
1949 * is valid */
1950 if (!(dev->devargs && !strcmp(dev->devargs, new_devargs)
1951 && rte_eth_dev_is_valid_port(dev->port_id))) {
1952 dpdk_port_t new_port_id = netdev_dpdk_process_devargs(dev,
1953 new_devargs,
1954 errp);
1955 if (!rte_eth_dev_is_valid_port(new_port_id)) {
1956 err = EINVAL;
1957 } else if (new_port_id == dev->port_id) {
1958 /* Already configured, do not reconfigure again */
1959 err = 0;
1960 } else {
1961 struct netdev_dpdk *dup_dev;
1962
1963 dup_dev = netdev_dpdk_lookup_by_port_id(new_port_id);
1964 if (dup_dev) {
1965 VLOG_WARN_BUF(errp, "'%s' is trying to use device '%s' "
1966 "which is already in use by '%s'",
1967 netdev_get_name(netdev), new_devargs,
1968 netdev_get_name(&dup_dev->up));
1969 err = EADDRINUSE;
1970 } else {
1971 int sid = rte_eth_dev_socket_id(new_port_id);
1972
1973 dev->requested_socket_id = sid < 0 ? SOCKET0 : sid;
1974 dev->devargs = xstrdup(new_devargs);
1975 dev->port_id = new_port_id;
1976 netdev_request_reconfigure(&dev->up);
1977 netdev_dpdk_clear_xstats(dev);
1978 err = 0;
1979 }
1980 }
1981 }
1982 } else {
1983 VLOG_WARN_BUF(errp, "'%s' is missing 'options:dpdk-devargs'. "
1984 "The old 'dpdk<port_id>' names are not supported",
1985 netdev_get_name(netdev));
1986 err = EINVAL;
1987 }
1988
1989 if (err) {
1990 goto out;
1991 }
1992
1993 lsc_interrupt_mode = smap_get_bool(args, "dpdk-lsc-interrupt", false);
1994 if (dev->requested_lsc_interrupt_mode != lsc_interrupt_mode) {
1995 dev->requested_lsc_interrupt_mode = lsc_interrupt_mode;
1996 netdev_request_reconfigure(netdev);
1997 }
1998
1999 rx_fc_en = smap_get_bool(args, "rx-flow-ctrl", false);
2000 tx_fc_en = smap_get_bool(args, "tx-flow-ctrl", false);
2001 autoneg = smap_get_bool(args, "flow-ctrl-autoneg", false);
2002
2003 fc_mode = fc_mode_set[tx_fc_en][rx_fc_en];
2004
2005 if (!smap_get(args, "rx-flow-ctrl") && !smap_get(args, "tx-flow-ctrl")
2006 && !smap_get(args, "flow-ctrl-autoneg")) {
2007 /* FIXME: The user didn't ask for flow control configuration.
2008 * For now, don't print a warning if flow control is not
2009 * supported by the DPDK port. */
2010 flow_control_requested = false;
2011 }
2012
2013 /* Get the Flow control configuration. */
2014 err = -rte_eth_dev_flow_ctrl_get(dev->port_id, &dev->fc_conf);
2015 if (err) {
2016 if (err == ENOTSUP) {
2017 if (flow_control_requested) {
2018 VLOG_WARN("%s: Flow control is not supported.",
2019 netdev_get_name(netdev));
2020 }
2021 err = 0; /* Not fatal. */
2022 } else {
2023 VLOG_WARN("%s: Cannot get flow control parameters: %s",
2024 netdev_get_name(netdev), rte_strerror(err));
2025 }
2026 goto out;
2027 }
2028
2029 if (dev->fc_conf.mode != fc_mode || autoneg != dev->fc_conf.autoneg) {
2030 dev->fc_conf.mode = fc_mode;
2031 dev->fc_conf.autoneg = autoneg;
2032 dpdk_eth_flow_ctrl_setup(dev);
2033 }
2034
2035 out:
2036 ovs_mutex_unlock(&dev->mutex);
2037 ovs_mutex_unlock(&dpdk_mutex);
2038
2039 return err;
2040 }
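/*
 * Illustrative sketch of the flow-control selection above (not new
 * behavior): the fc_mode_set table maps the two booleans to a DPDK mode,
 * e.g.:
 *
 *   rx-flow-ctrl=false, tx-flow-ctrl=false  ->  RTE_FC_NONE
 *   rx-flow-ctrl=true,  tx-flow-ctrl=true   ->  RTE_FC_FULL
 *
 * which could be configured with something like ("dpdk0" is a placeholder
 * name):
 *
 *   ovs-vsctl set Interface dpdk0 options:rx-flow-ctrl=true \
 *       options:tx-flow-ctrl=true options:flow-ctrl-autoneg=false
 */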
2041
2042 static int
2043 netdev_dpdk_ring_set_config(struct netdev *netdev, const struct smap *args,
2044 char **errp OVS_UNUSED)
2045 {
2046 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2047
2048 ovs_mutex_lock(&dev->mutex);
2049 dpdk_set_rxq_config(dev, args);
2050 ovs_mutex_unlock(&dev->mutex);
2051
2052 return 0;
2053 }
2054
2055 static int
2056 netdev_dpdk_vhost_client_set_config(struct netdev *netdev,
2057 const struct smap *args,
2058 char **errp OVS_UNUSED)
2059 {
2060 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2061 const char *path;
2062 int max_tx_retries, cur_max_tx_retries;
2063
2064 ovs_mutex_lock(&dev->mutex);
2065 if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
2066 path = smap_get(args, "vhost-server-path");
2067 if (!nullable_string_is_equal(path, dev->vhost_id)) {
2068 free(dev->vhost_id);
2069 dev->vhost_id = nullable_xstrdup(path);
2070 /* Check the zero-copy configuration. */
2071 if (smap_get_bool(args, "dq-zero-copy", false)) {
2072 dev->vhost_driver_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
2073 } else {
2074 dev->vhost_driver_flags &= ~RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
2075 }
2076 netdev_request_reconfigure(netdev);
2077 }
2078 }
2079
2080 max_tx_retries = smap_get_int(args, "tx-retries-max",
2081 VHOST_ENQ_RETRY_DEF);
2082 if (max_tx_retries < VHOST_ENQ_RETRY_MIN
2083 || max_tx_retries > VHOST_ENQ_RETRY_MAX) {
2084 max_tx_retries = VHOST_ENQ_RETRY_DEF;
2085 }
2086 atomic_read_relaxed(&dev->vhost_tx_retries_max, &cur_max_tx_retries);
2087 if (max_tx_retries != cur_max_tx_retries) {
2088 atomic_store_relaxed(&dev->vhost_tx_retries_max, max_tx_retries);
2089 VLOG_INFO("Max Tx retries for vhost device '%s' set to %d",
2090 netdev_get_name(netdev), max_tx_retries);
2091 }
2092 ovs_mutex_unlock(&dev->mutex);
2093
2094 return 0;
2095 }
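/*
 * Illustrative configuration sketch for the vhost-user client options
 * parsed above (the interface name and socket path are placeholders):
 *
 *   ovs-vsctl set Interface vhostclient0 \
 *       options:vhost-server-path=/tmp/vhostclient0.sock \
 *       options:tx-retries-max=16
 *
 * "tx-retries-max" values outside [VHOST_ENQ_RETRY_MIN,
 * VHOST_ENQ_RETRY_MAX] silently fall back to VHOST_ENQ_RETRY_DEF.
 */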
2096
2097 static int
2098 netdev_dpdk_get_numa_id(const struct netdev *netdev)
2099 {
2100 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2101
2102 return dev->socket_id;
2103 }
2104
2105 /* Sets the number of tx queues for the dpdk interface. */
2106 static int
2107 netdev_dpdk_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
2108 {
2109 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2110
2111 ovs_mutex_lock(&dev->mutex);
2112
2113 if (dev->requested_n_txq == n_txq) {
2114 goto out;
2115 }
2116
2117 dev->requested_n_txq = n_txq;
2118 netdev_request_reconfigure(netdev);
2119
2120 out:
2121 ovs_mutex_unlock(&dev->mutex);
2122 return 0;
2123 }
2124
2125 static struct netdev_rxq *
2126 netdev_dpdk_rxq_alloc(void)
2127 {
2128 struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);
2129
2130 if (rx) {
2131 return &rx->up;
2132 }
2133
2134 return NULL;
2135 }
2136
2137 static struct netdev_rxq_dpdk *
2138 netdev_rxq_dpdk_cast(const struct netdev_rxq *rxq)
2139 {
2140 return CONTAINER_OF(rxq, struct netdev_rxq_dpdk, up);
2141 }
2142
2143 static int
2144 netdev_dpdk_rxq_construct(struct netdev_rxq *rxq)
2145 {
2146 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
2147 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
2148
2149 ovs_mutex_lock(&dev->mutex);
2150 rx->port_id = dev->port_id;
2151 ovs_mutex_unlock(&dev->mutex);
2152
2153 return 0;
2154 }
2155
2156 static void
2157 netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq OVS_UNUSED)
2158 {
2159 }
2160
2161 static void
2162 netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
2163 {
2164 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
2165
2166 rte_free(rx);
2167 }
2168
2169 /* Prepare the packet for HWOL.
2170 * Returns true if the packet is OK for further processing. */
2171 static bool
2172 netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
2173 {
2174 struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
2175
2176 if (mbuf->ol_flags & PKT_TX_L4_MASK) {
2177 mbuf->l2_len = (char *)dp_packet_l3(pkt) - (char *)dp_packet_eth(pkt);
2178 mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
2179 mbuf->outer_l2_len = 0;
2180 mbuf->outer_l3_len = 0;
2181 }
2182
2183 if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
2184 struct tcp_header *th = dp_packet_l4(pkt);
2185
2186 if (!th) {
2187 VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
2188 " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
2189 return false;
2190 }
2191
2192 mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
2193 mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
2194 mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
2195
2196 if (mbuf->ol_flags & PKT_TX_IPV4) {
2197 mbuf->ol_flags |= PKT_TX_IP_CKSUM;
2198 }
2199 }
2200 return true;
2201 }
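/*
 * Worked example of the offload fields set above (illustrative, assuming
 * an IPv4/TCP packet without options): l2_len = 14 (Ethernet),
 * l3_len = 20, l4_len = data offset * 4 = 20, so with an MTU of 1500
 * tso_segsz = 1500 - 20 - 20 = 1460 bytes of TCP payload per segment.
 * The TCP (and, for IPv4, IP) checksum flags are set so that checksums are
 * recomputed for each emitted segment.
 */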
2202
2203 /* Prepare a batch for HWOL.
2204 * Returns the number of good packets in the batch. */
2205 static int
2206 netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
2207 int pkt_cnt)
2208 {
2209 int i = 0;
2210 int cnt = 0;
2211 struct rte_mbuf *pkt;
2212
2213 /* Prepare packets for HWOL and filter out the bad ones. */
2214 for (i = 0; i < pkt_cnt; i++) {
2215 pkt = pkts[i];
2216 if (!netdev_dpdk_prep_hwol_packet(dev, pkt)) {
2217 rte_pktmbuf_free(pkt);
2218 continue;
2219 }
2220
2221 if (OVS_UNLIKELY(i != cnt)) {
2222 pkts[cnt] = pkt;
2223 }
2224 cnt++;
2225 }
2226
2227 return cnt;
2228 }
2229
2230 /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes ownership of
2231 * 'pkts', even in case of failure.
2232 *
2233 * Returns the number of packets that weren't transmitted. */
2234 static inline int
2235 netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
2236 struct rte_mbuf **pkts, int cnt)
2237 {
2238 uint32_t nb_tx = 0;
2239 uint16_t nb_tx_prep = cnt;
2240
2241 if (userspace_tso_enabled()) {
2242 nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
2243 if (nb_tx_prep != cnt) {
2244 VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
2245 "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep,
2246 cnt, rte_strerror(rte_errno));
2247 }
2248 }
2249
2250 while (nb_tx != nb_tx_prep) {
2251 uint32_t ret;
2252
2253 ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx,
2254 nb_tx_prep - nb_tx);
2255 if (!ret) {
2256 break;
2257 }
2258
2259 nb_tx += ret;
2260 }
2261
2262 if (OVS_UNLIKELY(nb_tx != cnt)) {
2263 /* Free the buffers that we couldn't transmit, one at a time (each
2264 * packet could come from a different mempool). */
2265 int i;
2266
2267 for (i = nb_tx; i < cnt; i++) {
2268 rte_pktmbuf_free(pkts[i]);
2269 }
2270 }
2271
2272 return cnt - nb_tx;
2273 }
2274
2275 static inline bool
2276 netdev_dpdk_srtcm_policer_pkt_handle(struct rte_meter_srtcm *meter,
2277 struct rte_meter_srtcm_profile *profile,
2278 struct rte_mbuf *pkt, uint64_t time)
2279 {
2280 uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct rte_ether_hdr);
2281
2282 return rte_meter_srtcm_color_blind_check(meter, profile, time, pkt_len) ==
2283 RTE_COLOR_GREEN;
2284 }
2285
2286 static int
2287 srtcm_policer_run_single_packet(struct rte_meter_srtcm *meter,
2288 struct rte_meter_srtcm_profile *profile,
2289 struct rte_mbuf **pkts, int pkt_cnt,
2290 bool should_steal)
2291 {
2292 int i = 0;
2293 int cnt = 0;
2294 struct rte_mbuf *pkt = NULL;
2295 uint64_t current_time = rte_rdtsc();
2296
2297 for (i = 0; i < pkt_cnt; i++) {
2298 pkt = pkts[i];
2299 /* Handle current packet */
2300 if (netdev_dpdk_srtcm_policer_pkt_handle(meter, profile,
2301 pkt, current_time)) {
2302 if (cnt != i) {
2303 pkts[cnt] = pkt;
2304 }
2305 cnt++;
2306 } else {
2307 if (should_steal) {
2308 rte_pktmbuf_free(pkt);
2309 }
2310 }
2311 }
2312
2313 return cnt;
2314 }
2315
2316 static int
2317 ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,
2318 int pkt_cnt, bool should_steal)
2319 {
2320 int cnt = 0;
2321
2322 rte_spinlock_lock(&policer->policer_lock);
2323 cnt = srtcm_policer_run_single_packet(&policer->in_policer,
2324 &policer->in_prof,
2325 pkts, pkt_cnt, should_steal);
2326 rte_spinlock_unlock(&policer->policer_lock);
2327
2328 return cnt;
2329 }
2330
2331 static bool
2332 is_vhost_running(struct netdev_dpdk *dev)
2333 {
2334 return (netdev_dpdk_get_vid(dev) >= 0 && dev->vhost_reconfigured);
2335 }
2336
2337 static inline void
2338 netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
2339 unsigned int packet_size)
2340 {
2341 /* Hard-coded search for the size bucket. */
2342 if (packet_size < 256) {
2343 if (packet_size >= 128) {
2344 stats->rx_128_to_255_packets++;
2345 } else if (packet_size <= 64) {
2346 stats->rx_1_to_64_packets++;
2347 } else {
2348 stats->rx_65_to_127_packets++;
2349 }
2350 } else {
2351 if (packet_size >= 1523) {
2352 stats->rx_1523_to_max_packets++;
2353 } else if (packet_size >= 1024) {
2354 stats->rx_1024_to_1522_packets++;
2355 } else if (packet_size < 512) {
2356 stats->rx_256_to_511_packets++;
2357 } else {
2358 stats->rx_512_to_1023_packets++;
2359 }
2360 }
2361 }
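/*
 * Bucket mapping example (illustrative): a 60-byte frame counts toward
 * rx_1_to_64_packets, a 1518-byte frame toward rx_1024_to_1522_packets,
 * and anything of 1523 bytes or more toward rx_1523_to_max_packets.
 */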
2362
2363 static inline void
2364 netdev_dpdk_vhost_update_rx_counters(struct netdev_dpdk *dev,
2365 struct dp_packet **packets, int count,
2366 int qos_drops)
2367 {
2368 struct netdev_stats *stats = &dev->stats;
2369 struct dp_packet *packet;
2370 unsigned int packet_size;
2371 int i;
2372
2373 stats->rx_packets += count;
2374 stats->rx_dropped += qos_drops;
2375 for (i = 0; i < count; i++) {
2376 packet = packets[i];
2377 packet_size = dp_packet_size(packet);
2378
2379 if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) {
2380 /* This only protects the multicast counting below from packets
2381 * that are too short; it does not stop the packet from being
2382 * processed further. */
2383 stats->rx_errors++;
2384 stats->rx_length_errors++;
2385 continue;
2386 }
2387
2388 netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);
2389
2390 struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);
2391 if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {
2392 stats->multicast++;
2393 }
2394
2395 stats->rx_bytes += packet_size;
2396 }
2397
2398 if (OVS_UNLIKELY(qos_drops)) {
2399 dev->sw_stats->rx_qos_drops += qos_drops;
2400 }
2401 }
2402
2403 /*
2404 * The receive path for the vhost port is the TX path out of the guest.
2405 */
2406 static int
2407 netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
2408 struct dp_packet_batch *batch, int *qfill)
2409 {
2410 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
2411 struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
2412 uint16_t nb_rx = 0;
2413 uint16_t qos_drops = 0;
2414 int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
2415 int vid = netdev_dpdk_get_vid(dev);
2416
2417 if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
2418 || !(dev->flags & NETDEV_UP))) {
2419 return EAGAIN;
2420 }
2421
2422 nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp,
2423 (struct rte_mbuf **) batch->packets,
2424 NETDEV_MAX_BURST);
2425 if (!nb_rx) {
2426 return EAGAIN;
2427 }
2428
2429 if (qfill) {
2430 if (nb_rx == NETDEV_MAX_BURST) {
2431 /* The DPDK API returns a uint32_t that often has invalid bits in
2432 * the upper 16 bits, so restrict the value to a uint16_t. */
2433 *qfill = rte_vhost_rx_queue_count(vid, qid) & UINT16_MAX;
2434 } else {
2435 *qfill = 0;
2436 }
2437 }
2438
2439 if (policer) {
2440 qos_drops = nb_rx;
2441 nb_rx = ingress_policer_run(policer,
2442 (struct rte_mbuf **) batch->packets,
2443 nb_rx, true);
2444 qos_drops -= nb_rx;
2445 }
2446
2447 rte_spinlock_lock(&dev->stats_lock);
2448 netdev_dpdk_vhost_update_rx_counters(dev, batch->packets,
2449 nb_rx, qos_drops);
2450 rte_spinlock_unlock(&dev->stats_lock);
2451
2452 batch->count = nb_rx;
2453 dp_packet_batch_init_packet_fields(batch);
2454
2455 return 0;
2456 }
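/*
 * Virtqueue mapping sketch (illustrative): with VIRTIO_QNUM == 2, OVS rx
 * queue N dequeues from guest vring N * 2 + VIRTIO_TXQ, i.e. rx queue 0
 * reads the guest's TX ring 1, rx queue 1 reads ring 3, and so on.  The
 * transmit path further below uses the mirror calculation with VIRTIO_RXQ.
 */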
2457
2458 static bool
2459 netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq)
2460 {
2461 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
2462
2463 return dev->vhost_rxq_enabled[rxq->queue_id];
2464 }
2465
2466 static int
2467 netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
2468 int *qfill)
2469 {
2470 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
2471 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
2472 struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
2473 int nb_rx;
2474 int dropped = 0;
2475
2476 if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
2477 return EAGAIN;
2478 }
2479
2480 nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
2481 (struct rte_mbuf **) batch->packets,
2482 NETDEV_MAX_BURST);
2483 if (!nb_rx) {
2484 return EAGAIN;
2485 }
2486
2487 if (policer) {
2488 dropped = nb_rx;
2489 nb_rx = ingress_policer_run(policer,
2490 (struct rte_mbuf **) batch->packets,
2491 nb_rx, true);
2492 dropped -= nb_rx;
2493 }
2494
2495 /* Update stats to reflect dropped packets */
2496 if (OVS_UNLIKELY(dropped)) {
2497 rte_spinlock_lock(&dev->stats_lock);
2498 dev->stats.rx_dropped += dropped;
2499 dev->sw_stats->rx_qos_drops += dropped;
2500 rte_spinlock_unlock(&dev->stats_lock);
2501 }
2502
2503 batch->count = nb_rx;
2504 dp_packet_batch_init_packet_fields(batch);
2505
2506 if (qfill) {
2507 if (nb_rx == NETDEV_MAX_BURST) {
2508 *qfill = rte_eth_rx_queue_count(rx->port_id, rxq->queue_id);
2509 } else {
2510 *qfill = 0;
2511 }
2512 }
2513
2514 return 0;
2515 }
2516
2517 static inline int
2518 netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
2519 int cnt, bool should_steal)
2520 {
2521 struct qos_conf *qos_conf = ovsrcu_get(struct qos_conf *, &dev->qos_conf);
2522
2523 if (qos_conf) {
2524 rte_spinlock_lock(&qos_conf->lock);
2525 cnt = qos_conf->ops->qos_run(qos_conf, pkts, cnt, should_steal);
2526 rte_spinlock_unlock(&qos_conf->lock);
2527 }
2528
2529 return cnt;
2530 }
2531
2532 static int
2533 netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
2534 int pkt_cnt)
2535 {
2536 int i = 0;
2537 int cnt = 0;
2538 struct rte_mbuf *pkt;
2539
2540 /* Filter out oversized packets, unless they are marked for TSO. */
2541 for (i = 0; i < pkt_cnt; i++) {
2542 pkt = pkts[i];
2543 if (OVS_UNLIKELY((pkt->pkt_len > dev->max_packet_len)
2544 && !(pkt->ol_flags & PKT_TX_TCP_SEG))) {
2545 VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " "
2546 "max_packet_len %d", dev->up.name, pkt->pkt_len,
2547 dev->max_packet_len);
2548 rte_pktmbuf_free(pkt);
2549 continue;
2550 }
2551
2552 if (OVS_UNLIKELY(i != cnt)) {
2553 pkts[cnt] = pkt;
2554 }
2555 cnt++;
2556 }
2557
2558 return cnt;
2559 }
2560
2561 static inline void
2562 netdev_dpdk_vhost_update_tx_counters(struct netdev_dpdk *dev,
2563 struct dp_packet **packets,
2564 int attempted,
2565 struct netdev_dpdk_sw_stats *sw_stats_add)
2566 {
2567 int dropped = sw_stats_add->tx_mtu_exceeded_drops +
2568 sw_stats_add->tx_qos_drops +
2569 sw_stats_add->tx_failure_drops +
2570 sw_stats_add->tx_invalid_hwol_drops;
2571 struct netdev_stats *stats = &dev->stats;
2572 int sent = attempted - dropped;
2573 int i;
2574
2575 stats->tx_packets += sent;
2576 stats->tx_dropped += dropped;
2577
2578 for (i = 0; i < sent; i++) {
2579 stats->tx_bytes += dp_packet_size(packets[i]);
2580 }
2581
2582 if (OVS_UNLIKELY(dropped || sw_stats_add->tx_retries)) {
2583 struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
2584
2585 sw_stats->tx_retries += sw_stats_add->tx_retries;
2586 sw_stats->tx_failure_drops += sw_stats_add->tx_failure_drops;
2587 sw_stats->tx_mtu_exceeded_drops += sw_stats_add->tx_mtu_exceeded_drops;
2588 sw_stats->tx_qos_drops += sw_stats_add->tx_qos_drops;
2589 sw_stats->tx_invalid_hwol_drops += sw_stats_add->tx_invalid_hwol_drops;
2590 }
2591 }
2592
2593 static void
2594 __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
2595 struct dp_packet **pkts, int cnt)
2596 {
2597 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2598 struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
2599 struct netdev_dpdk_sw_stats sw_stats_add;
2600 unsigned int n_packets_to_free = cnt;
2601 unsigned int total_packets = cnt;
2602 int i, retries = 0;
2603 int max_retries = VHOST_ENQ_RETRY_MIN;
2604 int vid = netdev_dpdk_get_vid(dev);
2605
2606 qid = dev->tx_q[qid % netdev->n_txq].map;
2607
2608 if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0
2609 || !(dev->flags & NETDEV_UP))) {
2610 rte_spinlock_lock(&dev->stats_lock);
2611 dev->stats.tx_dropped += cnt;
2612 rte_spinlock_unlock(&dev->stats_lock);
2613 goto out;
2614 }
2615
2616 if (OVS_UNLIKELY(!rte_spinlock_trylock(&dev->tx_q[qid].tx_lock))) {
2617 COVERAGE_INC(vhost_tx_contention);
2618 rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
2619 }
2620
2621 sw_stats_add.tx_invalid_hwol_drops = cnt;
2622 if (userspace_tso_enabled()) {
2623 cnt = netdev_dpdk_prep_hwol_batch(dev, cur_pkts, cnt);
2624 }
2625
2626 sw_stats_add.tx_invalid_hwol_drops -= cnt;
2627 sw_stats_add.tx_mtu_exceeded_drops = cnt;
2628 cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
2629 sw_stats_add.tx_mtu_exceeded_drops -= cnt;
2630
2631 /* Check whether QoS has been configured for the netdev. */
2632 sw_stats_add.tx_qos_drops = cnt;
2633 cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt, true);
2634 sw_stats_add.tx_qos_drops -= cnt;
2635
2636 n_packets_to_free = cnt;
2637
2638 do {
2639 int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
2640 unsigned int tx_pkts;
2641
2642 tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt);
2643 if (OVS_LIKELY(tx_pkts)) {
2644 /* Packets have been sent. */
2645 cnt -= tx_pkts;
2646 /* Prepare for a possible retry. */
2647 cur_pkts = &cur_pkts[tx_pkts];
2648 if (OVS_UNLIKELY(cnt && !retries)) {
2649 /*
2650 * Read max retries as there are packets not sent
2651 * and no retries have already occurred.
2652 */
2653 atomic_read_relaxed(&dev->vhost_tx_retries_max, &max_retries);
2654 }
2655 } else {
2656 /* No packets sent; do not retry. */
2657 break;
2658 }
2659 } while (cnt && (retries++ < max_retries));
2660
2661 rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
2662
2663 sw_stats_add.tx_failure_drops = cnt;
2664 sw_stats_add.tx_retries = MIN(retries, max_retries);
2665
2666 rte_spinlock_lock(&dev->stats_lock);
2667 netdev_dpdk_vhost_update_tx_counters(dev, pkts, total_packets,
2668 &sw_stats_add);
2669 rte_spinlock_unlock(&dev->stats_lock);
2670
2671 out:
2672 for (i = 0; i < n_packets_to_free; i++) {
2673 dp_packet_delete(pkts[i]);
2674 }
2675 }
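/*
 * Accounting pattern used above (illustrative walk-through): each drop
 * counter is seeded with the packet count before a stage and reduced by
 * the count that survives it, e.g. if 32 packets enter QoS and 30 come
 * back, tx_qos_drops ends up as 2.  Whatever remains unsent after the
 * retry loop is charged to tx_failure_drops.
 */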
2676
2677 static void
2678 netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque)
2679 {
2680 rte_free(opaque);
2681 }
2682
2683 static struct rte_mbuf *
2684 dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len)
2685 {
2686 uint32_t total_len = RTE_PKTMBUF_HEADROOM + data_len;
2687 struct rte_mbuf_ext_shared_info *shinfo = NULL;
2688 uint16_t buf_len;
2689 void *buf;
2690
2691 if (rte_pktmbuf_tailroom(pkt) >= sizeof *shinfo) {
2692 shinfo = rte_pktmbuf_mtod(pkt, struct rte_mbuf_ext_shared_info *);
2693 } else {
2694 total_len += sizeof *shinfo + sizeof(uintptr_t);
2695 total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
2696 }
2697
2698 if (OVS_UNLIKELY(total_len > UINT16_MAX)) {
2699 VLOG_ERR("Can't copy packet: too big %u", total_len);
2700 return NULL;
2701 }
2702
2703 buf_len = total_len;
2704 buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
2705 if (OVS_UNLIKELY(buf == NULL)) {
2706 VLOG_ERR("Failed to allocate memory using rte_malloc: %u", buf_len);
2707 return NULL;
2708 }
2709
2710 /* Initialize shinfo. */
2711 if (shinfo) {
2712 shinfo->free_cb = netdev_dpdk_extbuf_free;
2713 shinfo->fcb_opaque = buf;
2714 rte_mbuf_ext_refcnt_set(shinfo, 1);
2715 } else {
2716 shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
2717 netdev_dpdk_extbuf_free,
2718 buf);
2719 if (OVS_UNLIKELY(shinfo == NULL)) {
2720 rte_free(buf);
2721 VLOG_ERR("Failed to initialize shared info for mbuf while "
2722 "attempting to attach an external buffer.");
2723 return NULL;
2724 }
2725 }
2726
2727 rte_pktmbuf_attach_extbuf(pkt, buf, rte_malloc_virt2iova(buf), buf_len,
2728 shinfo);
2729 rte_pktmbuf_reset_headroom(pkt);
2730
2731 return pkt;
2732 }
2733
2734 static struct rte_mbuf *
2735 dpdk_pktmbuf_alloc(struct rte_mempool *mp, uint32_t data_len)
2736 {
2737 struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
2738
2739 if (OVS_UNLIKELY(!pkt)) {
2740 return NULL;
2741 }
2742
2743 if (rte_pktmbuf_tailroom(pkt) >= data_len) {
2744 return pkt;
2745 }
2746
2747 if (dpdk_pktmbuf_attach_extbuf(pkt, data_len)) {
2748 return pkt;
2749 }
2750
2751 rte_pktmbuf_free(pkt);
2752
2753 return NULL;
2754 }
2755
2756 static struct dp_packet *
2757 dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
2758 {
2759 struct rte_mbuf *mbuf_dest;
2760 struct dp_packet *pkt_dest;
2761 uint32_t pkt_len;
2762
2763 pkt_len = dp_packet_size(pkt_orig);
2764 mbuf_dest = dpdk_pktmbuf_alloc(mp, pkt_len);
2765 if (OVS_UNLIKELY(mbuf_dest == NULL)) {
2766 return NULL;
2767 }
2768
2769 pkt_dest = CONTAINER_OF(mbuf_dest, struct dp_packet, mbuf);
2770 memcpy(dp_packet_data(pkt_dest), dp_packet_data(pkt_orig), pkt_len);
2771 dp_packet_set_size(pkt_dest, pkt_len);
2772
2773 mbuf_dest->tx_offload = pkt_orig->mbuf.tx_offload;
2774 mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
2775 mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
2776 ~(EXT_ATTACHED_MBUF | IND_ATTACHED_MBUF));
2777
2778 memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
2779 sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
2780
2781 if (mbuf_dest->ol_flags & PKT_TX_L4_MASK) {
2782 mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest)
2783 - (char *)dp_packet_eth(pkt_dest);
2784 mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest)
2785 - (char *) dp_packet_l3(pkt_dest);
2786 }
2787
2788 return pkt_dest;
2789 }
2790
2791 /* Tx function that copies packets into DPDK mbufs before transmitting them. */
2792 static void
2793 dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
2794 OVS_NO_THREAD_SAFETY_ANALYSIS
2795 {
2796 const size_t batch_cnt = dp_packet_batch_size(batch);
2797 #if !defined(__CHECKER__) && !defined(_WIN32)
2798 const size_t PKT_ARRAY_SIZE = batch_cnt;
2799 #else
2800 /* Sparse or MSVC doesn't like variable length array. */
2801 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
2802 #endif
2803 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2804 struct dp_packet *pkts[PKT_ARRAY_SIZE];
2805 struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
2806 uint32_t cnt = batch_cnt;
2807 uint32_t dropped = 0;
2808 uint32_t tx_failure = 0;
2809 uint32_t mtu_drops = 0;
2810 uint32_t qos_drops = 0;
2811
2812 if (dev->type != DPDK_DEV_VHOST) {
2813 /* Check if QoS has been configured for this netdev. */
2814 cnt = netdev_dpdk_qos_run(dev, (struct rte_mbuf **) batch->packets,
2815 batch_cnt, false);
2816 qos_drops = batch_cnt - cnt;
2817 }
2818
2819 uint32_t txcnt = 0;
2820
2821 for (uint32_t i = 0; i < cnt; i++) {
2822 struct dp_packet *packet = batch->packets[i];
2823 uint32_t size = dp_packet_size(packet);
2824
2825 if (size > dev->max_packet_len
2826 && !(packet->mbuf.ol_flags & PKT_TX_TCP_SEG)) {
2827 VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d", size,
2828 dev->max_packet_len);
2829 mtu_drops++;
2830 continue;
2831 }
2832
2833 pkts[txcnt] = dpdk_copy_dp_packet_to_mbuf(dev->dpdk_mp->mp, packet);
2834 if (OVS_UNLIKELY(!pkts[txcnt])) {
2835 dropped = cnt - i;
2836 break;
2837 }
2838
2839 txcnt++;
2840 }
2841
2842 if (OVS_LIKELY(txcnt)) {
2843 if (dev->type == DPDK_DEV_VHOST) {
2844 __netdev_dpdk_vhost_send(netdev, qid, pkts, txcnt);
2845 } else {
2846 tx_failure += netdev_dpdk_eth_tx_burst(dev, qid,
2847 (struct rte_mbuf **)pkts,
2848 txcnt);
2849 }
2850 }
2851
2852 dropped += qos_drops + mtu_drops + tx_failure;
2853 if (OVS_UNLIKELY(dropped)) {
2854 rte_spinlock_lock(&dev->stats_lock);
2855 dev->stats.tx_dropped += dropped;
2856 sw_stats->tx_failure_drops += tx_failure;
2857 sw_stats->tx_mtu_exceeded_drops += mtu_drops;
2858 sw_stats->tx_qos_drops += qos_drops;
2859 rte_spinlock_unlock(&dev->stats_lock);
2860 }
2861 }
2862
2863 static int
2864 netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
2865 struct dp_packet_batch *batch,
2866 bool concurrent_txq OVS_UNUSED)
2867 {
2868
2869 if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
2870 dpdk_do_tx_copy(netdev, qid, batch);
2871 dp_packet_delete_batch(batch, true);
2872 } else {
2873 __netdev_dpdk_vhost_send(netdev, qid, batch->packets,
2874 dp_packet_batch_size(batch));
2875 }
2876 return 0;
2877 }
2878
2879 static inline void
2880 netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
2881 struct dp_packet_batch *batch,
2882 bool concurrent_txq)
2883 {
2884 if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
2885 dp_packet_delete_batch(batch, true);
2886 return;
2887 }
2888
2889 if (OVS_UNLIKELY(concurrent_txq)) {
2890 qid = qid % dev->up.n_txq;
2891 rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
2892 }
2893
2894 if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
2895 struct netdev *netdev = &dev->up;
2896
2897 dpdk_do_tx_copy(netdev, qid, batch);
2898 dp_packet_delete_batch(batch, true);
2899 } else {
2900 struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
2901 int dropped;
2902 int tx_failure, mtu_drops, qos_drops, hwol_drops;
2903 int batch_cnt = dp_packet_batch_size(batch);
2904 struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;
2905
2906 hwol_drops = batch_cnt;
2907 if (userspace_tso_enabled()) {
2908 batch_cnt = netdev_dpdk_prep_hwol_batch(dev, pkts, batch_cnt);
2909 }
2910 hwol_drops -= batch_cnt;
2911 mtu_drops = batch_cnt;
2912 batch_cnt = netdev_dpdk_filter_packet_len(dev, pkts, batch_cnt);
2913 mtu_drops -= batch_cnt;
2914 qos_drops = batch_cnt;
2915 batch_cnt = netdev_dpdk_qos_run(dev, pkts, batch_cnt, true);
2916 qos_drops -= batch_cnt;
2917
2918 tx_failure = netdev_dpdk_eth_tx_burst(dev, qid, pkts, batch_cnt);
2919
2920 dropped = tx_failure + mtu_drops + qos_drops + hwol_drops;
2921 if (OVS_UNLIKELY(dropped)) {
2922 rte_spinlock_lock(&dev->stats_lock);
2923 dev->stats.tx_dropped += dropped;
2924 sw_stats->tx_failure_drops += tx_failure;
2925 sw_stats->tx_mtu_exceeded_drops += mtu_drops;
2926 sw_stats->tx_qos_drops += qos_drops;
2927 sw_stats->tx_invalid_hwol_drops += hwol_drops;
2928 rte_spinlock_unlock(&dev->stats_lock);
2929 }
2930 }
2931
2932 if (OVS_UNLIKELY(concurrent_txq)) {
2933 rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
2934 }
2935 }
2936
2937 static int
2938 netdev_dpdk_eth_send(struct netdev *netdev, int qid,
2939 struct dp_packet_batch *batch, bool concurrent_txq)
2940 {
2941 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2942
2943 netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
2944 return 0;
2945 }
2946
2947 static int
2948 netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
2949 {
2950 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2951
2952 ovs_mutex_lock(&dev->mutex);
2953 if (!eth_addr_equals(dev->hwaddr, mac)) {
2954 dev->hwaddr = mac;
2955 netdev_change_seq_changed(netdev);
2956 }
2957 ovs_mutex_unlock(&dev->mutex);
2958
2959 return 0;
2960 }
2961
2962 static int
2963 netdev_dpdk_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
2964 {
2965 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2966
2967 ovs_mutex_lock(&dev->mutex);
2968 *mac = dev->hwaddr;
2969 ovs_mutex_unlock(&dev->mutex);
2970
2971 return 0;
2972 }
2973
2974 static int
2975 netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
2976 {
2977 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2978
2979 ovs_mutex_lock(&dev->mutex);
2980 *mtup = dev->mtu;
2981 ovs_mutex_unlock(&dev->mutex);
2982
2983 return 0;
2984 }
2985
2986 static int
2987 netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
2988 {
2989 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2990
2991 /* XXX: Ensure that the overall frame length implied by the requested MTU does
2992 * not exceed NETDEV_DPDK_MAX_PKT_LEN. DPDK device drivers differ in how
2993 * the L2 frame length is calculated for a given MTU when
2994 * rte_eth_dev_set_mtu(mtu) is called: e.g. the i40e driver includes 2 x vlan
2995 * headers, the em driver includes 1 x vlan header, and the ixgbe driver does
2996 * not include vlan headers. As such, we should use
2997 * MTU_TO_MAX_FRAME_LEN(mtu), which includes an additional 2 x vlan headers
2998 * (8 bytes), for comparison. This avoids a failure later with
2999 * rte_eth_dev_set_mtu(). This approach should be used until DPDK provides
3000 * a method to retrieve the upper-bound MTU for a given device.
3001 */
3002 if (MTU_TO_MAX_FRAME_LEN(mtu) > NETDEV_DPDK_MAX_PKT_LEN
3003 || mtu < RTE_ETHER_MIN_MTU) {
3004 VLOG_WARN("%s: unsupported MTU %d\n", dev->up.name, mtu);
3005 return EINVAL;
3006 }
3007
3008 ovs_mutex_lock(&dev->mutex);
3009 if (dev->requested_mtu != mtu) {
3010 dev->requested_mtu = mtu;
3011 netdev_request_reconfigure(netdev);
3012 }
3013 ovs_mutex_unlock(&dev->mutex);
3014
3015 return 0;
3016 }
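/*
 * Worked example of the check above (assuming the usual 14-byte Ethernet
 * header, 4-byte CRC and two 4-byte VLAN headers):
 * MTU_TO_MAX_FRAME_LEN(mtu) = mtu + 26, so an MTU of 9702 gives
 * 9728 == NETDEV_DPDK_MAX_PKT_LEN and is accepted, while 9703 gives 9729
 * and is rejected with EINVAL.
 */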
3017
3018 static int
3019 netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
3020
3021 static int
3022 netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
3023 struct netdev_stats *stats)
3024 {
3025 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3026
3027 ovs_mutex_lock(&dev->mutex);
3028
3029 rte_spinlock_lock(&dev->stats_lock);
3030 /* Supported Stats */
3031 stats->rx_packets = dev->stats.rx_packets;
3032 stats->tx_packets = dev->stats.tx_packets;
3033 stats->rx_dropped = dev->stats.rx_dropped;
3034 stats->tx_dropped = dev->stats.tx_dropped;
3035 stats->multicast = dev->stats.multicast;
3036 stats->rx_bytes = dev->stats.rx_bytes;
3037 stats->tx_bytes = dev->stats.tx_bytes;
3038 stats->rx_errors = dev->stats.rx_errors;
3039 stats->rx_length_errors = dev->stats.rx_length_errors;
3040
3041 stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
3042 stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;
3043 stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;
3044 stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;
3045 stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;
3046 stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;
3047 stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;
3048
3049 rte_spinlock_unlock(&dev->stats_lock);
3050
3051 ovs_mutex_unlock(&dev->mutex);
3052
3053 return 0;
3054 }
3055
3056 static void
3057 netdev_dpdk_convert_xstats(struct netdev_stats *stats,
3058 const struct rte_eth_xstat *xstats,
3059 const struct rte_eth_xstat_name *names,
3060 const unsigned int size)
3061 {
3062 /* DPDK XSTATS Counter names definition. */
3063 #define DPDK_XSTATS \
3064 DPDK_XSTAT(multicast, "rx_multicast_packets" ) \
3065 DPDK_XSTAT(tx_multicast_packets, "tx_multicast_packets" ) \
3066 DPDK_XSTAT(rx_broadcast_packets, "rx_broadcast_packets" ) \
3067 DPDK_XSTAT(tx_broadcast_packets, "tx_broadcast_packets" ) \
3068 DPDK_XSTAT(rx_undersized_errors, "rx_undersized_errors" ) \
3069 DPDK_XSTAT(rx_oversize_errors, "rx_oversize_errors" ) \
3070 DPDK_XSTAT(rx_fragmented_errors, "rx_fragmented_errors" ) \
3071 DPDK_XSTAT(rx_jabber_errors, "rx_jabber_errors" ) \
3072 DPDK_XSTAT(rx_1_to_64_packets, "rx_size_64_packets" ) \
3073 DPDK_XSTAT(rx_65_to_127_packets, "rx_size_65_to_127_packets" ) \
3074 DPDK_XSTAT(rx_128_to_255_packets, "rx_size_128_to_255_packets" ) \
3075 DPDK_XSTAT(rx_256_to_511_packets, "rx_size_256_to_511_packets" ) \
3076 DPDK_XSTAT(rx_512_to_1023_packets, "rx_size_512_to_1023_packets" ) \
3077 DPDK_XSTAT(rx_1024_to_1522_packets, "rx_size_1024_to_1522_packets" ) \
3078 DPDK_XSTAT(rx_1523_to_max_packets, "rx_size_1523_to_max_packets" ) \
3079 DPDK_XSTAT(tx_1_to_64_packets, "tx_size_64_packets" ) \
3080 DPDK_XSTAT(tx_65_to_127_packets, "tx_size_65_to_127_packets" ) \
3081 DPDK_XSTAT(tx_128_to_255_packets, "tx_size_128_to_255_packets" ) \
3082 DPDK_XSTAT(tx_256_to_511_packets, "tx_size_256_to_511_packets" ) \
3083 DPDK_XSTAT(tx_512_to_1023_packets, "tx_size_512_to_1023_packets" ) \
3084 DPDK_XSTAT(tx_1024_to_1522_packets, "tx_size_1024_to_1522_packets" ) \
3085 DPDK_XSTAT(tx_1523_to_max_packets, "tx_size_1523_to_max_packets" )
3086
3087 for (unsigned int i = 0; i < size; i++) {
3088 #define DPDK_XSTAT(MEMBER, NAME) \
3089 if (strcmp(NAME, names[i].name) == 0) { \
3090 stats->MEMBER = xstats[i].value; \
3091 continue; \
3092 }
3093 DPDK_XSTATS;
3094 #undef DPDK_XSTAT
3095 }
3096 #undef DPDK_XSTATS
3097 }
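/*
 * Macro expansion sketch (illustrative): inside the loop above, the entry
 * DPDK_XSTAT(multicast, "rx_multicast_packets") expands to roughly:
 *
 *   if (strcmp("rx_multicast_packets", names[i].name) == 0) {
 *       stats->multicast = xstats[i].value;
 *       continue;
 *   }
 *
 * so each xstat name that matches a known counter is copied into the
 * corresponding netdev_stats member.
 */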
3098
3099 static int
3100 netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
3101 {
3102 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3103 struct rte_eth_stats rte_stats;
3104 bool gg;
3105
3106 netdev_dpdk_get_carrier(netdev, &gg);
3107 ovs_mutex_lock(&dev->mutex);
3108
3109 struct rte_eth_xstat *rte_xstats = NULL;
3110 struct rte_eth_xstat_name *rte_xstats_names = NULL;
3111 int rte_xstats_len, rte_xstats_new_len, rte_xstats_ret;
3112
3113 if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
3114 VLOG_ERR("Can't get ETH statistics for port: "DPDK_PORT_ID_FMT,
3115 dev->port_id);
3116 ovs_mutex_unlock(&dev->mutex);
3117 return EPROTO;
3118 }
3119
3120 /* Get length of statistics */
3121 rte_xstats_len = rte_eth_xstats_get_names(dev->port_id, NULL, 0);
3122 if (rte_xstats_len < 0) {
3123 VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
3124 dev->port_id);
3125 goto out;
3126 }
3127 /* Reserve memory for xstats names and values */
3128 rte_xstats_names = xcalloc(rte_xstats_len, sizeof *rte_xstats_names);
3129 rte_xstats = xcalloc(rte_xstats_len, sizeof *rte_xstats);
3130
3131 /* Retrieve xstats names. */
3132 rte_xstats_new_len = rte_eth_xstats_get_names(dev->port_id,
3133 rte_xstats_names,
3134 rte_xstats_len);
3135 if (rte_xstats_new_len != rte_xstats_len) {
3136 VLOG_WARN("Cannot get XSTATS names for port: "DPDK_PORT_ID_FMT,
3137 dev->port_id);
3138 goto out;
3139 }
3140 /* Retrieve xstats values. */
3141 memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);
3142 rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
3143 rte_xstats_len);
3144 if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
3145 netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_names,
3146 rte_xstats_len);
3147 } else {
3148 VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
3149 dev->port_id);
3150 }
3151
3152 out:
3153 free(rte_xstats);
3154 free(rte_xstats_names);
3155
3156 stats->rx_packets = rte_stats.ipackets;
3157 stats->tx_packets = rte_stats.opackets;
3158 stats->rx_bytes = rte_stats.ibytes;
3159 stats->tx_bytes = rte_stats.obytes;
3160 stats->rx_errors = rte_stats.ierrors;
3161 stats->tx_errors = rte_stats.oerrors;
3162
3163 rte_spinlock_lock(&dev->stats_lock);
3164 stats->tx_dropped = dev->stats.tx_dropped;
3165 stats->rx_dropped = dev->stats.rx_dropped;
3166 rte_spinlock_unlock(&dev->stats_lock);
3167
3168 /* These are the available DPDK counters for packets not received due to
3169 * local resource constraints in DPDK and NIC respectively. */
3170 stats->rx_dropped += rte_stats.rx_nombuf + rte_stats.imissed;
3171 stats->rx_missed_errors = rte_stats.imissed;
3172
3173 ovs_mutex_unlock(&dev->mutex);
3174
3175 return 0;
3176 }
3177
3178 static int
3179 netdev_dpdk_get_custom_stats(const struct netdev *netdev,
3180 struct netdev_custom_stats *custom_stats)
3181 {
3182
3183 uint32_t i;
3184 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3185 int rte_xstats_ret, sw_stats_size;
3186
3187 netdev_dpdk_get_sw_custom_stats(netdev, custom_stats);
3188
3189 ovs_mutex_lock(&dev->mutex);
3190
3191 if (netdev_dpdk_configure_xstats(dev)) {
3192 uint64_t *values = xcalloc(dev->rte_xstats_ids_size,
3193 sizeof(uint64_t));
3194
3195 rte_xstats_ret =
3196 rte_eth_xstats_get_by_id(dev->port_id, dev->rte_xstats_ids,
3197 values, dev->rte_xstats_ids_size);
3198
3199 if (rte_xstats_ret > 0 &&
3200 rte_xstats_ret <= dev->rte_xstats_ids_size) {
3201
3202 sw_stats_size = custom_stats->size;
3203 custom_stats->size += rte_xstats_ret;
3204 custom_stats->counters = xrealloc(custom_stats->counters,
3205 custom_stats->size *
3206 sizeof *custom_stats->counters);
3207
3208 for (i = 0; i < rte_xstats_ret; i++) {
3209 ovs_strlcpy(custom_stats->counters[sw_stats_size + i].name,
3210 netdev_dpdk_get_xstat_name(dev,
3211 dev->rte_xstats_ids[i]),
3212 NETDEV_CUSTOM_STATS_NAME_SIZE);
3213 custom_stats->counters[sw_stats_size + i].value = values[i];
3214 }
3215 } else {
3216 VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
3217 dev->port_id);
3218 /* Clear the statistics cache so that it will be
3219 * reconfigured. */
3220 netdev_dpdk_clear_xstats(dev);
3221 }
3222
3223 free(values);
3224 }
3225
3226 ovs_mutex_unlock(&dev->mutex);
3227
3228 return 0;
3229 }
3230
3231 static int
3232 netdev_dpdk_get_sw_custom_stats(const struct netdev *netdev,
3233 struct netdev_custom_stats *custom_stats)
3234 {
3235 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3236 int i, n;
3237
3238 #define SW_CSTATS \
3239 SW_CSTAT(tx_retries) \
3240 SW_CSTAT(tx_failure_drops) \
3241 SW_CSTAT(tx_mtu_exceeded_drops) \
3242 SW_CSTAT(tx_qos_drops) \
3243 SW_CSTAT(rx_qos_drops) \
3244 SW_CSTAT(tx_invalid_hwol_drops)
3245
3246 #define SW_CSTAT(NAME) + 1
3247 custom_stats->size = SW_CSTATS;
3248 #undef SW_CSTAT
3249 custom_stats->counters = xcalloc(custom_stats->size,
3250 sizeof *custom_stats->counters);
3251
3252 ovs_mutex_lock(&dev->mutex);
3253
3254 rte_spinlock_lock(&dev->stats_lock);
3255 i = 0;
3256 #define SW_CSTAT(NAME) \
3257 custom_stats->counters[i++].value = dev->sw_stats->NAME;
3258 SW_CSTATS;
3259 #undef SW_CSTAT
3260 rte_spinlock_unlock(&dev->stats_lock);
3261
3262 ovs_mutex_unlock(&dev->mutex);
3263
3264 i = 0;
3265 n = 0;
3266 #define SW_CSTAT(NAME) \
3267 if (custom_stats->counters[i].value != UINT64_MAX) { \
3268 ovs_strlcpy(custom_stats->counters[n].name, \
3269 "ovs_"#NAME, NETDEV_CUSTOM_STATS_NAME_SIZE); \
3270 custom_stats->counters[n].value = custom_stats->counters[i].value; \
3271 n++; \
3272 } \
3273 i++;
3274 SW_CSTATS;
3275 #undef SW_CSTAT
3276
3277 custom_stats->size = n;
3278 return 0;
3279 }
3280
3281 static int
3282 netdev_dpdk_get_features(const struct netdev *netdev,
3283 enum netdev_features *current,
3284 enum netdev_features *advertised,
3285 enum netdev_features *supported,
3286 enum netdev_features *peer)
3287 {
3288 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3289 struct rte_eth_link link;
3290 uint32_t feature = 0;
3291
3292 ovs_mutex_lock(&dev->mutex);
3293 link = dev->link;
3294 ovs_mutex_unlock(&dev->mutex);
3295
3296 /* Match against OpenFlow defined link speed values. */
3297 if (link.link_duplex == ETH_LINK_FULL_DUPLEX) {
3298 switch (link.link_speed) {
3299 case ETH_SPEED_NUM_10M:
3300 feature |= NETDEV_F_10MB_FD;
3301 break;
3302 case ETH_SPEED_NUM_100M:
3303 feature |= NETDEV_F_100MB_FD;
3304 break;
3305 case ETH_SPEED_NUM_1G:
3306 feature |= NETDEV_F_1GB_FD;
3307 break;
3308 case ETH_SPEED_NUM_10G:
3309 feature |= NETDEV_F_10GB_FD;
3310 break;
3311 case ETH_SPEED_NUM_40G:
3312 feature |= NETDEV_F_40GB_FD;
3313 break;
3314 case ETH_SPEED_NUM_100G:
3315 feature |= NETDEV_F_100GB_FD;
3316 break;
3317 default:
3318 feature |= NETDEV_F_OTHER;
3319 }
3320 } else if (link.link_duplex == ETH_LINK_HALF_DUPLEX) {
3321 switch (link.link_speed) {
3322 case ETH_SPEED_NUM_10M:
3323 feature |= NETDEV_F_10MB_HD;
3324 break;
3325 case ETH_SPEED_NUM_100M:
3326 feature |= NETDEV_F_100MB_HD;
3327 break;
3328 case ETH_SPEED_NUM_1G:
3329 feature |= NETDEV_F_1GB_HD;
3330 break;
3331 default:
3332 feature |= NETDEV_F_OTHER;
3333 }
3334 }
3335
3336 if (link.link_autoneg) {
3337 feature |= NETDEV_F_AUTONEG;
3338 }
3339
3340 *current = feature;
3341 *advertised = *supported = *peer = 0;
3342
3343 return 0;
3344 }
3345
3346 static struct ingress_policer *
3347 netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
3348 {
3349 struct ingress_policer *policer = NULL;
3350 uint64_t rate_bytes;
3351 uint64_t burst_bytes;
3352 int err = 0;
3353
3354 policer = xmalloc(sizeof *policer);
3355 rte_spinlock_init(&policer->policer_lock);
3356
3357 /* rte_meter requires bytes so convert kbits rate and burst to bytes. */
3358 rate_bytes = rate * 1000ULL / 8;
3359 burst_bytes = burst * 1000ULL / 8;
3360
3361 policer->app_srtcm_params.cir = rate_bytes;
3362 policer->app_srtcm_params.cbs = burst_bytes;
3363 policer->app_srtcm_params.ebs = 0;
3364 err = rte_meter_srtcm_profile_config(&policer->in_prof,
3365 &policer->app_srtcm_params);
3366 if (!err) {
3367 err = rte_meter_srtcm_config(&policer->in_policer,
3368 &policer->in_prof);
3369 }
3370 if (err) {
3371 VLOG_ERR("Could not create rte meter for ingress policer");
3372 free(policer);
3373 return NULL;
3374 }
3375
3376 return policer;
3377 }
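/*
 * Worked conversion example (illustrative): a policer configured for
 * 10000 kbits/s with an 8000 kbit burst becomes
 * cir = 10000 * 1000 / 8 = 1,250,000 bytes/s and
 * cbs = 8000 * 1000 / 8 = 1,000,000 bytes in the srTCM profile above.
 */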
3378
3379 static int
3380 netdev_dpdk_set_policing(struct netdev* netdev, uint32_t policer_rate,
3381 uint32_t policer_burst)
3382 {
3383 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3384 struct ingress_policer *policer;
3385
3386 /* Force the burst to 0 if no rate is specified,
3387 * default to 8000 kbits if the burst is 0,
3388 * otherwise keep the user-specified value.
3389 */
3390 policer_burst = (!policer_rate ? 0
3391 : !policer_burst ? 8000
3392 : policer_burst);
3393
3394 ovs_mutex_lock(&dev->mutex);
3395
3396 policer = ovsrcu_get_protected(struct ingress_policer *,
3397 &dev->ingress_policer);
3398
3399 if (dev->policer_rate == policer_rate &&
3400 dev->policer_burst == policer_burst) {
3401 /* Assume that settings haven't changed since we last set them. */
3402 ovs_mutex_unlock(&dev->mutex);
3403 return 0;
3404 }
3405
3406 /* Destroy the existing ingress policer for the device, if there is one. */
3407 if (policer) {
3408 ovsrcu_postpone(free, policer);
3409 }
3410
3411 if (policer_rate != 0) {
3412 policer = netdev_dpdk_policer_construct(policer_rate, policer_burst);
3413 } else {
3414 policer = NULL;
3415 }
3416 ovsrcu_set(&dev->ingress_policer, policer);
3417 dev->policer_rate = policer_rate;
3418 dev->policer_burst = policer_burst;
3419 ovs_mutex_unlock(&dev->mutex);
3420
3421 return 0;
3422 }
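/*
 * Illustrative configuration sketch (interface name is a placeholder);
 * the rate and burst passed to this function typically originate from the
 * Interface table, e.g.:
 *
 *   ovs-vsctl set Interface dpdk0 ingress_policing_rate=10000 \
 *       ingress_policing_burst=1000
 *
 * A rate of 0 removes the policer entirely.
 */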
3423
3424 static int
3425 netdev_dpdk_get_ifindex(const struct netdev *netdev)
3426 {
3427 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3428
3429 ovs_mutex_lock(&dev->mutex);
3430 /* Calculate a hash from the netdev name. Ensure that the ifindex is a 24-bit
3431 * positive integer to meet RFC 2863 recommendations.
3432 */
3433 int ifindex = hash_string(netdev->name, 0) % 0xfffffe + 1;
3434 ovs_mutex_unlock(&dev->mutex);
3435
3436 return ifindex;
3437 }
3438
3439 static int
3440 netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
3441 {
3442 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3443
3444 ovs_mutex_lock(&dev->mutex);
3445 check_link_status(dev);
3446 *carrier = dev->link.link_status;
3447
3448 ovs_mutex_unlock(&dev->mutex);
3449
3450 return 0;
3451 }
3452
3453 static int
3454 netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
3455 {
3456 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3457
3458 ovs_mutex_lock(&dev->mutex);
3459
3460 if (is_vhost_running(dev)) {
3461 *carrier = 1;
3462 } else {
3463 *carrier = 0;
3464 }
3465
3466 ovs_mutex_unlock(&dev->mutex);
3467
3468 return 0;
3469 }
3470
3471 static long long int
3472 netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
3473 {
3474 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3475 long long int carrier_resets;
3476
3477 ovs_mutex_lock(&dev->mutex);
3478 carrier_resets = dev->link_reset_cnt;
3479 ovs_mutex_unlock(&dev->mutex);
3480
3481 return carrier_resets;
3482 }
3483
3484 static int
3485 netdev_dpdk_set_miimon(struct netdev *netdev OVS_UNUSED,
3486 long long int interval OVS_UNUSED)
3487 {
3488 return EOPNOTSUPP;
3489 }
3490
3491 static int
3492 netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
3493 enum netdev_flags off, enum netdev_flags on,
3494 enum netdev_flags *old_flagsp)
3495 OVS_REQUIRES(dev->mutex)
3496 {
3497 if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
3498 return EINVAL;
3499 }
3500
3501 *old_flagsp = dev->flags;
3502 dev->flags |= on;
3503 dev->flags &= ~off;
3504
3505 if (dev->flags == *old_flagsp) {
3506 return 0;
3507 }
3508
3509 if (dev->type == DPDK_DEV_ETH) {
3510
3511 if ((dev->flags ^ *old_flagsp) & NETDEV_UP) {
3512 int err;
3513
3514 if (dev->flags & NETDEV_UP) {
3515 err = rte_eth_dev_set_link_up(dev->port_id);
3516 } else {
3517 err = rte_eth_dev_set_link_down(dev->port_id);
3518 }
3519 if (err == -ENOTSUP) {
3520 VLOG_INFO("Interface %s does not support link state "
3521 "configuration", netdev_get_name(&dev->up));
3522 } else if (err < 0) {
3523 VLOG_ERR("Interface %s link change error: %s",
3524 netdev_get_name(&dev->up), rte_strerror(-err));
3525 dev->flags = *old_flagsp;
3526 return -err;
3527 }
3528 }
3529
3530 if (dev->flags & NETDEV_PROMISC) {
3531 rte_eth_promiscuous_enable(dev->port_id);
3532 }
3533
3534 netdev_change_seq_changed(&dev->up);
3535 } else {
3536 /* If the DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is
3537 * running, then bump the netdev's change_seq to trigger a link state
3538 * update. */
3539
3540 if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
3541 && is_vhost_running(dev)) {
3542 netdev_change_seq_changed(&dev->up);
3543
3544 /* Clear statistics if the device is coming up. */
3545 if (NETDEV_UP & on) {
3546 rte_spinlock_lock(&dev->stats_lock);
3547 memset(&dev->stats, 0, sizeof dev->stats);
3548 rte_spinlock_unlock(&dev->stats_lock);
3549 }
3550 }
3551 }
3552
3553 return 0;
3554 }
3555
3556 static int
3557 netdev_dpdk_update_flags(struct netdev *netdev,
3558 enum netdev_flags off, enum netdev_flags on,
3559 enum netdev_flags *old_flagsp)
3560 {
3561 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3562 int error;
3563
3564 ovs_mutex_lock(&dev->mutex);
3565 error = netdev_dpdk_update_flags__(dev, off, on, old_flagsp);
3566 ovs_mutex_unlock(&dev->mutex);
3567
3568 return error;
3569 }
3570
3571 static int
3572 netdev_dpdk_vhost_user_get_status(const struct netdev *netdev,
3573 struct smap *args)
3574 {
3575 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3576
3577 ovs_mutex_lock(&dev->mutex);
3578
3579 bool client_mode = dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT;
3580 smap_add_format(args, "mode", "%s", client_mode ? "client" : "server");
3581
3582 int vid = netdev_dpdk_get_vid(dev);
3583 if (vid < 0) {
3584 smap_add_format(args, "status", "disconnected");
3585 ovs_mutex_unlock(&dev->mutex);
3586 return 0;
3587 } else {
3588 smap_add_format(args, "status", "connected");
3589 }
3590
3591 char socket_name[PATH_MAX];
3592 if (!rte_vhost_get_ifname(vid, socket_name, PATH_MAX)) {
3593 smap_add_format(args, "socket", "%s", socket_name);
3594 }
3595
3596 uint64_t features;
3597 if (!rte_vhost_get_negotiated_features(vid, &features)) {
3598 smap_add_format(args, "features", "0x%016"PRIx64, features);
3599 }
3600
3601 uint16_t mtu;
3602 if (!rte_vhost_get_mtu(vid, &mtu)) {
3603 smap_add_format(args, "mtu", "%d", mtu);
3604 }
3605
3606 int numa = rte_vhost_get_numa_node(vid);
3607 if (numa >= 0) {
3608 smap_add_format(args, "numa", "%d", numa);
3609 }
3610
3611 uint16_t vring_num = rte_vhost_get_vring_num(vid);
3612 if (vring_num) {
3613 smap_add_format(args, "num_of_vrings", "%d", vring_num);
3614 }
3615
3616 for (int i = 0; i < vring_num; i++) {
3617 struct rte_vhost_vring vring;
3618
3619 rte_vhost_get_vhost_vring(vid, i, &vring);
3620 smap_add_nocopy(args, xasprintf("vring_%d_size", i),
3621 xasprintf("%d", vring.size));
3622 }
3623
3624 ovs_mutex_unlock(&dev->mutex);
3625 return 0;
3626 }
3627
3628 /*
3629 * Convert a given uint32_t link speed defined in DPDK to a string
3630 * equivalent.
3631 */
3632 static const char *
3633 netdev_dpdk_link_speed_to_str__(uint32_t link_speed)
3634 {
3635 switch (link_speed) {
3636 case ETH_SPEED_NUM_10M: return "10Mbps";
3637 case ETH_SPEED_NUM_100M: return "100Mbps";
3638 case ETH_SPEED_NUM_1G: return "1Gbps";
3639 case ETH_SPEED_NUM_2_5G: return "2.5Gbps";
3640 case ETH_SPEED_NUM_5G: return "5Gbps";
3641 case ETH_SPEED_NUM_10G: return "10Gbps";
3642 case ETH_SPEED_NUM_20G: return "20Gbps";
3643 case ETH_SPEED_NUM_25G: return "25Gbps";
3644 case ETH_SPEED_NUM_40G: return "40Gbps";
3645 case ETH_SPEED_NUM_50G: return "50Gbps";
3646 case ETH_SPEED_NUM_56G: return "56Gbps";
3647 case ETH_SPEED_NUM_100G: return "100Gbps";
3648 default: return "Not Defined";
3649 }
3650 }
3651
3652 static int
3653 netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
3654 {
3655 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3656 struct rte_eth_dev_info dev_info;
3657 uint32_t link_speed;
3658
3659 if (!rte_eth_dev_is_valid_port(dev->port_id)) {
3660 return ENODEV;
3661 }
3662
3663 ovs_mutex_lock(&dpdk_mutex);
3664 ovs_mutex_lock(&dev->mutex);
3665 rte_eth_dev_info_get(dev->port_id, &dev_info);
3666 link_speed = dev->link.link_speed;
3667 ovs_mutex_unlock(&dev->mutex);
3668 const struct rte_bus *bus;
3669 const struct rte_pci_device *pci_dev;
3670 uint16_t vendor_id = PCI_ANY_ID;
3671 uint16_t device_id = PCI_ANY_ID;
3672 bus = rte_bus_find_by_device(dev_info.device);
3673 if (bus && !strcmp(bus->name, "pci")) {
3674 pci_dev = RTE_DEV_TO_PCI(dev_info.device);
3675 if (pci_dev) {
3676 vendor_id = pci_dev->id.vendor_id;
3677 device_id = pci_dev->id.device_id;
3678 }
3679 }
3680 ovs_mutex_unlock(&dpdk_mutex);
3681
3682 smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id);
3683 smap_add_format(args, "numa_id", "%d",
3684 rte_eth_dev_socket_id(dev->port_id));
3685 smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
3686 smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
3687 smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
3688 smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
3689 smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
3690 smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
3691 smap_add_format(args, "max_hash_mac_addrs", "%u",
3692 dev_info.max_hash_mac_addrs);
3693 smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
3694 smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);
3695
3696 /* Querying the DPDK library for iftype may be done in future, pending
3697 * support; cf. RFC 3635 Section 3.2.4. */
3698 enum { IF_TYPE_ETHERNETCSMACD = 6 };
3699
3700 smap_add_format(args, "if_type", "%"PRIu32, IF_TYPE_ETHERNETCSMACD);
3701 smap_add_format(args, "if_descr", "%s %s", rte_version(),
3702 dev_info.driver_name);
3703 smap_add_format(args, "pci-vendor_id", "0x%x", vendor_id);
3704 smap_add_format(args, "pci-device_id", "0x%x", device_id);
3705
3706 /* Not all link speeds are defined in the OpenFlow specs, e.g. 25 Gbps.
3707 * In that case the speed will not be reported as part of the usual
3708 * call to get_features(). Get the link speed of the device and add it
3709 * to the device status in an easy-to-read string format.
3710 */
3711 smap_add(args, "link_speed",
3712 netdev_dpdk_link_speed_to_str__(link_speed));
3713
3714 return 0;
3715 }
3716
3717 static void
3718 netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
3719 OVS_REQUIRES(dev->mutex)
3720 {
3721 enum netdev_flags old_flags;
3722
3723 if (admin_state) {
3724 netdev_dpdk_update_flags__(dev, 0, NETDEV_UP, &old_flags);
3725 } else {
3726 netdev_dpdk_update_flags__(dev, NETDEV_UP, 0, &old_flags);
3727 }
3728 }
3729
3730 static void
3731 netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
3732 const char *argv[], void *aux OVS_UNUSED)
3733 {
3734 bool up;
3735
3736 if (!strcasecmp(argv[argc - 1], "up")) {
3737 up = true;
3738 } else if (!strcasecmp(argv[argc - 1], "down")) {
3739 up = false;
3740 } else {
3741 unixctl_command_reply_error(conn, "Invalid Admin State");
3742 return;
3743 }
3744
3745 if (argc > 2) {
3746 struct netdev *netdev = netdev_from_name(argv[1]);
3747
3748 if (netdev && is_dpdk_class(netdev->netdev_class)) {
3749 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3750
3751 ovs_mutex_lock(&dev->mutex);
3752 netdev_dpdk_set_admin_state__(dev, up);
3753 ovs_mutex_unlock(&dev->mutex);
3754
3755 netdev_close(netdev);
3756 } else {
3757 unixctl_command_reply_error(conn, "Not a DPDK Interface");
3758 netdev_close(netdev);
3759 return;
3760 }
3761 } else {
3762 struct netdev_dpdk *dev;
3763
3764 ovs_mutex_lock(&dpdk_mutex);
3765 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
3766 ovs_mutex_lock(&dev->mutex);
3767 netdev_dpdk_set_admin_state__(dev, up);
3768 ovs_mutex_unlock(&dev->mutex);
3769 }
3770 ovs_mutex_unlock(&dpdk_mutex);
3771 }
3772 unixctl_command_reply(conn, "OK");
3773 }
3774
3775 static void
3776 netdev_dpdk_detach(struct unixctl_conn *conn, int argc OVS_UNUSED,
3777 const char *argv[], void *aux OVS_UNUSED)
3778 {
3779 char *response;
3780 dpdk_port_t port_id;
3781 struct netdev_dpdk *dev;
3782 struct rte_device *rte_dev;
3783 struct ds used_interfaces = DS_EMPTY_INITIALIZER;
3784 bool used = false;
3785
3786 ovs_mutex_lock(&dpdk_mutex);
3787
3788 port_id = netdev_dpdk_get_port_by_devargs(argv[1]);
3789 if (!rte_eth_dev_is_valid_port(port_id)) {
3790 response = xasprintf("Device '%s' not found in DPDK", argv[1]);
3791 goto error;
3792 }
3793
3794 rte_dev = rte_eth_devices[port_id].device;
3795 ds_put_format(&used_interfaces,
3796 "Device '%s' is being used by the following interfaces:",
3797 argv[1]);
3798
3799 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
3800 /* FIXME: avoid direct access to DPDK array rte_eth_devices. */
3801 if (rte_eth_devices[dev->port_id].device == rte_dev
3802 && rte_eth_devices[dev->port_id].state != RTE_ETH_DEV_UNUSED) {
3803 used = true;
3804 ds_put_format(&used_interfaces, " %s",
3805 netdev_get_name(&dev->up));
3806 }
3807 }
3808
3809 if (used) {
3810 ds_put_cstr(&used_interfaces, ". Remove them before detaching.");
3811 response = ds_steal_cstr(&used_interfaces);
3812 ds_destroy(&used_interfaces);
3813 goto error;
3814 }
3815 ds_destroy(&used_interfaces);
3816
3817 rte_eth_dev_close(port_id);
3818 if (rte_dev_remove(rte_dev) < 0) {
3819 response = xasprintf("Device '%s' cannot be detached", argv[1]);
3820 goto error;
3821 }
3822
3823 response = xasprintf("All devices shared with device '%s' "
3824 "have been detached", argv[1]);
3825
3826 ovs_mutex_unlock(&dpdk_mutex);
3827 unixctl_command_reply(conn, response);
3828 free(response);
3829 return;
3830
3831 error:
3832 ovs_mutex_unlock(&dpdk_mutex);
3833 unixctl_command_reply_error(conn, response);
3834 free(response);
3835 }
3836
3837 static void
3838 netdev_dpdk_get_mempool_info(struct unixctl_conn *conn,
3839 int argc, const char *argv[],
3840 void *aux OVS_UNUSED)
3841 {
3842 size_t size;
3843 FILE *stream;
3844 char *response = NULL;
3845 struct netdev *netdev = NULL;
3846
3847 if (argc == 2) {
3848 netdev = netdev_from_name(argv[1]);
3849 if (!netdev || !is_dpdk_class(netdev->netdev_class)) {
3850 unixctl_command_reply_error(conn, "Not a DPDK Interface");
3851 goto out;
3852 }
3853 }
3854
3855 stream = open_memstream(&response, &size);
3856 if (!stream) {
3857 response = xasprintf("Unable to open memstream: %s.",
3858 ovs_strerror(errno));
3859 unixctl_command_reply_error(conn, response);
3860 goto out;
3861 }
3862
3863 if (netdev) {
3864 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3865
3866 ovs_mutex_lock(&dev->mutex);
3867 ovs_mutex_lock(&dpdk_mp_mutex);
3868
3869 rte_mempool_dump(stream, dev->dpdk_mp->mp);
3870
3871 ovs_mutex_unlock(&dpdk_mp_mutex);
3872 ovs_mutex_unlock(&dev->mutex);
3873 } else {
3874 ovs_mutex_lock(&dpdk_mp_mutex);
3875 rte_mempool_list_dump(stream);
3876 ovs_mutex_unlock(&dpdk_mp_mutex);
3877 }
3878
3879 fclose(stream);
3880
3881 unixctl_command_reply(conn, response);
3882 out:
3883 free(response);
3884 netdev_close(netdev);
3885 }
3886
3887 /*
3888 * Set virtqueue flags so that we do not receive interrupts.
3889 */
3890 static void
3891 set_irq_status(int vid)
3892 {
3893 uint32_t i;
3894
3895 for (i = 0; i < rte_vhost_get_vring_num(vid); i++) {
3896 rte_vhost_enable_guest_notification(vid, i, 0);
3897 }
3898 }
3899
3900 /*
3901 * Fixes mapping for vhost-user tx queues. Must be called after each
3902 * enabling/disabling of queues and n_txq modifications.
3903 */
3904 static void
3905 netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
3906 OVS_REQUIRES(dev->mutex)
3907 {
3908 int *enabled_queues, n_enabled = 0;
3909 int i, k, total_txqs = dev->up.n_txq;
3910
3911 enabled_queues = xcalloc(total_txqs, sizeof *enabled_queues);
3912
3913 for (i = 0; i < total_txqs; i++) {
3914 /* Enabled queues are always mapped to themselves. */
3915 if (dev->tx_q[i].map == i) {
3916 enabled_queues[n_enabled++] = i;
3917 }
3918 }
3919
3920 if (n_enabled == 0 && total_txqs != 0) {
3921 enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
3922 n_enabled = 1;
3923 }
3924
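/* Map every remaining (not self-mapped) txq onto the enabled queues in
 * round-robin order, so that each txq index resolves to a usable vring,
 * or to OVS_VHOST_QUEUE_DISABLED when no queue is enabled. */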
3925 k = 0;
3926 for (i = 0; i < total_txqs; i++) {
3927 if (dev->tx_q[i].map != i) {
3928 dev->tx_q[i].map = enabled_queues[k];
3929 k = (k + 1) % n_enabled;
3930 }
3931 }
3932
3933 if (VLOG_IS_DBG_ENABLED()) {
3934 struct ds mapping = DS_EMPTY_INITIALIZER;
3935
3936 ds_put_format(&mapping, "TX queue mapping for port '%s':\n",
3937 netdev_get_name(&dev->up));
3938 for (i = 0; i < total_txqs; i++) {
3939 ds_put_format(&mapping, "%2d --> %2d\n", i, dev->tx_q[i].map);
3940 }
3941
3942 VLOG_DBG("%s", ds_cstr(&mapping));
3943 ds_destroy(&mapping);
3944 }
3945
3946 free(enabled_queues);
3947 }
3948
3949 /*
3950 * A new virtio-net device is added to a vhost port.
3951 */
3952 static int
3953 new_device(int vid)
3954 {
3955 struct netdev_dpdk *dev;
3956 bool exists = false;
3957 int newnode = 0;
3958 char ifname[IF_NAME_SZ];
3959
3960 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
3961
3962 ovs_mutex_lock(&dpdk_mutex);
3963 /* Add device to the vhost port with the same name as that passed down. */
3964 LIST_FOR_EACH(dev, list_node, &dpdk_list) {
3965 ovs_mutex_lock(&dev->mutex);
3966 if (nullable_string_is_equal(ifname, dev->vhost_id)) {
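/* Each virtio queue pair consists of one RX and one TX vring, hence the
 * division by VIRTIO_QNUM. */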
3967 uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM;
3968
3969 /* Get NUMA information */
3970 newnode = rte_vhost_get_numa_node(vid);
3971 if (newnode == -1) {
3972 #ifdef VHOST_NUMA
3973 VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
3974 ifname);
3975 #endif
3976 newnode = dev->socket_id;
3977 }
3978
3979 if (dev->requested_n_txq < qp_num
3980 || dev->requested_n_rxq < qp_num
3981 || dev->requested_socket_id != newnode) {
3982 dev->requested_socket_id = newnode;
3983 dev->requested_n_rxq = qp_num;
3984 dev->requested_n_txq = qp_num;
3985 netdev_request_reconfigure(&dev->up);
3986 } else {
3987 /* Reconfiguration not required. */
3988 dev->vhost_reconfigured = true;
3989 }
3990
3991 ovsrcu_index_set(&dev->vid, vid);
3992 exists = true;
3993
3994 /* Disable notifications. */
3995 set_irq_status(vid);
3996 netdev_change_seq_changed(&dev->up);
3997 ovs_mutex_unlock(&dev->mutex);
3998 break;
3999 }
4000 ovs_mutex_unlock(&dev->mutex);
4001 }
4002 ovs_mutex_unlock(&dpdk_mutex);
4003
4004 if (!exists) {
4005 VLOG_INFO("vHost Device '%s' can't be added - name not found", ifname);
4006
4007 return -1;
4008 }
4009
4010 VLOG_INFO("vHost Device '%s' has been added on numa node %i",
4011 ifname, newnode);
4012
4013 return 0;
4014 }
4015
4016 /* Clears mapping for all available queues of the vhost interface. */
4017 static void
4018 netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
4019 OVS_REQUIRES(dev->mutex)
4020 {
4021 int i;
4022
4023 for (i = 0; i < dev->up.n_txq; i++) {
4024 dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
4025 }
4026 }
4027
4028 /*
4029 * Remove a virtio-net device from the specific vhost port. Use dev->remove
4030 * flag to stop any more packets from being sent or received to/from a VM and
4031 * ensure all currently queued packets have been sent/received before removing
4032 * the device.
4033 */
4034 static void
4035 destroy_device(int vid)
4036 {
4037 struct netdev_dpdk *dev;
4038 bool exists = false;
4039 char ifname[IF_NAME_SZ];
4040
4041 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
4042
4043 ovs_mutex_lock(&dpdk_mutex);
4044 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
4045 if (netdev_dpdk_get_vid(dev) == vid) {
4046
4047 ovs_mutex_lock(&dev->mutex);
4048 dev->vhost_reconfigured = false;
4049 ovsrcu_index_set(&dev->vid, -1);
4050 memset(dev->vhost_rxq_enabled, 0,
4051 dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled);
4052 netdev_dpdk_txq_map_clear(dev);
4053
4054 netdev_change_seq_changed(&dev->up);
4055 ovs_mutex_unlock(&dev->mutex);
4056 exists = true;
4057 break;
4058 }
4059 }
4060
4061 ovs_mutex_unlock(&dpdk_mutex);
4062
4063 if (exists) {
4064 /*
4065 * Wait for other threads to quiesce after setting the 'virtio_dev'
4066 * to NULL, before returning.
4067 */
4068 ovsrcu_synchronize();
4069 /*
4070 * As call to ovsrcu_synchronize() will end the quiescent state,
4071 * put thread back into quiescent state before returning.
4072 */
4073 ovsrcu_quiesce_start();
4074 VLOG_INFO("vHost Device '%s' has been removed", ifname);
4075 } else {
4076 VLOG_INFO("vHost Device '%s' not found", ifname);
4077 }
4078 }
4079
4080 static int
4081 vring_state_changed(int vid, uint16_t queue_id, int enable)
4082 {
4083 struct netdev_dpdk *dev;
4084 bool exists = false;
4085 int qid = queue_id / VIRTIO_QNUM;
4086 bool is_rx = (queue_id % VIRTIO_QNUM) == VIRTIO_TXQ;
4087 char ifname[IF_NAME_SZ];
4088
4089 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
4090
4091 ovs_mutex_lock(&dpdk_mutex);
4092 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
4093 ovs_mutex_lock(&dev->mutex);
4094 if (nullable_string_is_equal(ifname, dev->vhost_id)) {
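/* A guest TX vring is a receive queue from the OVS point of view, and
 * vice versa. */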
4095 if (is_rx) {
4096 bool old_state = dev->vhost_rxq_enabled[qid];
4097
4098 dev->vhost_rxq_enabled[qid] = enable != 0;
4099 if (old_state != dev->vhost_rxq_enabled[qid]) {
4100 netdev_change_seq_changed(&dev->up);
4101 }
4102 } else {
4103 if (enable) {
4104 dev->tx_q[qid].map = qid;
4105 } else {
4106 dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
4107 }
4108 netdev_dpdk_remap_txqs(dev);
4109 }
4110 exists = true;
4111 ovs_mutex_unlock(&dev->mutex);
4112 break;
4113 }
4114 ovs_mutex_unlock(&dev->mutex);
4115 }
4116 ovs_mutex_unlock(&dpdk_mutex);
4117
4118 if (exists) {
4119 VLOG_INFO("State of queue %d ( %s_qid %d ) of vhost device '%s' "
4120 "changed to \'%s\'", queue_id, is_rx == true ? "rx" : "tx",
4121 qid, ifname, (enable == 1) ? "enabled" : "disabled");
4122 } else {
4123 VLOG_INFO("vHost Device '%s' not found", ifname);
4124 return -1;
4125 }
4126
4127 return 0;
4128 }
4129
4130 static void
4131 destroy_connection(int vid)
4132 {
4133 struct netdev_dpdk *dev;
4134 char ifname[IF_NAME_SZ];
4135 bool exists = false;
4136
4137 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
4138
4139 ovs_mutex_lock(&dpdk_mutex);
4140 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
4141 ovs_mutex_lock(&dev->mutex);
4142 if (nullable_string_is_equal(ifname, dev->vhost_id)) {
4143 uint32_t qp_num = NR_QUEUE;
4144
4145 if (netdev_dpdk_get_vid(dev) >= 0) {
4146 VLOG_ERR("Connection on socket '%s' destroyed while vhost "
4147 "device still attached.", dev->vhost_id);
4148 }
4149
4150 /* Restore the number of queue pairs to default. */
4151 if (dev->requested_n_txq != qp_num
4152 || dev->requested_n_rxq != qp_num) {
4153 dev->requested_n_rxq = qp_num;
4154 dev->requested_n_txq = qp_num;
4155 netdev_request_reconfigure(&dev->up);
4156 }
4157 ovs_mutex_unlock(&dev->mutex);
4158 exists = true;
4159 break;
4160 }
4161 ovs_mutex_unlock(&dev->mutex);
4162 }
4163 ovs_mutex_unlock(&dpdk_mutex);
4164
4165 if (exists) {
4166 VLOG_INFO("vHost Device '%s' connection has been destroyed", ifname);
4167 } else {
4168 VLOG_INFO("vHost Device '%s' not found", ifname);
4169 }
4170 }
4171
4172 static void
4173 vhost_guest_notified(int vid OVS_UNUSED)
4174 {
4175 COVERAGE_INC(vhost_notification);
4176 }
4177
4178 /*
4179 * Retrieve the DPDK virtio device ID (vid) associated with a vhostuser
4180 * or vhostuserclient netdev.
4181 *
4182 * Returns a value greater than or equal to zero for a valid vid, or '-1'
4183 * if there is no valid vid associated. A vid of '-1' must not be used in
4184 * rte_vhost_ API calls.
4185 *
4186 * Once obtained and validated, a vid can be used by a PMD for multiple
4187 * subsequent rte_vhost API calls until the PMD quiesces. A PMD should
4188 * not fetch the vid again for each of a series of API calls.
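 *
 * A minimal usage sketch of that pattern (the rte_vhost call shown is only
 * illustrative):
 *
 *     int vid = netdev_dpdk_get_vid(dev);
 *     if (vid >= 0) {
 *         uint16_t mtu;
 *         rte_vhost_get_mtu(vid, &mtu);
 *     }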
4189 */
4190
4191 int
4192 netdev_dpdk_get_vid(const struct netdev_dpdk *dev)
4193 {
4194 return ovsrcu_index_get(&dev->vid);
4195 }
4196
4197 struct ingress_policer *
4198 netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
4199 {
4200 return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
4201 }
4202
4203 static int
4204 netdev_dpdk_class_init(void)
4205 {
4206 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4207
4208 /* This function can be called for different classes. The initialization
4209 * needs to be done only once. */
4210 if (ovsthread_once_start(&once)) {
4211 int ret;
4212
4213 ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
4214 unixctl_command_register("netdev-dpdk/set-admin-state",
4215 "[netdev] up|down", 1, 2,
4216 netdev_dpdk_set_admin_state, NULL);
4217
4218 unixctl_command_register("netdev-dpdk/detach",
4219 "pci address of device", 1, 1,
4220 netdev_dpdk_detach, NULL);
4221
4222 unixctl_command_register("netdev-dpdk/get-mempool-info",
4223 "[netdev]", 0, 1,
4224 netdev_dpdk_get_mempool_info, NULL);
4225
4226 ret = rte_eth_dev_callback_register(RTE_ETH_ALL,
4227 RTE_ETH_EVENT_INTR_RESET,
4228 dpdk_eth_event_callback, NULL);
4229 if (ret != 0) {
4230 VLOG_ERR("Ethernet device callback register error: %s",
4231 rte_strerror(-ret));
4232 }
4233
4234 ovsthread_once_done(&once);
4235 }
4236
4237 return 0;
4238 }
4239
4240 /* Client Rings */
4241
4242 static int
4243 dpdk_ring_create(const char dev_name[], unsigned int port_no,
4244 dpdk_port_t *eth_port_id)
4245 {
4246 struct dpdk_ring *ring_pair;
4247 char *ring_name;
4248 int port_id;
4249
4250 ring_pair = dpdk_rte_mzalloc(sizeof *ring_pair);
4251 if (!ring_pair) {
4252 return ENOMEM;
4253 }
4254
4255 /* XXX: Add support for multiqueue ring. */
4256 ring_name = xasprintf("%s_tx", dev_name);
4257
4258 /* Create single producer tx ring, netdev does explicit locking. */
4259 ring_pair->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
4260 RING_F_SP_ENQ);
4261 free(ring_name);
4262 if (ring_pair->cring_tx == NULL) {
4263 rte_free(ring_pair);
4264 return ENOMEM;
4265 }
4266
4267 ring_name = xasprintf("%s_rx", dev_name);
4268
4269 /* Create single consumer rx ring, netdev does explicit locking. */
4270 ring_pair->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
4271 RING_F_SC_DEQ);
4272 free(ring_name);
4273 if (ring_pair->cring_rx == NULL) {
4274 rte_free(ring_pair);
4275 return ENOMEM;
4276 }
4277
4278 port_id = rte_eth_from_rings(dev_name, &ring_pair->cring_rx, 1,
4279 &ring_pair->cring_tx, 1, SOCKET0);
4280
4281 if (port_id < 0) {
4282 rte_free(ring_pair);
4283 return ENODEV;
4284 }
4285
4286 ring_pair->user_port_id = port_no;
4287 ring_pair->eth_port_id = port_id;
4288 *eth_port_id = port_id;
4289
4290 ovs_list_push_back(&dpdk_ring_list, &ring_pair->list_node);
4291
4292 return 0;
4293 }
4294
4295 static int
4296 dpdk_ring_open(const char dev_name[], dpdk_port_t *eth_port_id)
4297 OVS_REQUIRES(dpdk_mutex)
4298 {
4299 struct dpdk_ring *ring_pair;
4300 unsigned int port_no;
4301 int err = 0;
4302
4303 /* Names always start with "dpdkr" */
4304 err = dpdk_dev_parse_name(dev_name, "dpdkr", &port_no);
4305 if (err) {
4306 return err;
4307 }
4308
4309 /* Look through our list to find the device */
4310 LIST_FOR_EACH (ring_pair, list_node, &dpdk_ring_list) {
4311 if (ring_pair->user_port_id == port_no) {
4312 VLOG_INFO("Found dpdk ring device %s:", dev_name);
4313 /* Really all that is needed */
4314 *eth_port_id = ring_pair->eth_port_id;
4315 return 0;
4316 }
4317 }
4318 /* Need to create the device rings */
4319 return dpdk_ring_create(dev_name, port_no, eth_port_id);
4320 }
4321
4322 static int
4323 netdev_dpdk_ring_send(struct netdev *netdev, int qid,
4324 struct dp_packet_batch *batch, bool concurrent_txq)
4325 {
4326 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4327 struct dp_packet *packet;
4328
4329 /* When using 'dpdkr' and sending to a DPDK ring, we want to ensure that
4330 * the offload fields are clear. This is because the same mbuf may be
4331 * modified by the consumer of the ring and returned to the datapath
4332 * without recalculating the RSS hash or revalidating the checksums. */
4333 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
4334 dp_packet_reset_offload(packet);
4335 }
4336
4337 netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
4338 return 0;
4339 }
4340
4341 static int
4342 netdev_dpdk_ring_construct(struct netdev *netdev)
4343 {
4344 dpdk_port_t port_no = 0;
4345 int err = 0;
4346
4347 VLOG_WARN_ONCE("dpdkr a.k.a. ring ports are considered deprecated. "
4348 "Please migrate to virtio-based interfaces, e.g. "
4349 "dpdkvhostuserclient ports, net_virtio_user DPDK vdev.");
4350
4351 ovs_mutex_lock(&dpdk_mutex);
4352
4353 err = dpdk_ring_open(netdev->name, &port_no);
4354 if (err) {
4355 goto unlock_dpdk;
4356 }
4357
4358 err = common_construct(netdev, port_no, DPDK_DEV_ETH,
4359 rte_eth_dev_socket_id(port_no));
4360 unlock_dpdk:
4361 ovs_mutex_unlock(&dpdk_mutex);
4362 return err;
4363 }
4364
4365 /* QoS Functions */
4366
4367 /*
4368 * Initialize QoS configuration operations.
4369 */
4370 static void
4371 qos_conf_init(struct qos_conf *conf, const struct dpdk_qos_ops *ops)
4372 {
4373 conf->ops = ops;
4374 rte_spinlock_init(&conf->lock);
4375 }
4376
4377 /*
4378 * Search the existing QoS operations in 'qos_confs' and compare each
4379 * set's qos_name to 'name'. Return a pointer to the matching
4380 * dpdk_qos_ops, or NULL if there is no match.
4381 */
4382 static const struct dpdk_qos_ops *
4383 qos_lookup_name(const char *name)
4384 {
4385 const struct dpdk_qos_ops *const *opsp;
4386
4387 for (opsp = qos_confs; *opsp != NULL; opsp++) {
4388 const struct dpdk_qos_ops *ops = *opsp;
4389 if (!strcmp(name, ops->qos_name)) {
4390 return ops;
4391 }
4392 }
4393 return NULL;
4394 }
4395
4396 static int
4397 netdev_dpdk_get_qos_types(const struct netdev *netdev OVS_UNUSED,
4398 struct sset *types)
4399 {
4400 const struct dpdk_qos_ops *const *opsp;
4401
4402 for (opsp = qos_confs; *opsp != NULL; opsp++) {
4403 const struct dpdk_qos_ops *ops = *opsp;
4404 if (ops->qos_construct && ops->qos_name[0] != '\0') {
4405 sset_add(types, ops->qos_name);
4406 }
4407 }
4408 return 0;
4409 }
4410
4411 static int
4412 netdev_dpdk_get_qos(const struct netdev *netdev,
4413 const char **typep, struct smap *details)
4414 {
4415 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4416 struct qos_conf *qos_conf;
4417 int error = 0;
4418
4419 ovs_mutex_lock(&dev->mutex);
4420 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
4421 if (qos_conf) {
4422 *typep = qos_conf->ops->qos_name;
4423 error = (qos_conf->ops->qos_get
4424 ? qos_conf->ops->qos_get(qos_conf, details) : 0);
4425 } else {
4426 /* No QoS configuration set, return an empty string */
4427 *typep = "";
4428 }
4429 ovs_mutex_unlock(&dev->mutex);
4430
4431 return error;
4432 }
4433
4434 static int
4435 netdev_dpdk_set_qos(struct netdev *netdev, const char *type,
4436 const struct smap *details)
4437 {
4438 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4439 const struct dpdk_qos_ops *new_ops = NULL;
4440 struct qos_conf *qos_conf, *new_qos_conf = NULL;
4441 int error = 0;
4442
4443 ovs_mutex_lock(&dev->mutex);
4444
4445 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
4446
4447 new_ops = qos_lookup_name(type);
4448
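/* Three possible outcomes: the requested type is unknown or empty (drop
 * any existing configuration), the current configuration already matches
 * 'details' (keep it), or a new configuration is constructed. */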
4449 if (!new_ops || !new_ops->qos_construct) {
4450 new_qos_conf = NULL;
4451 if (type && type[0]) {
4452 error = EOPNOTSUPP;
4453 }
4454 } else if (qos_conf && qos_conf->ops == new_ops
4455 && qos_conf->ops->qos_is_equal(qos_conf, details)) {
4456 new_qos_conf = qos_conf;
4457 } else {
4458 error = new_ops->qos_construct(details, &new_qos_conf);
4459 }
4460
4461 if (error) {
4462 VLOG_ERR("Failed to set QoS type %s on port %s: %s",
4463 type, netdev->name, rte_strerror(error));
4464 }
4465
4466 if (new_qos_conf != qos_conf) {
4467 ovsrcu_set(&dev->qos_conf, new_qos_conf);
4468 if (qos_conf) {
4469 ovsrcu_postpone(qos_conf->ops->qos_destruct, qos_conf);
4470 }
4471 }
4472
4473 ovs_mutex_unlock(&dev->mutex);
4474
4475 return error;
4476 }
4477
4478 static int
4479 netdev_dpdk_get_queue(const struct netdev *netdev, uint32_t queue_id,
4480 struct smap *details)
4481 {
4482 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4483 struct qos_conf *qos_conf;
4484 int error = 0;
4485
4486 ovs_mutex_lock(&dev->mutex);
4487
4488 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
4489 if (!qos_conf || !qos_conf->ops || !qos_conf->ops->qos_queue_get) {
4490 error = EOPNOTSUPP;
4491 } else {
4492 error = qos_conf->ops->qos_queue_get(details, queue_id, qos_conf);
4493 }
4494
4495 ovs_mutex_unlock(&dev->mutex);
4496
4497 return error;
4498 }
4499
4500 static int
4501 netdev_dpdk_set_queue(struct netdev *netdev, uint32_t queue_id,
4502 const struct smap *details)
4503 {
4504 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4505 struct qos_conf *qos_conf;
4506 int error = 0;
4507
4508 ovs_mutex_lock(&dev->mutex);
4509
4510 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
4511 if (!qos_conf || !qos_conf->ops || !qos_conf->ops->qos_queue_construct) {
4512 error = EOPNOTSUPP;
4513 } else {
4514 error = qos_conf->ops->qos_queue_construct(details, queue_id,
4515 qos_conf);
4516 }
4517
4518 if (error && error != EOPNOTSUPP) {
4519 VLOG_ERR("Failed to set QoS queue %d on port %s: %s",
4520 queue_id, netdev_get_name(netdev), rte_strerror(error));
4521 }
4522
4523 ovs_mutex_unlock(&dev->mutex);
4524
4525 return error;
4526 }
4527
4528 static int
4529 netdev_dpdk_delete_queue(struct netdev *netdev, uint32_t queue_id)
4530 {
4531 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4532 struct qos_conf *qos_conf;
4533 int error = 0;
4534
4535 ovs_mutex_lock(&dev->mutex);
4536
4537 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
4538 if (qos_conf && qos_conf->ops && qos_conf->ops->qos_queue_destruct) {
4539 qos_conf->ops->qos_queue_destruct(qos_conf, queue_id);
4540 } else {
4541 error = EOPNOTSUPP;
4542 }
4543
4544 ovs_mutex_unlock(&dev->mutex);
4545
4546 return error;
4547 }
4548
4549 static int
4550 netdev_dpdk_get_queue_stats(const struct netdev *netdev, uint32_t queue_id,
4551 struct netdev_queue_stats *stats)
4552 {
4553 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4554 struct qos_conf *qos_conf;
4555 int error = 0;
4556
4557 ovs_mutex_lock(&dev->mutex);
4558
4559 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
4560 if (qos_conf && qos_conf->ops && qos_conf->ops->qos_queue_get_stats) {
4561 qos_conf->ops->qos_queue_get_stats(qos_conf, queue_id, stats);
4562 } else {
4563 error = EOPNOTSUPP;
4564 }
4565
4566 ovs_mutex_unlock(&dev->mutex);
4567
4568 return error;
4569 }
4570
4571 static int
4572 netdev_dpdk_queue_dump_start(const struct netdev *netdev, void **statep)
4573 {
4574 int error = 0;
4575 struct qos_conf *qos_conf;
4576 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4577
4578 ovs_mutex_lock(&dev->mutex);
4579
4580 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
4581 if (qos_conf && qos_conf->ops
4582 && qos_conf->ops->qos_queue_dump_state_init) {
4583 struct netdev_dpdk_queue_state *state;
4584
4585 *statep = state = xmalloc(sizeof *state);
4586 error = qos_conf->ops->qos_queue_dump_state_init(qos_conf, state);
4587 } else {
4588 error = EOPNOTSUPP;
4589 }
4590
4591 ovs_mutex_unlock(&dev->mutex);
4592
4593 return error;
4594 }
4595
4596 static int
4597 netdev_dpdk_queue_dump_next(const struct netdev *netdev, void *state_,
4598 uint32_t *queue_idp, struct smap *details)
4599 {
4600 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4601 struct netdev_dpdk_queue_state *state = state_;
4602 struct qos_conf *qos_conf;
4603 int error = EOF;
4604
4605 ovs_mutex_lock(&dev->mutex);
4606
4607 while (state->cur_queue < state->n_queues) {
4608 uint32_t queue_id = state->queues[state->cur_queue++];
4609
4610 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
4611 if (qos_conf && qos_conf->ops && qos_conf->ops->qos_queue_get) {
4612 *queue_idp = queue_id;
4613 error = qos_conf->ops->qos_queue_get(details, queue_id, qos_conf);
4614 break;
4615 }
4616 }
4617
4618 ovs_mutex_unlock(&dev->mutex);
4619
4620 return error;
4621 }
4622
4623 static int
4624 netdev_dpdk_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
4625 void *state_)
4626 {
4627 struct netdev_dpdk_queue_state *state = state_;
4628
4629 free(state->queues);
4630 free(state);
4631 return 0;
4632 }
4633
4634
4635
4636 /* egress-policer details */
4637
4638 struct egress_policer {
4639 struct qos_conf qos_conf;
4640 struct rte_meter_srtcm_params app_srtcm_params;
4641 struct rte_meter_srtcm egress_meter;
4642 struct rte_meter_srtcm_profile egress_prof;
4643 };
4644
4645 static void
4646 egress_policer_details_to_param(const struct smap *details,
4647 struct rte_meter_srtcm_params *params)
4648 {
4649 memset(params, 0, sizeof *params);
4650 params->cir = smap_get_ullong(details, "cir", 0);
4651 params->cbs = smap_get_ullong(details, "cbs", 0);
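/* Only 'cir' and 'cbs' are configurable for the egress policer; 'ebs'
 * stays zero. */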
4652 params->ebs = 0;
4653 }
4654
4655 static int
4656 egress_policer_qos_construct(const struct smap *details,
4657 struct qos_conf **conf)
4658 {
4659 struct egress_policer *policer;
4660 int err = 0;
4661
4662 policer = xmalloc(sizeof *policer);
4663 qos_conf_init(&policer->qos_conf, &egress_policer_ops);
4664 egress_policer_details_to_param(details, &policer->app_srtcm_params);
4665 err = rte_meter_srtcm_profile_config(&policer->egress_prof,
4666 &policer->app_srtcm_params);
4667 if (!err) {
4668 err = rte_meter_srtcm_config(&policer->egress_meter,
4669 &policer->egress_prof);
4670 }
4671
4672 if (!err) {
4673 *conf = &policer->qos_conf;
4674 } else {
4675 VLOG_ERR("Could not create rte meter for egress policer");
4676 free(policer);
4677 *conf = NULL;
4678 err = -err;
4679 }
4680
4681 return err;
4682 }
4683
4684 static void
4685 egress_policer_qos_destruct(struct qos_conf *conf)
4686 {
4687 struct egress_policer *policer = CONTAINER_OF(conf, struct egress_policer,
4688 qos_conf);
4689 free(policer);
4690 }
4691
4692 static int
4693 egress_policer_qos_get(const struct qos_conf *conf, struct smap *details)
4694 {
4695 struct egress_policer *policer =
4696 CONTAINER_OF(conf, struct egress_policer, qos_conf);
4697
4698 smap_add_format(details, "cir", "%"PRIu64, policer->app_srtcm_params.cir);
4699 smap_add_format(details, "cbs", "%"PRIu64, policer->app_srtcm_params.cbs);
4700
4701 return 0;
4702 }
4703
4704 static bool
4705 egress_policer_qos_is_equal(const struct qos_conf *conf,
4706 const struct smap *details)
4707 {
4708 struct egress_policer *policer =
4709 CONTAINER_OF(conf, struct egress_policer, qos_conf);
4710 struct rte_meter_srtcm_params params;
4711
4712 egress_policer_details_to_param(details, &params);
4713
4714 return !memcmp(&params, &policer->app_srtcm_params, sizeof params);
4715 }
4716
4717 static int
4718 egress_policer_run(struct qos_conf *conf, struct rte_mbuf **pkts, int pkt_cnt,
4719 bool should_steal)
4720 {
4721 int cnt = 0;
4722 struct egress_policer *policer =
4723 CONTAINER_OF(conf, struct egress_policer, qos_conf);
4724
4725 cnt = srtcm_policer_run_single_packet(&policer->egress_meter,
4726 &policer->egress_prof, pkts,
4727 pkt_cnt, should_steal);
4728
4729 return cnt;
4730 }
4731
4732 static const struct dpdk_qos_ops egress_policer_ops = {
4733 .qos_name = "egress-policer",
4734 .qos_construct = egress_policer_qos_construct,
4735 .qos_destruct = egress_policer_qos_destruct,
4736 .qos_get = egress_policer_qos_get,
4737 .qos_is_equal = egress_policer_qos_is_equal,
4738 .qos_run = egress_policer_run
4739 };
4740
4741 /* trtcm-policer details */
4742
4743 struct trtcm_policer {
4744 struct qos_conf qos_conf;
4745 struct rte_meter_trtcm_rfc4115_params meter_params;
4746 struct rte_meter_trtcm_rfc4115_profile meter_profile;
4747 struct rte_meter_trtcm_rfc4115 meter;
4748 struct netdev_queue_stats stats;
4749 struct hmap queues;
4750 };
4751
4752 struct trtcm_policer_queue {
4753 struct hmap_node hmap_node;
4754 uint32_t queue_id;
4755 struct rte_meter_trtcm_rfc4115_params meter_params;
4756 struct rte_meter_trtcm_rfc4115_profile meter_profile;
4757 struct rte_meter_trtcm_rfc4115 meter;
4758 struct netdev_queue_stats stats;
4759 };
4760
4761 static void
4762 trtcm_policer_details_to_param(const struct smap *details,
4763 struct rte_meter_trtcm_rfc4115_params *params)
4764 {
4765 memset(params, 0, sizeof *params);
4766 params->cir = smap_get_ullong(details, "cir", 0);
4767 params->eir = smap_get_ullong(details, "eir", 0);
4768 params->cbs = smap_get_ullong(details, "cbs", 0);
4769 params->ebs = smap_get_ullong(details, "ebs", 0);
4770 }
4771
4772 static void
4773 trtcm_policer_param_to_detail(
4774 const struct rte_meter_trtcm_rfc4115_params *params,
4775 struct smap *details)
4776 {
4777 smap_add_format(details, "cir", "%"PRIu64, params->cir);
4778 smap_add_format(details, "eir", "%"PRIu64, params->eir);
4779 smap_add_format(details, "cbs", "%"PRIu64, params->cbs);
4780 smap_add_format(details, "ebs", "%"PRIu64, params->ebs);
4781 }
4782
4783
4784 static int
4785 trtcm_policer_qos_construct(const struct smap *details,
4786 struct qos_conf **conf)
4787 {
4788 struct trtcm_policer *policer;
4789 int err = 0;
4790
4791 policer = xmalloc(sizeof *policer);
4792 qos_conf_init(&policer->qos_conf, &trtcm_policer_ops);
4793 trtcm_policer_details_to_param(details, &policer->meter_params);
4794 err = rte_meter_trtcm_rfc4115_profile_config(&policer->meter_profile,
4795 &policer->meter_params);
4796 if (!err) {
4797 err = rte_meter_trtcm_rfc4115_config(&policer->meter,
4798 &policer->meter_profile);
4799 }
4800
4801 if (!err) {
4802 *conf = &policer->qos_conf;
4803 memset(&policer->stats, 0, sizeof policer->stats);
4804 hmap_init(&policer->queues);
4805 } else {
4806 free(policer);
4807 *conf = NULL;
4808 err = -err;
4809 }
4810
4811 return err;
4812 }
4813
4814 static void
4815 trtcm_policer_qos_destruct(struct qos_conf *conf)
4816 {
4817 struct trtcm_policer_queue *queue, *next_queue;
4818 struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
4819 qos_conf);
4820
4821 HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node, &policer->queues) {
4822 hmap_remove(&policer->queues, &queue->hmap_node);
4823 free(queue);
4824 }
4825 hmap_destroy(&policer->queues);
4826 free(policer);
4827 }
4828
4829 static int
4830 trtcm_policer_qos_get(const struct qos_conf *conf, struct smap *details)
4831 {
4832 struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
4833 qos_conf);
4834
4835 trtcm_policer_param_to_detail(&policer->meter_params, details);
4836 return 0;
4837 }
4838
4839 static bool
4840 trtcm_policer_qos_is_equal(const struct qos_conf *conf,
4841 const struct smap *details)
4842 {
4843 struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
4844 qos_conf);
4845 struct rte_meter_trtcm_rfc4115_params params;
4846
4847 trtcm_policer_details_to_param(details, &params);
4848
4849 return !memcmp(&params, &policer->meter_params, sizeof params);
4850 }
4851
4852 static struct trtcm_policer_queue *
4853 trtcm_policer_qos_find_queue(struct trtcm_policer *policer, uint32_t queue_id)
4854 {
4855 struct trtcm_policer_queue *queue;
4856 HMAP_FOR_EACH_WITH_HASH (queue, hmap_node, hash_2words(queue_id, 0),
4857 &policer->queues) {
4858 if (queue->queue_id == queue_id) {
4859 return queue;
4860 }
4861 }
4862 return NULL;
4863 }
4864
4865 static inline bool
4866 trtcm_policer_run_single_packet(struct trtcm_policer *policer,
4867 struct rte_mbuf *pkt, uint64_t time)
4868 {
4869 enum rte_color pkt_color;
4870 struct trtcm_policer_queue *queue;
4871 uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct rte_ether_hdr);
4872 struct dp_packet *dpkt = CONTAINER_OF(pkt, struct dp_packet, mbuf);
4873
4874 queue = trtcm_policer_qos_find_queue(policer, dpkt->md.skb_priority);
4875 if (!queue) {
4876 /* If no queue is found, use the default queue, which MUST exist. */
4877 queue = trtcm_policer_qos_find_queue(policer, 0);
4878 if (!queue) {
4879 return false;
4880 }
4881 }
4882
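/* Meter the packet against its queue first (color-blind), then feed the
 * resulting color into the port-level meter in color-aware mode; a packet
 * marked red at either stage is ultimately dropped, and tx_errors is
 * incremented at each level that marked it red. */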
4883 pkt_color = rte_meter_trtcm_rfc4115_color_blind_check(&queue->meter,
4884 &queue->meter_profile,
4885 time,
4886 pkt_len);
4887
4888 if (pkt_color == RTE_COLOR_RED) {
4889 queue->stats.tx_errors++;
4890 } else {
4891 queue->stats.tx_bytes += pkt_len;
4892 queue->stats.tx_packets++;
4893 }
4894
4895 pkt_color = rte_meter_trtcm_rfc4115_color_aware_check(&policer->meter,
4896 &policer->meter_profile,
4897 time, pkt_len,
4898 pkt_color);
4899
4900 if (pkt_color == RTE_COLOR_RED) {
4901 policer->stats.tx_errors++;
4902 return false;
4903 }
4904
4905 policer->stats.tx_bytes += pkt_len;
4906 policer->stats.tx_packets++;
4907 return true;
4908 }
4909
4910 static int
4911 trtcm_policer_run(struct qos_conf *conf, struct rte_mbuf **pkts, int pkt_cnt,
4912 bool should_steal)
4913 {
4914 int i = 0;
4915 int cnt = 0;
4916 struct rte_mbuf *pkt = NULL;
4917 uint64_t current_time = rte_rdtsc();
4918
4919 struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
4920 qos_conf);
4921
4922 for (i = 0; i < pkt_cnt; i++) {
4923 pkt = pkts[i];
4924
4925 if (trtcm_policer_run_single_packet(policer, pkt, current_time)) {
4926 if (cnt != i) {
4927 pkts[cnt] = pkt;
4928 }
4929 cnt++;
4930 } else {
4931 if (should_steal) {
4932 rte_pktmbuf_free(pkt);
4933 }
4934 }
4935 }
4936 return cnt;
4937 }
4938
4939 static int
4940 trtcm_policer_qos_queue_construct(const struct smap *details,
4941 uint32_t queue_id, struct qos_conf *conf)
4942 {
4943 int err = 0;
4944 struct trtcm_policer_queue *queue;
4945 struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
4946 qos_conf);
4947
4948 queue = trtcm_policer_qos_find_queue(policer, queue_id);
4949 if (!queue) {
4950 queue = xmalloc(sizeof *queue);
4951 queue->queue_id = queue_id;
4952 memset(&queue->stats, 0, sizeof queue->stats);
4953 queue->stats.created = time_msec();
4954 hmap_insert(&policer->queues, &queue->hmap_node,
4955 hash_2words(queue_id, 0));
4956 }
4957 if (queue_id == 0 && smap_is_empty(details)) {
4958 /* No default queue configured, use port values */
4959 memcpy(&queue->meter_params, &policer->meter_params,
4960 sizeof queue->meter_params);
4961 } else {
4962 trtcm_policer_details_to_param(details, &queue->meter_params);
4963 }
4964
4965 err = rte_meter_trtcm_rfc4115_profile_config(&queue->meter_profile,
4966 &queue->meter_params);
4967
4968 if (!err) {
4969 err = rte_meter_trtcm_rfc4115_config(&queue->meter,
4970 &queue->meter_profile);
4971 }
4972 if (err) {
4973 hmap_remove(&policer->queues, &queue->hmap_node);
4974 free(queue);
4975 err = -err;
4976 }
4977 return err;
4978 }
4979
4980 static void
4981 trtcm_policer_qos_queue_destruct(struct qos_conf *conf, uint32_t queue_id)
4982 {
4983 struct trtcm_policer_queue *queue;
4984 struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
4985 qos_conf);
4986
4987 queue = trtcm_policer_qos_find_queue(policer, queue_id);
4988 if (queue) {
4989 hmap_remove(&policer->queues, &queue->hmap_node);
4990 free(queue);
4991 }
4992 }
4993
4994 static int
4995 trtcm_policer_qos_queue_get(struct smap *details, uint32_t queue_id,
4996 const struct qos_conf *conf)
4997 {
4998 struct trtcm_policer_queue *queue;
4999 struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
5000 qos_conf);
5001
5002 queue = trtcm_policer_qos_find_queue(policer, queue_id);
5003 if (!queue) {
5004 return EINVAL;
5005 }
5006
5007 trtcm_policer_param_to_detail(&queue->meter_params, details);
5008 return 0;
5009 }
5010
5011 static int
5012 trtcm_policer_qos_queue_get_stats(const struct qos_conf *conf,
5013 uint32_t queue_id,
5014 struct netdev_queue_stats *stats)
5015 {
5016 struct trtcm_policer_queue *queue;
5017 struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
5018 qos_conf);
5019
5020 queue = trtcm_policer_qos_find_queue(policer, queue_id);
5021 if (!queue) {
5022 return EINVAL;
5023 }
5024 memcpy(stats, &queue->stats, sizeof *stats);
5025 return 0;
5026 }
5027
5028 static int
5029 trtcm_policer_qos_queue_dump_state_init(const struct qos_conf *conf,
5030 struct netdev_dpdk_queue_state *state)
5031 {
5032 uint32_t i = 0;
5033 struct trtcm_policer_queue *queue;
5034 struct trtcm_policer *policer = CONTAINER_OF(conf, struct trtcm_policer,
5035 qos_conf);
5036
5037 state->n_queues = hmap_count(&policer->queues);
5038 state->cur_queue = 0;
5039 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
5040
5041 HMAP_FOR_EACH (queue, hmap_node, &policer->queues) {
5042 state->queues[i++] = queue->queue_id;
5043 }
5044 return 0;
5045 }
5046
5047 static const struct dpdk_qos_ops trtcm_policer_ops = {
5048 .qos_name = "trtcm-policer",
5049 .qos_construct = trtcm_policer_qos_construct,
5050 .qos_destruct = trtcm_policer_qos_destruct,
5051 .qos_get = trtcm_policer_qos_get,
5052 .qos_is_equal = trtcm_policer_qos_is_equal,
5053 .qos_run = trtcm_policer_run,
5054 .qos_queue_construct = trtcm_policer_qos_queue_construct,
5055 .qos_queue_destruct = trtcm_policer_qos_queue_destruct,
5056 .qos_queue_get = trtcm_policer_qos_queue_get,
5057 .qos_queue_get_stats = trtcm_policer_qos_queue_get_stats,
5058 .qos_queue_dump_state_init = trtcm_policer_qos_queue_dump_state_init
5059 };
5060
5061 static int
5062 netdev_dpdk_reconfigure(struct netdev *netdev)
5063 {
5064 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
5065 int err = 0;
5066
5067 ovs_mutex_lock(&dev->mutex);
5068
5069 if (netdev->n_txq == dev->requested_n_txq
5070 && netdev->n_rxq == dev->requested_n_rxq
5071 && dev->mtu == dev->requested_mtu
5072 && dev->lsc_interrupt_mode == dev->requested_lsc_interrupt_mode
5073 && dev->rxq_size == dev->requested_rxq_size
5074 && dev->txq_size == dev->requested_txq_size
5075 && dev->socket_id == dev->requested_socket_id
5076 && dev->started && !dev->reset_needed) {
5077 /* Reconfiguration is unnecessary */
5078
5079 goto out;
5080 }
5081
5082 if (dev->reset_needed) {
5083 rte_eth_dev_reset(dev->port_id);
5084 if_notifier_manual_report();
5085 dev->reset_needed = false;
5086 } else {
5087 rte_eth_dev_stop(dev->port_id);
5088 }
5089
5090 dev->started = false;
5091
5092 err = netdev_dpdk_mempool_configure(dev);
5093 if (err && err != EEXIST) {
5094 goto out;
5095 }
5096
5097 dev->lsc_interrupt_mode = dev->requested_lsc_interrupt_mode;
5098
5099 netdev->n_txq = dev->requested_n_txq;
5100 netdev->n_rxq = dev->requested_n_rxq;
5101
5102 dev->rxq_size = dev->requested_rxq_size;
5103 dev->txq_size = dev->requested_txq_size;
5104
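/* The txq array was sized for the previous configuration; free it here
 * and reallocate it below, after the device has been re-initialized with
 * the updated queue counts. */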
5105 rte_free(dev->tx_q);
5106 err = dpdk_eth_dev_init(dev);
5107 if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
5108 netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
5109 netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
5110 netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
5111 }
5112
5113 dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
5114 if (!dev->tx_q) {
5115 err = ENOMEM;
5116 }
5117
5118 netdev_change_seq_changed(netdev);
5119
5120 out:
5121 ovs_mutex_unlock(&dev->mutex);
5122 return err;
5123 }
5124
5125 static int
5126 dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
5127 OVS_REQUIRES(dev->mutex)
5128 {
5129 dev->up.n_txq = dev->requested_n_txq;
5130 dev->up.n_rxq = dev->requested_n_rxq;
5131 int err;
5132
5133 /* Always keep RX queue 0 enabled for implementations that won't
5134 * report vring states. */
5135 dev->vhost_rxq_enabled[0] = true;
5136
5137 /* Enable TX queue 0 by default if it wasn't disabled. */
5138 if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
5139 dev->tx_q[0].map = 0;
5140 }
5141
5142 if (userspace_tso_enabled()) {
5143 dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
5144 VLOG_DBG("%s: TSO enabled on vhost port", netdev_get_name(&dev->up));
5145 }
5146
5147 netdev_dpdk_remap_txqs(dev);
5148
5149 err = netdev_dpdk_mempool_configure(dev);
5150 if (!err) {
5151 /* A new mempool was created or re-used. */
5152 netdev_change_seq_changed(&dev->up);
5153 } else if (err != EEXIST) {
5154 return err;
5155 }
5156 if (netdev_dpdk_get_vid(dev) >= 0) {
5157 if (dev->vhost_reconfigured == false) {
5158 dev->vhost_reconfigured = true;
5159 /* Carrier status may need updating. */
5160 netdev_change_seq_changed(&dev->up);
5161 }
5162 }
5163
5164 return 0;
5165 }
5166
5167 static int
5168 netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
5169 {
5170 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
5171 int err;
5172
5173 ovs_mutex_lock(&dev->mutex);
5174 err = dpdk_vhost_reconfigure_helper(dev);
5175 ovs_mutex_unlock(&dev->mutex);
5176
5177 return err;
5178 }
5179
5180 static int
5181 netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
5182 {
5183 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
5184 int err;
5185 uint64_t vhost_flags = 0;
5186 bool zc_enabled;
5187
5188 ovs_mutex_lock(&dev->mutex);
5189
5190 /* Configure vHost client mode if requested and if the following criteria
5191 * are met:
5192 * 1. Device hasn't been registered yet.
5193 * 2. A path has been specified.
5194 */
5195 if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && dev->vhost_id) {
5196 /* Register client-mode device. */
5197 vhost_flags |= RTE_VHOST_USER_CLIENT;
5198
5199 /* There is no support for multi-segment buffers. */
5200 vhost_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
5201
5202 /* Enable IOMMU support, if explicitly requested. */
5203 if (dpdk_vhost_iommu_enabled()) {
5204 vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
5205 }
5206
5207 /* Enable POSTCOPY support, if explicitly requested. */
5208 if (dpdk_vhost_postcopy_enabled()) {
5209 vhost_flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
5210 }
5211
5212 zc_enabled = dev->vhost_driver_flags
5213 & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
5214 /* Enable zero copy flag, if requested */
5215 if (zc_enabled) {
5216 vhost_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
5217 }
5218
5219 /* Enable External Buffers if TCP Segmentation Offload is enabled. */
5220 if (userspace_tso_enabled()) {
5221 vhost_flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
5222 }
5223
5224 err = rte_vhost_driver_register(dev->vhost_id, vhost_flags);
5225 if (err) {
5226 VLOG_ERR("vhost-user device setup failure for device %s\n",
5227 dev->vhost_id);
5228 goto unlock;
5229 } else {
5230 /* Configuration successful */
5231 dev->vhost_driver_flags |= vhost_flags;
5232 VLOG_INFO("vHost User device '%s' created in 'client' mode, "
5233 "using client socket '%s'",
5234 dev->up.name, dev->vhost_id);
5235 if (zc_enabled) {
5236 VLOG_INFO("Zero copy enabled for vHost port %s", dev->up.name);
5237 }
5238 }
5239
5240 err = rte_vhost_driver_callback_register(dev->vhost_id,
5241 &virtio_net_device_ops);
5242 if (err) {
5243 VLOG_ERR("rte_vhost_driver_callback_register failed for "
5244 "vhost user client port: %s\n", dev->up.name);
5245 goto unlock;
5246 }
5247
5248 if (userspace_tso_enabled()) {
5249 netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
5250 netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
5251 netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
5252 } else {
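/* Without userspace TSO, make sure the guest is not offered the host
 * TSO and checksum offload features. */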
5253 err = rte_vhost_driver_disable_features(dev->vhost_id,
5254 1ULL << VIRTIO_NET_F_HOST_TSO4
5255 | 1ULL << VIRTIO_NET_F_HOST_TSO6
5256 | 1ULL << VIRTIO_NET_F_CSUM);
5257 if (err) {
5258 VLOG_ERR("rte_vhost_driver_disable_features failed for "
5259 "vhost user client port: %s\n", dev->up.name);
5260 goto unlock;
5261 }
5262 }
5263
5264 err = rte_vhost_driver_start(dev->vhost_id);
5265 if (err) {
5266 VLOG_ERR("rte_vhost_driver_start failed for vhost user "
5267 "client port: %s\n", dev->up.name);
5268 goto unlock;
5269 }
5270 }
5271
5272 err = dpdk_vhost_reconfigure_helper(dev);
5273
5274 unlock:
5275 ovs_mutex_unlock(&dev->mutex);
5276
5277 return err;
5278 }
5279
5280 int
5281 netdev_dpdk_get_port_id(struct netdev *netdev)
5282 {
5283 struct netdev_dpdk *dev;
5284 int ret = -1;
5285
5286 if (!is_dpdk_class(netdev->netdev_class)) {
5287 goto out;
5288 }
5289
5290 dev = netdev_dpdk_cast(netdev);
5291 ovs_mutex_lock(&dev->mutex);
5292 ret = dev->port_id;
5293 ovs_mutex_unlock(&dev->mutex);
5294 out:
5295 return ret;
5296 }
5297
5298 bool
5299 netdev_dpdk_flow_api_supported(struct netdev *netdev)
5300 {
5301 struct netdev_dpdk *dev;
5302 bool ret = false;
5303
5304 if (!is_dpdk_class(netdev->netdev_class)) {
5305 goto out;
5306 }
5307
5308 dev = netdev_dpdk_cast(netdev);
5309 ovs_mutex_lock(&dev->mutex);
5310 if (dev->type == DPDK_DEV_ETH) {
5311 /* TODO: Check if we are able to offload some minimal flow. */
5312 ret = true;
5313 }
5314 ovs_mutex_unlock(&dev->mutex);
5315 out:
5316 return ret;
5317 }
5318
5319 int
5320 netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
5321 struct rte_flow *rte_flow,
5322 struct rte_flow_error *error)
5323 {
5324 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
5325 int ret;
5326
5327 ovs_mutex_lock(&dev->mutex);
5328 ret = rte_flow_destroy(dev->port_id, rte_flow, error);
5329 ovs_mutex_unlock(&dev->mutex);
5330 return ret;
5331 }
5332
5333 struct rte_flow *
5334 netdev_dpdk_rte_flow_create(struct netdev *netdev,
5335 const struct rte_flow_attr *attr,
5336 const struct rte_flow_item *items,
5337 const struct rte_flow_action *actions,
5338 struct rte_flow_error *error)
5339 {
5340 struct rte_flow *flow;
5341 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
5342
5343 ovs_mutex_lock(&dev->mutex);
5344 flow = rte_flow_create(dev->port_id, attr, items, actions, error);
5345 ovs_mutex_unlock(&dev->mutex);
5346 return flow;
5347 }
5348
5349 int
5350 netdev_dpdk_rte_flow_query_count(struct netdev *netdev,
5351 struct rte_flow *rte_flow,
5352 struct rte_flow_query_count *query,
5353 struct rte_flow_error *error)
5354 {
5355 struct rte_flow_action_count count = { .shared = 0, .id = 0 };
5356 const struct rte_flow_action actions[] = {
5357 {
5358 .type = RTE_FLOW_ACTION_TYPE_COUNT,
5359 .conf = &count,
5360 },
5361 {
5362 .type = RTE_FLOW_ACTION_TYPE_END,
5363 },
5364 };
5365 struct netdev_dpdk *dev;
5366 int ret;
5367
5368 if (!is_dpdk_class(netdev->netdev_class)) {
5369 return -1;
5370 }
5371
5372 dev = netdev_dpdk_cast(netdev);
5373 ovs_mutex_lock(&dev->mutex);
5374 ret = rte_flow_query(dev->port_id, rte_flow, actions, query, error);
5375 ovs_mutex_unlock(&dev->mutex);
5376 return ret;
5377 }
5378
5379 #define NETDEV_DPDK_CLASS_COMMON \
5380 .is_pmd = true, \
5381 .alloc = netdev_dpdk_alloc, \
5382 .dealloc = netdev_dpdk_dealloc, \
5383 .get_config = netdev_dpdk_get_config, \
5384 .get_numa_id = netdev_dpdk_get_numa_id, \
5385 .set_etheraddr = netdev_dpdk_set_etheraddr, \
5386 .get_etheraddr = netdev_dpdk_get_etheraddr, \
5387 .get_mtu = netdev_dpdk_get_mtu, \
5388 .set_mtu = netdev_dpdk_set_mtu, \
5389 .get_ifindex = netdev_dpdk_get_ifindex, \
5390 .get_carrier_resets = netdev_dpdk_get_carrier_resets, \
5391 .set_miimon_interval = netdev_dpdk_set_miimon, \
5392 .set_policing = netdev_dpdk_set_policing, \
5393 .get_qos_types = netdev_dpdk_get_qos_types, \
5394 .get_qos = netdev_dpdk_get_qos, \
5395 .set_qos = netdev_dpdk_set_qos, \
5396 .get_queue = netdev_dpdk_get_queue, \
5397 .set_queue = netdev_dpdk_set_queue, \
5398 .delete_queue = netdev_dpdk_delete_queue, \
5399 .get_queue_stats = netdev_dpdk_get_queue_stats, \
5400 .queue_dump_start = netdev_dpdk_queue_dump_start, \
5401 .queue_dump_next = netdev_dpdk_queue_dump_next, \
5402 .queue_dump_done = netdev_dpdk_queue_dump_done, \
5403 .update_flags = netdev_dpdk_update_flags, \
5404 .rxq_alloc = netdev_dpdk_rxq_alloc, \
5405 .rxq_construct = netdev_dpdk_rxq_construct, \
5406 .rxq_destruct = netdev_dpdk_rxq_destruct, \
5407 .rxq_dealloc = netdev_dpdk_rxq_dealloc
5408
5409 #define NETDEV_DPDK_CLASS_BASE \
5410 NETDEV_DPDK_CLASS_COMMON, \
5411 .init = netdev_dpdk_class_init, \
5412 .destruct = netdev_dpdk_destruct, \
5413 .set_tx_multiq = netdev_dpdk_set_tx_multiq, \
5414 .get_carrier = netdev_dpdk_get_carrier, \
5415 .get_stats = netdev_dpdk_get_stats, \
5416 .get_custom_stats = netdev_dpdk_get_custom_stats, \
5417 .get_features = netdev_dpdk_get_features, \
5418 .get_status = netdev_dpdk_get_status, \
5419 .reconfigure = netdev_dpdk_reconfigure, \
5420 .rxq_recv = netdev_dpdk_rxq_recv
5421
5422 static const struct netdev_class dpdk_class = {
5423 .type = "dpdk",
5424 NETDEV_DPDK_CLASS_BASE,
5425 .construct = netdev_dpdk_construct,
5426 .set_config = netdev_dpdk_set_config,
5427 .send = netdev_dpdk_eth_send,
5428 };
5429
5430 static const struct netdev_class dpdk_ring_class = {
5431 .type = "dpdkr",
5432 NETDEV_DPDK_CLASS_BASE,
5433 .construct = netdev_dpdk_ring_construct,
5434 .set_config = netdev_dpdk_ring_set_config,
5435 .send = netdev_dpdk_ring_send,
5436 };
5437
5438 static const struct netdev_class dpdk_vhost_class = {
5439 .type = "dpdkvhostuser",
5440 NETDEV_DPDK_CLASS_COMMON,
5441 .construct = netdev_dpdk_vhost_construct,
5442 .destruct = netdev_dpdk_vhost_destruct,
5443 .send = netdev_dpdk_vhost_send,
5444 .get_carrier = netdev_dpdk_vhost_get_carrier,
5445 .get_stats = netdev_dpdk_vhost_get_stats,
5446 .get_custom_stats = netdev_dpdk_get_sw_custom_stats,
5447 .get_status = netdev_dpdk_vhost_user_get_status,
5448 .reconfigure = netdev_dpdk_vhost_reconfigure,
5449 .rxq_recv = netdev_dpdk_vhost_rxq_recv,
5450 .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
5451 };
5452
5453 static const struct netdev_class dpdk_vhost_client_class = {
5454 .type = "dpdkvhostuserclient",
5455 NETDEV_DPDK_CLASS_COMMON,
5456 .construct = netdev_dpdk_vhost_client_construct,
5457 .destruct = netdev_dpdk_vhost_destruct,
5458 .set_config = netdev_dpdk_vhost_client_set_config,
5459 .send = netdev_dpdk_vhost_send,
5460 .get_carrier = netdev_dpdk_vhost_get_carrier,
5461 .get_stats = netdev_dpdk_vhost_get_stats,
5462 .get_custom_stats = netdev_dpdk_get_sw_custom_stats,
5463 .get_status = netdev_dpdk_vhost_user_get_status,
5464 .reconfigure = netdev_dpdk_vhost_client_reconfigure,
5465 .rxq_recv = netdev_dpdk_vhost_rxq_recv,
5466 .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
5467 };
5468
5469 void
5470 netdev_dpdk_register(void)
5471 {
5472 netdev_register_provider(&dpdk_class);
5473 netdev_register_provider(&dpdk_ring_class);
5474 netdev_register_provider(&dpdk_vhost_class);
5475 netdev_register_provider(&dpdk_vhost_client_class);
5476 }