[mirror_ovs.git] / lib / netdev-dpdk.c
8a9562d2 1/*
12d0d124 2 * Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.
8a9562d2
PS
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
01961bbd 18#include "netdev-dpdk.h"
8a9562d2 19
6ebc4b09 20#include <errno.h>
8a9562d2
PS
21#include <signal.h>
22#include <stdlib.h>
6ebc4b09 23#include <string.h>
8a9562d2 24#include <unistd.h>
f3e7ec25
MW
25#include <linux/virtio_net.h>
26#include <sys/socket.h>
27#include <linux/if.h>
01961bbd 28
5e925ccc 29#include <rte_bus_pci.h>
01961bbd
DDP
30#include <rte_config.h>
31#include <rte_cycles.h>
32#include <rte_errno.h>
33#include <rte_eth_ring.h>
34#include <rte_ethdev.h>
6ebc4b09 35#include <rte_flow.h>
01961bbd
DDP
36#include <rte_malloc.h>
37#include <rte_mbuf.h>
38#include <rte_meter.h>
fc56f5e0 39#include <rte_pci.h>
3eb8d4fa 40#include <rte_version.h>
6ebc4b09 41#include <rte_vhost.h>
8a9562d2 42
e8a2b5bf 43#include "cmap.h"
7d1ced01 44#include "dirs.h"
e14deea0 45#include "dp-packet.h"
01961bbd 46#include "dpdk.h"
8a9562d2 47#include "dpif-netdev.h"
e5c0f5a4 48#include "fatal-signal.h"
8a9562d2
PS
49#include "netdev-provider.h"
50#include "netdev-vport.h"
51#include "odp-util.h"
eac84432 52#include "openvswitch/dynamic-string.h"
25d436fb 53#include "openvswitch/list.h"
6ebc4b09 54#include "openvswitch/match.h"
25d436fb 55#include "openvswitch/ofp-print.h"
6ebc4b09 56#include "openvswitch/shash.h"
25d436fb 57#include "openvswitch/vlog.h"
94143fc4 58#include "ovs-numa.h"
8a9562d2 59#include "ovs-rcu.h"
6ebc4b09 60#include "ovs-thread.h"
8a9562d2 61#include "packets.h"
0bf765f7 62#include "smap.h"
8a9562d2 63#include "sset.h"
8a9562d2 64#include "timeval.h"
6ebc4b09 65#include "unaligned.h"
8a9562d2 66#include "unixctl.h"
6ebc4b09
IM
67#include "util.h"
68#include "uuid.h"
8a9562d2 69
f3e7ec25
MW
70enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
71
05b49df6 72VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
8a9562d2
PS
73static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
74
75#define DPDK_PORT_WATCHDOG_INTERVAL 5
76
77#define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
78#define OVS_VPORT_DPDK "ovs_dpdk"
79
80/*
81 * We need to reserve extra space in the mbufs so we can align the
82 * DMA addresses to 4KB.
18f777b2
TP
83 * The minimum mbuf size is limited to avoid scatter behaviour and drop in
84 * performance for standard Ethernet MTU.
8a9562d2 85 */
58be5c0e
MK
86#define ETHER_HDR_MAX_LEN (ETHER_HDR_LEN + ETHER_CRC_LEN \
87 + (2 * VLAN_HEADER_LEN))
4be4d22c
MK
88#define MTU_TO_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
89#define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN)
58be5c0e
MK
90#define FRAME_LEN_TO_MTU(frame_len) ((frame_len) \
91 - ETHER_HDR_LEN - ETHER_CRC_LEN)
4be4d22c 92#define NETDEV_DPDK_MBUF_ALIGN 1024
0072e931 93#define NETDEV_DPDK_MAX_PKT_LEN 9728
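/*
 * Worked example of the conversions above (assuming the usual 14-byte
 * Ethernet header, 4-byte CRC and 4-byte VLAN tags): for mtu = 1500,
 * MTU_TO_FRAME_LEN(1500)     = 1500 + 14 + 4         = 1518
 * MTU_TO_MAX_FRAME_LEN(1500) = 1500 + 14 + 4 + 2 * 4 = 1526
 * FRAME_LEN_TO_MTU(1518)     = 1518 - 14 - 4         = 1500
 */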
8a9562d2 94
43307ad0
IS
95/* Max and min number of packets in the mempool. OVS tries to allocate a
96 * mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
97 * enough hugepages) we keep halving the number until the allocation succeeds
98 * or we reach MIN_NB_MBUF */
99
100#define MAX_NB_MBUF (4096 * 64)
da79ce2b
DDP
101#define MIN_NB_MBUF (4096 * 4)
102#define MP_CACHE_SZ RTE_MEMPOOL_CACHE_MAX_SIZE
103
43307ad0
IS
104/* MAX_NB_MBUF can be divided by 2 many times, until MIN_NB_MBUF */
105BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF)
106 == 0);
107
108/* The smallest possible NB_MBUF that we're going to try should be a multiple
109 * of MP_CACHE_SZ. This is advised by DPDK documentation. */
110BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF))
111 % MP_CACHE_SZ == 0);
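/*
 * With the constants above, dpdk_mp_create() will therefore try
 * 262144 (MAX_NB_MBUF), then 131072, 65536, 32768 and finally 16384
 * (MIN_NB_MBUF) mbufs, halving on each ENOMEM until a request succeeds.
 */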
112
8a9562d2
PS
113#define SOCKET0 0
114
b685696b
CL
115/* Default size of Physical NIC RXQ */
116#define NIC_PORT_DEFAULT_RXQ_SIZE 2048
117/* Default size of Physical NIC TXQ */
118#define NIC_PORT_DEFAULT_TXQ_SIZE 2048
119/* Maximum size of Physical NIC Queues */
120#define NIC_PORT_MAX_Q_SIZE 4096
79f5354c 121
585a5bea 122#define OVS_VHOST_MAX_QUEUE_NUM 1024 /* Maximum number of vHost TX queues. */
f3ea2ad2
IM
123#define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
124#define OVS_VHOST_QUEUE_DISABLED (-2) /* Queue was disabled by guest and not
125 * yet mapped to another queue. */
585a5bea 126
bb37956a
IM
127#define DPDK_ETH_PORT_ID_INVALID RTE_MAX_ETHPORTS
128
5e925ccc
MK
129/* DPDK library uses uint16_t for port_id. */
130typedef uint16_t dpdk_port_t;
fa9f4eeb 131#define DPDK_PORT_ID_FMT "%"PRIu16
bb37956a 132
080f080c
KT
133/* Minimum number of vhost tx retries; 0 effectively disables retries. */
134#define VHOST_ENQ_RETRY_MIN 0
135/* Maximum number of vhost tx retries. */
136#define VHOST_ENQ_RETRY_MAX 32
137/* Legacy default value for vhost tx retries. */
138#define VHOST_ENQ_RETRY_DEF 8
139
0a0f39df 140#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
95e9881f 141
8a9562d2 142static const struct rte_eth_conf port_conf = {
a28ddd11
DDP
143 .rxmode = {
144 .mq_mode = ETH_MQ_RX_RSS,
145 .split_hdr_size = 0,
03f3f9c0 146 .offloads = 0,
a28ddd11
DDP
147 },
148 .rx_adv_conf = {
149 .rss_conf = {
150 .rss_key = NULL,
543342a4 151 .rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
8a9562d2 152 },
a28ddd11
DDP
153 },
154 .txmode = {
155 .mq_mode = ETH_MQ_TX_NONE,
156 },
8a9562d2
PS
157};
158
f3e7ec25
MW
159/*
160 * These callbacks allow virtio-net devices to be added to vhost ports when
161 * configuration has been fully completed.
162 */
163static int new_device(int vid);
164static void destroy_device(int vid);
165static int vring_state_changed(int vid, uint16_t queue_id, int enable);
61473a0e 166static void destroy_connection(int vid);
f3e7ec25
MW
167static const struct vhost_device_ops virtio_net_device_ops =
168{
169 .new_device = new_device,
170 .destroy_device = destroy_device,
171 .vring_state_changed = vring_state_changed,
61473a0e
DM
172 .features_changed = NULL,
173 .new_connection = NULL,
174 .destroy_connection = destroy_connection,
f3e7ec25
MW
175};
176
58f7c37b
DDP
177enum { DPDK_RING_SIZE = 256 };
178BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
8a9562d2
PS
179enum { DRAIN_TSC = 200000ULL };
180
58397e6c
KT
181enum dpdk_dev_type {
182 DPDK_DEV_ETH = 0,
7d1ced01 183 DPDK_DEV_VHOST = 1,
58397e6c
KT
184};
185
0bf765f7
IS
186/* Quality of Service */
187
188/* An instance of a QoS configuration. Always associated with a particular
189 * network device.
190 *
191 * Each QoS implementation subclasses this with whatever additional data it
192 * needs.
193 */
194struct qos_conf {
195 const struct dpdk_qos_ops *ops;
78bd47cf 196 rte_spinlock_t lock;
0bf765f7
IS
197};
198
199/* A particular implementation of dpdk QoS operations.
200 *
201 * The functions below return 0 if successful or a positive errno value on
202 * failure, except where otherwise noted. All of them must be provided, except
203 * where otherwise noted.
204 */
205struct dpdk_qos_ops {
206
207 /* Name of the QoS type */
208 const char *qos_name;
209
78bd47cf
DDP
210 /* Called to construct a qos_conf object. The implementation should make
211 * the appropriate calls to configure QoS according to 'details'.
0bf765f7
IS
212 *
213 * The contents of 'details' should be documented as valid for 'ovs_name'
214 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
215 * (which is built as ovs-vswitchd.conf.db(8)).
216 *
78bd47cf
DDP
217 * This function must return 0 if and only if it sets '*conf' to an
218 * initialized 'struct qos_conf'.
0bf765f7
IS
219 *
220 * For all QoS implementations it should always be non-null.
221 */
78bd47cf 222 int (*qos_construct)(const struct smap *details, struct qos_conf **conf);
0bf765f7
IS
223
224 /* Destroys the data structures allocated by the implementation as part of
78bd47cf 225 * 'qos_conf'.
0bf765f7
IS
226 *
227 * For all QoS implementations it should always be non-null.
228 */
78bd47cf 229 void (*qos_destruct)(struct qos_conf *conf);
0bf765f7 230
78bd47cf 231 /* Retrieves details of 'conf' configuration into 'details'.
0bf765f7
IS
232 *
233 * The contents of 'details' should be documented as valid for 'ovs_name'
234 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
235 * (which is built as ovs-vswitchd.conf.db(8)).
236 */
78bd47cf 237 int (*qos_get)(const struct qos_conf *conf, struct smap *details);
0bf765f7 238
78bd47cf 239 /* Returns true if 'conf' is already configured according to 'details'.
0bf765f7
IS
240 *
241 * The contents of 'details' should be documented as valid for 'ovs_name'
242 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
243 * (which is built as ovs-vswitchd.conf.db(8)).
244 *
78bd47cf 245 * For all QoS implementations it should always be non-null.
0bf765f7 246 */
78bd47cf
DDP
247 bool (*qos_is_equal)(const struct qos_conf *conf,
248 const struct smap *details);
0bf765f7
IS
249
250 /* Modify an array of rte_mbufs. The modification is specific to
251 * each qos implementation.
252 *
253 * The function should take an array of mbufs and an int representing
254 * the current number of mbufs present in the array.
255 *
256 * After the function has performed a qos modification to the array of
257 * mbufs it returns an int representing the number of mbufs now present in
258 * the array. This value can then be passed to the port send function
259 * along with the modified array for transmission.
260 *
261 * For all QoS implementations it should always be non-null.
262 */
78bd47cf 263 int (*qos_run)(struct qos_conf *qos_conf, struct rte_mbuf **pkts,
7d7ded7a 264 int pkt_cnt, bool should_steal);
0bf765f7
IS
265};
266
267/* dpdk_qos_ops for each type of user space QoS implementation */
268static const struct dpdk_qos_ops egress_policer_ops;
269
270/*
271 * Array of dpdk_qos_ops, contains pointer to all supported QoS
272 * operations.
273 */
274static const struct dpdk_qos_ops *const qos_confs[] = {
275 &egress_policer_ops,
276 NULL
277};
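/*
 * Illustrative sketch only (not part of OVS): a hypothetical pass-through
 * QoS implementation showing how the 'dpdk_qos_ops' callbacks documented
 * above fit together.  All 'noop_*' names are invented for this example,
 * and a complete implementation would also provide 'qos_get' and
 * 'qos_is_equal' (see 'egress_policer_ops' later in this file).
 */
#if 0
static const struct dpdk_qos_ops noop_qos_ops;

static int
noop_qos_construct(const struct smap *details OVS_UNUSED,
                   struct qos_conf **conf)
{
    struct qos_conf *noop = xzalloc(sizeof *noop);

    noop->ops = &noop_qos_ops;
    rte_spinlock_init(&noop->lock);
    *conf = noop;
    return 0;
}

static void
noop_qos_destruct(struct qos_conf *conf)
{
    free(conf);
}

static int
noop_qos_run(struct qos_conf *conf OVS_UNUSED,
             struct rte_mbuf **pkts OVS_UNUSED,
             int pkt_cnt, bool should_steal OVS_UNUSED)
{
    /* No policing: report every mbuf as still present in the array. */
    return pkt_cnt;
}

static const struct dpdk_qos_ops noop_qos_ops = {
    .qos_name = "noop",
    .qos_construct = noop_qos_construct,
    .qos_destruct = noop_qos_destruct,
    .qos_run = noop_qos_run,
};
#endif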
278
c2adb102
IM
279static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;
280
8a9562d2 281/* Contains all 'struct dpdk_dev's. */
ca6ba700 282static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
55951e15 283 = OVS_LIST_INITIALIZER(&dpdk_list);
8a9562d2 284
c2adb102
IM
285static struct ovs_mutex dpdk_mp_mutex OVS_ACQ_AFTER(dpdk_mutex)
286 = OVS_MUTEX_INITIALIZER;
287
91fccdad 288/* Contains all 'struct dpdk_mp's. */
43307ad0
IS
289static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mp_mutex)
290 = OVS_LIST_INITIALIZER(&dpdk_mp_list);
91fccdad 291
91fccdad
KT
292struct dpdk_mp {
293 struct rte_mempool *mp;
43307ad0
IS
294 int mtu;
295 int socket_id;
296 int refcount;
91fccdad
KT
297 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex);
298 };
299
5a034064 300/* There should be one 'struct dpdk_tx_queue' created for
15ba075d 301 * each netdev tx queue. */
8a9562d2 302struct dpdk_tx_queue {
15ba075d
IM
303 /* Padding to make dpdk_tx_queue exactly one cache line long. */
304 PADDED_MEMBERS(CACHE_LINE_SIZE,
305 /* Protects the members and the NIC queue from concurrent access.
306 * It is used only if the queue is shared among different pmd threads
307 * (see 'concurrent_txq'). */
308 rte_spinlock_t tx_lock;
309 /* Maps the configured vhost-user queue to the queue enabled by the guest. */
310 int map;
311 );
8a9562d2
PS
312};
313
95fb793a 314/* DPDK has no way to remove dpdk ring Ethernet devices,
 315 * so we have to keep them around once they've been created.
 316 */
317
ca6ba700 318static struct ovs_list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
55951e15 319 = OVS_LIST_INITIALIZER(&dpdk_ring_list);
95fb793a 320
321struct dpdk_ring {
322 /* For the client rings */
323 struct rte_ring *cring_tx;
324 struct rte_ring *cring_rx;
b83a2df1 325 unsigned int user_port_id; /* User given port no, parsed from port name */
bb37956a 326 dpdk_port_t eth_port_id; /* ethernet device port id */
ca6ba700 327 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
95fb793a 328};
329
9509913a
IS
330struct ingress_policer {
331 struct rte_meter_srtcm_params app_srtcm_params;
332 struct rte_meter_srtcm in_policer;
03f3f9c0 333 struct rte_meter_srtcm_profile in_prof;
9509913a
IS
334 rte_spinlock_t policer_lock;
335};
336
1a2bb118
SC
337enum dpdk_hw_ol_features {
338 NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
e10ca8b9 339 NETDEV_RX_HW_CRC_STRIP = 1 << 1,
03f3f9c0 340 NETDEV_RX_HW_SCATTER = 1 << 2
1a2bb118
SC
341};
342
b2e72a9c
IM
343/*
344 * In order to avoid confusion in variable names, the following naming convention
345 * should be used, if possible:
346 *
347 * 'struct netdev' : 'netdev'
348 * 'struct netdev_dpdk' : 'dev'
349 * 'struct netdev_rxq' : 'rxq'
350 * 'struct netdev_rxq_dpdk' : 'rx'
351 *
352 * Example:
353 * struct netdev *netdev = netdev_from_name(name);
354 * struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
355 *
356 * Also, 'netdev' should be used instead of 'dev->up', where 'netdev' was
357 * already defined.
358 */
359
8a9562d2 360struct netdev_dpdk {
23d4d53f
BB
361 PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
362 dpdk_port_t port_id;
363
364 /* If true, device was attached by rte_eth_dev_attach(). */
365 bool attached;
606f6650
EC
366 /* If true, rte_eth_dev_start() was successfully called */
367 bool started;
23d4d53f
BB
368 struct eth_addr hwaddr;
369 int mtu;
370 int socket_id;
371 int buf_size;
372 int max_packet_len;
373 enum dpdk_dev_type type;
374 enum netdev_flags flags;
eaa43581 375 int link_reset_cnt;
bb9d2623
IM
376 union {
377 /* Device arguments for dpdk ports. */
378 char *devargs;
379 /* Identifier used to distinguish vhost devices from each other. */
380 char *vhost_id;
381 };
23d4d53f
BB
382 struct dpdk_tx_queue *tx_q;
383 struct rte_eth_link link;
23d4d53f
BB
384 );
385
386 PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
387 struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
43307ad0 388 struct dpdk_mp *dpdk_mp;
23d4d53f
BB
389
390 /* virtio identifier for vhost devices */
391 ovsrcu_index vid;
392
393 /* True if vHost device is 'up' and has been reconfigured at least once */
394 bool vhost_reconfigured;
080f080c
KT
395
396 atomic_uint8_t vhost_tx_retries_max;
397 /* 2 pad bytes here. */
23d4d53f
BB
398 );
399
23d4d53f
BB
400 PADDED_MEMBERS(CACHE_LINE_SIZE,
401 struct netdev up;
402 /* In dpdk_list. */
403 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
404
405 /* QoS configuration and lock for the device */
406 OVSRCU_TYPE(struct qos_conf *) qos_conf;
407
408 /* Ingress Policer */
409 OVSRCU_TYPE(struct ingress_policer *) ingress_policer;
410 uint32_t policer_rate;
411 uint32_t policer_burst;
35c91567
DM
412
413 /* Array of vhost rxq states, see vring_state_changed. */
414 bool *vhost_rxq_enabled;
23d4d53f
BB
415 );
416
417 PADDED_MEMBERS(CACHE_LINE_SIZE,
418 struct netdev_stats stats;
c161357d
KT
419 /* Custom stat for retries when unable to transmit. */
420 uint64_t tx_retries;
23d4d53f
BB
421 /* Protects stats */
422 rte_spinlock_t stats_lock;
c161357d 423 /* 4 pad bytes here. */
23d4d53f
BB
424 );
425
426 PADDED_MEMBERS(CACHE_LINE_SIZE,
427 /* The following properties cannot be changed when a device is running,
428 * so we remember the request and update them next time
429 * netdev_dpdk*_reconfigure() is called */
430 int requested_mtu;
431 int requested_n_txq;
432 int requested_n_rxq;
433 int requested_rxq_size;
434 int requested_txq_size;
435
436 /* Number of rx/tx descriptors for physical devices */
437 int rxq_size;
438 int txq_size;
439
440 /* Socket ID detected when vHost device is brought up */
441 int requested_socket_id;
442
443 /* Denotes whether vHost port is client/server mode */
444 uint64_t vhost_driver_flags;
445
446 /* DPDK-ETH Flow control */
447 struct rte_eth_fc_conf fc_conf;
448
449 /* DPDK-ETH hardware offload features,
450 * from the enum set 'dpdk_hw_ol_features' */
451 uint32_t hw_ol_features;
f8b64a61
RM
452
453 /* Properties for link state change detection mode.
454 * If lsc_interrupt_mode is set to false, poll mode is used,
455 * otherwise interrupt mode is used. */
456 bool requested_lsc_interrupt_mode;
457 bool lsc_interrupt_mode;
23d4d53f 458 );
971f4b39
MW
459
460 PADDED_MEMBERS(CACHE_LINE_SIZE,
461 /* Names of all XSTATS counters */
462 struct rte_eth_xstat_name *rte_xstats_names;
463 int rte_xstats_names_size;
464 int rte_xstats_ids_size;
465 uint64_t *rte_xstats_ids;
466 );
8a9562d2
PS
467};
468
469struct netdev_rxq_dpdk {
470 struct netdev_rxq up;
bb37956a 471 dpdk_port_t port_id;
8a9562d2
PS
472};
473
f3e7ec25
MW
474static void netdev_dpdk_destruct(struct netdev *netdev);
475static void netdev_dpdk_vhost_destruct(struct netdev *netdev);
8a9562d2 476
ac1a9bb9
IM
477static void netdev_dpdk_clear_xstats(struct netdev_dpdk *dev);
478
0a0f39df 479int netdev_dpdk_get_vid(const struct netdev_dpdk *dev);
58397e6c 480
9509913a
IS
481struct ingress_policer *
482netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);
483
8a9562d2
PS
484static bool
485is_dpdk_class(const struct netdev_class *class)
486{
f3e7ec25
MW
487 return class->destruct == netdev_dpdk_destruct
488 || class->destruct == netdev_dpdk_vhost_destruct;
8a9562d2
PS
489}
490
4be4d22c
MK
491/* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
492 * aligned at 1k or less. If a declared mbuf size is not a multiple of this
493 * value, insufficient buffers are allocated to accommodate the packet in its
494 * entirety. Furthermore, certain drivers need to ensure that there is also
495 * sufficient space in the Rx buffer to accommodate two VLAN tags (for QinQ
496 * frames). If the RX buffer is too small, then the driver enables scatter RX
58be5c0e
MK
497 * behaviour, which reduces performance. To prevent this, use a buffer size
498 * that is closest to 'mtu', but which satisfies the aforementioned criteria.
4be4d22c
MK
499 */
500static uint32_t
501dpdk_buf_size(int mtu)
502{
a32bab26
TL
503 return ROUND_UP(MTU_TO_MAX_FRAME_LEN(mtu), NETDEV_DPDK_MBUF_ALIGN)
504 + RTE_PKTMBUF_HEADROOM;
4be4d22c
MK
505}
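/*
 * For example, for mtu = 1500: MTU_TO_MAX_FRAME_LEN(1500) = 1526, which
 * rounds up to 2048 (NETDEV_DPDK_MBUF_ALIGN), and adding the DPDK
 * headroom (RTE_PKTMBUF_HEADROOM, 128 bytes unless overridden in the
 * DPDK build) gives a 2176-byte buffer.
 */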
506
eff23640
DDP
507/* Allocates an area of 'sz' bytes from DPDK. The memory is zeroed.
508 *
509 * Unlike xmalloc(), this function can return NULL on failure. */
8a9562d2
PS
510static void *
511dpdk_rte_mzalloc(size_t sz)
512{
eff23640 513 return rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);
8a9562d2
PS
514}
515
516void
e14deea0 517free_dpdk_buf(struct dp_packet *p)
8a9562d2 518{
db73f716 519 struct rte_mbuf *pkt = (struct rte_mbuf *) p;
8a9562d2 520
b00b4a81 521 rte_pktmbuf_free(pkt);
8a9562d2
PS
522}
523
b3cd9f9d 524static void
401b70d6 525ovs_rte_pktmbuf_init(struct rte_mempool *mp OVS_UNUSED,
b3cd9f9d 526 void *opaque_arg OVS_UNUSED,
2391135c 527 void *_p,
b3cd9f9d
PS
528 unsigned i OVS_UNUSED)
529{
2391135c 530 struct rte_mbuf *pkt = _p;
b3cd9f9d 531
3aaa6201 532 dp_packet_init_dpdk((struct dp_packet *) pkt);
b3cd9f9d
PS
533}
534
91fccdad
KT
535static int
536dpdk_mp_full(const struct rte_mempool *mp) OVS_REQUIRES(dpdk_mp_mutex)
537{
1f84a2d5
KT
538 /* At this point we want to know if all the mbufs are back
539 * in the mempool. rte_mempool_full() is not atomic but it's
540 * the best available and as we are no longer requesting mbufs
541 * from the mempool, it means mbufs will not move from
542 * 'mempool ring' --> 'mempool cache'. In rte_mempool_full()
543 * the ring is counted before caches, so we won't get false
544 * positives in this use case and we handle false negatives.
545 *
546 * If future implementations of rte_mempool_full() were to change
547 * it could be possible for a false positive. Even that would
548 * likely be ok, as there are additional checks during mempool
549 * freeing, but it would make things racy.
91fccdad 550 */
1f84a2d5 551 return rte_mempool_full(mp);
91fccdad
KT
552}
553
554/* Free unused mempools. */
555static void
43307ad0 556dpdk_mp_sweep(void) OVS_REQUIRES(dpdk_mp_mutex)
91fccdad
KT
557{
558 struct dpdk_mp *dmp, *next;
559
43307ad0
IS
560 LIST_FOR_EACH_SAFE (dmp, next, list_node, &dpdk_mp_list) {
561 if (!dmp->refcount && dpdk_mp_full(dmp->mp)) {
91fccdad
KT
562 VLOG_DBG("Freeing mempool \"%s\"", dmp->mp->name);
563 ovs_list_remove(&dmp->list_node);
564 rte_mempool_free(dmp->mp);
565 rte_free(dmp);
566 }
567 }
91fccdad
KT
568}
569
43307ad0
IS
570/* Calculating the required number of mbufs differs depending on the
571 * mempool model being used. Check if per port memory is in use before
572 * calculating.
573 */
574static uint32_t
575dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
91fccdad 576{
43307ad0 577 uint32_t n_mbufs;
91fccdad 578
43307ad0
IS
579 if (!per_port_mp) {
580 /* Shared memory is being used.
581 * XXX: this is a really rough method of provisioning memory.
582 * It's impossible to determine what the exact memory requirements are
583 * when the number of ports and rxqs that utilize a particular mempool
584 * can change dynamically at runtime. For now, use this rough
585 * heuristic.
586 */
587 if (mtu >= ETHER_MTU) {
588 n_mbufs = MAX_NB_MBUF;
589 } else {
590 n_mbufs = MIN_NB_MBUF;
91fccdad 591 }
43307ad0
IS
592 } else {
593 /* Per port memory is being used.
594 * XXX: rough estimation of number of mbufs required for this port:
595 * <packets required to fill the device rxqs>
596 * + <packets that could be stuck on other ports txqs>
597 * + <packets in the pmd threads>
598 * + <additional memory for corner cases>
599 */
600 n_mbufs = dev->requested_n_rxq * dev->requested_rxq_size
601 + dev->requested_n_txq * dev->requested_txq_size
602 + MIN(RTE_MAX_LCORE, dev->requested_n_rxq) * NETDEV_MAX_BURST
603 + MIN_NB_MBUF;
91fccdad 604 }
43307ad0
IS
605
606 return n_mbufs;
91fccdad
KT
607}
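/*
 * Worked example for the per-port branch above, assuming a device with
 * 2 rx and 2 tx queues of the default 2048 descriptors each and
 * NETDEV_MAX_BURST of 32:
 *   2 * 2048 + 2 * 2048 + MIN(RTE_MAX_LCORE, 2) * 32 + MIN_NB_MBUF
 * = 4096     + 4096     + 64                         + 16384 = 24640 mbufs.
 */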
608
43307ad0
IS
609static struct dpdk_mp *
610dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
8a9562d2 611{
24e78f93
IM
612 char mp_name[RTE_MEMPOOL_NAMESIZE];
613 const char *netdev_name = netdev_get_name(&dev->up);
614 int socket_id = dev->requested_socket_id;
dfaf00e8
MK
615 uint32_t n_mbufs = 0;
616 uint32_t mbuf_size = 0;
617 uint32_t aligned_mbuf_size = 0;
618 uint32_t mbuf_priv_data_len = 0;
619 uint32_t pkt_size = 0;
24e78f93 620 uint32_t hash = hash_string(netdev_name, 0);
43307ad0
IS
621 struct dpdk_mp *dmp = NULL;
622 int ret;
623
624 dmp = dpdk_rte_mzalloc(sizeof *dmp);
625 if (!dmp) {
626 return NULL;
627 }
628 dmp->socket_id = socket_id;
629 dmp->mtu = mtu;
630 dmp->refcount = 1;
631
dfaf00e8
MK
632 /* Get the size of each mbuf, based on the MTU */
633 mbuf_size = MTU_TO_FRAME_LEN(mtu);
634
43307ad0 635 n_mbufs = dpdk_calculate_mbufs(dev, mtu, per_port_mp);
d555d9bd 636
da79ce2b 637 do {
24e78f93 638 /* Full DPDK memory pool name must be unique and cannot be
43307ad0
IS
639 * longer than RTE_MEMPOOL_NAMESIZE. Note that for the shared
640 * mempool case this can result in one device using a mempool
641 * which references a different device in its name. However, as
642 * mempool names are hashed, the device name will not be readable
643 * so this is not an issue for tasks such as debugging.
644 */
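 /* For instance, with a hypothetical hash of 0xabcd, socket 0, an MTU of
 * 1500 and a request for 262144 mbufs, the format string below yields
 * the name "ovs0000abcd00015000262144". */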
645 ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE,
dfaf00e8
MK
646 "ovs%08x%02d%05d%07u",
647 hash, socket_id, mtu, n_mbufs);
24e78f93
IM
648 if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) {
649 VLOG_DBG("snprintf returned %d. "
650 "Failed to generate a mempool name for \"%s\". "
651 "Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.",
652 ret, netdev_name, hash, socket_id, mtu, n_mbufs);
653 break;
65056fd7 654 }
95fb793a 655
dfaf00e8
MK
656 VLOG_DBG("Port %s: Requesting a mempool of %u mbufs of size %u "
657 "on socket %d for %d Rx and %d Tx queues, "
658 "cache line size of %u",
659 netdev_name, n_mbufs, mbuf_size, socket_id,
660 dev->requested_n_rxq, dev->requested_n_txq,
661 RTE_CACHE_LINE_SIZE);
662
a32bab26
TL
663 /* The size of the mbuf's private area (i.e. the area that holds OVS'
664 * dp_packet data). */
dfaf00e8
MK
665 mbuf_priv_data_len = sizeof(struct dp_packet) -
666 sizeof(struct rte_mbuf);
667 /* The size of the entire dp_packet. */
668 pkt_size = sizeof(struct dp_packet) + mbuf_size;
669 /* mbuf size, rounded up to cacheline size. */
670 aligned_mbuf_size = ROUND_UP(pkt_size, RTE_CACHE_LINE_SIZE);
671 /* If there is a size discrepancy, add padding to mbuf_priv_data_len.
672 * This maintains mbuf size cache alignment, while also honoring RX
673 * buffer alignment in the data portion of the mbuf. If this adjustment
674 * is not made, there is a possibility later on that for an element of
675 * the mempool, buf, buf->data_len < (buf->buf_len - buf->data_off).
676 * This is problematic in the case of multi-segment mbufs, particularly
677 * when an mbuf segment needs to be resized (when pushing or popping
678 * a VLAN header, for example).
679 */
680 mbuf_priv_data_len += (aligned_mbuf_size - pkt_size);
681
682 dmp->mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ,
683 mbuf_priv_data_len,
684 mbuf_size,
43307ad0 685 socket_id);
24e78f93 686
43307ad0 687 if (dmp->mp) {
24e78f93
IM
688 VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
689 mp_name, n_mbufs);
837c1761 690 /* rte_pktmbuf_pool_create has done some initialization of the
43307ad0
IS
691 * rte_mbuf part of each dp_packet, while ovs_rte_pktmbuf_init
692 * initializes some OVS specific fields of dp_packet.
693 */
694 rte_mempool_obj_iter(dmp->mp, ovs_rte_pktmbuf_init, NULL);
695 return dmp;
d555d9bd
RW
696 } else if (rte_errno == EEXIST) {
697 /* A mempool with the same name already exists. We just
698 * retrieve its pointer to be returned to the caller. */
43307ad0 699 dmp->mp = rte_mempool_lookup(mp_name);
d555d9bd
RW
700 /* As the mempool create returned EEXIST we can expect the
701 * lookup has returned a valid pointer. If for some reason
702 * that's not the case we keep track of it. */
24e78f93 703 VLOG_DBG("A mempool with name \"%s\" already exists at %p.",
43307ad0
IS
704 mp_name, dmp->mp);
705 return dmp;
d555d9bd 706 } else {
43307ad0
IS
707 VLOG_DBG("Failed to create mempool \"%s\" with a request of "
708 "%u mbufs, retrying with %u mbufs",
709 mp_name, n_mbufs, n_mbufs / 2);
0c6f39e5 710 }
43307ad0 711 } while (!dmp->mp && rte_errno == ENOMEM && (n_mbufs /= 2) >= MIN_NB_MBUF);
2ae3d542 712
43307ad0
IS
713 VLOG_ERR("Failed to create mempool \"%s\" with a request of %u mbufs",
714 mp_name, n_mbufs);
715
716 rte_free(dmp);
717 return NULL;
8a9562d2
PS
718}
719
43307ad0
IS
720static struct dpdk_mp *
721dpdk_mp_get(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
8a9562d2 722{
43307ad0
IS
723 struct dpdk_mp *dmp, *next;
724 bool reuse = false;
8a9562d2 725
c2adb102 726 ovs_mutex_lock(&dpdk_mp_mutex);
43307ad0
IS
727 /* Check if shared memory is being used; if so, check existing mempools
728 * to see if reuse is possible. */
729 if (!per_port_mp) {
730 LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
731 if (dmp->socket_id == dev->requested_socket_id
732 && dmp->mtu == mtu) {
733 VLOG_DBG("Reusing mempool \"%s\"", dmp->mp->name);
734 dmp->refcount++;
735 reuse = true;
736 break;
737 }
738 }
739 }
740 /* Sweep mempools after reuse or before create. */
741 dpdk_mp_sweep();
91fccdad 742
43307ad0
IS
743 if (!reuse) {
744 dmp = dpdk_mp_create(dev, mtu, per_port_mp);
91fccdad 745 if (dmp) {
43307ad0
IS
746 /* Shared memory will hit the reuse case above so will not
747 * request a mempool that already exists but we need to check
748 * for the EEXIST case for per port memory case. Compare the
749 * mempool returned by dmp to each entry in dpdk_mp_list. If a
750 * match is found, free dmp as a new entry is not required, set
751 * dmp to point to the existing entry and increment the refcount
752 * to avoid being freed at a later stage.
753 */
754 if (per_port_mp && rte_errno == EEXIST) {
755 LIST_FOR_EACH (next, list_node, &dpdk_mp_list) {
756 if (dmp->mp == next->mp) {
757 rte_free(dmp);
758 dmp = next;
759 dmp->refcount++;
760 }
761 }
762 } else {
763 ovs_list_push_back(&dpdk_mp_list, &dmp->list_node);
764 }
91fccdad
KT
765 }
766 }
43307ad0 767
43307ad0
IS
768 ovs_mutex_unlock(&dpdk_mp_mutex);
769
770 return dmp;
771}
772
773/* Decrement reference to a mempool. */
774static void
775dpdk_mp_put(struct dpdk_mp *dmp)
776{
777 if (!dmp) {
778 return;
779 }
780
781 ovs_mutex_lock(&dpdk_mp_mutex);
782 ovs_assert(dmp->refcount);
783 dmp->refcount--;
c2adb102 784 ovs_mutex_unlock(&dpdk_mp_mutex);
8a9562d2
PS
785}
786
43307ad0
IS
787/* Depending on the memory model being used this function tries to
788 * identify and reuse an existing mempool or tries to allocate a new
789 * mempool on requested_socket_id with mbuf size corresponding to the
790 * requested_mtu. On success, a new configuration will be applied.
0072e931
MK
791 * On error, device will be left unchanged. */
792static int
793netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
0072e931
MK
794 OVS_REQUIRES(dev->mutex)
795{
796 uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
43307ad0 797 struct dpdk_mp *dmp;
24e78f93 798 int ret = 0;
43307ad0 799 bool per_port_mp = dpdk_per_port_memory();
0072e931 800
43307ad0
IS
801 /* With shared memory we do not need to configure a mempool if the MTU
802 * and socket ID have not changed; the previous configuration is still
803 * valid, so return 0. */
804 if (!per_port_mp && dev->mtu == dev->requested_mtu
805 && dev->socket_id == dev->requested_socket_id) {
806 return ret;
807 }
91fccdad 808
43307ad0
IS
809 dmp = dpdk_mp_get(dev, FRAME_LEN_TO_MTU(buf_size), per_port_mp);
810 if (!dmp) {
c67e46c0
MK
811 VLOG_ERR("Failed to create memory pool for netdev "
812 "%s, with MTU %d on socket %d: %s\n",
813 dev->up.name, dev->requested_mtu, dev->requested_socket_id,
814 rte_strerror(rte_errno));
24e78f93 815 ret = rte_errno;
0072e931 816 } else {
43307ad0
IS
817 /* Check for any pre-existing dpdk_mp for the device before accessing
818 * the associated mempool.
819 */
820 if (dev->dpdk_mp != NULL) {
821 /* A new MTU was requested; decrement the reference count for the
822 * device's current dpdk_mp. This is required even if a pointer to the
823 * same dpdk_mp is returned by dpdk_mp_get. The refcount for dmp
824 * has already been incremented by dpdk_mp_get at this stage so it
825 * must be decremented to keep an accurate refcount for the
826 * dpdk_mp.
827 */
828 dpdk_mp_put(dev->dpdk_mp);
829 }
830 dev->dpdk_mp = dmp;
0072e931
MK
831 dev->mtu = dev->requested_mtu;
832 dev->socket_id = dev->requested_socket_id;
833 dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
834 }
835
24e78f93 836 return ret;
0072e931
MK
837}
838
8a9562d2
PS
839static void
840check_link_status(struct netdev_dpdk *dev)
841{
842 struct rte_eth_link link;
843
844 rte_eth_link_get_nowait(dev->port_id, &link);
845
846 if (dev->link.link_status != link.link_status) {
3e912ffc 847 netdev_change_seq_changed(&dev->up);
8a9562d2
PS
848
849 dev->link_reset_cnt++;
850 dev->link = link;
851 if (dev->link.link_status) {
fa9f4eeb
IM
852 VLOG_DBG_RL(&rl,
853 "Port "DPDK_PORT_ID_FMT" Link Up - speed %u Mbps - %s",
58be5c0e 854 dev->port_id, (unsigned) dev->link.link_speed,
fa9f4eeb
IM
855 (dev->link.link_duplex == ETH_LINK_FULL_DUPLEX)
856 ? "full-duplex" : "half-duplex");
8a9562d2 857 } else {
fa9f4eeb
IM
858 VLOG_DBG_RL(&rl, "Port "DPDK_PORT_ID_FMT" Link Down",
859 dev->port_id);
8a9562d2
PS
860 }
861 }
862}
863
864static void *
865dpdk_watchdog(void *dummy OVS_UNUSED)
866{
867 struct netdev_dpdk *dev;
868
869 pthread_detach(pthread_self());
870
871 for (;;) {
872 ovs_mutex_lock(&dpdk_mutex);
873 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
874 ovs_mutex_lock(&dev->mutex);
1f5b157e
IM
875 if (dev->type == DPDK_DEV_ETH) {
876 check_link_status(dev);
877 }
8a9562d2
PS
878 ovs_mutex_unlock(&dev->mutex);
879 }
880 ovs_mutex_unlock(&dpdk_mutex);
881 xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
882 }
883
884 return NULL;
885}
886
b98d7669 887static int
f8b64a61 888dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
b98d7669
DDP
889{
890 int diag = 0;
891 int i;
0072e931 892 struct rte_eth_conf conf = port_conf;
65a87968 893 struct rte_eth_dev_info info;
4dd16ca0 894 uint16_t conf_mtu;
65a87968 895
03f3f9c0
OM
896 rte_eth_dev_info_get(dev->port_id, &info);
897
65a87968 898 /* As of DPDK 17.11.1 a few PMDs require explicitly enabling
03f3f9c0
OM
899 * scatter to support jumbo RX.
900 * Setting scatter for the device is done after checking for
901 * scatter support in the device capabilities. */
0072e931 902 if (dev->mtu > ETHER_MTU) {
03f3f9c0
OM
903 if (dev->hw_ol_features & NETDEV_RX_HW_SCATTER) {
904 conf.rxmode.offloads |= DEV_RX_OFFLOAD_SCATTER;
65a87968 905 }
0072e931 906 }
67fe6d63 907
f8b64a61 908 conf.intr_conf.lsc = dev->lsc_interrupt_mode;
e10ca8b9 909
03f3f9c0
OM
910 if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
911 conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
912 }
913
914 if (!(dev->hw_ol_features & NETDEV_RX_HW_CRC_STRIP)
915 && info.rx_offload_capa & DEV_RX_OFFLOAD_KEEP_CRC) {
916 conf.rxmode.offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
e10ca8b9
MW
917 }
918
03f3f9c0
OM
919 /* Limit configured rss hash functions to only those supported
920 * by the eth device. */
921 conf.rx_adv_conf.rss_conf.rss_hf &= info.flow_type_rss_offloads;
922
b98d7669
DDP
923 /* A device may report more queues than it makes available (this has
924 * been observed for Intel xl710, which reserves some of them for
925 * SRIOV): rte_eth_*_queue_setup will fail if a queue is not
926 * available. When this happens we can retry the configuration
927 * and request fewer queues. */
928 while (n_rxq && n_txq) {
929 if (diag) {
930 VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
931 }
932
0072e931 933 diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
b98d7669 934 if (diag) {
0072e931
MK
935 VLOG_WARN("Interface %s eth_dev setup error %s\n",
936 dev->up.name, rte_strerror(-diag));
b98d7669
DDP
937 break;
938 }
939
67fe6d63
MK
940 diag = rte_eth_dev_set_mtu(dev->port_id, dev->mtu);
941 if (diag) {
4dd16ca0
IS
942 /* A device may not support rte_eth_dev_set_mtu; in this case log a
943 * warning to the user and include the device's configured
944 * MTU value that will be used instead. */
945 if (-ENOTSUP == diag) {
946 rte_eth_dev_get_mtu(dev->port_id, &conf_mtu);
947 VLOG_WARN("Interface %s does not support MTU configuration, "
948 "max packet size supported is %"PRIu16".",
949 dev->up.name, conf_mtu);
950 } else {
951 VLOG_ERR("Interface %s MTU (%d) setup error: %s",
952 dev->up.name, dev->mtu, rte_strerror(-diag));
953 break;
954 }
67fe6d63
MK
955 }
956
b98d7669 957 for (i = 0; i < n_txq; i++) {
b685696b 958 diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
b98d7669
DDP
959 dev->socket_id, NULL);
960 if (diag) {
1dfebee9 961 VLOG_INFO("Interface %s unable to setup txq(%d): %s",
b98d7669
DDP
962 dev->up.name, i, rte_strerror(-diag));
963 break;
964 }
965 }
966
967 if (i != n_txq) {
968 /* Retry with less tx queues */
969 n_txq = i;
970 continue;
971 }
972
973 for (i = 0; i < n_rxq; i++) {
b685696b 974 diag = rte_eth_rx_queue_setup(dev->port_id, i, dev->rxq_size,
43307ad0
IS
975 dev->socket_id, NULL,
976 dev->dpdk_mp->mp);
b98d7669 977 if (diag) {
1dfebee9 978 VLOG_INFO("Interface %s unable to setup rxq(%d): %s",
b98d7669
DDP
979 dev->up.name, i, rte_strerror(-diag));
980 break;
981 }
982 }
983
984 if (i != n_rxq) {
985 /* Retry with less rx queues */
986 n_rxq = i;
987 continue;
988 }
989
990 dev->up.n_rxq = n_rxq;
81acebda 991 dev->up.n_txq = n_txq;
b98d7669
DDP
992
993 return 0;
994 }
995
996 return diag;
997}
998
9fd39370
SC
999static void
1000dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex)
1001{
1002 if (rte_eth_dev_flow_ctrl_set(dev->port_id, &dev->fc_conf)) {
fa9f4eeb 1003 VLOG_WARN("Failed to enable flow control on device "DPDK_PORT_ID_FMT,
bb37956a 1004 dev->port_id);
9fd39370
SC
1005 }
1006}
b98d7669 1007
8a9562d2 1008static int
c2adb102
IM
1009dpdk_eth_dev_init(struct netdev_dpdk *dev)
1010 OVS_REQUIRES(dev->mutex)
8a9562d2
PS
1011{
1012 struct rte_pktmbuf_pool_private *mbp_priv;
a0cb2d66 1013 struct rte_eth_dev_info info;
8a9562d2
PS
1014 struct ether_addr eth_addr;
1015 int diag;
b98d7669 1016 int n_rxq, n_txq;
d4f5282c
KT
1017 uint32_t rx_chksm_offload_capa = DEV_RX_OFFLOAD_UDP_CKSUM |
1018 DEV_RX_OFFLOAD_TCP_CKSUM |
1019 DEV_RX_OFFLOAD_IPV4_CKSUM;
8a9562d2 1020
a0cb2d66 1021 rte_eth_dev_info_get(dev->port_id, &info);
a0cb2d66 1022
e10ca8b9
MW
1023 if (strstr(info.driver_name, "vf") != NULL) {
1024 VLOG_INFO("Virtual function detected, HW_CRC_STRIP will be enabled");
1025 dev->hw_ol_features |= NETDEV_RX_HW_CRC_STRIP;
1026 } else {
1027 dev->hw_ol_features &= ~NETDEV_RX_HW_CRC_STRIP;
1028 }
1029
d4f5282c
KT
1030 if ((info.rx_offload_capa & rx_chksm_offload_capa) !=
1031 rx_chksm_offload_capa) {
fa9f4eeb
IM
1032 VLOG_WARN("Rx checksum offload is not supported on port "
1033 DPDK_PORT_ID_FMT, dev->port_id);
d4f5282c
KT
1034 dev->hw_ol_features &= ~NETDEV_RX_CHECKSUM_OFFLOAD;
1035 } else {
1036 dev->hw_ol_features |= NETDEV_RX_CHECKSUM_OFFLOAD;
1037 }
1038
03f3f9c0
OM
1039 if (info.rx_offload_capa & DEV_RX_OFFLOAD_SCATTER) {
1040 dev->hw_ol_features |= NETDEV_RX_HW_SCATTER;
1041 } else {
1042 /* Do not warn on lack of scatter support */
1043 dev->hw_ol_features &= ~NETDEV_RX_HW_SCATTER;
1044 }
1045
b98d7669
DDP
1046 n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
1047 n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
1048
f8b64a61 1049 diag = dpdk_eth_dev_port_config(dev, n_rxq, n_txq);
8a9562d2 1050 if (diag) {
f8b64a61
RM
1051 VLOG_ERR("Interface %s(rxq:%d txq:%d lsc interrupt mode:%s) "
1052 "configure error: %s",
1053 dev->up.name, n_rxq, n_txq,
1054 dev->lsc_interrupt_mode ? "true" : "false",
1055 rte_strerror(-diag));
95fb793a 1056 return -diag;
8a9562d2
PS
1057 }
1058
8a9562d2
PS
1059 diag = rte_eth_dev_start(dev->port_id);
1060 if (diag) {
b98d7669
DDP
1061 VLOG_ERR("Interface %s start error: %s", dev->up.name,
1062 rte_strerror(-diag));
95fb793a 1063 return -diag;
8a9562d2 1064 }
606f6650 1065 dev->started = true;
8a9562d2
PS
1066
1067 rte_eth_promiscuous_enable(dev->port_id);
1068 rte_eth_allmulticast_enable(dev->port_id);
1069
1070 memset(&eth_addr, 0x0, sizeof(eth_addr));
1071 rte_eth_macaddr_get(dev->port_id, &eth_addr);
fa9f4eeb
IM
1072 VLOG_INFO_RL(&rl, "Port "DPDK_PORT_ID_FMT": "ETH_ADDR_FMT,
1073 dev->port_id, ETH_ADDR_BYTES_ARGS(eth_addr.addr_bytes));
8a9562d2 1074
ca92d173 1075 memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
8a9562d2
PS
1076 rte_eth_link_get_nowait(dev->port_id, &dev->link);
1077
43307ad0 1078 mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
8a9562d2 1079 dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;
8a9562d2
PS
1080 return 0;
1081}
1082
1083static struct netdev_dpdk *
1084netdev_dpdk_cast(const struct netdev *netdev)
1085{
1086 return CONTAINER_OF(netdev, struct netdev_dpdk, up);
1087}
1088
1089static struct netdev *
1090netdev_dpdk_alloc(void)
1091{
bab69409
AC
1092 struct netdev_dpdk *dev;
1093
65e19e70
DDP
1094 dev = dpdk_rte_mzalloc(sizeof *dev);
1095 if (dev) {
1096 return &dev->up;
bab69409 1097 }
65e19e70 1098
bab69409 1099 return NULL;
8a9562d2
PS
1100}
1101
eff23640
DDP
1102static struct dpdk_tx_queue *
1103netdev_dpdk_alloc_txq(unsigned int n_txqs)
5a034064 1104{
eff23640 1105 struct dpdk_tx_queue *txqs;
bd5131ba 1106 unsigned i;
5a034064 1107
eff23640
DDP
1108 txqs = dpdk_rte_mzalloc(n_txqs * sizeof *txqs);
1109 if (txqs) {
1110 for (i = 0; i < n_txqs; i++) {
1111 /* Initialize map for vhost devices. */
1112 txqs[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
1113 rte_spinlock_init(&txqs[i].tx_lock);
1114 }
5a034064 1115 }
eff23640
DDP
1116
1117 return txqs;
5a034064
AW
1118}
1119
8a9562d2 1120static int
bb37956a 1121common_construct(struct netdev *netdev, dpdk_port_t port_no,
1ce30dfd 1122 enum dpdk_dev_type type, int socket_id)
5a034064 1123 OVS_REQUIRES(dpdk_mutex)
8a9562d2 1124{
d46285a2 1125 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 1126
d46285a2 1127 ovs_mutex_init(&dev->mutex);
8a9562d2 1128
d46285a2 1129 rte_spinlock_init(&dev->stats_lock);
45d947c4 1130
1b7a04e0
AW
1131 /* If the 'sid' is negative, it means that the kernel failed
1132 * to obtain the PCI NUMA info. In that situation, always
1133 * use 'SOCKET0'. */
1ce30dfd 1134 dev->socket_id = socket_id < 0 ? SOCKET0 : socket_id;
db8f13b0 1135 dev->requested_socket_id = dev->socket_id;
d46285a2
DDP
1136 dev->port_id = port_no;
1137 dev->type = type;
1138 dev->flags = 0;
7f381c2e 1139 dev->requested_mtu = ETHER_MTU;
d46285a2 1140 dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
f8b64a61 1141 dev->requested_lsc_interrupt_mode = 0;
0a0f39df
CL
1142 ovsrcu_index_init(&dev->vid, -1);
1143 dev->vhost_reconfigured = false;
5dcde09c 1144 dev->attached = false;
8a9562d2 1145
78bd47cf 1146 ovsrcu_init(&dev->qos_conf, NULL);
0bf765f7 1147
9509913a
IS
1148 ovsrcu_init(&dev->ingress_policer, NULL);
1149 dev->policer_rate = 0;
1150 dev->policer_burst = 0;
1151
7f381c2e
DDP
1152 netdev->n_rxq = 0;
1153 netdev->n_txq = 0;
1154 dev->requested_n_rxq = NR_QUEUE;
1155 dev->requested_n_txq = NR_QUEUE;
1156 dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
1157 dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;
58397e6c 1158
9fd39370
SC
1159 /* Zero-initialize the flow control configuration. */
1160 memset(&dev->fc_conf, 0, sizeof dev->fc_conf);
1a2bb118
SC
1161
1162 /* Initialize the hardware offload flags to 0 */
1163 dev->hw_ol_features = 0;
3b1fb077
DDP
1164
1165 dev->flags = NETDEV_UP | NETDEV_PROMISC;
1166
d46285a2 1167 ovs_list_push_back(&dpdk_list, &dev->list_node);
8a9562d2 1168
7f381c2e
DDP
1169 netdev_request_reconfigure(netdev);
1170
971f4b39
MW
1171 dev->rte_xstats_names = NULL;
1172 dev->rte_xstats_names_size = 0;
1173
1174 dev->rte_xstats_ids = NULL;
1175 dev->rte_xstats_ids_size = 0;
1176
c161357d
KT
1177 dev->tx_retries = 0;
1178
1ce30dfd 1179 return 0;
95fb793a 1180}
1181
b83a2df1
MV
1182/* dev_name must be the prefix followed by a positive decimal number.
1183 * (no leading + or - signs are allowed) */
95fb793a 1184static int
1185dpdk_dev_parse_name(const char dev_name[], const char prefix[],
1186 unsigned int *port_no)
1187{
1188 const char *cport;
1189
1190 if (strncmp(dev_name, prefix, strlen(prefix))) {
1191 return ENODEV;
1192 }
1193
1194 cport = dev_name + strlen(prefix);
b83a2df1
MV
1195
1196 if (str_to_uint(cport, 10, port_no)) {
1197 return 0;
1198 } else {
1199 return ENODEV;
1200 }
95fb793a 1201}
1202
40e940e4
OM
1203/* Get the number of OVS interfaces which have the same DPDK
1204 * rte device (e.g. same pci bus address).
1205 * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
1206 */
1207static int
1208netdev_dpdk_get_num_ports(struct rte_device *device)
1209 OVS_REQUIRES(dpdk_mutex)
1210{
1211 struct netdev_dpdk *dev;
1212 int count = 0;
1213
1214 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
1215 if (rte_eth_devices[dev->port_id].device == device
1216 && rte_eth_devices[dev->port_id].state != RTE_ETH_DEV_UNUSED) {
1217 count++;
1218 }
1219 }
1220 return count;
1221}
1222
1ce30dfd
DDP
1223static int
1224vhost_common_construct(struct netdev *netdev)
1225 OVS_REQUIRES(dpdk_mutex)
1226{
1227 int socket_id = rte_lcore_to_socket_id(rte_get_master_lcore());
1228 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1229
35c91567
DM
1230 dev->vhost_rxq_enabled = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM *
1231 sizeof *dev->vhost_rxq_enabled);
1232 if (!dev->vhost_rxq_enabled) {
1233 return ENOMEM;
1234 }
1ce30dfd
DDP
1235 dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
1236 if (!dev->tx_q) {
35c91567 1237 rte_free(dev->vhost_rxq_enabled);
1ce30dfd
DDP
1238 return ENOMEM;
1239 }
1240
080f080c
KT
1241 atomic_init(&dev->vhost_tx_retries_max, VHOST_ENQ_RETRY_DEF);
1242
bb37956a
IM
1243 return common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
1244 DPDK_DEV_VHOST, socket_id);
1ce30dfd
DDP
1245}
1246
7d1ced01 1247static int
53f50d24 1248netdev_dpdk_vhost_construct(struct netdev *netdev)
7d1ced01 1249{
d46285a2
DDP
1250 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1251 const char *name = netdev->name;
7d1ced01 1252 int err;
a0cb2d66 1253
1af27e8a
DDP
1254 /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
1255 * the file system. '/' or '\' would traverse directories, so they're not
1256 * acceptable in 'name'. */
1257 if (strchr(name, '/') || strchr(name, '\\')) {
1258 VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
1259 "A valid name must not include '/' or '\\'",
1260 name);
1261 return EINVAL;
1262 }
1263
7d1ced01
CL
1264 ovs_mutex_lock(&dpdk_mutex);
1265 /* Take the name of the vhost-user port and append it to the location where
2d24d165 1266 * the socket is to be created, then register the socket.
7d1ced01 1267 */
bb9d2623 1268 dev->vhost_id = xasprintf("%s/%s", dpdk_get_vhost_sock_dir(), name);
1af27e8a 1269
2d24d165
CL
1270 dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
1271 err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags);
7d1ced01
CL
1272 if (err) {
1273 VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
2d24d165 1274 dev->vhost_id);
f3e7ec25 1275 goto out;
e5c0f5a4 1276 } else {
2d24d165
CL
1277 fatal_signal_add_file_to_unlink(dev->vhost_id);
1278 VLOG_INFO("Socket %s created for vhost-user port %s\n",
1279 dev->vhost_id, name);
1280 }
f3e7ec25
MW
1281
1282 err = rte_vhost_driver_callback_register(dev->vhost_id,
1283 &virtio_net_device_ops);
1284 if (err) {
1285 VLOG_ERR("rte_vhost_driver_callback_register failed for vhost user "
1286 "port: %s\n", name);
1287 goto out;
1288 }
1289
1290 err = rte_vhost_driver_disable_features(dev->vhost_id,
1291 1ULL << VIRTIO_NET_F_HOST_TSO4
1292 | 1ULL << VIRTIO_NET_F_HOST_TSO6
1293 | 1ULL << VIRTIO_NET_F_CSUM);
1294 if (err) {
1295 VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
1296 "port: %s\n", name);
1297 goto out;
1298 }
1299
1300 err = rte_vhost_driver_start(dev->vhost_id);
1301 if (err) {
1302 VLOG_ERR("rte_vhost_driver_start failed for vhost user "
1303 "port: %s\n", name);
1304 goto out;
1305 }
1306
1ce30dfd 1307 err = vhost_common_construct(netdev);
f3e7ec25
MW
1308 if (err) {
1309 VLOG_ERR("vhost_common_construct failed for vhost user "
1310 "port: %s\n", name);
1311 }
2d24d165 1312
f3e7ec25 1313out:
bb9d2623
IM
1314 if (err) {
1315 free(dev->vhost_id);
1316 dev->vhost_id = NULL;
1317 }
1318
2d24d165 1319 ovs_mutex_unlock(&dpdk_mutex);
28ca969e
AC
1320 VLOG_WARN_ONCE("dpdkvhostuser ports are considered deprecated; "
1321 "please migrate to dpdkvhostuserclient ports.");
2d24d165
CL
1322 return err;
1323}
1324
1325static int
1326netdev_dpdk_vhost_client_construct(struct netdev *netdev)
1327{
1328 int err;
1329
2d24d165 1330 ovs_mutex_lock(&dpdk_mutex);
1ce30dfd 1331 err = vhost_common_construct(netdev);
f3e7ec25
MW
1332 if (err) {
1333 VLOG_ERR("vhost_common_construct failed for vhost user client"
1334 "port: %s\n", netdev->name);
1335 }
7d1ced01 1336 ovs_mutex_unlock(&dpdk_mutex);
58397e6c
KT
1337 return err;
1338}
1339
95fb793a 1340static int
1341netdev_dpdk_construct(struct netdev *netdev)
1342{
95fb793a 1343 int err;
1344
95fb793a 1345 ovs_mutex_lock(&dpdk_mutex);
bb37956a
IM
1346 err = common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
1347 DPDK_DEV_ETH, SOCKET0);
8a9562d2
PS
1348 ovs_mutex_unlock(&dpdk_mutex);
1349 return err;
1350}
1351
1ce30dfd
DDP
1352static void
1353common_destruct(struct netdev_dpdk *dev)
1354 OVS_REQUIRES(dpdk_mutex)
1355 OVS_EXCLUDED(dev->mutex)
1356{
1357 rte_free(dev->tx_q);
43307ad0 1358 dpdk_mp_put(dev->dpdk_mp);
1ce30dfd
DDP
1359
1360 ovs_list_remove(&dev->list_node);
1361 free(ovsrcu_get_protected(struct ingress_policer *,
1362 &dev->ingress_policer));
1363 ovs_mutex_destroy(&dev->mutex);
1364}
1365
8a9562d2 1366static void
d46285a2 1367netdev_dpdk_destruct(struct netdev *netdev)
8a9562d2 1368{
d46285a2 1369 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
40e940e4
OM
1370 struct rte_device *rte_dev;
1371 struct rte_eth_dev *eth_dev;
1372 bool remove_on_close;
8a9562d2 1373
8d38823b 1374 ovs_mutex_lock(&dpdk_mutex);
8d38823b 1375
8a9562d2 1376 rte_eth_dev_stop(dev->port_id);
606f6650 1377 dev->started = false;
5dcde09c
IM
1378
1379 if (dev->attached) {
40e940e4
OM
1380 /* Retrieve eth device data before closing it.
1381 * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
1382 */
1383 eth_dev = &rte_eth_devices[dev->port_id];
1384 remove_on_close =
1385 eth_dev->data &&
1386 (eth_dev->data->dev_flags & RTE_ETH_DEV_CLOSE_REMOVE);
1387 rte_dev = eth_dev->device;
1388
1389 /* Remove the eth device. */
5dcde09c 1390 rte_eth_dev_close(dev->port_id);
40e940e4
OM
1391
1392 /* Remove this rte device and all its eth devices if flag
1393 * RTE_ETH_DEV_CLOSE_REMOVE is not supported (which means representors
1394 * are not supported), or if all the eth devices belonging to the rte
1395 * device are closed.
1396 */
1397 if (!remove_on_close || !netdev_dpdk_get_num_ports(rte_dev)) {
595ce47c
IM
1398 int ret = rte_dev_remove(rte_dev);
1399
1400 if (ret < 0) {
1401 VLOG_ERR("Device '%s' can not be detached: %s.",
1402 dev->devargs, rte_strerror(-ret));
40e940e4
OM
1403 } else {
1404 /* Device was closed and detached. */
1405 VLOG_INFO("Device '%s' has been removed and detached",
1406 dev->devargs);
1407 }
5dcde09c 1408 } else {
40e940e4
OM
1409 /* Device was only closed. rte_dev_remove() was not called. */
1410 VLOG_INFO("Device '%s' has been removed", dev->devargs);
5dcde09c
IM
1411 }
1412 }
1413
ac1a9bb9 1414 netdev_dpdk_clear_xstats(dev);
55e075e6 1415 free(dev->devargs);
1ce30dfd 1416 common_destruct(dev);
8d38823b 1417
8a9562d2 1418 ovs_mutex_unlock(&dpdk_mutex);
58397e6c 1419}
8a9562d2 1420
3f891bbe
DDP
1421/* rte_vhost_driver_unregister() can call back destroy_device(), which will
1422 * try to acquire 'dpdk_mutex' and possibly 'dev->mutex'. To avoid a
1423 * deadlock, none of the mutexes must be held while calling this function. */
1424static int
c1ff66ac
CL
1425dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
1426 char *vhost_id)
3f891bbe
DDP
1427 OVS_EXCLUDED(dpdk_mutex)
1428 OVS_EXCLUDED(dev->mutex)
1429{
c1ff66ac 1430 return rte_vhost_driver_unregister(vhost_id);
3f891bbe
DDP
1431}
1432
58397e6c 1433static void
d46285a2 1434netdev_dpdk_vhost_destruct(struct netdev *netdev)
58397e6c 1435{
d46285a2 1436 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
c1ff66ac 1437 char *vhost_id;
58397e6c 1438
8d38823b 1439 ovs_mutex_lock(&dpdk_mutex);
8d38823b 1440
c62da695 1441 /* Guest becomes an orphan if still attached. */
c1ff66ac
CL
1442 if (netdev_dpdk_get_vid(dev) >= 0
1443 && !(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
c62da695 1444 VLOG_ERR("Removing port '%s' while vhost device still attached.",
d46285a2 1445 netdev->name);
58be5c0e
MK
1446 VLOG_ERR("To restore connectivity after re-adding of port, VM on "
1447 "socket '%s' must be restarted.", dev->vhost_id);
58397e6c
KT
1448 }
1449
bb9d2623
IM
1450 vhost_id = dev->vhost_id;
1451 dev->vhost_id = NULL;
35c91567 1452 rte_free(dev->vhost_rxq_enabled);
c1ff66ac 1453
1ce30dfd
DDP
1454 common_destruct(dev);
1455
58397e6c 1456 ovs_mutex_unlock(&dpdk_mutex);
3f891bbe 1457
bb9d2623 1458 if (!vhost_id) {
821b8664
IM
1459 goto out;
1460 }
1461
c1ff66ac 1462 if (dpdk_vhost_driver_unregister(dev, vhost_id)) {
41964543
IM
1463 VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
1464 netdev->name, vhost_id);
c1ff66ac
CL
1465 } else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
1466 /* OVS server mode - remove this socket from list for deletion */
1467 fatal_signal_remove_file_to_unlink(vhost_id);
3f891bbe 1468 }
821b8664 1469out:
c1ff66ac 1470 free(vhost_id);
8a9562d2
PS
1471}
1472
1473static void
d46285a2 1474netdev_dpdk_dealloc(struct netdev *netdev)
8a9562d2 1475{
d46285a2 1476 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 1477
d46285a2 1478 rte_free(dev);
8a9562d2
PS
1479}
1480
971f4b39 1481static void
ac1a9bb9 1482netdev_dpdk_clear_xstats(struct netdev_dpdk *dev)
971f4b39
MW
1483{
1484 /* If statistics are already allocated, we have to
1485 * reconfigure, as port_id could have been changed. */
1486 if (dev->rte_xstats_names) {
1487 free(dev->rte_xstats_names);
1488 dev->rte_xstats_names = NULL;
1489 dev->rte_xstats_names_size = 0;
1490 }
1491 if (dev->rte_xstats_ids) {
1492 free(dev->rte_xstats_ids);
1493 dev->rte_xstats_ids = NULL;
1494 dev->rte_xstats_ids_size = 0;
1495 }
1496}
1497
1498static const char*
1499netdev_dpdk_get_xstat_name(struct netdev_dpdk *dev, uint64_t id)
1500{
1501 if (id >= dev->rte_xstats_names_size) {
1502 return "UNKNOWN";
1503 }
1504 return dev->rte_xstats_names[id].name;
1505}
1506
1507static bool
1508netdev_dpdk_configure_xstats(struct netdev_dpdk *dev)
1509 OVS_REQUIRES(dev->mutex)
1510{
1511 int rte_xstats_len;
1512 bool ret;
1513 struct rte_eth_xstat *rte_xstats;
1514 uint64_t id;
1515 int xstats_no;
1516 const char *name;
1517
1518 /* Retrieve all XSTATS names. If something goes wrong, or the
1519 * number of counters is zero, the rte_xstats_names buffer will be
1520 * set to NULL, and any further xstats queries won't be
1521 * performed (e.g. during netdev_dpdk_get_stats
1522 * execution). */
1523
1524 ret = false;
1525 rte_xstats = NULL;
1526
1527 if (dev->rte_xstats_names == NULL || dev->rte_xstats_ids == NULL) {
1528 dev->rte_xstats_names_size =
1529 rte_eth_xstats_get_names(dev->port_id, NULL, 0);
1530
1531 if (dev->rte_xstats_names_size < 0) {
fa9f4eeb
IM
1532 VLOG_WARN("Cannot get XSTATS for port: "DPDK_PORT_ID_FMT,
1533 dev->port_id);
971f4b39
MW
1534 dev->rte_xstats_names_size = 0;
1535 } else {
1536 /* Reserve memory for xstats names and values */
1537 dev->rte_xstats_names = xcalloc(dev->rte_xstats_names_size,
1538 sizeof *dev->rte_xstats_names);
1539
1540 if (dev->rte_xstats_names) {
1541 /* Retrieve xstats names */
1542 rte_xstats_len =
1543 rte_eth_xstats_get_names(dev->port_id,
1544 dev->rte_xstats_names,
1545 dev->rte_xstats_names_size);
1546
1547 if (rte_xstats_len < 0) {
fa9f4eeb
IM
1548 VLOG_WARN("Cannot get XSTATS names for port: "
1549 DPDK_PORT_ID_FMT, dev->port_id);
971f4b39
MW
1550 goto out;
1551 } else if (rte_xstats_len != dev->rte_xstats_names_size) {
fa9f4eeb
IM
1552 VLOG_WARN("XSTATS size doesn't match for port: "
1553 DPDK_PORT_ID_FMT, dev->port_id);
971f4b39
MW
1554 goto out;
1555 }
1556
1557 dev->rte_xstats_ids = xcalloc(dev->rte_xstats_names_size,
1558 sizeof(uint64_t));
1559
1560 /* We have to calculate the number of counters */
1561 rte_xstats = xmalloc(rte_xstats_len * sizeof *rte_xstats);
1562 memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);
1563
1564 /* Retrieve xstats values */
1565 if (rte_eth_xstats_get(dev->port_id, rte_xstats,
1566 rte_xstats_len) > 0) {
1567 dev->rte_xstats_ids_size = 0;
1568 xstats_no = 0;
1569 for (uint32_t i = 0; i < rte_xstats_len; i++) {
1570 id = rte_xstats[i].id;
1571 name = netdev_dpdk_get_xstat_name(dev, id);
1572 /* We need to filter out everything except
1573 * dropped, error and management counters */
1574 if (string_ends_with(name, "_errors") ||
1575 strstr(name, "_management_") ||
1576 string_ends_with(name, "_dropped")) {
1577
1578 dev->rte_xstats_ids[xstats_no] = id;
1579 xstats_no++;
1580 }
1581 }
1582 dev->rte_xstats_ids_size = xstats_no;
1583 ret = true;
1584 } else {
fa9f4eeb
IM
1585 VLOG_WARN("Can't get XSTATS IDs for port: "
1586 DPDK_PORT_ID_FMT, dev->port_id);
971f4b39 1587 }
34eb0863
IM
1588
1589 free(rte_xstats);
971f4b39
MW
1590 }
1591 }
1592 } else {
1593 /* Already configured */
1594 ret = true;
1595 }
1596
1597out:
1598 if (!ret) {
1599 netdev_dpdk_clear_xstats(dev);
1600 }
1601 return ret;
1602}
1603
8a9562d2 1604static int
a14b8947 1605netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
8a9562d2 1606{
a14b8947 1607 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
1608
1609 ovs_mutex_lock(&dev->mutex);
1610
050c60bf 1611 smap_add_format(args, "requested_rx_queues", "%d", dev->requested_n_rxq);
a14b8947 1612 smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq);
81acebda
IM
1613 smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq);
1614 smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq);
0072e931 1615 smap_add_format(args, "mtu", "%d", dev->mtu);
451f26fd
IM
1616
1617 if (dev->type == DPDK_DEV_ETH) {
1618 smap_add_format(args, "requested_rxq_descriptors", "%d",
1619 dev->requested_rxq_size);
1620 smap_add_format(args, "configured_rxq_descriptors", "%d",
1621 dev->rxq_size);
1622 smap_add_format(args, "requested_txq_descriptors", "%d",
1623 dev->requested_txq_size);
1624 smap_add_format(args, "configured_txq_descriptors", "%d",
1625 dev->txq_size);
1a2bb118
SC
1626 if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
1627 smap_add(args, "rx_csum_offload", "true");
8155ab7e
KT
1628 } else {
1629 smap_add(args, "rx_csum_offload", "false");
1a2bb118 1630 }
f8b64a61
RM
1631 smap_add(args, "lsc_interrupt_mode",
1632 dev->lsc_interrupt_mode ? "true" : "false");
451f26fd 1633 }
8a9562d2
PS
1634 ovs_mutex_unlock(&dev->mutex);
1635
1636 return 0;
1637}
1638
55e075e6 1639static struct netdev_dpdk *
bb37956a 1640netdev_dpdk_lookup_by_port_id(dpdk_port_t port_id)
55e075e6
CL
1641 OVS_REQUIRES(dpdk_mutex)
1642{
1643 struct netdev_dpdk *dev;
1644
1645 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
1646 if (dev->port_id == port_id) {
1647 return dev;
1648 }
1649 }
1650
1651 return NULL;
1652}
1653
5e758818
YL
1654static dpdk_port_t
1655netdev_dpdk_get_port_by_mac(const char *mac_str)
1656{
1657 dpdk_port_t port_id;
1658 struct eth_addr mac, port_mac;
1659
1660 if (!eth_addr_from_string(mac_str, &mac)) {
1661 VLOG_ERR("invalid mac: %s", mac_str);
1662 return DPDK_ETH_PORT_ID_INVALID;
1663 }
1664
1665 RTE_ETH_FOREACH_DEV (port_id) {
1666 struct ether_addr ea;
1667
1668 rte_eth_macaddr_get(port_id, &ea);
1669 memcpy(port_mac.ea, ea.addr_bytes, ETH_ADDR_LEN);
1670 if (eth_addr_equals(mac, port_mac)) {
1671 return port_id;
1672 }
1673 }
1674
1675 return DPDK_ETH_PORT_ID_INVALID;
1676}
1677
40e940e4
OM
1678/* Return the first DPDK port id matching the devargs pattern. */
1679static dpdk_port_t netdev_dpdk_get_port_by_devargs(const char *devargs)
1680 OVS_REQUIRES(dpdk_mutex)
1681{
1682 dpdk_port_t port_id;
1683 struct rte_dev_iterator iterator;
1684
1685 RTE_ETH_FOREACH_MATCHING_DEV (port_id, devargs, &iterator) {
1686 /* If a break is done - must call rte_eth_iterator_cleanup. */
1687 rte_eth_iterator_cleanup(&iterator);
1688 break;
1689 }
1690
1691 return port_id;
1692}
1693
5e758818 1694/*
40e940e4
OM
1695 * Normally, a PCI id (optionally followed by a representor number)
1696 * is enough for identifying a specific DPDK port.
5e758818
YL
1697 * However, some NICs have multiple ports sharing the same PCI
1698 * id, and for those the PCI id alone cannot identify the port.
1699 *
1700 * To fix that, here one more method is introduced: "class=eth,mac=$MAC".
1701 *
1702 * Note that the compatibility is fully kept: user can still use the
1703 * PCI id for adding ports (when it's enough for them).
1704 */
bb37956a 1705static dpdk_port_t
5dcde09c
IM
1706netdev_dpdk_process_devargs(struct netdev_dpdk *dev,
1707 const char *devargs, char **errp)
40e940e4 1708 OVS_REQUIRES(dpdk_mutex)
55e075e6 1709{
40e940e4 1710 dpdk_port_t new_port_id;
55e075e6 1711
5e758818
YL
1712 if (strncmp(devargs, "class=eth,mac=", 14) == 0) {
1713 new_port_id = netdev_dpdk_get_port_by_mac(&devargs[14]);
1714 } else {
40e940e4
OM
1715 new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
1716 if (!rte_eth_dev_is_valid_port(new_port_id)) {
5e758818 1717 /* Device not found in DPDK, attempt to attach it */
40e940e4 1718 if (rte_dev_probe(devargs)) {
5e758818 1719 new_port_id = DPDK_ETH_PORT_ID_INVALID;
40e940e4
OM
1720 } else {
1721 new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
1722 if (rte_eth_dev_is_valid_port(new_port_id)) {
1723 /* Attach successful */
1724 dev->attached = true;
1725 VLOG_INFO("Device '%s' attached to DPDK", devargs);
1726 } else {
1727 /* Attach unsuccessful */
1728 new_port_id = DPDK_ETH_PORT_ID_INVALID;
1729 }
5e758818 1730 }
55e075e6 1731 }
5e758818
YL
1732 }
1733
1734 if (new_port_id == DPDK_ETH_PORT_ID_INVALID) {
1735 VLOG_WARN_BUF(errp, "Error attaching device '%s' to DPDK", devargs);
55e075e6
CL
1736 }
1737
1738 return new_port_id;
1739}
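/* Illustrative devargs values (hypothetical, added to clarify the discussion
 * above):
 *
 *     "0000:08:00.1"                     - a plain PCI address
 *     "class=eth,mac=00:11:22:33:44:55"  - MAC based lookup
 *
 * Which additional forms (representor suffixes, vdev names, ...) are accepted
 * depends on the DPDK version in use. */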
1740
c3d062a7
CL
1741static void
1742dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args)
b614c894 1743 OVS_REQUIRES(dev->mutex)
a14b8947 1744{
050c60bf 1745 int new_n_rxq;
a14b8947 1746
2a21e757 1747 new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
050c60bf
DDP
1748 if (new_n_rxq != dev->requested_n_rxq) {
1749 dev->requested_n_rxq = new_n_rxq;
c3d062a7 1750 netdev_request_reconfigure(&dev->up);
050c60bf 1751 }
c3d062a7
CL
1752}
1753
b685696b
CL
1754static void
1755dpdk_process_queue_size(struct netdev *netdev, const struct smap *args,
1756 const char *flag, int default_size, int *new_size)
1757{
1758 int queue_size = smap_get_int(args, flag, default_size);
1759
1760 if (queue_size <= 0 || queue_size > NIC_PORT_MAX_Q_SIZE
1761 || !is_pow2(queue_size)) {
1762 queue_size = default_size;
1763 }
1764
1765 if (queue_size != *new_size) {
1766 *new_size = queue_size;
1767 netdev_request_reconfigure(netdev);
1768 }
1769}
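/* For example (hypothetical values): options:n_rxq_desc=2048 is kept as
 * requested (assuming it does not exceed NIC_PORT_MAX_Q_SIZE), while 1000 or
 * 0 silently falls back to the default because descriptor counts must be a
 * power of two within (0, NIC_PORT_MAX_Q_SIZE]. */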
1770
c3d062a7 1771static int
9fff138e
DDP
1772netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
1773 char **errp)
c3d062a7
CL
1774{
1775 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
f8b64a61 1776 bool rx_fc_en, tx_fc_en, autoneg, lsc_interrupt_mode;
c2c84474 1777 bool flow_control_requested = true;
b614c894
IM
1778 enum rte_eth_fc_mode fc_mode;
1779 static const enum rte_eth_fc_mode fc_mode_set[2][2] = {
1780 {RTE_FC_NONE, RTE_FC_TX_PAUSE},
1781 {RTE_FC_RX_PAUSE, RTE_FC_FULL }
1782 };
55e075e6
CL
1783 const char *new_devargs;
1784 int err = 0;
c3d062a7 1785
55e075e6 1786 ovs_mutex_lock(&dpdk_mutex);
c3d062a7
CL
1787 ovs_mutex_lock(&dev->mutex);
1788
1789 dpdk_set_rxq_config(dev, args);
1790
b685696b
CL
1791 dpdk_process_queue_size(netdev, args, "n_rxq_desc",
1792 NIC_PORT_DEFAULT_RXQ_SIZE,
1793 &dev->requested_rxq_size);
1794 dpdk_process_queue_size(netdev, args, "n_txq_desc",
1795 NIC_PORT_DEFAULT_TXQ_SIZE,
1796 &dev->requested_txq_size);
1797
55e075e6
CL
1798 new_devargs = smap_get(args, "dpdk-devargs");
1799
1800 if (dev->devargs && strcmp(new_devargs, dev->devargs)) {
1801 /* The user requested a new device. If we return error, the caller
1802 * will delete this netdev and try to recreate it. */
1803 err = EAGAIN;
1804 goto out;
1805 }
1806
1807 /* dpdk-devargs is required for device configuration */
1808 if (new_devargs && new_devargs[0]) {
1809 /* Don't process dpdk-devargs if value is unchanged and port id
1810 * is valid */
1811 if (!(dev->devargs && !strcmp(dev->devargs, new_devargs)
1812 && rte_eth_dev_is_valid_port(dev->port_id))) {
bb37956a
IM
1813 dpdk_port_t new_port_id = netdev_dpdk_process_devargs(dev,
1814 new_devargs,
1815 errp);
55e075e6
CL
1816 if (!rte_eth_dev_is_valid_port(new_port_id)) {
1817 err = EINVAL;
1818 } else if (new_port_id == dev->port_id) {
1819 /* Already configured, do not reconfigure again */
1820 err = 0;
1821 } else {
1822 struct netdev_dpdk *dup_dev;
bb37956a 1823
55e075e6
CL
1824 dup_dev = netdev_dpdk_lookup_by_port_id(new_port_id);
1825 if (dup_dev) {
9fff138e 1826 VLOG_WARN_BUF(errp, "'%s' is trying to use device '%s' "
40e940e4 1827 "which is already in use by '%s'",
9fff138e
DDP
1828 netdev_get_name(netdev), new_devargs,
1829 netdev_get_name(&dup_dev->up));
55e075e6
CL
1830 err = EADDRINUSE;
1831 } else {
bd4e172b 1832 int sid = rte_eth_dev_socket_id(new_port_id);
bb37956a 1833
bd4e172b 1834 dev->requested_socket_id = sid < 0 ? SOCKET0 : sid;
55e075e6
CL
1835 dev->devargs = xstrdup(new_devargs);
1836 dev->port_id = new_port_id;
1837 netdev_request_reconfigure(&dev->up);
971f4b39 1838 netdev_dpdk_clear_xstats(dev);
55e075e6
CL
1839 err = 0;
1840 }
1841 }
1842 }
1843 } else {
9fff138e
DDP
1844 VLOG_WARN_BUF(errp, "'%s' is missing 'options:dpdk-devargs'. "
1845 "The old 'dpdk<port_id>' names are not supported",
1846 netdev_get_name(netdev));
55e075e6
CL
1847 err = EINVAL;
1848 }
1849
1850 if (err) {
1851 goto out;
1852 }
1853
f8b64a61
RM
1854 lsc_interrupt_mode = smap_get_bool(args, "dpdk-lsc-interrupt", false);
1855 if (dev->requested_lsc_interrupt_mode != lsc_interrupt_mode) {
1856 dev->requested_lsc_interrupt_mode = lsc_interrupt_mode;
1857 netdev_request_reconfigure(netdev);
1858 }
1859
c3d062a7
CL
1860 rx_fc_en = smap_get_bool(args, "rx-flow-ctrl", false);
1861 tx_fc_en = smap_get_bool(args, "tx-flow-ctrl", false);
b614c894 1862 autoneg = smap_get_bool(args, "flow-ctrl-autoneg", false);
c3d062a7 1863
b614c894 1864 fc_mode = fc_mode_set[tx_fc_en][rx_fc_en];
c2c84474
TK
1865
1866 if (!smap_get(args, "rx-flow-ctrl") && !smap_get(args, "tx-flow-ctrl")
1867 && !smap_get(args, "flow-ctrl-autoneg")) {
1868 /* FIXME: User didn't ask for flow control configuration.
1869 * For now we'll not print a warning if flow control is not
1870 * supported by the DPDK port. */
1871 flow_control_requested = false;
1872 }
1873
1874 /* Get the Flow control configuration. */
1875 err = -rte_eth_dev_flow_ctrl_get(dev->port_id, &dev->fc_conf);
1876 if (err) {
1877 if (err == ENOTSUP) {
1878 if (flow_control_requested) {
1879 VLOG_WARN("%s: Flow control is not supported.",
1880 netdev_get_name(netdev));
1881 }
1882 err = 0; /* Not fatal. */
1883 } else {
1884 VLOG_WARN("%s: Cannot get flow control parameters: %s",
1885 netdev_get_name(netdev), rte_strerror(err));
1886 }
1887 goto out;
1888 }
1889
b614c894
IM
1890 if (dev->fc_conf.mode != fc_mode || autoneg != dev->fc_conf.autoneg) {
1891 dev->fc_conf.mode = fc_mode;
1892 dev->fc_conf.autoneg = autoneg;
1893 dpdk_eth_flow_ctrl_setup(dev);
1894 }
9fd39370 1895
55e075e6 1896out:
c3d062a7 1897 ovs_mutex_unlock(&dev->mutex);
55e075e6 1898 ovs_mutex_unlock(&dpdk_mutex);
c3d062a7 1899
55e075e6 1900 return err;
c3d062a7
CL
1901}
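/* A hypothetical configuration exercising the options parsed above (the
 * interface name and PCI address are made up):
 *
 *     ovs-vsctl set Interface dpdk-p0 \
 *         options:dpdk-devargs=0000:01:00.0 \
 *         options:n_rxq=2 options:n_rxq_desc=2048 \
 *         options:rx-flow-ctrl=true options:dpdk-lsc-interrupt=true
 */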
1902
1903static int
9fff138e
DDP
1904netdev_dpdk_ring_set_config(struct netdev *netdev, const struct smap *args,
1905 char **errp OVS_UNUSED)
c3d062a7
CL
1906{
1907 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1908
1909 ovs_mutex_lock(&dev->mutex);
1910 dpdk_set_rxq_config(dev, args);
a14b8947
IM
1911 ovs_mutex_unlock(&dev->mutex);
1912
1913 return 0;
1914}
1915
c1ff66ac 1916static int
2d24d165 1917netdev_dpdk_vhost_client_set_config(struct netdev *netdev,
9fff138e
DDP
1918 const struct smap *args,
1919 char **errp OVS_UNUSED)
c1ff66ac
CL
1920{
1921 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1922 const char *path;
080f080c 1923 int max_tx_retries, cur_max_tx_retries;
c1ff66ac 1924
6881885a 1925 ovs_mutex_lock(&dev->mutex);
c1ff66ac
CL
1926 if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
1927 path = smap_get(args, "vhost-server-path");
bb9d2623
IM
1928 if (!nullable_string_is_equal(path, dev->vhost_id)) {
1929 free(dev->vhost_id);
1930 dev->vhost_id = nullable_xstrdup(path);
10087cba
CL
1931 /* check zero copy configuration */
1932 if (smap_get_bool(args, "dq-zero-copy", false)) {
1933 dev->vhost_driver_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1934 } else {
1935 dev->vhost_driver_flags &= ~RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1936 }
c1ff66ac
CL
1937 netdev_request_reconfigure(netdev);
1938 }
1939 }
080f080c
KT
1940
1941 max_tx_retries = smap_get_int(args, "tx-retries-max",
1942 VHOST_ENQ_RETRY_DEF);
1943 if (max_tx_retries < VHOST_ENQ_RETRY_MIN
1944 || max_tx_retries > VHOST_ENQ_RETRY_MAX) {
1945 max_tx_retries = VHOST_ENQ_RETRY_DEF;
1946 }
1947 atomic_read_relaxed(&dev->vhost_tx_retries_max, &cur_max_tx_retries);
1948 if (max_tx_retries != cur_max_tx_retries) {
1949 atomic_store_relaxed(&dev->vhost_tx_retries_max, max_tx_retries);
1950 VLOG_INFO("Max Tx retries for vhost device '%s' set to %d",
1951 netdev_get_name(netdev), max_tx_retries);
1952 }
6881885a 1953 ovs_mutex_unlock(&dev->mutex);
c1ff66ac
CL
1954
1955 return 0;
1956}
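/* A hypothetical example for the vhost-user client options handled above
 * (socket path and interface name are made up):
 *
 *     ovs-vsctl set Interface vhostclient0 \
 *         options:vhost-server-path=/tmp/dpdkvhostclient0 \
 *         options:tx-retries-max=16
 *
 * A tx-retries-max value outside [VHOST_ENQ_RETRY_MIN, VHOST_ENQ_RETRY_MAX]
 * falls back to VHOST_ENQ_RETRY_DEF. */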
1957
7dec44fe 1958static int
d46285a2 1959netdev_dpdk_get_numa_id(const struct netdev *netdev)
7dec44fe 1960{
d46285a2 1961 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
7dec44fe 1962
d46285a2 1963 return dev->socket_id;
7dec44fe
AW
1964}
1965
050c60bf 1966/* Sets the number of tx queues for the dpdk interface. */
5496878c 1967static int
050c60bf 1968netdev_dpdk_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
5496878c 1969{
d46285a2 1970 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
5496878c 1971
d46285a2 1972 ovs_mutex_lock(&dev->mutex);
91968eb0 1973
050c60bf
DDP
1974 if (dev->requested_n_txq == n_txq) {
1975 goto out;
4573fbd3
FL
1976 }
1977
050c60bf
DDP
1978 dev->requested_n_txq = n_txq;
1979 netdev_request_reconfigure(netdev);
58397e6c 1980
050c60bf 1981out:
d46285a2 1982 ovs_mutex_unlock(&dev->mutex);
050c60bf 1983 return 0;
58397e6c
KT
1984}
1985
8a9562d2
PS
1986static struct netdev_rxq *
1987netdev_dpdk_rxq_alloc(void)
1988{
1989 struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);
1990
eff23640
DDP
1991 if (rx) {
1992 return &rx->up;
1993 }
1994
1995 return NULL;
8a9562d2
PS
1996}
1997
1998static struct netdev_rxq_dpdk *
d46285a2 1999netdev_rxq_dpdk_cast(const struct netdev_rxq *rxq)
8a9562d2 2000{
d46285a2 2001 return CONTAINER_OF(rxq, struct netdev_rxq_dpdk, up);
8a9562d2
PS
2002}
2003
2004static int
d46285a2 2005netdev_dpdk_rxq_construct(struct netdev_rxq *rxq)
8a9562d2 2006{
d46285a2
DDP
2007 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
2008 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
8a9562d2 2009
d46285a2
DDP
2010 ovs_mutex_lock(&dev->mutex);
2011 rx->port_id = dev->port_id;
2012 ovs_mutex_unlock(&dev->mutex);
8a9562d2
PS
2013
2014 return 0;
2015}
2016
2017static void
d46285a2 2018netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq OVS_UNUSED)
8a9562d2
PS
2019{
2020}
2021
2022static void
d46285a2 2023netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
8a9562d2 2024{
d46285a2 2025 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
8a9562d2
PS
2026
2027 rte_free(rx);
2028}
2029
819f13bd
DDP
2030/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes ownership of
2031 * 'pkts', even in case of failure.
2032 *
2033 * Returns the number of packets that weren't transmitted. */
2034static inline int
b59cc14e 2035netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
819f13bd 2036 struct rte_mbuf **pkts, int cnt)
8a9562d2 2037{
1304f1f8
DDP
2038 uint32_t nb_tx = 0;
2039
b59cc14e 2040 while (nb_tx != cnt) {
1304f1f8
DDP
2041 uint32_t ret;
2042
b59cc14e 2043 ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx, cnt - nb_tx);
1304f1f8
DDP
2044 if (!ret) {
2045 break;
2046 }
2047
2048 nb_tx += ret;
2049 }
8a9562d2 2050
b59cc14e 2051 if (OVS_UNLIKELY(nb_tx != cnt)) {
819f13bd 2052 /* Free buffers, which we couldn't transmit, one at a time (each
db73f716
DDP
2053 * packet could come from a different mempool) */
2054 int i;
2055
b59cc14e
IM
2056 for (i = nb_tx; i < cnt; i++) {
2057 rte_pktmbuf_free(pkts[i]);
db73f716 2058 }
8a9562d2 2059 }
819f13bd
DDP
2060
2061 return cnt - nb_tx;
8a9562d2
PS
2062}
2063
f3926f29
IS
2064static inline bool
2065netdev_dpdk_policer_pkt_handle(struct rte_meter_srtcm *meter,
03f3f9c0 2066 struct rte_meter_srtcm_profile *profile,
f3926f29
IS
2067 struct rte_mbuf *pkt, uint64_t time)
2068{
2069 uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct ether_hdr);
2070
03f3f9c0
OM
2071 return rte_meter_srtcm_color_blind_check(meter, profile, time, pkt_len) ==
2072 e_RTE_METER_GREEN;
f3926f29
IS
2073}
2074
2075static int
2076netdev_dpdk_policer_run(struct rte_meter_srtcm *meter,
03f3f9c0 2077 struct rte_meter_srtcm_profile *profile,
3e90f7d7 2078 struct rte_mbuf **pkts, int pkt_cnt,
7d7ded7a 2079 bool should_steal)
f3926f29
IS
2080{
2081 int i = 0;
2082 int cnt = 0;
2083 struct rte_mbuf *pkt = NULL;
2084 uint64_t current_time = rte_rdtsc();
2085
2086 for (i = 0; i < pkt_cnt; i++) {
2087 pkt = pkts[i];
2088 /* Handle current packet */
03f3f9c0
OM
2089 if (netdev_dpdk_policer_pkt_handle(meter, profile,
2090 pkt, current_time)) {
f3926f29
IS
2091 if (cnt != i) {
2092 pkts[cnt] = pkt;
2093 }
2094 cnt++;
2095 } else {
7d7ded7a 2096 if (should_steal) {
3e90f7d7
GZ
2097 rte_pktmbuf_free(pkt);
2098 }
f3926f29
IS
2099 }
2100 }
2101
2102 return cnt;
2103}
2104
9509913a
IS
2105static int
2106ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,
7d7ded7a 2107 int pkt_cnt, bool should_steal)
9509913a
IS
2108{
2109 int cnt = 0;
2110
2111 rte_spinlock_lock(&policer->policer_lock);
03f3f9c0
OM
2112 cnt = netdev_dpdk_policer_run(&policer->in_policer, &policer->in_prof,
2113 pkts, pkt_cnt, should_steal);
9509913a
IS
2114 rte_spinlock_unlock(&policer->policer_lock);
2115
2116 return cnt;
2117}
2118
58397e6c 2119static bool
0a0f39df 2120is_vhost_running(struct netdev_dpdk *dev)
58397e6c 2121{
0a0f39df 2122 return (netdev_dpdk_get_vid(dev) >= 0 && dev->vhost_reconfigured);
58397e6c
KT
2123}
2124
d6e3feb5 2125static inline void
2126netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
2127 unsigned int packet_size)
2128{
2129 /* Hard-coded search for the size bucket. */
2130 if (packet_size < 256) {
2131 if (packet_size >= 128) {
2132 stats->rx_128_to_255_packets++;
2133 } else if (packet_size <= 64) {
2134 stats->rx_1_to_64_packets++;
2135 } else {
2136 stats->rx_65_to_127_packets++;
2137 }
2138 } else {
2139 if (packet_size >= 1523) {
2140 stats->rx_1523_to_max_packets++;
2141 } else if (packet_size >= 1024) {
2142 stats->rx_1024_to_1522_packets++;
2143 } else if (packet_size < 512) {
2144 stats->rx_256_to_511_packets++;
2145 } else {
2146 stats->rx_512_to_1023_packets++;
2147 }
2148 }
2149}
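/* For example, with the bucketing above a 64-byte frame increments
 * rx_1_to_64_packets, a 100-byte frame rx_65_to_127_packets, and anything of
 * 1523 bytes or more rx_1523_to_max_packets. */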
2150
9e3ddd45
TP
2151static inline void
2152netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
9509913a
IS
2153 struct dp_packet **packets, int count,
2154 int dropped)
9e3ddd45
TP
2155{
2156 int i;
d6e3feb5 2157 unsigned int packet_size;
9e3ddd45
TP
2158 struct dp_packet *packet;
2159
2160 stats->rx_packets += count;
9509913a 2161 stats->rx_dropped += dropped;
9e3ddd45
TP
2162 for (i = 0; i < count; i++) {
2163 packet = packets[i];
d6e3feb5 2164 packet_size = dp_packet_size(packet);
9e3ddd45 2165
d6e3feb5 2166 if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) {
9e3ddd45
TP
2167 /* This only protects the following multicast counting from
2168 * too short packets, but it does not stop the packet from
2169 * further processing. */
2170 stats->rx_errors++;
2171 stats->rx_length_errors++;
2172 continue;
2173 }
2174
d6e3feb5 2175 netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);
2176
9e3ddd45
TP
2177 struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);
2178 if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {
2179 stats->multicast++;
2180 }
2181
d6e3feb5 2182 stats->rx_bytes += packet_size;
9e3ddd45
TP
2183 }
2184}
2185
58397e6c
KT
2186/*
2187 * The receive path for the vhost port is the TX path out of the guest.
2188 */
2189static int
d46285a2 2190netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
8492adc2 2191 struct dp_packet_batch *batch, int *qfill)
58397e6c 2192{
d46285a2 2193 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
9509913a 2194 struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
58397e6c 2195 uint16_t nb_rx = 0;
9509913a 2196 uint16_t dropped = 0;
8492adc2 2197 int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
daf22bf7 2198 int vid = netdev_dpdk_get_vid(dev);
58397e6c 2199
daf22bf7 2200 if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
e543851d 2201 || !(dev->flags & NETDEV_UP))) {
58397e6c
KT
2202 return EAGAIN;
2203 }
2204
43307ad0 2205 nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp,
64839cf4 2206 (struct rte_mbuf **) batch->packets,
cd159f1a 2207 NETDEV_MAX_BURST);
58397e6c
KT
2208 if (!nb_rx) {
2209 return EAGAIN;
2210 }
2211
8492adc2
JS
2212 if (qfill) {
2213 if (nb_rx == NETDEV_MAX_BURST) {
2214 /* The DPDK API returns a uint32_t which often has invalid bits in
2215 * the upper 16 bits. Need to restrict the value to uint16_t. */
2216 *qfill = rte_vhost_rx_queue_count(vid, qid) & UINT16_MAX;
2217 } else {
2218 *qfill = 0;
2219 }
2220 }
2221
9509913a
IS
2222 if (policer) {
2223 dropped = nb_rx;
64839cf4
WT
2224 nb_rx = ingress_policer_run(policer,
2225 (struct rte_mbuf **) batch->packets,
3e90f7d7 2226 nb_rx, true);
9509913a
IS
2227 dropped -= nb_rx;
2228 }
2229
d46285a2 2230 rte_spinlock_lock(&dev->stats_lock);
64839cf4
WT
2231 netdev_dpdk_vhost_update_rx_counters(&dev->stats, batch->packets,
2232 nb_rx, dropped);
d46285a2 2233 rte_spinlock_unlock(&dev->stats_lock);
45d947c4 2234
75fb9148
ZB
2235 batch->count = nb_rx;
2236 dp_packet_batch_init_packet_fields(batch);
2237
58397e6c
KT
2238 return 0;
2239}
2240
35c91567
DM
2241static bool
2242netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq)
2243{
2244 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
2245
2246 return dev->vhost_rxq_enabled[rxq->queue_id];
2247}
2248
8a9562d2 2249static int
8492adc2
JS
2250netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
2251 int *qfill)
8a9562d2 2252{
d46285a2
DDP
2253 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
2254 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
9509913a 2255 struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
8a9562d2 2256 int nb_rx;
9509913a 2257 int dropped = 0;
8a9562d2 2258
3b1fb077
DDP
2259 if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
2260 return EAGAIN;
2261 }
2262
d46285a2 2263 nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
64839cf4 2264 (struct rte_mbuf **) batch->packets,
cd159f1a 2265 NETDEV_MAX_BURST);
8a9562d2
PS
2266 if (!nb_rx) {
2267 return EAGAIN;
2268 }
2269
9509913a
IS
2270 if (policer) {
2271 dropped = nb_rx;
64839cf4 2272 nb_rx = ingress_policer_run(policer,
58be5c0e 2273 (struct rte_mbuf **) batch->packets,
3e90f7d7 2274 nb_rx, true);
9509913a
IS
2275 dropped -= nb_rx;
2276 }
2277
2278 /* Update stats to reflect dropped packets */
2279 if (OVS_UNLIKELY(dropped)) {
2280 rte_spinlock_lock(&dev->stats_lock);
2281 dev->stats.rx_dropped += dropped;
2282 rte_spinlock_unlock(&dev->stats_lock);
2283 }
2284
64839cf4 2285 batch->count = nb_rx;
75fb9148 2286 dp_packet_batch_init_packet_fields(batch);
8a9562d2 2287
8492adc2
JS
2288 if (qfill) {
2289 if (nb_rx == NETDEV_MAX_BURST) {
2290 *qfill = rte_eth_rx_queue_count(rx->port_id, rxq->queue_id);
2291 } else {
2292 *qfill = 0;
2293 }
2294 }
2295
8a9562d2
PS
2296 return 0;
2297}
2298
0bf765f7 2299static inline int
78bd47cf 2300netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
7d7ded7a 2301 int cnt, bool should_steal)
0bf765f7 2302{
78bd47cf 2303 struct qos_conf *qos_conf = ovsrcu_get(struct qos_conf *, &dev->qos_conf);
0bf765f7 2304
78bd47cf
DDP
2305 if (qos_conf) {
2306 rte_spinlock_lock(&qos_conf->lock);
7d7ded7a 2307 cnt = qos_conf->ops->qos_run(qos_conf, pkts, cnt, should_steal);
78bd47cf 2308 rte_spinlock_unlock(&qos_conf->lock);
0bf765f7
IS
2309 }
2310
2311 return cnt;
2312}
2313
c6ec9d17
IM
2314static int
2315netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
2316 int pkt_cnt)
2317{
2318 int i = 0;
2319 int cnt = 0;
2320 struct rte_mbuf *pkt;
2321
2322 for (i = 0; i < pkt_cnt; i++) {
2323 pkt = pkts[i];
2324 if (OVS_UNLIKELY(pkt->pkt_len > dev->max_packet_len)) {
2325 VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " max_packet_len %d",
2326 dev->up.name, pkt->pkt_len, dev->max_packet_len);
2327 rte_pktmbuf_free(pkt);
2328 continue;
2329 }
2330
2331 if (OVS_UNLIKELY(i != cnt)) {
2332 pkts[cnt] = pkt;
2333 }
2334 cnt++;
2335 }
2336
2337 return cnt;
2338}
2339
9e3ddd45
TP
2340static inline void
2341netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
2342 struct dp_packet **packets,
2343 int attempted,
2344 int dropped)
2345{
2346 int i;
2347 int sent = attempted - dropped;
2348
2349 stats->tx_packets += sent;
2350 stats->tx_dropped += dropped;
2351
2352 for (i = 0; i < sent; i++) {
2353 stats->tx_bytes += dp_packet_size(packets[i]);
2354 }
2355}
2356
58397e6c 2357static void
4573fbd3 2358__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
dd52de45 2359 struct dp_packet **pkts, int cnt)
58397e6c 2360{
d46285a2 2361 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
95e9881f
KT
2362 struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
2363 unsigned int total_pkts = cnt;
c6ec9d17 2364 unsigned int dropped = 0;
dd52de45 2365 int i, retries = 0;
080f080c 2366 int max_retries = VHOST_ENQ_RETRY_MIN;
daf22bf7 2367 int vid = netdev_dpdk_get_vid(dev);
58397e6c 2368
81acebda 2369 qid = dev->tx_q[qid % netdev->n_txq].map;
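    /* tx_q[].map (maintained by the vhost queue remapping logic elsewhere in
     * this file) redirects to a usable virtqueue, or is negative when none
     * is available, hence the qid < 0 check below. */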
585a5bea 2370
daf22bf7 2371 if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0
e543851d 2372 || !(dev->flags & NETDEV_UP))) {
d46285a2
DDP
2373 rte_spinlock_lock(&dev->stats_lock);
2374 dev->stats.tx_dropped += cnt;
2375 rte_spinlock_unlock(&dev->stats_lock);
1b99bb05 2376 goto out;
58397e6c
KT
2377 }
2378
d46285a2 2379 rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
58397e6c 2380
c6ec9d17 2381 cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
0bf765f7 2382 /* Check if QoS has been configured for the netdev. */
3e90f7d7 2383 cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt, true);
c6ec9d17 2384 dropped = total_pkts - cnt;
0bf765f7 2385
95e9881f 2386 do {
4573fbd3 2387 int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
95e9881f
KT
2388 unsigned int tx_pkts;
2389
daf22bf7 2390 tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt);
95e9881f
KT
2391 if (OVS_LIKELY(tx_pkts)) {
2392 /* Packets have been sent. */
2393 cnt -= tx_pkts;
31871ee3 2394 /* Prepare for possible retry.*/
95e9881f 2395 cur_pkts = &cur_pkts[tx_pkts];
080f080c
KT
2396 if (OVS_UNLIKELY(cnt && !retries)) {
2397 /*
2398 * Read max retries as there are packets not sent
2399 * and no retries have already occurred.
2400 */
2401 atomic_read_relaxed(&dev->vhost_tx_retries_max, &max_retries);
2402 }
95e9881f 2403 } else {
31871ee3
KT
2404 /* No packets sent - do not retry.*/
2405 break;
95e9881f 2406 }
080f080c 2407 } while (cnt && (retries++ < max_retries));
4573fbd3 2408
d46285a2 2409 rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
95e9881f 2410
d46285a2 2411 rte_spinlock_lock(&dev->stats_lock);
0072e931 2412 netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts,
c6ec9d17 2413 cnt + dropped);
080f080c 2414 dev->tx_retries += MIN(retries, max_retries);
d46285a2 2415 rte_spinlock_unlock(&dev->stats_lock);
58397e6c
KT
2416
2417out:
c6ec9d17 2418 for (i = 0; i < total_pkts - dropped; i++) {
dd52de45 2419 dp_packet_delete(pkts[i]);
58397e6c
KT
2420 }
2421}
2422
8a9562d2
PS
2423/* Tx function. Transmit packets indefinitely */
2424static void
64839cf4 2425dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
db73f716 2426 OVS_NO_THREAD_SAFETY_ANALYSIS
8a9562d2 2427{
8a14bd7b 2428 const size_t batch_cnt = dp_packet_batch_size(batch);
bce01e3a 2429#if !defined(__CHECKER__) && !defined(_WIN32)
8a14bd7b 2430 const size_t PKT_ARRAY_SIZE = batch_cnt;
bce01e3a
EJ
2431#else
2432 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 2433 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
bce01e3a 2434#endif
8a9562d2 2435 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2391135c 2436 struct rte_mbuf *pkts[PKT_ARRAY_SIZE];
8a14bd7b 2437 uint32_t cnt = batch_cnt;
3e90f7d7
GZ
2438 uint32_t dropped = 0;
2439
2440 if (dev->type != DPDK_DEV_VHOST) {
2441 /* Check if QoS has been configured for this netdev. */
2442 cnt = netdev_dpdk_qos_run(dev, (struct rte_mbuf **) batch->packets,
8a14bd7b
BB
2443 batch_cnt, false);
2444 dropped += batch_cnt - cnt;
3e90f7d7 2445 }
8a9562d2 2446
3e90f7d7
GZ
2447 uint32_t txcnt = 0;
2448
2449 for (uint32_t i = 0; i < cnt; i++) {
8a14bd7b
BB
2450 struct dp_packet *packet = batch->packets[i];
2451 uint32_t size = dp_packet_size(packet);
95fb793a 2452
f98d7864 2453 if (OVS_UNLIKELY(size > dev->max_packet_len)) {
3e90f7d7
GZ
2454 VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d",
2455 size, dev->max_packet_len);
f4fd623c 2456
175cf4de 2457 dropped++;
f4fd623c
DDP
2458 continue;
2459 }
8a9562d2 2460
43307ad0 2461 pkts[txcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
8a14bd7b 2462 if (OVS_UNLIKELY(!pkts[txcnt])) {
3e90f7d7 2463 dropped += cnt - i;
175cf4de 2464 break;
f4fd623c
DDP
2465 }
2466
2467 /* We have to do a copy for now */
3e90f7d7 2468 memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *),
8a14bd7b
BB
2469 dp_packet_data(packet), size);
2470 dp_packet_set_size((struct dp_packet *)pkts[txcnt], size);
f4fd623c 2471
3e90f7d7 2472 txcnt++;
f4fd623c 2473 }
8a9562d2 2474
3e90f7d7
GZ
2475 if (OVS_LIKELY(txcnt)) {
2476 if (dev->type == DPDK_DEV_VHOST) {
2477 __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) pkts,
2478 txcnt);
2479 } else {
2480 dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, txcnt);
2481 }
58397e6c 2482 }
db73f716 2483
0bf765f7
IS
2484 if (OVS_UNLIKELY(dropped)) {
2485 rte_spinlock_lock(&dev->stats_lock);
2486 dev->stats.tx_dropped += dropped;
2487 rte_spinlock_unlock(&dev->stats_lock);
2488 }
8a9562d2
PS
2489}
2490
58397e6c 2491static int
64839cf4
WT
2492netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
2493 struct dp_packet_batch *batch,
b30896c9 2494 bool concurrent_txq OVS_UNUSED)
58397e6c 2495{
58397e6c 2496
b30896c9 2497 if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
64839cf4 2498 dpdk_do_tx_copy(netdev, qid, batch);
b30896c9 2499 dp_packet_delete_batch(batch, true);
58397e6c 2500 } else {
940ac2ce
PC
2501 __netdev_dpdk_vhost_send(netdev, qid, batch->packets,
2502 dp_packet_batch_size(batch));
58397e6c
KT
2503 }
2504 return 0;
2505}
2506
7251515e
DV
2507static inline void
2508netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
b30896c9 2509 struct dp_packet_batch *batch,
324c8374 2510 bool concurrent_txq)
8a9562d2 2511{
3b1fb077 2512 if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
b30896c9 2513 dp_packet_delete_batch(batch, true);
3b1fb077
DDP
2514 return;
2515 }
2516
324c8374 2517 if (OVS_UNLIKELY(concurrent_txq)) {
81acebda 2518 qid = qid % dev->up.n_txq;
a0cb2d66
DDP
2519 rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
2520 }
2521
b30896c9 2522 if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
7251515e
DV
2523 struct netdev *netdev = &dev->up;
2524
64839cf4 2525 dpdk_do_tx_copy(netdev, qid, batch);
b30896c9 2526 dp_packet_delete_batch(batch, true);
8a9562d2 2527 } else {
fd57eeba
BB
2528 int tx_cnt, dropped;
2529 int batch_cnt = dp_packet_batch_size(batch);
2391135c 2530 struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;
8a9562d2 2531
fd57eeba
BB
2532 tx_cnt = netdev_dpdk_filter_packet_len(dev, pkts, batch_cnt);
2533 tx_cnt = netdev_dpdk_qos_run(dev, pkts, tx_cnt, true);
2534 dropped = batch_cnt - tx_cnt;
1b99bb05 2535
fd57eeba 2536 dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, tx_cnt);
8a9562d2 2537
f4fd623c 2538 if (OVS_UNLIKELY(dropped)) {
45d947c4 2539 rte_spinlock_lock(&dev->stats_lock);
f4fd623c 2540 dev->stats.tx_dropped += dropped;
45d947c4 2541 rte_spinlock_unlock(&dev->stats_lock);
f4fd623c 2542 }
8a9562d2 2543 }
a0cb2d66 2544
324c8374 2545 if (OVS_UNLIKELY(concurrent_txq)) {
a0cb2d66
DDP
2546 rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
2547 }
7251515e
DV
2548}
2549
2550static int
2551netdev_dpdk_eth_send(struct netdev *netdev, int qid,
b30896c9 2552 struct dp_packet_batch *batch, bool concurrent_txq)
7251515e
DV
2553{
2554 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 2555
b30896c9 2556 netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
7251515e 2557 return 0;
8a9562d2
PS
2558}
2559
2560static int
74ff3298 2561netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
8a9562d2
PS
2562{
2563 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2564
2565 ovs_mutex_lock(&dev->mutex);
2566 if (!eth_addr_equals(dev->hwaddr, mac)) {
74ff3298 2567 dev->hwaddr = mac;
045c0d1a 2568 netdev_change_seq_changed(netdev);
8a9562d2
PS
2569 }
2570 ovs_mutex_unlock(&dev->mutex);
2571
2572 return 0;
2573}
2574
2575static int
74ff3298 2576netdev_dpdk_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
8a9562d2
PS
2577{
2578 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2579
2580 ovs_mutex_lock(&dev->mutex);
74ff3298 2581 *mac = dev->hwaddr;
8a9562d2
PS
2582 ovs_mutex_unlock(&dev->mutex);
2583
2584 return 0;
2585}
2586
2587static int
2588netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
2589{
2590 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2591
2592 ovs_mutex_lock(&dev->mutex);
2593 *mtup = dev->mtu;
2594 ovs_mutex_unlock(&dev->mutex);
2595
2596 return 0;
2597}
2598
0072e931
MK
2599static int
2600netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
2601{
2602 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2603
f6f50552
IS
2604 /* XXX: Ensure that the overall frame length of the requested MTU does not
2605 * surpass the NETDEV_DPDK_MAX_PKT_LEN. DPDK device drivers differ in how
2606 * the L2 frame length is calculated for a given MTU when
2607 * rte_eth_dev_set_mtu(mtu) is called e.g. i40e driver includes 2 x vlan
2608 * headers, the em driver includes 1 x vlan header, the ixgbe driver does
2609 * not include vlan headers. As such we should use
2610 * MTU_TO_MAX_FRAME_LEN(mtu) which includes an additional 2 x vlan headers
2611 * (8 bytes) for comparison. This avoids a failure later with
2612 * rte_eth_dev_set_mtu(). This approach should be used until DPDK provides
2613 * a method to retrieve the upper bound MTU for a given device.
2614 */
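    /* Worked example (illustrative, assuming the usual 14-byte Ethernet
     * header, 4-byte CRC and two 4-byte VLAN headers): MTU_TO_MAX_FRAME_LEN(mtu)
     * is then mtu + 26, so with NETDEV_DPDK_MAX_PKT_LEN of 9728 the largest
     * MTU accepted by the check below is 9702. */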
2615 if (MTU_TO_MAX_FRAME_LEN(mtu) > NETDEV_DPDK_MAX_PKT_LEN
0072e931
MK
2616 || mtu < ETHER_MIN_MTU) {
2617 VLOG_WARN("%s: unsupported MTU %d\n", dev->up.name, mtu);
2618 return EINVAL;
2619 }
2620
2621 ovs_mutex_lock(&dev->mutex);
2622 if (dev->requested_mtu != mtu) {
2623 dev->requested_mtu = mtu;
2624 netdev_request_reconfigure(netdev);
2625 }
2626 ovs_mutex_unlock(&dev->mutex);
2627
2628 return 0;
2629}
2630
8a9562d2 2631static int
d46285a2 2632netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
8a9562d2 2633
58397e6c
KT
2634static int
2635netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
2636 struct netdev_stats *stats)
2637{
2638 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2639
2640 ovs_mutex_lock(&dev->mutex);
58397e6c 2641
45d947c4 2642 rte_spinlock_lock(&dev->stats_lock);
58397e6c 2643 /* Supported Stats */
50986e78 2644 stats->rx_packets = dev->stats.rx_packets;
2645 stats->tx_packets = dev->stats.tx_packets;
9509913a 2646 stats->rx_dropped = dev->stats.rx_dropped;
50986e78 2647 stats->tx_dropped = dev->stats.tx_dropped;
9e3ddd45
TP
2648 stats->multicast = dev->stats.multicast;
2649 stats->rx_bytes = dev->stats.rx_bytes;
2650 stats->tx_bytes = dev->stats.tx_bytes;
2651 stats->rx_errors = dev->stats.rx_errors;
2652 stats->rx_length_errors = dev->stats.rx_length_errors;
d6e3feb5 2653
2654 stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
2655 stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;
2656 stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;
2657 stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;
2658 stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;
2659 stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;
2660 stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;
2661
45d947c4 2662 rte_spinlock_unlock(&dev->stats_lock);
9e3ddd45 2663
58397e6c
KT
2664 ovs_mutex_unlock(&dev->mutex);
2665
2666 return 0;
2667}
2668
d6e3feb5 2669static void
2670netdev_dpdk_convert_xstats(struct netdev_stats *stats,
0a0f39df
CL
2671 const struct rte_eth_xstat *xstats,
2672 const struct rte_eth_xstat_name *names,
d6e3feb5 2673 const unsigned int size)
2674{
18366d16
IM
2675/* DPDK XSTATS Counter names definition. */
2676#define DPDK_XSTATS \
2677 DPDK_XSTAT(multicast, "rx_multicast_packets" ) \
2678 DPDK_XSTAT(tx_multicast_packets, "tx_multicast_packets" ) \
2679 DPDK_XSTAT(rx_broadcast_packets, "rx_broadcast_packets" ) \
2680 DPDK_XSTAT(tx_broadcast_packets, "tx_broadcast_packets" ) \
2681 DPDK_XSTAT(rx_undersized_errors, "rx_undersized_errors" ) \
2682 DPDK_XSTAT(rx_oversize_errors, "rx_oversize_errors" ) \
2683 DPDK_XSTAT(rx_fragmented_errors, "rx_fragmented_errors" ) \
2684 DPDK_XSTAT(rx_jabber_errors, "rx_jabber_errors" ) \
2685 DPDK_XSTAT(rx_1_to_64_packets, "rx_size_64_packets" ) \
2686 DPDK_XSTAT(rx_65_to_127_packets, "rx_size_65_to_127_packets" ) \
2687 DPDK_XSTAT(rx_128_to_255_packets, "rx_size_128_to_255_packets" ) \
2688 DPDK_XSTAT(rx_256_to_511_packets, "rx_size_256_to_511_packets" ) \
2689 DPDK_XSTAT(rx_512_to_1023_packets, "rx_size_512_to_1023_packets" ) \
2690 DPDK_XSTAT(rx_1024_to_1522_packets, "rx_size_1024_to_1522_packets" ) \
2691 DPDK_XSTAT(rx_1523_to_max_packets, "rx_size_1523_to_max_packets" ) \
2692 DPDK_XSTAT(tx_1_to_64_packets, "tx_size_64_packets" ) \
2693 DPDK_XSTAT(tx_65_to_127_packets, "tx_size_65_to_127_packets" ) \
2694 DPDK_XSTAT(tx_128_to_255_packets, "tx_size_128_to_255_packets" ) \
2695 DPDK_XSTAT(tx_256_to_511_packets, "tx_size_256_to_511_packets" ) \
2696 DPDK_XSTAT(tx_512_to_1023_packets, "tx_size_512_to_1023_packets" ) \
2697 DPDK_XSTAT(tx_1024_to_1522_packets, "tx_size_1024_to_1522_packets" ) \
2698 DPDK_XSTAT(tx_1523_to_max_packets, "tx_size_1523_to_max_packets" )
2699
d6e3feb5 2700 for (unsigned int i = 0; i < size; i++) {
18366d16
IM
2701#define DPDK_XSTAT(MEMBER, NAME) \
2702 if (strcmp(NAME, names[i].name) == 0) { \
2703 stats->MEMBER = xstats[i].value; \
2704 continue; \
d6e3feb5 2705 }
18366d16
IM
2706 DPDK_XSTATS;
2707#undef DPDK_XSTAT
d6e3feb5 2708 }
18366d16 2709#undef DPDK_XSTATS
d6e3feb5 2710}
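/* For illustration, the first DPDK_XSTAT() entry above makes each loop
 * iteration effectively contain
 *
 *     if (strcmp("rx_multicast_packets", names[i].name) == 0) {
 *         stats->multicast = xstats[i].value;
 *         continue;
 *     }
 *
 * and similarly for every other entry in the DPDK_XSTATS table. */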
2711
8a9562d2
PS
2712static int
2713netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
2714{
2715 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2716 struct rte_eth_stats rte_stats;
2717 bool gg;
2718
2719 netdev_dpdk_get_carrier(netdev, &gg);
2720 ovs_mutex_lock(&dev->mutex);
8a9562d2 2721
0a0f39df
CL
2722 struct rte_eth_xstat *rte_xstats = NULL;
2723 struct rte_eth_xstat_name *rte_xstats_names = NULL;
2724 int rte_xstats_len, rte_xstats_new_len, rte_xstats_ret;
d6e3feb5 2725
2726 if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
fa9f4eeb
IM
2727 VLOG_ERR("Can't get ETH statistics for port: "DPDK_PORT_ID_FMT,
2728 dev->port_id);
f9256822 2729 ovs_mutex_unlock(&dev->mutex);
d6e3feb5 2730 return EPROTO;
2731 }
2732
0a0f39df
CL
2733 /* Get length of statistics */
2734 rte_xstats_len = rte_eth_xstats_get_names(dev->port_id, NULL, 0);
2735 if (rte_xstats_len < 0) {
fa9f4eeb
IM
2736 VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
2737 dev->port_id);
0a0f39df
CL
2738 goto out;
2739 }
2740 /* Reserve memory for xstats names and values */
2741 rte_xstats_names = xcalloc(rte_xstats_len, sizeof *rte_xstats_names);
2742 rte_xstats = xcalloc(rte_xstats_len, sizeof *rte_xstats);
2743
2744 /* Retrieve xstats names */
2745 rte_xstats_new_len = rte_eth_xstats_get_names(dev->port_id,
2746 rte_xstats_names,
2747 rte_xstats_len);
2748 if (rte_xstats_new_len != rte_xstats_len) {
fa9f4eeb
IM
2749 VLOG_WARN("Cannot get XSTATS names for port: "DPDK_PORT_ID_FMT,
2750 dev->port_id);
0a0f39df
CL
2751 goto out;
2752 }
2753 /* Retrieve xstats values */
2754 memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);
2755 rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
2756 rte_xstats_len);
2757 if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
2758 netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_names,
2759 rte_xstats_len);
d6e3feb5 2760 } else {
fa9f4eeb
IM
2761 VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
2762 dev->port_id);
d6e3feb5 2763 }
8a9562d2 2764
0a0f39df
CL
2765out:
2766 free(rte_xstats);
2767 free(rte_xstats_names);
2768
2f9dd77f
PS
2769 stats->rx_packets = rte_stats.ipackets;
2770 stats->tx_packets = rte_stats.opackets;
2771 stats->rx_bytes = rte_stats.ibytes;
2772 stats->tx_bytes = rte_stats.obytes;
21e9844c 2773 stats->rx_errors = rte_stats.ierrors;
2f9dd77f 2774 stats->tx_errors = rte_stats.oerrors;
8a9562d2 2775
45d947c4 2776 rte_spinlock_lock(&dev->stats_lock);
2f9dd77f 2777 stats->tx_dropped = dev->stats.tx_dropped;
9509913a 2778 stats->rx_dropped = dev->stats.rx_dropped;
45d947c4 2779 rte_spinlock_unlock(&dev->stats_lock);
9e3ddd45
TP
2780
2781 /* These are the available DPDK counters for packets not received due to
2782 * local resource constraints in DPDK and NIC respectively. */
9509913a 2783 stats->rx_dropped += rte_stats.rx_nombuf + rte_stats.imissed;
9e3ddd45
TP
2784 stats->rx_missed_errors = rte_stats.imissed;
2785
8a9562d2
PS
2786 ovs_mutex_unlock(&dev->mutex);
2787
2788 return 0;
2789}
2790
971f4b39
MW
2791static int
2792netdev_dpdk_get_custom_stats(const struct netdev *netdev,
2793 struct netdev_custom_stats *custom_stats)
2794{
2795
2796 uint32_t i;
2797 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2798 int rte_xstats_ret;
2799
2800 ovs_mutex_lock(&dev->mutex);
2801
2802 if (netdev_dpdk_configure_xstats(dev)) {
2803 uint64_t *values = xcalloc(dev->rte_xstats_ids_size,
2804 sizeof(uint64_t));
2805
2806 rte_xstats_ret =
2807 rte_eth_xstats_get_by_id(dev->port_id, dev->rte_xstats_ids,
2808 values, dev->rte_xstats_ids_size);
2809
2810 if (rte_xstats_ret > 0 &&
2811 rte_xstats_ret <= dev->rte_xstats_ids_size) {
2812
2813 custom_stats->size = rte_xstats_ret;
2814 custom_stats->counters =
2815 (struct netdev_custom_counter *) xcalloc(rte_xstats_ret,
2816 sizeof(struct netdev_custom_counter));
2817
2818 for (i = 0; i < rte_xstats_ret; i++) {
2819 ovs_strlcpy(custom_stats->counters[i].name,
2820 netdev_dpdk_get_xstat_name(dev,
2821 dev->rte_xstats_ids[i]),
2822 NETDEV_CUSTOM_STATS_NAME_SIZE);
2823 custom_stats->counters[i].value = values[i];
2824 }
2825 } else {
fa9f4eeb 2826 VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
971f4b39
MW
2827 dev->port_id);
2828 custom_stats->counters = NULL;
2829 custom_stats->size = 0;
2830 /* Clear the statistics cache so that it will be
2831 * reconfigured. */
2832 netdev_dpdk_clear_xstats(dev);
2833 }
526259f2
IM
2834
2835 free(values);
971f4b39
MW
2836 }
2837
2838 ovs_mutex_unlock(&dev->mutex);
2839
2840 return 0;
2841}
2842
c161357d
KT
2843static int
2844netdev_dpdk_vhost_get_custom_stats(const struct netdev *netdev,
2845 struct netdev_custom_stats *custom_stats)
2846{
2847 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
5c7ba90d 2848 int i;
c161357d 2849
5c7ba90d
IM
2850#define VHOST_CSTATS \
2851 VHOST_CSTAT(tx_retries)
c161357d 2852
5c7ba90d
IM
2853#define VHOST_CSTAT(NAME) + 1
2854 custom_stats->size = VHOST_CSTATS;
2855#undef VHOST_CSTAT
c161357d
KT
2856 custom_stats->counters = xcalloc(custom_stats->size,
2857 sizeof *custom_stats->counters);
5c7ba90d
IM
2858 i = 0;
2859#define VHOST_CSTAT(NAME) \
2860 ovs_strlcpy(custom_stats->counters[i++].name, #NAME, \
c161357d 2861 NETDEV_CUSTOM_STATS_NAME_SIZE);
5c7ba90d
IM
2862 VHOST_CSTATS;
2863#undef VHOST_CSTAT
2864
2865 ovs_mutex_lock(&dev->mutex);
c161357d
KT
2866
2867 rte_spinlock_lock(&dev->stats_lock);
5c7ba90d
IM
2868 i = 0;
2869#define VHOST_CSTAT(NAME) \
2870 custom_stats->counters[i++].value = dev->NAME;
2871 VHOST_CSTATS;
2872#undef VHOST_CSTAT
c161357d
KT
2873 rte_spinlock_unlock(&dev->stats_lock);
2874
2875 ovs_mutex_unlock(&dev->mutex);
2876
2877 return 0;
2878}
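/* For illustration, with the single VHOST_CSTAT(tx_retries) entry the three
 * X-macro blocks above reduce to
 *
 *     custom_stats->size = + 1;
 *     ovs_strlcpy(custom_stats->counters[0].name, "tx_retries",
 *                 NETDEV_CUSTOM_STATS_NAME_SIZE);
 *     custom_stats->counters[0].value = dev->tx_retries;
 *
 * so adding a new per-vhost counter only requires one more VHOST_CSTAT()
 * line in the VHOST_CSTATS list. */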
2879
8a9562d2 2880static int
d46285a2 2881netdev_dpdk_get_features(const struct netdev *netdev,
8a9562d2 2882 enum netdev_features *current,
ca3d4f55
BX
2883 enum netdev_features *advertised,
2884 enum netdev_features *supported,
2885 enum netdev_features *peer)
8a9562d2 2886{
d46285a2 2887 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 2888 struct rte_eth_link link;
dfcb5b8a 2889 uint32_t feature = 0;
8a9562d2
PS
2890
2891 ovs_mutex_lock(&dev->mutex);
2892 link = dev->link;
2893 ovs_mutex_unlock(&dev->mutex);
2894
dfcb5b8a
IS
2895 /* Match against OpenFlow defined link speed values. */
2896 if (link.link_duplex == ETH_LINK_FULL_DUPLEX) {
2897 switch (link.link_speed) {
2898 case ETH_SPEED_NUM_10M:
2899 feature |= NETDEV_F_10MB_FD;
2900 break;
2901 case ETH_SPEED_NUM_100M:
2902 feature |= NETDEV_F_100MB_FD;
2903 break;
2904 case ETH_SPEED_NUM_1G:
2905 feature |= NETDEV_F_1GB_FD;
2906 break;
2907 case ETH_SPEED_NUM_10G:
2908 feature |= NETDEV_F_10GB_FD;
2909 break;
2910 case ETH_SPEED_NUM_40G:
2911 feature |= NETDEV_F_40GB_FD;
2912 break;
2913 case ETH_SPEED_NUM_100G:
2914 feature |= NETDEV_F_100GB_FD;
2915 break;
2916 default:
2917 feature |= NETDEV_F_OTHER;
8a9562d2 2918 }
dfcb5b8a
IS
2919 } else if (link.link_duplex == ETH_LINK_HALF_DUPLEX) {
2920 switch (link.link_speed) {
2921 case ETH_SPEED_NUM_10M:
2922 feature |= NETDEV_F_10MB_HD;
2923 break;
2924 case ETH_SPEED_NUM_100M:
2925 feature |= NETDEV_F_100MB_HD;
2926 break;
2927 case ETH_SPEED_NUM_1G:
2928 feature |= NETDEV_F_1GB_HD;
2929 break;
2930 default:
2931 feature |= NETDEV_F_OTHER;
74cd69a4 2932 }
8a9562d2
PS
2933 }
2934
362ca396 2935 if (link.link_autoneg) {
dfcb5b8a 2936 feature |= NETDEV_F_AUTONEG;
362ca396 2937 }
2938
dfcb5b8a 2939 *current = feature;
ca3d4f55
BX
2940 *advertised = *supported = *peer = 0;
2941
8a9562d2
PS
2942 return 0;
2943}
2944
9509913a
IS
2945static struct ingress_policer *
2946netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
2947{
2948 struct ingress_policer *policer = NULL;
2949 uint64_t rate_bytes;
2950 uint64_t burst_bytes;
2951 int err = 0;
2952
2953 policer = xmalloc(sizeof *policer);
2954 rte_spinlock_init(&policer->policer_lock);
2955
2956 /* rte_meter requires bytes so convert kbits rate and burst to bytes. */
602c8668
LR
2957 rate_bytes = rate * 1000ULL / 8;
2958 burst_bytes = burst * 1000ULL / 8;
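    /* For example, a 10,000 kbit/s rate becomes a CIR of 1,250,000 bytes/s
     * and an 8,000 kbit burst becomes a CBS of 1,000,000 bytes. */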
9509913a
IS
2959
2960 policer->app_srtcm_params.cir = rate_bytes;
2961 policer->app_srtcm_params.cbs = burst_bytes;
2962 policer->app_srtcm_params.ebs = 0;
03f3f9c0
OM
2963 err = rte_meter_srtcm_profile_config(&policer->in_prof,
2964 &policer->app_srtcm_params);
2965 if (!err) {
2966 err = rte_meter_srtcm_config(&policer->in_policer,
2967 &policer->in_prof);
2968 }
58be5c0e 2969 if (err) {
9509913a 2970 VLOG_ERR("Could not create rte meter for ingress policer");
4c47ddde 2971 free(policer);
9509913a
IS
2972 return NULL;
2973 }
2974
2975 return policer;
2976}
2977
2978static int
2979netdev_dpdk_set_policing(struct netdev* netdev, uint32_t policer_rate,
2980 uint32_t policer_burst)
2981{
2982 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2983 struct ingress_policer *policer;
2984
2985 /* Force to 0 if no rate specified,
2986 * default to 8000 kbits if burst is 0,
2987 * else stick with user-specified value.
2988 */
2989 policer_burst = (!policer_rate ? 0
2990 : !policer_burst ? 8000
2991 : policer_burst);
2992
2993 ovs_mutex_lock(&dev->mutex);
2994
2995 policer = ovsrcu_get_protected(struct ingress_policer *,
2996 &dev->ingress_policer);
2997
2998 if (dev->policer_rate == policer_rate &&
2999 dev->policer_burst == policer_burst) {
3000 /* Assume that settings haven't changed since we last set them. */
3001 ovs_mutex_unlock(&dev->mutex);
3002 return 0;
3003 }
3004
3005 /* Destroy the existing ingress policer for the device, if there is one. */
3006 if (policer) {
3007 ovsrcu_postpone(free, policer);
3008 }
3009
3010 if (policer_rate != 0) {
3011 policer = netdev_dpdk_policer_construct(policer_rate, policer_burst);
3012 } else {
3013 policer = NULL;
3014 }
3015 ovsrcu_set(&dev->ingress_policer, policer);
3016 dev->policer_rate = policer_rate;
3017 dev->policer_burst = policer_burst;
3018 ovs_mutex_unlock(&dev->mutex);
3019
3020 return 0;
3021}
3022
8a9562d2
PS
3023static int
3024netdev_dpdk_get_ifindex(const struct netdev *netdev)
3025{
3026 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
3027
3028 ovs_mutex_lock(&dev->mutex);
12d0d124
PL
3029 /* Calculate hash from the netdev name. Ensure that ifindex is a 24-bit
3030 * positive integer to meet RFC 2863 recommendations.
3031 */
3032 int ifindex = hash_string(netdev->name, 0) % 0xfffffe + 1;
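    /* The modulo-plus-one above keeps ifindex in the range [1, 0xfffffe],
     * i.e. a non-zero value that fits in 24 bits. */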
8a9562d2
PS
3033 ovs_mutex_unlock(&dev->mutex);
3034
3035 return ifindex;
3036}
3037
3038static int
d46285a2 3039netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
8a9562d2 3040{
d46285a2 3041 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
3042
3043 ovs_mutex_lock(&dev->mutex);
3044 check_link_status(dev);
3045 *carrier = dev->link.link_status;
58397e6c
KT
3046
3047 ovs_mutex_unlock(&dev->mutex);
3048
3049 return 0;
3050}
3051
3052static int
d46285a2 3053netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
58397e6c 3054{
d46285a2 3055 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
58397e6c
KT
3056
3057 ovs_mutex_lock(&dev->mutex);
3058
0a0f39df 3059 if (is_vhost_running(dev)) {
58397e6c
KT
3060 *carrier = 1;
3061 } else {
3062 *carrier = 0;
3063 }
3064
8a9562d2
PS
3065 ovs_mutex_unlock(&dev->mutex);
3066
3067 return 0;
3068}
3069
3070static long long int
d46285a2 3071netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
8a9562d2 3072{
d46285a2 3073 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
3074 long long int carrier_resets;
3075
3076 ovs_mutex_lock(&dev->mutex);
3077 carrier_resets = dev->link_reset_cnt;
3078 ovs_mutex_unlock(&dev->mutex);
3079
3080 return carrier_resets;
3081}
3082
3083static int
d46285a2 3084netdev_dpdk_set_miimon(struct netdev *netdev OVS_UNUSED,
8a9562d2
PS
3085 long long int interval OVS_UNUSED)
3086{
ee32150e 3087 return EOPNOTSUPP;
8a9562d2
PS
3088}
3089
3090static int
3091netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
3092 enum netdev_flags off, enum netdev_flags on,
64839cf4
WT
3093 enum netdev_flags *old_flagsp)
3094 OVS_REQUIRES(dev->mutex)
8a9562d2 3095{
8a9562d2
PS
3096 if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
3097 return EINVAL;
3098 }
3099
3100 *old_flagsp = dev->flags;
3101 dev->flags |= on;
3102 dev->flags &= ~off;
3103
3104 if (dev->flags == *old_flagsp) {
3105 return 0;
3106 }
3107
58397e6c 3108 if (dev->type == DPDK_DEV_ETH) {
2d37de73
EC
3109
3110 if ((dev->flags ^ *old_flagsp) & NETDEV_UP) {
3111 int err;
3112
3113 if (dev->flags & NETDEV_UP) {
3114 err = rte_eth_dev_set_link_up(dev->port_id);
3115 } else {
3116 err = rte_eth_dev_set_link_down(dev->port_id);
3117 }
3118 if (err == -ENOTSUP) {
3119 VLOG_INFO("Interface %s does not support link state "
3120 "configuration", netdev_get_name(&dev->up));
3121 } else if (err < 0) {
3122 VLOG_ERR("Interface %s link change error: %s",
3123 netdev_get_name(&dev->up), rte_strerror(-err));
3124 dev->flags = *old_flagsp;
3125 return -err;
3126 }
3127 }
3128
58397e6c
KT
3129 if (dev->flags & NETDEV_PROMISC) {
3130 rte_eth_promiscuous_enable(dev->port_id);
3131 }
8a9562d2 3132
314fb5ad 3133 netdev_change_seq_changed(&dev->up);
e543851d
ZB
3134 } else {
3135 /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is
3136 * running, then change netdev's change_seq to trigger a link state
3137 * update. */
e543851d
ZB
3138
3139 if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
0a0f39df 3140 && is_vhost_running(dev)) {
e543851d
ZB
3141 netdev_change_seq_changed(&dev->up);
3142
3143 /* Clear statistics if device is getting up. */
3144 if (NETDEV_UP & on) {
3145 rte_spinlock_lock(&dev->stats_lock);
58be5c0e 3146 memset(&dev->stats, 0, sizeof dev->stats);
e543851d
ZB
3147 rte_spinlock_unlock(&dev->stats_lock);
3148 }
3149 }
8a9562d2
PS
3150 }
3151
3152 return 0;
3153}
3154
3155static int
d46285a2 3156netdev_dpdk_update_flags(struct netdev *netdev,
8a9562d2
PS
3157 enum netdev_flags off, enum netdev_flags on,
3158 enum netdev_flags *old_flagsp)
3159{
d46285a2 3160 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
3161 int error;
3162
d46285a2
DDP
3163 ovs_mutex_lock(&dev->mutex);
3164 error = netdev_dpdk_update_flags__(dev, off, on, old_flagsp);
3165 ovs_mutex_unlock(&dev->mutex);
8a9562d2
PS
3166
3167 return error;
3168}
3169
b2e8b12f
FL
3170static int
3171netdev_dpdk_vhost_user_get_status(const struct netdev *netdev,
3172 struct smap *args)
3173{
3174 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3175
3176 ovs_mutex_lock(&dev->mutex);
3177
3178 bool client_mode = dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT;
3179 smap_add_format(args, "mode", "%s", client_mode ? "client" : "server");
3180
3181 int vid = netdev_dpdk_get_vid(dev);
3182 if (vid < 0) {
3183 smap_add_format(args, "status", "disconnected");
3184 ovs_mutex_unlock(&dev->mutex);
3185 return 0;
3186 } else {
3187 smap_add_format(args, "status", "connected");
3188 }
3189
3190 char socket_name[PATH_MAX];
3191 if (!rte_vhost_get_ifname(vid, socket_name, PATH_MAX)) {
3192 smap_add_format(args, "socket", "%s", socket_name);
3193 }
3194
3195 uint64_t features;
3196 if (!rte_vhost_get_negotiated_features(vid, &features)) {
3197 smap_add_format(args, "features", "0x%016"PRIx64, features);
3198 }
3199
3200 uint16_t mtu;
3201 if (!rte_vhost_get_mtu(vid, &mtu)) {
3202 smap_add_format(args, "mtu", "%d", mtu);
3203 }
3204
3205 int numa = rte_vhost_get_numa_node(vid);
3206 if (numa >= 0) {
3207 smap_add_format(args, "numa", "%d", numa);
3208 }
3209
3210 uint16_t vring_num = rte_vhost_get_vring_num(vid);
3211 if (vring_num) {
3212 smap_add_format(args, "num_of_vrings", "%d", vring_num);
3213 }
3214
3215 for (int i = 0; i < vring_num; i++) {
3216 struct rte_vhost_vring vring;
b2e8b12f
FL
3217
3218 rte_vhost_get_vhost_vring(vid, i, &vring);
b9a3183d
AC
3219 smap_add_nocopy(args, xasprintf("vring_%d_size", i),
3220 xasprintf("%d", vring.size));
b2e8b12f
FL
3221 }
3222
3223 ovs_mutex_unlock(&dev->mutex);
3224 return 0;
3225}
3226
31154f95
IS
3227/*
3228 * Convert a given uint32_t link speed defined in DPDK to a string
3229 * equivalent.
3230 */
3231static const char *
3232netdev_dpdk_link_speed_to_str__(uint32_t link_speed)
3233{
3234 switch (link_speed) {
3235 case ETH_SPEED_NUM_10M: return "10Mbps";
3236 case ETH_SPEED_NUM_100M: return "100Mbps";
3237 case ETH_SPEED_NUM_1G: return "1Gbps";
3238 case ETH_SPEED_NUM_2_5G: return "2.5Gbps";
3239 case ETH_SPEED_NUM_5G: return "5Gbps";
3240 case ETH_SPEED_NUM_10G: return "10Gbps";
3241 case ETH_SPEED_NUM_20G: return "20Gbps";
3242 case ETH_SPEED_NUM_25G: return "25Gbps";
3243 case ETH_SPEED_NUM_40G: return "40Gbps";
3244 case ETH_SPEED_NUM_50G: return "50Gbps";
3245 case ETH_SPEED_NUM_56G: return "56Gbps";
3246 case ETH_SPEED_NUM_100G: return "100Gbps";
3247 default: return "Not Defined";
3248 }
3249}
3250
8a9562d2 3251static int
d46285a2 3252netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
8a9562d2 3253{
d46285a2 3254 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 3255 struct rte_eth_dev_info dev_info;
31154f95 3256 uint32_t link_speed;
8a9562d2 3257
7cd1261d 3258 if (!rte_eth_dev_is_valid_port(dev->port_id)) {
8a9562d2 3259 return ENODEV;
7cd1261d 3260 }
8a9562d2 3261
03f3f9c0 3262 ovs_mutex_lock(&dpdk_mutex);
8a9562d2
PS
3263 ovs_mutex_lock(&dev->mutex);
3264 rte_eth_dev_info_get(dev->port_id, &dev_info);
31154f95 3265 link_speed = dev->link.link_speed;
8a9562d2 3266 ovs_mutex_unlock(&dev->mutex);
03f3f9c0
OM
3267 const struct rte_bus *bus;
3268 const struct rte_pci_device *pci_dev;
3269 uint16_t vendor_id = PCI_ANY_ID;
3270 uint16_t device_id = PCI_ANY_ID;
3271 bus = rte_bus_find_by_device(dev_info.device);
3272 if (bus && !strcmp(bus->name, "pci")) {
3273 pci_dev = RTE_DEV_TO_PCI(dev_info.device);
3274 if (pci_dev) {
3275 vendor_id = pci_dev->id.vendor_id;
3276 device_id = pci_dev->id.device_id;
3277 }
3278 }
3279 ovs_mutex_unlock(&dpdk_mutex);
8a9562d2 3280
fa9f4eeb 3281 smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id);
58be5c0e
MK
3282 smap_add_format(args, "numa_id", "%d",
3283 rte_eth_dev_socket_id(dev->port_id));
8a9562d2
PS
3284 smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
3285 smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
4be4d22c 3286 smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
8a9562d2
PS
3287 smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
3288 smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
3289 smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
58be5c0e
MK
3290 smap_add_format(args, "max_hash_mac_addrs", "%u",
3291 dev_info.max_hash_mac_addrs);
8a9562d2
PS
3292 smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
3293 smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);
3294
3eb8d4fa
MW
3295 /* Querying the DPDK library for iftype may be done in future, pending
3296 * support; cf. RFC 3635 Section 3.2.4. */
3297 enum { IF_TYPE_ETHERNETCSMACD = 6 };
3298
3299 smap_add_format(args, "if_type", "%"PRIu32, IF_TYPE_ETHERNETCSMACD);
3300 smap_add_format(args, "if_descr", "%s %s", rte_version(),
3301 dev_info.driver_name);
03f3f9c0
OM
3302 smap_add_format(args, "pci-vendor_id", "0x%x", vendor_id);
3303 smap_add_format(args, "pci-device_id", "0x%x", device_id);
8a9562d2 3304
31154f95
IS
3305 /* Not all link speeds are defined in the OpenFlow specs, e.g. 25 Gbps.
3306 * In that case the speed will not be reported as part of the usual
3307 * call to get_features(). Get the link speed of the device and add it
3308 * to the device status in an easy-to-read string format.
3309 */
3310 smap_add(args, "link_speed",
3311 netdev_dpdk_link_speed_to_str__(link_speed));
3312
8a9562d2
PS
3313 return 0;
3314}
3315
3316static void
3317netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
3318 OVS_REQUIRES(dev->mutex)
3319{
3320 enum netdev_flags old_flags;
3321
3322 if (admin_state) {
3323 netdev_dpdk_update_flags__(dev, 0, NETDEV_UP, &old_flags);
3324 } else {
3325 netdev_dpdk_update_flags__(dev, NETDEV_UP, 0, &old_flags);
3326 }
3327}
3328
3329static void
3330netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
3331 const char *argv[], void *aux OVS_UNUSED)
3332{
3333 bool up;
3334
3335 if (!strcasecmp(argv[argc - 1], "up")) {
3336 up = true;
 3337 } else if (!strcasecmp(argv[argc - 1], "down")) {
3338 up = false;
3339 } else {
3340 unixctl_command_reply_error(conn, "Invalid Admin State");
3341 return;
3342 }
3343
3344 if (argc > 2) {
3345 struct netdev *netdev = netdev_from_name(argv[1]);
3d0d5ab1 3346
8a9562d2 3347 if (netdev && is_dpdk_class(netdev->netdev_class)) {
3d0d5ab1 3348 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 3349
3d0d5ab1
IM
3350 ovs_mutex_lock(&dev->mutex);
3351 netdev_dpdk_set_admin_state__(dev, up);
3352 ovs_mutex_unlock(&dev->mutex);
8a9562d2
PS
3353
3354 netdev_close(netdev);
3355 } else {
3356 unixctl_command_reply_error(conn, "Not a DPDK Interface");
3357 netdev_close(netdev);
3358 return;
3359 }
3360 } else {
3d0d5ab1 3361 struct netdev_dpdk *dev;
8a9562d2
PS
3362
3363 ovs_mutex_lock(&dpdk_mutex);
3d0d5ab1
IM
3364 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
3365 ovs_mutex_lock(&dev->mutex);
3366 netdev_dpdk_set_admin_state__(dev, up);
3367 ovs_mutex_unlock(&dev->mutex);
8a9562d2
PS
3368 }
3369 ovs_mutex_unlock(&dpdk_mutex);
3370 }
3371 unixctl_command_reply(conn, "OK");
3372}
3373
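/* Usage sketch (hedged; "dpdk0" is a hypothetical port name): once registered
 * in netdev_dpdk_class_init(), this handler is reachable via ovs-appctl,
 * either for a single named port or for every DPDK port at once:
 *
 *     ovs-appctl netdev-dpdk/set-admin-state dpdk0 down
 *     ovs-appctl netdev-dpdk/set-admin-state up
 */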
0ee821c2
DB
3374static void
3375netdev_dpdk_detach(struct unixctl_conn *conn, int argc OVS_UNUSED,
3376 const char *argv[], void *aux OVS_UNUSED)
3377{
0ee821c2 3378 char *response;
7ee94cba 3379 dpdk_port_t port_id;
0ee821c2 3380 struct netdev_dpdk *dev;
40e940e4
OM
3381 struct rte_device *rte_dev;
3382 struct ds used_interfaces = DS_EMPTY_INITIALIZER;
3383 bool used = false;
0ee821c2
DB
3384
3385 ovs_mutex_lock(&dpdk_mutex);
3386
40e940e4
OM
3387 port_id = netdev_dpdk_get_port_by_devargs(argv[1]);
3388 if (!rte_eth_dev_is_valid_port(port_id)) {
0ee821c2
DB
3389 response = xasprintf("Device '%s' not found in DPDK", argv[1]);
3390 goto error;
3391 }
3392
40e940e4
OM
3393 rte_dev = rte_eth_devices[port_id].device;
3394 ds_put_format(&used_interfaces,
3395 "Device '%s' is being used by the following interfaces:",
3396 argv[1]);
3397
3398 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
3399 /* FIXME: avoid direct access to DPDK array rte_eth_devices. */
3400 if (rte_eth_devices[dev->port_id].device == rte_dev
3401 && rte_eth_devices[dev->port_id].state != RTE_ETH_DEV_UNUSED) {
3402 used = true;
3403 ds_put_format(&used_interfaces, " %s",
3404 netdev_get_name(&dev->up));
3405 }
3406 }
3407
3408 if (used) {
3409 ds_put_cstr(&used_interfaces, ". Remove them before detaching.");
3410 response = ds_steal_cstr(&used_interfaces);
3411 ds_destroy(&used_interfaces);
0ee821c2
DB
3412 goto error;
3413 }
40e940e4 3414 ds_destroy(&used_interfaces);
0ee821c2
DB
3415
3416 rte_eth_dev_close(port_id);
40e940e4 3417 if (rte_dev_remove(rte_dev) < 0) {
0ee821c2
DB
 3418 response = xasprintf("Device '%s' cannot be detached", argv[1]);
3419 goto error;
3420 }
3421
40e940e4
OM
3422 response = xasprintf("All devices shared with device '%s' "
3423 "have been detached", argv[1]);
0ee821c2
DB
3424
3425 ovs_mutex_unlock(&dpdk_mutex);
3426 unixctl_command_reply(conn, response);
3427 free(response);
3428 return;
3429
3430error:
3431 ovs_mutex_unlock(&dpdk_mutex);
3432 unixctl_command_reply_error(conn, response);
3433 free(response);
3434}
3435
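/* Usage sketch (hedged; the PCI address below is only an example): the detach
 * handler takes the devargs/PCI address of the device to remove and refuses
 * if any interface still uses a port on that device:
 *
 *     ovs-appctl netdev-dpdk/detach 0000:01:00.0
 */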
be481733
IM
3436static void
3437netdev_dpdk_get_mempool_info(struct unixctl_conn *conn,
3438 int argc, const char *argv[],
3439 void *aux OVS_UNUSED)
3440{
3441 size_t size;
3442 FILE *stream;
3443 char *response = NULL;
3444 struct netdev *netdev = NULL;
3445
3446 if (argc == 2) {
3447 netdev = netdev_from_name(argv[1]);
3448 if (!netdev || !is_dpdk_class(netdev->netdev_class)) {
3449 unixctl_command_reply_error(conn, "Not a DPDK Interface");
3450 goto out;
3451 }
3452 }
3453
3454 stream = open_memstream(&response, &size);
3455 if (!stream) {
3456 response = xasprintf("Unable to open memstream: %s.",
3457 ovs_strerror(errno));
3458 unixctl_command_reply_error(conn, response);
3459 goto out;
3460 }
3461
3462 if (netdev) {
3463 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3464
3465 ovs_mutex_lock(&dev->mutex);
3466 ovs_mutex_lock(&dpdk_mp_mutex);
3467
43307ad0 3468 rte_mempool_dump(stream, dev->dpdk_mp->mp);
be481733
IM
3469
3470 ovs_mutex_unlock(&dpdk_mp_mutex);
3471 ovs_mutex_unlock(&dev->mutex);
3472 } else {
3473 ovs_mutex_lock(&dpdk_mp_mutex);
3474 rte_mempool_list_dump(stream);
3475 ovs_mutex_unlock(&dpdk_mp_mutex);
3476 }
3477
3478 fclose(stream);
3479
3480 unixctl_command_reply(conn, response);
3481out:
3482 free(response);
3483 netdev_close(netdev);
3484}
3485
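/* Usage sketch (hedged; "dpdk0" is a hypothetical port name): with no
 * argument the handler dumps every DPDK mempool, while a netdev argument
 * restricts the dump to the mempool used by that port:
 *
 *     ovs-appctl netdev-dpdk/get-mempool-info
 *     ovs-appctl netdev-dpdk/get-mempool-info dpdk0
 */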
58397e6c
KT
3486/*
3487 * Set virtqueue flags so that we do not receive interrupts.
3488 */
3489static void
0a0f39df 3490set_irq_status(int vid)
58397e6c 3491{
4573fbd3 3492 uint32_t i;
4573fbd3 3493
f3e7ec25
MW
3494 for (i = 0; i < rte_vhost_get_vring_num(vid); i++) {
3495 rte_vhost_enable_guest_notification(vid, i, 0);
4573fbd3
FL
3496 }
3497}
3498
585a5bea
IM
3499/*
3500 * Fixes mapping for vhost-user tx queues. Must be called after each
81acebda 3501 * enabling/disabling of queues and n_txq modifications.
585a5bea
IM
3502 */
3503static void
d46285a2
DDP
3504netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
3505 OVS_REQUIRES(dev->mutex)
585a5bea
IM
3506{
3507 int *enabled_queues, n_enabled = 0;
81acebda 3508 int i, k, total_txqs = dev->up.n_txq;
585a5bea 3509
eff23640 3510 enabled_queues = xcalloc(total_txqs, sizeof *enabled_queues);
585a5bea
IM
3511
3512 for (i = 0; i < total_txqs; i++) {
 3513 /* Enabled queues are always mapped to themselves. */
d46285a2 3514 if (dev->tx_q[i].map == i) {
585a5bea
IM
3515 enabled_queues[n_enabled++] = i;
3516 }
3517 }
3518
3519 if (n_enabled == 0 && total_txqs != 0) {
f3ea2ad2 3520 enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
585a5bea
IM
3521 n_enabled = 1;
3522 }
3523
3524 k = 0;
3525 for (i = 0; i < total_txqs; i++) {
d46285a2
DDP
3526 if (dev->tx_q[i].map != i) {
3527 dev->tx_q[i].map = enabled_queues[k];
585a5bea
IM
3528 k = (k + 1) % n_enabled;
3529 }
3530 }
3531
170ef726
IM
3532 if (VLOG_IS_DBG_ENABLED()) {
3533 struct ds mapping = DS_EMPTY_INITIALIZER;
3534
3535 ds_put_format(&mapping, "TX queue mapping for port '%s':\n",
3536 netdev_get_name(&dev->up));
3537 for (i = 0; i < total_txqs; i++) {
3538 ds_put_format(&mapping, "%2d --> %2d\n", i, dev->tx_q[i].map);
3539 }
3540
3541 VLOG_DBG("%s", ds_cstr(&mapping));
3542 ds_destroy(&mapping);
585a5bea
IM
3543 }
3544
eff23640 3545 free(enabled_queues);
585a5bea 3546}
4573fbd3 3547
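/* Worked example of the remapping above (illustrative values): with
 * up.n_txq == 4 and only guest queues 0 and 2 enabled (map[0] == 0,
 * map[2] == 2), the disabled slots are filled round-robin from the enabled
 * set, leaving map == {0, 0, 2, 2}. If no queue is enabled at all, every
 * slot is mapped to OVS_VHOST_QUEUE_DISABLED. */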
58397e6c
KT
3548/*
3549 * A new virtio-net device is added to a vhost port.
3550 */
3551static int
0a0f39df 3552new_device(int vid)
58397e6c 3553{
d46285a2 3554 struct netdev_dpdk *dev;
58397e6c 3555 bool exists = false;
db8f13b0 3556 int newnode = 0;
0a0f39df
CL
3557 char ifname[IF_NAME_SZ];
3558
58be5c0e 3559 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
58397e6c
KT
3560
3561 ovs_mutex_lock(&dpdk_mutex);
3562 /* Add device to the vhost port with the same name as that passed down. */
d46285a2 3563 LIST_FOR_EACH(dev, list_node, &dpdk_list) {
c1ff66ac 3564 ovs_mutex_lock(&dev->mutex);
bb9d2623
IM
3565 if (nullable_string_is_equal(ifname, dev->vhost_id)) {
3566 uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM;
db8f13b0
CL
3567
3568 /* Get NUMA information */
0a0f39df
CL
3569 newnode = rte_vhost_get_numa_node(vid);
3570 if (newnode == -1) {
5b9bf9e0 3571#ifdef VHOST_NUMA
db8f13b0 3572 VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
0a0f39df 3573 ifname);
5b9bf9e0 3574#endif
db8f13b0 3575 newnode = dev->socket_id;
db8f13b0
CL
3576 }
3577
7235cd20
DM
3578 if (dev->requested_n_txq < qp_num
3579 || dev->requested_n_rxq < qp_num
7f5f2bd0
IM
3580 || dev->requested_socket_id != newnode) {
3581 dev->requested_socket_id = newnode;
3582 dev->requested_n_rxq = qp_num;
3583 dev->requested_n_txq = qp_num;
3584 netdev_request_reconfigure(&dev->up);
3585 } else {
3586 /* Reconfiguration not required. */
3587 dev->vhost_reconfigured = true;
3588 }
81acebda 3589
0a0f39df 3590 ovsrcu_index_set(&dev->vid, vid);
81acebda
IM
3591 exists = true;
3592
58397e6c 3593 /* Disable notifications. */
0a0f39df 3594 set_irq_status(vid);
e543851d 3595 netdev_change_seq_changed(&dev->up);
d46285a2 3596 ovs_mutex_unlock(&dev->mutex);
58397e6c
KT
3597 break;
3598 }
c1ff66ac 3599 ovs_mutex_unlock(&dev->mutex);
58397e6c
KT
3600 }
3601 ovs_mutex_unlock(&dpdk_mutex);
3602
3603 if (!exists) {
0a0f39df 3604 VLOG_INFO("vHost Device '%s' can't be added - name not found", ifname);
58397e6c
KT
3605
3606 return -1;
3607 }
3608
0a0f39df
CL
3609 VLOG_INFO("vHost Device '%s' has been added on numa node %i",
3610 ifname, newnode);
3611
58397e6c
KT
3612 return 0;
3613}
3614
f3ea2ad2
IM
3615/* Clears mapping for all available queues of vhost interface. */
3616static void
3617netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
3618 OVS_REQUIRES(dev->mutex)
3619{
3620 int i;
3621
81acebda 3622 for (i = 0; i < dev->up.n_txq; i++) {
f3ea2ad2
IM
3623 dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
3624 }
3625}
3626
58397e6c
KT
3627/*
 3628 * Remove a virtio-net device from the specific vhost port. Clear the
 3629 * device's 'vid' so that no more packets are sent or received to/from
 3630 * the VM, and wait for other threads to quiesce so that all currently
 3631 * queued packets have been sent/received before removing the device.
3632 */
3633static void
0a0f39df 3634destroy_device(int vid)
58397e6c 3635{
d46285a2 3636 struct netdev_dpdk *dev;
afee281f 3637 bool exists = false;
0a0f39df
CL
3638 char ifname[IF_NAME_SZ];
3639
58be5c0e 3640 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
58397e6c
KT
3641
3642 ovs_mutex_lock(&dpdk_mutex);
d46285a2 3643 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
0a0f39df 3644 if (netdev_dpdk_get_vid(dev) == vid) {
58397e6c 3645
d46285a2 3646 ovs_mutex_lock(&dev->mutex);
0a0f39df
CL
3647 dev->vhost_reconfigured = false;
3648 ovsrcu_index_set(&dev->vid, -1);
35c91567
DM
3649 memset(dev->vhost_rxq_enabled, 0,
3650 dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled);
d46285a2 3651 netdev_dpdk_txq_map_clear(dev);
81acebda 3652
e543851d 3653 netdev_change_seq_changed(&dev->up);
d46285a2 3654 ovs_mutex_unlock(&dev->mutex);
81acebda 3655 exists = true;
afee281f 3656 break;
58397e6c
KT
3657 }
3658 }
afee281f 3659
58397e6c
KT
3660 ovs_mutex_unlock(&dpdk_mutex);
3661
0a0f39df 3662 if (exists) {
afee281f
KT
3663 /*
 3664 * Wait for other threads to quiesce after setting the device's 'vid'
 3665 * to -1, before returning.
3666 */
3667 ovsrcu_synchronize();
3668 /*
3669 * As call to ovsrcu_synchronize() will end the quiescent state,
3670 * put thread back into quiescent state before returning.
3671 */
3672 ovsrcu_quiesce_start();
0a0f39df 3673 VLOG_INFO("vHost Device '%s' has been removed", ifname);
afee281f 3674 } else {
0a0f39df 3675 VLOG_INFO("vHost Device '%s' not found", ifname);
afee281f 3676 }
58397e6c
KT
3677}
3678
585a5bea 3679static int
0a0f39df 3680vring_state_changed(int vid, uint16_t queue_id, int enable)
585a5bea 3681{
d46285a2 3682 struct netdev_dpdk *dev;
585a5bea
IM
3683 bool exists = false;
3684 int qid = queue_id / VIRTIO_QNUM;
35c91567 3685 bool is_rx = (queue_id % VIRTIO_QNUM) == VIRTIO_TXQ;
0a0f39df
CL
3686 char ifname[IF_NAME_SZ];
3687
58be5c0e 3688 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
585a5bea 3689
585a5bea 3690 ovs_mutex_lock(&dpdk_mutex);
d46285a2 3691 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
c1ff66ac 3692 ovs_mutex_lock(&dev->mutex);
bb9d2623 3693 if (nullable_string_is_equal(ifname, dev->vhost_id)) {
35c91567
DM
3694 if (is_rx) {
3695 bool old_state = dev->vhost_rxq_enabled[qid];
3696
3697 dev->vhost_rxq_enabled[qid] = enable != 0;
3698 if (old_state != dev->vhost_rxq_enabled[qid]) {
3699 netdev_change_seq_changed(&dev->up);
3700 }
585a5bea 3701 } else {
35c91567
DM
3702 if (enable) {
3703 dev->tx_q[qid].map = qid;
3704 } else {
3705 dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
3706 }
3707 netdev_dpdk_remap_txqs(dev);
585a5bea 3708 }
585a5bea 3709 exists = true;
d46285a2 3710 ovs_mutex_unlock(&dev->mutex);
585a5bea
IM
3711 break;
3712 }
c1ff66ac 3713 ovs_mutex_unlock(&dev->mutex);
585a5bea
IM
3714 }
3715 ovs_mutex_unlock(&dpdk_mutex);
3716
3717 if (exists) {
35c91567
DM
3718 VLOG_INFO("State of queue %d ( %s_qid %d ) of vhost device '%s' "
3719 "changed to \'%s\'", queue_id, is_rx == true ? "rx" : "tx",
3720 qid, ifname, (enable == 1) ? "enabled" : "disabled");
585a5bea 3721 } else {
0a0f39df 3722 VLOG_INFO("vHost Device '%s' not found", ifname);
585a5bea
IM
3723 return -1;
3724 }
3725
3726 return 0;
3727}
3728
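/* Worked example of the queue_id decoding above (illustrative values): vhost
 * numbers virtqueues globally, alternating guest RX and guest TX rings
 * (VIRTIO_RXQ == 0, VIRTIO_TXQ == 1, VIRTIO_QNUM == 2). queue_id == 5 gives
 * qid == 2 and 5 % VIRTIO_QNUM == VIRTIO_TXQ, i.e. the guest's TX ring for
 * queue pair 2, which is the host's receive side, so is_rx is true and
 * vhost_rxq_enabled[2] is the entry that gets updated. */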
61473a0e
DM
3729static void
3730destroy_connection(int vid)
3731{
3732 struct netdev_dpdk *dev;
3733 char ifname[IF_NAME_SZ];
3734 bool exists = false;
3735
3736 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
3737
3738 ovs_mutex_lock(&dpdk_mutex);
3739 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
3740 ovs_mutex_lock(&dev->mutex);
3741 if (nullable_string_is_equal(ifname, dev->vhost_id)) {
3742 uint32_t qp_num = NR_QUEUE;
3743
3744 if (netdev_dpdk_get_vid(dev) >= 0) {
3745 VLOG_ERR("Connection on socket '%s' destroyed while vhost "
3746 "device still attached.", dev->vhost_id);
3747 }
3748
3749 /* Restore the number of queue pairs to default. */
3750 if (dev->requested_n_txq != qp_num
3751 || dev->requested_n_rxq != qp_num) {
3752 dev->requested_n_rxq = qp_num;
3753 dev->requested_n_txq = qp_num;
3754 netdev_request_reconfigure(&dev->up);
3755 }
3756 ovs_mutex_unlock(&dev->mutex);
3757 exists = true;
3758 break;
3759 }
3760 ovs_mutex_unlock(&dev->mutex);
3761 }
3762 ovs_mutex_unlock(&dpdk_mutex);
3763
3764 if (exists) {
3765 VLOG_INFO("vHost Device '%s' connection has been destroyed", ifname);
3766 } else {
3767 VLOG_INFO("vHost Device '%s' not found", ifname);
3768 }
3769}
3770
8492adc2
JS
3771/*
3772 * Retrieve the DPDK virtio device ID (vid) associated with a vhostuser
3773 * or vhostuserclient netdev.
3774 *
 3775 * Returns a value greater than or equal to zero for a valid vid or '-1'
 3776 * if there is no valid vid associated. A vid of '-1' must not be used in
 3777 * rte_vhost_* API calls.
3778 *
3779 * Once obtained and validated, a vid can be used by a PMD for multiple
3780 * subsequent rte_vhost API calls until the PMD quiesces. A PMD should
3781 * not fetch the vid again for each of a series of API calls.
3782 */
3783
0a0f39df
CL
3784int
3785netdev_dpdk_get_vid(const struct netdev_dpdk *dev)
58397e6c 3786{
0a0f39df 3787 return ovsrcu_index_get(&dev->vid);
58397e6c
KT
3788}
3789
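/* Minimal usage sketch of the pattern described above (hedged; not taken
 * from this file): a PMD fetches the vid once, validates it, and reuses it
 * for a series of rte_vhost calls until it quiesces.
 *
 *     int vid = netdev_dpdk_get_vid(dev);
 *
 *     if (vid >= 0) {
 *         uint16_t n_vrings = rte_vhost_get_vring_num(vid);
 *         ...
 *     }
 */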
9509913a
IS
3790struct ingress_policer *
3791netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
3792{
3793 return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
3794}
3795
58397e6c 3796static int
ecc1a34e 3797netdev_dpdk_class_init(void)
7d1ced01 3798{
ecc1a34e
DDP
3799 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3800
3801 /* This function can be called for different classes. The initialization
3802 * needs to be done only once */
3803 if (ovsthread_once_start(&once)) {
3804 ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
3805 unixctl_command_register("netdev-dpdk/set-admin-state",
3806 "[netdev] up|down", 1, 2,
3807 netdev_dpdk_set_admin_state, NULL);
3808
0ee821c2
DB
3809 unixctl_command_register("netdev-dpdk/detach",
3810 "pci address of device", 1, 1,
3811 netdev_dpdk_detach, NULL);
3812
be481733
IM
3813 unixctl_command_register("netdev-dpdk/get-mempool-info",
3814 "[netdev]", 0, 1,
3815 netdev_dpdk_get_mempool_info, NULL);
3816
ecc1a34e
DDP
3817 ovsthread_once_done(&once);
3818 }
362ca396 3819
7d1ced01
CL
3820 return 0;
3821}
3822
95fb793a 3823/* Client Rings */
3824
95fb793a 3825static int
3826dpdk_ring_create(const char dev_name[], unsigned int port_no,
bb37956a 3827 dpdk_port_t *eth_port_id)
95fb793a 3828{
48fffdee 3829 struct dpdk_ring *ring_pair;
0c6f39e5 3830 char *ring_name;
b8374d0d 3831 int port_id;
95fb793a 3832
48fffdee
KT
3833 ring_pair = dpdk_rte_mzalloc(sizeof *ring_pair);
3834 if (!ring_pair) {
95fb793a 3835 return ENOMEM;
3836 }
3837
7251515e 3838 /* XXX: Add support for multiqueue ring. */
0c6f39e5 3839 ring_name = xasprintf("%s_tx", dev_name);
95fb793a 3840
8f0a76c9 3841 /* Create single producer tx ring, netdev does explicit locking. */
48fffdee 3842 ring_pair->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
8f0a76c9 3843 RING_F_SP_ENQ);
0c6f39e5 3844 free(ring_name);
48fffdee
KT
3845 if (ring_pair->cring_tx == NULL) {
3846 rte_free(ring_pair);
95fb793a 3847 return ENOMEM;
3848 }
3849
0c6f39e5 3850 ring_name = xasprintf("%s_rx", dev_name);
95fb793a 3851
8f0a76c9 3852 /* Create single consumer rx ring, netdev does explicit locking. */
48fffdee 3853 ring_pair->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
8f0a76c9 3854 RING_F_SC_DEQ);
0c6f39e5 3855 free(ring_name);
48fffdee
KT
3856 if (ring_pair->cring_rx == NULL) {
3857 rte_free(ring_pair);
95fb793a 3858 return ENOMEM;
3859 }
3860
b8374d0d
MV
3861 port_id = rte_eth_from_rings(dev_name, &ring_pair->cring_rx, 1,
3862 &ring_pair->cring_tx, 1, SOCKET0);
d7310583 3863
b8374d0d 3864 if (port_id < 0) {
48fffdee 3865 rte_free(ring_pair);
95fb793a 3866 return ENODEV;
3867 }
3868
48fffdee 3869 ring_pair->user_port_id = port_no;
b8374d0d
MV
3870 ring_pair->eth_port_id = port_id;
3871 *eth_port_id = port_id;
3872
48fffdee 3873 ovs_list_push_back(&dpdk_ring_list, &ring_pair->list_node);
95fb793a 3874
95fb793a 3875 return 0;
3876}
3877
3878static int
bb37956a 3879dpdk_ring_open(const char dev_name[], dpdk_port_t *eth_port_id)
64839cf4 3880 OVS_REQUIRES(dpdk_mutex)
95fb793a 3881{
48fffdee 3882 struct dpdk_ring *ring_pair;
95fb793a 3883 unsigned int port_no;
3884 int err = 0;
3885
3886 /* Names always start with "dpdkr" */
3887 err = dpdk_dev_parse_name(dev_name, "dpdkr", &port_no);
3888 if (err) {
3889 return err;
3890 }
3891
58be5c0e 3892 /* Look through our list to find the device */
48fffdee
KT
3893 LIST_FOR_EACH (ring_pair, list_node, &dpdk_ring_list) {
3894 if (ring_pair->user_port_id == port_no) {
58397e6c 3895 VLOG_INFO("Found dpdk ring device %s:", dev_name);
58be5c0e 3896 /* Really all that is needed */
48fffdee 3897 *eth_port_id = ring_pair->eth_port_id;
95fb793a 3898 return 0;
3899 }
3900 }
3901 /* Need to create the device rings */
3902 return dpdk_ring_create(dev_name, port_no, eth_port_id);
3903}
3904
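/* Naming sketch (hedged): per dpdk_dev_parse_name(), a netdev called
 * "dpdkr0" maps to user port 0. The first open creates the single-producer
 * "dpdkr0_tx" and single-consumer "dpdkr0_rx" rings; later opens of the same
 * name simply reuse the existing ring pair. */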
7251515e 3905static int
d46285a2 3906netdev_dpdk_ring_send(struct netdev *netdev, int qid,
b30896c9 3907 struct dp_packet_batch *batch, bool concurrent_txq)
7251515e 3908{
d46285a2 3909 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a543eb0 3910 struct dp_packet *packet;
1b99bb05 3911
58be5c0e 3912 /* When using 'dpdkr' and sending to a DPDK ring, we want to ensure that
a47e2db2 3913 * the offload fields are clear. This is because the same mbuf may be
58be5c0e 3914 * modified by the consumer of the ring and returned to the datapath
a47e2db2 3915 * without recalculating the RSS hash or revalidating the checksums. */
e883448e 3916 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
a47e2db2 3917 dp_packet_reset_offload(packet);
1b99bb05 3918 }
7251515e 3919
b30896c9 3920 netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
7251515e
DV
3921 return 0;
3922}
3923
95fb793a 3924static int
3925netdev_dpdk_ring_construct(struct netdev *netdev)
3926{
bb37956a 3927 dpdk_port_t port_no = 0;
95fb793a 3928 int err = 0;
3929
95fb793a 3930 ovs_mutex_lock(&dpdk_mutex);
3931
3932 err = dpdk_ring_open(netdev->name, &port_no);
3933 if (err) {
3934 goto unlock_dpdk;
3935 }
3936
1ce30dfd
DDP
3937 err = common_construct(netdev, port_no, DPDK_DEV_ETH,
3938 rte_eth_dev_socket_id(port_no));
95fb793a 3939unlock_dpdk:
3940 ovs_mutex_unlock(&dpdk_mutex);
3941 return err;
3942}
3943
0bf765f7
IS
3944/* QoS Functions */
3945
3946/*
3947 * Initialize QoS configuration operations.
3948 */
3949static void
3950qos_conf_init(struct qos_conf *conf, const struct dpdk_qos_ops *ops)
3951{
3952 conf->ops = ops;
78bd47cf 3953 rte_spinlock_init(&conf->lock);
0bf765f7
IS
3954}
3955
3956/*
 3957 * Search the existing QoS operations in 'qos_ops' and compare each set of
 3958 * operations' qos_name to 'name'. Return a pointer to the matching
 3959 * dpdk_qos_ops, otherwise return NULL.
3960 */
3961static const struct dpdk_qos_ops *
3962qos_lookup_name(const char *name)
3963{
3964 const struct dpdk_qos_ops *const *opsp;
3965
3966 for (opsp = qos_confs; *opsp != NULL; opsp++) {
3967 const struct dpdk_qos_ops *ops = *opsp;
3968 if (!strcmp(name, ops->qos_name)) {
3969 return ops;
3970 }
3971 }
3972 return NULL;
3973}
3974
0bf765f7
IS
3975static int
3976netdev_dpdk_get_qos_types(const struct netdev *netdev OVS_UNUSED,
3977 struct sset *types)
3978{
3979 const struct dpdk_qos_ops *const *opsp;
3980
3981 for (opsp = qos_confs; *opsp != NULL; opsp++) {
3982 const struct dpdk_qos_ops *ops = *opsp;
3983 if (ops->qos_construct && ops->qos_name[0] != '\0') {
3984 sset_add(types, ops->qos_name);
3985 }
3986 }
3987 return 0;
3988}
3989
3990static int
d46285a2 3991netdev_dpdk_get_qos(const struct netdev *netdev,
0bf765f7
IS
3992 const char **typep, struct smap *details)
3993{
d46285a2 3994 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
78bd47cf 3995 struct qos_conf *qos_conf;
0bf765f7
IS
3996 int error = 0;
3997
d46285a2 3998 ovs_mutex_lock(&dev->mutex);
78bd47cf
DDP
3999 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
4000 if (qos_conf) {
4001 *typep = qos_conf->ops->qos_name;
4002 error = (qos_conf->ops->qos_get
4003 ? qos_conf->ops->qos_get(qos_conf, details): 0);
d03603c4
MC
4004 } else {
4005 /* No QoS configuration set, return an empty string */
4006 *typep = "";
0bf765f7 4007 }
d46285a2 4008 ovs_mutex_unlock(&dev->mutex);
0bf765f7
IS
4009
4010 return error;
4011}
4012
4013static int
78bd47cf
DDP
4014netdev_dpdk_set_qos(struct netdev *netdev, const char *type,
4015 const struct smap *details)
0bf765f7 4016{
d46285a2 4017 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
0bf765f7 4018 const struct dpdk_qos_ops *new_ops = NULL;
78bd47cf 4019 struct qos_conf *qos_conf, *new_qos_conf = NULL;
0bf765f7
IS
4020 int error = 0;
4021
d46285a2 4022 ovs_mutex_lock(&dev->mutex);
0bf765f7 4023
78bd47cf 4024 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
0bf765f7 4025
78bd47cf
DDP
4026 new_ops = qos_lookup_name(type);
4027
4028 if (!new_ops || !new_ops->qos_construct) {
4029 new_qos_conf = NULL;
4030 if (type && type[0]) {
4031 error = EOPNOTSUPP;
0bf765f7 4032 }
44975bb0 4033 } else if (qos_conf && qos_conf->ops == new_ops
78bd47cf
DDP
4034 && qos_conf->ops->qos_is_equal(qos_conf, details)) {
4035 new_qos_conf = qos_conf;
0bf765f7 4036 } else {
78bd47cf 4037 error = new_ops->qos_construct(details, &new_qos_conf);
7ea266e9
IS
4038 }
4039
7ea266e9 4040 if (error) {
78bd47cf
DDP
4041 VLOG_ERR("Failed to set QoS type %s on port %s: %s",
4042 type, netdev->name, rte_strerror(error));
4043 }
4044
4045 if (new_qos_conf != qos_conf) {
4046 ovsrcu_set(&dev->qos_conf, new_qos_conf);
4047 if (qos_conf) {
4048 ovsrcu_postpone(qos_conf->ops->qos_destruct, qos_conf);
4049 }
0bf765f7
IS
4050 }
4051
d46285a2 4052 ovs_mutex_unlock(&dev->mutex);
78bd47cf 4053
0bf765f7
IS
4054 return error;
4055}
4056
4057/* egress-policer details */
4058
4059struct egress_policer {
4060 struct qos_conf qos_conf;
4061 struct rte_meter_srtcm_params app_srtcm_params;
4062 struct rte_meter_srtcm egress_meter;
03f3f9c0 4063 struct rte_meter_srtcm_profile egress_prof;
0bf765f7
IS
4064};
4065
78bd47cf
DDP
4066static void
4067egress_policer_details_to_param(const struct smap *details,
4068 struct rte_meter_srtcm_params *params)
0bf765f7 4069{
78bd47cf
DDP
4070 memset(params, 0, sizeof *params);
4071 params->cir = smap_get_ullong(details, "cir", 0);
4072 params->cbs = smap_get_ullong(details, "cbs", 0);
4073 params->ebs = 0;
0bf765f7
IS
4074}
4075
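/* Configuration sketch (hedged; the port name and numbers are illustrative,
 * and the ovs-vsctl syntax is assumed rather than defined in this file): the
 * "cir" and "cbs" keys read above come from the other_config column of a QoS
 * record of type "egress-policer", e.g.:
 *
 *     ovs-vsctl set port vhost-user0 qos=@qos -- \
 *         --id=@qos create qos type=egress-policer \
 *         other-config:cir=46000000 other-config:cbs=2048
 */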
4076static int
78bd47cf
DDP
4077egress_policer_qos_construct(const struct smap *details,
4078 struct qos_conf **conf)
0bf765f7 4079{
0bf765f7 4080 struct egress_policer *policer;
0bf765f7
IS
4081 int err = 0;
4082
0bf765f7
IS
4083 policer = xmalloc(sizeof *policer);
4084 qos_conf_init(&policer->qos_conf, &egress_policer_ops);
78bd47cf 4085 egress_policer_details_to_param(details, &policer->app_srtcm_params);
03f3f9c0
OM
4086 err = rte_meter_srtcm_profile_config(&policer->egress_prof,
4087 &policer->app_srtcm_params);
4088 if (!err) {
4089 err = rte_meter_srtcm_config(&policer->egress_meter,
4090 &policer->egress_prof);
4091 }
4092
78bd47cf
DDP
4093 if (!err) {
4094 *conf = &policer->qos_conf;
4095 } else {
03f3f9c0 4096 VLOG_ERR("Could not create rte meter for egress policer");
7ea266e9 4097 free(policer);
78bd47cf 4098 *conf = NULL;
7ea266e9
IS
4099 err = -err;
4100 }
0bf765f7
IS
4101
4102 return err;
4103}
4104
4105static void
78bd47cf 4106egress_policer_qos_destruct(struct qos_conf *conf)
0bf765f7
IS
4107{
4108 struct egress_policer *policer = CONTAINER_OF(conf, struct egress_policer,
78bd47cf 4109 qos_conf);
0bf765f7
IS
4110 free(policer);
4111}
4112
4113static int
78bd47cf 4114egress_policer_qos_get(const struct qos_conf *conf, struct smap *details)
0bf765f7 4115{
78bd47cf
DDP
4116 struct egress_policer *policer =
4117 CONTAINER_OF(conf, struct egress_policer, qos_conf);
4118
4119 smap_add_format(details, "cir", "%"PRIu64, policer->app_srtcm_params.cir);
4120 smap_add_format(details, "cbs", "%"PRIu64, policer->app_srtcm_params.cbs);
050c60bf 4121
0bf765f7
IS
4122 return 0;
4123}
4124
78bd47cf 4125static bool
47a45d86
KT
4126egress_policer_qos_is_equal(const struct qos_conf *conf,
4127 const struct smap *details)
0bf765f7 4128{
78bd47cf
DDP
4129 struct egress_policer *policer =
4130 CONTAINER_OF(conf, struct egress_policer, qos_conf);
4131 struct rte_meter_srtcm_params params;
0bf765f7 4132
78bd47cf 4133 egress_policer_details_to_param(details, &params);
7ea266e9 4134
78bd47cf 4135 return !memcmp(&params, &policer->app_srtcm_params, sizeof params);
0bf765f7
IS
4136}
4137
0bf765f7 4138static int
3e90f7d7 4139egress_policer_run(struct qos_conf *conf, struct rte_mbuf **pkts, int pkt_cnt,
7d7ded7a 4140 bool should_steal)
0bf765f7 4141{
0bf765f7 4142 int cnt = 0;
78bd47cf
DDP
4143 struct egress_policer *policer =
4144 CONTAINER_OF(conf, struct egress_policer, qos_conf);
0bf765f7 4145
03f3f9c0
OM
4146 cnt = netdev_dpdk_policer_run(&policer->egress_meter,
4147 &policer->egress_prof, pkts,
7d7ded7a 4148 pkt_cnt, should_steal);
0bf765f7
IS
4149
4150 return cnt;
4151}
4152
4153static const struct dpdk_qos_ops egress_policer_ops = {
4154 "egress-policer", /* qos_name */
4155 egress_policer_qos_construct,
4156 egress_policer_qos_destruct,
4157 egress_policer_qos_get,
78bd47cf 4158 egress_policer_qos_is_equal,
0bf765f7
IS
4159 egress_policer_run
4160};
4161
050c60bf
DDP
4162static int
4163netdev_dpdk_reconfigure(struct netdev *netdev)
4164{
4165 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4166 int err = 0;
4167
050c60bf
DDP
4168 ovs_mutex_lock(&dev->mutex);
4169
4170 if (netdev->n_txq == dev->requested_n_txq
0072e931 4171 && netdev->n_rxq == dev->requested_n_rxq
b685696b 4172 && dev->mtu == dev->requested_mtu
f8b64a61 4173 && dev->lsc_interrupt_mode == dev->requested_lsc_interrupt_mode
b685696b 4174 && dev->rxq_size == dev->requested_rxq_size
bd4e172b 4175 && dev->txq_size == dev->requested_txq_size
606f6650
EC
4176 && dev->socket_id == dev->requested_socket_id
4177 && dev->started) {
050c60bf
DDP
4178 /* Reconfiguration is unnecessary */
4179
4180 goto out;
4181 }
4182
4183 rte_eth_dev_stop(dev->port_id);
606f6650 4184 dev->started = false;
050c60bf 4185
d555d9bd 4186 err = netdev_dpdk_mempool_configure(dev);
b6b26021 4187 if (err && err != EEXIST) {
d555d9bd 4188 goto out;
0072e931
MK
4189 }
4190
f8b64a61
RM
4191 dev->lsc_interrupt_mode = dev->requested_lsc_interrupt_mode;
4192
050c60bf
DDP
4193 netdev->n_txq = dev->requested_n_txq;
4194 netdev->n_rxq = dev->requested_n_rxq;
4195
b685696b
CL
4196 dev->rxq_size = dev->requested_rxq_size;
4197 dev->txq_size = dev->requested_txq_size;
4198
050c60bf
DDP
4199 rte_free(dev->tx_q);
4200 err = dpdk_eth_dev_init(dev);
eff23640
DDP
4201 dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
4202 if (!dev->tx_q) {
4203 err = ENOMEM;
4204 }
050c60bf 4205
0072e931
MK
4206 netdev_change_seq_changed(netdev);
4207
050c60bf 4208out:
050c60bf 4209 ovs_mutex_unlock(&dev->mutex);
050c60bf
DDP
4210 return err;
4211}
4212
7f381c2e 4213static int
2d24d165 4214dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
2d24d165 4215 OVS_REQUIRES(dev->mutex)
050c60bf 4216{
2d24d165
CL
4217 dev->up.n_txq = dev->requested_n_txq;
4218 dev->up.n_rxq = dev->requested_n_rxq;
96e9b168 4219 int err;
050c60bf 4220
35c91567
DM
4221 /* Always keep RX queue 0 enabled for implementations that won't
4222 * report vring states. */
4223 dev->vhost_rxq_enabled[0] = true;
4224
81acebda
IM
4225 /* Enable TX queue 0 by default if it wasn't disabled. */
4226 if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
4227 dev->tx_q[0].map = 0;
4228 }
4229
4230 netdev_dpdk_remap_txqs(dev);
4231
d555d9bd 4232 err = netdev_dpdk_mempool_configure(dev);
b6b26021 4233 if (!err) {
43307ad0 4234 /* A new mempool was created or re-used. */
d555d9bd 4235 netdev_change_seq_changed(&dev->up);
03f3f9c0 4236 } else if (err != EEXIST) {
b6b26021 4237 return err;
db8f13b0 4238 }
0a0f39df 4239 if (netdev_dpdk_get_vid(dev) >= 0) {
894af647 4240 if (dev->vhost_reconfigured == false) {
4241 dev->vhost_reconfigured = true;
4242 /* Carrier status may need updating. */
4243 netdev_change_seq_changed(&dev->up);
4244 }
81acebda 4245 }
7f381c2e
DDP
4246
4247 return 0;
2d24d165
CL
4248}
4249
4250static int
4251netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
4252{
4253 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
7f381c2e 4254 int err;
2d24d165 4255
2d24d165 4256 ovs_mutex_lock(&dev->mutex);
7f381c2e 4257 err = dpdk_vhost_reconfigure_helper(dev);
2d24d165 4258 ovs_mutex_unlock(&dev->mutex);
7f381c2e
DDP
4259
4260 return err;
2d24d165
CL
4261}
4262
4263static int
4264netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
4265{
4266 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
7f381c2e 4267 int err;
a14d1cc8 4268 uint64_t vhost_flags = 0;
10087cba 4269 bool zc_enabled;
2d24d165 4270
2d24d165
CL
4271 ovs_mutex_lock(&dev->mutex);
4272
c1ff66ac
CL
4273 /* Configure vHost client mode if requested and if the following criteria
4274 * are met:
2d24d165
CL
4275 * 1. Device hasn't been registered yet.
4276 * 2. A path has been specified.
c1ff66ac 4277 */
bb9d2623 4278 if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && dev->vhost_id) {
a14d1cc8
MK
4279 /* Register client-mode device. */
4280 vhost_flags |= RTE_VHOST_USER_CLIENT;
4281
4282 /* Enable IOMMU support, if explicitly requested. */
4283 if (dpdk_vhost_iommu_enabled()) {
4284 vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
4285 }
10087cba 4286
30e834dc
LB
4287 /* Enable POSTCOPY support, if explicitly requested. */
4288 if (dpdk_vhost_postcopy_enabled()) {
4289 vhost_flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
4290 }
4291
10087cba
CL
4292 zc_enabled = dev->vhost_driver_flags
4293 & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
4294 /* Enable zero copy flag, if requested */
4295 if (zc_enabled) {
4296 vhost_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
4297 }
4298
a14d1cc8 4299 err = rte_vhost_driver_register(dev->vhost_id, vhost_flags);
c1ff66ac 4300 if (err) {
2d24d165
CL
4301 VLOG_ERR("vhost-user device setup failure for device %s\n",
4302 dev->vhost_id);
7f381c2e 4303 goto unlock;
c1ff66ac 4304 } else {
2d24d165 4305 /* Configuration successful */
a14d1cc8 4306 dev->vhost_driver_flags |= vhost_flags;
2d24d165
CL
4307 VLOG_INFO("vHost User device '%s' created in 'client' mode, "
4308 "using client socket '%s'",
4309 dev->up.name, dev->vhost_id);
10087cba
CL
4310 if (zc_enabled) {
4311 VLOG_INFO("Zero copy enabled for vHost port %s", dev->up.name);
4312 }
c1ff66ac 4313 }
f3e7ec25
MW
4314
4315 err = rte_vhost_driver_callback_register(dev->vhost_id,
4316 &virtio_net_device_ops);
4317 if (err) {
4318 VLOG_ERR("rte_vhost_driver_callback_register failed for "
4319 "vhost user client port: %s\n", dev->up.name);
4320 goto unlock;
4321 }
4322
4323 err = rte_vhost_driver_disable_features(dev->vhost_id,
4324 1ULL << VIRTIO_NET_F_HOST_TSO4
4325 | 1ULL << VIRTIO_NET_F_HOST_TSO6
4326 | 1ULL << VIRTIO_NET_F_CSUM);
4327 if (err) {
4328 VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
4329 "client port: %s\n", dev->up.name);
4330 goto unlock;
4331 }
4332
4333 err = rte_vhost_driver_start(dev->vhost_id);
4334 if (err) {
4335 VLOG_ERR("rte_vhost_driver_start failed for vhost user "
4336 "client port: %s\n", dev->up.name);
4337 goto unlock;
4338 }
c1ff66ac
CL
4339 }
4340
7f381c2e
DDP
4341 err = dpdk_vhost_reconfigure_helper(dev);
4342
4343unlock:
050c60bf 4344 ovs_mutex_unlock(&dev->mutex);
050c60bf 4345
7f381c2e 4346 return err;
050c60bf
DDP
4347}
4348
5fc5c50f
IM
4349bool
4350netdev_dpdk_flow_api_supported(struct netdev *netdev)
4351{
4352 struct netdev_dpdk *dev;
4353 bool ret = false;
4354
4355 if (!is_dpdk_class(netdev->netdev_class)) {
4356 goto out;
4357 }
4358
4359 dev = netdev_dpdk_cast(netdev);
4360 ovs_mutex_lock(&dev->mutex);
4361 if (dev->type == DPDK_DEV_ETH) {
4362 /* TODO: Check if we able to offload some minimal flow. */
4363 ret = true;
4364 }
4365 ovs_mutex_unlock(&dev->mutex);
4366out:
4367 return ret;
4368}
4369
6775bdfc
RBY
4370int
4371netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
4372 struct rte_flow *rte_flow,
4373 struct rte_flow_error *error)
4374{
4375 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4376 int ret;
4377
4378 ovs_mutex_lock(&dev->mutex);
4379 ret = rte_flow_destroy(dev->port_id, rte_flow, error);
4380 ovs_mutex_unlock(&dev->mutex);
4381 return ret;
4382}
4383
4384struct rte_flow *
4385netdev_dpdk_rte_flow_create(struct netdev *netdev,
4386 const struct rte_flow_attr *attr,
4387 const struct rte_flow_item *items,
4388 const struct rte_flow_action *actions,
4389 struct rte_flow_error *error)
4390{
4391 struct rte_flow *flow;
4392 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4393
4394 ovs_mutex_lock(&dev->mutex);
4395 flow = rte_flow_create(dev->port_id, attr, items, actions, error);
4396 ovs_mutex_unlock(&dev->mutex);
4397 return flow;
4398}
e8a2b5bf 4399
89c09c1c
BP
4400#define NETDEV_DPDK_CLASS_COMMON \
4401 .is_pmd = true, \
4402 .alloc = netdev_dpdk_alloc, \
4403 .dealloc = netdev_dpdk_dealloc, \
4404 .get_config = netdev_dpdk_get_config, \
4405 .get_numa_id = netdev_dpdk_get_numa_id, \
4406 .set_etheraddr = netdev_dpdk_set_etheraddr, \
4407 .get_etheraddr = netdev_dpdk_get_etheraddr, \
4408 .get_mtu = netdev_dpdk_get_mtu, \
4409 .set_mtu = netdev_dpdk_set_mtu, \
4410 .get_ifindex = netdev_dpdk_get_ifindex, \
4411 .get_carrier_resets = netdev_dpdk_get_carrier_resets, \
4412 .set_miimon_interval = netdev_dpdk_set_miimon, \
4413 .set_policing = netdev_dpdk_set_policing, \
4414 .get_qos_types = netdev_dpdk_get_qos_types, \
4415 .get_qos = netdev_dpdk_get_qos, \
4416 .set_qos = netdev_dpdk_set_qos, \
4417 .update_flags = netdev_dpdk_update_flags, \
4418 .rxq_alloc = netdev_dpdk_rxq_alloc, \
4419 .rxq_construct = netdev_dpdk_rxq_construct, \
4420 .rxq_destruct = netdev_dpdk_rxq_destruct, \
c0af6425 4421 .rxq_dealloc = netdev_dpdk_rxq_dealloc
89c09c1c
BP
4422
4423#define NETDEV_DPDK_CLASS_BASE \
4424 NETDEV_DPDK_CLASS_COMMON, \
4425 .init = netdev_dpdk_class_init, \
4426 .destruct = netdev_dpdk_destruct, \
4427 .set_tx_multiq = netdev_dpdk_set_tx_multiq, \
4428 .get_carrier = netdev_dpdk_get_carrier, \
4429 .get_stats = netdev_dpdk_get_stats, \
4430 .get_custom_stats = netdev_dpdk_get_custom_stats, \
4431 .get_features = netdev_dpdk_get_features, \
4432 .get_status = netdev_dpdk_get_status, \
4433 .reconfigure = netdev_dpdk_reconfigure, \
5fc5c50f 4434 .rxq_recv = netdev_dpdk_rxq_recv
89c09c1c
BP
4435
4436static const struct netdev_class dpdk_class = {
4437 .type = "dpdk",
4438 NETDEV_DPDK_CLASS_BASE,
4439 .construct = netdev_dpdk_construct,
4440 .set_config = netdev_dpdk_set_config,
4441 .send = netdev_dpdk_eth_send,
4442};
4443
4444static const struct netdev_class dpdk_ring_class = {
4445 .type = "dpdkr",
4446 NETDEV_DPDK_CLASS_BASE,
4447 .construct = netdev_dpdk_ring_construct,
4448 .set_config = netdev_dpdk_ring_set_config,
4449 .send = netdev_dpdk_ring_send,
4450};
4451
4452static const struct netdev_class dpdk_vhost_class = {
4453 .type = "dpdkvhostuser",
4454 NETDEV_DPDK_CLASS_COMMON,
4455 .construct = netdev_dpdk_vhost_construct,
4456 .destruct = netdev_dpdk_vhost_destruct,
4457 .send = netdev_dpdk_vhost_send,
4458 .get_carrier = netdev_dpdk_vhost_get_carrier,
4459 .get_stats = netdev_dpdk_vhost_get_stats,
c161357d 4460 .get_custom_stats = netdev_dpdk_vhost_get_custom_stats,
89c09c1c
BP
4461 .get_status = netdev_dpdk_vhost_user_get_status,
4462 .reconfigure = netdev_dpdk_vhost_reconfigure,
35c91567
DM
4463 .rxq_recv = netdev_dpdk_vhost_rxq_recv,
4464 .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
89c09c1c
BP
4465};
4466
4467static const struct netdev_class dpdk_vhost_client_class = {
4468 .type = "dpdkvhostuserclient",
4469 NETDEV_DPDK_CLASS_COMMON,
4470 .construct = netdev_dpdk_vhost_client_construct,
4471 .destruct = netdev_dpdk_vhost_destruct,
4472 .set_config = netdev_dpdk_vhost_client_set_config,
4473 .send = netdev_dpdk_vhost_send,
4474 .get_carrier = netdev_dpdk_vhost_get_carrier,
4475 .get_stats = netdev_dpdk_vhost_get_stats,
c161357d 4476 .get_custom_stats = netdev_dpdk_vhost_get_custom_stats,
89c09c1c
BP
4477 .get_status = netdev_dpdk_vhost_user_get_status,
4478 .reconfigure = netdev_dpdk_vhost_client_reconfigure,
35c91567
DM
4479 .rxq_recv = netdev_dpdk_vhost_rxq_recv,
4480 .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
89c09c1c 4481};
95fb793a 4482
8a9562d2
PS
4483void
4484netdev_dpdk_register(void)
4485{
bab69409
AC
4486 netdev_register_provider(&dpdk_class);
4487 netdev_register_provider(&dpdk_ring_class);
53f50d24 4488 netdev_register_provider(&dpdk_vhost_class);
2d24d165 4489 netdev_register_provider(&dpdk_vhost_client_class);
8a9562d2 4490}