git.proxmox.com Git - ovs.git/blame - lib/netdev-dpdk.c
netdev-dpdk: Reset queue number for vhost devices on vm shutdown.
8a9562d2 1/*
12d0d124 2 * Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.
8a9562d2
PS
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
01961bbd 18#include "netdev-dpdk.h"
8a9562d2 19
6ebc4b09 20#include <errno.h>
8a9562d2
PS
21#include <signal.h>
22#include <stdlib.h>
6ebc4b09 23#include <string.h>
8a9562d2 24#include <unistd.h>
f3e7ec25
MW
25#include <linux/virtio_net.h>
26#include <sys/socket.h>
27#include <linux/if.h>
01961bbd 28
5e925ccc 29#include <rte_bus_pci.h>
01961bbd
DDP
30#include <rte_config.h>
31#include <rte_cycles.h>
32#include <rte_errno.h>
33#include <rte_eth_ring.h>
34#include <rte_ethdev.h>
6ebc4b09 35#include <rte_flow.h>
01961bbd
DDP
36#include <rte_malloc.h>
37#include <rte_mbuf.h>
38#include <rte_meter.h>
fc56f5e0 39#include <rte_pci.h>
3eb8d4fa 40#include <rte_version.h>
6ebc4b09 41#include <rte_vhost.h>
8a9562d2 42
e8a2b5bf 43#include "cmap.h"
7d1ced01 44#include "dirs.h"
e14deea0 45#include "dp-packet.h"
01961bbd 46#include "dpdk.h"
8a9562d2 47#include "dpif-netdev.h"
e5c0f5a4 48#include "fatal-signal.h"
8a9562d2
PS
49#include "netdev-provider.h"
50#include "netdev-vport.h"
51#include "odp-util.h"
eac84432 52#include "openvswitch/dynamic-string.h"
25d436fb 53#include "openvswitch/list.h"
6ebc4b09 54#include "openvswitch/match.h"
25d436fb 55#include "openvswitch/ofp-print.h"
6ebc4b09 56#include "openvswitch/shash.h"
25d436fb 57#include "openvswitch/vlog.h"
94143fc4 58#include "ovs-numa.h"
8a9562d2 59#include "ovs-rcu.h"
6ebc4b09 60#include "ovs-thread.h"
8a9562d2 61#include "packets.h"
0bf765f7 62#include "smap.h"
8a9562d2 63#include "sset.h"
8a9562d2 64#include "timeval.h"
6ebc4b09 65#include "unaligned.h"
8a9562d2 66#include "unixctl.h"
6ebc4b09
IM
67#include "util.h"
68#include "uuid.h"
8a9562d2 69
f3e7ec25
MW
70enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
71
05b49df6 72VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
8a9562d2
PS
73static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
74
75#define DPDK_PORT_WATCHDOG_INTERVAL 5
76
77#define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
78#define OVS_VPORT_DPDK "ovs_dpdk"
79
80/*
81 * We need to reserve plenty of extra space in the mbufs so we can align the
82 * DMA addresses to 4KB.
18f777b2
TP
83 * The minimum mbuf size is limited to avoid scatter behaviour and a drop in
84 * performance for standard Ethernet MTU.
8a9562d2 85 */
58be5c0e
MK
86#define ETHER_HDR_MAX_LEN (ETHER_HDR_LEN + ETHER_CRC_LEN \
87 + (2 * VLAN_HEADER_LEN))
4be4d22c
MK
88#define MTU_TO_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
89#define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN)
58be5c0e
MK
90#define FRAME_LEN_TO_MTU(frame_len) ((frame_len) \
91 - ETHER_HDR_LEN - ETHER_CRC_LEN)
4be4d22c 92#define NETDEV_DPDK_MBUF_ALIGN 1024
0072e931 93#define NETDEV_DPDK_MAX_PKT_LEN 9728
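/*
 * Illustrative example (editor's addition, not part of the upstream file):
 * with the standard Ethernet MTU of 1500 and the usual header sizes
 * (ETHER_HDR_LEN = 14, ETHER_CRC_LEN = 4, VLAN_HEADER_LEN = 4), the macros
 * above work out to:
 *
 *   ETHER_HDR_MAX_LEN          = 14 + 4 + 2 * 4  = 26
 *   MTU_TO_FRAME_LEN(1500)     = 1500 + 14 + 4   = 1518
 *   MTU_TO_MAX_FRAME_LEN(1500) = 1500 + 26       = 1526
 *   FRAME_LEN_TO_MTU(1518)     = 1518 - 14 - 4   = 1500
 */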
8a9562d2 94
43307ad0
IS
95/* Max and min number of mbufs in the mempool. OVS tries to allocate a
96 * mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
97 * enough hugepages) we keep halving the number until the allocation succeeds
98 * or we reach MIN_NB_MBUF */
99
100#define MAX_NB_MBUF (4096 * 64)
da79ce2b
DDP
101#define MIN_NB_MBUF (4096 * 4)
102#define MP_CACHE_SZ RTE_MEMPOOL_CACHE_MAX_SIZE
103
43307ad0
IS
104/* MAX_NB_MBUF can be halved repeatedly until it reaches MIN_NB_MBUF. */
105BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF)
106 == 0);
107
108/* The smallest possible NB_MBUF that we're going to try should be a multiple
109 * of MP_CACHE_SZ. This is advised by DPDK documentation. */
110BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF))
111 % MP_CACHE_SZ == 0);
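/*
 * Illustrative example (editor's addition, not part of the upstream file):
 * with MAX_NB_MBUF = 4096 * 64 = 262144 and MIN_NB_MBUF = 4096 * 4 = 16384,
 * the mempool sizes tried by dpdk_mp_create() are:
 *
 *   262144 -> 131072 -> 65536 -> 32768 -> 16384
 *
 * The two BUILD_ASSERT_DECLs above guarantee that this halving never leaves
 * a remainder and that the smallest size tried (16384) is a multiple of
 * MP_CACHE_SZ (RTE_MEMPOOL_CACHE_MAX_SIZE, 512 in recent DPDK releases).
 */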
112
d6e3feb5 113/*
114 * DPDK XSTATS Counter names definition
115 */
116#define XSTAT_RX_64_PACKETS "rx_size_64_packets"
117#define XSTAT_RX_65_TO_127_PACKETS "rx_size_65_to_127_packets"
118#define XSTAT_RX_128_TO_255_PACKETS "rx_size_128_to_255_packets"
119#define XSTAT_RX_256_TO_511_PACKETS "rx_size_256_to_511_packets"
120#define XSTAT_RX_512_TO_1023_PACKETS "rx_size_512_to_1023_packets"
121#define XSTAT_RX_1024_TO_1522_PACKETS "rx_size_1024_to_1522_packets"
122#define XSTAT_RX_1523_TO_MAX_PACKETS "rx_size_1523_to_max_packets"
123
124#define XSTAT_TX_64_PACKETS "tx_size_64_packets"
125#define XSTAT_TX_65_TO_127_PACKETS "tx_size_65_to_127_packets"
126#define XSTAT_TX_128_TO_255_PACKETS "tx_size_128_to_255_packets"
127#define XSTAT_TX_256_TO_511_PACKETS "tx_size_256_to_511_packets"
128#define XSTAT_TX_512_TO_1023_PACKETS "tx_size_512_to_1023_packets"
129#define XSTAT_TX_1024_TO_1522_PACKETS "tx_size_1024_to_1522_packets"
130#define XSTAT_TX_1523_TO_MAX_PACKETS "tx_size_1523_to_max_packets"
131
d57f777f 132#define XSTAT_RX_MULTICAST_PACKETS "rx_multicast_packets"
d6e3feb5 133#define XSTAT_TX_MULTICAST_PACKETS "tx_multicast_packets"
134#define XSTAT_RX_BROADCAST_PACKETS "rx_broadcast_packets"
135#define XSTAT_TX_BROADCAST_PACKETS "tx_broadcast_packets"
136#define XSTAT_RX_UNDERSIZED_ERRORS "rx_undersized_errors"
137#define XSTAT_RX_OVERSIZE_ERRORS "rx_oversize_errors"
138#define XSTAT_RX_FRAGMENTED_ERRORS "rx_fragmented_errors"
139#define XSTAT_RX_JABBER_ERRORS "rx_jabber_errors"
140
8a9562d2
PS
141#define SOCKET0 0
142
b685696b
CL
143/* Default size of Physical NIC RXQ */
144#define NIC_PORT_DEFAULT_RXQ_SIZE 2048
145/* Default size of Physical NIC TXQ */
146#define NIC_PORT_DEFAULT_TXQ_SIZE 2048
147/* Maximum size of Physical NIC Queues */
148#define NIC_PORT_MAX_Q_SIZE 4096
79f5354c 149
585a5bea 150#define OVS_VHOST_MAX_QUEUE_NUM 1024 /* Maximum number of vHost TX queues. */
f3ea2ad2
IM
151#define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
152#define OVS_VHOST_QUEUE_DISABLED (-2) /* Queue was disabled by guest and not
153 * yet mapped to another queue. */
585a5bea 154
bb37956a
IM
155#define DPDK_ETH_PORT_ID_INVALID RTE_MAX_ETHPORTS
156
5e925ccc
MK
157/* DPDK library uses uint16_t for port_id. */
158typedef uint16_t dpdk_port_t;
fa9f4eeb 159#define DPDK_PORT_ID_FMT "%"PRIu16
bb37956a 160
31871ee3 161#define VHOST_ENQ_RETRY_NUM 8
0a0f39df 162#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
95e9881f 163
8a9562d2 164static const struct rte_eth_conf port_conf = {
a28ddd11
DDP
165 .rxmode = {
166 .mq_mode = ETH_MQ_RX_RSS,
167 .split_hdr_size = 0,
03f3f9c0 168 .offloads = 0,
a28ddd11
DDP
169 },
170 .rx_adv_conf = {
171 .rss_conf = {
172 .rss_key = NULL,
543342a4 173 .rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
8a9562d2 174 },
a28ddd11
DDP
175 },
176 .txmode = {
177 .mq_mode = ETH_MQ_TX_NONE,
178 },
8a9562d2
PS
179};
180
f3e7ec25
MW
181/*
182 * These callbacks allow virtio-net devices to be added to vhost ports when
183 * configuration has been fully completed.
184 */
185static int new_device(int vid);
186static void destroy_device(int vid);
187static int vring_state_changed(int vid, uint16_t queue_id, int enable);
61473a0e 188static void destroy_connection(int vid);
f3e7ec25
MW
189static const struct vhost_device_ops virtio_net_device_ops =
190{
191 .new_device = new_device,
192 .destroy_device = destroy_device,
193 .vring_state_changed = vring_state_changed,
61473a0e
DM
194 .features_changed = NULL,
195 .new_connection = NULL,
196 .destroy_connection = destroy_connection,
f3e7ec25
MW
197};
198
58f7c37b
DDP
199enum { DPDK_RING_SIZE = 256 };
200BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
8a9562d2
PS
201enum { DRAIN_TSC = 200000ULL };
202
58397e6c
KT
203enum dpdk_dev_type {
204 DPDK_DEV_ETH = 0,
7d1ced01 205 DPDK_DEV_VHOST = 1,
58397e6c
KT
206};
207
0bf765f7
IS
208/* Quality of Service */
209
210/* An instance of a QoS configuration. Always associated with a particular
211 * network device.
212 *
213 * Each QoS implementation subclasses this with whatever additional data it
214 * needs.
215 */
216struct qos_conf {
217 const struct dpdk_qos_ops *ops;
78bd47cf 218 rte_spinlock_t lock;
0bf765f7
IS
219};
220
221/* A particular implementation of dpdk QoS operations.
222 *
223 * The functions below return 0 if successful or a positive errno value on
224 * failure, except where otherwise noted. All of them must be provided, except
225 * where otherwise noted.
226 */
227struct dpdk_qos_ops {
228
229 /* Name of the QoS type */
230 const char *qos_name;
231
78bd47cf
DDP
232 /* Called to construct a qos_conf object. The implementation should make
233 * the appropriate calls to configure QoS according to 'details'.
0bf765f7
IS
234 *
235 * The contents of 'details' should be documented as valid for 'ovs_name'
236 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
237 * (which is built as ovs-vswitchd.conf.db(8)).
238 *
78bd47cf
DDP
239 * This function must return 0 if and only if it sets '*conf' to an
240 * initialized 'struct qos_conf'.
0bf765f7
IS
241 *
242 * For all QoS implementations it should always be non-null.
243 */
78bd47cf 244 int (*qos_construct)(const struct smap *details, struct qos_conf **conf);
0bf765f7
IS
245
246 /* Destroys the data structures allocated by the implementation as part of
78bd47cf 247 * 'qos_conf'.
0bf765f7
IS
248 *
249 * For all QoS implementations it should always be non-null.
250 */
78bd47cf 251 void (*qos_destruct)(struct qos_conf *conf);
0bf765f7 252
78bd47cf 253 /* Retrieves details of 'conf' configuration into 'details'.
0bf765f7
IS
254 *
255 * The contents of 'details' should be documented as valid for 'ovs_name'
256 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
257 * (which is built as ovs-vswitchd.conf.db(8)).
258 */
78bd47cf 259 int (*qos_get)(const struct qos_conf *conf, struct smap *details);
0bf765f7 260
78bd47cf 261 /* Returns true if 'conf' is already configured according to 'details'.
0bf765f7
IS
262 *
263 * The contents of 'details' should be documented as valid for 'ovs_name'
264 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
265 * (which is built as ovs-vswitchd.conf.db(8)).
266 *
78bd47cf 267 * For all QoS implementations it should always be non-null.
0bf765f7 268 */
78bd47cf
DDP
269 bool (*qos_is_equal)(const struct qos_conf *conf,
270 const struct smap *details);
0bf765f7
IS
271
272 /* Modify an array of rte_mbufs. The modification is specific to
273 * each qos implementation.
274 *
275 * The function should take an array of mbufs and an int representing
276 * the current number of mbufs present in the array.
277 *
278 * After the function has performed a qos modification to the array of
279 * mbufs it returns an int representing the number of mbufs now present in
280 * the array. This value can then be passed to the port send function
281 * along with the modified array for transmission.
282 *
283 * For all QoS implementations it should always be non-null.
284 */
78bd47cf 285 int (*qos_run)(struct qos_conf *qos_conf, struct rte_mbuf **pkts,
7d7ded7a 286 int pkt_cnt, bool should_steal);
0bf765f7
IS
287};
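/*
 * Editor's illustrative sketch (not part of the upstream file): a minimal,
 * hypothetical pass-through QoS implementation, shown only to illustrate how
 * the 'struct dpdk_qos_ops' callbacks fit together.  The real in-tree
 * implementation is 'egress_policer_ops', declared below; a complete
 * implementation would also provide 'qos_get'.
 *
 *   static const struct dpdk_qos_ops noop_ops;
 *
 *   static int
 *   noop_construct(const struct smap *details OVS_UNUSED,
 *                  struct qos_conf **conf)
 *   {
 *       struct qos_conf *c = xmalloc(sizeof *c);
 *
 *       c->ops = &noop_ops;
 *       rte_spinlock_init(&c->lock);
 *       *conf = c;
 *       return 0;
 *   }
 *
 *   static void
 *   noop_destruct(struct qos_conf *conf)
 *   {
 *       free(conf);
 *   }
 *
 *   static bool
 *   noop_is_equal(const struct qos_conf *conf OVS_UNUSED,
 *                 const struct smap *details OVS_UNUSED)
 *   {
 *       return true;
 *   }
 *
 *   static int
 *   noop_run(struct qos_conf *conf OVS_UNUSED,
 *            struct rte_mbuf **pkts OVS_UNUSED,
 *            int pkt_cnt, bool should_steal OVS_UNUSED)
 *   {
 *       return pkt_cnt;
 *   }
 *
 *   static const struct dpdk_qos_ops noop_ops = {
 *       .qos_name = "noop",
 *       .qos_construct = noop_construct,
 *       .qos_destruct = noop_destruct,
 *       .qos_is_equal = noop_is_equal,
 *       .qos_run = noop_run,
 *   };
 */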
288
289/* dpdk_qos_ops for each type of user space QoS implementation */
290static const struct dpdk_qos_ops egress_policer_ops;
291
292/*
293 * Array of dpdk_qos_ops, contains pointer to all supported QoS
294 * operations.
295 */
296static const struct dpdk_qos_ops *const qos_confs[] = {
297 &egress_policer_ops,
298 NULL
299};
300
c2adb102
IM
301static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;
302
8a9562d2 303/* Contains all 'struct dpdk_dev's. */
ca6ba700 304static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
55951e15 305 = OVS_LIST_INITIALIZER(&dpdk_list);
8a9562d2 306
c2adb102
IM
307static struct ovs_mutex dpdk_mp_mutex OVS_ACQ_AFTER(dpdk_mutex)
308 = OVS_MUTEX_INITIALIZER;
309
91fccdad 310/* Contains all 'struct dpdk_mp's. */
43307ad0
IS
311static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mp_mutex)
312 = OVS_LIST_INITIALIZER(&dpdk_mp_list);
91fccdad 313
91fccdad
KT
314struct dpdk_mp {
315 struct rte_mempool *mp;
43307ad0
IS
316 int mtu;
317 int socket_id;
318 int refcount;
91fccdad
KT
319 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex);
320 };
321
5a034064
AW
322/* There should be one 'struct dpdk_tx_queue' created for
323 * each cpu core. */
8a9562d2 324struct dpdk_tx_queue {
a0cb2d66
DDP
325 rte_spinlock_t tx_lock; /* Protects the members and the NIC queue
326 * from concurrent access. It is used only
327 * if the queue is shared among different
324c8374 328 * pmd threads (see 'concurrent_txq'). */
585a5bea
IM
329 int map; /* Mapping of configured vhost-user queues
330 * to the queues enabled by the guest. */
8a9562d2
PS
331};
332
95fb793a 333/* DPDK has no way to remove DPDK ring Ethernet devices, so we have to
 334 * keep them around once they've been created.
 335 */
336
ca6ba700 337static struct ovs_list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
55951e15 338 = OVS_LIST_INITIALIZER(&dpdk_ring_list);
95fb793a 339
340struct dpdk_ring {
341 /* For the client rings */
342 struct rte_ring *cring_tx;
343 struct rte_ring *cring_rx;
b83a2df1 344 unsigned int user_port_id; /* User given port no, parsed from port name */
bb37956a 345 dpdk_port_t eth_port_id; /* ethernet device port id */
ca6ba700 346 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
95fb793a 347};
348
9509913a
IS
349struct ingress_policer {
350 struct rte_meter_srtcm_params app_srtcm_params;
351 struct rte_meter_srtcm in_policer;
03f3f9c0 352 struct rte_meter_srtcm_profile in_prof;
9509913a
IS
353 rte_spinlock_t policer_lock;
354};
355
1a2bb118
SC
356enum dpdk_hw_ol_features {
357 NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
e10ca8b9 358 NETDEV_RX_HW_CRC_STRIP = 1 << 1,
03f3f9c0 359 NETDEV_RX_HW_SCATTER = 1 << 2
1a2bb118
SC
360};
361
b2e72a9c
IM
362/*
363 * In order to avoid confusion in variables names, following naming convention
364 * should be used, if possible:
365 *
366 * 'struct netdev' : 'netdev'
367 * 'struct netdev_dpdk' : 'dev'
368 * 'struct netdev_rxq' : 'rxq'
369 * 'struct netdev_rxq_dpdk' : 'rx'
370 *
371 * Example:
372 * struct netdev *netdev = netdev_from_name(name);
373 * struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
374 *
375 * Also, 'netdev' should be used instead of 'dev->up', where 'netdev' was
376 * already defined.
377 */
378
8a9562d2 379struct netdev_dpdk {
23d4d53f
BB
380 PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
381 dpdk_port_t port_id;
382
383 /* If true, device was attached by rte_eth_dev_attach(). */
384 bool attached;
606f6650
EC
385 /* If true, rte_eth_dev_start() was successfully called */
386 bool started;
23d4d53f
BB
387 struct eth_addr hwaddr;
388 int mtu;
389 int socket_id;
390 int buf_size;
391 int max_packet_len;
392 enum dpdk_dev_type type;
393 enum netdev_flags flags;
eaa43581 394 int link_reset_cnt;
bb9d2623
IM
395 union {
396 /* Device arguments for dpdk ports. */
397 char *devargs;
398 /* Identifier used to distinguish vhost devices from each other. */
399 char *vhost_id;
400 };
23d4d53f
BB
401 struct dpdk_tx_queue *tx_q;
402 struct rte_eth_link link;
23d4d53f
BB
403 );
404
405 PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
406 struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
43307ad0 407 struct dpdk_mp *dpdk_mp;
23d4d53f
BB
408
409 /* virtio identifier for vhost devices */
410 ovsrcu_index vid;
411
412 /* True if vHost device is 'up' and has been reconfigured at least once */
413 bool vhost_reconfigured;
414 /* 3 pad bytes here. */
415 );
416
23d4d53f
BB
417 PADDED_MEMBERS(CACHE_LINE_SIZE,
418 struct netdev up;
419 /* In dpdk_list. */
420 struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
421
422 /* QoS configuration and lock for the device */
423 OVSRCU_TYPE(struct qos_conf *) qos_conf;
424
425 /* Ingress Policer */
426 OVSRCU_TYPE(struct ingress_policer *) ingress_policer;
427 uint32_t policer_rate;
428 uint32_t policer_burst;
35c91567
DM
429
430 /* Array of vhost rxq states, see vring_state_changed. */
431 bool *vhost_rxq_enabled;
23d4d53f
BB
432 );
433
434 PADDED_MEMBERS(CACHE_LINE_SIZE,
435 struct netdev_stats stats;
436 /* Protects stats */
437 rte_spinlock_t stats_lock;
438 /* 44 pad bytes here. */
439 );
440
441 PADDED_MEMBERS(CACHE_LINE_SIZE,
442 /* The following properties cannot be changed when a device is running,
443 * so we remember the request and update them next time
444 * netdev_dpdk*_reconfigure() is called */
445 int requested_mtu;
446 int requested_n_txq;
447 int requested_n_rxq;
448 int requested_rxq_size;
449 int requested_txq_size;
450
451 /* Number of rx/tx descriptors for physical devices */
452 int rxq_size;
453 int txq_size;
454
455 /* Socket ID detected when vHost device is brought up */
456 int requested_socket_id;
457
458 /* Denotes whether vHost port is client/server mode */
459 uint64_t vhost_driver_flags;
460
461 /* DPDK-ETH Flow control */
462 struct rte_eth_fc_conf fc_conf;
463
464 /* DPDK-ETH hardware offload features,
465 * from the enum set 'dpdk_hw_ol_features' */
466 uint32_t hw_ol_features;
f8b64a61
RM
467
468 /* Properties for link state change detection mode.
469 * If lsc_interrupt_mode is set to false, poll mode is used,
470 * otherwise interrupt mode is used. */
471 bool requested_lsc_interrupt_mode;
472 bool lsc_interrupt_mode;
23d4d53f 473 );
971f4b39
MW
474
475 PADDED_MEMBERS(CACHE_LINE_SIZE,
476 /* Names of all XSTATS counters */
477 struct rte_eth_xstat_name *rte_xstats_names;
478 int rte_xstats_names_size;
479 int rte_xstats_ids_size;
480 uint64_t *rte_xstats_ids;
481 );
8a9562d2
PS
482};
483
484struct netdev_rxq_dpdk {
485 struct netdev_rxq up;
bb37956a 486 dpdk_port_t port_id;
8a9562d2
PS
487};
488
f3e7ec25
MW
489static void netdev_dpdk_destruct(struct netdev *netdev);
490static void netdev_dpdk_vhost_destruct(struct netdev *netdev);
8a9562d2 491
ac1a9bb9
IM
492static void netdev_dpdk_clear_xstats(struct netdev_dpdk *dev);
493
0a0f39df 494int netdev_dpdk_get_vid(const struct netdev_dpdk *dev);
58397e6c 495
9509913a
IS
496struct ingress_policer *
497netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);
498
8a9562d2
PS
499static bool
500is_dpdk_class(const struct netdev_class *class)
501{
f3e7ec25
MW
502 return class->destruct == netdev_dpdk_destruct
503 || class->destruct == netdev_dpdk_vhost_destruct;
8a9562d2
PS
504}
505
4be4d22c
MK
506/* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
507 * aligned at 1k or less. If a declared mbuf size is not a multiple of this
508 * value, insufficient buffers are allocated to accommodate the packet in its
509 * entirety. Furthermore, certain drivers need to ensure that there is also
510 * sufficient space in the Rx buffer to accommodate two VLAN tags (for QinQ
511 * frames). If the RX buffer is too small, then the driver enables scatter RX
58be5c0e
MK
512 * behaviour, which reduces performance. To prevent this, use a buffer size
513 * that is closest to 'mtu', but which satisfies the aforementioned criteria.
4be4d22c
MK
514 */
515static uint32_t
516dpdk_buf_size(int mtu)
517{
a32bab26
TL
518 return ROUND_UP(MTU_TO_MAX_FRAME_LEN(mtu), NETDEV_DPDK_MBUF_ALIGN)
519 + RTE_PKTMBUF_HEADROOM;
4be4d22c
MK
520}
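/*
 * Illustrative example (editor's addition, not part of the upstream file):
 * for the default MTU of 1500, and assuming DPDK's default
 * RTE_PKTMBUF_HEADROOM of 128 bytes:
 *
 *   MTU_TO_MAX_FRAME_LEN(1500)             = 1526
 *   ROUND_UP(1526, NETDEV_DPDK_MBUF_ALIGN) = 2048
 *   dpdk_buf_size(1500)                    = 2048 + 128 = 2176
 */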
521
eff23640
DDP
522/* Allocates an area of 'sz' bytes from DPDK. The memory is zeroed.
523 *
524 * Unlike xmalloc(), this function can return NULL on failure. */
8a9562d2
PS
525static void *
526dpdk_rte_mzalloc(size_t sz)
527{
eff23640 528 return rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);
8a9562d2
PS
529}
530
531void
e14deea0 532free_dpdk_buf(struct dp_packet *p)
8a9562d2 533{
db73f716 534 struct rte_mbuf *pkt = (struct rte_mbuf *) p;
8a9562d2 535
b00b4a81 536 rte_pktmbuf_free(pkt);
8a9562d2
PS
537}
538
b3cd9f9d 539static void
401b70d6 540ovs_rte_pktmbuf_init(struct rte_mempool *mp OVS_UNUSED,
b3cd9f9d 541 void *opaque_arg OVS_UNUSED,
2391135c 542 void *_p,
b3cd9f9d
PS
543 unsigned i OVS_UNUSED)
544{
2391135c 545 struct rte_mbuf *pkt = _p;
b3cd9f9d 546
3aaa6201 547 dp_packet_init_dpdk((struct dp_packet *) pkt);
b3cd9f9d
PS
548}
549
91fccdad
KT
550static int
551dpdk_mp_full(const struct rte_mempool *mp) OVS_REQUIRES(dpdk_mp_mutex)
552{
1f84a2d5
KT
553 /* At this point we want to know if all the mbufs are back
554 * in the mempool. rte_mempool_full() is not atomic but it's
555 * the best available and as we are no longer requesting mbufs
556 * from the mempool, it means mbufs will not move from
557 * 'mempool ring' --> 'mempool cache'. In rte_mempool_full()
558 * the ring is counted before caches, so we won't get false
559 * positives in this use case and we handle false negatives.
560 *
561 * If future implementations of rte_mempool_full() were to change
562 * it could be possible for a false positive. Even that would
563 * likely be ok, as there are additional checks during mempool
564 * freeing but it would make things racy.
91fccdad 565 */
1f84a2d5 566 return rte_mempool_full(mp);
91fccdad
KT
567}
568
569/* Free unused mempools. */
570static void
43307ad0 571dpdk_mp_sweep(void) OVS_REQUIRES(dpdk_mp_mutex)
91fccdad
KT
572{
573 struct dpdk_mp *dmp, *next;
574
43307ad0
IS
575 LIST_FOR_EACH_SAFE (dmp, next, list_node, &dpdk_mp_list) {
576 if (!dmp->refcount && dpdk_mp_full(dmp->mp)) {
91fccdad
KT
577 VLOG_DBG("Freeing mempool \"%s\"", dmp->mp->name);
578 ovs_list_remove(&dmp->list_node);
579 rte_mempool_free(dmp->mp);
580 rte_free(dmp);
581 }
582 }
91fccdad
KT
583}
584
43307ad0
IS
585/* Calculating the required number of mbufs differs depending on the
586 * mempool model being used. Check if per port memory is in use before
587 * calculating.
588 */
589static uint32_t
590dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
91fccdad 591{
43307ad0 592 uint32_t n_mbufs;
91fccdad 593
43307ad0
IS
594 if (!per_port_mp) {
595 /* Shared memory is being used.
596 * XXX: this is a really rough method of provisioning memory.
597 * It's impossible to determine what the exact memory requirements are
598 * when the number of ports and rxqs that utilize a particular mempool
599 * can change dynamically at runtime. For now, use this rough
600 * heuristic.
601 */
602 if (mtu >= ETHER_MTU) {
603 n_mbufs = MAX_NB_MBUF;
604 } else {
605 n_mbufs = MIN_NB_MBUF;
91fccdad 606 }
43307ad0
IS
607 } else {
608 /* Per port memory is being used.
609 * XXX: rough estimation of number of mbufs required for this port:
610 * <packets required to fill the device rxqs>
611 * + <packets that could be stuck on other ports txqs>
612 * + <packets in the pmd threads>
613 * + <additional memory for corner cases>
614 */
615 n_mbufs = dev->requested_n_rxq * dev->requested_rxq_size
616 + dev->requested_n_txq * dev->requested_txq_size
617 + MIN(RTE_MAX_LCORE, dev->requested_n_rxq) * NETDEV_MAX_BURST
618 + MIN_NB_MBUF;
91fccdad 619 }
43307ad0
IS
620
621 return n_mbufs;
91fccdad
KT
622}
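/*
 * Illustrative example (editor's addition, not part of the upstream file):
 * for the per-port case, assuming 2 rxqs and 2 txqs of the default size of
 * 2048 descriptors each, and OVS's NETDEV_MAX_BURST of 32 packets:
 *
 *   n_mbufs = 2 * 2048                        (rx descriptors)
 *           + 2 * 2048                        (tx descriptors)
 *           + MIN(RTE_MAX_LCORE, 2) * 32      (in-flight pmd bursts)
 *           + 16384                           (MIN_NB_MBUF slack)
 *           = 24640 mbufs
 */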
623
43307ad0
IS
624static struct dpdk_mp *
625dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
8a9562d2 626{
24e78f93
IM
627 char mp_name[RTE_MEMPOOL_NAMESIZE];
628 const char *netdev_name = netdev_get_name(&dev->up);
629 int socket_id = dev->requested_socket_id;
dfaf00e8
MK
630 uint32_t n_mbufs = 0;
631 uint32_t mbuf_size = 0;
632 uint32_t aligned_mbuf_size = 0;
633 uint32_t mbuf_priv_data_len = 0;
634 uint32_t pkt_size = 0;
24e78f93 635 uint32_t hash = hash_string(netdev_name, 0);
43307ad0
IS
636 struct dpdk_mp *dmp = NULL;
637 int ret;
638
639 dmp = dpdk_rte_mzalloc(sizeof *dmp);
640 if (!dmp) {
641 return NULL;
642 }
643 dmp->socket_id = socket_id;
644 dmp->mtu = mtu;
645 dmp->refcount = 1;
646
dfaf00e8
MK
647 /* Get the size of each mbuf, based on the MTU */
648 mbuf_size = MTU_TO_FRAME_LEN(mtu);
649
43307ad0 650 n_mbufs = dpdk_calculate_mbufs(dev, mtu, per_port_mp);
d555d9bd 651
da79ce2b 652 do {
24e78f93 653 /* Full DPDK memory pool name must be unique and cannot be
43307ad0
IS
654 * longer than RTE_MEMPOOL_NAMESIZE. Note that for the shared
655 * mempool case this can result in one device using a mempool
656 * which references a different device in its name. However, as
657 * mempool names are hashed, the device name will not be readable
658 * so this is not an issue for tasks such as debugging.
659 */
660 ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE,
dfaf00e8
MK
661 "ovs%08x%02d%05d%07u",
662 hash, socket_id, mtu, n_mbufs);
24e78f93
IM
663 if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) {
664 VLOG_DBG("snprintf returned %d. "
665 "Failed to generate a mempool name for \"%s\". "
666 "Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.",
667 ret, netdev_name, hash, socket_id, mtu, n_mbufs);
668 break;
65056fd7 669 }
95fb793a 670
dfaf00e8
MK
671 VLOG_DBG("Port %s: Requesting a mempool of %u mbufs of size %u "
672 "on socket %d for %d Rx and %d Tx queues, "
673 "cache line size of %u",
674 netdev_name, n_mbufs, mbuf_size, socket_id,
675 dev->requested_n_rxq, dev->requested_n_txq,
676 RTE_CACHE_LINE_SIZE);
677
a32bab26
TL
678 /* The size of the mbuf's private area (i.e. the area that holds OVS's
 679 * dp_packet data). */
dfaf00e8
MK
680 mbuf_priv_data_len = sizeof(struct dp_packet) -
681 sizeof(struct rte_mbuf);
682 /* The size of the entire dp_packet. */
683 pkt_size = sizeof(struct dp_packet) + mbuf_size;
684 /* mbuf size, rounded up to cacheline size. */
685 aligned_mbuf_size = ROUND_UP(pkt_size, RTE_CACHE_LINE_SIZE);
686 /* If there is a size discrepancy, add padding to mbuf_priv_data_len.
687 * This maintains mbuf size cache alignment, while also honoring RX
688 * buffer alignment in the data portion of the mbuf. If this adjustment
689 * is not made, there is a possibility later on that for an element of
690 * the mempool, buf, buf->data_len < (buf->buf_len - buf->data_off).
691 * This is problematic in the case of multi-segment mbufs, particularly
692 * when an mbuf segment needs to be resized (when [push|pop]ping a VLAN
 693 * header, for example).
694 */
695 mbuf_priv_data_len += (aligned_mbuf_size - pkt_size);
696
697 dmp->mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ,
698 mbuf_priv_data_len,
699 mbuf_size,
43307ad0 700 socket_id);
24e78f93 701
43307ad0 702 if (dmp->mp) {
24e78f93
IM
703 VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
704 mp_name, n_mbufs);
837c1761 705 /* rte_pktmbuf_pool_create has done some initialization of the
43307ad0
IS
706 * rte_mbuf part of each dp_packet, while ovs_rte_pktmbuf_init
707 * initializes some OVS specific fields of dp_packet.
708 */
709 rte_mempool_obj_iter(dmp->mp, ovs_rte_pktmbuf_init, NULL);
710 return dmp;
d555d9bd
RW
711 } else if (rte_errno == EEXIST) {
712 /* A mempool with the same name already exists. We just
713 * retrieve its pointer to be returned to the caller. */
43307ad0 714 dmp->mp = rte_mempool_lookup(mp_name);
d555d9bd
RW
715 /* As the mempool create returned EEXIST we can expect the
716 * lookup has returned a valid pointer. If for some reason
717 * that's not the case we keep track of it. */
24e78f93 718 VLOG_DBG("A mempool with name \"%s\" already exists at %p.",
43307ad0
IS
719 mp_name, dmp->mp);
720 return dmp;
d555d9bd 721 } else {
43307ad0
IS
722 VLOG_DBG("Failed to create mempool \"%s\" with a request of "
723 "%u mbufs, retrying with %u mbufs",
724 mp_name, n_mbufs, n_mbufs / 2);
0c6f39e5 725 }
43307ad0 726 } while (!dmp->mp && rte_errno == ENOMEM && (n_mbufs /= 2) >= MIN_NB_MBUF);
2ae3d542 727
43307ad0
IS
728 VLOG_ERR("Failed to create mempool \"%s\" with a request of %u mbufs",
729 mp_name, n_mbufs);
730
731 rte_free(dmp);
732 return NULL;
8a9562d2
PS
733}
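/*
 * Illustrative example (editor's addition, not part of the upstream file):
 * the "ovs%08x%02d%05d%07u" format above turns, e.g., a name hash of
 * 0xdeadbeef, socket 0, MTU 1500 and 262144 mbufs into the mempool name
 * "ovsdeadbeef00015000262144".
 */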
734
43307ad0
IS
735static struct dpdk_mp *
736dpdk_mp_get(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
8a9562d2 737{
43307ad0
IS
738 struct dpdk_mp *dmp, *next;
739 bool reuse = false;
8a9562d2 740
c2adb102 741 ovs_mutex_lock(&dpdk_mp_mutex);
43307ad0
IS
742 /* Check if shared memory is being used; if so, check existing mempools
743 * to see if reuse is possible. */
744 if (!per_port_mp) {
745 LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
746 if (dmp->socket_id == dev->requested_socket_id
747 && dmp->mtu == mtu) {
748 VLOG_DBG("Reusing mempool \"%s\"", dmp->mp->name);
749 dmp->refcount++;
750 reuse = true;
751 break;
752 }
753 }
754 }
755 /* Sweep mempools after reuse or before create. */
756 dpdk_mp_sweep();
91fccdad 757
43307ad0
IS
758 if (!reuse) {
759 dmp = dpdk_mp_create(dev, mtu, per_port_mp);
91fccdad 760 if (dmp) {
43307ad0
IS
761 /* Shared memory will hit the reuse case above so will not
762 * request a mempool that already exists but we need to check
763 * for the EEXIST case in the per-port memory case. Compare the
764 * mempool returned by dmp to each entry in dpdk_mp_list. If a
765 * match is found, free dmp as a new entry is not required, set
766 * dmp to point to the existing entry and increment the refcount
767 * to avoid being freed at a later stage.
768 */
769 if (per_port_mp && rte_errno == EEXIST) {
770 LIST_FOR_EACH (next, list_node, &dpdk_mp_list) {
771 if (dmp->mp == next->mp) {
772 rte_free(dmp);
773 dmp = next;
774 dmp->refcount++;
775 }
776 }
777 } else {
778 ovs_list_push_back(&dpdk_mp_list, &dmp->list_node);
779 }
91fccdad
KT
780 }
781 }
43307ad0 782
43307ad0
IS
783 ovs_mutex_unlock(&dpdk_mp_mutex);
784
785 return dmp;
786}
787
788/* Decrement reference to a mempool. */
789static void
790dpdk_mp_put(struct dpdk_mp *dmp)
791{
792 if (!dmp) {
793 return;
794 }
795
796 ovs_mutex_lock(&dpdk_mp_mutex);
797 ovs_assert(dmp->refcount);
798 dmp->refcount--;
c2adb102 799 ovs_mutex_unlock(&dpdk_mp_mutex);
8a9562d2
PS
800}
801
43307ad0
IS
802/* Depending on the memory model being used this function tries to
803 * identify and reuse an existing mempool or tries to allocate a new
804 * mempool on requested_socket_id with mbuf size corresponding to the
805 * requested_mtu. On success, a new configuration will be applied.
0072e931
MK
806 * On error, device will be left unchanged. */
807static int
808netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
0072e931
MK
809 OVS_REQUIRES(dev->mutex)
810{
811 uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
43307ad0 812 struct dpdk_mp *dmp;
24e78f93 813 int ret = 0;
43307ad0 814 bool per_port_mp = dpdk_per_port_memory();
0072e931 815
43307ad0
IS
816 /* With shared memory we do not need to configure a mempool if the MTU
817 * and socket ID have not changed, the previous configuration is still
818 * valid, so return 0. */
819 if (!per_port_mp && dev->mtu == dev->requested_mtu
820 && dev->socket_id == dev->requested_socket_id) {
821 return ret;
822 }
91fccdad 823
43307ad0
IS
824 dmp = dpdk_mp_get(dev, FRAME_LEN_TO_MTU(buf_size), per_port_mp);
825 if (!dmp) {
c67e46c0
MK
826 VLOG_ERR("Failed to create memory pool for netdev "
827 "%s, with MTU %d on socket %d: %s\n",
828 dev->up.name, dev->requested_mtu, dev->requested_socket_id,
829 rte_strerror(rte_errno));
24e78f93 830 ret = rte_errno;
0072e931 831 } else {
43307ad0
IS
832 /* Check for any pre-existing dpdk_mp for the device before accessing
833 * the associated mempool.
834 */
835 if (dev->dpdk_mp != NULL) {
836 /* A new MTU was requested; decrement the reference count for the
 837 * device's current dpdk_mp. This is required even if a pointer to
838 * same dpdk_mp is returned by dpdk_mp_get. The refcount for dmp
839 * has already been incremented by dpdk_mp_get at this stage so it
840 * must be decremented to keep an accurate refcount for the
841 * dpdk_mp.
842 */
843 dpdk_mp_put(dev->dpdk_mp);
844 }
845 dev->dpdk_mp = dmp;
0072e931
MK
846 dev->mtu = dev->requested_mtu;
847 dev->socket_id = dev->requested_socket_id;
848 dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
849 }
850
24e78f93 851 return ret;
0072e931
MK
852}
853
8a9562d2
PS
854static void
855check_link_status(struct netdev_dpdk *dev)
856{
857 struct rte_eth_link link;
858
859 rte_eth_link_get_nowait(dev->port_id, &link);
860
861 if (dev->link.link_status != link.link_status) {
3e912ffc 862 netdev_change_seq_changed(&dev->up);
8a9562d2
PS
863
864 dev->link_reset_cnt++;
865 dev->link = link;
866 if (dev->link.link_status) {
fa9f4eeb
IM
867 VLOG_DBG_RL(&rl,
868 "Port "DPDK_PORT_ID_FMT" Link Up - speed %u Mbps - %s",
58be5c0e 869 dev->port_id, (unsigned) dev->link.link_speed,
fa9f4eeb
IM
870 (dev->link.link_duplex == ETH_LINK_FULL_DUPLEX)
871 ? "full-duplex" : "half-duplex");
8a9562d2 872 } else {
fa9f4eeb
IM
873 VLOG_DBG_RL(&rl, "Port "DPDK_PORT_ID_FMT" Link Down",
874 dev->port_id);
8a9562d2
PS
875 }
876 }
877}
878
879static void *
880dpdk_watchdog(void *dummy OVS_UNUSED)
881{
882 struct netdev_dpdk *dev;
883
884 pthread_detach(pthread_self());
885
886 for (;;) {
887 ovs_mutex_lock(&dpdk_mutex);
888 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
889 ovs_mutex_lock(&dev->mutex);
1f5b157e
IM
890 if (dev->type == DPDK_DEV_ETH) {
891 check_link_status(dev);
892 }
8a9562d2
PS
893 ovs_mutex_unlock(&dev->mutex);
894 }
895 ovs_mutex_unlock(&dpdk_mutex);
896 xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
897 }
898
899 return NULL;
900}
901
b98d7669 902static int
f8b64a61 903dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
b98d7669
DDP
904{
905 int diag = 0;
906 int i;
0072e931 907 struct rte_eth_conf conf = port_conf;
65a87968 908 struct rte_eth_dev_info info;
4dd16ca0 909 uint16_t conf_mtu;
65a87968 910
03f3f9c0
OM
911 rte_eth_dev_info_get(dev->port_id, &info);
912
65a87968 913 /* As of DPDK 17.11.1 a few PMDs require explicitly enabling
03f3f9c0
OM
914 * scatter to support jumbo RX.
915 * Setting scatter for the device is done after checking for
916 * scatter support in the device capabilities. */
0072e931 917 if (dev->mtu > ETHER_MTU) {
03f3f9c0
OM
918 if (dev->hw_ol_features & NETDEV_RX_HW_SCATTER) {
919 conf.rxmode.offloads |= DEV_RX_OFFLOAD_SCATTER;
65a87968 920 }
0072e931 921 }
67fe6d63 922
f8b64a61 923 conf.intr_conf.lsc = dev->lsc_interrupt_mode;
e10ca8b9 924
03f3f9c0
OM
925 if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
926 conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
927 }
928
929 if (!(dev->hw_ol_features & NETDEV_RX_HW_CRC_STRIP)
930 && info.rx_offload_capa & DEV_RX_OFFLOAD_KEEP_CRC) {
931 conf.rxmode.offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
e10ca8b9
MW
932 }
933
03f3f9c0
OM
934 /* Limit configured rss hash functions to only those supported
935 * by the eth device. */
936 conf.rx_adv_conf.rss_conf.rss_hf &= info.flow_type_rss_offloads;
937
b98d7669
DDP
938 /* A device may report more queues than it makes available (this has
939 * been observed for Intel xl710, which reserves some of them for
940 * SRIOV): rte_eth_*_queue_setup will fail if a queue is not
941 * available. When this happens we can retry the configuration
942 * and request fewer queues. */
943 while (n_rxq && n_txq) {
944 if (diag) {
945 VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
946 }
947
0072e931 948 diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
b98d7669 949 if (diag) {
0072e931
MK
950 VLOG_WARN("Interface %s eth_dev setup error %s\n",
951 dev->up.name, rte_strerror(-diag));
b98d7669
DDP
952 break;
953 }
954
67fe6d63
MK
955 diag = rte_eth_dev_set_mtu(dev->port_id, dev->mtu);
956 if (diag) {
4dd16ca0
IS
957 /* A device may not support rte_eth_dev_set_mtu; in this case,
 958 * log a warning to the user and include the device's configured
 959 * MTU value that will be used instead. */
960 if (-ENOTSUP == diag) {
961 rte_eth_dev_get_mtu(dev->port_id, &conf_mtu);
962 VLOG_WARN("Interface %s does not support MTU configuration, "
963 "max packet size supported is %"PRIu16".",
964 dev->up.name, conf_mtu);
965 } else {
966 VLOG_ERR("Interface %s MTU (%d) setup error: %s",
967 dev->up.name, dev->mtu, rte_strerror(-diag));
968 break;
969 }
67fe6d63
MK
970 }
971
b98d7669 972 for (i = 0; i < n_txq; i++) {
b685696b 973 diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
b98d7669
DDP
974 dev->socket_id, NULL);
975 if (diag) {
1dfebee9 976 VLOG_INFO("Interface %s unable to setup txq(%d): %s",
b98d7669
DDP
977 dev->up.name, i, rte_strerror(-diag));
978 break;
979 }
980 }
981
982 if (i != n_txq) {
983 /* Retry with less tx queues */
984 n_txq = i;
985 continue;
986 }
987
988 for (i = 0; i < n_rxq; i++) {
b685696b 989 diag = rte_eth_rx_queue_setup(dev->port_id, i, dev->rxq_size,
43307ad0
IS
990 dev->socket_id, NULL,
991 dev->dpdk_mp->mp);
b98d7669 992 if (diag) {
1dfebee9 993 VLOG_INFO("Interface %s unable to setup rxq(%d): %s",
b98d7669
DDP
994 dev->up.name, i, rte_strerror(-diag));
995 break;
996 }
997 }
998
999 if (i != n_rxq) {
1000 /* Retry with less rx queues */
1001 n_rxq = i;
1002 continue;
1003 }
1004
1005 dev->up.n_rxq = n_rxq;
81acebda 1006 dev->up.n_txq = n_txq;
b98d7669
DDP
1007
1008 return 0;
1009 }
1010
1011 return diag;
1012}
1013
9fd39370
SC
1014static void
1015dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex)
1016{
1017 if (rte_eth_dev_flow_ctrl_set(dev->port_id, &dev->fc_conf)) {
fa9f4eeb 1018 VLOG_WARN("Failed to enable flow control on device "DPDK_PORT_ID_FMT,
bb37956a 1019 dev->port_id);
9fd39370
SC
1020 }
1021}
b98d7669 1022
8a9562d2 1023static int
c2adb102
IM
1024dpdk_eth_dev_init(struct netdev_dpdk *dev)
1025 OVS_REQUIRES(dev->mutex)
8a9562d2
PS
1026{
1027 struct rte_pktmbuf_pool_private *mbp_priv;
a0cb2d66 1028 struct rte_eth_dev_info info;
8a9562d2
PS
1029 struct ether_addr eth_addr;
1030 int diag;
b98d7669 1031 int n_rxq, n_txq;
d4f5282c
KT
1032 uint32_t rx_chksm_offload_capa = DEV_RX_OFFLOAD_UDP_CKSUM |
1033 DEV_RX_OFFLOAD_TCP_CKSUM |
1034 DEV_RX_OFFLOAD_IPV4_CKSUM;
8a9562d2 1035
a0cb2d66 1036 rte_eth_dev_info_get(dev->port_id, &info);
a0cb2d66 1037
e10ca8b9
MW
1038 if (strstr(info.driver_name, "vf") != NULL) {
1039 VLOG_INFO("Virtual function detected, HW_CRC_STRIP will be enabled");
1040 dev->hw_ol_features |= NETDEV_RX_HW_CRC_STRIP;
1041 } else {
1042 dev->hw_ol_features &= ~NETDEV_RX_HW_CRC_STRIP;
1043 }
1044
d4f5282c
KT
1045 if ((info.rx_offload_capa & rx_chksm_offload_capa) !=
1046 rx_chksm_offload_capa) {
fa9f4eeb
IM
1047 VLOG_WARN("Rx checksum offload is not supported on port "
1048 DPDK_PORT_ID_FMT, dev->port_id);
d4f5282c
KT
1049 dev->hw_ol_features &= ~NETDEV_RX_CHECKSUM_OFFLOAD;
1050 } else {
1051 dev->hw_ol_features |= NETDEV_RX_CHECKSUM_OFFLOAD;
1052 }
1053
03f3f9c0
OM
1054 if (info.rx_offload_capa & DEV_RX_OFFLOAD_SCATTER) {
1055 dev->hw_ol_features |= NETDEV_RX_HW_SCATTER;
1056 } else {
1057 /* Do not warn on lack of scatter support */
1058 dev->hw_ol_features &= ~NETDEV_RX_HW_SCATTER;
1059 }
1060
b98d7669
DDP
1061 n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
1062 n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
1063
f8b64a61 1064 diag = dpdk_eth_dev_port_config(dev, n_rxq, n_txq);
8a9562d2 1065 if (diag) {
f8b64a61
RM
1066 VLOG_ERR("Interface %s(rxq:%d txq:%d lsc interrupt mode:%s) "
1067 "configure error: %s",
1068 dev->up.name, n_rxq, n_txq,
1069 dev->lsc_interrupt_mode ? "true" : "false",
1070 rte_strerror(-diag));
95fb793a 1071 return -diag;
8a9562d2
PS
1072 }
1073
8a9562d2
PS
1074 diag = rte_eth_dev_start(dev->port_id);
1075 if (diag) {
b98d7669
DDP
1076 VLOG_ERR("Interface %s start error: %s", dev->up.name,
1077 rte_strerror(-diag));
95fb793a 1078 return -diag;
8a9562d2 1079 }
606f6650 1080 dev->started = true;
8a9562d2
PS
1081
1082 rte_eth_promiscuous_enable(dev->port_id);
1083 rte_eth_allmulticast_enable(dev->port_id);
1084
1085 memset(&eth_addr, 0x0, sizeof(eth_addr));
1086 rte_eth_macaddr_get(dev->port_id, &eth_addr);
fa9f4eeb
IM
1087 VLOG_INFO_RL(&rl, "Port "DPDK_PORT_ID_FMT": "ETH_ADDR_FMT,
1088 dev->port_id, ETH_ADDR_BYTES_ARGS(eth_addr.addr_bytes));
8a9562d2 1089
ca92d173 1090 memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
8a9562d2
PS
1091 rte_eth_link_get_nowait(dev->port_id, &dev->link);
1092
43307ad0 1093 mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
8a9562d2 1094 dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;
8a9562d2
PS
1095 return 0;
1096}
1097
1098static struct netdev_dpdk *
1099netdev_dpdk_cast(const struct netdev *netdev)
1100{
1101 return CONTAINER_OF(netdev, struct netdev_dpdk, up);
1102}
1103
1104static struct netdev *
1105netdev_dpdk_alloc(void)
1106{
bab69409
AC
1107 struct netdev_dpdk *dev;
1108
65e19e70
DDP
1109 dev = dpdk_rte_mzalloc(sizeof *dev);
1110 if (dev) {
1111 return &dev->up;
bab69409 1112 }
65e19e70 1113
bab69409 1114 return NULL;
8a9562d2
PS
1115}
1116
eff23640
DDP
1117static struct dpdk_tx_queue *
1118netdev_dpdk_alloc_txq(unsigned int n_txqs)
5a034064 1119{
eff23640 1120 struct dpdk_tx_queue *txqs;
bd5131ba 1121 unsigned i;
5a034064 1122
eff23640
DDP
1123 txqs = dpdk_rte_mzalloc(n_txqs * sizeof *txqs);
1124 if (txqs) {
1125 for (i = 0; i < n_txqs; i++) {
1126 /* Initialize map for vhost devices. */
1127 txqs[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
1128 rte_spinlock_init(&txqs[i].tx_lock);
1129 }
5a034064 1130 }
eff23640
DDP
1131
1132 return txqs;
5a034064
AW
1133}
1134
8a9562d2 1135static int
bb37956a 1136common_construct(struct netdev *netdev, dpdk_port_t port_no,
1ce30dfd 1137 enum dpdk_dev_type type, int socket_id)
5a034064 1138 OVS_REQUIRES(dpdk_mutex)
8a9562d2 1139{
d46285a2 1140 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 1141
d46285a2 1142 ovs_mutex_init(&dev->mutex);
8a9562d2 1143
d46285a2 1144 rte_spinlock_init(&dev->stats_lock);
45d947c4 1145
1b7a04e0
AW
1146 /* If the 'sid' is negative, it means that the kernel failed
 1147 * to obtain the PCI NUMA info. In that situation, always
1148 * use 'SOCKET0'. */
1ce30dfd 1149 dev->socket_id = socket_id < 0 ? SOCKET0 : socket_id;
db8f13b0 1150 dev->requested_socket_id = dev->socket_id;
d46285a2
DDP
1151 dev->port_id = port_no;
1152 dev->type = type;
1153 dev->flags = 0;
7f381c2e 1154 dev->requested_mtu = ETHER_MTU;
d46285a2 1155 dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
f8b64a61 1156 dev->requested_lsc_interrupt_mode = 0;
0a0f39df
CL
1157 ovsrcu_index_init(&dev->vid, -1);
1158 dev->vhost_reconfigured = false;
5dcde09c 1159 dev->attached = false;
8a9562d2 1160
78bd47cf 1161 ovsrcu_init(&dev->qos_conf, NULL);
0bf765f7 1162
9509913a
IS
1163 ovsrcu_init(&dev->ingress_policer, NULL);
1164 dev->policer_rate = 0;
1165 dev->policer_burst = 0;
1166
7f381c2e
DDP
1167 netdev->n_rxq = 0;
1168 netdev->n_txq = 0;
1169 dev->requested_n_rxq = NR_QUEUE;
1170 dev->requested_n_txq = NR_QUEUE;
1171 dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
1172 dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;
58397e6c 1173
9fd39370
SC
1174 /* Initialize the flow control configuration to zeros. */
1175 memset(&dev->fc_conf, 0, sizeof dev->fc_conf);
1a2bb118
SC
1176
1177 /* Initialize the hardware offload flags to 0. */
1178 dev->hw_ol_features = 0;
3b1fb077
DDP
1179
1180 dev->flags = NETDEV_UP | NETDEV_PROMISC;
1181
d46285a2 1182 ovs_list_push_back(&dpdk_list, &dev->list_node);
8a9562d2 1183
7f381c2e
DDP
1184 netdev_request_reconfigure(netdev);
1185
971f4b39
MW
1186 dev->rte_xstats_names = NULL;
1187 dev->rte_xstats_names_size = 0;
1188
1189 dev->rte_xstats_ids = NULL;
1190 dev->rte_xstats_ids_size = 0;
1191
1ce30dfd 1192 return 0;
95fb793a 1193}
1194
b83a2df1
MV
1195/* dev_name must be the prefix followed by a positive decimal number.
1196 * (no leading + or - signs are allowed) */
95fb793a 1197static int
1198dpdk_dev_parse_name(const char dev_name[], const char prefix[],
1199 unsigned int *port_no)
1200{
1201 const char *cport;
1202
1203 if (strncmp(dev_name, prefix, strlen(prefix))) {
1204 return ENODEV;
1205 }
1206
1207 cport = dev_name + strlen(prefix);
b83a2df1
MV
1208
1209 if (str_to_uint(cport, 10, port_no)) {
1210 return 0;
1211 } else {
1212 return ENODEV;
1213 }
95fb793a 1214}
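/*
 * Illustrative example (editor's addition, not part of the upstream file),
 * assuming the "dpdkr" prefix used for DPDK ring ports:
 *
 *   unsigned int port_no;
 *
 *   dpdk_dev_parse_name("dpdkr7", "dpdkr", &port_no);   => 0, port_no == 7
 *   dpdk_dev_parse_name("dpdkr-1", "dpdkr", &port_no);  => ENODEV
 *   dpdk_dev_parse_name("eth0", "dpdkr", &port_no);     => ENODEV
 */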
1215
40e940e4
OM
1216/* Get the number of OVS interfaces which have the same DPDK
1217 * rte device (e.g. same pci bus address).
1218 * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
1219 */
1220static int
1221netdev_dpdk_get_num_ports(struct rte_device *device)
1222 OVS_REQUIRES(dpdk_mutex)
1223{
1224 struct netdev_dpdk *dev;
1225 int count = 0;
1226
1227 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
1228 if (rte_eth_devices[dev->port_id].device == device
1229 && rte_eth_devices[dev->port_id].state != RTE_ETH_DEV_UNUSED) {
1230 count++;
1231 }
1232 }
1233 return count;
1234}
1235
1ce30dfd
DDP
1236static int
1237vhost_common_construct(struct netdev *netdev)
1238 OVS_REQUIRES(dpdk_mutex)
1239{
1240 int socket_id = rte_lcore_to_socket_id(rte_get_master_lcore());
1241 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1242
35c91567
DM
1243 dev->vhost_rxq_enabled = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM *
1244 sizeof *dev->vhost_rxq_enabled);
1245 if (!dev->vhost_rxq_enabled) {
1246 return ENOMEM;
1247 }
1ce30dfd
DDP
1248 dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
1249 if (!dev->tx_q) {
35c91567 1250 rte_free(dev->vhost_rxq_enabled);
1ce30dfd
DDP
1251 return ENOMEM;
1252 }
1253
bb37956a
IM
1254 return common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
1255 DPDK_DEV_VHOST, socket_id);
1ce30dfd
DDP
1256}
1257
7d1ced01 1258static int
53f50d24 1259netdev_dpdk_vhost_construct(struct netdev *netdev)
7d1ced01 1260{
d46285a2
DDP
1261 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1262 const char *name = netdev->name;
7d1ced01 1263 int err;
a0cb2d66 1264
1af27e8a
DDP
1265 /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
1266 * the file system. '/' or '\' would traverse directories, so they're not
1267 * acceptable in 'name'. */
1268 if (strchr(name, '/') || strchr(name, '\\')) {
1269 VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
1270 "A valid name must not include '/' or '\\'",
1271 name);
1272 return EINVAL;
1273 }
1274
7d1ced01
CL
1275 ovs_mutex_lock(&dpdk_mutex);
1276 /* Take the name of the vhost-user port and append it to the location where
2d24d165 1277 * the socket is to be created, then register the socket.
7d1ced01 1278 */
bb9d2623 1279 dev->vhost_id = xasprintf("%s/%s", dpdk_get_vhost_sock_dir(), name);
1af27e8a 1280
2d24d165
CL
1281 dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
1282 err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags);
7d1ced01
CL
1283 if (err) {
1284 VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
2d24d165 1285 dev->vhost_id);
f3e7ec25 1286 goto out;
e5c0f5a4 1287 } else {
2d24d165
CL
1288 fatal_signal_add_file_to_unlink(dev->vhost_id);
1289 VLOG_INFO("Socket %s created for vhost-user port %s\n",
1290 dev->vhost_id, name);
1291 }
f3e7ec25
MW
1292
1293 err = rte_vhost_driver_callback_register(dev->vhost_id,
1294 &virtio_net_device_ops);
1295 if (err) {
1296 VLOG_ERR("rte_vhost_driver_callback_register failed for vhost user "
1297 "port: %s\n", name);
1298 goto out;
1299 }
1300
1301 err = rte_vhost_driver_disable_features(dev->vhost_id,
1302 1ULL << VIRTIO_NET_F_HOST_TSO4
1303 | 1ULL << VIRTIO_NET_F_HOST_TSO6
1304 | 1ULL << VIRTIO_NET_F_CSUM);
1305 if (err) {
1306 VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
1307 "port: %s\n", name);
1308 goto out;
1309 }
1310
1311 err = rte_vhost_driver_start(dev->vhost_id);
1312 if (err) {
1313 VLOG_ERR("rte_vhost_driver_start failed for vhost user "
1314 "port: %s\n", name);
1315 goto out;
1316 }
1317
1ce30dfd 1318 err = vhost_common_construct(netdev);
f3e7ec25
MW
1319 if (err) {
1320 VLOG_ERR("vhost_common_construct failed for vhost user "
1321 "port: %s\n", name);
1322 }
2d24d165 1323
f3e7ec25 1324out:
bb9d2623
IM
1325 if (err) {
1326 free(dev->vhost_id);
1327 dev->vhost_id = NULL;
1328 }
1329
2d24d165 1330 ovs_mutex_unlock(&dpdk_mutex);
28ca969e
AC
1331 VLOG_WARN_ONCE("dpdkvhostuser ports are considered deprecated; "
1332 "please migrate to dpdkvhostuserclient ports.");
2d24d165
CL
1333 return err;
1334}
1335
1336static int
1337netdev_dpdk_vhost_client_construct(struct netdev *netdev)
1338{
1339 int err;
1340
2d24d165 1341 ovs_mutex_lock(&dpdk_mutex);
1ce30dfd 1342 err = vhost_common_construct(netdev);
f3e7ec25
MW
1343 if (err) {
1344 VLOG_ERR("vhost_common_construct failed for vhost user client"
1345 "port: %s\n", netdev->name);
1346 }
7d1ced01 1347 ovs_mutex_unlock(&dpdk_mutex);
58397e6c
KT
1348 return err;
1349}
1350
95fb793a 1351static int
1352netdev_dpdk_construct(struct netdev *netdev)
1353{
95fb793a 1354 int err;
1355
95fb793a 1356 ovs_mutex_lock(&dpdk_mutex);
bb37956a
IM
1357 err = common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
1358 DPDK_DEV_ETH, SOCKET0);
8a9562d2
PS
1359 ovs_mutex_unlock(&dpdk_mutex);
1360 return err;
1361}
1362
1ce30dfd
DDP
1363static void
1364common_destruct(struct netdev_dpdk *dev)
1365 OVS_REQUIRES(dpdk_mutex)
1366 OVS_EXCLUDED(dev->mutex)
1367{
1368 rte_free(dev->tx_q);
43307ad0 1369 dpdk_mp_put(dev->dpdk_mp);
1ce30dfd
DDP
1370
1371 ovs_list_remove(&dev->list_node);
1372 free(ovsrcu_get_protected(struct ingress_policer *,
1373 &dev->ingress_policer));
1374 ovs_mutex_destroy(&dev->mutex);
1375}
1376
8a9562d2 1377static void
d46285a2 1378netdev_dpdk_destruct(struct netdev *netdev)
8a9562d2 1379{
d46285a2 1380 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
40e940e4
OM
1381 struct rte_device *rte_dev;
1382 struct rte_eth_dev *eth_dev;
1383 bool remove_on_close;
8a9562d2 1384
8d38823b 1385 ovs_mutex_lock(&dpdk_mutex);
8d38823b 1386
8a9562d2 1387 rte_eth_dev_stop(dev->port_id);
606f6650 1388 dev->started = false;
5dcde09c
IM
1389
1390 if (dev->attached) {
40e940e4
OM
1391 /* Retrieve eth device data before closing it.
1392 * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
1393 */
1394 eth_dev = &rte_eth_devices[dev->port_id];
1395 remove_on_close =
1396 eth_dev->data &&
1397 (eth_dev->data->dev_flags & RTE_ETH_DEV_CLOSE_REMOVE);
1398 rte_dev = eth_dev->device;
1399
1400 /* Remove the eth device. */
5dcde09c 1401 rte_eth_dev_close(dev->port_id);
40e940e4
OM
1402
1403 /* Remove this rte device and all its eth devices if flag
1404 * RTE_ETH_DEV_CLOSE_REMOVE is not supported (which means representors
1405 * are not supported), or if all the eth devices belonging to the rte
1406 * device are closed.
1407 */
1408 if (!remove_on_close || !netdev_dpdk_get_num_ports(rte_dev)) {
595ce47c
IM
1409 int ret = rte_dev_remove(rte_dev);
1410
1411 if (ret < 0) {
1412 VLOG_ERR("Device '%s' can not be detached: %s.",
1413 dev->devargs, rte_strerror(-ret));
40e940e4
OM
1414 } else {
1415 /* Device was closed and detached. */
1416 VLOG_INFO("Device '%s' has been removed and detached",
1417 dev->devargs);
1418 }
5dcde09c 1419 } else {
40e940e4
OM
1420 /* Device was only closed. rte_dev_remove() was not called. */
1421 VLOG_INFO("Device '%s' has been removed", dev->devargs);
5dcde09c
IM
1422 }
1423 }
1424
ac1a9bb9 1425 netdev_dpdk_clear_xstats(dev);
55e075e6 1426 free(dev->devargs);
1ce30dfd 1427 common_destruct(dev);
8d38823b 1428
8a9562d2 1429 ovs_mutex_unlock(&dpdk_mutex);
58397e6c 1430}
8a9562d2 1431
3f891bbe
DDP
1432/* rte_vhost_driver_unregister() can call back destroy_device(), which will
1433 * try to acquire 'dpdk_mutex' and possibly 'dev->mutex'. To avoid a
1434 * deadlock, none of the mutexes must be held while calling this function. */
1435static int
c1ff66ac
CL
1436dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
1437 char *vhost_id)
3f891bbe
DDP
1438 OVS_EXCLUDED(dpdk_mutex)
1439 OVS_EXCLUDED(dev->mutex)
1440{
c1ff66ac 1441 return rte_vhost_driver_unregister(vhost_id);
3f891bbe
DDP
1442}
1443
58397e6c 1444static void
d46285a2 1445netdev_dpdk_vhost_destruct(struct netdev *netdev)
58397e6c 1446{
d46285a2 1447 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
c1ff66ac 1448 char *vhost_id;
58397e6c 1449
8d38823b 1450 ovs_mutex_lock(&dpdk_mutex);
8d38823b 1451
c62da695 1452 /* Guest becomes an orphan if still attached. */
c1ff66ac
CL
1453 if (netdev_dpdk_get_vid(dev) >= 0
1454 && !(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
c62da695 1455 VLOG_ERR("Removing port '%s' while vhost device still attached.",
d46285a2 1456 netdev->name);
58be5c0e
MK
1457 VLOG_ERR("To restore connectivity after re-adding of port, VM on "
1458 "socket '%s' must be restarted.", dev->vhost_id);
58397e6c
KT
1459 }
1460
bb9d2623
IM
1461 vhost_id = dev->vhost_id;
1462 dev->vhost_id = NULL;
35c91567 1463 rte_free(dev->vhost_rxq_enabled);
c1ff66ac 1464
1ce30dfd
DDP
1465 common_destruct(dev);
1466
58397e6c 1467 ovs_mutex_unlock(&dpdk_mutex);
3f891bbe 1468
bb9d2623 1469 if (!vhost_id) {
821b8664
IM
1470 goto out;
1471 }
1472
c1ff66ac 1473 if (dpdk_vhost_driver_unregister(dev, vhost_id)) {
41964543
IM
1474 VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
1475 netdev->name, vhost_id);
c1ff66ac
CL
1476 } else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
1477 /* OVS server mode - remove this socket from list for deletion */
1478 fatal_signal_remove_file_to_unlink(vhost_id);
3f891bbe 1479 }
821b8664 1480out:
c1ff66ac 1481 free(vhost_id);
8a9562d2
PS
1482}
1483
1484static void
d46285a2 1485netdev_dpdk_dealloc(struct netdev *netdev)
8a9562d2 1486{
d46285a2 1487 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 1488
d46285a2 1489 rte_free(dev);
8a9562d2
PS
1490}
1491
971f4b39 1492static void
ac1a9bb9 1493netdev_dpdk_clear_xstats(struct netdev_dpdk *dev)
971f4b39
MW
1494{
1495 /* If statistics are already allocated, we have to
1496 * reconfigure, as port_id could have been changed. */
1497 if (dev->rte_xstats_names) {
1498 free(dev->rte_xstats_names);
1499 dev->rte_xstats_names = NULL;
1500 dev->rte_xstats_names_size = 0;
1501 }
1502 if (dev->rte_xstats_ids) {
1503 free(dev->rte_xstats_ids);
1504 dev->rte_xstats_ids = NULL;
1505 dev->rte_xstats_ids_size = 0;
1506 }
1507}
1508
1509static const char*
1510netdev_dpdk_get_xstat_name(struct netdev_dpdk *dev, uint64_t id)
1511{
1512 if (id >= dev->rte_xstats_names_size) {
1513 return "UNKNOWN";
1514 }
1515 return dev->rte_xstats_names[id].name;
1516}
1517
1518static bool
1519netdev_dpdk_configure_xstats(struct netdev_dpdk *dev)
1520 OVS_REQUIRES(dev->mutex)
1521{
1522 int rte_xstats_len;
1523 bool ret;
1524 struct rte_eth_xstat *rte_xstats;
1525 uint64_t id;
1526 int xstats_no;
1527 const char *name;
1528
1529 /* Retrieve all XSTATS names. If something goes wrong, or the
 1530 * number of counters is 0, the rte_xstats_names
 1531 * buffer will be set to NULL, and any further xstats
1532 * query won't be performed (e.g. during netdev_dpdk_get_stats
1533 * execution). */
1534
1535 ret = false;
1536 rte_xstats = NULL;
1537
1538 if (dev->rte_xstats_names == NULL || dev->rte_xstats_ids == NULL) {
1539 dev->rte_xstats_names_size =
1540 rte_eth_xstats_get_names(dev->port_id, NULL, 0);
1541
1542 if (dev->rte_xstats_names_size < 0) {
fa9f4eeb
IM
1543 VLOG_WARN("Cannot get XSTATS for port: "DPDK_PORT_ID_FMT,
1544 dev->port_id);
971f4b39
MW
1545 dev->rte_xstats_names_size = 0;
1546 } else {
1547 /* Reserve memory for xstats names and values */
1548 dev->rte_xstats_names = xcalloc(dev->rte_xstats_names_size,
1549 sizeof *dev->rte_xstats_names);
1550
1551 if (dev->rte_xstats_names) {
1552 /* Retrieve xstats names. */
1553 rte_xstats_len =
1554 rte_eth_xstats_get_names(dev->port_id,
1555 dev->rte_xstats_names,
1556 dev->rte_xstats_names_size);
1557
1558 if (rte_xstats_len < 0) {
fa9f4eeb
IM
1559 VLOG_WARN("Cannot get XSTATS names for port: "
1560 DPDK_PORT_ID_FMT, dev->port_id);
971f4b39
MW
1561 goto out;
1562 } else if (rte_xstats_len != dev->rte_xstats_names_size) {
fa9f4eeb
IM
1563 VLOG_WARN("XSTATS size doesn't match for port: "
1564 DPDK_PORT_ID_FMT, dev->port_id);
971f4b39
MW
1565 goto out;
1566 }
1567
1568 dev->rte_xstats_ids = xcalloc(dev->rte_xstats_names_size,
1569 sizeof(uint64_t));
1570
1571 /* We have to calculate number of counters */
1572 rte_xstats = xmalloc(rte_xstats_len * sizeof *rte_xstats);
1573 memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);
1574
1575 /* Retrieve xstats values. */
1576 if (rte_eth_xstats_get(dev->port_id, rte_xstats,
1577 rte_xstats_len) > 0) {
1578 dev->rte_xstats_ids_size = 0;
1579 xstats_no = 0;
1580 for (uint32_t i = 0; i < rte_xstats_len; i++) {
1581 id = rte_xstats[i].id;
1582 name = netdev_dpdk_get_xstat_name(dev, id);
1583 /* We need to filter out everything except
1584 * dropped, error and management counters */
1585 if (string_ends_with(name, "_errors") ||
1586 strstr(name, "_management_") ||
1587 string_ends_with(name, "_dropped")) {
1588
1589 dev->rte_xstats_ids[xstats_no] = id;
1590 xstats_no++;
1591 }
1592 }
1593 dev->rte_xstats_ids_size = xstats_no;
1594 ret = true;
1595 } else {
fa9f4eeb
IM
1596 VLOG_WARN("Can't get XSTATS IDs for port: "
1597 DPDK_PORT_ID_FMT, dev->port_id);
971f4b39 1598 }
34eb0863
IM
1599
1600 free(rte_xstats);
971f4b39
MW
1601 }
1602 }
1603 } else {
1604 /* Already configured */
1605 ret = true;
1606 }
1607
1608out:
1609 if (!ret) {
1610 netdev_dpdk_clear_xstats(dev);
1611 }
1612 return ret;
1613}
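/*
 * A minimal sketch, not part of the build, of the name filter applied
 * above (the counter names here are invented for illustration):
 *
 *     static bool
 *     xstat_is_interesting(const char *name)
 *     {
 *         return string_ends_with(name, "_errors")
 *                || strstr(name, "_management_")
 *                || string_ends_with(name, "_dropped");
 *     }
 *
 * e.g. "rx_crc_errors" and "tx_dropped" would be kept, while
 * "rx_good_packets" would be skipped.
 */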
1614
8a9562d2 1615static int
a14b8947 1616netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
8a9562d2 1617{
a14b8947 1618 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
1619
1620 ovs_mutex_lock(&dev->mutex);
1621
050c60bf 1622 smap_add_format(args, "requested_rx_queues", "%d", dev->requested_n_rxq);
a14b8947 1623 smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq);
81acebda
IM
1624 smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq);
1625 smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq);
0072e931 1626 smap_add_format(args, "mtu", "%d", dev->mtu);
451f26fd
IM
1627
1628 if (dev->type == DPDK_DEV_ETH) {
1629 smap_add_format(args, "requested_rxq_descriptors", "%d",
1630 dev->requested_rxq_size);
1631 smap_add_format(args, "configured_rxq_descriptors", "%d",
1632 dev->rxq_size);
1633 smap_add_format(args, "requested_txq_descriptors", "%d",
1634 dev->requested_txq_size);
1635 smap_add_format(args, "configured_txq_descriptors", "%d",
1636 dev->txq_size);
1a2bb118
SC
1637 if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
1638 smap_add(args, "rx_csum_offload", "true");
8155ab7e
KT
1639 } else {
1640 smap_add(args, "rx_csum_offload", "false");
1a2bb118 1641 }
f8b64a61
RM
1642 smap_add(args, "lsc_interrupt_mode",
1643 dev->lsc_interrupt_mode ? "true" : "false");
451f26fd 1644 }
8a9562d2
PS
1645 ovs_mutex_unlock(&dev->mutex);
1646
1647 return 0;
1648}
1649
55e075e6 1650static struct netdev_dpdk *
bb37956a 1651netdev_dpdk_lookup_by_port_id(dpdk_port_t port_id)
55e075e6
CL
1652 OVS_REQUIRES(dpdk_mutex)
1653{
1654 struct netdev_dpdk *dev;
1655
1656 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
1657 if (dev->port_id == port_id) {
1658 return dev;
1659 }
1660 }
1661
1662 return NULL;
1663}
1664
5e758818
YL
1665static dpdk_port_t
1666netdev_dpdk_get_port_by_mac(const char *mac_str)
1667{
1668 dpdk_port_t port_id;
1669 struct eth_addr mac, port_mac;
1670
1671 if (!eth_addr_from_string(mac_str, &mac)) {
1672 VLOG_ERR("invalid mac: %s", mac_str);
1673 return DPDK_ETH_PORT_ID_INVALID;
1674 }
1675
1676 RTE_ETH_FOREACH_DEV (port_id) {
1677 struct ether_addr ea;
1678
1679 rte_eth_macaddr_get(port_id, &ea);
1680 memcpy(port_mac.ea, ea.addr_bytes, ETH_ADDR_LEN);
1681 if (eth_addr_equals(mac, port_mac)) {
1682 return port_id;
1683 }
1684 }
1685
1686 return DPDK_ETH_PORT_ID_INVALID;
1687}
1688
40e940e4
OM
1689/* Return the first DPDK port id matching the devargs pattern. */
1690static dpdk_port_t netdev_dpdk_get_port_by_devargs(const char *devargs)
1691 OVS_REQUIRES(dpdk_mutex)
1692{
1693 dpdk_port_t port_id;
1694 struct rte_dev_iterator iterator;
1695
1696 RTE_ETH_FOREACH_MATCHING_DEV (port_id, devargs, &iterator) {
1697 /* If a break is done - must call rte_eth_iterator_cleanup. */
1698 rte_eth_iterator_cleanup(&iterator);
1699 break;
1700 }
1701
1702 return port_id;
1703}
1704
5e758818 1705/*
40e940e4
OM
1706 * Normally, a PCI id (optionally followed by a representor number)
1707 * is enough for identifying a specific DPDK port.
5e758818
YL
1708 * However, for some NICs having multiple ports sharing the same PCI
1709 * id, using PCI id won't work then.
1710 *
1711 * To fix that, here one more method is introduced: "class=eth,mac=$MAC".
1712 *
1713 * Note that the compatibility is fully kept: user can still use the
1714 * PCI id for adding ports (when it's enough for them).
1715 */
bb37956a 1716static dpdk_port_t
5dcde09c
IM
1717netdev_dpdk_process_devargs(struct netdev_dpdk *dev,
1718 const char *devargs, char **errp)
40e940e4 1719 OVS_REQUIRES(dpdk_mutex)
55e075e6 1720{
40e940e4 1721 dpdk_port_t new_port_id;
55e075e6 1722
5e758818
YL
1723 if (strncmp(devargs, "class=eth,mac=", 14) == 0) {
1724 new_port_id = netdev_dpdk_get_port_by_mac(&devargs[14]);
1725 } else {
40e940e4
OM
1726 new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
1727 if (!rte_eth_dev_is_valid_port(new_port_id)) {
5e758818 1728 /* Device not found in DPDK, attempt to attach it */
40e940e4 1729 if (rte_dev_probe(devargs)) {
5e758818 1730 new_port_id = DPDK_ETH_PORT_ID_INVALID;
40e940e4
OM
1731 } else {
1732 new_port_id = netdev_dpdk_get_port_by_devargs(devargs);
1733 if (rte_eth_dev_is_valid_port(new_port_id)) {
1734 /* Attach successful */
1735 dev->attached = true;
1736 VLOG_INFO("Device '%s' attached to DPDK", devargs);
1737 } else {
1738 /* Attach unsuccessful */
1739 new_port_id = DPDK_ETH_PORT_ID_INVALID;
1740 }
5e758818 1741 }
55e075e6 1742 }
5e758818
YL
1743 }
1744
1745 if (new_port_id == DPDK_ETH_PORT_ID_INVALID) {
1746 VLOG_WARN_BUF(errp, "Error attaching device '%s' to DPDK", devargs);
55e075e6
CL
1747 }
1748
1749 return new_port_id;
1750}
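/*
 * Illustrative only; the device addresses below are invented. Both forms
 * of 'options:dpdk-devargs' end up in this function:
 *
 *     dpdk-devargs=0000:06:00.0                       (PCI id)
 *     dpdk-devargs="class=eth,mac=00:11:22:33:44:55"  (MAC lookup above)
 */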
1751
c3d062a7
CL
1752static void
1753dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args)
b614c894 1754 OVS_REQUIRES(dev->mutex)
a14b8947 1755{
050c60bf 1756 int new_n_rxq;
a14b8947 1757
2a21e757 1758 new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
050c60bf
DDP
1759 if (new_n_rxq != dev->requested_n_rxq) {
1760 dev->requested_n_rxq = new_n_rxq;
c3d062a7 1761 netdev_request_reconfigure(&dev->up);
050c60bf 1762 }
c3d062a7
CL
1763}
1764
b685696b
CL
1765static void
1766dpdk_process_queue_size(struct netdev *netdev, const struct smap *args,
1767 const char *flag, int default_size, int *new_size)
1768{
1769 int queue_size = smap_get_int(args, flag, default_size);
1770
1771 if (queue_size <= 0 || queue_size > NIC_PORT_MAX_Q_SIZE
1772 || !is_pow2(queue_size)) {
1773 queue_size = default_size;
1774 }
1775
1776 if (queue_size != *new_size) {
1777 *new_size = queue_size;
1778 netdev_request_reconfigure(netdev);
1779 }
1780}
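/*
 * Example of the clamping above, assuming a default of 2048 descriptors:
 * n_rxq_desc=512 is accepted as requested, while n_rxq_desc=1000 (not a
 * power of two) or n_rxq_desc=0 silently falls back to 2048.
 */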
1781
c3d062a7 1782static int
9fff138e
DDP
1783netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
1784 char **errp)
c3d062a7
CL
1785{
1786 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
f8b64a61 1787 bool rx_fc_en, tx_fc_en, autoneg, lsc_interrupt_mode;
b614c894
IM
1788 enum rte_eth_fc_mode fc_mode;
1789 static const enum rte_eth_fc_mode fc_mode_set[2][2] = {
1790 {RTE_FC_NONE, RTE_FC_TX_PAUSE},
1791 {RTE_FC_RX_PAUSE, RTE_FC_FULL }
1792 };
55e075e6
CL
1793 const char *new_devargs;
1794 int err = 0;
c3d062a7 1795
55e075e6 1796 ovs_mutex_lock(&dpdk_mutex);
c3d062a7
CL
1797 ovs_mutex_lock(&dev->mutex);
1798
1799 dpdk_set_rxq_config(dev, args);
1800
b685696b
CL
1801 dpdk_process_queue_size(netdev, args, "n_rxq_desc",
1802 NIC_PORT_DEFAULT_RXQ_SIZE,
1803 &dev->requested_rxq_size);
1804 dpdk_process_queue_size(netdev, args, "n_txq_desc",
1805 NIC_PORT_DEFAULT_TXQ_SIZE,
1806 &dev->requested_txq_size);
1807
55e075e6
CL
1808 new_devargs = smap_get(args, "dpdk-devargs");
1809
1810 if (dev->devargs && strcmp(new_devargs, dev->devargs)) {
1811 /* The user requested a new device. If we return error, the caller
1812 * will delete this netdev and try to recreate it. */
1813 err = EAGAIN;
1814 goto out;
1815 }
1816
1817 /* dpdk-devargs is required for device configuration */
1818 if (new_devargs && new_devargs[0]) {
1819 /* Don't process dpdk-devargs if value is unchanged and port id
1820 * is valid */
1821 if (!(dev->devargs && !strcmp(dev->devargs, new_devargs)
1822 && rte_eth_dev_is_valid_port(dev->port_id))) {
bb37956a
IM
1823 dpdk_port_t new_port_id = netdev_dpdk_process_devargs(dev,
1824 new_devargs,
1825 errp);
55e075e6
CL
1826 if (!rte_eth_dev_is_valid_port(new_port_id)) {
1827 err = EINVAL;
1828 } else if (new_port_id == dev->port_id) {
1829 /* Already configured, do not reconfigure again */
1830 err = 0;
1831 } else {
1832 struct netdev_dpdk *dup_dev;
bb37956a 1833
55e075e6
CL
1834 dup_dev = netdev_dpdk_lookup_by_port_id(new_port_id);
1835 if (dup_dev) {
9fff138e 1836 VLOG_WARN_BUF(errp, "'%s' is trying to use device '%s' "
40e940e4 1837 "which is already in use by '%s'",
9fff138e
DDP
1838 netdev_get_name(netdev), new_devargs,
1839 netdev_get_name(&dup_dev->up));
55e075e6
CL
1840 err = EADDRINUSE;
1841 } else {
bd4e172b 1842 int sid = rte_eth_dev_socket_id(new_port_id);
bb37956a 1843
bd4e172b 1844 dev->requested_socket_id = sid < 0 ? SOCKET0 : sid;
55e075e6
CL
1845 dev->devargs = xstrdup(new_devargs);
1846 dev->port_id = new_port_id;
1847 netdev_request_reconfigure(&dev->up);
971f4b39 1848 netdev_dpdk_clear_xstats(dev);
55e075e6
CL
1849 err = 0;
1850 }
1851 }
1852 }
1853 } else {
9fff138e
DDP
1854 VLOG_WARN_BUF(errp, "'%s' is missing 'options:dpdk-devargs'. "
1855 "The old 'dpdk<port_id>' names are not supported",
1856 netdev_get_name(netdev));
55e075e6
CL
1857 err = EINVAL;
1858 }
1859
1860 if (err) {
1861 goto out;
1862 }
1863
f8b64a61
RM
1864 lsc_interrupt_mode = smap_get_bool(args, "dpdk-lsc-interrupt", false);
1865 if (dev->requested_lsc_interrupt_mode != lsc_interrupt_mode) {
1866 dev->requested_lsc_interrupt_mode = lsc_interrupt_mode;
1867 netdev_request_reconfigure(netdev);
1868 }
1869
c3d062a7
CL
1870 rx_fc_en = smap_get_bool(args, "rx-flow-ctrl", false);
1871 tx_fc_en = smap_get_bool(args, "tx-flow-ctrl", false);
b614c894 1872 autoneg = smap_get_bool(args, "flow-ctrl-autoneg", false);
c3d062a7 1873
b614c894
IM
1874 fc_mode = fc_mode_set[tx_fc_en][rx_fc_en];
1875 if (dev->fc_conf.mode != fc_mode || autoneg != dev->fc_conf.autoneg) {
1876 dev->fc_conf.mode = fc_mode;
1877 dev->fc_conf.autoneg = autoneg;
7e1de65e
SC
1878 /* Get the Flow control configuration for DPDK-ETH */
1879 err = rte_eth_dev_flow_ctrl_get(dev->port_id, &dev->fc_conf);
1880 if (err) {
1881 VLOG_WARN("Cannot get flow control parameters on port "
1882 DPDK_PORT_ID_FMT", err=%d", dev->port_id, err);
1883 }
b614c894
IM
1884 dpdk_eth_flow_ctrl_setup(dev);
1885 }
9fd39370 1886
55e075e6 1887out:
c3d062a7 1888 ovs_mutex_unlock(&dev->mutex);
55e075e6 1889 ovs_mutex_unlock(&dpdk_mutex);
c3d062a7 1890
55e075e6 1891 return err;
c3d062a7
CL
1892}
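/*
 * Example of the flow-control lookup above: with tx-flow-ctrl=false and
 * rx-flow-ctrl=true, fc_mode_set[0][1] selects RTE_FC_TX_PAUSE; enabling
 * both selects RTE_FC_FULL, and disabling both selects RTE_FC_NONE.
 */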
1893
1894static int
9fff138e
DDP
1895netdev_dpdk_ring_set_config(struct netdev *netdev, const struct smap *args,
1896 char **errp OVS_UNUSED)
c3d062a7
CL
1897{
1898 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1899
1900 ovs_mutex_lock(&dev->mutex);
1901 dpdk_set_rxq_config(dev, args);
a14b8947
IM
1902 ovs_mutex_unlock(&dev->mutex);
1903
1904 return 0;
1905}
1906
c1ff66ac 1907static int
2d24d165 1908netdev_dpdk_vhost_client_set_config(struct netdev *netdev,
9fff138e
DDP
1909 const struct smap *args,
1910 char **errp OVS_UNUSED)
c1ff66ac
CL
1911{
1912 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
1913 const char *path;
1914
6881885a 1915 ovs_mutex_lock(&dev->mutex);
c1ff66ac
CL
1916 if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
1917 path = smap_get(args, "vhost-server-path");
bb9d2623
IM
1918 if (!nullable_string_is_equal(path, dev->vhost_id)) {
1919 free(dev->vhost_id);
1920 dev->vhost_id = nullable_xstrdup(path);
10087cba
CL
 1921 /* Check zero-copy configuration. */
1922 if (smap_get_bool(args, "dq-zero-copy", false)) {
1923 dev->vhost_driver_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1924 } else {
1925 dev->vhost_driver_flags &= ~RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1926 }
c1ff66ac
CL
1927 netdev_request_reconfigure(netdev);
1928 }
1929 }
6881885a 1930 ovs_mutex_unlock(&dev->mutex);
c1ff66ac
CL
1931
1932 return 0;
1933}
1934
7dec44fe 1935static int
d46285a2 1936netdev_dpdk_get_numa_id(const struct netdev *netdev)
7dec44fe 1937{
d46285a2 1938 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
7dec44fe 1939
d46285a2 1940 return dev->socket_id;
7dec44fe
AW
1941}
1942
050c60bf 1943/* Sets the number of tx queues for the dpdk interface. */
5496878c 1944static int
050c60bf 1945netdev_dpdk_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
5496878c 1946{
d46285a2 1947 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
5496878c 1948
d46285a2 1949 ovs_mutex_lock(&dev->mutex);
91968eb0 1950
050c60bf
DDP
1951 if (dev->requested_n_txq == n_txq) {
1952 goto out;
4573fbd3
FL
1953 }
1954
050c60bf
DDP
1955 dev->requested_n_txq = n_txq;
1956 netdev_request_reconfigure(netdev);
58397e6c 1957
050c60bf 1958out:
d46285a2 1959 ovs_mutex_unlock(&dev->mutex);
050c60bf 1960 return 0;
58397e6c
KT
1961}
1962
8a9562d2
PS
1963static struct netdev_rxq *
1964netdev_dpdk_rxq_alloc(void)
1965{
1966 struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);
1967
eff23640
DDP
1968 if (rx) {
1969 return &rx->up;
1970 }
1971
1972 return NULL;
8a9562d2
PS
1973}
1974
1975static struct netdev_rxq_dpdk *
d46285a2 1976netdev_rxq_dpdk_cast(const struct netdev_rxq *rxq)
8a9562d2 1977{
d46285a2 1978 return CONTAINER_OF(rxq, struct netdev_rxq_dpdk, up);
8a9562d2
PS
1979}
1980
1981static int
d46285a2 1982netdev_dpdk_rxq_construct(struct netdev_rxq *rxq)
8a9562d2 1983{
d46285a2
DDP
1984 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
1985 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
8a9562d2 1986
d46285a2
DDP
1987 ovs_mutex_lock(&dev->mutex);
1988 rx->port_id = dev->port_id;
1989 ovs_mutex_unlock(&dev->mutex);
8a9562d2
PS
1990
1991 return 0;
1992}
1993
1994static void
d46285a2 1995netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq OVS_UNUSED)
8a9562d2
PS
1996{
1997}
1998
1999static void
d46285a2 2000netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
8a9562d2 2001{
d46285a2 2002 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
8a9562d2
PS
2003
2004 rte_free(rx);
2005}
2006
819f13bd
DDP
2007/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes ownership of
2008 * 'pkts', even in case of failure.
2009 *
2010 * Returns the number of packets that weren't transmitted. */
2011static inline int
b59cc14e 2012netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
819f13bd 2013 struct rte_mbuf **pkts, int cnt)
8a9562d2 2014{
1304f1f8
DDP
2015 uint32_t nb_tx = 0;
2016
b59cc14e 2017 while (nb_tx != cnt) {
1304f1f8
DDP
2018 uint32_t ret;
2019
b59cc14e 2020 ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx, cnt - nb_tx);
1304f1f8
DDP
2021 if (!ret) {
2022 break;
2023 }
2024
2025 nb_tx += ret;
2026 }
8a9562d2 2027
b59cc14e 2028 if (OVS_UNLIKELY(nb_tx != cnt)) {
819f13bd 2029 /* Free buffers, which we couldn't transmit, one at a time (each
db73f716
DDP
2030 * packet could come from a different mempool) */
2031 int i;
2032
b59cc14e
IM
2033 for (i = nb_tx; i < cnt; i++) {
2034 rte_pktmbuf_free(pkts[i]);
db73f716 2035 }
8a9562d2 2036 }
819f13bd
DDP
2037
2038 return cnt - nb_tx;
8a9562d2
PS
2039}
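/*
 * Minimal usage sketch for the helper above: ownership of every mbuf is
 * taken regardless of outcome, so a caller only has to account for the
 * return value, e.g.:
 *
 *     dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, txcnt);
 */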
2040
f3926f29
IS
2041static inline bool
2042netdev_dpdk_policer_pkt_handle(struct rte_meter_srtcm *meter,
03f3f9c0 2043 struct rte_meter_srtcm_profile *profile,
f3926f29
IS
2044 struct rte_mbuf *pkt, uint64_t time)
2045{
2046 uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct ether_hdr);
2047
03f3f9c0
OM
2048 return rte_meter_srtcm_color_blind_check(meter, profile, time, pkt_len) ==
2049 e_RTE_METER_GREEN;
f3926f29
IS
2050}
2051
2052static int
2053netdev_dpdk_policer_run(struct rte_meter_srtcm *meter,
03f3f9c0 2054 struct rte_meter_srtcm_profile *profile,
3e90f7d7 2055 struct rte_mbuf **pkts, int pkt_cnt,
7d7ded7a 2056 bool should_steal)
f3926f29
IS
2057{
2058 int i = 0;
2059 int cnt = 0;
2060 struct rte_mbuf *pkt = NULL;
2061 uint64_t current_time = rte_rdtsc();
2062
2063 for (i = 0; i < pkt_cnt; i++) {
2064 pkt = pkts[i];
2065 /* Handle current packet */
03f3f9c0
OM
2066 if (netdev_dpdk_policer_pkt_handle(meter, profile,
2067 pkt, current_time)) {
f3926f29
IS
2068 if (cnt != i) {
2069 pkts[cnt] = pkt;
2070 }
2071 cnt++;
2072 } else {
7d7ded7a 2073 if (should_steal) {
3e90f7d7
GZ
2074 rte_pktmbuf_free(pkt);
2075 }
f3926f29
IS
2076 }
2077 }
2078
2079 return cnt;
2080}
2081
9509913a
IS
2082static int
2083ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,
7d7ded7a 2084 int pkt_cnt, bool should_steal)
9509913a
IS
2085{
2086 int cnt = 0;
2087
2088 rte_spinlock_lock(&policer->policer_lock);
03f3f9c0
OM
2089 cnt = netdev_dpdk_policer_run(&policer->in_policer, &policer->in_prof,
2090 pkts, pkt_cnt, should_steal);
9509913a
IS
2091 rte_spinlock_unlock(&policer->policer_lock);
2092
2093 return cnt;
2094}
2095
58397e6c 2096static bool
0a0f39df 2097is_vhost_running(struct netdev_dpdk *dev)
58397e6c 2098{
0a0f39df 2099 return (netdev_dpdk_get_vid(dev) >= 0 && dev->vhost_reconfigured);
58397e6c
KT
2100}
2101
d6e3feb5 2102static inline void
2103netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
2104 unsigned int packet_size)
2105{
2106 /* Hard-coded search for the size bucket. */
2107 if (packet_size < 256) {
2108 if (packet_size >= 128) {
2109 stats->rx_128_to_255_packets++;
2110 } else if (packet_size <= 64) {
2111 stats->rx_1_to_64_packets++;
2112 } else {
2113 stats->rx_65_to_127_packets++;
2114 }
2115 } else {
2116 if (packet_size >= 1523) {
2117 stats->rx_1523_to_max_packets++;
2118 } else if (packet_size >= 1024) {
2119 stats->rx_1024_to_1522_packets++;
2120 } else if (packet_size < 512) {
2121 stats->rx_256_to_511_packets++;
2122 } else {
2123 stats->rx_512_to_1023_packets++;
2124 }
2125 }
2126}
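/*
 * Worked examples for the bucketing above: a 60-byte frame increments
 * rx_1_to_64_packets, a 100-byte frame rx_65_to_127_packets, and a
 * 1600-byte frame rx_1523_to_max_packets.
 */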
2127
9e3ddd45
TP
2128static inline void
2129netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
9509913a
IS
2130 struct dp_packet **packets, int count,
2131 int dropped)
9e3ddd45
TP
2132{
2133 int i;
d6e3feb5 2134 unsigned int packet_size;
9e3ddd45
TP
2135 struct dp_packet *packet;
2136
2137 stats->rx_packets += count;
9509913a 2138 stats->rx_dropped += dropped;
9e3ddd45
TP
2139 for (i = 0; i < count; i++) {
2140 packet = packets[i];
d6e3feb5 2141 packet_size = dp_packet_size(packet);
9e3ddd45 2142
d6e3feb5 2143 if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) {
9e3ddd45
TP
2144 /* This only protects the following multicast counting from
2145 * too short packets, but it does not stop the packet from
2146 * further processing. */
2147 stats->rx_errors++;
2148 stats->rx_length_errors++;
2149 continue;
2150 }
2151
d6e3feb5 2152 netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);
2153
9e3ddd45
TP
2154 struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);
2155 if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {
2156 stats->multicast++;
2157 }
2158
d6e3feb5 2159 stats->rx_bytes += packet_size;
9e3ddd45
TP
2160 }
2161}
2162
58397e6c
KT
2163/*
2164 * The receive path for the vhost port is the TX path out from guest.
2165 */
2166static int
d46285a2 2167netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
8492adc2 2168 struct dp_packet_batch *batch, int *qfill)
58397e6c 2169{
d46285a2 2170 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
9509913a 2171 struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
58397e6c 2172 uint16_t nb_rx = 0;
9509913a 2173 uint16_t dropped = 0;
8492adc2 2174 int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
daf22bf7 2175 int vid = netdev_dpdk_get_vid(dev);
58397e6c 2176
daf22bf7 2177 if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
e543851d 2178 || !(dev->flags & NETDEV_UP))) {
58397e6c
KT
2179 return EAGAIN;
2180 }
2181
43307ad0 2182 nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp,
64839cf4 2183 (struct rte_mbuf **) batch->packets,
cd159f1a 2184 NETDEV_MAX_BURST);
58397e6c
KT
2185 if (!nb_rx) {
2186 return EAGAIN;
2187 }
2188
8492adc2
JS
2189 if (qfill) {
2190 if (nb_rx == NETDEV_MAX_BURST) {
2191 /* The DPDK API returns a uint32_t which often has invalid bits in
2192 * the upper 16-bits. Need to restrict the value to uint16_t. */
2193 *qfill = rte_vhost_rx_queue_count(vid, qid) & UINT16_MAX;
2194 } else {
2195 *qfill = 0;
2196 }
2197 }
2198
9509913a
IS
2199 if (policer) {
2200 dropped = nb_rx;
64839cf4
WT
2201 nb_rx = ingress_policer_run(policer,
2202 (struct rte_mbuf **) batch->packets,
3e90f7d7 2203 nb_rx, true);
9509913a
IS
2204 dropped -= nb_rx;
2205 }
2206
d46285a2 2207 rte_spinlock_lock(&dev->stats_lock);
64839cf4
WT
2208 netdev_dpdk_vhost_update_rx_counters(&dev->stats, batch->packets,
2209 nb_rx, dropped);
d46285a2 2210 rte_spinlock_unlock(&dev->stats_lock);
45d947c4 2211
75fb9148
ZB
2212 batch->count = nb_rx;
2213 dp_packet_batch_init_packet_fields(batch);
2214
58397e6c
KT
2215 return 0;
2216}
2217
35c91567
DM
2218static bool
2219netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq)
2220{
2221 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
2222
2223 return dev->vhost_rxq_enabled[rxq->queue_id];
2224}
2225
8a9562d2 2226static int
8492adc2
JS
2227netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
2228 int *qfill)
8a9562d2 2229{
d46285a2
DDP
2230 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
2231 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
9509913a 2232 struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
8a9562d2 2233 int nb_rx;
9509913a 2234 int dropped = 0;
8a9562d2 2235
3b1fb077
DDP
2236 if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
2237 return EAGAIN;
2238 }
2239
d46285a2 2240 nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
64839cf4 2241 (struct rte_mbuf **) batch->packets,
cd159f1a 2242 NETDEV_MAX_BURST);
8a9562d2
PS
2243 if (!nb_rx) {
2244 return EAGAIN;
2245 }
2246
9509913a
IS
2247 if (policer) {
2248 dropped = nb_rx;
64839cf4 2249 nb_rx = ingress_policer_run(policer,
58be5c0e 2250 (struct rte_mbuf **) batch->packets,
3e90f7d7 2251 nb_rx, true);
9509913a
IS
2252 dropped -= nb_rx;
2253 }
2254
2255 /* Update stats to reflect dropped packets */
2256 if (OVS_UNLIKELY(dropped)) {
2257 rte_spinlock_lock(&dev->stats_lock);
2258 dev->stats.rx_dropped += dropped;
2259 rte_spinlock_unlock(&dev->stats_lock);
2260 }
2261
64839cf4 2262 batch->count = nb_rx;
75fb9148 2263 dp_packet_batch_init_packet_fields(batch);
8a9562d2 2264
8492adc2
JS
2265 if (qfill) {
2266 if (nb_rx == NETDEV_MAX_BURST) {
2267 *qfill = rte_eth_rx_queue_count(rx->port_id, rxq->queue_id);
2268 } else {
2269 *qfill = 0;
2270 }
2271 }
2272
8a9562d2
PS
2273 return 0;
2274}
2275
0bf765f7 2276static inline int
78bd47cf 2277netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
7d7ded7a 2278 int cnt, bool should_steal)
0bf765f7 2279{
78bd47cf 2280 struct qos_conf *qos_conf = ovsrcu_get(struct qos_conf *, &dev->qos_conf);
0bf765f7 2281
78bd47cf
DDP
2282 if (qos_conf) {
2283 rte_spinlock_lock(&qos_conf->lock);
7d7ded7a 2284 cnt = qos_conf->ops->qos_run(qos_conf, pkts, cnt, should_steal);
78bd47cf 2285 rte_spinlock_unlock(&qos_conf->lock);
0bf765f7
IS
2286 }
2287
2288 return cnt;
2289}
2290
c6ec9d17
IM
2291static int
2292netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
2293 int pkt_cnt)
2294{
2295 int i = 0;
2296 int cnt = 0;
2297 struct rte_mbuf *pkt;
2298
2299 for (i = 0; i < pkt_cnt; i++) {
2300 pkt = pkts[i];
2301 if (OVS_UNLIKELY(pkt->pkt_len > dev->max_packet_len)) {
2302 VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " max_packet_len %d",
2303 dev->up.name, pkt->pkt_len, dev->max_packet_len);
2304 rte_pktmbuf_free(pkt);
2305 continue;
2306 }
2307
2308 if (OVS_UNLIKELY(i != cnt)) {
2309 pkts[cnt] = pkt;
2310 }
2311 cnt++;
2312 }
2313
2314 return cnt;
2315}
2316
9e3ddd45
TP
2317static inline void
2318netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
2319 struct dp_packet **packets,
2320 int attempted,
2321 int dropped)
2322{
2323 int i;
2324 int sent = attempted - dropped;
2325
2326 stats->tx_packets += sent;
2327 stats->tx_dropped += dropped;
2328
2329 for (i = 0; i < sent; i++) {
2330 stats->tx_bytes += dp_packet_size(packets[i]);
2331 }
2332}
2333
58397e6c 2334static void
4573fbd3 2335__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
dd52de45 2336 struct dp_packet **pkts, int cnt)
58397e6c 2337{
d46285a2 2338 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
95e9881f
KT
2339 struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
2340 unsigned int total_pkts = cnt;
c6ec9d17 2341 unsigned int dropped = 0;
dd52de45 2342 int i, retries = 0;
daf22bf7 2343 int vid = netdev_dpdk_get_vid(dev);
58397e6c 2344
81acebda 2345 qid = dev->tx_q[qid % netdev->n_txq].map;
585a5bea 2346
daf22bf7 2347 if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0
e543851d 2348 || !(dev->flags & NETDEV_UP))) {
d46285a2
DDP
2349 rte_spinlock_lock(&dev->stats_lock);
 2350 dev->stats.tx_dropped += cnt;
2351 rte_spinlock_unlock(&dev->stats_lock);
1b99bb05 2352 goto out;
58397e6c
KT
2353 }
2354
d46285a2 2355 rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
58397e6c 2356
c6ec9d17 2357 cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
0bf765f7 2358 /* Check if QoS has been configured for the netdev. */
3e90f7d7 2359 cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt, true);
c6ec9d17 2360 dropped = total_pkts - cnt;
0bf765f7 2361
95e9881f 2362 do {
4573fbd3 2363 int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
95e9881f
KT
2364 unsigned int tx_pkts;
2365
daf22bf7 2366 tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt);
95e9881f
KT
2367 if (OVS_LIKELY(tx_pkts)) {
2368 /* Packets have been sent.*/
2369 cnt -= tx_pkts;
31871ee3 2370 /* Prepare for possible retry.*/
95e9881f
KT
2371 cur_pkts = &cur_pkts[tx_pkts];
2372 } else {
31871ee3
KT
2373 /* No packets sent - do not retry.*/
2374 break;
95e9881f 2375 }
c6ec9d17 2376 } while (cnt && (retries++ <= VHOST_ENQ_RETRY_NUM));
4573fbd3 2377
d46285a2 2378 rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
95e9881f 2379
d46285a2 2380 rte_spinlock_lock(&dev->stats_lock);
0072e931 2381 netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts,
c6ec9d17 2382 cnt + dropped);
d46285a2 2383 rte_spinlock_unlock(&dev->stats_lock);
58397e6c
KT
2384
2385out:
c6ec9d17 2386 for (i = 0; i < total_pkts - dropped; i++) {
dd52de45 2387 dp_packet_delete(pkts[i]);
58397e6c
KT
2388 }
2389}
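/*
 * Example of the queue-id mapping above, assuming an identity tx_q[].map:
 * OVS tx queue 1 becomes vhost virtqueue 1 * VIRTIO_QNUM + VIRTIO_RXQ = 2,
 * i.e. the RX side of the guest's second queue pair.
 */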
2390
8a9562d2
PS
2391/* Tx function. Transmit packets indefinitely */
2392static void
64839cf4 2393dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
db73f716 2394 OVS_NO_THREAD_SAFETY_ANALYSIS
8a9562d2 2395{
8a14bd7b 2396 const size_t batch_cnt = dp_packet_batch_size(batch);
bce01e3a 2397#if !defined(__CHECKER__) && !defined(_WIN32)
8a14bd7b 2398 const size_t PKT_ARRAY_SIZE = batch_cnt;
bce01e3a
EJ
2399#else
2400 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 2401 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
bce01e3a 2402#endif
8a9562d2 2403 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2391135c 2404 struct rte_mbuf *pkts[PKT_ARRAY_SIZE];
8a14bd7b 2405 uint32_t cnt = batch_cnt;
3e90f7d7
GZ
2406 uint32_t dropped = 0;
2407
2408 if (dev->type != DPDK_DEV_VHOST) {
2409 /* Check if QoS has been configured for this netdev. */
2410 cnt = netdev_dpdk_qos_run(dev, (struct rte_mbuf **) batch->packets,
8a14bd7b
BB
2411 batch_cnt, false);
2412 dropped += batch_cnt - cnt;
3e90f7d7 2413 }
8a9562d2 2414
3e90f7d7
GZ
2415 uint32_t txcnt = 0;
2416
2417 for (uint32_t i = 0; i < cnt; i++) {
8a14bd7b
BB
2418 struct dp_packet *packet = batch->packets[i];
2419 uint32_t size = dp_packet_size(packet);
95fb793a 2420
f98d7864 2421 if (OVS_UNLIKELY(size > dev->max_packet_len)) {
3e90f7d7
GZ
2422 VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d",
2423 size, dev->max_packet_len);
f4fd623c 2424
175cf4de 2425 dropped++;
f4fd623c
DDP
2426 continue;
2427 }
8a9562d2 2428
43307ad0 2429 pkts[txcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
8a14bd7b 2430 if (OVS_UNLIKELY(!pkts[txcnt])) {
3e90f7d7 2431 dropped += cnt - i;
175cf4de 2432 break;
f4fd623c
DDP
2433 }
2434
2435 /* We have to do a copy for now */
3e90f7d7 2436 memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *),
8a14bd7b
BB
2437 dp_packet_data(packet), size);
2438 dp_packet_set_size((struct dp_packet *)pkts[txcnt], size);
f4fd623c 2439
3e90f7d7 2440 txcnt++;
f4fd623c 2441 }
8a9562d2 2442
3e90f7d7
GZ
2443 if (OVS_LIKELY(txcnt)) {
2444 if (dev->type == DPDK_DEV_VHOST) {
2445 __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) pkts,
2446 txcnt);
2447 } else {
2448 dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, txcnt);
2449 }
58397e6c 2450 }
db73f716 2451
0bf765f7
IS
2452 if (OVS_UNLIKELY(dropped)) {
2453 rte_spinlock_lock(&dev->stats_lock);
2454 dev->stats.tx_dropped += dropped;
2455 rte_spinlock_unlock(&dev->stats_lock);
2456 }
8a9562d2
PS
2457}
2458
58397e6c 2459static int
64839cf4
WT
2460netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
2461 struct dp_packet_batch *batch,
b30896c9 2462 bool concurrent_txq OVS_UNUSED)
58397e6c 2463{
58397e6c 2464
b30896c9 2465 if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
64839cf4 2466 dpdk_do_tx_copy(netdev, qid, batch);
b30896c9 2467 dp_packet_delete_batch(batch, true);
58397e6c 2468 } else {
dd52de45 2469 __netdev_dpdk_vhost_send(netdev, qid, batch->packets, batch->count);
58397e6c
KT
2470 }
2471 return 0;
2472}
2473
7251515e
DV
2474static inline void
2475netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
b30896c9 2476 struct dp_packet_batch *batch,
324c8374 2477 bool concurrent_txq)
8a9562d2 2478{
3b1fb077 2479 if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
b30896c9 2480 dp_packet_delete_batch(batch, true);
3b1fb077
DDP
2481 return;
2482 }
2483
324c8374 2484 if (OVS_UNLIKELY(concurrent_txq)) {
81acebda 2485 qid = qid % dev->up.n_txq;
a0cb2d66
DDP
2486 rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
2487 }
2488
b30896c9 2489 if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
7251515e
DV
2490 struct netdev *netdev = &dev->up;
2491
64839cf4 2492 dpdk_do_tx_copy(netdev, qid, batch);
b30896c9 2493 dp_packet_delete_batch(batch, true);
8a9562d2 2494 } else {
fd57eeba
BB
2495 int tx_cnt, dropped;
2496 int batch_cnt = dp_packet_batch_size(batch);
2391135c 2497 struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;
8a9562d2 2498
fd57eeba
BB
2499 tx_cnt = netdev_dpdk_filter_packet_len(dev, pkts, batch_cnt);
2500 tx_cnt = netdev_dpdk_qos_run(dev, pkts, tx_cnt, true);
2501 dropped = batch_cnt - tx_cnt;
1b99bb05 2502
fd57eeba 2503 dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, tx_cnt);
8a9562d2 2504
f4fd623c 2505 if (OVS_UNLIKELY(dropped)) {
45d947c4 2506 rte_spinlock_lock(&dev->stats_lock);
f4fd623c 2507 dev->stats.tx_dropped += dropped;
45d947c4 2508 rte_spinlock_unlock(&dev->stats_lock);
f4fd623c 2509 }
8a9562d2 2510 }
a0cb2d66 2511
324c8374 2512 if (OVS_UNLIKELY(concurrent_txq)) {
a0cb2d66
DDP
2513 rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
2514 }
7251515e
DV
2515}
2516
2517static int
2518netdev_dpdk_eth_send(struct netdev *netdev, int qid,
b30896c9 2519 struct dp_packet_batch *batch, bool concurrent_txq)
7251515e
DV
2520{
2521 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 2522
b30896c9 2523 netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
7251515e 2524 return 0;
8a9562d2
PS
2525}
2526
2527static int
74ff3298 2528netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
8a9562d2
PS
2529{
2530 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2531
2532 ovs_mutex_lock(&dev->mutex);
2533 if (!eth_addr_equals(dev->hwaddr, mac)) {
74ff3298 2534 dev->hwaddr = mac;
045c0d1a 2535 netdev_change_seq_changed(netdev);
8a9562d2
PS
2536 }
2537 ovs_mutex_unlock(&dev->mutex);
2538
2539 return 0;
2540}
2541
2542static int
74ff3298 2543netdev_dpdk_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
8a9562d2
PS
2544{
2545 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2546
2547 ovs_mutex_lock(&dev->mutex);
74ff3298 2548 *mac = dev->hwaddr;
8a9562d2
PS
2549 ovs_mutex_unlock(&dev->mutex);
2550
2551 return 0;
2552}
2553
2554static int
2555netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
2556{
2557 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2558
2559 ovs_mutex_lock(&dev->mutex);
2560 *mtup = dev->mtu;
2561 ovs_mutex_unlock(&dev->mutex);
2562
2563 return 0;
2564}
2565
0072e931
MK
2566static int
2567netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
2568{
2569 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2570
f6f50552
IS
2571 /* XXX: Ensure that the overall frame length of the requested MTU does not
2572 * surpass the NETDEV_DPDK_MAX_PKT_LEN. DPDK device drivers differ in how
2573 * the L2 frame length is calculated for a given MTU when
2574 * rte_eth_dev_set_mtu(mtu) is called e.g. i40e driver includes 2 x vlan
2575 * headers, the em driver includes 1 x vlan header, the ixgbe driver does
2576 * not include vlan headers. As such we should use
2577 * MTU_TO_MAX_FRAME_LEN(mtu) which includes an additional 2 x vlan headers
2578 * (8 bytes) for comparison. This avoids a failure later with
2579 * rte_eth_dev_set_mtu(). This approach should be used until DPDK provides
2580 * a method to retrieve the upper bound MTU for a given device.
2581 */
2582 if (MTU_TO_MAX_FRAME_LEN(mtu) > NETDEV_DPDK_MAX_PKT_LEN
0072e931
MK
2583 || mtu < ETHER_MIN_MTU) {
2584 VLOG_WARN("%s: unsupported MTU %d\n", dev->up.name, mtu);
2585 return EINVAL;
2586 }
2587
2588 ovs_mutex_lock(&dev->mutex);
2589 if (dev->requested_mtu != mtu) {
2590 dev->requested_mtu = mtu;
2591 netdev_request_reconfigure(netdev);
2592 }
2593 ovs_mutex_unlock(&dev->mutex);
2594
2595 return 0;
2596}
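/*
 * Worked example for the check above: counting a 14-byte Ethernet header,
 * a 4-byte CRC and the additional 8 bytes for two VLAN headers, an MTU of
 * m gives MTU_TO_MAX_FRAME_LEN(m) = m + 26, so the largest MTU accepted
 * here is NETDEV_DPDK_MAX_PKT_LEN - 26 (9702 for the 9728-byte maximum).
 */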
2597
8a9562d2 2598static int
d46285a2 2599netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
8a9562d2 2600
58397e6c
KT
2601static int
2602netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
2603 struct netdev_stats *stats)
2604{
2605 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2606
2607 ovs_mutex_lock(&dev->mutex);
58397e6c 2608
45d947c4 2609 rte_spinlock_lock(&dev->stats_lock);
58397e6c 2610 /* Supported Stats */
50986e78 2611 stats->rx_packets = dev->stats.rx_packets;
2612 stats->tx_packets = dev->stats.tx_packets;
9509913a 2613 stats->rx_dropped = dev->stats.rx_dropped;
50986e78 2614 stats->tx_dropped = dev->stats.tx_dropped;
9e3ddd45
TP
2615 stats->multicast = dev->stats.multicast;
2616 stats->rx_bytes = dev->stats.rx_bytes;
2617 stats->tx_bytes = dev->stats.tx_bytes;
2618 stats->rx_errors = dev->stats.rx_errors;
2619 stats->rx_length_errors = dev->stats.rx_length_errors;
d6e3feb5 2620
2621 stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
2622 stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;
2623 stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;
2624 stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;
2625 stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;
2626 stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;
2627 stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;
2628
45d947c4 2629 rte_spinlock_unlock(&dev->stats_lock);
9e3ddd45 2630
58397e6c
KT
2631 ovs_mutex_unlock(&dev->mutex);
2632
2633 return 0;
2634}
2635
d6e3feb5 2636static void
2637netdev_dpdk_convert_xstats(struct netdev_stats *stats,
0a0f39df
CL
2638 const struct rte_eth_xstat *xstats,
2639 const struct rte_eth_xstat_name *names,
d6e3feb5 2640 const unsigned int size)
2641{
d6e3feb5 2642 for (unsigned int i = 0; i < size; i++) {
0a0f39df 2643 if (strcmp(XSTAT_RX_64_PACKETS, names[i].name) == 0) {
d6e3feb5 2644 stats->rx_1_to_64_packets = xstats[i].value;
0a0f39df 2645 } else if (strcmp(XSTAT_RX_65_TO_127_PACKETS, names[i].name) == 0) {
d6e3feb5 2646 stats->rx_65_to_127_packets = xstats[i].value;
0a0f39df 2647 } else if (strcmp(XSTAT_RX_128_TO_255_PACKETS, names[i].name) == 0) {
d6e3feb5 2648 stats->rx_128_to_255_packets = xstats[i].value;
0a0f39df 2649 } else if (strcmp(XSTAT_RX_256_TO_511_PACKETS, names[i].name) == 0) {
d6e3feb5 2650 stats->rx_256_to_511_packets = xstats[i].value;
0a0f39df 2651 } else if (strcmp(XSTAT_RX_512_TO_1023_PACKETS, names[i].name) == 0) {
d6e3feb5 2652 stats->rx_512_to_1023_packets = xstats[i].value;
0a0f39df 2653 } else if (strcmp(XSTAT_RX_1024_TO_1522_PACKETS, names[i].name) == 0) {
d6e3feb5 2654 stats->rx_1024_to_1522_packets = xstats[i].value;
0a0f39df 2655 } else if (strcmp(XSTAT_RX_1523_TO_MAX_PACKETS, names[i].name) == 0) {
d6e3feb5 2656 stats->rx_1523_to_max_packets = xstats[i].value;
0a0f39df 2657 } else if (strcmp(XSTAT_TX_64_PACKETS, names[i].name) == 0) {
d6e3feb5 2658 stats->tx_1_to_64_packets = xstats[i].value;
0a0f39df 2659 } else if (strcmp(XSTAT_TX_65_TO_127_PACKETS, names[i].name) == 0) {
d6e3feb5 2660 stats->tx_65_to_127_packets = xstats[i].value;
0a0f39df 2661 } else if (strcmp(XSTAT_TX_128_TO_255_PACKETS, names[i].name) == 0) {
d6e3feb5 2662 stats->tx_128_to_255_packets = xstats[i].value;
0a0f39df 2663 } else if (strcmp(XSTAT_TX_256_TO_511_PACKETS, names[i].name) == 0) {
d6e3feb5 2664 stats->tx_256_to_511_packets = xstats[i].value;
0a0f39df 2665 } else if (strcmp(XSTAT_TX_512_TO_1023_PACKETS, names[i].name) == 0) {
d6e3feb5 2666 stats->tx_512_to_1023_packets = xstats[i].value;
0a0f39df 2667 } else if (strcmp(XSTAT_TX_1024_TO_1522_PACKETS, names[i].name) == 0) {
d6e3feb5 2668 stats->tx_1024_to_1522_packets = xstats[i].value;
0a0f39df 2669 } else if (strcmp(XSTAT_TX_1523_TO_MAX_PACKETS, names[i].name) == 0) {
d6e3feb5 2670 stats->tx_1523_to_max_packets = xstats[i].value;
d57f777f
PS
2671 } else if (strcmp(XSTAT_RX_MULTICAST_PACKETS, names[i].name) == 0) {
2672 stats->multicast = xstats[i].value;
0a0f39df 2673 } else if (strcmp(XSTAT_TX_MULTICAST_PACKETS, names[i].name) == 0) {
d6e3feb5 2674 stats->tx_multicast_packets = xstats[i].value;
0a0f39df 2675 } else if (strcmp(XSTAT_RX_BROADCAST_PACKETS, names[i].name) == 0) {
d6e3feb5 2676 stats->rx_broadcast_packets = xstats[i].value;
0a0f39df 2677 } else if (strcmp(XSTAT_TX_BROADCAST_PACKETS, names[i].name) == 0) {
d6e3feb5 2678 stats->tx_broadcast_packets = xstats[i].value;
0a0f39df 2679 } else if (strcmp(XSTAT_RX_UNDERSIZED_ERRORS, names[i].name) == 0) {
d6e3feb5 2680 stats->rx_undersized_errors = xstats[i].value;
0a0f39df 2681 } else if (strcmp(XSTAT_RX_FRAGMENTED_ERRORS, names[i].name) == 0) {
d6e3feb5 2682 stats->rx_fragmented_errors = xstats[i].value;
0a0f39df 2683 } else if (strcmp(XSTAT_RX_JABBER_ERRORS, names[i].name) == 0) {
d6e3feb5 2684 stats->rx_jabber_errors = xstats[i].value;
2685 }
2686 }
2687}
2688
8a9562d2
PS
2689static int
2690netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
2691{
2692 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2693 struct rte_eth_stats rte_stats;
2694 bool gg;
2695
2696 netdev_dpdk_get_carrier(netdev, &gg);
2697 ovs_mutex_lock(&dev->mutex);
8a9562d2 2698
0a0f39df
CL
2699 struct rte_eth_xstat *rte_xstats = NULL;
2700 struct rte_eth_xstat_name *rte_xstats_names = NULL;
2701 int rte_xstats_len, rte_xstats_new_len, rte_xstats_ret;
d6e3feb5 2702
2703 if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
fa9f4eeb
IM
2704 VLOG_ERR("Can't get ETH statistics for port: "DPDK_PORT_ID_FMT,
2705 dev->port_id);
f9256822 2706 ovs_mutex_unlock(&dev->mutex);
d6e3feb5 2707 return EPROTO;
2708 }
2709
0a0f39df
CL
2710 /* Get length of statistics */
2711 rte_xstats_len = rte_eth_xstats_get_names(dev->port_id, NULL, 0);
2712 if (rte_xstats_len < 0) {
fa9f4eeb
IM
2713 VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
2714 dev->port_id);
0a0f39df
CL
2715 goto out;
2716 }
2717 /* Reserve memory for xstats names and values */
2718 rte_xstats_names = xcalloc(rte_xstats_len, sizeof *rte_xstats_names);
2719 rte_xstats = xcalloc(rte_xstats_len, sizeof *rte_xstats);
2720
 2721 /* Retrieve xstats names */
2722 rte_xstats_new_len = rte_eth_xstats_get_names(dev->port_id,
2723 rte_xstats_names,
2724 rte_xstats_len);
2725 if (rte_xstats_new_len != rte_xstats_len) {
fa9f4eeb
IM
2726 VLOG_WARN("Cannot get XSTATS names for port: "DPDK_PORT_ID_FMT,
2727 dev->port_id);
0a0f39df
CL
2728 goto out;
2729 }
 2730 /* Retrieve xstats values */
2731 memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);
2732 rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
2733 rte_xstats_len);
2734 if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
2735 netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_names,
2736 rte_xstats_len);
d6e3feb5 2737 } else {
fa9f4eeb
IM
2738 VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
2739 dev->port_id);
d6e3feb5 2740 }
8a9562d2 2741
0a0f39df
CL
2742out:
2743 free(rte_xstats);
2744 free(rte_xstats_names);
2745
2f9dd77f
PS
2746 stats->rx_packets = rte_stats.ipackets;
2747 stats->tx_packets = rte_stats.opackets;
2748 stats->rx_bytes = rte_stats.ibytes;
2749 stats->tx_bytes = rte_stats.obytes;
21e9844c 2750 stats->rx_errors = rte_stats.ierrors;
2f9dd77f 2751 stats->tx_errors = rte_stats.oerrors;
8a9562d2 2752
45d947c4 2753 rte_spinlock_lock(&dev->stats_lock);
2f9dd77f 2754 stats->tx_dropped = dev->stats.tx_dropped;
9509913a 2755 stats->rx_dropped = dev->stats.rx_dropped;
45d947c4 2756 rte_spinlock_unlock(&dev->stats_lock);
9e3ddd45
TP
2757
2758 /* These are the available DPDK counters for packets not received due to
2759 * local resource constraints in DPDK and NIC respectively. */
9509913a 2760 stats->rx_dropped += rte_stats.rx_nombuf + rte_stats.imissed;
9e3ddd45
TP
2761 stats->rx_missed_errors = rte_stats.imissed;
2762
8a9562d2
PS
2763 ovs_mutex_unlock(&dev->mutex);
2764
2765 return 0;
2766}
2767
971f4b39
MW
2768static int
2769netdev_dpdk_get_custom_stats(const struct netdev *netdev,
2770 struct netdev_custom_stats *custom_stats)
2771{
2772
2773 uint32_t i;
2774 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2775 int rte_xstats_ret;
2776
2777 ovs_mutex_lock(&dev->mutex);
2778
2779 if (netdev_dpdk_configure_xstats(dev)) {
2780 uint64_t *values = xcalloc(dev->rte_xstats_ids_size,
2781 sizeof(uint64_t));
2782
2783 rte_xstats_ret =
2784 rte_eth_xstats_get_by_id(dev->port_id, dev->rte_xstats_ids,
2785 values, dev->rte_xstats_ids_size);
2786
2787 if (rte_xstats_ret > 0 &&
2788 rte_xstats_ret <= dev->rte_xstats_ids_size) {
2789
2790 custom_stats->size = rte_xstats_ret;
2791 custom_stats->counters =
2792 (struct netdev_custom_counter *) xcalloc(rte_xstats_ret,
2793 sizeof(struct netdev_custom_counter));
2794
2795 for (i = 0; i < rte_xstats_ret; i++) {
2796 ovs_strlcpy(custom_stats->counters[i].name,
2797 netdev_dpdk_get_xstat_name(dev,
2798 dev->rte_xstats_ids[i]),
2799 NETDEV_CUSTOM_STATS_NAME_SIZE);
2800 custom_stats->counters[i].value = values[i];
2801 }
2802 } else {
fa9f4eeb 2803 VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
971f4b39
MW
2804 dev->port_id);
2805 custom_stats->counters = NULL;
2806 custom_stats->size = 0;
 2807 /* Clear the statistics cache so that it will be
 2808 * reconfigured. */
2809 netdev_dpdk_clear_xstats(dev);
2810 }
526259f2
IM
2811
2812 free(values);
971f4b39
MW
2813 }
2814
2815 ovs_mutex_unlock(&dev->mutex);
2816
2817 return 0;
2818}
2819
8a9562d2 2820static int
d46285a2 2821netdev_dpdk_get_features(const struct netdev *netdev,
8a9562d2 2822 enum netdev_features *current,
ca3d4f55
BX
2823 enum netdev_features *advertised,
2824 enum netdev_features *supported,
2825 enum netdev_features *peer)
8a9562d2 2826{
d46285a2 2827 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 2828 struct rte_eth_link link;
dfcb5b8a 2829 uint32_t feature = 0;
8a9562d2
PS
2830
2831 ovs_mutex_lock(&dev->mutex);
2832 link = dev->link;
2833 ovs_mutex_unlock(&dev->mutex);
2834
dfcb5b8a
IS
2835 /* Match against OpenFlow defined link speed values. */
2836 if (link.link_duplex == ETH_LINK_FULL_DUPLEX) {
2837 switch (link.link_speed) {
2838 case ETH_SPEED_NUM_10M:
2839 feature |= NETDEV_F_10MB_FD;
2840 break;
2841 case ETH_SPEED_NUM_100M:
2842 feature |= NETDEV_F_100MB_FD;
2843 break;
2844 case ETH_SPEED_NUM_1G:
2845 feature |= NETDEV_F_1GB_FD;
2846 break;
2847 case ETH_SPEED_NUM_10G:
2848 feature |= NETDEV_F_10GB_FD;
2849 break;
2850 case ETH_SPEED_NUM_40G:
2851 feature |= NETDEV_F_40GB_FD;
2852 break;
2853 case ETH_SPEED_NUM_100G:
2854 feature |= NETDEV_F_100GB_FD;
2855 break;
2856 default:
2857 feature |= NETDEV_F_OTHER;
8a9562d2 2858 }
dfcb5b8a
IS
2859 } else if (link.link_duplex == ETH_LINK_HALF_DUPLEX) {
2860 switch (link.link_speed) {
2861 case ETH_SPEED_NUM_10M:
2862 feature |= NETDEV_F_10MB_HD;
2863 break;
2864 case ETH_SPEED_NUM_100M:
2865 feature |= NETDEV_F_100MB_HD;
2866 break;
2867 case ETH_SPEED_NUM_1G:
2868 feature |= NETDEV_F_1GB_HD;
2869 break;
2870 default:
2871 feature |= NETDEV_F_OTHER;
74cd69a4 2872 }
8a9562d2
PS
2873 }
2874
362ca396 2875 if (link.link_autoneg) {
dfcb5b8a 2876 feature |= NETDEV_F_AUTONEG;
362ca396 2877 }
2878
dfcb5b8a 2879 *current = feature;
ca3d4f55
BX
2880 *advertised = *supported = *peer = 0;
2881
8a9562d2
PS
2882 return 0;
2883}
2884
9509913a
IS
2885static struct ingress_policer *
2886netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
2887{
2888 struct ingress_policer *policer = NULL;
2889 uint64_t rate_bytes;
2890 uint64_t burst_bytes;
2891 int err = 0;
2892
2893 policer = xmalloc(sizeof *policer);
2894 rte_spinlock_init(&policer->policer_lock);
2895
2896 /* rte_meter requires bytes so convert kbits rate and burst to bytes. */
602c8668
LR
2897 rate_bytes = rate * 1000ULL / 8;
2898 burst_bytes = burst * 1000ULL / 8;
9509913a
IS
2899
2900 policer->app_srtcm_params.cir = rate_bytes;
2901 policer->app_srtcm_params.cbs = burst_bytes;
2902 policer->app_srtcm_params.ebs = 0;
03f3f9c0
OM
2903 err = rte_meter_srtcm_profile_config(&policer->in_prof,
2904 &policer->app_srtcm_params);
2905 if (!err) {
2906 err = rte_meter_srtcm_config(&policer->in_policer,
2907 &policer->in_prof);
2908 }
58be5c0e 2909 if (err) {
9509913a 2910 VLOG_ERR("Could not create rte meter for ingress policer");
4c47ddde 2911 free(policer);
9509913a
IS
2912 return NULL;
2913 }
2914
2915 return policer;
2916}
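/*
 * Worked example for the kbits-to-bytes conversion above: rate=10000 and
 * burst=8000 (both in kbits) become cir = 10000 * 1000 / 8 = 1,250,000
 * bytes/s and cbs = 8000 * 1000 / 8 = 1,000,000 bytes.
 */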
2917
2918static int
2919netdev_dpdk_set_policing(struct netdev* netdev, uint32_t policer_rate,
2920 uint32_t policer_burst)
2921{
2922 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2923 struct ingress_policer *policer;
2924
2925 /* Force to 0 if no rate specified,
2926 * default to 8000 kbits if burst is 0,
2927 * else stick with user-specified value.
2928 */
2929 policer_burst = (!policer_rate ? 0
2930 : !policer_burst ? 8000
2931 : policer_burst);
2932
2933 ovs_mutex_lock(&dev->mutex);
2934
2935 policer = ovsrcu_get_protected(struct ingress_policer *,
2936 &dev->ingress_policer);
2937
2938 if (dev->policer_rate == policer_rate &&
2939 dev->policer_burst == policer_burst) {
2940 /* Assume that settings haven't changed since we last set them. */
2941 ovs_mutex_unlock(&dev->mutex);
2942 return 0;
2943 }
2944
2945 /* Destroy any existing ingress policer for the device if one exists */
2946 if (policer) {
2947 ovsrcu_postpone(free, policer);
2948 }
2949
2950 if (policer_rate != 0) {
2951 policer = netdev_dpdk_policer_construct(policer_rate, policer_burst);
2952 } else {
2953 policer = NULL;
2954 }
2955 ovsrcu_set(&dev->ingress_policer, policer);
2956 dev->policer_rate = policer_rate;
2957 dev->policer_burst = policer_burst;
2958 ovs_mutex_unlock(&dev->mutex);
2959
2960 return 0;
2961}
2962
8a9562d2
PS
2963static int
2964netdev_dpdk_get_ifindex(const struct netdev *netdev)
2965{
2966 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
2967
2968 ovs_mutex_lock(&dev->mutex);
12d0d124
PL
2969 /* Calculate hash from the netdev name. Ensure that ifindex is a 24-bit
 2970 * positive integer to meet RFC 2863 recommendations.
2971 */
2972 int ifindex = hash_string(netdev->name, 0) % 0xfffffe + 1;
8a9562d2
PS
2973 ovs_mutex_unlock(&dev->mutex);
2974
2975 return ifindex;
2976}
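/*
 * Sketch of the calculation above: hash_string() yields a 32-bit hash,
 * "% 0xfffffe" maps it into [0, 0xfffffd], and the "+ 1" shifts it to
 * [1, 0xfffffe], i.e. a strictly positive value that fits in 24 bits.
 */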
2977
2978static int
d46285a2 2979netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
8a9562d2 2980{
d46285a2 2981 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
2982
2983 ovs_mutex_lock(&dev->mutex);
2984 check_link_status(dev);
2985 *carrier = dev->link.link_status;
58397e6c
KT
2986
2987 ovs_mutex_unlock(&dev->mutex);
2988
2989 return 0;
2990}
2991
2992static int
d46285a2 2993netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
58397e6c 2994{
d46285a2 2995 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
58397e6c
KT
2996
2997 ovs_mutex_lock(&dev->mutex);
2998
0a0f39df 2999 if (is_vhost_running(dev)) {
58397e6c
KT
3000 *carrier = 1;
3001 } else {
3002 *carrier = 0;
3003 }
3004
8a9562d2
PS
3005 ovs_mutex_unlock(&dev->mutex);
3006
3007 return 0;
3008}
3009
3010static long long int
d46285a2 3011netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
8a9562d2 3012{
d46285a2 3013 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
3014 long long int carrier_resets;
3015
3016 ovs_mutex_lock(&dev->mutex);
3017 carrier_resets = dev->link_reset_cnt;
3018 ovs_mutex_unlock(&dev->mutex);
3019
3020 return carrier_resets;
3021}
3022
3023static int
d46285a2 3024netdev_dpdk_set_miimon(struct netdev *netdev OVS_UNUSED,
8a9562d2
PS
3025 long long int interval OVS_UNUSED)
3026{
ee32150e 3027 return EOPNOTSUPP;
8a9562d2
PS
3028}
3029
3030static int
3031netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
3032 enum netdev_flags off, enum netdev_flags on,
64839cf4
WT
3033 enum netdev_flags *old_flagsp)
3034 OVS_REQUIRES(dev->mutex)
8a9562d2 3035{
8a9562d2
PS
3036 if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
3037 return EINVAL;
3038 }
3039
3040 *old_flagsp = dev->flags;
3041 dev->flags |= on;
3042 dev->flags &= ~off;
3043
3044 if (dev->flags == *old_flagsp) {
3045 return 0;
3046 }
3047
58397e6c 3048 if (dev->type == DPDK_DEV_ETH) {
2d37de73
EC
3049
3050 if ((dev->flags ^ *old_flagsp) & NETDEV_UP) {
3051 int err;
3052
3053 if (dev->flags & NETDEV_UP) {
3054 err = rte_eth_dev_set_link_up(dev->port_id);
3055 } else {
3056 err = rte_eth_dev_set_link_down(dev->port_id);
3057 }
3058 if (err == -ENOTSUP) {
3059 VLOG_INFO("Interface %s does not support link state "
3060 "configuration", netdev_get_name(&dev->up));
3061 } else if (err < 0) {
3062 VLOG_ERR("Interface %s link change error: %s",
3063 netdev_get_name(&dev->up), rte_strerror(-err));
3064 dev->flags = *old_flagsp;
3065 return -err;
3066 }
3067 }
3068
58397e6c
KT
3069 if (dev->flags & NETDEV_PROMISC) {
3070 rte_eth_promiscuous_enable(dev->port_id);
3071 }
8a9562d2 3072
314fb5ad 3073 netdev_change_seq_changed(&dev->up);
e543851d
ZB
3074 } else {
3075 /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is
3076 * running then change netdev's change_seq to trigger link state
3077 * update. */
e543851d
ZB
3078
3079 if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
0a0f39df 3080 && is_vhost_running(dev)) {
e543851d
ZB
3081 netdev_change_seq_changed(&dev->up);
3082
3083 /* Clear statistics if device is getting up. */
3084 if (NETDEV_UP & on) {
3085 rte_spinlock_lock(&dev->stats_lock);
58be5c0e 3086 memset(&dev->stats, 0, sizeof dev->stats);
e543851d
ZB
3087 rte_spinlock_unlock(&dev->stats_lock);
3088 }
3089 }
8a9562d2
PS
3090 }
3091
3092 return 0;
3093}
3094
3095static int
d46285a2 3096netdev_dpdk_update_flags(struct netdev *netdev,
8a9562d2
PS
3097 enum netdev_flags off, enum netdev_flags on,
3098 enum netdev_flags *old_flagsp)
3099{
d46285a2 3100 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
3101 int error;
3102
d46285a2
DDP
3103 ovs_mutex_lock(&dev->mutex);
3104 error = netdev_dpdk_update_flags__(dev, off, on, old_flagsp);
3105 ovs_mutex_unlock(&dev->mutex);
8a9562d2
PS
3106
3107 return error;
3108}
3109
b2e8b12f
FL
3110static int
3111netdev_dpdk_vhost_user_get_status(const struct netdev *netdev,
3112 struct smap *args)
3113{
3114 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3115
3116 ovs_mutex_lock(&dev->mutex);
3117
3118 bool client_mode = dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT;
3119 smap_add_format(args, "mode", "%s", client_mode ? "client" : "server");
3120
3121 int vid = netdev_dpdk_get_vid(dev);
3122 if (vid < 0) {
3123 smap_add_format(args, "status", "disconnected");
3124 ovs_mutex_unlock(&dev->mutex);
3125 return 0;
3126 } else {
3127 smap_add_format(args, "status", "connected");
3128 }
3129
3130 char socket_name[PATH_MAX];
3131 if (!rte_vhost_get_ifname(vid, socket_name, PATH_MAX)) {
3132 smap_add_format(args, "socket", "%s", socket_name);
3133 }
3134
3135 uint64_t features;
3136 if (!rte_vhost_get_negotiated_features(vid, &features)) {
3137 smap_add_format(args, "features", "0x%016"PRIx64, features);
3138 }
3139
3140 uint16_t mtu;
3141 if (!rte_vhost_get_mtu(vid, &mtu)) {
3142 smap_add_format(args, "mtu", "%d", mtu);
3143 }
3144
3145 int numa = rte_vhost_get_numa_node(vid);
3146 if (numa >= 0) {
3147 smap_add_format(args, "numa", "%d", numa);
3148 }
3149
3150 uint16_t vring_num = rte_vhost_get_vring_num(vid);
3151 if (vring_num) {
3152 smap_add_format(args, "num_of_vrings", "%d", vring_num);
3153 }
3154
3155 for (int i = 0; i < vring_num; i++) {
3156 struct rte_vhost_vring vring;
b2e8b12f
FL
3157
3158 rte_vhost_get_vhost_vring(vid, i, &vring);
b9a3183d
AC
3159 smap_add_nocopy(args, xasprintf("vring_%d_size", i),
3160 xasprintf("%d", vring.size));
b2e8b12f
FL
3161 }
3162
3163 ovs_mutex_unlock(&dev->mutex);
3164 return 0;
3165}
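/*
 * Example of the status map assembled above (all values invented):
 *
 *     mode=client status=connected socket=/tmp/vhost-user-0
 *     features=0x0000000150208182 mtu=1500 numa=0 num_of_vrings=2
 *     vring_0_size=256 vring_1_size=256
 */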
3166
31154f95
IS
3167/*
3168 * Convert a given uint32_t link speed defined in DPDK to a string
3169 * equivalent.
3170 */
3171static const char *
3172netdev_dpdk_link_speed_to_str__(uint32_t link_speed)
3173{
3174 switch (link_speed) {
3175 case ETH_SPEED_NUM_10M: return "10Mbps";
3176 case ETH_SPEED_NUM_100M: return "100Mbps";
3177 case ETH_SPEED_NUM_1G: return "1Gbps";
3178 case ETH_SPEED_NUM_2_5G: return "2.5Gbps";
3179 case ETH_SPEED_NUM_5G: return "5Gbps";
3180 case ETH_SPEED_NUM_10G: return "10Gbps";
3181 case ETH_SPEED_NUM_20G: return "20Gbps";
3182 case ETH_SPEED_NUM_25G: return "25Gbps";
3183 case ETH_SPEED_NUM_40G: return "40Gbps";
3184 case ETH_SPEED_NUM_50G: return "50Gbps";
3185 case ETH_SPEED_NUM_56G: return "56Gbps";
3186 case ETH_SPEED_NUM_100G: return "100Gbps";
3187 default: return "Not Defined";
3188 }
3189}
3190
8a9562d2 3191static int
d46285a2 3192netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
8a9562d2 3193{
d46285a2 3194 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 3195 struct rte_eth_dev_info dev_info;
31154f95 3196 uint32_t link_speed;
8a9562d2 3197
7cd1261d 3198 if (!rte_eth_dev_is_valid_port(dev->port_id)) {
8a9562d2 3199 return ENODEV;
7cd1261d 3200 }
8a9562d2 3201
03f3f9c0 3202 ovs_mutex_lock(&dpdk_mutex);
8a9562d2
PS
3203 ovs_mutex_lock(&dev->mutex);
3204 rte_eth_dev_info_get(dev->port_id, &dev_info);
31154f95 3205 link_speed = dev->link.link_speed;
8a9562d2 3206 ovs_mutex_unlock(&dev->mutex);
03f3f9c0
OM
3207 const struct rte_bus *bus;
3208 const struct rte_pci_device *pci_dev;
3209 uint16_t vendor_id = PCI_ANY_ID;
3210 uint16_t device_id = PCI_ANY_ID;
3211 bus = rte_bus_find_by_device(dev_info.device);
3212 if (bus && !strcmp(bus->name, "pci")) {
3213 pci_dev = RTE_DEV_TO_PCI(dev_info.device);
3214 if (pci_dev) {
3215 vendor_id = pci_dev->id.vendor_id;
3216 device_id = pci_dev->id.device_id;
3217 }
3218 }
3219 ovs_mutex_unlock(&dpdk_mutex);
8a9562d2 3220
fa9f4eeb 3221 smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id);
58be5c0e
MK
3222 smap_add_format(args, "numa_id", "%d",
3223 rte_eth_dev_socket_id(dev->port_id));
8a9562d2
PS
3224 smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
3225 smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
4be4d22c 3226 smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
8a9562d2
PS
3227 smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
3228 smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
3229 smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
58be5c0e
MK
3230 smap_add_format(args, "max_hash_mac_addrs", "%u",
3231 dev_info.max_hash_mac_addrs);
8a9562d2
PS
3232 smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
3233 smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);
3234
3eb8d4fa
MW
3235 /* Querying the DPDK library for iftype may be done in future, pending
3236 * support; cf. RFC 3635 Section 3.2.4. */
3237 enum { IF_TYPE_ETHERNETCSMACD = 6 };
3238
3239 smap_add_format(args, "if_type", "%"PRIu32, IF_TYPE_ETHERNETCSMACD);
3240 smap_add_format(args, "if_descr", "%s %s", rte_version(),
3241 dev_info.driver_name);
03f3f9c0
OM
3242 smap_add_format(args, "pci-vendor_id", "0x%x", vendor_id);
3243 smap_add_format(args, "pci-device_id", "0x%x", device_id);
8a9562d2 3244
31154f95
IS
3245 /* Not all link speeds are defined in the OpenFlow specs e.g. 25 Gbps.
3246 * In that case the speed will not be reported as part of the usual
3247 * call to get_features(). Get the link speed of the device and add it
3248 * to the device status in an easy to read string format.
3249 */
3250 smap_add(args, "link_speed",
3251 netdev_dpdk_link_speed_to_str__(link_speed));
3252
8a9562d2
PS
3253 return 0;
3254}
3255
3256static void
3257netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
3258 OVS_REQUIRES(dev->mutex)
3259{
3260 enum netdev_flags old_flags;
3261
3262 if (admin_state) {
3263 netdev_dpdk_update_flags__(dev, 0, NETDEV_UP, &old_flags);
3264 } else {
3265 netdev_dpdk_update_flags__(dev, NETDEV_UP, 0, &old_flags);
3266 }
3267}
3268
3269static void
3270netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
3271 const char *argv[], void *aux OVS_UNUSED)
3272{
3273 bool up;
3274
3275 if (!strcasecmp(argv[argc - 1], "up")) {
3276 up = true;
3277 } else if ( !strcasecmp(argv[argc - 1], "down")) {
3278 up = false;
3279 } else {
3280 unixctl_command_reply_error(conn, "Invalid Admin State");
3281 return;
3282 }
3283
3284 if (argc > 2) {
3285 struct netdev *netdev = netdev_from_name(argv[1]);
3d0d5ab1 3286
8a9562d2 3287 if (netdev && is_dpdk_class(netdev->netdev_class)) {
3d0d5ab1 3288 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 3289
3290 ovs_mutex_lock(&dev->mutex);
3291 netdev_dpdk_set_admin_state__(dev, up);
3292 ovs_mutex_unlock(&dev->mutex);
3293
3294 netdev_close(netdev);
3295 } else {
3296 unixctl_command_reply_error(conn, "Not a DPDK Interface");
3297 netdev_close(netdev);
3298 return;
3299 }
3300 } else {
3d0d5ab1 3301 struct netdev_dpdk *dev;
3302
3303 ovs_mutex_lock(&dpdk_mutex);
3304 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
3305 ovs_mutex_lock(&dev->mutex);
3306 netdev_dpdk_set_admin_state__(dev, up);
3307 ovs_mutex_unlock(&dev->mutex);
3308 }
3309 ovs_mutex_unlock(&dpdk_mutex);
3310 }
3311 unixctl_command_reply(conn, "OK");
3312}
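/* Illustrative use of the "netdev-dpdk/set-admin-state" unixctl command that
 * netdev_dpdk_class_init() registers below; the port name "dpdk0" is only an
 * example:
 *
 *     ovs-appctl netdev-dpdk/set-admin-state dpdk0 down
 *     ovs-appctl netdev-dpdk/set-admin-state up    (applies to all DPDK ports)
 */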
3313
3314static void
3315netdev_dpdk_detach(struct unixctl_conn *conn, int argc OVS_UNUSED,
3316 const char *argv[], void *aux OVS_UNUSED)
3317{
0ee821c2 3318 char *response;
7ee94cba 3319 dpdk_port_t port_id;
0ee821c2 3320 struct netdev_dpdk *dev;
3321 struct rte_device *rte_dev;
3322 struct ds used_interfaces = DS_EMPTY_INITIALIZER;
3323 bool used = false;
3324
3325 ovs_mutex_lock(&dpdk_mutex);
3326
3327 port_id = netdev_dpdk_get_port_by_devargs(argv[1]);
3328 if (!rte_eth_dev_is_valid_port(port_id)) {
3329 response = xasprintf("Device '%s' not found in DPDK", argv[1]);
3330 goto error;
3331 }
3332
3333 rte_dev = rte_eth_devices[port_id].device;
3334 ds_put_format(&used_interfaces,
3335 "Device '%s' is being used by the following interfaces:",
3336 argv[1]);
3337
3338 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
3339 /* FIXME: avoid direct access to DPDK array rte_eth_devices. */
3340 if (rte_eth_devices[dev->port_id].device == rte_dev
3341 && rte_eth_devices[dev->port_id].state != RTE_ETH_DEV_UNUSED) {
3342 used = true;
3343 ds_put_format(&used_interfaces, " %s",
3344 netdev_get_name(&dev->up));
3345 }
3346 }
3347
3348 if (used) {
3349 ds_put_cstr(&used_interfaces, ". Remove them before detaching.");
3350 response = ds_steal_cstr(&used_interfaces);
3351 ds_destroy(&used_interfaces);
3352 goto error;
3353 }
40e940e4 3354 ds_destroy(&used_interfaces);
3355
3356 rte_eth_dev_close(port_id);
40e940e4 3357 if (rte_dev_remove(rte_dev) < 0) {
3358 response = xasprintf("Device '%s' can not be detached", argv[1]);
3359 goto error;
3360 }
3361
3362 response = xasprintf("All devices shared with device '%s' "
3363 "have been detached", argv[1]);
3364
3365 ovs_mutex_unlock(&dpdk_mutex);
3366 unixctl_command_reply(conn, response);
3367 free(response);
3368 return;
3369
3370error:
3371 ovs_mutex_unlock(&dpdk_mutex);
3372 unixctl_command_reply_error(conn, response);
3373 free(response);
3374}
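/* Illustrative use of the "netdev-dpdk/detach" unixctl command registered in
 * netdev_dpdk_class_init() below; the PCI address is an example only and the
 * device must no longer be attached to any interface:
 *
 *     ovs-appctl netdev-dpdk/detach 0000:01:00.0
 */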
3375
3376static void
3377netdev_dpdk_get_mempool_info(struct unixctl_conn *conn,
3378 int argc, const char *argv[],
3379 void *aux OVS_UNUSED)
3380{
3381 size_t size;
3382 FILE *stream;
3383 char *response = NULL;
3384 struct netdev *netdev = NULL;
3385
3386 if (argc == 2) {
3387 netdev = netdev_from_name(argv[1]);
3388 if (!netdev || !is_dpdk_class(netdev->netdev_class)) {
3389 unixctl_command_reply_error(conn, "Not a DPDK Interface");
3390 goto out;
3391 }
3392 }
3393
3394 stream = open_memstream(&response, &size);
3395 if (!stream) {
3396 response = xasprintf("Unable to open memstream: %s.",
3397 ovs_strerror(errno));
3398 unixctl_command_reply_error(conn, response);
3399 goto out;
3400 }
3401
3402 if (netdev) {
3403 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3404
3405 ovs_mutex_lock(&dev->mutex);
3406 ovs_mutex_lock(&dpdk_mp_mutex);
3407
43307ad0 3408 rte_mempool_dump(stream, dev->dpdk_mp->mp);
3409
3410 ovs_mutex_unlock(&dpdk_mp_mutex);
3411 ovs_mutex_unlock(&dev->mutex);
3412 } else {
3413 ovs_mutex_lock(&dpdk_mp_mutex);
3414 rte_mempool_list_dump(stream);
3415 ovs_mutex_unlock(&dpdk_mp_mutex);
3416 }
3417
3418 fclose(stream);
3419
3420 unixctl_command_reply(conn, response);
3421out:
3422 free(response);
3423 netdev_close(netdev);
3424}
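/* Illustrative use of the "netdev-dpdk/get-mempool-info" unixctl command
 * registered in netdev_dpdk_class_init() below; "dpdk0" is an example name,
 * and with no argument every DPDK mempool is dumped:
 *
 *     ovs-appctl netdev-dpdk/get-mempool-info dpdk0
 */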
3425
3426/*
3427 * Set virtqueue flags so that we do not receive interrupts.
3428 */
3429static void
0a0f39df 3430set_irq_status(int vid)
58397e6c 3431{
4573fbd3 3432 uint32_t i;
4573fbd3 3433
3434 for (i = 0; i < rte_vhost_get_vring_num(vid); i++) {
3435 rte_vhost_enable_guest_notification(vid, i, 0);
3436 }
3437}
3438
3439/*
3440 * Fixes mapping for vhost-user tx queues. Must be called after each
81acebda 3441 * enabling/disabling of queues and n_txq modifications.
3442 */
3443static void
3444netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
3445 OVS_REQUIRES(dev->mutex)
3446{
3447 int *enabled_queues, n_enabled = 0;
81acebda 3448 int i, k, total_txqs = dev->up.n_txq;
585a5bea 3449
eff23640 3450 enabled_queues = xcalloc(total_txqs, sizeof *enabled_queues);
3451
3452 for (i = 0; i < total_txqs; i++) {
3453 /* Enabled queues always mapped to themselves. */
d46285a2 3454 if (dev->tx_q[i].map == i) {
3455 enabled_queues[n_enabled++] = i;
3456 }
3457 }
3458
3459 if (n_enabled == 0 && total_txqs != 0) {
f3ea2ad2 3460 enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
3461 n_enabled = 1;
3462 }
3463
3464 k = 0;
3465 for (i = 0; i < total_txqs; i++) {
3466 if (dev->tx_q[i].map != i) {
3467 dev->tx_q[i].map = enabled_queues[k];
3468 k = (k + 1) % n_enabled;
3469 }
3470 }
3471
3472 if (VLOG_IS_DBG_ENABLED()) {
3473 struct ds mapping = DS_EMPTY_INITIALIZER;
3474
3475 ds_put_format(&mapping, "TX queue mapping for port '%s':\n",
3476 netdev_get_name(&dev->up));
3477 for (i = 0; i < total_txqs; i++) {
3478 ds_put_format(&mapping, "%2d --> %2d\n", i, dev->tx_q[i].map);
3479 }
3480
3481 VLOG_DBG("%s", ds_cstr(&mapping));
3482 ds_destroy(&mapping);
3483 }
3484
eff23640 3485 free(enabled_queues);
585a5bea 3486}
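/* Worked example of the remapping above (illustrative): with n_txq == 4 and
 * only queues 0 and 2 enabled by the guest, the enabled queues are handed out
 * round-robin to the disabled ones, giving the mapping
 *
 *     0 --> 0,  1 --> 0,  2 --> 2,  3 --> 2.
 *
 * If no queue is enabled at all, every entry ends up as
 * OVS_VHOST_QUEUE_DISABLED. */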
4573fbd3 3487
3488/*
3489 * A new virtio-net device is added to a vhost port.
3490 */
3491static int
0a0f39df 3492new_device(int vid)
58397e6c 3493{
d46285a2 3494 struct netdev_dpdk *dev;
58397e6c 3495 bool exists = false;
db8f13b0 3496 int newnode = 0;
3497 char ifname[IF_NAME_SZ];
3498
58be5c0e 3499 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
3500
3501 ovs_mutex_lock(&dpdk_mutex);
3502 /* Add device to the vhost port with the same name as that passed down. */
d46285a2 3503 LIST_FOR_EACH(dev, list_node, &dpdk_list) {
c1ff66ac 3504 ovs_mutex_lock(&dev->mutex);
3505 if (nullable_string_is_equal(ifname, dev->vhost_id)) {
3506 uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM;
3507
3508 /* Get NUMA information */
3509 newnode = rte_vhost_get_numa_node(vid);
3510 if (newnode == -1) {
5b9bf9e0 3511#ifdef VHOST_NUMA
db8f13b0 3512 VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
0a0f39df 3513 ifname);
5b9bf9e0 3514#endif
db8f13b0 3515 newnode = dev->socket_id;
3516 }
3517
3518 if (dev->requested_n_txq < qp_num
3519 || dev->requested_n_rxq < qp_num
3520 || dev->requested_socket_id != newnode) {
3521 dev->requested_socket_id = newnode;
3522 dev->requested_n_rxq = qp_num;
3523 dev->requested_n_txq = qp_num;
3524 netdev_request_reconfigure(&dev->up);
3525 } else {
3526 /* Reconfiguration not required. */
3527 dev->vhost_reconfigured = true;
3528 }
81acebda 3529
0a0f39df 3530 ovsrcu_index_set(&dev->vid, vid);
3531 exists = true;
3532
58397e6c 3533 /* Disable notifications. */
0a0f39df 3534 set_irq_status(vid);
e543851d 3535 netdev_change_seq_changed(&dev->up);
d46285a2 3536 ovs_mutex_unlock(&dev->mutex);
3537 break;
3538 }
c1ff66ac 3539 ovs_mutex_unlock(&dev->mutex);
3540 }
3541 ovs_mutex_unlock(&dpdk_mutex);
3542
3543 if (!exists) {
0a0f39df 3544 VLOG_INFO("vHost Device '%s' can't be added - name not found", ifname);
3545
3546 return -1;
3547 }
3548
3549 VLOG_INFO("vHost Device '%s' has been added on numa node %i",
3550 ifname, newnode);
3551
3552 return 0;
3553}
3554
3555/* Clears mapping for all available queues of vhost interface. */
3556static void
3557netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
3558 OVS_REQUIRES(dev->mutex)
3559{
3560 int i;
3561
81acebda 3562 for (i = 0; i < dev->up.n_txq; i++) {
3563 dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
3564 }
3565}
3566
3567/*
3568 * Remove a virtio-net device from the specific vhost port. Use dev->remove
3569 * flag to stop any more packets from being sent or received to/from a VM and
3570 * ensure all currently queued packets have been sent/received before removing
3571 * the device.
3572 */
3573static void
0a0f39df 3574destroy_device(int vid)
58397e6c 3575{
d46285a2 3576 struct netdev_dpdk *dev;
afee281f 3577 bool exists = false;
3578 char ifname[IF_NAME_SZ];
3579
58be5c0e 3580 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
3581
3582 ovs_mutex_lock(&dpdk_mutex);
d46285a2 3583 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
0a0f39df 3584 if (netdev_dpdk_get_vid(dev) == vid) {
58397e6c 3585
d46285a2 3586 ovs_mutex_lock(&dev->mutex);
3587 dev->vhost_reconfigured = false;
3588 ovsrcu_index_set(&dev->vid, -1);
3589 memset(dev->vhost_rxq_enabled, 0,
3590 dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled);
d46285a2 3591 netdev_dpdk_txq_map_clear(dev);
81acebda 3592
e543851d 3593 netdev_change_seq_changed(&dev->up);
d46285a2 3594 ovs_mutex_unlock(&dev->mutex);
81acebda 3595 exists = true;
afee281f 3596 break;
3597 }
3598 }
afee281f 3599
3600 ovs_mutex_unlock(&dpdk_mutex);
3601
0a0f39df 3602 if (exists) {
3603 /*
3604 * Wait for other threads to quiesce after the device's 'vid' has been
3605 * cleared (set to -1), before returning.
3606 */
3607 ovsrcu_synchronize();
3608 /*
3609 * As call to ovsrcu_synchronize() will end the quiescent state,
3610 * put thread back into quiescent state before returning.
3611 */
3612 ovsrcu_quiesce_start();
0a0f39df 3613 VLOG_INFO("vHost Device '%s' has been removed", ifname);
afee281f 3614 } else {
0a0f39df 3615 VLOG_INFO("vHost Device '%s' not found", ifname);
afee281f 3616 }
3617}
3618
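/* Note on the vring numbering handled below (illustrative): vrings come in
 * pairs, so with VIRTIO_QNUM == 2 a 'queue_id' of 3 belongs to queue pair
 * 3 / 2 == 1, and 3 % 2 == VIRTIO_TXQ marks it as the guest's TX ring, i.e.
 * the ring this side receives from, so it is treated as rx queue 1 here. */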
585a5bea 3619static int
0a0f39df 3620vring_state_changed(int vid, uint16_t queue_id, int enable)
585a5bea 3621{
d46285a2 3622 struct netdev_dpdk *dev;
3623 bool exists = false;
3624 int qid = queue_id / VIRTIO_QNUM;
35c91567 3625 bool is_rx = (queue_id % VIRTIO_QNUM) == VIRTIO_TXQ;
3626 char ifname[IF_NAME_SZ];
3627
58be5c0e 3628 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
585a5bea 3629
585a5bea 3630 ovs_mutex_lock(&dpdk_mutex);
d46285a2 3631 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
c1ff66ac 3632 ovs_mutex_lock(&dev->mutex);
bb9d2623 3633 if (nullable_string_is_equal(ifname, dev->vhost_id)) {
3634 if (is_rx) {
3635 bool old_state = dev->vhost_rxq_enabled[qid];
3636
3637 dev->vhost_rxq_enabled[qid] = enable != 0;
3638 if (old_state != dev->vhost_rxq_enabled[qid]) {
3639 netdev_change_seq_changed(&dev->up);
3640 }
585a5bea 3641 } else {
3642 if (enable) {
3643 dev->tx_q[qid].map = qid;
3644 } else {
3645 dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
3646 }
3647 netdev_dpdk_remap_txqs(dev);
585a5bea 3648 }
585a5bea 3649 exists = true;
d46285a2 3650 ovs_mutex_unlock(&dev->mutex);
3651 break;
3652 }
c1ff66ac 3653 ovs_mutex_unlock(&dev->mutex);
3654 }
3655 ovs_mutex_unlock(&dpdk_mutex);
3656
3657 if (exists) {
3658 VLOG_INFO("State of queue %d ( %s_qid %d ) of vhost device '%s' "
3659 "changed to \'%s\'", queue_id, is_rx == true ? "rx" : "tx",
3660 qid, ifname, (enable == 1) ? "enabled" : "disabled");
585a5bea 3661 } else {
0a0f39df 3662 VLOG_INFO("vHost Device '%s' not found", ifname);
3663 return -1;
3664 }
3665
3666 return 0;
3667}
3668
3669static void
3670destroy_connection(int vid)
3671{
3672 struct netdev_dpdk *dev;
3673 char ifname[IF_NAME_SZ];
3674 bool exists = false;
3675
3676 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
3677
3678 ovs_mutex_lock(&dpdk_mutex);
3679 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
3680 ovs_mutex_lock(&dev->mutex);
3681 if (nullable_string_is_equal(ifname, dev->vhost_id)) {
3682 uint32_t qp_num = NR_QUEUE;
3683
3684 if (netdev_dpdk_get_vid(dev) >= 0) {
3685 VLOG_ERR("Connection on socket '%s' destroyed while vhost "
3686 "device still attached.", dev->vhost_id);
3687 }
3688
3689 /* Restore the number of queue pairs to default. */
3690 if (dev->requested_n_txq != qp_num
3691 || dev->requested_n_rxq != qp_num) {
3692 dev->requested_n_rxq = qp_num;
3693 dev->requested_n_txq = qp_num;
3694 netdev_request_reconfigure(&dev->up);
3695 }
3696 ovs_mutex_unlock(&dev->mutex);
3697 exists = true;
3698 break;
3699 }
3700 ovs_mutex_unlock(&dev->mutex);
3701 }
3702 ovs_mutex_unlock(&dpdk_mutex);
3703
3704 if (exists) {
3705 VLOG_INFO("vHost Device '%s' connection has been destroyed", ifname);
3706 } else {
3707 VLOG_INFO("vHost Device '%s' not found", ifname);
3708 }
3709}
3710
3711/*
3712 * Retrieve the DPDK virtio device ID (vid) associated with a vhostuser
3713 * or vhostuserclient netdev.
3714 *
3715 * Returns a value greater than or equal to zero for a valid vid, or '-1'
3716 * if there is no valid vid associated. A vid of '-1' must not be used in
3717 * rte_vhost_* API calls.
3718 *
3719 * Once obtained and validated, a vid can be used by a PMD for multiple
3720 * subsequent rte_vhost API calls until the PMD quiesces. A PMD should
3721 * not fetch the vid again for each of a series of API calls.
3722 */
3723
3724int
3725netdev_dpdk_get_vid(const struct netdev_dpdk *dev)
58397e6c 3726{
0a0f39df 3727 return ovsrcu_index_get(&dev->vid);
3728}
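/* Minimal sketch of the contract described above, assuming a PMD thread that
 * has not yet quiesced:
 *
 *     int vid = netdev_dpdk_get_vid(dev);
 *
 *     if (vid >= 0) {
 *         ... a series of rte_vhost_*(vid, ...) calls ...
 *     }
 */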
3729
3730struct ingress_policer *
3731netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
3732{
3733 return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
3734}
3735
58397e6c 3736static int
ecc1a34e 3737netdev_dpdk_class_init(void)
7d1ced01 3738{
3739 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3740
3741 /* This function can be called for different classes. The initialization
3742 * needs to be done only once. */
3743 if (ovsthread_once_start(&once)) {
3744 ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
3745 unixctl_command_register("netdev-dpdk/set-admin-state",
3746 "[netdev] up|down", 1, 2,
3747 netdev_dpdk_set_admin_state, NULL);
3748
3749 unixctl_command_register("netdev-dpdk/detach",
3750 "pci address of device", 1, 1,
3751 netdev_dpdk_detach, NULL);
3752
3753 unixctl_command_register("netdev-dpdk/get-mempool-info",
3754 "[netdev]", 0, 1,
3755 netdev_dpdk_get_mempool_info, NULL);
3756
3757 ovsthread_once_done(&once);
3758 }
362ca396 3759
3760 return 0;
3761}
3762
95fb793a 3763/* Client Rings */
3764
95fb793a 3765static int
3766dpdk_ring_create(const char dev_name[], unsigned int port_no,
bb37956a 3767 dpdk_port_t *eth_port_id)
95fb793a 3768{
48fffdee 3769 struct dpdk_ring *ring_pair;
0c6f39e5 3770 char *ring_name;
b8374d0d 3771 int port_id;
95fb793a 3772
3773 ring_pair = dpdk_rte_mzalloc(sizeof *ring_pair);
3774 if (!ring_pair) {
95fb793a 3775 return ENOMEM;
3776 }
3777
7251515e 3778 /* XXX: Add support for multiqueue ring. */
0c6f39e5 3779 ring_name = xasprintf("%s_tx", dev_name);
95fb793a 3780
8f0a76c9 3781 /* Create single producer tx ring, netdev does explicit locking. */
48fffdee 3782 ring_pair->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
8f0a76c9 3783 RING_F_SP_ENQ);
0c6f39e5 3784 free(ring_name);
3785 if (ring_pair->cring_tx == NULL) {
3786 rte_free(ring_pair);
95fb793a 3787 return ENOMEM;
3788 }
3789
0c6f39e5 3790 ring_name = xasprintf("%s_rx", dev_name);
95fb793a 3791
8f0a76c9 3792 /* Create single consumer rx ring, netdev does explicit locking. */
48fffdee 3793 ring_pair->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
8f0a76c9 3794 RING_F_SC_DEQ);
0c6f39e5 3795 free(ring_name);
3796 if (ring_pair->cring_rx == NULL) {
3797 rte_free(ring_pair);
95fb793a 3798 return ENOMEM;
3799 }
3800
3801 port_id = rte_eth_from_rings(dev_name, &ring_pair->cring_rx, 1,
3802 &ring_pair->cring_tx, 1, SOCKET0);
d7310583 3803
b8374d0d 3804 if (port_id < 0) {
48fffdee 3805 rte_free(ring_pair);
95fb793a 3806 return ENODEV;
3807 }
3808
48fffdee 3809 ring_pair->user_port_id = port_no;
3810 ring_pair->eth_port_id = port_id;
3811 *eth_port_id = port_id;
3812
48fffdee 3813 ovs_list_push_back(&dpdk_ring_list, &ring_pair->list_node);
95fb793a 3814
95fb793a 3815 return 0;
3816}
3817
3818static int
bb37956a 3819dpdk_ring_open(const char dev_name[], dpdk_port_t *eth_port_id)
64839cf4 3820 OVS_REQUIRES(dpdk_mutex)
95fb793a 3821{
48fffdee 3822 struct dpdk_ring *ring_pair;
95fb793a 3823 unsigned int port_no;
3824 int err = 0;
3825
3826 /* Names always start with "dpdkr" */
3827 err = dpdk_dev_parse_name(dev_name, "dpdkr", &port_no);
3828 if (err) {
3829 return err;
3830 }
3831
58be5c0e 3832 /* Look through our list to find the device */
3833 LIST_FOR_EACH (ring_pair, list_node, &dpdk_ring_list) {
3834 if (ring_pair->user_port_id == port_no) {
58397e6c 3835 VLOG_INFO("Found dpdk ring device %s:", dev_name);
58be5c0e 3836 /* Really all that is needed */
48fffdee 3837 *eth_port_id = ring_pair->eth_port_id;
95fb793a 3838 return 0;
3839 }
3840 }
3841 /* Need to create the device rings */
3842 return dpdk_ring_create(dev_name, port_no, eth_port_id);
3843}
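/* Naming convention used above (illustrative): a netdev called "dpdkr0"
 * parses to user_port_id 0, and dpdk_ring_create() then creates the backing
 * DPDK rings "dpdkr0_tx" and "dpdkr0_rx" plus an eth device on top of them. */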
3844
7251515e 3845static int
d46285a2 3846netdev_dpdk_ring_send(struct netdev *netdev, int qid,
b30896c9 3847 struct dp_packet_batch *batch, bool concurrent_txq)
7251515e 3848{
d46285a2 3849 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a543eb0 3850 struct dp_packet *packet;
1b99bb05 3851
58be5c0e 3852 /* When using 'dpdkr' and sending to a DPDK ring, we want to ensure that
a47e2db2 3853 * the offload fields are clear. This is because the same mbuf may be
58be5c0e 3854 * modified by the consumer of the ring and return into the datapath
a47e2db2 3855 * without recalculating the RSS hash or revalidating the checksums. */
e883448e 3856 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
a47e2db2 3857 dp_packet_reset_offload(packet);
1b99bb05 3858 }
7251515e 3859
b30896c9 3860 netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
3861 return 0;
3862}
3863
95fb793a 3864static int
3865netdev_dpdk_ring_construct(struct netdev *netdev)
3866{
bb37956a 3867 dpdk_port_t port_no = 0;
95fb793a 3868 int err = 0;
3869
95fb793a 3870 ovs_mutex_lock(&dpdk_mutex);
3871
3872 err = dpdk_ring_open(netdev->name, &port_no);
3873 if (err) {
3874 goto unlock_dpdk;
3875 }
3876
3877 err = common_construct(netdev, port_no, DPDK_DEV_ETH,
3878 rte_eth_dev_socket_id(port_no));
95fb793a 3879unlock_dpdk:
3880 ovs_mutex_unlock(&dpdk_mutex);
3881 return err;
3882}
3883
3884/* QoS Functions */
3885
3886/*
3887 * Initialize QoS configuration operations.
3888 */
3889static void
3890qos_conf_init(struct qos_conf *conf, const struct dpdk_qos_ops *ops)
3891{
3892 conf->ops = ops;
78bd47cf 3893 rte_spinlock_init(&conf->lock);
3894}
3895
3896/*
3897 * Search existing QoS operations in qos_ops and compare each set of
3898 * operations qos_name to name. Return a dpdk_qos_ops pointer to a match,
3899 * else return NULL
3900 */
3901static const struct dpdk_qos_ops *
3902qos_lookup_name(const char *name)
3903{
3904 const struct dpdk_qos_ops *const *opsp;
3905
3906 for (opsp = qos_confs; *opsp != NULL; opsp++) {
3907 const struct dpdk_qos_ops *ops = *opsp;
3908 if (!strcmp(name, ops->qos_name)) {
3909 return ops;
3910 }
3911 }
3912 return NULL;
3913}
3914
3915static int
3916netdev_dpdk_get_qos_types(const struct netdev *netdev OVS_UNUSED,
3917 struct sset *types)
3918{
3919 const struct dpdk_qos_ops *const *opsp;
3920
3921 for (opsp = qos_confs; *opsp != NULL; opsp++) {
3922 const struct dpdk_qos_ops *ops = *opsp;
3923 if (ops->qos_construct && ops->qos_name[0] != '\0') {
3924 sset_add(types, ops->qos_name);
3925 }
3926 }
3927 return 0;
3928}
3929
3930static int
d46285a2 3931netdev_dpdk_get_qos(const struct netdev *netdev,
3932 const char **typep, struct smap *details)
3933{
d46285a2 3934 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
78bd47cf 3935 struct qos_conf *qos_conf;
3936 int error = 0;
3937
d46285a2 3938 ovs_mutex_lock(&dev->mutex);
3939 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
3940 if (qos_conf) {
3941 *typep = qos_conf->ops->qos_name;
3942 error = (qos_conf->ops->qos_get
3943 ? qos_conf->ops->qos_get(qos_conf, details): 0);
3944 } else {
3945 /* No QoS configuration set, return an empty string */
3946 *typep = "";
0bf765f7 3947 }
d46285a2 3948 ovs_mutex_unlock(&dev->mutex);
3949
3950 return error;
3951}
3952
3953static int
3954netdev_dpdk_set_qos(struct netdev *netdev, const char *type,
3955 const struct smap *details)
0bf765f7 3956{
d46285a2 3957 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
0bf765f7 3958 const struct dpdk_qos_ops *new_ops = NULL;
78bd47cf 3959 struct qos_conf *qos_conf, *new_qos_conf = NULL;
3960 int error = 0;
3961
d46285a2 3962 ovs_mutex_lock(&dev->mutex);
0bf765f7 3963
78bd47cf 3964 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
0bf765f7 3965
3966 new_ops = qos_lookup_name(type);
3967
3968 if (!new_ops || !new_ops->qos_construct) {
3969 new_qos_conf = NULL;
3970 if (type && type[0]) {
3971 error = EOPNOTSUPP;
0bf765f7 3972 }
44975bb0 3973 } else if (qos_conf && qos_conf->ops == new_ops
3974 && qos_conf->ops->qos_is_equal(qos_conf, details)) {
3975 new_qos_conf = qos_conf;
0bf765f7 3976 } else {
78bd47cf 3977 error = new_ops->qos_construct(details, &new_qos_conf);
3978 }
3979
7ea266e9 3980 if (error) {
3981 VLOG_ERR("Failed to set QoS type %s on port %s: %s",
3982 type, netdev->name, rte_strerror(error));
3983 }
3984
3985 if (new_qos_conf != qos_conf) {
3986 ovsrcu_set(&dev->qos_conf, new_qos_conf);
3987 if (qos_conf) {
3988 ovsrcu_postpone(qos_conf->ops->qos_destruct, qos_conf);
3989 }
3990 }
3991
d46285a2 3992 ovs_mutex_unlock(&dev->mutex);
78bd47cf 3993
3994 return error;
3995}
3996
3997/* egress-policer details */
3998
3999struct egress_policer {
4000 struct qos_conf qos_conf;
4001 struct rte_meter_srtcm_params app_srtcm_params;
4002 struct rte_meter_srtcm egress_meter;
03f3f9c0 4003 struct rte_meter_srtcm_profile egress_prof;
4004};
4005
4006static void
4007egress_policer_details_to_param(const struct smap *details,
4008 struct rte_meter_srtcm_params *params)
0bf765f7 4009{
4010 memset(params, 0, sizeof *params);
4011 params->cir = smap_get_ullong(details, "cir", 0);
4012 params->cbs = smap_get_ullong(details, "cbs", 0);
4013 params->ebs = 0;
4014}
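/* Illustrative configuration that ends up in 'details' above; the port name
 * and the cir/cbs values (bytes per second and bytes) are examples only:
 *
 *     ovs-vsctl set port vhost-user0 qos=@newqos -- \
 *         --id=@newqos create qos type=egress-policer \
 *         other-config:cir=46000000 other-config:cbs=2048
 */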
4015
4016static int
4017egress_policer_qos_construct(const struct smap *details,
4018 struct qos_conf **conf)
0bf765f7 4019{
0bf765f7 4020 struct egress_policer *policer;
4021 int err = 0;
4022
4023 policer = xmalloc(sizeof *policer);
4024 qos_conf_init(&policer->qos_conf, &egress_policer_ops);
78bd47cf 4025 egress_policer_details_to_param(details, &policer->app_srtcm_params);
4026 err = rte_meter_srtcm_profile_config(&policer->egress_prof,
4027 &policer->app_srtcm_params);
4028 if (!err) {
4029 err = rte_meter_srtcm_config(&policer->egress_meter,
4030 &policer->egress_prof);
4031 }
4032
4033 if (!err) {
4034 *conf = &policer->qos_conf;
4035 } else {
03f3f9c0 4036 VLOG_ERR("Could not create rte meter for egress policer");
7ea266e9 4037 free(policer);
78bd47cf 4038 *conf = NULL;
4039 err = -err;
4040 }
4041
4042 return err;
4043}
4044
4045static void
78bd47cf 4046egress_policer_qos_destruct(struct qos_conf *conf)
4047{
4048 struct egress_policer *policer = CONTAINER_OF(conf, struct egress_policer,
78bd47cf 4049 qos_conf);
4050 free(policer);
4051}
4052
4053static int
78bd47cf 4054egress_policer_qos_get(const struct qos_conf *conf, struct smap *details)
0bf765f7 4055{
4056 struct egress_policer *policer =
4057 CONTAINER_OF(conf, struct egress_policer, qos_conf);
4058
4059 smap_add_format(details, "cir", "%"PRIu64, policer->app_srtcm_params.cir);
4060 smap_add_format(details, "cbs", "%"PRIu64, policer->app_srtcm_params.cbs);
050c60bf 4061
4062 return 0;
4063}
4064
78bd47cf 4065static bool
4066egress_policer_qos_is_equal(const struct qos_conf *conf,
4067 const struct smap *details)
0bf765f7 4068{
4069 struct egress_policer *policer =
4070 CONTAINER_OF(conf, struct egress_policer, qos_conf);
4071 struct rte_meter_srtcm_params params;
0bf765f7 4072
78bd47cf 4073 egress_policer_details_to_param(details, &params);
7ea266e9 4074
78bd47cf 4075 return !memcmp(&params, &policer->app_srtcm_params, sizeof params);
4076}
4077
0bf765f7 4078static int
3e90f7d7 4079egress_policer_run(struct qos_conf *conf, struct rte_mbuf **pkts, int pkt_cnt,
7d7ded7a 4080 bool should_steal)
0bf765f7 4081{
0bf765f7 4082 int cnt = 0;
4083 struct egress_policer *policer =
4084 CONTAINER_OF(conf, struct egress_policer, qos_conf);
0bf765f7 4085
4086 cnt = netdev_dpdk_policer_run(&policer->egress_meter,
4087 &policer->egress_prof, pkts,
7d7ded7a 4088 pkt_cnt, should_steal);
4089
4090 return cnt;
4091}
4092
4093static const struct dpdk_qos_ops egress_policer_ops = {
4094 "egress-policer", /* qos_name */
4095 egress_policer_qos_construct,
4096 egress_policer_qos_destruct,
4097 egress_policer_qos_get,
78bd47cf 4098 egress_policer_qos_is_equal,
4099 egress_policer_run
4100};
4101
4102static int
4103netdev_dpdk_reconfigure(struct netdev *netdev)
4104{
4105 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4106 int err = 0;
4107
4108 ovs_mutex_lock(&dev->mutex);
4109
4110 if (netdev->n_txq == dev->requested_n_txq
0072e931 4111 && netdev->n_rxq == dev->requested_n_rxq
b685696b 4112 && dev->mtu == dev->requested_mtu
f8b64a61 4113 && dev->lsc_interrupt_mode == dev->requested_lsc_interrupt_mode
b685696b 4114 && dev->rxq_size == dev->requested_rxq_size
bd4e172b 4115 && dev->txq_size == dev->requested_txq_size
4116 && dev->socket_id == dev->requested_socket_id
4117 && dev->started) {
4118 /* Reconfiguration is unnecessary */
4119
4120 goto out;
4121 }
4122
4123 rte_eth_dev_stop(dev->port_id);
606f6650 4124 dev->started = false;
050c60bf 4125
d555d9bd 4126 err = netdev_dpdk_mempool_configure(dev);
b6b26021 4127 if (err && err != EEXIST) {
d555d9bd 4128 goto out;
4129 }
4130
4131 dev->lsc_interrupt_mode = dev->requested_lsc_interrupt_mode;
4132
4133 netdev->n_txq = dev->requested_n_txq;
4134 netdev->n_rxq = dev->requested_n_rxq;
4135
4136 dev->rxq_size = dev->requested_rxq_size;
4137 dev->txq_size = dev->requested_txq_size;
4138
4139 rte_free(dev->tx_q);
4140 err = dpdk_eth_dev_init(dev);
4141 dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
4142 if (!dev->tx_q) {
4143 err = ENOMEM;
4144 }
050c60bf 4145
4146 netdev_change_seq_changed(netdev);
4147
050c60bf 4148out:
050c60bf 4149 ovs_mutex_unlock(&dev->mutex);
4150 return err;
4151}
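/* Illustrative ways to trigger the reconfiguration above from the CLI; the
 * interface name and values are examples only:
 *
 *     ovs-vsctl set Interface dpdk0 options:n_rxq=4
 *     ovs-vsctl set Interface dpdk0 mtu_request=9000
 */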
4152
7f381c2e 4153static int
2d24d165 4154dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
2d24d165 4155 OVS_REQUIRES(dev->mutex)
050c60bf 4156{
4157 dev->up.n_txq = dev->requested_n_txq;
4158 dev->up.n_rxq = dev->requested_n_rxq;
96e9b168 4159 int err;
050c60bf 4160
4161 /* Always keep RX queue 0 enabled for implementations that won't
4162 * report vring states. */
4163 dev->vhost_rxq_enabled[0] = true;
4164
4165 /* Enable TX queue 0 by default if it wasn't disabled. */
4166 if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
4167 dev->tx_q[0].map = 0;
4168 }
4169
4170 netdev_dpdk_remap_txqs(dev);
4171
d555d9bd 4172 err = netdev_dpdk_mempool_configure(dev);
b6b26021 4173 if (!err) {
43307ad0 4174 /* A new mempool was created or re-used. */
d555d9bd 4175 netdev_change_seq_changed(&dev->up);
03f3f9c0 4176 } else if (err != EEXIST) {
b6b26021 4177 return err;
db8f13b0 4178 }
0a0f39df 4179 if (netdev_dpdk_get_vid(dev) >= 0) {
894af647 4180 if (dev->vhost_reconfigured == false) {
4181 dev->vhost_reconfigured = true;
4182 /* Carrier status may need updating. */
4183 netdev_change_seq_changed(&dev->up);
4184 }
81acebda 4185 }
4186
4187 return 0;
4188}
4189
4190static int
4191netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
4192{
4193 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
7f381c2e 4194 int err;
2d24d165 4195
2d24d165 4196 ovs_mutex_lock(&dev->mutex);
7f381c2e 4197 err = dpdk_vhost_reconfigure_helper(dev);
2d24d165 4198 ovs_mutex_unlock(&dev->mutex);
4199
4200 return err;
4201}
4202
4203static int
4204netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
4205{
4206 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
7f381c2e 4207 int err;
a14d1cc8 4208 uint64_t vhost_flags = 0;
10087cba 4209 bool zc_enabled;
2d24d165 4210
4211 ovs_mutex_lock(&dev->mutex);
4212
4213 /* Configure vHost client mode if requested and if the following criteria
4214 * are met:
4215 * 1. Device hasn't been registered yet.
4216 * 2. A path has been specified.
c1ff66ac 4217 */
bb9d2623 4218 if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && dev->vhost_id) {
4219 /* Register client-mode device. */
4220 vhost_flags |= RTE_VHOST_USER_CLIENT;
4221
4222 /* Enable IOMMU support, if explicitly requested. */
4223 if (dpdk_vhost_iommu_enabled()) {
4224 vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
4225 }
10087cba 4226
4227 /* Enable POSTCOPY support, if explicitly requested. */
4228 if (dpdk_vhost_postcopy_enabled()) {
4229 vhost_flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
4230 }
4231
4232 zc_enabled = dev->vhost_driver_flags
4233 & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
4234 /* Enable zero copy flag, if requested */
4235 if (zc_enabled) {
4236 vhost_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
4237 }
4238
a14d1cc8 4239 err = rte_vhost_driver_register(dev->vhost_id, vhost_flags);
c1ff66ac 4240 if (err) {
4241 VLOG_ERR("vhost-user device setup failure for device %s\n",
4242 dev->vhost_id);
7f381c2e 4243 goto unlock;
c1ff66ac 4244 } else {
2d24d165 4245 /* Configuration successful */
a14d1cc8 4246 dev->vhost_driver_flags |= vhost_flags;
4247 VLOG_INFO("vHost User device '%s' created in 'client' mode, "
4248 "using client socket '%s'",
4249 dev->up.name, dev->vhost_id);
4250 if (zc_enabled) {
4251 VLOG_INFO("Zero copy enabled for vHost port %s", dev->up.name);
4252 }
c1ff66ac 4253 }
4254
4255 err = rte_vhost_driver_callback_register(dev->vhost_id,
4256 &virtio_net_device_ops);
4257 if (err) {
4258 VLOG_ERR("rte_vhost_driver_callback_register failed for "
4259 "vhost user client port: %s\n", dev->up.name);
4260 goto unlock;
4261 }
4262
4263 err = rte_vhost_driver_disable_features(dev->vhost_id,
4264 1ULL << VIRTIO_NET_F_HOST_TSO4
4265 | 1ULL << VIRTIO_NET_F_HOST_TSO6
4266 | 1ULL << VIRTIO_NET_F_CSUM);
4267 if (err) {
4268 VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
4269 "client port: %s\n", dev->up.name);
4270 goto unlock;
4271 }
4272
4273 err = rte_vhost_driver_start(dev->vhost_id);
4274 if (err) {
4275 VLOG_ERR("rte_vhost_driver_start failed for vhost user "
4276 "client port: %s\n", dev->up.name);
4277 goto unlock;
4278 }
4279 }
4280
4281 err = dpdk_vhost_reconfigure_helper(dev);
4282
4283unlock:
050c60bf 4284 ovs_mutex_unlock(&dev->mutex);
050c60bf 4285
7f381c2e 4286 return err;
4287}
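/* The optional vhost flags above follow global other_config knobs, e.g.
 * (illustrative):
 *
 *     ovs-vsctl set Open_vSwitch . other_config:vhost-iommu-support=true
 *     ovs-vsctl set Open_vSwitch . other_config:vhost-postcopy-support=true
 */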
4288
4289bool
4290netdev_dpdk_flow_api_supported(struct netdev *netdev)
4291{
4292 struct netdev_dpdk *dev;
4293 bool ret = false;
4294
4295 if (!is_dpdk_class(netdev->netdev_class)) {
4296 goto out;
4297 }
4298
4299 dev = netdev_dpdk_cast(netdev);
4300 ovs_mutex_lock(&dev->mutex);
4301 if (dev->type == DPDK_DEV_ETH) {
4302 /* TODO: Check if we are able to offload some minimal flow. */
4303 ret = true;
4304 }
4305 ovs_mutex_unlock(&dev->mutex);
4306out:
4307 return ret;
4308}
4309
4310int
4311netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
4312 struct rte_flow *rte_flow,
4313 struct rte_flow_error *error)
4314{
4315 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4316 int ret;
4317
4318 ovs_mutex_lock(&dev->mutex);
4319 ret = rte_flow_destroy(dev->port_id, rte_flow, error);
4320 ovs_mutex_unlock(&dev->mutex);
4321 return ret;
4322}
4323
4324struct rte_flow *
4325netdev_dpdk_rte_flow_create(struct netdev *netdev,
4326 const struct rte_flow_attr *attr,
4327 const struct rte_flow_item *items,
4328 const struct rte_flow_action *actions,
4329 struct rte_flow_error *error)
4330{
4331 struct rte_flow *flow;
4332 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
4333
4334 ovs_mutex_lock(&dev->mutex);
4335 flow = rte_flow_create(dev->port_id, attr, items, actions, error);
4336 ovs_mutex_unlock(&dev->mutex);
4337 return flow;
4338}
e8a2b5bf 4339
4340#define NETDEV_DPDK_CLASS_COMMON \
4341 .is_pmd = true, \
4342 .alloc = netdev_dpdk_alloc, \
4343 .dealloc = netdev_dpdk_dealloc, \
4344 .get_config = netdev_dpdk_get_config, \
4345 .get_numa_id = netdev_dpdk_get_numa_id, \
4346 .set_etheraddr = netdev_dpdk_set_etheraddr, \
4347 .get_etheraddr = netdev_dpdk_get_etheraddr, \
4348 .get_mtu = netdev_dpdk_get_mtu, \
4349 .set_mtu = netdev_dpdk_set_mtu, \
4350 .get_ifindex = netdev_dpdk_get_ifindex, \
4351 .get_carrier_resets = netdev_dpdk_get_carrier_resets, \
4352 .set_miimon_interval = netdev_dpdk_set_miimon, \
4353 .set_policing = netdev_dpdk_set_policing, \
4354 .get_qos_types = netdev_dpdk_get_qos_types, \
4355 .get_qos = netdev_dpdk_get_qos, \
4356 .set_qos = netdev_dpdk_set_qos, \
4357 .update_flags = netdev_dpdk_update_flags, \
4358 .rxq_alloc = netdev_dpdk_rxq_alloc, \
4359 .rxq_construct = netdev_dpdk_rxq_construct, \
4360 .rxq_destruct = netdev_dpdk_rxq_destruct, \
c0af6425 4361 .rxq_dealloc = netdev_dpdk_rxq_dealloc
4362
4363#define NETDEV_DPDK_CLASS_BASE \
4364 NETDEV_DPDK_CLASS_COMMON, \
4365 .init = netdev_dpdk_class_init, \
4366 .destruct = netdev_dpdk_destruct, \
4367 .set_tx_multiq = netdev_dpdk_set_tx_multiq, \
4368 .get_carrier = netdev_dpdk_get_carrier, \
4369 .get_stats = netdev_dpdk_get_stats, \
4370 .get_custom_stats = netdev_dpdk_get_custom_stats, \
4371 .get_features = netdev_dpdk_get_features, \
4372 .get_status = netdev_dpdk_get_status, \
4373 .reconfigure = netdev_dpdk_reconfigure, \
5fc5c50f 4374 .rxq_recv = netdev_dpdk_rxq_recv
4375
4376static const struct netdev_class dpdk_class = {
4377 .type = "dpdk",
4378 NETDEV_DPDK_CLASS_BASE,
4379 .construct = netdev_dpdk_construct,
4380 .set_config = netdev_dpdk_set_config,
4381 .send = netdev_dpdk_eth_send,
4382};
4383
4384static const struct netdev_class dpdk_ring_class = {
4385 .type = "dpdkr",
4386 NETDEV_DPDK_CLASS_BASE,
4387 .construct = netdev_dpdk_ring_construct,
4388 .set_config = netdev_dpdk_ring_set_config,
4389 .send = netdev_dpdk_ring_send,
4390};
4391
4392static const struct netdev_class dpdk_vhost_class = {
4393 .type = "dpdkvhostuser",
4394 NETDEV_DPDK_CLASS_COMMON,
4395 .construct = netdev_dpdk_vhost_construct,
4396 .destruct = netdev_dpdk_vhost_destruct,
4397 .send = netdev_dpdk_vhost_send,
4398 .get_carrier = netdev_dpdk_vhost_get_carrier,
4399 .get_stats = netdev_dpdk_vhost_get_stats,
4400 .get_status = netdev_dpdk_vhost_user_get_status,
4401 .reconfigure = netdev_dpdk_vhost_reconfigure,
4402 .rxq_recv = netdev_dpdk_vhost_rxq_recv,
4403 .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
4404};
4405
4406static const struct netdev_class dpdk_vhost_client_class = {
4407 .type = "dpdkvhostuserclient",
4408 NETDEV_DPDK_CLASS_COMMON,
4409 .construct = netdev_dpdk_vhost_client_construct,
4410 .destruct = netdev_dpdk_vhost_destruct,
4411 .set_config = netdev_dpdk_vhost_client_set_config,
4412 .send = netdev_dpdk_vhost_send,
4413 .get_carrier = netdev_dpdk_vhost_get_carrier,
4414 .get_stats = netdev_dpdk_vhost_get_stats,
4415 .get_status = netdev_dpdk_vhost_user_get_status,
4416 .reconfigure = netdev_dpdk_vhost_client_reconfigure,
4417 .rxq_recv = netdev_dpdk_vhost_rxq_recv,
4418 .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
89c09c1c 4419};
95fb793a 4420
4421void
4422netdev_dpdk_register(void)
4423{
4424 netdev_register_provider(&dpdk_class);
4425 netdev_register_provider(&dpdk_ring_class);
53f50d24 4426 netdev_register_provider(&dpdk_vhost_class);
2d24d165 4427 netdev_register_provider(&dpdk_vhost_client_class);
8a9562d2 4428}