/*
 * Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "netdev-dpdk.h"

#include <string.h>
#include <signal.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <linux/virtio_net.h>
#include <sys/socket.h>
#include <linux/if.h>

#include <rte_bus_pci.h>
#include <rte_config.h>
#include <rte_cycles.h>
#include <rte_errno.h>
#include <rte_eth_ring.h>
#include <rte_ethdev.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_meter.h>
#include <rte_pci.h>
#include <rte_vhost.h>
#include <rte_version.h>

#include "dirs.h"
#include "dp-packet.h"
#include "dpdk.h"
#include "dpif-netdev.h"
#include "fatal-signal.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "odp-util.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/list.h"
#include "openvswitch/ofp-print.h"
#include "openvswitch/vlog.h"
#include "ovs-numa.h"
#include "ovs-thread.h"
#include "ovs-rcu.h"
#include "packets.h"
#include "openvswitch/shash.h"
#include "smap.h"
#include "sset.h"
#include "unaligned.h"
#include "timeval.h"
#include "unixctl.h"

enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};

VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);

#define DPDK_PORT_WATCHDOG_INTERVAL 5

#define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
#define OVS_VPORT_DPDK "ovs_dpdk"

/*
 * need to reserve tons of extra space in the mbufs so we can align the
 * DMA addresses to 4KB.
 * The minimum mbuf size is limited to avoid scatter behaviour and drop in
 * performance for standard Ethernet MTU.
 */
#define ETHER_HDR_MAX_LEN           (ETHER_HDR_LEN + ETHER_CRC_LEN \
                                     + (2 * VLAN_HEADER_LEN))
#define MTU_TO_FRAME_LEN(mtu)       ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
#define MTU_TO_MAX_FRAME_LEN(mtu)   ((mtu) + ETHER_HDR_MAX_LEN)
#define FRAME_LEN_TO_MTU(frame_len) ((frame_len)                    \
                                     - ETHER_HDR_LEN - ETHER_CRC_LEN)
#define MBUF_SIZE(mtu)              ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) \
                                             + sizeof(struct dp_packet) \
                                             + RTE_PKTMBUF_HEADROOM),   \
                                             RTE_CACHE_LINE_SIZE)
#define NETDEV_DPDK_MBUF_ALIGN      1024
#define NETDEV_DPDK_MAX_PKT_LEN     9728
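
/* For a concrete sense of the macros above, using the standard Ethernet
 * sizes (ETHER_HDR_LEN 14, ETHER_CRC_LEN 4 and VLAN_HEADER_LEN 4 bytes):
 * MTU_TO_FRAME_LEN(1500) = 1500 + 14 + 4 = 1518, while ETHER_HDR_MAX_LEN
 * leaves room for two VLAN tags (14 + 4 + 2 * 4 = 26), so
 * MTU_TO_MAX_FRAME_LEN(1500) = 1526.  FRAME_LEN_TO_MTU() inverts
 * MTU_TO_FRAME_LEN(): FRAME_LEN_TO_MTU(1518) = 1500. */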

/* Min number of packets in the mempool.  OVS tries to allocate a mempool
 * with a roughly estimated number of mbufs: if this fails (because the
 * system doesn't have enough hugepages) we keep halving the number until
 * the allocation succeeds or we reach MIN_NB_MBUF. */
#define MIN_NB_MBUF          (4096 * 4)
#define MP_CACHE_SZ          RTE_MEMPOOL_CACHE_MAX_SIZE
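
/* For example, if an initial request for 262144 mbufs fails with ENOMEM,
 * dpdk_mp_create() retries with 131072, 65536, 32768 and finally 16384
 * mbufs before giving up, since the next halving would drop below
 * MIN_NB_MBUF (4096 * 4 = 16384). */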

/*
 * DPDK XSTATS Counter names definition
 */
#define XSTAT_RX_64_PACKETS              "rx_size_64_packets"
#define XSTAT_RX_65_TO_127_PACKETS       "rx_size_65_to_127_packets"
#define XSTAT_RX_128_TO_255_PACKETS      "rx_size_128_to_255_packets"
#define XSTAT_RX_256_TO_511_PACKETS      "rx_size_256_to_511_packets"
#define XSTAT_RX_512_TO_1023_PACKETS     "rx_size_512_to_1023_packets"
#define XSTAT_RX_1024_TO_1522_PACKETS    "rx_size_1024_to_1522_packets"
#define XSTAT_RX_1523_TO_MAX_PACKETS     "rx_size_1523_to_max_packets"

#define XSTAT_TX_64_PACKETS              "tx_size_64_packets"
#define XSTAT_TX_65_TO_127_PACKETS       "tx_size_65_to_127_packets"
#define XSTAT_TX_128_TO_255_PACKETS      "tx_size_128_to_255_packets"
#define XSTAT_TX_256_TO_511_PACKETS      "tx_size_256_to_511_packets"
#define XSTAT_TX_512_TO_1023_PACKETS     "tx_size_512_to_1023_packets"
#define XSTAT_TX_1024_TO_1522_PACKETS    "tx_size_1024_to_1522_packets"
#define XSTAT_TX_1523_TO_MAX_PACKETS     "tx_size_1523_to_max_packets"

#define XSTAT_RX_MULTICAST_PACKETS       "rx_multicast_packets"
#define XSTAT_TX_MULTICAST_PACKETS       "tx_multicast_packets"
#define XSTAT_RX_BROADCAST_PACKETS       "rx_broadcast_packets"
#define XSTAT_TX_BROADCAST_PACKETS       "tx_broadcast_packets"
#define XSTAT_RX_UNDERSIZED_ERRORS       "rx_undersized_errors"
#define XSTAT_RX_OVERSIZE_ERRORS         "rx_oversize_errors"
#define XSTAT_RX_FRAGMENTED_ERRORS       "rx_fragmented_errors"
#define XSTAT_RX_JABBER_ERRORS           "rx_jabber_errors"

#define SOCKET0              0

/* Default size of Physical NIC RXQ */
#define NIC_PORT_DEFAULT_RXQ_SIZE 2048
/* Default size of Physical NIC TXQ */
#define NIC_PORT_DEFAULT_TXQ_SIZE 2048
/* Maximum size of Physical NIC Queues */
#define NIC_PORT_MAX_Q_SIZE 4096

#define OVS_VHOST_MAX_QUEUE_NUM 1024  /* Maximum number of vHost TX queues. */
#define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
#define OVS_VHOST_QUEUE_DISABLED    (-2) /* Queue was disabled by guest and not
                                          * yet mapped to another queue. */

#define DPDK_ETH_PORT_ID_INVALID    RTE_MAX_ETHPORTS

/* DPDK library uses uint16_t for port_id. */
typedef uint16_t dpdk_port_t;
#define DPDK_PORT_ID_FMT "%"PRIu16

#define VHOST_ENQ_RETRY_NUM 8
#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)

static const struct rte_eth_conf port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .split_hdr_size = 0,
        .header_split   = 0, /* Header Split disabled */
        .hw_ip_checksum = 0, /* IP checksum offload disabled */
        .hw_vlan_filter = 0, /* VLAN filtering disabled */
        .jumbo_frame    = 0, /* Jumbo Frame Support disabled */
        .hw_strip_crc   = 0,
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = NULL,
            .rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

/*
 * These callbacks allow virtio-net devices to be added to vhost ports when
 * configuration has been fully completed.
 */
static int new_device(int vid);
static void destroy_device(int vid);
static int vring_state_changed(int vid, uint16_t queue_id, int enable);
static const struct vhost_device_ops virtio_net_device_ops =
{
    .new_device =  new_device,
    .destroy_device = destroy_device,
    .vring_state_changed = vring_state_changed,
    .features_changed = NULL
};

enum { DPDK_RING_SIZE = 256 };
BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
enum { DRAIN_TSC = 200000ULL };

enum dpdk_dev_type {
    DPDK_DEV_ETH = 0,
    DPDK_DEV_VHOST = 1,
};

/* Quality of Service */

/* An instance of a QoS configuration. Always associated with a particular
 * network device.
 *
 * Each QoS implementation subclasses this with whatever additional data it
 * needs.
 */
struct qos_conf {
    const struct dpdk_qos_ops *ops;
    rte_spinlock_t lock;
};

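/* To illustrate the subclassing scheme (this mirrors what the egress
 * policer implementation does below): a concrete implementation embeds
 * 'struct qos_conf' as its first member, followed by its own data, e.g.:
 *
 *     struct example_policer {               // hypothetical subclass
 *         struct qos_conf qos_conf;          // base; must come first
 *         struct rte_meter_srtcm_params params;
 *     };
 *
 * so the 'struct qos_conf *' handed back through qos_construct() can be
 * cast back to the subclass by the other dpdk_qos_ops callbacks. */
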
/* A particular implementation of dpdk QoS operations.
 *
 * The functions below return 0 if successful or a positive errno value on
 * failure, except where otherwise noted. All of them must be provided, except
 * where otherwise noted.
 */
struct dpdk_qos_ops {

    /* Name of the QoS type */
    const char *qos_name;

    /* Called to construct a qos_conf object. The implementation should make
     * the appropriate calls to configure QoS according to 'details'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets '*conf' to an
     * initialized 'struct qos_conf'.
     *
     * For all QoS implementations it should always be non-null.
     */
    int (*qos_construct)(const struct smap *details, struct qos_conf **conf);

    /* Destroys the data structures allocated by the implementation as part of
     * 'qos_conf'.
     *
     * For all QoS implementations it should always be non-null.
     */
    void (*qos_destruct)(struct qos_conf *conf);

    /* Retrieves details of 'conf' configuration into 'details'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     */
    int (*qos_get)(const struct qos_conf *conf, struct smap *details);

    /* Returns true if 'conf' is already configured according to 'details'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * For all QoS implementations it should always be non-null.
     */
    bool (*qos_is_equal)(const struct qos_conf *conf,
                         const struct smap *details);

    /* Modifies an array of rte_mbufs. The modification is specific to
     * each qos implementation.
     *
     * The function takes an array of mbufs and an int representing
     * the current number of mbufs present in the array.
     *
     * After the function has performed a qos modification to the array of
     * mbufs it returns an int representing the number of mbufs now present in
     * the array. This value can then be passed to the port send function
     * along with the modified array for transmission.
     *
     * For all QoS implementations it should always be non-null.
     */
    int (*qos_run)(struct qos_conf *qos_conf, struct rte_mbuf **pkts,
                   int pkt_cnt, bool may_steal);
};

/* dpdk_qos_ops for each type of user space QoS implementation */
static const struct dpdk_qos_ops egress_policer_ops;

/*
 * Array of dpdk_qos_ops, contains pointer to all supported QoS
 * operations.
 */
static const struct dpdk_qos_ops *const qos_confs[] = {
    &egress_policer_ops,
    NULL
};

static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dpdk_dev's. */
static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_list);

static struct ovs_mutex dpdk_mp_mutex OVS_ACQ_AFTER(dpdk_mutex)
    = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dpdk_mp's. */
static struct ovs_list dpdk_mp_free_list OVS_GUARDED_BY(dpdk_mp_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_mp_free_list);

/* Wrapper for a mempool released but not yet freed. */
struct dpdk_mp {
    struct rte_mempool *mp;
    struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex);
};

/* There should be one 'struct dpdk_tx_queue' created for
 * each cpu core. */
struct dpdk_tx_queue {
    rte_spinlock_t tx_lock;        /* Protects the members and the NIC queue
                                    * from concurrent access.  It is used only
                                    * if the queue is shared among different
                                    * pmd threads (see 'concurrent_txq'). */
    int map;                       /* Mapping of configured vhost-user queues
                                    * to those enabled by the guest. */
};

/* dpdk has no way to remove dpdk ring ethernet devices,
 * so we have to keep them around once they've been created. */

static struct ovs_list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_ring_list);

struct dpdk_ring {
    /* For the client rings */
    struct rte_ring *cring_tx;
    struct rte_ring *cring_rx;
    unsigned int user_port_id; /* User given port no, parsed from port name */
    dpdk_port_t eth_port_id; /* ethernet device port id */
    struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
};

struct ingress_policer {
    struct rte_meter_srtcm_params app_srtcm_params;
    struct rte_meter_srtcm in_policer;
    rte_spinlock_t policer_lock;
};

enum dpdk_hw_ol_features {
    NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
};

/*
 * In order to avoid confusion in variable names, the following naming
 * convention should be used, if possible:
 *
 * 'struct netdev'          : 'netdev'
 * 'struct netdev_dpdk'     : 'dev'
 * 'struct netdev_rxq'      : 'rxq'
 * 'struct netdev_rxq_dpdk' : 'rx'
 *
 * Example:
 *     struct netdev *netdev = netdev_from_name(name);
 *     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 *
 * Also, 'netdev' should be used instead of 'dev->up', where 'netdev' was
 * already defined.
 */

struct netdev_dpdk {
    PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
        dpdk_port_t port_id;

        /* If true, device was attached by rte_eth_dev_attach(). */
        bool attached;
        struct eth_addr hwaddr;
        int mtu;
        int socket_id;
        int buf_size;
        int max_packet_len;
        enum dpdk_dev_type type;
        enum netdev_flags flags;
        char *devargs;  /* Device arguments for dpdk ports */
        struct dpdk_tx_queue *tx_q;
        struct rte_eth_link link;
        int link_reset_cnt;
        /* 4 pad bytes here. */
    );

    PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
        struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
        struct rte_mempool *mp;

        /* virtio identifier for vhost devices */
        ovsrcu_index vid;

        /* True if vHost device is 'up' and has been reconfigured at least once */
        bool vhost_reconfigured;
        /* 3 pad bytes here. */
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* Identifier used to distinguish vhost devices from each other. */
        char vhost_id[PATH_MAX];
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        struct netdev up;
        /* In dpdk_list. */
        struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);

        /* QoS configuration and lock for the device */
        OVSRCU_TYPE(struct qos_conf *) qos_conf;

        /* Ingress Policer */
        OVSRCU_TYPE(struct ingress_policer *) ingress_policer;
        uint32_t policer_rate;
        uint32_t policer_burst;
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        struct netdev_stats stats;
        /* Protects stats */
        rte_spinlock_t stats_lock;
        /* 44 pad bytes here. */
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* The following properties cannot be changed when a device is running,
         * so we remember the request and update them next time
         * netdev_dpdk*_reconfigure() is called */
        int requested_mtu;
        int requested_n_txq;
        int requested_n_rxq;
        int requested_rxq_size;
        int requested_txq_size;

        /* Number of rx/tx descriptors for physical devices */
        int rxq_size;
        int txq_size;

        /* Socket ID detected when vHost device is brought up */
        int requested_socket_id;

        /* Denotes whether vHost port is client/server mode */
        uint64_t vhost_driver_flags;

        /* DPDK-ETH Flow control */
        struct rte_eth_fc_conf fc_conf;

        /* DPDK-ETH hardware offload features,
         * from the enum set 'dpdk_hw_ol_features' */
        uint32_t hw_ol_features;
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* Names of all XSTATS counters */
        struct rte_eth_xstat_name *rte_xstats_names;
        int rte_xstats_names_size;
        int rte_xstats_ids_size;
        uint64_t *rte_xstats_ids;
    );
};

struct netdev_rxq_dpdk {
    struct netdev_rxq up;
    dpdk_port_t port_id;
};

static void netdev_dpdk_destruct(struct netdev *netdev);
static void netdev_dpdk_vhost_destruct(struct netdev *netdev);

static void netdev_dpdk_clear_xstats(struct netdev_dpdk *dev);

int netdev_dpdk_get_vid(const struct netdev_dpdk *dev);

struct ingress_policer *
netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);

static bool
is_dpdk_class(const struct netdev_class *class)
{
    return class->destruct == netdev_dpdk_destruct
           || class->destruct == netdev_dpdk_vhost_destruct;
}

/* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
 * aligned at 1k or less. If a declared mbuf size is not a multiple of this
 * value, insufficient buffers are allocated to accommodate the packet in its
 * entirety. Furthermore, certain drivers need to ensure that there is also
 * sufficient space in the Rx buffer to accommodate two VLAN tags (for QinQ
 * frames). If the RX buffer is too small, then the driver enables scatter RX
 * behaviour, which reduces performance. To prevent this, use a buffer size
 * that is closest to 'mtu', but which satisfies the aforementioned criteria.
 */
static uint32_t
dpdk_buf_size(int mtu)
{
    return ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) + RTE_PKTMBUF_HEADROOM),
                    NETDEV_DPDK_MBUF_ALIGN);
}
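
/* As an example, assuming DPDK's default RTE_PKTMBUF_HEADROOM of 128 bytes:
 * dpdk_buf_size(1500) = ROUND_UP(1526 + 128, 1024) = 2048, and for a 9000
 * byte MTU, ROUND_UP(9026 + 128, 1024) = 9216. */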

/* Allocates an area of 'sz' bytes from DPDK.  The memory is zero'ed.
 *
 * Unlike xmalloc(), this function can return NULL on failure. */
static void *
dpdk_rte_mzalloc(size_t sz)
{
    return rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);
}

void
free_dpdk_buf(struct dp_packet *p)
{
    struct rte_mbuf *pkt = (struct rte_mbuf *) p;

    rte_pktmbuf_free(pkt);
}

static void
ovs_rte_pktmbuf_init(struct rte_mempool *mp OVS_UNUSED,
                     void *opaque_arg OVS_UNUSED,
                     void *_p,
                     unsigned i OVS_UNUSED)
{
    struct rte_mbuf *pkt = _p;

    dp_packet_init_dpdk((struct dp_packet *) pkt, pkt->buf_len);
}

static int
dpdk_mp_full(const struct rte_mempool *mp) OVS_REQUIRES(dpdk_mp_mutex)
{
    unsigned ring_count;
    /* This logic is needed because rte_mempool_full() is not guaranteed to
     * be atomic and mbufs could be moved from mempool cache --> mempool ring
     * during the call. However, as no mbufs will be taken from the mempool
     * at this time, we can work around it by also checking the ring entries
     * separately and ensuring that they have not changed.
     */
    ring_count = rte_mempool_ops_get_count(mp);
    if (rte_mempool_full(mp) && rte_mempool_ops_get_count(mp) == ring_count) {
        return 1;
    }

    return 0;
}

/* Free unused mempools. */
static void
dpdk_mp_sweep(void)
{
    struct dpdk_mp *dmp, *next;

    ovs_mutex_lock(&dpdk_mp_mutex);
    LIST_FOR_EACH_SAFE (dmp, next, list_node, &dpdk_mp_free_list) {
        if (dpdk_mp_full(dmp->mp)) {
            VLOG_DBG("Freeing mempool \"%s\"", dmp->mp->name);
            ovs_list_remove(&dmp->list_node);
            rte_mempool_free(dmp->mp);
            rte_free(dmp);
        }
    }
    ovs_mutex_unlock(&dpdk_mp_mutex);
}

/* Ensure a mempool will not be freed. */
static void
dpdk_mp_do_not_free(struct rte_mempool *mp) OVS_REQUIRES(dpdk_mp_mutex)
{
    struct dpdk_mp *dmp, *next;

    LIST_FOR_EACH_SAFE (dmp, next, list_node, &dpdk_mp_free_list) {
        if (dmp->mp == mp) {
            VLOG_DBG("Removing mempool \"%s\" from free list", dmp->mp->name);
            ovs_list_remove(&dmp->list_node);
            rte_free(dmp);
            break;
        }
    }
}

/* Returns a valid pointer when either of the following is true:
 * - a new mempool was just created;
 * - a matching mempool already exists. */
static struct rte_mempool *
dpdk_mp_create(struct netdev_dpdk *dev, int mtu)
{
    char mp_name[RTE_MEMPOOL_NAMESIZE];
    const char *netdev_name = netdev_get_name(&dev->up);
    int socket_id = dev->requested_socket_id;
    uint32_t n_mbufs;
    uint32_t hash = hash_string(netdev_name, 0);
    struct rte_mempool *mp = NULL;

    /*
     * XXX: rough estimation of number of mbufs required for this port:
     * <packets required to fill the device rxqs>
     * + <packets that could be stuck on other ports txqs>
     * + <packets in the pmd threads>
     * + <additional memory for corner cases>
     */
    n_mbufs = dev->requested_n_rxq * dev->requested_rxq_size
              + dev->requested_n_txq * dev->requested_txq_size
              + MIN(RTE_MAX_LCORE, dev->requested_n_rxq) * NETDEV_MAX_BURST
              + MIN_NB_MBUF;

    ovs_mutex_lock(&dpdk_mp_mutex);
    do {
        /* Full DPDK memory pool name must be unique and cannot be
         * longer than RTE_MEMPOOL_NAMESIZE. */
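        /* The format packs the identifying parameters into at most 25
         * characters plus the terminator: "ovs", 8 hex digits of the port
         * name hash, 2 for the socket id, 5 for the MTU and 7 for the mbuf
         * count.  For instance, hash 0x9e3b5a7c on socket 0 with MTU 1500
         * and 262144 mbufs gives "ovs9e3b5a7c00015000262144". */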
        int ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE,
                           "ovs%08x%02d%05d%07u",
                           hash, socket_id, mtu, n_mbufs);
        if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) {
            VLOG_DBG("snprintf returned %d. "
                     "Failed to generate a mempool name for \"%s\". "
                     "Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.",
                     ret, netdev_name, hash, socket_id, mtu, n_mbufs);
            break;
        }

        VLOG_DBG("Port %s: Requesting a mempool of %u mbufs "
                 "on socket %d for %d Rx and %d Tx queues.",
                 netdev_name, n_mbufs, socket_id,
                 dev->requested_n_rxq, dev->requested_n_txq);

        mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ,
                 sizeof (struct dp_packet) - sizeof (struct rte_mbuf),
                 MBUF_SIZE(mtu) - sizeof(struct dp_packet), socket_id);

        if (mp) {
            VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
                     mp_name, n_mbufs);
            /* rte_pktmbuf_pool_create has done some initialization of the
             * rte_mbuf part of each dp_packet. Some OvS specific fields
             * of the packet still need to be initialized by
             * ovs_rte_pktmbuf_init. */
            rte_mempool_obj_iter(mp, ovs_rte_pktmbuf_init, NULL);
        } else if (rte_errno == EEXIST) {
            /* A mempool with the same name already exists.  We just
             * retrieve its pointer to be returned to the caller. */
            mp = rte_mempool_lookup(mp_name);
            /* As the mempool create returned EEXIST we can expect the
             * lookup has returned a valid pointer.  If for some reason
             * that's not the case we keep track of it. */
            VLOG_DBG("A mempool with name \"%s\" already exists at %p.",
                     mp_name, mp);
            /* Ensure this reused mempool will not be freed. */
            dpdk_mp_do_not_free(mp);
        } else {
            VLOG_ERR("Failed mempool \"%s\" create request of %u mbufs",
                     mp_name, n_mbufs);
        }
    } while (!mp && rte_errno == ENOMEM && (n_mbufs /= 2) >= MIN_NB_MBUF);

    ovs_mutex_unlock(&dpdk_mp_mutex);
    return mp;
}

/* Release an existing mempool. */
static void
dpdk_mp_release(struct rte_mempool *mp)
{
    if (!mp) {
        return;
    }

    ovs_mutex_lock(&dpdk_mp_mutex);
    if (dpdk_mp_full(mp)) {
        VLOG_DBG("Freeing mempool \"%s\"", mp->name);
        rte_mempool_free(mp);
    } else {
        struct dpdk_mp *dmp;

        dmp = dpdk_rte_mzalloc(sizeof *dmp);
        if (dmp) {
            dmp->mp = mp;
            ovs_list_push_back(&dpdk_mp_free_list, &dmp->list_node);
        }
    }
    ovs_mutex_unlock(&dpdk_mp_mutex);
}
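
/* A mempool may still be referenced by in-flight mbufs, e.g. packets
 * sitting in another port's transmit queue, so dpdk_mp_release() parks it
 * on 'dpdk_mp_free_list' to be freed by a later dpdk_mp_sweep() instead of
 * calling rte_mempool_free() unconditionally. */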

/* Tries to allocate a new mempool - or re-use an existing one where
 * appropriate - on requested_socket_id with a size determined by
 * requested_mtu and requested Rx/Tx queues.
 * On success - or when re-using an existing mempool - the new configuration
 * will be applied.
 * On error, device will be left unchanged. */
static int
netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
    struct rte_mempool *mp;
    int ret = 0;

    dpdk_mp_sweep();

    mp = dpdk_mp_create(dev, FRAME_LEN_TO_MTU(buf_size));
    if (!mp) {
        VLOG_ERR("Failed to create memory pool for netdev "
                 "%s, with MTU %d on socket %d: %s\n",
                 dev->up.name, dev->requested_mtu, dev->requested_socket_id,
                 rte_strerror(rte_errno));
        ret = rte_errno;
    } else {
        /* If a new MTU was requested and its rounded value equals the one
         * that is currently used, then the existing mempool is returned. */
        if (dev->mp != mp) {
            /* A new mempool was created, release the previous one. */
            dpdk_mp_release(dev->mp);
        } else {
            ret = EEXIST;
        }
        dev->mp = mp;
        dev->mtu = dev->requested_mtu;
        dev->socket_id = dev->requested_socket_id;
        dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
    }

    return ret;
}

static void
check_link_status(struct netdev_dpdk *dev)
{
    struct rte_eth_link link;

    rte_eth_link_get_nowait(dev->port_id, &link);

    if (dev->link.link_status != link.link_status) {
        netdev_change_seq_changed(&dev->up);

        dev->link_reset_cnt++;
        dev->link = link;
        if (dev->link.link_status) {
            VLOG_DBG_RL(&rl,
                        "Port "DPDK_PORT_ID_FMT" Link Up - speed %u Mbps - %s",
                        dev->port_id, (unsigned) dev->link.link_speed,
                        (dev->link.link_duplex == ETH_LINK_FULL_DUPLEX)
                        ? "full-duplex" : "half-duplex");
        } else {
            VLOG_DBG_RL(&rl, "Port "DPDK_PORT_ID_FMT" Link Down",
                        dev->port_id);
        }
    }
}

static void *
dpdk_watchdog(void *dummy OVS_UNUSED)
{
    struct netdev_dpdk *dev;

    pthread_detach(pthread_self());

    for (;;) {
        ovs_mutex_lock(&dpdk_mutex);
        LIST_FOR_EACH (dev, list_node, &dpdk_list) {
            ovs_mutex_lock(&dev->mutex);
            if (dev->type == DPDK_DEV_ETH) {
                check_link_status(dev);
            }
            ovs_mutex_unlock(&dev->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);
        xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
    }

    return NULL;
}

static int
dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
{
    int diag = 0;
    int i;
    struct rte_eth_conf conf = port_conf;

    /* For some NICs (e.g. Niantic), scatter_rx mode needs to be explicitly
     * enabled. */
    if (dev->mtu > ETHER_MTU) {
        conf.rxmode.enable_scatter = 1;
    }

    conf.rxmode.hw_ip_checksum = (dev->hw_ol_features &
                                  NETDEV_RX_CHECKSUM_OFFLOAD) != 0;
    /* A device may report more queues than it makes available (this has
     * been observed for Intel xl710, which reserves some of them for
     * SRIOV): rte_eth_*_queue_setup will fail if a queue is not
     * available. When this happens we can retry the configuration
     * and request fewer queues */
    while (n_rxq && n_txq) {
        if (diag) {
            VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
        }

        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
        if (diag) {
            VLOG_WARN("Interface %s eth_dev setup error %s\n",
                      dev->up.name, rte_strerror(-diag));
            break;
        }

        diag = rte_eth_dev_set_mtu(dev->port_id, dev->mtu);
        if (diag) {
            VLOG_ERR("Interface %s MTU (%d) setup error: %s",
                     dev->up.name, dev->mtu, rte_strerror(-diag));
            break;
        }

        for (i = 0; i < n_txq; i++) {
            diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
                                          dev->socket_id, NULL);
            if (diag) {
                VLOG_INFO("Interface %s unable to setup txq(%d): %s",
                          dev->up.name, i, rte_strerror(-diag));
                break;
            }
        }

        if (i != n_txq) {
            /* Retry with fewer tx queues */
            n_txq = i;
            continue;
        }

        for (i = 0; i < n_rxq; i++) {
            diag = rte_eth_rx_queue_setup(dev->port_id, i, dev->rxq_size,
                                          dev->socket_id, NULL, dev->mp);
            if (diag) {
                VLOG_INFO("Interface %s unable to setup rxq(%d): %s",
                          dev->up.name, i, rte_strerror(-diag));
                break;
            }
        }

        if (i != n_rxq) {
            /* Retry with fewer rx queues */
            n_rxq = i;
            continue;
        }

        dev->up.n_rxq = n_rxq;
        dev->up.n_txq = n_txq;

        return 0;
    }

    return diag;
}

static void
dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex)
{
    if (rte_eth_dev_flow_ctrl_set(dev->port_id, &dev->fc_conf)) {
        VLOG_WARN("Failed to enable flow control on device "DPDK_PORT_ID_FMT,
                  dev->port_id);
    }
}

static int
dpdk_eth_dev_init(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    struct rte_pktmbuf_pool_private *mbp_priv;
    struct rte_eth_dev_info info;
    struct ether_addr eth_addr;
    int diag;
    int n_rxq, n_txq;
    uint32_t rx_chksm_offload_capa = DEV_RX_OFFLOAD_UDP_CKSUM |
                                     DEV_RX_OFFLOAD_TCP_CKSUM |
                                     DEV_RX_OFFLOAD_IPV4_CKSUM;

    rte_eth_dev_info_get(dev->port_id, &info);

    if ((info.rx_offload_capa & rx_chksm_offload_capa) !=
            rx_chksm_offload_capa) {
        VLOG_WARN("Rx checksum offload is not supported on port "
                  DPDK_PORT_ID_FMT, dev->port_id);
        dev->hw_ol_features &= ~NETDEV_RX_CHECKSUM_OFFLOAD;
    } else {
        dev->hw_ol_features |= NETDEV_RX_CHECKSUM_OFFLOAD;
    }

    n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
    n_txq = MIN(info.max_tx_queues, dev->up.n_txq);

    diag = dpdk_eth_dev_queue_setup(dev, n_rxq, n_txq);
    if (diag) {
        VLOG_ERR("Interface %s(rxq:%d txq:%d) configure error: %s",
                 dev->up.name, n_rxq, n_txq, rte_strerror(-diag));
        return -diag;
    }

    diag = rte_eth_dev_start(dev->port_id);
    if (diag) {
        VLOG_ERR("Interface %s start error: %s", dev->up.name,
                 rte_strerror(-diag));
        return -diag;
    }

    rte_eth_promiscuous_enable(dev->port_id);
    rte_eth_allmulticast_enable(dev->port_id);

    memset(&eth_addr, 0x0, sizeof(eth_addr));
    rte_eth_macaddr_get(dev->port_id, &eth_addr);
    VLOG_INFO_RL(&rl, "Port "DPDK_PORT_ID_FMT": "ETH_ADDR_FMT,
                 dev->port_id, ETH_ADDR_BYTES_ARGS(eth_addr.addr_bytes));

    memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
    rte_eth_link_get_nowait(dev->port_id, &dev->link);

    mbp_priv = rte_mempool_get_priv(dev->mp);
    dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;

    /* Get the Flow control configuration for DPDK-ETH */
    diag = rte_eth_dev_flow_ctrl_get(dev->port_id, &dev->fc_conf);
    if (diag) {
        VLOG_DBG("cannot get flow control parameters on port "DPDK_PORT_ID_FMT
                 ", err=%d", dev->port_id, diag);
    }

    return 0;
}

static struct netdev_dpdk *
netdev_dpdk_cast(const struct netdev *netdev)
{
    return CONTAINER_OF(netdev, struct netdev_dpdk, up);
}

static struct netdev *
netdev_dpdk_alloc(void)
{
    struct netdev_dpdk *dev;

    dev = dpdk_rte_mzalloc(sizeof *dev);
    if (dev) {
        return &dev->up;
    }

    return NULL;
}

static struct dpdk_tx_queue *
netdev_dpdk_alloc_txq(unsigned int n_txqs)
{
    struct dpdk_tx_queue *txqs;
    unsigned i;

    txqs = dpdk_rte_mzalloc(n_txqs * sizeof *txqs);
    if (txqs) {
        for (i = 0; i < n_txqs; i++) {
            /* Initialize map for vhost devices. */
            txqs[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
            rte_spinlock_init(&txqs[i].tx_lock);
        }
    }

    return txqs;
}

static int
common_construct(struct netdev *netdev, dpdk_port_t port_no,
                 enum dpdk_dev_type type, int socket_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_init(&dev->mutex);

    rte_spinlock_init(&dev->stats_lock);

    /* If the 'sid' is negative, it means that the kernel fails
     * to obtain the pci numa info.  In that situation, always
     * use 'SOCKET0'. */
    dev->socket_id = socket_id < 0 ? SOCKET0 : socket_id;
    dev->requested_socket_id = dev->socket_id;
    dev->port_id = port_no;
    dev->type = type;
    dev->flags = 0;
    dev->requested_mtu = ETHER_MTU;
    dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
    ovsrcu_index_init(&dev->vid, -1);
    dev->vhost_reconfigured = false;
    dev->attached = false;

    ovsrcu_init(&dev->qos_conf, NULL);

    ovsrcu_init(&dev->ingress_policer, NULL);
    dev->policer_rate = 0;
    dev->policer_burst = 0;

    netdev->n_rxq = 0;
    netdev->n_txq = 0;
    dev->requested_n_rxq = NR_QUEUE;
    dev->requested_n_txq = NR_QUEUE;
    dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
    dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;

    /* Initialize the flow control to NULL */
    memset(&dev->fc_conf, 0, sizeof dev->fc_conf);

    /* Initialize the hardware offload flags to 0 */
    dev->hw_ol_features = 0;

    dev->flags = NETDEV_UP | NETDEV_PROMISC;

    ovs_list_push_back(&dpdk_list, &dev->list_node);

    netdev_request_reconfigure(netdev);

    dev->rte_xstats_names = NULL;
    dev->rte_xstats_names_size = 0;

    dev->rte_xstats_ids = NULL;
    dev->rte_xstats_ids_size = 0;

    return 0;
}

/* dev_name must be the prefix followed by a positive decimal number.
 * (no leading + or - signs are allowed) */
static int
dpdk_dev_parse_name(const char dev_name[], const char prefix[],
                    unsigned int *port_no)
{
    const char *cport;

    if (strncmp(dev_name, prefix, strlen(prefix))) {
        return ENODEV;
    }

    cport = dev_name + strlen(prefix);

    if (str_to_uint(cport, 10, port_no)) {
        return 0;
    } else {
        return ENODEV;
    }
}
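
/* For example, with prefix "dpdkr", the name "dpdkr3" parses to port
 * number 3, while "dpdkr-3", "dpdkr+3" or "ring3" all fail with ENODEV. */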

static int
vhost_common_construct(struct netdev *netdev)
    OVS_REQUIRES(dpdk_mutex)
{
    int socket_id = rte_lcore_to_socket_id(rte_get_master_lcore());
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
    if (!dev->tx_q) {
        return ENOMEM;
    }

    return common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
                            DPDK_DEV_VHOST, socket_id);
}

static int
netdev_dpdk_vhost_construct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const char *name = netdev->name;
    int err;

    /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
     * the file system. '/' or '\' would traverse directories, so they're not
     * acceptable in 'name'. */
    if (strchr(name, '/') || strchr(name, '\\')) {
        VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
                 "A valid name must not include '/' or '\\'",
                 name);
        return EINVAL;
    }

    ovs_mutex_lock(&dpdk_mutex);
    /* Take the name of the vhost-user port and append it to the location
     * where the socket is to be created, then register the socket. */
    snprintf(dev->vhost_id, sizeof dev->vhost_id, "%s/%s",
             dpdk_get_vhost_sock_dir(), name);

    dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
    err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags);
    if (err) {
        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
                 dev->vhost_id);
        goto out;
    } else {
        fatal_signal_add_file_to_unlink(dev->vhost_id);
        VLOG_INFO("Socket %s created for vhost-user port %s\n",
                  dev->vhost_id, name);
    }

    err = rte_vhost_driver_callback_register(dev->vhost_id,
                                             &virtio_net_device_ops);
    if (err) {
        VLOG_ERR("rte_vhost_driver_callback_register failed for vhost user "
                 "port: %s\n", name);
        goto out;
    }

    err = rte_vhost_driver_disable_features(dev->vhost_id,
                                1ULL << VIRTIO_NET_F_HOST_TSO4
                                | 1ULL << VIRTIO_NET_F_HOST_TSO6
                                | 1ULL << VIRTIO_NET_F_CSUM);
    if (err) {
        VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
                 "port: %s\n", name);
        goto out;
    }

    err = rte_vhost_driver_start(dev->vhost_id);
    if (err) {
        VLOG_ERR("rte_vhost_driver_start failed for vhost user "
                 "port: %s\n", name);
        goto out;
    }

    err = vhost_common_construct(netdev);
    if (err) {
        VLOG_ERR("vhost_common_construct failed for vhost user "
                 "port: %s\n", name);
    }

out:
    ovs_mutex_unlock(&dpdk_mutex);
    VLOG_WARN_ONCE("dpdkvhostuser ports are considered deprecated; "
                   "please migrate to dpdkvhostuserclient ports.");
    return err;
}

static int
netdev_dpdk_vhost_client_construct(struct netdev *netdev)
{
    int err;

    ovs_mutex_lock(&dpdk_mutex);
    err = vhost_common_construct(netdev);
    if (err) {
        VLOG_ERR("vhost_common_construct failed for vhost user client "
                 "port: %s\n", netdev->name);
    }
    ovs_mutex_unlock(&dpdk_mutex);
    return err;
}

static int
netdev_dpdk_construct(struct netdev *netdev)
{
    int err;

    ovs_mutex_lock(&dpdk_mutex);
    err = common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
                           DPDK_DEV_ETH, SOCKET0);
    ovs_mutex_unlock(&dpdk_mutex);
    return err;
}

static void
common_destruct(struct netdev_dpdk *dev)
    OVS_REQUIRES(dpdk_mutex)
    OVS_EXCLUDED(dev->mutex)
{
    rte_free(dev->tx_q);
    dpdk_mp_release(dev->mp);

    ovs_list_remove(&dev->list_node);
    free(ovsrcu_get_protected(struct ingress_policer *,
                              &dev->ingress_policer));
    ovs_mutex_destroy(&dev->mutex);
}

static void
netdev_dpdk_destruct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    char devname[RTE_ETH_NAME_MAX_LEN];

    ovs_mutex_lock(&dpdk_mutex);

    rte_eth_dev_stop(dev->port_id);

    if (dev->attached) {
        rte_eth_dev_close(dev->port_id);
        if (rte_eth_dev_detach(dev->port_id, devname) < 0) {
            VLOG_ERR("Device '%s' cannot be detached", dev->devargs);
        } else {
            VLOG_INFO("Device '%s' has been detached", devname);
        }
    }

    netdev_dpdk_clear_xstats(dev);
    free(dev->devargs);
    common_destruct(dev);

    ovs_mutex_unlock(&dpdk_mutex);
}

/* rte_vhost_driver_unregister() can call back destroy_device(), which will
 * try to acquire 'dpdk_mutex' and possibly 'dev->mutex'. To avoid a
 * deadlock, none of the mutexes must be held while calling this function. */
static int
dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
                             char *vhost_id)
    OVS_EXCLUDED(dpdk_mutex)
    OVS_EXCLUDED(dev->mutex)
{
    return rte_vhost_driver_unregister(vhost_id);
}

static void
netdev_dpdk_vhost_destruct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    char *vhost_id;

    ovs_mutex_lock(&dpdk_mutex);

    /* Guest becomes an orphan if still attached. */
    if (netdev_dpdk_get_vid(dev) >= 0
        && !(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
        VLOG_ERR("Removing port '%s' while vhost device still attached.",
                 netdev->name);
        VLOG_ERR("To restore connectivity after re-adding of port, VM on "
                 "socket '%s' must be restarted.", dev->vhost_id);
    }

    vhost_id = xstrdup(dev->vhost_id);

    common_destruct(dev);

    ovs_mutex_unlock(&dpdk_mutex);

    if (!vhost_id[0]) {
        goto out;
    }

    if (dpdk_vhost_driver_unregister(dev, vhost_id)) {
        VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
                 netdev->name, vhost_id);
    } else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
        /* OVS server mode - remove this socket from list for deletion */
        fatal_signal_remove_file_to_unlink(vhost_id);
    }
out:
    free(vhost_id);
}

static void
netdev_dpdk_dealloc(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    rte_free(dev);
}

static void
netdev_dpdk_clear_xstats(struct netdev_dpdk *dev)
{
    /* If statistics are already allocated, we have to
     * reconfigure, as port_id could have been changed. */
    if (dev->rte_xstats_names) {
        free(dev->rte_xstats_names);
        dev->rte_xstats_names = NULL;
        dev->rte_xstats_names_size = 0;
    }
    if (dev->rte_xstats_ids) {
        free(dev->rte_xstats_ids);
        dev->rte_xstats_ids = NULL;
        dev->rte_xstats_ids_size = 0;
    }
}

static const char*
netdev_dpdk_get_xstat_name(struct netdev_dpdk *dev, uint64_t id)
{
    if (id >= dev->rte_xstats_names_size) {
        return "UNKNOWN";
    }
    return dev->rte_xstats_names[id].name;
}

static bool
netdev_dpdk_configure_xstats(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int rte_xstats_len;
    bool ret;
    struct rte_eth_xstat *rte_xstats;
    uint64_t id;
    int xstats_no;
    const char *name;

    /* Retrieves all XSTATS names. If something goes wrong or the number of
     * counters is 0, the rte_xstats_names buffer will be marked as NULL,
     * and any further xstats query won't be performed (e.g. during
     * netdev_dpdk_get_stats execution). */

    ret = false;
    rte_xstats = NULL;

    if (dev->rte_xstats_names == NULL || dev->rte_xstats_ids == NULL) {
        dev->rte_xstats_names_size =
                rte_eth_xstats_get_names(dev->port_id, NULL, 0);

        if (dev->rte_xstats_names_size < 0) {
            VLOG_WARN("Cannot get XSTATS for port: "DPDK_PORT_ID_FMT,
                      dev->port_id);
            dev->rte_xstats_names_size = 0;
        } else {
            /* Reserve memory for xstats names and values */
            dev->rte_xstats_names = xcalloc(dev->rte_xstats_names_size,
                                            sizeof *dev->rte_xstats_names);

            if (dev->rte_xstats_names) {
                /* Retrieve xstats names */
                rte_xstats_len =
                        rte_eth_xstats_get_names(dev->port_id,
                                                 dev->rte_xstats_names,
                                                 dev->rte_xstats_names_size);

                if (rte_xstats_len < 0) {
                    VLOG_WARN("Cannot get XSTATS names for port: "
                              DPDK_PORT_ID_FMT, dev->port_id);
                    goto out;
                } else if (rte_xstats_len != dev->rte_xstats_names_size) {
                    VLOG_WARN("XSTATS size doesn't match for port: "
                              DPDK_PORT_ID_FMT, dev->port_id);
                    goto out;
                }

                dev->rte_xstats_ids = xcalloc(dev->rte_xstats_names_size,
                                              sizeof(uint64_t));

                /* We have to calculate number of counters */
                rte_xstats = xmalloc(rte_xstats_len * sizeof *rte_xstats);
                memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);

                /* Retrieve xstats values */
                if (rte_eth_xstats_get(dev->port_id, rte_xstats,
                                       rte_xstats_len) > 0) {
                    dev->rte_xstats_ids_size = 0;
                    xstats_no = 0;
                    for (uint32_t i = 0; i < rte_xstats_len; i++) {
                        id = rte_xstats[i].id;
                        name = netdev_dpdk_get_xstat_name(dev, id);
                        /* We need to filter out everything except
                         * dropped, error and management counters */
                        if (string_ends_with(name, "_errors") ||
                            strstr(name, "_management_") ||
                            string_ends_with(name, "_dropped")) {

                            dev->rte_xstats_ids[xstats_no] = id;
                            xstats_no++;
                        }
                    }
                    dev->rte_xstats_ids_size = xstats_no;
                    ret = true;
                } else {
                    VLOG_WARN("Can't get XSTATS IDs for port: "
                              DPDK_PORT_ID_FMT, dev->port_id);
                }

                free(rte_xstats);
            }
        }
    } else {
        /* Already configured */
        ret = true;
    }

out:
    if (!ret) {
        netdev_dpdk_clear_xstats(dev);
    }
    return ret;
}

static int
netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    smap_add_format(args, "requested_rx_queues", "%d", dev->requested_n_rxq);
    smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq);
    smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq);
    smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq);
    smap_add_format(args, "mtu", "%d", dev->mtu);

    if (dev->type == DPDK_DEV_ETH) {
        smap_add_format(args, "requested_rxq_descriptors", "%d",
                        dev->requested_rxq_size);
        smap_add_format(args, "configured_rxq_descriptors", "%d",
                        dev->rxq_size);
        smap_add_format(args, "requested_txq_descriptors", "%d",
                        dev->requested_txq_size);
        smap_add_format(args, "configured_txq_descriptors", "%d",
                        dev->txq_size);
        if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
            smap_add(args, "rx_csum_offload", "true");
        } else {
            smap_add(args, "rx_csum_offload", "false");
        }
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static struct netdev_dpdk *
netdev_dpdk_lookup_by_port_id(dpdk_port_t port_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *dev;

    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        if (dev->port_id == port_id) {
            return dev;
        }
    }

    return NULL;
}

static dpdk_port_t
netdev_dpdk_get_port_by_mac(const char *mac_str)
{
    dpdk_port_t port_id;
    struct eth_addr mac, port_mac;

    if (!eth_addr_from_string(mac_str, &mac)) {
        VLOG_ERR("invalid mac: %s", mac_str);
        return DPDK_ETH_PORT_ID_INVALID;
    }

    RTE_ETH_FOREACH_DEV (port_id) {
        struct ether_addr ea;

        rte_eth_macaddr_get(port_id, &ea);
        memcpy(port_mac.ea, ea.addr_bytes, ETH_ADDR_LEN);
        if (eth_addr_equals(mac, port_mac)) {
            return port_id;
        }
    }

    return DPDK_ETH_PORT_ID_INVALID;
}

/*
 * Normally, a PCI id is enough for identifying a specific DPDK port.
 * However, some NICs have multiple ports sharing the same PCI id, and
 * for those the PCI id alone won't work.
 *
 * To fix that, here one more method is introduced: "class=eth,mac=$MAC".
 *
 * Note that compatibility is fully kept: the user can still use the
 * PCI id for adding ports (when it's enough for them).
 */
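/* For example (with a hypothetical port named p0):
 *
 *   ovs-vsctl set Interface p0 \
 *       options:dpdk-devargs="class=eth,mac=00:11:22:33:44:55"
 *
 * selects the DPDK port whose MAC address matches, whatever its PCI id. */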
static dpdk_port_t
netdev_dpdk_process_devargs(struct netdev_dpdk *dev,
                            const char *devargs, char **errp)
{
    char *name;
    dpdk_port_t new_port_id = DPDK_ETH_PORT_ID_INVALID;

    if (strncmp(devargs, "class=eth,mac=", 14) == 0) {
        new_port_id = netdev_dpdk_get_port_by_mac(&devargs[14]);
    } else {
        name = xmemdup0(devargs, strcspn(devargs, ","));
        if (rte_eth_dev_get_port_by_name(name, &new_port_id)
                || !rte_eth_dev_is_valid_port(new_port_id)) {
            /* Device not found in DPDK, attempt to attach it */
            if (!rte_eth_dev_attach(devargs, &new_port_id)) {
                /* Attach successful */
                dev->attached = true;
                VLOG_INFO("Device '%s' attached to DPDK", devargs);
            } else {
                /* Attach unsuccessful */
                new_port_id = DPDK_ETH_PORT_ID_INVALID;
            }
        }
        free(name);
    }

    if (new_port_id == DPDK_ETH_PORT_ID_INVALID) {
        VLOG_WARN_BUF(errp, "Error attaching device '%s' to DPDK", devargs);
    }

    return new_port_id;
}

static void
dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args)
    OVS_REQUIRES(dev->mutex)
{
    int new_n_rxq;

    new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
    if (new_n_rxq != dev->requested_n_rxq) {
        dev->requested_n_rxq = new_n_rxq;
        netdev_request_reconfigure(&dev->up);
    }
}

static void
dpdk_process_queue_size(struct netdev *netdev, const struct smap *args,
                        const char *flag, int default_size, int *new_size)
{
    int queue_size = smap_get_int(args, flag, default_size);

    if (queue_size <= 0 || queue_size > NIC_PORT_MAX_Q_SIZE
            || !is_pow2(queue_size)) {
        queue_size = default_size;
    }

    if (queue_size != *new_size) {
        *new_size = queue_size;
        netdev_request_reconfigure(netdev);
    }
}
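
/* For instance, n_rxq_desc=1024 is accepted as-is (positive, a power of
 * two, and no larger than NIC_PORT_MAX_Q_SIZE), whereas 1000 (not a power
 * of two) or 8192 (too large) silently fall back to the 2048 default. */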

static int
netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
                       char **errp)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    bool rx_fc_en, tx_fc_en, autoneg;
    enum rte_eth_fc_mode fc_mode;
    static const enum rte_eth_fc_mode fc_mode_set[2][2] = {
        {RTE_FC_NONE,     RTE_FC_TX_PAUSE},
        {RTE_FC_RX_PAUSE, RTE_FC_FULL    }
    };
    const char *new_devargs;
    int err = 0;

    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);

    dpdk_set_rxq_config(dev, args);

    dpdk_process_queue_size(netdev, args, "n_rxq_desc",
                            NIC_PORT_DEFAULT_RXQ_SIZE,
                            &dev->requested_rxq_size);
    dpdk_process_queue_size(netdev, args, "n_txq_desc",
                            NIC_PORT_DEFAULT_TXQ_SIZE,
                            &dev->requested_txq_size);

    new_devargs = smap_get(args, "dpdk-devargs");

    if (dev->devargs && strcmp(new_devargs, dev->devargs)) {
        /* The user requested a new device. If we return error, the caller
         * will delete this netdev and try to recreate it. */
        err = EAGAIN;
        goto out;
    }

    /* dpdk-devargs is required for device configuration */
    if (new_devargs && new_devargs[0]) {
        /* Don't process dpdk-devargs if value is unchanged and port id
         * is valid */
        if (!(dev->devargs && !strcmp(dev->devargs, new_devargs)
               && rte_eth_dev_is_valid_port(dev->port_id))) {
            dpdk_port_t new_port_id = netdev_dpdk_process_devargs(dev,
                                                                  new_devargs,
                                                                  errp);
            if (!rte_eth_dev_is_valid_port(new_port_id)) {
                err = EINVAL;
            } else if (new_port_id == dev->port_id) {
                /* Already configured, do not reconfigure again */
                err = 0;
            } else {
                struct netdev_dpdk *dup_dev;

                dup_dev = netdev_dpdk_lookup_by_port_id(new_port_id);
                if (dup_dev) {
                    VLOG_WARN_BUF(errp, "'%s' is trying to use device '%s' "
                                        "which is already in use by '%s'",
                                  netdev_get_name(netdev), new_devargs,
                                  netdev_get_name(&dup_dev->up));
                    err = EADDRINUSE;
                } else {
                    int sid = rte_eth_dev_socket_id(new_port_id);

                    dev->requested_socket_id = sid < 0 ? SOCKET0 : sid;
                    dev->devargs = xstrdup(new_devargs);
                    dev->port_id = new_port_id;
                    netdev_request_reconfigure(&dev->up);
                    netdev_dpdk_clear_xstats(dev);
                    err = 0;
                }
            }
        }
    } else {
        VLOG_WARN_BUF(errp, "'%s' is missing 'options:dpdk-devargs'. "
                            "The old 'dpdk<port_id>' names are not supported",
                      netdev_get_name(netdev));
        err = EINVAL;
    }

    if (err) {
        goto out;
    }

    rx_fc_en = smap_get_bool(args, "rx-flow-ctrl", false);
    tx_fc_en = smap_get_bool(args, "tx-flow-ctrl", false);
    autoneg = smap_get_bool(args, "flow-ctrl-autoneg", false);

    fc_mode = fc_mode_set[tx_fc_en][rx_fc_en];
    if (dev->fc_conf.mode != fc_mode || autoneg != dev->fc_conf.autoneg) {
        dev->fc_conf.mode = fc_mode;
        dev->fc_conf.autoneg = autoneg;
        dpdk_eth_flow_ctrl_setup(dev);
    }

out:
    ovs_mutex_unlock(&dev->mutex);
    ovs_mutex_unlock(&dpdk_mutex);

    return err;
}

static int
netdev_dpdk_ring_set_config(struct netdev *netdev, const struct smap *args,
                            char **errp OVS_UNUSED)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    dpdk_set_rxq_config(dev, args);
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_vhost_client_set_config(struct netdev *netdev,
                                    const struct smap *args,
                                    char **errp OVS_UNUSED)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const char *path;

    ovs_mutex_lock(&dev->mutex);
    if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
        path = smap_get(args, "vhost-server-path");
        if (path && strcmp(path, dev->vhost_id)) {
            strcpy(dev->vhost_id, path);
            /* check zero copy configuration */
            if (smap_get_bool(args, "dq-zero-copy", false)) {
                dev->vhost_driver_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
            } else {
                dev->vhost_driver_flags &= ~RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
            }
            netdev_request_reconfigure(netdev);
        }
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

7dec44fe 1661static int
d46285a2 1662netdev_dpdk_get_numa_id(const struct netdev *netdev)
7dec44fe 1663{
d46285a2 1664 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
7dec44fe 1665
d46285a2 1666 return dev->socket_id;
7dec44fe
AW
1667}
1668
050c60bf 1669/* Sets the number of tx queues for the dpdk interface. */
5496878c 1670static int
050c60bf 1671netdev_dpdk_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
5496878c 1672{
d46285a2 1673 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
5496878c 1674
d46285a2 1675 ovs_mutex_lock(&dev->mutex);
91968eb0 1676
050c60bf
DDP
1677 if (dev->requested_n_txq == n_txq) {
1678 goto out;
4573fbd3
FL
1679 }
1680
050c60bf
DDP
1681 dev->requested_n_txq = n_txq;
1682 netdev_request_reconfigure(netdev);
58397e6c 1683
050c60bf 1684out:
d46285a2 1685 ovs_mutex_unlock(&dev->mutex);
050c60bf 1686 return 0;
58397e6c
KT
1687}
1688
8a9562d2
PS
1689static struct netdev_rxq *
1690netdev_dpdk_rxq_alloc(void)
1691{
1692 struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);
1693
eff23640
DDP
1694 if (rx) {
1695 return &rx->up;
1696 }
1697
1698 return NULL;
8a9562d2
PS
1699}
1700
1701static struct netdev_rxq_dpdk *
d46285a2 1702netdev_rxq_dpdk_cast(const struct netdev_rxq *rxq)
8a9562d2 1703{
d46285a2 1704 return CONTAINER_OF(rxq, struct netdev_rxq_dpdk, up);
8a9562d2
PS
1705}
1706
1707static int
d46285a2 1708netdev_dpdk_rxq_construct(struct netdev_rxq *rxq)
8a9562d2 1709{
d46285a2
DDP
1710 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
1711 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
8a9562d2 1712
d46285a2
DDP
1713 ovs_mutex_lock(&dev->mutex);
1714 rx->port_id = dev->port_id;
1715 ovs_mutex_unlock(&dev->mutex);
8a9562d2
PS
1716
1717 return 0;
1718}
1719
1720static void
d46285a2 1721netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq OVS_UNUSED)
8a9562d2
PS
1722{
1723}
1724
1725static void
d46285a2 1726netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
8a9562d2 1727{
d46285a2 1728 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
8a9562d2
PS
1729
1730 rte_free(rx);
1731}
1732
819f13bd
DDP
1733/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes ownership of
1734 * 'pkts', even in case of failure.
1735 *
1736 * Returns the number of packets that weren't transmitted. */
1737static inline int
b59cc14e 1738netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
819f13bd 1739 struct rte_mbuf **pkts, int cnt)
8a9562d2 1740{
1304f1f8
DDP
1741 uint32_t nb_tx = 0;
1742
b59cc14e 1743 while (nb_tx != cnt) {
1304f1f8
DDP
1744 uint32_t ret;
1745
b59cc14e 1746 ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx, cnt - nb_tx);
1304f1f8
DDP
1747 if (!ret) {
1748 break;
1749 }
1750
1751 nb_tx += ret;
1752 }
8a9562d2 1753
b59cc14e 1754 if (OVS_UNLIKELY(nb_tx != cnt)) {
819f13bd 1755 /* Free the buffers that we couldn't transmit, one at a time (each
db73f716
DDP
1756 * packet could come from a different mempool). */
1757 int i;
1758
b59cc14e
IM
1759 for (i = nb_tx; i < cnt; i++) {
1760 rte_pktmbuf_free(pkts[i]);
db73f716 1761 }
8a9562d2 1762 }
819f13bd
DDP
1763
1764 return cnt - nb_tx;
8a9562d2
PS
1765}
1766
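/* Illustrative sketch (hypothetical helper, not part of this file): a
 * caller of netdev_dpdk_eth_tx_burst() only needs to account for the
 * returned count of untransmitted packets, because the function frees
 * those mbufs itself. This mirrors the accounting pattern used by
 * netdev_dpdk_send__() further below. */
static void
example_eth_tx_and_count_drops(struct netdev_dpdk *dev, int qid,
                               struct rte_mbuf **pkts, int cnt)
{
    int dropped = netdev_dpdk_eth_tx_burst(dev, qid, pkts, cnt);

    if (OVS_UNLIKELY(dropped)) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped += dropped;
        rte_spinlock_unlock(&dev->stats_lock);
    }
}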
f3926f29
IS
1767static inline bool
1768netdev_dpdk_policer_pkt_handle(struct rte_meter_srtcm *meter,
1769 struct rte_mbuf *pkt, uint64_t time)
1770{
1771 uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct ether_hdr);
1772
1773 return rte_meter_srtcm_color_blind_check(meter, time, pkt_len) ==
1774 e_RTE_METER_GREEN;
1775}
1776
1777static int
1778netdev_dpdk_policer_run(struct rte_meter_srtcm *meter,
3e90f7d7
GZ
1779 struct rte_mbuf **pkts, int pkt_cnt,
1780 bool may_steal)
f3926f29
IS
1781{
1782 int i = 0;
1783 int cnt = 0;
1784 struct rte_mbuf *pkt = NULL;
1785 uint64_t current_time = rte_rdtsc();
1786
1787 for (i = 0; i < pkt_cnt; i++) {
1788 pkt = pkts[i];
1789 /* Handle current packet */
1790 if (netdev_dpdk_policer_pkt_handle(meter, pkt, current_time)) {
1791 if (cnt != i) {
1792 pkts[cnt] = pkt;
1793 }
1794 cnt++;
1795 } else {
3e90f7d7
GZ
1796 if (may_steal) {
1797 rte_pktmbuf_free(pkt);
1798 }
f3926f29
IS
1799 }
1800 }
1801
1802 return cnt;
1803}
1804
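/* Worked example of the in-place compaction above: with
 * pkts = {A, B, C, D}, where B and D exceed the srTCM budget, the array
 * becomes {A, C, ., .} and 2 is returned; B and D are freed only when
 * 'may_steal' is true. netdev_dpdk_filter_packet_len() below uses the
 * same pattern. */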
9509913a
IS
1805static int
1806ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,
3e90f7d7 1807 int pkt_cnt, bool may_steal)
9509913a
IS
1808{
1809 int cnt = 0;
1810
1811 rte_spinlock_lock(&policer->policer_lock);
3e90f7d7
GZ
1812 cnt = netdev_dpdk_policer_run(&policer->in_policer, pkts,
1813 pkt_cnt, may_steal);
9509913a
IS
1814 rte_spinlock_unlock(&policer->policer_lock);
1815
1816 return cnt;
1817}
1818
58397e6c 1819static bool
0a0f39df 1820is_vhost_running(struct netdev_dpdk *dev)
58397e6c 1821{
0a0f39df 1822 return (netdev_dpdk_get_vid(dev) >= 0 && dev->vhost_reconfigured);
58397e6c
KT
1823}
1824
d6e3feb5 1825static inline void
1826netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
1827 unsigned int packet_size)
1828{
1829 /* Hard-coded search for the size bucket. */
1830 if (packet_size < 256) {
1831 if (packet_size >= 128) {
1832 stats->rx_128_to_255_packets++;
1833 } else if (packet_size <= 64) {
1834 stats->rx_1_to_64_packets++;
1835 } else {
1836 stats->rx_65_to_127_packets++;
1837 }
1838 } else {
1839 if (packet_size >= 1523) {
1840 stats->rx_1523_to_max_packets++;
1841 } else if (packet_size >= 1024) {
1842 stats->rx_1024_to_1522_packets++;
1843 } else if (packet_size < 512) {
1844 stats->rx_256_to_511_packets++;
1845 } else {
1846 stats->rx_512_to_1023_packets++;
1847 }
1848 }
1849}
1850
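/* For example: a 64-byte packet increments rx_1_to_64_packets, a
 * 100-byte packet rx_65_to_127_packets, and a 1600-byte packet
 * rx_1523_to_max_packets. */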
9e3ddd45
TP
1851static inline void
1852netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
9509913a
IS
1853 struct dp_packet **packets, int count,
1854 int dropped)
9e3ddd45
TP
1855{
1856 int i;
d6e3feb5 1857 unsigned int packet_size;
9e3ddd45
TP
1858 struct dp_packet *packet;
1859
1860 stats->rx_packets += count;
9509913a 1861 stats->rx_dropped += dropped;
9e3ddd45
TP
1862 for (i = 0; i < count; i++) {
1863 packet = packets[i];
d6e3feb5 1864 packet_size = dp_packet_size(packet);
9e3ddd45 1865
d6e3feb5 1866 if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) {
9e3ddd45
TP
1867 /* This only protects the following multicast counting from
1868 * too short packets, but it does not stop the packet from
1869 * further processing. */
1870 stats->rx_errors++;
1871 stats->rx_length_errors++;
1872 continue;
1873 }
1874
d6e3feb5 1875 netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);
1876
9e3ddd45
TP
1877 struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);
1878 if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {
1879 stats->multicast++;
1880 }
1881
d6e3feb5 1882 stats->rx_bytes += packet_size;
9e3ddd45
TP
1883 }
1884}
1885
58397e6c
KT
1886/*
1887 * The receive path for the vhost port is the TX path out from guest.
1888 */
1889static int
d46285a2 1890netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
64839cf4 1891 struct dp_packet_batch *batch)
58397e6c 1892{
d46285a2 1893 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
9509913a 1894 struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
58397e6c 1895 uint16_t nb_rx = 0;
9509913a 1896 uint16_t dropped = 0;
daf22bf7
IM
1897 int qid = rxq->queue_id;
1898 int vid = netdev_dpdk_get_vid(dev);
58397e6c 1899
daf22bf7 1900 if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
e543851d 1901 || !(dev->flags & NETDEV_UP))) {
58397e6c
KT
1902 return EAGAIN;
1903 }
1904
daf22bf7 1905 nb_rx = rte_vhost_dequeue_burst(vid, qid * VIRTIO_QNUM + VIRTIO_TXQ,
24e78f93 1906 dev->mp,
64839cf4 1907 (struct rte_mbuf **) batch->packets,
cd159f1a 1908 NETDEV_MAX_BURST);
58397e6c
KT
1909 if (!nb_rx) {
1910 return EAGAIN;
1911 }
1912
9509913a
IS
1913 if (policer) {
1914 dropped = nb_rx;
64839cf4
WT
1915 nb_rx = ingress_policer_run(policer,
1916 (struct rte_mbuf **) batch->packets,
3e90f7d7 1917 nb_rx, true);
9509913a
IS
1918 dropped -= nb_rx;
1919 }
1920
d46285a2 1921 rte_spinlock_lock(&dev->stats_lock);
64839cf4
WT
1922 netdev_dpdk_vhost_update_rx_counters(&dev->stats, batch->packets,
1923 nb_rx, dropped);
d46285a2 1924 rte_spinlock_unlock(&dev->stats_lock);
45d947c4 1925
75fb9148
ZB
1926 batch->count = nb_rx;
1927 dp_packet_batch_init_packet_fields(batch);
1928
58397e6c
KT
1929 return 0;
1930}
1931
8a9562d2 1932static int
64839cf4 1933netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch)
8a9562d2 1934{
d46285a2
DDP
1935 struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
1936 struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
9509913a 1937 struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
8a9562d2 1938 int nb_rx;
9509913a 1939 int dropped = 0;
8a9562d2 1940
3b1fb077
DDP
1941 if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
1942 return EAGAIN;
1943 }
1944
d46285a2 1945 nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
64839cf4 1946 (struct rte_mbuf **) batch->packets,
cd159f1a 1947 NETDEV_MAX_BURST);
8a9562d2
PS
1948 if (!nb_rx) {
1949 return EAGAIN;
1950 }
1951
9509913a
IS
1952 if (policer) {
1953 dropped = nb_rx;
64839cf4 1954 nb_rx = ingress_policer_run(policer,
58be5c0e 1955 (struct rte_mbuf **) batch->packets,
3e90f7d7 1956 nb_rx, true);
9509913a
IS
1957 dropped -= nb_rx;
1958 }
1959
1960 /* Update stats to reflect dropped packets */
1961 if (OVS_UNLIKELY(dropped)) {
1962 rte_spinlock_lock(&dev->stats_lock);
1963 dev->stats.rx_dropped += dropped;
1964 rte_spinlock_unlock(&dev->stats_lock);
1965 }
1966
64839cf4 1967 batch->count = nb_rx;
75fb9148 1968 dp_packet_batch_init_packet_fields(batch);
8a9562d2
PS
1969
1970 return 0;
1971}
1972
0bf765f7 1973static inline int
78bd47cf 1974netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
3e90f7d7 1975 int cnt, bool may_steal)
0bf765f7 1976{
78bd47cf 1977 struct qos_conf *qos_conf = ovsrcu_get(struct qos_conf *, &dev->qos_conf);
0bf765f7 1978
78bd47cf
DDP
1979 if (qos_conf) {
1980 rte_spinlock_lock(&qos_conf->lock);
3e90f7d7 1981 cnt = qos_conf->ops->qos_run(qos_conf, pkts, cnt, may_steal);
78bd47cf 1982 rte_spinlock_unlock(&qos_conf->lock);
0bf765f7
IS
1983 }
1984
1985 return cnt;
1986}
1987
c6ec9d17
IM
1988static int
1989netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
1990 int pkt_cnt)
1991{
1992 int i = 0;
1993 int cnt = 0;
1994 struct rte_mbuf *pkt;
1995
1996 for (i = 0; i < pkt_cnt; i++) {
1997 pkt = pkts[i];
1998 if (OVS_UNLIKELY(pkt->pkt_len > dev->max_packet_len)) {
1999 VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " max_packet_len %d",
2000 dev->up.name, pkt->pkt_len, dev->max_packet_len);
2001 rte_pktmbuf_free(pkt);
2002 continue;
2003 }
2004
2005 if (OVS_UNLIKELY(i != cnt)) {
2006 pkts[cnt] = pkt;
2007 }
2008 cnt++;
2009 }
2010
2011 return cnt;
2012}
2013
9e3ddd45
TP
2014static inline void
2015netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
2016 struct dp_packet **packets,
2017 int attempted,
2018 int dropped)
2019{
2020 int i;
2021 int sent = attempted - dropped;
2022
2023 stats->tx_packets += sent;
2024 stats->tx_dropped += dropped;
2025
2026 for (i = 0; i < sent; i++) {
2027 stats->tx_bytes += dp_packet_size(packets[i]);
2028 }
2029}
2030
58397e6c 2031static void
4573fbd3 2032__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
dd52de45 2033 struct dp_packet **pkts, int cnt)
58397e6c 2034{
d46285a2 2035 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
95e9881f
KT
2036 struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
2037 unsigned int total_pkts = cnt;
c6ec9d17 2038 unsigned int dropped = 0;
dd52de45 2039 int i, retries = 0;
daf22bf7 2040 int vid = netdev_dpdk_get_vid(dev);
58397e6c 2041
81acebda 2042 qid = dev->tx_q[qid % netdev->n_txq].map;
585a5bea 2043
daf22bf7 2044 if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0
e543851d 2045 || !(dev->flags & NETDEV_UP))) {
d46285a2
DDP
2046 rte_spinlock_lock(&dev->stats_lock);
2047 dev->stats.tx_dropped += cnt;
2048 rte_spinlock_unlock(&dev->stats_lock);
1b99bb05 2049 goto out;
58397e6c
KT
2050 }
2051
d46285a2 2052 rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
58397e6c 2053
c6ec9d17 2054 cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
0bf765f7 2055 /* Check if QoS has been configured for the netdev. */
3e90f7d7 2056 cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt, true);
c6ec9d17 2057 dropped = total_pkts - cnt;
0bf765f7 2058
95e9881f 2059 do {
4573fbd3 2060 int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
95e9881f
KT
2061 unsigned int tx_pkts;
2062
daf22bf7 2063 tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt);
95e9881f
KT
2064 if (OVS_LIKELY(tx_pkts)) {
2065 /* Packets have been sent. */
2066 cnt -= tx_pkts;
31871ee3 2067 /* Prepare for possible retry. */
95e9881f
KT
2068 cur_pkts = &cur_pkts[tx_pkts];
2069 } else {
31871ee3
KT
2070 /* No packets sent - do not retry. */
2071 break;
95e9881f 2072 }
c6ec9d17 2073 } while (cnt && (retries++ <= VHOST_ENQ_RETRY_NUM));
4573fbd3 2074
d46285a2 2075 rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
95e9881f 2076
d46285a2 2077 rte_spinlock_lock(&dev->stats_lock);
0072e931 2078 netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts,
c6ec9d17 2079 cnt + dropped);
d46285a2 2080 rte_spinlock_unlock(&dev->stats_lock);
58397e6c
KT
2081
2082out:
c6ec9d17 2083 for (i = 0; i < total_pkts - dropped; i++) {
dd52de45 2084 dp_packet_delete(pkts[i]);
58397e6c
KT
2085 }
2086}
2087
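/* Worked example of the retry loop above: with cnt == 32 and the guest
 * draining 20 then 12 mbufs, the send completes on the second pass with
 * retries == 1. If a burst ever returns 0, the loop stops immediately and
 * the remainder is counted as dropped. */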
8a9562d2
PS
2088/* Tx function. Copies the packets into DPDK mbufs and transmits them. */
2089static void
64839cf4 2090dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
db73f716 2091 OVS_NO_THREAD_SAFETY_ANALYSIS
8a9562d2 2092{
8a14bd7b 2093 const size_t batch_cnt = dp_packet_batch_size(batch);
bce01e3a 2094#if !defined(__CHECKER__) && !defined(_WIN32)
8a14bd7b 2095 const size_t PKT_ARRAY_SIZE = batch_cnt;
bce01e3a
EJ
2096#else
2097 /* Sparse or MSVC doesn't like variable length array. */
cd159f1a 2098 enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
bce01e3a 2099#endif
8a9562d2 2100 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2391135c 2101 struct rte_mbuf *pkts[PKT_ARRAY_SIZE];
8a14bd7b 2102 uint32_t cnt = batch_cnt;
3e90f7d7
GZ
2103 uint32_t dropped = 0;
2104
2105 if (dev->type != DPDK_DEV_VHOST) {
2106 /* Check if QoS has been configured for this netdev. */
2107 cnt = netdev_dpdk_qos_run(dev, (struct rte_mbuf **) batch->packets,
8a14bd7b
BB
2108 batch_cnt, false);
2109 dropped += batch_cnt - cnt;
3e90f7d7 2110 }
8a9562d2 2111
3e90f7d7
GZ
2112 uint32_t txcnt = 0;
2113
2114 for (uint32_t i = 0; i < cnt; i++) {
8a14bd7b
BB
2115 struct dp_packet *packet = batch->packets[i];
2116 uint32_t size = dp_packet_size(packet);
95fb793a 2117
f98d7864 2118 if (OVS_UNLIKELY(size > dev->max_packet_len)) {
3e90f7d7
GZ
2119 VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d",
2120 size, dev->max_packet_len);
f4fd623c 2121
175cf4de 2122 dropped++;
f4fd623c
DDP
2123 continue;
2124 }
8a9562d2 2125
24e78f93 2126 pkts[txcnt] = rte_pktmbuf_alloc(dev->mp);
8a14bd7b 2127 if (OVS_UNLIKELY(!pkts[txcnt])) {
3e90f7d7 2128 dropped += cnt - i;
175cf4de 2129 break;
f4fd623c
DDP
2130 }
2131
2132 /* We have to do a copy for now */
3e90f7d7 2133 memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *),
8a14bd7b
BB
2134 dp_packet_data(packet), size);
2135 dp_packet_set_size((struct dp_packet *)pkts[txcnt], size);
f4fd623c 2136
3e90f7d7 2137 txcnt++;
f4fd623c 2138 }
8a9562d2 2139
3e90f7d7
GZ
2140 if (OVS_LIKELY(txcnt)) {
2141 if (dev->type == DPDK_DEV_VHOST) {
2142 __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) pkts,
2143 txcnt);
2144 } else {
2145 dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, txcnt);
2146 }
58397e6c 2147 }
db73f716 2148
0bf765f7
IS
2149 if (OVS_UNLIKELY(dropped)) {
2150 rte_spinlock_lock(&dev->stats_lock);
2151 dev->stats.tx_dropped += dropped;
2152 rte_spinlock_unlock(&dev->stats_lock);
2153 }
8a9562d2
PS
2154}
2155
58397e6c 2156static int
64839cf4
WT
2157netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
2158 struct dp_packet_batch *batch,
b30896c9 2159 bool concurrent_txq OVS_UNUSED)
58397e6c 2160{
58397e6c 2161
b30896c9 2162 if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
64839cf4 2163 dpdk_do_tx_copy(netdev, qid, batch);
b30896c9 2164 dp_packet_delete_batch(batch, true);
58397e6c 2165 } else {
dd52de45 2166 __netdev_dpdk_vhost_send(netdev, qid, batch->packets, batch->count);
58397e6c
KT
2167 }
2168 return 0;
2169}
2170
7251515e
DV
2171static inline void
2172netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
b30896c9 2173 struct dp_packet_batch *batch,
324c8374 2174 bool concurrent_txq)
8a9562d2 2175{
3b1fb077 2176 if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
b30896c9 2177 dp_packet_delete_batch(batch, true);
3b1fb077
DDP
2178 return;
2179 }
2180
324c8374 2181 if (OVS_UNLIKELY(concurrent_txq)) {
81acebda 2182 qid = qid % dev->up.n_txq;
a0cb2d66
DDP
2183 rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
2184 }
2185
b30896c9 2186 if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
7251515e
DV
2187 struct netdev *netdev = &dev->up;
2188
64839cf4 2189 dpdk_do_tx_copy(netdev, qid, batch);
b30896c9 2190 dp_packet_delete_batch(batch, true);
8a9562d2 2191 } else {
fd57eeba
BB
2192 int tx_cnt, dropped;
2193 int batch_cnt = dp_packet_batch_size(batch);
2391135c 2194 struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;
8a9562d2 2195
fd57eeba
BB
2196 tx_cnt = netdev_dpdk_filter_packet_len(dev, pkts, batch_cnt);
2197 tx_cnt = netdev_dpdk_qos_run(dev, pkts, tx_cnt, true);
2198 dropped = batch_cnt - tx_cnt;
1b99bb05 2199
fd57eeba 2200 dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, tx_cnt);
8a9562d2 2201
f4fd623c 2202 if (OVS_UNLIKELY(dropped)) {
45d947c4 2203 rte_spinlock_lock(&dev->stats_lock);
f4fd623c 2204 dev->stats.tx_dropped += dropped;
45d947c4 2205 rte_spinlock_unlock(&dev->stats_lock);
f4fd623c 2206 }
8a9562d2 2207 }
a0cb2d66 2208
324c8374 2209 if (OVS_UNLIKELY(concurrent_txq)) {
a0cb2d66
DDP
2210 rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
2211 }
7251515e
DV
2212}
2213
2214static int
2215netdev_dpdk_eth_send(struct netdev *netdev, int qid,
b30896c9 2216 struct dp_packet_batch *batch, bool concurrent_txq)
7251515e
DV
2217{
2218 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 2219
b30896c9 2220 netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
7251515e 2221 return 0;
8a9562d2
PS
2222}
2223
2224static int
74ff3298 2225netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
8a9562d2
PS
2226{
2227 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2228
2229 ovs_mutex_lock(&dev->mutex);
2230 if (!eth_addr_equals(dev->hwaddr, mac)) {
74ff3298 2231 dev->hwaddr = mac;
045c0d1a 2232 netdev_change_seq_changed(netdev);
8a9562d2
PS
2233 }
2234 ovs_mutex_unlock(&dev->mutex);
2235
2236 return 0;
2237}
2238
2239static int
74ff3298 2240netdev_dpdk_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
8a9562d2
PS
2241{
2242 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2243
2244 ovs_mutex_lock(&dev->mutex);
74ff3298 2245 *mac = dev->hwaddr;
8a9562d2
PS
2246 ovs_mutex_unlock(&dev->mutex);
2247
2248 return 0;
2249}
2250
2251static int
2252netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
2253{
2254 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2255
2256 ovs_mutex_lock(&dev->mutex);
2257 *mtup = dev->mtu;
2258 ovs_mutex_unlock(&dev->mutex);
2259
2260 return 0;
2261}
2262
0072e931
MK
2263static int
2264netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
2265{
2266 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2267
f6f50552
IS
2268 /* XXX: Ensure that the overall frame length of the requested MTU does not
2269 * surpass the NETDEV_DPDK_MAX_PKT_LEN. DPDK device drivers differ in how
2270 * the L2 frame length is calculated for a given MTU when
2271 * rte_eth_dev_set_mtu(mtu) is called e.g. i40e driver includes 2 x vlan
2272 * headers, the em driver includes 1 x vlan header, the ixgbe driver does
2273 * not include vlan headers. As such we should use
2274 * MTU_TO_MAX_FRAME_LEN(mtu) which includes an additional 2 x vlan headers
2275 * (8 bytes) for comparison. This avoids a failure later with
2276 * rte_eth_dev_set_mtu(). This approach should be used until DPDK provides
2277 * a method to retrieve the upper bound MTU for a given device.
2278 */
2279 if (MTU_TO_MAX_FRAME_LEN(mtu) > NETDEV_DPDK_MAX_PKT_LEN
0072e931
MK
2280 || mtu < ETHER_MIN_MTU) {
2281 VLOG_WARN("%s: unsupported MTU %d\n", dev->up.name, mtu);
2282 return EINVAL;
2283 }
2284
2285 ovs_mutex_lock(&dev->mutex);
2286 if (dev->requested_mtu != mtu) {
2287 dev->requested_mtu = mtu;
2288 netdev_request_reconfigure(netdev);
2289 }
2290 ovs_mutex_unlock(&dev->mutex);
2291
2292 return 0;
2293}
2294
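/* Worked bound for the check above, assuming ETHER_HDR_LEN == 14,
 * ETHER_CRC_LEN == 4 and VLAN_HEADER_LEN == 4:
 *
 *   MTU_TO_MAX_FRAME_LEN(9702) == 9702 + 14 + 4 + (2 * 4) == 9728
 *                              == NETDEV_DPDK_MAX_PKT_LEN
 *
 * so 9702 is the largest MTU this function accepts. */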
8a9562d2 2295static int
d46285a2 2296netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
8a9562d2 2297
58397e6c
KT
2298static int
2299netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
2300 struct netdev_stats *stats)
2301{
2302 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2303
2304 ovs_mutex_lock(&dev->mutex);
58397e6c 2305
45d947c4 2306 rte_spinlock_lock(&dev->stats_lock);
58397e6c 2307 /* Supported Stats */
50986e78 2308 stats->rx_packets = dev->stats.rx_packets;
2309 stats->tx_packets = dev->stats.tx_packets;
9509913a 2310 stats->rx_dropped = dev->stats.rx_dropped;
50986e78 2311 stats->tx_dropped = dev->stats.tx_dropped;
9e3ddd45
TP
2312 stats->multicast = dev->stats.multicast;
2313 stats->rx_bytes = dev->stats.rx_bytes;
2314 stats->tx_bytes = dev->stats.tx_bytes;
2315 stats->rx_errors = dev->stats.rx_errors;
2316 stats->rx_length_errors = dev->stats.rx_length_errors;
d6e3feb5 2317
2318 stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
2319 stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;
2320 stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;
2321 stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;
2322 stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;
2323 stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;
2324 stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;
2325
45d947c4 2326 rte_spinlock_unlock(&dev->stats_lock);
9e3ddd45 2327
58397e6c
KT
2328 ovs_mutex_unlock(&dev->mutex);
2329
2330 return 0;
2331}
2332
d6e3feb5 2333static void
2334netdev_dpdk_convert_xstats(struct netdev_stats *stats,
0a0f39df
CL
2335 const struct rte_eth_xstat *xstats,
2336 const struct rte_eth_xstat_name *names,
d6e3feb5 2337 const unsigned int size)
2338{
d6e3feb5 2339 for (unsigned int i = 0; i < size; i++) {
0a0f39df 2340 if (strcmp(XSTAT_RX_64_PACKETS, names[i].name) == 0) {
d6e3feb5 2341 stats->rx_1_to_64_packets = xstats[i].value;
0a0f39df 2342 } else if (strcmp(XSTAT_RX_65_TO_127_PACKETS, names[i].name) == 0) {
d6e3feb5 2343 stats->rx_65_to_127_packets = xstats[i].value;
0a0f39df 2344 } else if (strcmp(XSTAT_RX_128_TO_255_PACKETS, names[i].name) == 0) {
d6e3feb5 2345 stats->rx_128_to_255_packets = xstats[i].value;
0a0f39df 2346 } else if (strcmp(XSTAT_RX_256_TO_511_PACKETS, names[i].name) == 0) {
d6e3feb5 2347 stats->rx_256_to_511_packets = xstats[i].value;
0a0f39df 2348 } else if (strcmp(XSTAT_RX_512_TO_1023_PACKETS, names[i].name) == 0) {
d6e3feb5 2349 stats->rx_512_to_1023_packets = xstats[i].value;
0a0f39df 2350 } else if (strcmp(XSTAT_RX_1024_TO_1522_PACKETS, names[i].name) == 0) {
d6e3feb5 2351 stats->rx_1024_to_1522_packets = xstats[i].value;
0a0f39df 2352 } else if (strcmp(XSTAT_RX_1523_TO_MAX_PACKETS, names[i].name) == 0) {
d6e3feb5 2353 stats->rx_1523_to_max_packets = xstats[i].value;
0a0f39df 2354 } else if (strcmp(XSTAT_TX_64_PACKETS, names[i].name) == 0) {
d6e3feb5 2355 stats->tx_1_to_64_packets = xstats[i].value;
0a0f39df 2356 } else if (strcmp(XSTAT_TX_65_TO_127_PACKETS, names[i].name) == 0) {
d6e3feb5 2357 stats->tx_65_to_127_packets = xstats[i].value;
0a0f39df 2358 } else if (strcmp(XSTAT_TX_128_TO_255_PACKETS, names[i].name) == 0) {
d6e3feb5 2359 stats->tx_128_to_255_packets = xstats[i].value;
0a0f39df 2360 } else if (strcmp(XSTAT_TX_256_TO_511_PACKETS, names[i].name) == 0) {
d6e3feb5 2361 stats->tx_256_to_511_packets = xstats[i].value;
0a0f39df 2362 } else if (strcmp(XSTAT_TX_512_TO_1023_PACKETS, names[i].name) == 0) {
d6e3feb5 2363 stats->tx_512_to_1023_packets = xstats[i].value;
0a0f39df 2364 } else if (strcmp(XSTAT_TX_1024_TO_1522_PACKETS, names[i].name) == 0) {
d6e3feb5 2365 stats->tx_1024_to_1522_packets = xstats[i].value;
0a0f39df 2366 } else if (strcmp(XSTAT_TX_1523_TO_MAX_PACKETS, names[i].name) == 0) {
d6e3feb5 2367 stats->tx_1523_to_max_packets = xstats[i].value;
d57f777f
PS
2368 } else if (strcmp(XSTAT_RX_MULTICAST_PACKETS, names[i].name) == 0) {
2369 stats->multicast = xstats[i].value;
0a0f39df 2370 } else if (strcmp(XSTAT_TX_MULTICAST_PACKETS, names[i].name) == 0) {
d6e3feb5 2371 stats->tx_multicast_packets = xstats[i].value;
0a0f39df 2372 } else if (strcmp(XSTAT_RX_BROADCAST_PACKETS, names[i].name) == 0) {
d6e3feb5 2373 stats->rx_broadcast_packets = xstats[i].value;
0a0f39df 2374 } else if (strcmp(XSTAT_TX_BROADCAST_PACKETS, names[i].name) == 0) {
d6e3feb5 2375 stats->tx_broadcast_packets = xstats[i].value;
0a0f39df 2376 } else if (strcmp(XSTAT_RX_UNDERSIZED_ERRORS, names[i].name) == 0) {
d6e3feb5 2377 stats->rx_undersized_errors = xstats[i].value;
0a0f39df 2378 } else if (strcmp(XSTAT_RX_FRAGMENTED_ERRORS, names[i].name) == 0) {
d6e3feb5 2379 stats->rx_fragmented_errors = xstats[i].value;
0a0f39df 2380 } else if (strcmp(XSTAT_RX_JABBER_ERRORS, names[i].name) == 0) {
d6e3feb5 2381 stats->rx_jabber_errors = xstats[i].value;
2382 }
2383 }
2384}
2385
8a9562d2
PS
2386static int
2387netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
2388{
2389 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2390 struct rte_eth_stats rte_stats;
2391 bool gg;
2392
2393 netdev_dpdk_get_carrier(netdev, &gg);
2394 ovs_mutex_lock(&dev->mutex);
8a9562d2 2395
0a0f39df
CL
2396 struct rte_eth_xstat *rte_xstats = NULL;
2397 struct rte_eth_xstat_name *rte_xstats_names = NULL;
2398 int rte_xstats_len, rte_xstats_new_len, rte_xstats_ret;
d6e3feb5 2399
2400 if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
fa9f4eeb
IM
2401 VLOG_ERR("Can't get ETH statistics for port: "DPDK_PORT_ID_FMT,
2402 dev->port_id);
f9256822 2403 ovs_mutex_unlock(&dev->mutex);
d6e3feb5 2404 return EPROTO;
2405 }
2406
0a0f39df
CL
2407 /* Get length of statistics */
2408 rte_xstats_len = rte_eth_xstats_get_names(dev->port_id, NULL, 0);
2409 if (rte_xstats_len < 0) {
fa9f4eeb
IM
2410 VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
2411 dev->port_id);
0a0f39df
CL
2412 goto out;
2413 }
2414 /* Reserve memory for xstats names and values */
2415 rte_xstats_names = xcalloc(rte_xstats_len, sizeof *rte_xstats_names);
2416 rte_xstats = xcalloc(rte_xstats_len, sizeof *rte_xstats);
2417
2418 /* Retrieve xstats names */
2419 rte_xstats_new_len = rte_eth_xstats_get_names(dev->port_id,
2420 rte_xstats_names,
2421 rte_xstats_len);
2422 if (rte_xstats_new_len != rte_xstats_len) {
fa9f4eeb
IM
2423 VLOG_WARN("Cannot get XSTATS names for port: "DPDK_PORT_ID_FMT,
2424 dev->port_id);
0a0f39df
CL
2425 goto out;
2426 }
2427 /* Retrieve xstats values */
2428 memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);
2429 rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
2430 rte_xstats_len);
2431 if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
2432 netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_names,
2433 rte_xstats_len);
d6e3feb5 2434 } else {
fa9f4eeb
IM
2435 VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
2436 dev->port_id);
d6e3feb5 2437 }
8a9562d2 2438
0a0f39df
CL
2439out:
2440 free(rte_xstats);
2441 free(rte_xstats_names);
2442
2f9dd77f
PS
2443 stats->rx_packets = rte_stats.ipackets;
2444 stats->tx_packets = rte_stats.opackets;
2445 stats->rx_bytes = rte_stats.ibytes;
2446 stats->tx_bytes = rte_stats.obytes;
21e9844c 2447 stats->rx_errors = rte_stats.ierrors;
2f9dd77f 2448 stats->tx_errors = rte_stats.oerrors;
8a9562d2 2449
45d947c4 2450 rte_spinlock_lock(&dev->stats_lock);
2f9dd77f 2451 stats->tx_dropped = dev->stats.tx_dropped;
9509913a 2452 stats->rx_dropped = dev->stats.rx_dropped;
45d947c4 2453 rte_spinlock_unlock(&dev->stats_lock);
9e3ddd45
TP
2454
2455 /* These are the available DPDK counters for packets not received due to
2456 * local resource constraints in DPDK and NIC respectively. */
9509913a 2457 stats->rx_dropped += rte_stats.rx_nombuf + rte_stats.imissed;
9e3ddd45
TP
2458 stats->rx_missed_errors = rte_stats.imissed;
2459
8a9562d2
PS
2460 ovs_mutex_unlock(&dev->mutex);
2461
2462 return 0;
2463}
2464
971f4b39
MW
2465static int
2466netdev_dpdk_get_custom_stats(const struct netdev *netdev,
2467 struct netdev_custom_stats *custom_stats)
2468{
2469
2470 uint32_t i;
2471 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2472 int rte_xstats_ret;
2473
2474 ovs_mutex_lock(&dev->mutex);
2475
2476 if (netdev_dpdk_configure_xstats(dev)) {
2477 uint64_t *values = xcalloc(dev->rte_xstats_ids_size,
2478 sizeof(uint64_t));
2479
2480 rte_xstats_ret =
2481 rte_eth_xstats_get_by_id(dev->port_id, dev->rte_xstats_ids,
2482 values, dev->rte_xstats_ids_size);
2483
2484 if (rte_xstats_ret > 0 &&
2485 rte_xstats_ret <= dev->rte_xstats_ids_size) {
2486
2487 custom_stats->size = rte_xstats_ret;
2488 custom_stats->counters =
2489 (struct netdev_custom_counter *) xcalloc(rte_xstats_ret,
2490 sizeof(struct netdev_custom_counter));
2491
2492 for (i = 0; i < rte_xstats_ret; i++) {
2493 ovs_strlcpy(custom_stats->counters[i].name,
2494 netdev_dpdk_get_xstat_name(dev,
2495 dev->rte_xstats_ids[i]),
2496 NETDEV_CUSTOM_STATS_NAME_SIZE);
2497 custom_stats->counters[i].value = values[i];
2498 }
2499 } else {
fa9f4eeb 2500 VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
971f4b39
MW
2501 dev->port_id);
2502 custom_stats->counters = NULL;
2503 custom_stats->size = 0;
2504 /* Clear the statistics cache so that it will be
2505 * reconfigured. */
2506 netdev_dpdk_clear_xstats(dev);
2507 }
526259f2
IM
2508
2509 free(values);
971f4b39
MW
2510 }
2511
2512 ovs_mutex_unlock(&dev->mutex);
2513
2514 return 0;
2515}
2516
8a9562d2 2517static int
d46285a2 2518netdev_dpdk_get_features(const struct netdev *netdev,
8a9562d2 2519 enum netdev_features *current,
ca3d4f55
BX
2520 enum netdev_features *advertised,
2521 enum netdev_features *supported,
2522 enum netdev_features *peer)
8a9562d2 2523{
d46285a2 2524 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
2525 struct rte_eth_link link;
2526
2527 ovs_mutex_lock(&dev->mutex);
2528 link = dev->link;
2529 ovs_mutex_unlock(&dev->mutex);
2530
362ca396 2531 if (link.link_duplex == ETH_LINK_HALF_DUPLEX) {
2532 if (link.link_speed == ETH_SPEED_NUM_10M) {
8a9562d2
PS
2533 *current = NETDEV_F_10MB_HD;
2534 }
362ca396 2535 if (link.link_speed == ETH_SPEED_NUM_100M) {
8a9562d2
PS
2536 *current = NETDEV_F_100MB_HD;
2537 }
362ca396 2538 if (link.link_speed == ETH_SPEED_NUM_1G) {
8a9562d2
PS
2539 *current = NETDEV_F_1GB_HD;
2540 }
2541 } else if (link.link_duplex == ETH_LINK_FULL_DUPLEX) {
362ca396 2542 if (link.link_speed == ETH_SPEED_NUM_10M) {
8a9562d2
PS
2543 *current = NETDEV_F_10MB_FD;
2544 }
362ca396 2545 if (link.link_speed == ETH_SPEED_NUM_100M) {
8a9562d2
PS
2546 *current = NETDEV_F_100MB_FD;
2547 }
362ca396 2548 if (link.link_speed == ETH_SPEED_NUM_1G) {
8a9562d2
PS
2549 *current = NETDEV_F_1GB_FD;
2550 }
362ca396 2551 if (link.link_speed == ETH_SPEED_NUM_10G) {
8a9562d2
PS
2552 *current = NETDEV_F_10GB_FD;
2553 }
2554 }
2555
362ca396 2556 if (link.link_autoneg) {
2557 *current |= NETDEV_F_AUTONEG;
2558 }
2559
ca3d4f55
BX
2560 *advertised = *supported = *peer = 0;
2561
8a9562d2
PS
2562 return 0;
2563}
2564
9509913a
IS
2565static struct ingress_policer *
2566netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
2567{
2568 struct ingress_policer *policer = NULL;
2569 uint64_t rate_bytes;
2570 uint64_t burst_bytes;
2571 int err = 0;
2572
2573 policer = xmalloc(sizeof *policer);
2574 rte_spinlock_init(&policer->policer_lock);
2575
2576 /* rte_meter requires bytes so convert kbits rate and burst to bytes. */
602c8668
LR
2577 rate_bytes = rate * 1000ULL / 8;
2578 burst_bytes = burst * 1000ULL / 8;
9509913a
IS
2579
2580 policer->app_srtcm_params.cir = rate_bytes;
2581 policer->app_srtcm_params.cbs = burst_bytes;
2582 policer->app_srtcm_params.ebs = 0;
2583 err = rte_meter_srtcm_config(&policer->in_policer,
2584 &policer->app_srtcm_params);
58be5c0e 2585 if (err) {
9509913a 2586 VLOG_ERR("Could not create rte meter for ingress policer");
4c47ddde 2587 free(policer);
9509913a
IS
2588 return NULL;
2589 }
2590
2591 return policer;
2592}
2593
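/* Worked conversion for the srTCM parameters above: a 10000 kbits/s rate
 * becomes 10000 * 1000 / 8 == 1,250,000 bytes/s (cir), and the default
 * 8000 kbit burst becomes 1,000,000 bytes (cbs). */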
2594static int
2595netdev_dpdk_set_policing(struct netdev* netdev, uint32_t policer_rate,
2596 uint32_t policer_burst)
2597{
2598 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2599 struct ingress_policer *policer;
2600
2601 /* Force to 0 if no rate specified,
2602 * default to 8000 kbits if burst is 0,
2603 * else stick with user-specified value.
2604 */
2605 policer_burst = (!policer_rate ? 0
2606 : !policer_burst ? 8000
2607 : policer_burst);
2608
2609 ovs_mutex_lock(&dev->mutex);
2610
2611 policer = ovsrcu_get_protected(struct ingress_policer *,
2612 &dev->ingress_policer);
2613
2614 if (dev->policer_rate == policer_rate &&
2615 dev->policer_burst == policer_burst) {
2616 /* Assume that settings haven't changed since we last set them. */
2617 ovs_mutex_unlock(&dev->mutex);
2618 return 0;
2619 }
2620
2621 /* Destroy the existing ingress policer for the device, if any. */
2622 if (policer) {
2623 ovsrcu_postpone(free, policer);
2624 }
2625
2626 if (policer_rate != 0) {
2627 policer = netdev_dpdk_policer_construct(policer_rate, policer_burst);
2628 } else {
2629 policer = NULL;
2630 }
2631 ovsrcu_set(&dev->ingress_policer, policer);
2632 dev->policer_rate = policer_rate;
2633 dev->policer_burst = policer_burst;
2634 ovs_mutex_unlock(&dev->mutex);
2635
2636 return 0;
2637}
2638
8a9562d2
PS
2639static int
2640netdev_dpdk_get_ifindex(const struct netdev *netdev)
2641{
2642 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
2643
2644 ovs_mutex_lock(&dev->mutex);
12d0d124
PL
2645 /* Calculate hash from the netdev name. Ensure that ifindex is a 24-bit
2646 * positive integer to meet RFC 2863 recommendations.
2647 */
2648 int ifindex = hash_string(netdev->name, 0) % 0xfffffe + 1;
8a9562d2
PS
2649 ovs_mutex_unlock(&dev->mutex);
2650
2651 return ifindex;
2652}
2653
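/* The expression above maps the name hash into [1, 0xfffffe], so the
 * returned ifindex is always a nonzero 24-bit value. */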
2654static int
d46285a2 2655netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
8a9562d2 2656{
d46285a2 2657 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
2658
2659 ovs_mutex_lock(&dev->mutex);
2660 check_link_status(dev);
2661 *carrier = dev->link.link_status;
58397e6c
KT
2662
2663 ovs_mutex_unlock(&dev->mutex);
2664
2665 return 0;
2666}
2667
2668static int
d46285a2 2669netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
58397e6c 2670{
d46285a2 2671 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
58397e6c
KT
2672
2673 ovs_mutex_lock(&dev->mutex);
2674
0a0f39df 2675 if (is_vhost_running(dev)) {
58397e6c
KT
2676 *carrier = 1;
2677 } else {
2678 *carrier = 0;
2679 }
2680
8a9562d2
PS
2681 ovs_mutex_unlock(&dev->mutex);
2682
2683 return 0;
2684}
2685
2686static long long int
d46285a2 2687netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
8a9562d2 2688{
d46285a2 2689 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
2690 long long int carrier_resets;
2691
2692 ovs_mutex_lock(&dev->mutex);
2693 carrier_resets = dev->link_reset_cnt;
2694 ovs_mutex_unlock(&dev->mutex);
2695
2696 return carrier_resets;
2697}
2698
2699static int
d46285a2 2700netdev_dpdk_set_miimon(struct netdev *netdev OVS_UNUSED,
8a9562d2
PS
2701 long long int interval OVS_UNUSED)
2702{
ee32150e 2703 return EOPNOTSUPP;
8a9562d2
PS
2704}
2705
2706static int
2707netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
2708 enum netdev_flags off, enum netdev_flags on,
64839cf4
WT
2709 enum netdev_flags *old_flagsp)
2710 OVS_REQUIRES(dev->mutex)
8a9562d2 2711{
8a9562d2
PS
2712 if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
2713 return EINVAL;
2714 }
2715
2716 *old_flagsp = dev->flags;
2717 dev->flags |= on;
2718 dev->flags &= ~off;
2719
2720 if (dev->flags == *old_flagsp) {
2721 return 0;
2722 }
2723
58397e6c 2724 if (dev->type == DPDK_DEV_ETH) {
58397e6c
KT
2725 if (dev->flags & NETDEV_PROMISC) {
2726 rte_eth_promiscuous_enable(dev->port_id);
2727 }
8a9562d2 2728
314fb5ad 2729 netdev_change_seq_changed(&dev->up);
e543851d
ZB
2730 } else {
2731 /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is
2732 * running then change netdev's change_seq to trigger link state
2733 * update. */
e543851d
ZB
2734
2735 if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
0a0f39df 2736 && is_vhost_running(dev)) {
e543851d
ZB
2737 netdev_change_seq_changed(&dev->up);
2738
2739 /* Clear statistics if device is getting up. */
2740 if (NETDEV_UP & on) {
2741 rte_spinlock_lock(&dev->stats_lock);
58be5c0e 2742 memset(&dev->stats, 0, sizeof dev->stats);
e543851d
ZB
2743 rte_spinlock_unlock(&dev->stats_lock);
2744 }
2745 }
8a9562d2
PS
2746 }
2747
2748 return 0;
2749}
2750
2751static int
d46285a2 2752netdev_dpdk_update_flags(struct netdev *netdev,
8a9562d2
PS
2753 enum netdev_flags off, enum netdev_flags on,
2754 enum netdev_flags *old_flagsp)
2755{
d46285a2 2756 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
2757 int error;
2758
d46285a2
DDP
2759 ovs_mutex_lock(&dev->mutex);
2760 error = netdev_dpdk_update_flags__(dev, off, on, old_flagsp);
2761 ovs_mutex_unlock(&dev->mutex);
8a9562d2
PS
2762
2763 return error;
2764}
2765
b2e8b12f
FL
2766static int
2767netdev_dpdk_vhost_user_get_status(const struct netdev *netdev,
2768 struct smap *args)
2769{
2770 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
2771
2772 ovs_mutex_lock(&dev->mutex);
2773
2774 bool client_mode = dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT;
2775 smap_add_format(args, "mode", "%s", client_mode ? "client" : "server");
2776
2777 int vid = netdev_dpdk_get_vid(dev);
2778 if (vid < 0) {
2779 smap_add_format(args, "status", "disconnected");
2780 ovs_mutex_unlock(&dev->mutex);
2781 return 0;
2782 } else {
2783 smap_add_format(args, "status", "connected");
2784 }
2785
2786 char socket_name[PATH_MAX];
2787 if (!rte_vhost_get_ifname(vid, socket_name, PATH_MAX)) {
2788 smap_add_format(args, "socket", "%s", socket_name);
2789 }
2790
2791 uint64_t features;
2792 if (!rte_vhost_get_negotiated_features(vid, &features)) {
2793 smap_add_format(args, "features", "0x%016"PRIx64, features);
2794 }
2795
2796 uint16_t mtu;
2797 if (!rte_vhost_get_mtu(vid, &mtu)) {
2798 smap_add_format(args, "mtu", "%d", mtu);
2799 }
2800
2801 int numa = rte_vhost_get_numa_node(vid);
2802 if (numa >= 0) {
2803 smap_add_format(args, "numa", "%d", numa);
2804 }
2805
2806 uint16_t vring_num = rte_vhost_get_vring_num(vid);
2807 if (vring_num) {
2808 smap_add_format(args, "num_of_vrings", "%d", vring_num);
2809 }
2810
2811 for (int i = 0; i < vring_num; i++) {
2812 struct rte_vhost_vring vring;
2813 char vhost_vring[16];
2814
2815 rte_vhost_get_vhost_vring(vid, i, &vring);
2816 snprintf(vhost_vring, 16, "vring_%d_size", i);
2817 smap_add_format(args, vhost_vring, "%d", vring.size);
2818 }
2819
2820 ovs_mutex_unlock(&dev->mutex);
2821 return 0;
2822}
2823
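/* The smap built above surfaces, when available: "mode", "status",
 * "socket", "features", "mtu", "numa", "num_of_vrings" and one
 * "vring_<i>_size" entry per vring. */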
8a9562d2 2824static int
d46285a2 2825netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
8a9562d2 2826{
d46285a2 2827 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2
PS
2828 struct rte_eth_dev_info dev_info;
2829
7cd1261d 2830 if (!rte_eth_dev_is_valid_port(dev->port_id)) {
8a9562d2 2831 return ENODEV;
7cd1261d 2832 }
8a9562d2
PS
2833
2834 ovs_mutex_lock(&dev->mutex);
2835 rte_eth_dev_info_get(dev->port_id, &dev_info);
2836 ovs_mutex_unlock(&dev->mutex);
2837
fa9f4eeb 2838 smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id);
58be5c0e
MK
2839 smap_add_format(args, "numa_id", "%d",
2840 rte_eth_dev_socket_id(dev->port_id));
8a9562d2
PS
2841 smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
2842 smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
4be4d22c 2843 smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
8a9562d2
PS
2844 smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
2845 smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
2846 smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
58be5c0e
MK
2847 smap_add_format(args, "max_hash_mac_addrs", "%u",
2848 dev_info.max_hash_mac_addrs);
8a9562d2
PS
2849 smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
2850 smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);
2851
3eb8d4fa
MW
2852 /* Querying the DPDK library for iftype may be done in future, pending
2853 * support; cf. RFC 3635 Section 3.2.4. */
2854 enum { IF_TYPE_ETHERNETCSMACD = 6 };
2855
2856 smap_add_format(args, "if_type", "%"PRIu32, IF_TYPE_ETHERNETCSMACD);
2857 smap_add_format(args, "if_descr", "%s %s", rte_version(),
2858 dev_info.driver_name);
2859
39c2baa9 2860 if (dev_info.pci_dev) {
2861 smap_add_format(args, "pci-vendor_id", "0x%u",
2862 dev_info.pci_dev->id.vendor_id);
2863 smap_add_format(args, "pci-device_id", "0x%x",
2864 dev_info.pci_dev->id.device_id);
2865 }
8a9562d2
PS
2866
2867 return 0;
2868}
2869
2870static void
2871netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
2872 OVS_REQUIRES(dev->mutex)
2873{
2874 enum netdev_flags old_flags;
2875
2876 if (admin_state) {
2877 netdev_dpdk_update_flags__(dev, 0, NETDEV_UP, &old_flags);
2878 } else {
2879 netdev_dpdk_update_flags__(dev, NETDEV_UP, 0, &old_flags);
2880 }
2881}
2882
2883static void
2884netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
2885 const char *argv[], void *aux OVS_UNUSED)
2886{
2887 bool up;
2888
2889 if (!strcasecmp(argv[argc - 1], "up")) {
2890 up = true;
2891 } else if (!strcasecmp(argv[argc - 1], "down")) {
2892 up = false;
2893 } else {
2894 unixctl_command_reply_error(conn, "Invalid Admin State");
2895 return;
2896 }
2897
2898 if (argc > 2) {
2899 struct netdev *netdev = netdev_from_name(argv[1]);
3d0d5ab1 2900
8a9562d2 2901 if (netdev && is_dpdk_class(netdev->netdev_class)) {
3d0d5ab1 2902 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a9562d2 2903
3d0d5ab1
IM
2904 ovs_mutex_lock(&dev->mutex);
2905 netdev_dpdk_set_admin_state__(dev, up);
2906 ovs_mutex_unlock(&dev->mutex);
8a9562d2
PS
2907
2908 netdev_close(netdev);
2909 } else {
2910 unixctl_command_reply_error(conn, "Not a DPDK Interface");
2911 netdev_close(netdev);
2912 return;
2913 }
2914 } else {
3d0d5ab1 2915 struct netdev_dpdk *dev;
8a9562d2
PS
2916
2917 ovs_mutex_lock(&dpdk_mutex);
3d0d5ab1
IM
2918 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
2919 ovs_mutex_lock(&dev->mutex);
2920 netdev_dpdk_set_admin_state__(dev, up);
2921 ovs_mutex_unlock(&dev->mutex);
8a9562d2
PS
2922 }
2923 ovs_mutex_unlock(&dpdk_mutex);
2924 }
2925 unixctl_command_reply(conn, "OK");
2926}
2927
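/* Usage sketch for the command handled above (registered later as
 * "netdev-dpdk/set-admin-state"; the interface name is illustrative):
 *
 *   ovs-appctl netdev-dpdk/set-admin-state dpdk0 down
 *   ovs-appctl netdev-dpdk/set-admin-state up
 */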
0ee821c2
DB
2928static void
2929netdev_dpdk_detach(struct unixctl_conn *conn, int argc OVS_UNUSED,
2930 const char *argv[], void *aux OVS_UNUSED)
2931{
2932 int ret;
2933 char *response;
7ee94cba 2934 dpdk_port_t port_id;
0ee821c2
DB
2935 char devname[RTE_ETH_NAME_MAX_LEN];
2936 struct netdev_dpdk *dev;
2937
2938 ovs_mutex_lock(&dpdk_mutex);
2939
255b7bda 2940 if (rte_eth_dev_get_port_by_name(argv[1], &port_id)) {
0ee821c2
DB
2941 response = xasprintf("Device '%s' not found in DPDK", argv[1]);
2942 goto error;
2943 }
2944
2945 dev = netdev_dpdk_lookup_by_port_id(port_id);
2946 if (dev) {
2947 response = xasprintf("Device '%s' is being used by interface '%s'. "
2948 "Remove it before detaching",
2949 argv[1], netdev_get_name(&dev->up));
2950 goto error;
2951 }
2952
2953 rte_eth_dev_close(port_id);
2954
2955 ret = rte_eth_dev_detach(port_id, devname);
2956 if (ret < 0) {
2957 response = xasprintf("Device '%s' can not be detached", argv[1]);
2958 goto error;
2959 }
2960
2961 response = xasprintf("Device '%s' has been detached", argv[1]);
2962
2963 ovs_mutex_unlock(&dpdk_mutex);
2964 unixctl_command_reply(conn, response);
2965 free(response);
2966 return;
2967
2968error:
2969 ovs_mutex_unlock(&dpdk_mutex);
2970 unixctl_command_reply_error(conn, response);
2971 free(response);
2972}
2973
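/* Usage sketch (PCI address illustrative): once no interface uses the
 * device, it can be detached with:
 *
 *   ovs-appctl netdev-dpdk/detach 0000:01:00.0
 */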
be481733
IM
2974static void
2975netdev_dpdk_get_mempool_info(struct unixctl_conn *conn,
2976 int argc, const char *argv[],
2977 void *aux OVS_UNUSED)
2978{
2979 size_t size;
2980 FILE *stream;
2981 char *response = NULL;
2982 struct netdev *netdev = NULL;
2983
2984 if (argc == 2) {
2985 netdev = netdev_from_name(argv[1]);
2986 if (!netdev || !is_dpdk_class(netdev->netdev_class)) {
2987 unixctl_command_reply_error(conn, "Not a DPDK Interface");
2988 goto out;
2989 }
2990 }
2991
2992 stream = open_memstream(&response, &size);
2993 if (!stream) {
2994 response = xasprintf("Unable to open memstream: %s.",
2995 ovs_strerror(errno));
2996 unixctl_command_reply_error(conn, response);
2997 goto out;
2998 }
2999
3000 if (netdev) {
3001 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3002
3003 ovs_mutex_lock(&dev->mutex);
3004 ovs_mutex_lock(&dpdk_mp_mutex);
3005
3006 rte_mempool_dump(stream, dev->mp);
3007
3008 ovs_mutex_unlock(&dpdk_mp_mutex);
3009 ovs_mutex_unlock(&dev->mutex);
3010 } else {
3011 ovs_mutex_lock(&dpdk_mp_mutex);
3012 rte_mempool_list_dump(stream);
3013 ovs_mutex_unlock(&dpdk_mp_mutex);
3014 }
3015
3016 fclose(stream);
3017
3018 unixctl_command_reply(conn, response);
3019out:
3020 free(response);
3021 netdev_close(netdev);
3022}
3023
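/* Usage sketch: dump one interface's mempool, or all mempools when no
 * argument is given (interface name illustrative):
 *
 *   ovs-appctl netdev-dpdk/get-mempool-info dpdk0
 *   ovs-appctl netdev-dpdk/get-mempool-info
 */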
58397e6c
KT
3024/*
3025 * Set virtqueue flags so that we do not receive interrupts.
3026 */
3027static void
0a0f39df 3028set_irq_status(int vid)
58397e6c 3029{
4573fbd3 3030 uint32_t i;
4573fbd3 3031
f3e7ec25
MW
3032 for (i = 0; i < rte_vhost_get_vring_num(vid); i++) {
3033 rte_vhost_enable_guest_notification(vid, i, 0);
4573fbd3
FL
3034 }
3035}
3036
585a5bea
IM
3037/*
3038 * Fixes mapping for vhost-user tx queues. Must be called after each
81acebda 3039 * enabling/disabling of queues and n_txq modifications.
585a5bea
IM
3040 */
3041static void
d46285a2
DDP
3042netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
3043 OVS_REQUIRES(dev->mutex)
585a5bea
IM
3044{
3045 int *enabled_queues, n_enabled = 0;
81acebda 3046 int i, k, total_txqs = dev->up.n_txq;
585a5bea 3047
eff23640 3048 enabled_queues = xcalloc(total_txqs, sizeof *enabled_queues);
585a5bea
IM
3049
3050 for (i = 0; i < total_txqs; i++) {
3051 /* Enabled queues always mapped to themselves. */
d46285a2 3052 if (dev->tx_q[i].map == i) {
585a5bea
IM
3053 enabled_queues[n_enabled++] = i;
3054 }
3055 }
3056
3057 if (n_enabled == 0 && total_txqs != 0) {
f3ea2ad2 3058 enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
585a5bea
IM
3059 n_enabled = 1;
3060 }
3061
3062 k = 0;
3063 for (i = 0; i < total_txqs; i++) {
d46285a2
DDP
3064 if (dev->tx_q[i].map != i) {
3065 dev->tx_q[i].map = enabled_queues[k];
585a5bea
IM
3066 k = (k + 1) % n_enabled;
3067 }
3068 }
3069
2d24d165 3070 VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);
585a5bea 3071 for (i = 0; i < total_txqs; i++) {
d46285a2 3072 VLOG_DBG("%2d --> %2d", i, dev->tx_q[i].map);
585a5bea
IM
3073 }
3074
eff23640 3075 free(enabled_queues);
585a5bea 3076}
4573fbd3 3077
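/* Worked example of the remapping above: with n_txq == 4 and only queues
 * 0 and 2 enabled (tx_q[i].map == i), the disabled slots are filled
 * round-robin from the enabled set, giving the mapping
 * 0->0, 1->0, 2->2, 3->2. */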
58397e6c
KT
3078/*
3079 * A new virtio-net device is added to a vhost port.
3080 */
3081static int
0a0f39df 3082new_device(int vid)
58397e6c 3083{
d46285a2 3084 struct netdev_dpdk *dev;
58397e6c 3085 bool exists = false;
db8f13b0 3086 int newnode = 0;
0a0f39df
CL
3087 char ifname[IF_NAME_SZ];
3088
58be5c0e 3089 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
58397e6c
KT
3090
3091 ovs_mutex_lock(&dpdk_mutex);
3092 /* Add device to the vhost port with the same name as that passed down. */
d46285a2 3093 LIST_FOR_EACH(dev, list_node, &dpdk_list) {
c1ff66ac 3094 ovs_mutex_lock(&dev->mutex);
2d24d165 3095 if (strncmp(ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
f3e7ec25 3096 uint32_t qp_num = rte_vhost_get_vring_num(vid)/VIRTIO_QNUM;
db8f13b0
CL
3097
3098 /* Get NUMA information */
0a0f39df
CL
3099 newnode = rte_vhost_get_numa_node(vid);
3100 if (newnode == -1) {
5b9bf9e0 3101#ifdef VHOST_NUMA
db8f13b0 3102 VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
0a0f39df 3103 ifname);
5b9bf9e0 3104#endif
db8f13b0 3105 newnode = dev->socket_id;
db8f13b0
CL
3106 }
3107
7f5f2bd0
IM
3108 if (dev->requested_n_txq != qp_num
3109 || dev->requested_n_rxq != qp_num
3110 || dev->requested_socket_id != newnode) {
3111 dev->requested_socket_id = newnode;
3112 dev->requested_n_rxq = qp_num;
3113 dev->requested_n_txq = qp_num;
3114 netdev_request_reconfigure(&dev->up);
3115 } else {
3116 /* Reconfiguration not required. */
3117 dev->vhost_reconfigured = true;
3118 }
81acebda 3119
0a0f39df 3120 ovsrcu_index_set(&dev->vid, vid);
81acebda
IM
3121 exists = true;
3122
58397e6c 3123 /* Disable notifications. */
0a0f39df 3124 set_irq_status(vid);
e543851d 3125 netdev_change_seq_changed(&dev->up);
d46285a2 3126 ovs_mutex_unlock(&dev->mutex);
58397e6c
KT
3127 break;
3128 }
c1ff66ac 3129 ovs_mutex_unlock(&dev->mutex);
58397e6c
KT
3130 }
3131 ovs_mutex_unlock(&dpdk_mutex);
3132
3133 if (!exists) {
0a0f39df 3134 VLOG_INFO("vHost Device '%s' can't be added - name not found", ifname);
58397e6c
KT
3135
3136 return -1;
3137 }
3138
0a0f39df
CL
3139 VLOG_INFO("vHost Device '%s' has been added on numa node %i",
3140 ifname, newnode);
3141
58397e6c
KT
3142 return 0;
3143}
3144
f3ea2ad2
IM
3145/* Clears mapping for all available queues of vhost interface. */
3146static void
3147netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
3148 OVS_REQUIRES(dev->mutex)
3149{
3150 int i;
3151
81acebda 3152 for (i = 0; i < dev->up.n_txq; i++) {
f3ea2ad2
IM
3153 dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
3154 }
3155}
3156
58397e6c
KT
3157/*
3158 * Remove a virtio-net device from the specific vhost port. Use dev->remove
3159 * flag to stop any more packets from being sent or received to/from a VM and
3160 * ensure all currently queued packets have been sent/received before removing
3161 * the device.
3162 */
3163static void
0a0f39df 3164destroy_device(int vid)
58397e6c 3165{
d46285a2 3166 struct netdev_dpdk *dev;
afee281f 3167 bool exists = false;
0a0f39df
CL
3168 char ifname[IF_NAME_SZ];
3169
58be5c0e 3170 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
58397e6c
KT
3171
3172 ovs_mutex_lock(&dpdk_mutex);
d46285a2 3173 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
0a0f39df 3174 if (netdev_dpdk_get_vid(dev) == vid) {
58397e6c 3175
d46285a2 3176 ovs_mutex_lock(&dev->mutex);
0a0f39df
CL
3177 dev->vhost_reconfigured = false;
3178 ovsrcu_index_set(&dev->vid, -1);
d46285a2 3179 netdev_dpdk_txq_map_clear(dev);
81acebda 3180
e543851d 3181 netdev_change_seq_changed(&dev->up);
d46285a2 3182 ovs_mutex_unlock(&dev->mutex);
81acebda 3183 exists = true;
afee281f 3184 break;
58397e6c
KT
3185 }
3186 }
afee281f 3187
58397e6c
KT
3188 ovs_mutex_unlock(&dpdk_mutex);
3189
0a0f39df 3190 if (exists) {
afee281f
KT
3191 /*
3192 * Wait for other threads to quiesce after invalidating the device's
3193 * 'vid', before returning.
3194 */
3195 ovsrcu_synchronize();
3196 /*
3197 * As call to ovsrcu_synchronize() will end the quiescent state,
3198 * put thread back into quiescent state before returning.
3199 */
3200 ovsrcu_quiesce_start();
0a0f39df 3201 VLOG_INFO("vHost Device '%s' has been removed", ifname);
afee281f 3202 } else {
0a0f39df 3203 VLOG_INFO("vHost Device '%s' not found", ifname);
afee281f 3204 }
58397e6c
KT
3205}
3206
585a5bea 3207static int
0a0f39df 3208vring_state_changed(int vid, uint16_t queue_id, int enable)
585a5bea 3209{
d46285a2 3210 struct netdev_dpdk *dev;
585a5bea
IM
3211 bool exists = false;
3212 int qid = queue_id / VIRTIO_QNUM;
0a0f39df
CL
3213 char ifname[IF_NAME_SZ];
3214
58be5c0e 3215 rte_vhost_get_ifname(vid, ifname, sizeof ifname);
585a5bea
IM
3216
3217 if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
3218 return 0;
3219 }
3220
3221 ovs_mutex_lock(&dpdk_mutex);
d46285a2 3222 LIST_FOR_EACH (dev, list_node, &dpdk_list) {
c1ff66ac 3223 ovs_mutex_lock(&dev->mutex);
2d24d165 3224 if (strncmp(ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
585a5bea 3225 if (enable) {
d46285a2 3226 dev->tx_q[qid].map = qid;
585a5bea 3227 } else {
d46285a2 3228 dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
585a5bea 3229 }
d46285a2 3230 netdev_dpdk_remap_txqs(dev);
585a5bea 3231 exists = true;
d46285a2 3232 ovs_mutex_unlock(&dev->mutex);
585a5bea
IM
3233 break;
3234 }
c1ff66ac 3235 ovs_mutex_unlock(&dev->mutex);
585a5bea
IM
3236 }
3237 ovs_mutex_unlock(&dpdk_mutex);
3238
3239 if (exists) {
0a0f39df
CL
3240 VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' "
3241 "changed to '%s'", queue_id, qid, ifname,
d46285a2 3242 (enable == 1) ? "enabled" : "disabled");
585a5bea 3243 } else {
0a0f39df 3244 VLOG_INFO("vHost Device '%s' not found", ifname);
585a5bea
IM
3245 return -1;
3246 }
3247
3248 return 0;
3249}
3250
0a0f39df
CL
3251int
3252netdev_dpdk_get_vid(const struct netdev_dpdk *dev)
58397e6c 3253{
0a0f39df 3254 return ovsrcu_index_get(&dev->vid);
58397e6c
KT
3255}
3256
9509913a
IS
3257struct ingress_policer *
3258netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
3259{
3260 return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
3261}
3262
58397e6c 3263static int
ecc1a34e 3264netdev_dpdk_class_init(void)
7d1ced01 3265{
ecc1a34e
DDP
3266 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3267
3268 /* This function can be called for different classes. The initialization
3269 * needs to be done only once */
3270 if (ovsthread_once_start(&once)) {
3271 ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
3272 unixctl_command_register("netdev-dpdk/set-admin-state",
3273 "[netdev] up|down", 1, 2,
3274 netdev_dpdk_set_admin_state, NULL);
3275
0ee821c2
DB
3276 unixctl_command_register("netdev-dpdk/detach",
3277 "pci address of device", 1, 1,
3278 netdev_dpdk_detach, NULL);
3279
be481733
IM
3280 unixctl_command_register("netdev-dpdk/get-mempool-info",
3281 "[netdev]", 0, 1,
3282 netdev_dpdk_get_mempool_info, NULL);
3283
ecc1a34e
DDP
3284 ovsthread_once_done(&once);
3285 }
362ca396 3286
7d1ced01
CL
3287 return 0;
3288}
3289
033e9df2 3290
95fb793a 3291/* Client Rings */
3292
95fb793a 3293static int
3294dpdk_ring_create(const char dev_name[], unsigned int port_no,
bb37956a 3295 dpdk_port_t *eth_port_id)
95fb793a 3296{
48fffdee 3297 struct dpdk_ring *ring_pair;
0c6f39e5 3298 char *ring_name;
b8374d0d 3299 int port_id;
95fb793a 3300
48fffdee
KT
3301 ring_pair = dpdk_rte_mzalloc(sizeof *ring_pair);
3302 if (!ring_pair) {
95fb793a 3303 return ENOMEM;
3304 }
3305
7251515e 3306 /* XXX: Add support for multiqueue ring. */
0c6f39e5 3307 ring_name = xasprintf("%s_tx", dev_name);
95fb793a 3308
8f0a76c9 3309 /* Create single producer tx ring, netdev does explicit locking. */
48fffdee 3310 ring_pair->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
8f0a76c9 3311 RING_F_SP_ENQ);
0c6f39e5 3312 free(ring_name);
48fffdee
KT
3313 if (ring_pair->cring_tx == NULL) {
3314 rte_free(ring_pair);
95fb793a 3315 return ENOMEM;
3316 }
3317
0c6f39e5 3318 ring_name = xasprintf("%s_rx", dev_name);
95fb793a 3319
8f0a76c9 3320 /* Create single consumer rx ring, netdev does explicit locking. */
48fffdee 3321 ring_pair->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
8f0a76c9 3322 RING_F_SC_DEQ);
0c6f39e5 3323 free(ring_name);
48fffdee
KT
3324 if (ring_pair->cring_rx == NULL) {
3325 rte_free(ring_pair);
95fb793a 3326 return ENOMEM;
3327 }
3328
b8374d0d
MV
3329 port_id = rte_eth_from_rings(dev_name, &ring_pair->cring_rx, 1,
3330 &ring_pair->cring_tx, 1, SOCKET0);
d7310583 3331
b8374d0d 3332 if (port_id < 0) {
48fffdee 3333 rte_free(ring_pair);
95fb793a 3334 return ENODEV;
3335 }
3336
48fffdee 3337 ring_pair->user_port_id = port_no;
b8374d0d
MV
3338 ring_pair->eth_port_id = port_id;
3339 *eth_port_id = port_id;
3340
48fffdee 3341 ovs_list_push_back(&dpdk_ring_list, &ring_pair->list_node);
95fb793a 3342
95fb793a 3343 return 0;
3344}
3345
3346static int
bb37956a 3347dpdk_ring_open(const char dev_name[], dpdk_port_t *eth_port_id)
64839cf4 3348 OVS_REQUIRES(dpdk_mutex)
95fb793a 3349{
48fffdee 3350 struct dpdk_ring *ring_pair;
95fb793a 3351 unsigned int port_no;
3352 int err = 0;
3353
3354 /* Names always start with "dpdkr" */
3355 err = dpdk_dev_parse_name(dev_name, "dpdkr", &port_no);
3356 if (err) {
3357 return err;
3358 }
3359
58be5c0e 3360 /* Look through our list to find the device */
48fffdee
KT
3361 LIST_FOR_EACH (ring_pair, list_node, &dpdk_ring_list) {
3362 if (ring_pair->user_port_id == port_no) {
58397e6c 3363 VLOG_INFO("Found dpdk ring device %s:", dev_name);
58be5c0e 3364 /* Really all that is needed */
48fffdee 3365 *eth_port_id = ring_pair->eth_port_id;
95fb793a 3366 return 0;
3367 }
3368 }
3369 /* Need to create the device rings */
3370 return dpdk_ring_create(dev_name, port_no, eth_port_id);
3371}
3372
7251515e 3373static int
d46285a2 3374netdev_dpdk_ring_send(struct netdev *netdev, int qid,
b30896c9 3375 struct dp_packet_batch *batch, bool concurrent_txq)
7251515e 3376{
d46285a2 3377 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
8a543eb0 3378 struct dp_packet *packet;
1b99bb05 3379
58be5c0e 3380 /* When using 'dpdkr' and sending to a DPDK ring, we want to ensure that
3381 * the rss hash field is clear. This is because the same mbuf may be
3382 * modified by the consumer of the ring and returned into the datapath
3383 * without recalculating the RSS hash. */
e883448e 3384 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
8a543eb0 3385 dp_packet_mbuf_rss_flag_reset(packet);
1b99bb05 3386 }
7251515e 3387
b30896c9 3388 netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
7251515e 3389 return 0;
3390}
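/* Editor's note: dp_packet_mbuf_rss_flag_reset() only needs to clear the
 * RSS-valid bit in the mbuf's offload flags, so a minimal sketch of the
 * helper (assuming the DPDK flag name PKT_RX_RSS_HASH of this era) is:
 *
 *     mbuf->ol_flags &= ~PKT_RX_RSS_HASH;
 */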
3391
95fb793a 3392static int
3393netdev_dpdk_ring_construct(struct netdev *netdev)
3394{
bb37956a 3395 dpdk_port_t port_no = 0;
95fb793a 3396 int err = 0;
3397
95fb793a 3398 ovs_mutex_lock(&dpdk_mutex);
3399
3400 err = dpdk_ring_open(netdev->name, &port_no);
3401 if (err) {
3402 goto unlock_dpdk;
3403 }
3404
1ce30dfd 3405 err = common_construct(netdev, port_no, DPDK_DEV_ETH,
3406 rte_eth_dev_socket_id(port_no));
95fb793a 3407unlock_dpdk:
3408 ovs_mutex_unlock(&dpdk_mutex);
3409 return err;
3410}
3411
0bf765f7 3412/* QoS Functions */
3413
3414/*
3415 * Initialize QoS configuration operations.
3416 */
3417static void
3418qos_conf_init(struct qos_conf *conf, const struct dpdk_qos_ops *ops)
3419{
3420 conf->ops = ops;
78bd47cf 3421 rte_spinlock_init(&conf->lock);
0bf765f7 3422}
3423
3424/*
3425 * Search the existing QoS operations in qos_confs and compare each set
3426 * of operations' qos_name to name. Return a pointer to the matching
3427 * dpdk_qos_ops, else return NULL.
3428 */
3429static const struct dpdk_qos_ops *
3430qos_lookup_name(const char *name)
3431{
3432 const struct dpdk_qos_ops *const *opsp;
3433
3434 for (opsp = qos_confs; *opsp != NULL; opsp++) {
3435 const struct dpdk_qos_ops *ops = *opsp;
3436 if (!strcmp(name, ops->qos_name)) {
3437 return ops;
3438 }
3439 }
3440 return NULL;
3441}
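/* Editor's note: qos_lookup_name() walks the NULL-terminated qos_confs array
 * declared earlier in this file, so registering another QoS type is just a
 * matter of adding its ops before the terminator, roughly:
 *
 *     static const struct dpdk_qos_ops *const qos_confs[] = {
 *         &egress_policer_ops,
 *         NULL
 *     };
 */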
3442
0bf765f7 3443static int
3444netdev_dpdk_get_qos_types(const struct netdev *netdev OVS_UNUSED,
3445 struct sset *types)
3446{
3447 const struct dpdk_qos_ops *const *opsp;
3448
3449 for (opsp = qos_confs; *opsp != NULL; opsp++) {
3450 const struct dpdk_qos_ops *ops = *opsp;
3451 if (ops->qos_construct && ops->qos_name[0] != '\0') {
3452 sset_add(types, ops->qos_name);
3453 }
3454 }
3455 return 0;
3456}
3457
3458static int
d46285a2 3459netdev_dpdk_get_qos(const struct netdev *netdev,
0bf765f7 3460 const char **typep, struct smap *details)
3461{
d46285a2 3462 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
78bd47cf 3463 struct qos_conf *qos_conf;
0bf765f7 3464 int error = 0;
3465
d46285a2 3466 ovs_mutex_lock(&dev->mutex);
78bd47cf 3467 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
3468 if (qos_conf) {
3469 *typep = qos_conf->ops->qos_name;
3470 error = (qos_conf->ops->qos_get
3471 ? qos_conf->ops->qos_get(qos_conf, details) : 0);
d03603c4 3472 } else {
3473 /* No QoS configuration set, return an empty string */
3474 *typep = "";
0bf765f7 3475 }
d46285a2 3476 ovs_mutex_unlock(&dev->mutex);
0bf765f7 3477
3478 return error;
3479}
3480
3481static int
78bd47cf 3482netdev_dpdk_set_qos(struct netdev *netdev, const char *type,
3483 const struct smap *details)
0bf765f7 3484{
d46285a2 3485 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
0bf765f7 3486 const struct dpdk_qos_ops *new_ops = NULL;
78bd47cf 3487 struct qos_conf *qos_conf, *new_qos_conf = NULL;
0bf765f7 3488 int error = 0;
3489
d46285a2 3490 ovs_mutex_lock(&dev->mutex);
0bf765f7 3491
78bd47cf 3492 qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
0bf765f7 3493
78bd47cf 3494 new_ops = qos_lookup_name(type);
3495
3496 if (!new_ops || !new_ops->qos_construct) {
3497 new_qos_conf = NULL;
3498 if (type && type[0]) {
3499 error = EOPNOTSUPP;
0bf765f7 3500 }
44975bb0 3501 } else if (qos_conf && qos_conf->ops == new_ops
78bd47cf 3502 && qos_conf->ops->qos_is_equal(qos_conf, details)) {
3503 new_qos_conf = qos_conf;
0bf765f7 3504 } else {
78bd47cf 3505 error = new_ops->qos_construct(details, &new_qos_conf);
7ea266e9 3506 }
3507
7ea266e9 3508 if (error) {
78bd47cf 3509 VLOG_ERR("Failed to set QoS type %s on port %s: %s",
3510 type, netdev->name, rte_strerror(error));
3511 }
3512
3513 if (new_qos_conf != qos_conf) {
3514 ovsrcu_set(&dev->qos_conf, new_qos_conf);
3515 if (qos_conf) {
3516 ovsrcu_postpone(qos_conf->ops->qos_destruct, qos_conf);
3517 }
0bf765f7 3518 }
3519
d46285a2 3520 ovs_mutex_unlock(&dev->mutex);
78bd47cf 3521
0bf765f7 3522 return error;
3523}
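/* Editor's example (port name and rates illustrative): netdev_dpdk_set_qos()
 * is driven from the database, so an egress policer with a ~46 MB/s
 * committed rate can be requested with something like:
 *
 *   ovs-vsctl set port dpdk0 qos=@qos -- --id=@qos create qos \
 *       type=egress-policer other-config:cir=46000000 \
 *       other-config:cbs=2048
 */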
3524
3525/* egress-policer details */
3526
3527struct egress_policer {
3528 struct qos_conf qos_conf;
3529 struct rte_meter_srtcm_params app_srtcm_params;
3530 struct rte_meter_srtcm egress_meter;
3531};
3532
78bd47cf 3533static void
3534egress_policer_details_to_param(const struct smap *details,
3535 struct rte_meter_srtcm_params *params)
0bf765f7 3536{
78bd47cf 3537 memset(params, 0, sizeof *params);
3538 params->cir = smap_get_ullong(details, "cir", 0);
3539 params->cbs = smap_get_ullong(details, "cbs", 0);
3540 params->ebs = 0;
0bf765f7 3541}
3542
3543static int
78bd47cf 3544egress_policer_qos_construct(const struct smap *details,
3545 struct qos_conf **conf)
0bf765f7 3546{
0bf765f7 3547 struct egress_policer *policer;
0bf765f7 3548 int err = 0;
3549
0bf765f7 3550 policer = xmalloc(sizeof *policer);
3551 qos_conf_init(&policer->qos_conf, &egress_policer_ops);
78bd47cf 3552 egress_policer_details_to_param(details, &policer->app_srtcm_params);
0bf765f7 3553 err = rte_meter_srtcm_config(&policer->egress_meter,
78bd47cf 3554 &policer->app_srtcm_params);
3555 if (!err) {
3556 *conf = &policer->qos_conf;
3557 } else {
7ea266e9 3558 free(policer);
78bd47cf 3559 *conf = NULL;
7ea266e9 3560 err = -err;
3561 }
0bf765f7 3562
3563 return err;
3564}
3565
3566static void
78bd47cf 3567egress_policer_qos_destruct(struct qos_conf *conf)
0bf765f7 3568{
3569 struct egress_policer *policer = CONTAINER_OF(conf, struct egress_policer,
78bd47cf 3570 qos_conf);
0bf765f7 3571 free(policer);
3572}
3573
3574static int
78bd47cf 3575egress_policer_qos_get(const struct qos_conf *conf, struct smap *details)
0bf765f7 3576{
78bd47cf 3577 struct egress_policer *policer =
3578 CONTAINER_OF(conf, struct egress_policer, qos_conf);
3579
3580 smap_add_format(details, "cir", "%"PRIu64, policer->app_srtcm_params.cir);
3581 smap_add_format(details, "cbs", "%"PRIu64, policer->app_srtcm_params.cbs);
050c60bf 3582
0bf765f7 3583 return 0;
3584}
3585
78bd47cf 3586static bool
47a45d86 3587egress_policer_qos_is_equal(const struct qos_conf *conf,
3588 const struct smap *details)
0bf765f7 3589{
78bd47cf 3590 struct egress_policer *policer =
3591 CONTAINER_OF(conf, struct egress_policer, qos_conf);
3592 struct rte_meter_srtcm_params params;
0bf765f7 3593
78bd47cf 3594 egress_policer_details_to_param(details, &params);
7ea266e9 3595
78bd47cf 3596 return !memcmp(&params, &policer->app_srtcm_params, sizeof params);
0bf765f7 3597}
3598
0bf765f7 3599static int
3e90f7d7 3600egress_policer_run(struct qos_conf *conf, struct rte_mbuf **pkts, int pkt_cnt,
3601 bool may_steal)
0bf765f7 3602{
0bf765f7 3603 int cnt = 0;
78bd47cf 3604 struct egress_policer *policer =
3605 CONTAINER_OF(conf, struct egress_policer, qos_conf);
0bf765f7 3606
3e90f7d7 3607 cnt = netdev_dpdk_policer_run(&policer->egress_meter, pkts,
3608 pkt_cnt, may_steal);
0bf765f7 3609
3610 return cnt;
3611}
3612
3613static const struct dpdk_qos_ops egress_policer_ops = {
3614 "egress-policer", /* qos_name */
3615 egress_policer_qos_construct,
3616 egress_policer_qos_destruct,
3617 egress_policer_qos_get,
78bd47cf 3618 egress_policer_qos_is_equal,
0bf765f7 3619 egress_policer_run
3620};
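/* Editor's sketch, not part of the blamed source: netdev_dpdk_policer_run()
 * is expected to check each mbuf against the srTCM meter in color-blind mode
 * and keep only green packets. Assuming the pre-18.05 rte_meter API, the
 * per-packet check looks roughly like this, with the Ethernet header
 * excluded from the metered length: */
static bool
egress_policer_pkt_is_green_sketch(struct rte_meter_srtcm *meter,
                                   struct rte_mbuf *pkt)
{
    uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct ether_hdr);

    return rte_meter_srtcm_color_blind_check(meter, rte_rdtsc(), pkt_len)
           == e_RTE_METER_GREEN;
}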
3621
050c60bf 3622static int
3623netdev_dpdk_reconfigure(struct netdev *netdev)
3624{
3625 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
3626 int err = 0;
3627
050c60bf 3628 ovs_mutex_lock(&dev->mutex);
3629
3630 if (netdev->n_txq == dev->requested_n_txq
0072e931 3631 && netdev->n_rxq == dev->requested_n_rxq
b685696b 3632 && dev->mtu == dev->requested_mtu
3633 && dev->rxq_size == dev->requested_rxq_size
bd4e172b 3634 && dev->txq_size == dev->requested_txq_size
3635 && dev->socket_id == dev->requested_socket_id) {
050c60bf 3636 /* Reconfiguration is unnecessary. */
3637
3638 goto out;
3639 }
3640
3641 rte_eth_dev_stop(dev->port_id);
3642
d555d9bd 3643 err = netdev_dpdk_mempool_configure(dev);
b6b26021 3644 if (err && err != EEXIST) {
d555d9bd 3645 goto out;
0072e931 3646 }
3647
050c60bf 3648 netdev->n_txq = dev->requested_n_txq;
3649 netdev->n_rxq = dev->requested_n_rxq;
3650
b685696b 3651 dev->rxq_size = dev->requested_rxq_size;
3652 dev->txq_size = dev->requested_txq_size;
3653
050c60bf 3654 rte_free(dev->tx_q);
3655 err = dpdk_eth_dev_init(dev);
eff23640 3656 dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
3657 if (!dev->tx_q) {
3658 err = ENOMEM;
3659 }
050c60bf 3660
0072e931 3661 netdev_change_seq_changed(netdev);
3662
050c60bf 3663out:
050c60bf 3664 ovs_mutex_unlock(&dev->mutex);
050c60bf 3665 return err;
3666}
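/* Editor's sketch, not part of the blamed source: the requested_* fields
 * consumed above are filled in by the set_config/set_tx_multiq paths, which
 * only record the new value and defer the expensive work by asking the
 * datapath to call netdev_dpdk_reconfigure() at a safe point, along these
 * lines (the _sketch name is hypothetical): */
static int
netdev_dpdk_set_tx_multiq_sketch(struct netdev *netdev, unsigned int n_txq)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    if (dev->requested_n_txq != n_txq) {
        dev->requested_n_txq = n_txq;
        netdev_request_reconfigure(netdev);
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}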
3667
7f381c2e 3668static int
2d24d165 3669dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
2d24d165 3670 OVS_REQUIRES(dev->mutex)
050c60bf 3671{
2d24d165 3672 dev->up.n_txq = dev->requested_n_txq;
3673 dev->up.n_rxq = dev->requested_n_rxq;
96e9b168 3674 int err;
050c60bf 3675
81acebda 3676 /* Enable TX queue 0 by default if it wasn't disabled. */
3677 if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
3678 dev->tx_q[0].map = 0;
3679 }
3680
3681 netdev_dpdk_remap_txqs(dev);
3682
d555d9bd 3683 err = netdev_dpdk_mempool_configure(dev);
b6b26021 3684 if (!err) {
3685 /* A new mempool was created. */
d555d9bd 3686 netdev_change_seq_changed(&dev->up);
b6b26021 3687 } else if (err != EEXIST) {
3688 return err;
db8f13b0 3689 }
0a0f39df 3690 if (netdev_dpdk_get_vid(dev) >= 0) {
894af647 3691 if (dev->vhost_reconfigured == false) {
3692 dev->vhost_reconfigured = true;
3693 /* Carrier status may need updating. */
3694 netdev_change_seq_changed(&dev->up);
3695 }
81acebda 3696 }
7f381c2e 3697
3698 return 0;
2d24d165 3699}
3700
3701static int
3702netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
3703{
3704 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
7f381c2e 3705 int err;
2d24d165 3706
2d24d165 3707 ovs_mutex_lock(&dev->mutex);
7f381c2e 3708 err = dpdk_vhost_reconfigure_helper(dev);
2d24d165 3709 ovs_mutex_unlock(&dev->mutex);
7f381c2e 3710
3711 return err;
2d24d165 3712}
3713
3714static int
3715netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
3716{
3717 struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
7f381c2e 3718 int err;
a14d1cc8 3719 uint64_t vhost_flags = 0;
10087cba 3720 bool zc_enabled;
2d24d165 3721
2d24d165 3722 ovs_mutex_lock(&dev->mutex);
3723
c1ff66ac 3724 /* Configure vHost client mode if requested and if the following criteria
3725 * are met:
2d24d165 3726 * 1. Device hasn't been registered yet.
3727 * 2. A path has been specified.
c1ff66ac 3728 */
3729 if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)
2d24d165 3730 && strlen(dev->vhost_id)) {
a14d1cc8 3731 /* Register client-mode device. */
3732 vhost_flags |= RTE_VHOST_USER_CLIENT;
3733
3734 /* Enable IOMMU support, if explicitly requested. */
3735 if (dpdk_vhost_iommu_enabled()) {
3736 vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
3737 }
10087cba 3738
3739 zc_enabled = dev->vhost_driver_flags
3740 & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
3741 /* Enable the zero-copy flag, if requested. */
3742 if (zc_enabled) {
3743 vhost_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
3744 }
3745
a14d1cc8 3746 err = rte_vhost_driver_register(dev->vhost_id, vhost_flags);
c1ff66ac 3747 if (err) {
2d24d165 3748 VLOG_ERR("vhost-user device setup failure for device %s\n",
3749 dev->vhost_id);
7f381c2e 3750 goto unlock;
c1ff66ac 3751 } else {
2d24d165 3752 /* Configuration successful */
a14d1cc8 3753 dev->vhost_driver_flags |= vhost_flags;
2d24d165 3754 VLOG_INFO("vHost User device '%s' created in 'client' mode, "
3755 "using client socket '%s'",
3756 dev->up.name, dev->vhost_id);
10087cba 3757 if (zc_enabled) {
3758 VLOG_INFO("Zero copy enabled for vHost port %s", dev->up.name);
3759 }
c1ff66ac 3760 }
f3e7ec25 3761
3762 err = rte_vhost_driver_callback_register(dev->vhost_id,
3763 &virtio_net_device_ops);
3764 if (err) {
3765 VLOG_ERR("rte_vhost_driver_callback_register failed for "
3766 "vhost user client port: %s\n", dev->up.name);
3767 goto unlock;
3768 }
3769
3770 err = rte_vhost_driver_disable_features(dev->vhost_id,
3771 1ULL << VIRTIO_NET_F_HOST_TSO4
3772 | 1ULL << VIRTIO_NET_F_HOST_TSO6
3773 | 1ULL << VIRTIO_NET_F_CSUM);
3774 if (err) {
3775 VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
3776 "client port: %s\n", dev->up.name);
3777 goto unlock;
3778 }
3779
3780 err = rte_vhost_driver_start(dev->vhost_id);
3781 if (err) {
3782 VLOG_ERR("rte_vhost_driver_start failed for vhost user "
3783 "client port: %s\n", dev->up.name);
3784 goto unlock;
3785 }
c1ff66ac 3786 }
3787
7f381c2e 3788 err = dpdk_vhost_reconfigure_helper(dev);
3789
3790unlock:
050c60bf 3791 ovs_mutex_unlock(&dev->mutex);
050c60bf 3792
7f381c2e 3793 return err;
050c60bf 3794}
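/* Editor's example (bridge, port, and socket path illustrative): the client
 * registration above is triggered by adding a port of type
 * dpdkvhostuserclient whose vhost-server-path points at a socket created by
 * the VM side:
 *
 *   ovs-vsctl add-port br0 dpdkvhostclient0 -- \
 *       set Interface dpdkvhostclient0 type=dpdkvhostuserclient \
 *       options:vhost-server-path=/tmp/dpdkvhostclient0
 */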
3795
ecc1a34e 3796#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT, \
81acebda 3797 SET_CONFIG, SET_TX_MULTIQ, SEND, \
971f4b39 3798 GET_CARRIER, GET_STATS, \
3799 GET_CUSTOM_STATS, \
81acebda 3800 GET_FEATURES, GET_STATUS, \
3801 RECONFIGURE, RXQ_RECV) \
95fb793a 3802{ \
3803 NAME, \
118c77b1 3804 true, /* is_pmd */ \
ecc1a34e 3805 INIT, /* init */ \
95fb793a 3806 NULL, /* netdev_dpdk_run */ \
3807 NULL, /* netdev_dpdk_wait */ \
3808 \
3809 netdev_dpdk_alloc, \
3810 CONSTRUCT, \
58397e6c 3811 DESTRUCT, \
95fb793a 3812 netdev_dpdk_dealloc, \
3813 netdev_dpdk_get_config, \
81acebda 3814 SET_CONFIG, \
95fb793a 3815 NULL, /* get_tunnel_config */ \
58397e6c 3816 NULL, /* build header */ \
3817 NULL, /* push header */ \
3818 NULL, /* pop header */ \
7dec44fe 3819 netdev_dpdk_get_numa_id, /* get_numa_id */ \
81acebda 3820 SET_TX_MULTIQ, \
95fb793a 3821 \
7251515e 3822 SEND, /* send */ \
95fb793a 3823 NULL, /* send_wait */ \
3824 \
3825 netdev_dpdk_set_etheraddr, \
3826 netdev_dpdk_get_etheraddr, \
3827 netdev_dpdk_get_mtu, \
0072e931 3828 netdev_dpdk_set_mtu, \
95fb793a 3829 netdev_dpdk_get_ifindex, \
58397e6c 3830 GET_CARRIER, \
95fb793a 3831 netdev_dpdk_get_carrier_resets, \
3832 netdev_dpdk_set_miimon, \
58397e6c 3833 GET_STATS, \
971f4b39 3834 GET_CUSTOM_STATS, \
58397e6c 3835 GET_FEATURES, \
95fb793a 3836 NULL, /* set_advertisements */ \
875ab130 3837 NULL, /* get_pt_mode */ \
95fb793a 3838 \
9509913a 3839 netdev_dpdk_set_policing, \
0bf765f7 3840 netdev_dpdk_get_qos_types, \
95fb793a 3841 NULL, /* get_qos_capabilities */ \
0bf765f7 3842 netdev_dpdk_get_qos, \
3843 netdev_dpdk_set_qos, \
95fb793a 3844 NULL, /* get_queue */ \
3845 NULL, /* set_queue */ \
3846 NULL, /* delete_queue */ \
3847 NULL, /* get_queue_stats */ \
3848 NULL, /* queue_dump_start */ \
3849 NULL, /* queue_dump_next */ \
3850 NULL, /* queue_dump_done */ \
3851 NULL, /* dump_queue_stats */ \
3852 \
95fb793a 3853 NULL, /* set_in4 */ \
a8704b50 3854 NULL, /* get_addr_list */ \
95fb793a 3855 NULL, /* add_router */ \
3856 NULL, /* get_next_hop */ \
58397e6c 3857 GET_STATUS, \
95fb793a 3858 NULL, /* arp_lookup */ \
3859 \
3860 netdev_dpdk_update_flags, \
050c60bf 3861 RECONFIGURE, \
95fb793a 3862 \
3863 netdev_dpdk_rxq_alloc, \
3864 netdev_dpdk_rxq_construct, \
3865 netdev_dpdk_rxq_destruct, \
3866 netdev_dpdk_rxq_dealloc, \
58397e6c 3867 RXQ_RECV, \
95fb793a 3868 NULL, /* rx_wait */ \
3869 NULL, /* rxq_drain */ \
18ebd48c 3870 NO_OFFLOAD_API \
95fb793a 3871}
8a9562d2 3872
bce01e3a 3873static const struct netdev_class dpdk_class =
95fb793a 3874 NETDEV_DPDK_CLASS(
3875 "dpdk",
ecc1a34e 3876 netdev_dpdk_class_init,
5496878c 3877 netdev_dpdk_construct,
58397e6c 3878 netdev_dpdk_destruct,
81acebda 3879 netdev_dpdk_set_config,
3880 netdev_dpdk_set_tx_multiq,
58397e6c 3881 netdev_dpdk_eth_send,
3882 netdev_dpdk_get_carrier,
3883 netdev_dpdk_get_stats,
971f4b39 3884 netdev_dpdk_get_custom_stats,
58397e6c 3885 netdev_dpdk_get_features,
3886 netdev_dpdk_get_status,
050c60bf 3887 netdev_dpdk_reconfigure,
58397e6c 3888 netdev_dpdk_rxq_recv);
95fb793a 3889
bce01e3a 3890static const struct netdev_class dpdk_ring_class =
95fb793a 3891 NETDEV_DPDK_CLASS(
3892 "dpdkr",
ecc1a34e 3893 netdev_dpdk_class_init,
5496878c 3894 netdev_dpdk_ring_construct,
58397e6c 3895 netdev_dpdk_destruct,
c3d062a7 3896 netdev_dpdk_ring_set_config,
81acebda 3897 netdev_dpdk_set_tx_multiq,
58397e6c 3898 netdev_dpdk_ring_send,
3899 netdev_dpdk_get_carrier,
3900 netdev_dpdk_get_stats,
971f4b39 3901 netdev_dpdk_get_custom_stats,
58397e6c 3902 netdev_dpdk_get_features,
3903 netdev_dpdk_get_status,
050c60bf 3904 netdev_dpdk_reconfigure,
58397e6c 3905 netdev_dpdk_rxq_recv);
3906
53f50d24 3907static const struct netdev_class dpdk_vhost_class =
7d1ced01 3908 NETDEV_DPDK_CLASS(
3909 "dpdkvhostuser",
f3e7ec25 3910 NULL,
53f50d24 3911 netdev_dpdk_vhost_construct,
58397e6c 3912 netdev_dpdk_vhost_destruct,
2d24d165 3913 NULL,
81acebda 3914 NULL,
58397e6c
KT
3915 netdev_dpdk_vhost_send,
3916 netdev_dpdk_vhost_get_carrier,
3917 netdev_dpdk_vhost_get_stats,
3918 NULL,
7251515e 3919 NULL,
b2e8b12f 3920 netdev_dpdk_vhost_user_get_status,
53f50d24 3921 netdev_dpdk_vhost_reconfigure,
58397e6c 3922 netdev_dpdk_vhost_rxq_recv);
2d24d165 3923static const struct netdev_class dpdk_vhost_client_class =
3924 NETDEV_DPDK_CLASS(
3925 "dpdkvhostuserclient",
f3e7ec25 3926 NULL,
2d24d165 3927 netdev_dpdk_vhost_client_construct,
3928 netdev_dpdk_vhost_destruct,
3929 netdev_dpdk_vhost_client_set_config,
3930 NULL,
3931 netdev_dpdk_vhost_send,
3932 netdev_dpdk_vhost_get_carrier,
3933 netdev_dpdk_vhost_get_stats,
3934 NULL,
3935 NULL,
b2e8b12f 3936 netdev_dpdk_vhost_user_get_status,
2d24d165 3937 netdev_dpdk_vhost_client_reconfigure,
3938 netdev_dpdk_vhost_rxq_recv);
95fb793a 3939
8a9562d2 3940void
3941netdev_dpdk_register(void)
3942{
bab69409 3943 netdev_register_provider(&dpdk_class);
3944 netdev_register_provider(&dpdk_ring_class);
53f50d24 3945 netdev_register_provider(&dpdk_vhost_class);
2d24d165 3946 netdev_register_provider(&dpdk_vhost_client_class);
8a9562d2 3947}
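/* Editor's example (bridge name and PCI address illustrative): with the four
 * classes registered above, ports of each type are created through the
 * database, e.g. for a physical DPDK port:
 *
 *   ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk \
 *       options:dpdk-devargs=0000:01:00.0
 */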