/*
 * Copyright (c) 2018, 2019 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19 #include "netdev-linux-private.h"
20 #include "netdev-linux.h"
21 #include "netdev-afxdp.h"
22 #include "netdev-afxdp-pool.h"
26 #include <linux/rtnetlink.h>
27 #include <linux/if_xdp.h>
33 #include <sys/resource.h>
34 #include <sys/socket.h>
35 #include <sys/types.h>
39 #include "dp-packet.h"
40 #include "dpif-netdev.h"
41 #include "fatal-signal.h"
42 #include "openvswitch/compiler.h"
43 #include "openvswitch/dynamic-string.h"
44 #include "openvswitch/list.h"
45 #include "openvswitch/thread.h"
46 #include "openvswitch/vlog.h"
49 #include "socket-util.h"
56 COVERAGE_DEFINE(afxdp_cq_empty
);
57 COVERAGE_DEFINE(afxdp_fq_full
);
58 COVERAGE_DEFINE(afxdp_tx_full
);
59 COVERAGE_DEFINE(afxdp_cq_skip
);
61 VLOG_DEFINE_THIS_MODULE(netdev_afxdp
);
63 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
66 #define FRAME_HEADROOM XDP_PACKET_HEADROOM
67 #define OVS_XDP_HEADROOM 128
68 #define FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE
69 #define FRAME_SHIFT XSK_UMEM__DEFAULT_FRAME_SHIFT
70 #define FRAME_SHIFT_MASK ((1 << FRAME_SHIFT) - 1)
72 #define PROD_NUM_DESCS XSK_RING_PROD__DEFAULT_NUM_DESCS
73 #define CONS_NUM_DESCS XSK_RING_CONS__DEFAULT_NUM_DESCS
75 #ifdef HAVE_XDP_NEED_WAKEUP
76 #define NEED_WAKEUP_DEFAULT true
78 #define NEED_WAKEUP_DEFAULT false
81 /* The worst case is all 4 queues TX/CQ/RX/FILL are full + some packets
82 * still on processing in threads. Number of packets currently in OVS
83 * processing is hard to estimate because it depends on number of ports.
84 * Setting NUM_FRAMES twice as large than total of ring sizes should be
85 * enough for most corner cases.
87 #define NUM_FRAMES (4 * (PROD_NUM_DESCS + CONS_NUM_DESCS))
88 #define BATCH_SIZE NETDEV_MAX_BURST
90 BUILD_ASSERT_DECL(IS_POW2(NUM_FRAMES
));
91 BUILD_ASSERT_DECL(PROD_NUM_DESCS
== CONS_NUM_DESCS
);
93 #define UMEM2DESC(elem, base) ((uint64_t)((char *)elem - (char *)base))
95 static struct xsk_socket_info
*xsk_configure(int ifindex
, int xdp_queue_id
,
98 bool report_socket_failures
);
99 static void xsk_remove_xdp_program(uint32_t ifindex
, enum afxdp_mode
);
100 static void xsk_destroy(struct xsk_socket_info
*xsk
);
101 static int xsk_configure_all(struct netdev
*netdev
);
102 static void xsk_destroy_all(struct netdev
*netdev
);
109 [OVS_AF_XDP_MODE_UNSPEC
] = {
110 .name
= "unspecified",
114 [OVS_AF_XDP_MODE_BEST_EFFORT
] = {
115 .name
= "best-effort",
119 [OVS_AF_XDP_MODE_NATIVE_ZC
] = {
120 .name
= "native-with-zerocopy",
121 .bind_flags
= XDP_ZEROCOPY
,
122 .xdp_flags
= XDP_FLAGS_DRV_MODE
,
124 [OVS_AF_XDP_MODE_NATIVE
] = {
126 .bind_flags
= XDP_COPY
,
127 .xdp_flags
= XDP_FLAGS_DRV_MODE
,
129 [OVS_AF_XDP_MODE_GENERIC
] = {
131 .bind_flags
= XDP_COPY
,
132 .xdp_flags
= XDP_FLAGS_SKB_MODE
,
137 struct xsk_umem_info
*umem_info
;
138 int lost_in_rings
; /* Number of packets left in tx, rx, cq and fq. */
139 struct ovs_list list_node
;
142 static struct ovs_mutex unused_pools_mutex
= OVS_MUTEX_INITIALIZER
;
143 static struct ovs_list unused_pools
OVS_GUARDED_BY(unused_pools_mutex
) =
144 OVS_LIST_INITIALIZER(&unused_pools
);
146 struct xsk_umem_info
{
147 struct umem_pool mpool
;
148 struct xpacket_pool xpool
;
149 struct xsk_ring_prod fq
;
150 struct xsk_ring_cons cq
;
151 struct xsk_umem
*umem
;
155 struct xsk_socket_info
{
156 struct xsk_ring_cons rx
;
157 struct xsk_ring_prod tx
;
158 struct xsk_umem_info
*umem
;
159 struct xsk_socket
*xsk
;
160 uint32_t outstanding_tx
; /* Number of descriptors filled in tx and cq. */
161 uint32_t available_rx
; /* Number of descriptors filled in rx and fq. */
162 atomic_uint64_t tx_dropped
;
165 struct netdev_afxdp_tx_lock
{
166 /* Padding to make netdev_afxdp_tx_lock exactly one cache line long. */
167 PADDED_MEMBERS(CACHE_LINE_SIZE
,
168 struct ovs_spin lock
;
172 #ifdef HAVE_XDP_NEED_WAKEUP
174 xsk_rx_wakeup_if_needed(struct xsk_umem_info
*umem
,
175 struct netdev
*netdev
, int fd
)
177 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
181 if (!dev
->use_need_wakeup
) {
185 if (xsk_ring_prod__needs_wakeup(&umem
->fq
)) {
189 ret
= poll(&pfd
, 1, 0);
190 if (OVS_UNLIKELY(ret
< 0)) {
191 VLOG_WARN_RL(&rl
, "%s: error polling rx fd: %s.",
192 netdev_get_name(netdev
),
193 ovs_strerror(errno
));
199 xsk_tx_need_wakeup(struct xsk_socket_info
*xsk_info
)
201 return xsk_ring_prod__needs_wakeup(&xsk_info
->tx
);
204 #else /* !HAVE_XDP_NEED_WAKEUP */
206 xsk_rx_wakeup_if_needed(struct xsk_umem_info
*umem OVS_UNUSED
,
207 struct netdev
*netdev OVS_UNUSED
,
214 xsk_tx_need_wakeup(struct xsk_socket_info
*xsk_info OVS_UNUSED
)
218 #endif /* HAVE_XDP_NEED_WAKEUP */
221 netdev_afxdp_cleanup_unused_pool(struct unused_pool
*pool
)
223 /* Free the packet buffer. */
224 free_pagealign(pool
->umem_info
->buffer
);
226 /* Cleanup umem pool. */
227 umem_pool_cleanup(&pool
->umem_info
->mpool
);
229 /* Cleanup metadata pool. */
230 xpacket_pool_cleanup(&pool
->umem_info
->xpool
);
232 free(pool
->umem_info
);
236 netdev_afxdp_sweep_unused_pools(void *aux OVS_UNUSED
)
238 struct unused_pool
*pool
, *next
;
241 ovs_mutex_lock(&unused_pools_mutex
);
242 LIST_FOR_EACH_SAFE (pool
, next
, list_node
, &unused_pools
) {
244 count
= umem_pool_count(&pool
->umem_info
->mpool
);
245 ovs_assert(count
+ pool
->lost_in_rings
<= NUM_FRAMES
);
247 if (count
+ pool
->lost_in_rings
== NUM_FRAMES
) {
248 /* OVS doesn't use this memory pool anymore. Kernel doesn't
249 * use it since closing the xdp socket. So, it's safe to free
251 VLOG_DBG("Freeing umem pool at 0x%"PRIxPTR
,
252 (uintptr_t) pool
->umem_info
);
253 ovs_list_remove(&pool
->list_node
);
254 netdev_afxdp_cleanup_unused_pool(pool
);
258 ovs_mutex_unlock(&unused_pools_mutex
);
261 static struct xsk_umem_info
*
262 xsk_configure_umem(void *buffer
, uint64_t size
)
264 struct xsk_umem_config uconfig
;
265 struct xsk_umem_info
*umem
;
269 umem
= xzalloc(sizeof *umem
);
271 memset(&uconfig
, 0, sizeof uconfig
);
272 uconfig
.fill_size
= PROD_NUM_DESCS
;
273 uconfig
.comp_size
= CONS_NUM_DESCS
;
274 uconfig
.frame_size
= FRAME_SIZE
;
275 uconfig
.frame_headroom
= OVS_XDP_HEADROOM
;
277 ret
= xsk_umem__create(&umem
->umem
, buffer
, size
, &umem
->fq
, &umem
->cq
,
280 VLOG_ERR("xsk_umem__create failed: %s.", ovs_strerror(errno
));
285 umem
->buffer
= buffer
;
287 /* Set-up umem pool. */
288 if (umem_pool_init(&umem
->mpool
, NUM_FRAMES
) < 0) {
289 VLOG_ERR("umem_pool_init failed");
290 if (xsk_umem__delete(umem
->umem
)) {
291 VLOG_ERR("xsk_umem__delete failed");
297 for (i
= NUM_FRAMES
- 1; i
>= 0; i
--) {
300 elem
= ALIGNED_CAST(void *, (char *)umem
->buffer
+ i
* FRAME_SIZE
);
301 umem_elem_push(&umem
->mpool
, elem
);
304 /* Set-up metadata. */
305 if (xpacket_pool_init(&umem
->xpool
, NUM_FRAMES
) < 0) {
306 VLOG_ERR("xpacket_pool_init failed");
307 umem_pool_cleanup(&umem
->mpool
);
308 if (xsk_umem__delete(umem
->umem
)) {
309 VLOG_ERR("xsk_umem__delete failed");
315 VLOG_DBG("%s: xpacket pool from %p to %p", __func__
,
317 (char *)umem
->xpool
.array
+
318 NUM_FRAMES
* sizeof(struct dp_packet_afxdp
));
320 for (i
= NUM_FRAMES
- 1; i
>= 0; i
--) {
321 struct dp_packet_afxdp
*xpacket
;
322 struct dp_packet
*packet
;
324 xpacket
= &umem
->xpool
.array
[i
];
325 xpacket
->mpool
= &umem
->mpool
;
327 packet
= &xpacket
->packet
;
328 packet
->source
= DPBUF_AFXDP
;
334 static struct xsk_socket_info
*
335 xsk_configure_socket(struct xsk_umem_info
*umem
, uint32_t ifindex
,
336 uint32_t queue_id
, enum afxdp_mode mode
,
337 bool use_need_wakeup
, bool report_socket_failures
)
339 struct xsk_socket_config cfg
;
340 struct xsk_socket_info
*xsk
;
341 char devname
[IF_NAMESIZE
];
342 uint32_t idx
= 0, prog_id
;
346 xsk
= xzalloc(sizeof *xsk
);
348 cfg
.rx_size
= CONS_NUM_DESCS
;
349 cfg
.tx_size
= PROD_NUM_DESCS
;
350 cfg
.libbpf_flags
= 0;
351 cfg
.bind_flags
= xdp_modes
[mode
].bind_flags
;
352 cfg
.xdp_flags
= xdp_modes
[mode
].xdp_flags
| XDP_FLAGS_UPDATE_IF_NOEXIST
;
354 #ifdef HAVE_XDP_NEED_WAKEUP
355 if (use_need_wakeup
) {
356 cfg
.bind_flags
|= XDP_USE_NEED_WAKEUP
;
360 if (if_indextoname(ifindex
, devname
) == NULL
) {
361 VLOG_ERR("ifindex %d to devname failed (%s)",
362 ifindex
, ovs_strerror(errno
));
367 ret
= xsk_socket__create(&xsk
->xsk
, devname
, queue_id
, umem
->umem
,
368 &xsk
->rx
, &xsk
->tx
, &cfg
);
370 VLOG(report_socket_failures
? VLL_ERR
: VLL_DBG
,
371 "xsk_socket__create failed (%s) mode: %s, "
372 "use-need-wakeup: %s, qid: %d",
373 ovs_strerror(errno
), xdp_modes
[mode
].name
,
374 use_need_wakeup
? "true" : "false", queue_id
);
379 /* Make sure the built-in AF_XDP program is loaded. */
380 ret
= bpf_get_link_xdp_id(ifindex
, &prog_id
, cfg
.xdp_flags
);
381 if (ret
|| !prog_id
) {
383 VLOG_ERR("Get XDP prog ID failed (%s)", ovs_strerror(errno
));
385 VLOG_ERR("No XDP program is loaded at ifindex %d", ifindex
);
387 xsk_socket__delete(xsk
->xsk
);
392 while (!xsk_ring_prod__reserve(&xsk
->umem
->fq
,
393 PROD_NUM_DESCS
, &idx
)) {
394 VLOG_WARN_RL(&rl
, "Retry xsk_ring_prod__reserve to FILL queue");
398 i
< PROD_NUM_DESCS
* FRAME_SIZE
;
403 elem
= umem_elem_pop(&xsk
->umem
->mpool
);
404 addr
= UMEM2DESC(elem
, xsk
->umem
->buffer
);
406 *xsk_ring_prod__fill_addr(&xsk
->umem
->fq
, idx
++) = addr
;
409 xsk_ring_prod__submit(&xsk
->umem
->fq
,
414 static struct xsk_socket_info
*
415 xsk_configure(int ifindex
, int xdp_queue_id
, enum afxdp_mode mode
,
416 bool use_need_wakeup
, bool report_socket_failures
)
418 struct xsk_socket_info
*xsk
;
419 struct xsk_umem_info
*umem
;
422 netdev_afxdp_sweep_unused_pools(NULL
);
424 /* Umem memory region. */
425 bufs
= xmalloc_pagealign(NUM_FRAMES
* FRAME_SIZE
);
426 memset(bufs
, 0, NUM_FRAMES
* FRAME_SIZE
);
428 /* Create AF_XDP socket. */
429 umem
= xsk_configure_umem(bufs
, NUM_FRAMES
* FRAME_SIZE
);
431 free_pagealign(bufs
);
435 VLOG_DBG("Allocated umem pool at 0x%"PRIxPTR
, (uintptr_t) umem
);
437 xsk
= xsk_configure_socket(umem
, ifindex
, xdp_queue_id
, mode
,
438 use_need_wakeup
, report_socket_failures
);
440 /* Clean up umem and xpacket pool. */
441 if (xsk_umem__delete(umem
->umem
)) {
442 VLOG_ERR("xsk_umem__delete failed.");
444 free_pagealign(bufs
);
445 umem_pool_cleanup(&umem
->mpool
);
446 xpacket_pool_cleanup(&umem
->xpool
);
453 xsk_configure_queue(struct netdev_linux
*dev
, int ifindex
, int queue_id
,
454 enum afxdp_mode mode
, bool report_socket_failures
)
456 struct xsk_socket_info
*xsk_info
;
458 VLOG_DBG("%s: configuring queue: %d, mode: %s, use-need-wakeup: %s.",
459 netdev_get_name(&dev
->up
), queue_id
, xdp_modes
[mode
].name
,
460 dev
->use_need_wakeup
? "true" : "false");
461 xsk_info
= xsk_configure(ifindex
, queue_id
, mode
, dev
->use_need_wakeup
,
462 report_socket_failures
);
464 VLOG(report_socket_failures
? VLL_ERR
: VLL_DBG
,
465 "%s: Failed to create AF_XDP socket on queue %d in %s mode.",
466 netdev_get_name(&dev
->up
), queue_id
, xdp_modes
[mode
].name
);
467 dev
->xsks
[queue_id
] = NULL
;
470 dev
->xsks
[queue_id
] = xsk_info
;
471 atomic_init(&xsk_info
->tx_dropped
, 0);
472 xsk_info
->outstanding_tx
= 0;
473 xsk_info
->available_rx
= PROD_NUM_DESCS
;
479 xsk_configure_all(struct netdev
*netdev
)
481 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
482 int i
, ifindex
, n_rxq
, n_txq
;
485 ifindex
= linux_get_ifindex(netdev_get_name(netdev
));
487 ovs_assert(dev
->xsks
== NULL
);
488 ovs_assert(dev
->tx_locks
== NULL
);
490 n_rxq
= netdev_n_rxq(netdev
);
491 dev
->xsks
= xcalloc(n_rxq
, sizeof *dev
->xsks
);
493 if (dev
->xdp_mode
== OVS_AF_XDP_MODE_BEST_EFFORT
) {
494 /* Trying to configure first queue with different modes to
495 * find the most suitable. */
496 for (i
= OVS_AF_XDP_MODE_NATIVE_ZC
; i
< OVS_AF_XDP_MODE_MAX
; i
++) {
497 if (!xsk_configure_queue(dev
, ifindex
, qid
, i
,
498 i
== OVS_AF_XDP_MODE_MAX
- 1)) {
499 dev
->xdp_mode_in_use
= i
;
500 VLOG_INFO("%s: %s XDP mode will be in use.",
501 netdev_get_name(netdev
), xdp_modes
[i
].name
);
505 if (i
== OVS_AF_XDP_MODE_MAX
) {
506 VLOG_ERR("%s: Failed to detect suitable XDP mode.",
507 netdev_get_name(netdev
));
512 dev
->xdp_mode_in_use
= dev
->xdp_mode
;
515 /* Configure remaining queues. */
516 for (; qid
< n_rxq
; qid
++) {
517 if (xsk_configure_queue(dev
, ifindex
, qid
,
518 dev
->xdp_mode_in_use
, true)) {
519 VLOG_ERR("%s: Failed to create AF_XDP socket on queue %d.",
520 netdev_get_name(netdev
), qid
);
525 n_txq
= netdev_n_txq(netdev
);
526 dev
->tx_locks
= xzalloc_cacheline(n_txq
* sizeof *dev
->tx_locks
);
528 for (i
= 0; i
< n_txq
; i
++) {
529 ovs_spin_init(&dev
->tx_locks
[i
].lock
);
535 xsk_destroy_all(netdev
);
540 xsk_destroy(struct xsk_socket_info
*xsk_info
)
542 struct xsk_umem
*umem
;
543 struct unused_pool
*pool
;
545 xsk_socket__delete(xsk_info
->xsk
);
546 xsk_info
->xsk
= NULL
;
548 umem
= xsk_info
->umem
->umem
;
549 if (xsk_umem__delete(umem
)) {
550 VLOG_ERR("xsk_umem__delete failed.");
553 pool
= xzalloc(sizeof *pool
);
554 pool
->umem_info
= xsk_info
->umem
;
555 pool
->lost_in_rings
= xsk_info
->outstanding_tx
+ xsk_info
->available_rx
;
557 ovs_mutex_lock(&unused_pools_mutex
);
558 ovs_list_push_back(&unused_pools
, &pool
->list_node
);
559 ovs_mutex_unlock(&unused_pools_mutex
);
563 netdev_afxdp_sweep_unused_pools(NULL
);
567 xsk_destroy_all(struct netdev
*netdev
)
569 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
573 for (i
= 0; i
< netdev_n_rxq(netdev
); i
++) {
575 xsk_destroy(dev
->xsks
[i
]);
577 VLOG_DBG("%s: Destroyed xsk[%d].", netdev_get_name(netdev
), i
);
585 VLOG_INFO("%s: Removing xdp program.", netdev_get_name(netdev
));
586 ifindex
= linux_get_ifindex(netdev_get_name(netdev
));
587 xsk_remove_xdp_program(ifindex
, dev
->xdp_mode_in_use
);
590 for (i
= 0; i
< netdev_n_txq(netdev
); i
++) {
591 ovs_spin_destroy(&dev
->tx_locks
[i
].lock
);
593 free_cacheline(dev
->tx_locks
);
594 dev
->tx_locks
= NULL
;
599 netdev_afxdp_set_config(struct netdev
*netdev
, const struct smap
*args
,
600 char **errp OVS_UNUSED
)
602 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
603 const char *str_xdp_mode
;
604 enum afxdp_mode xdp_mode
;
608 ovs_mutex_lock(&dev
->mutex
);
609 new_n_rxq
= MAX(smap_get_int(args
, "n_rxq", NR_QUEUE
), 1);
610 if (new_n_rxq
> MAX_XSKQ
) {
611 ovs_mutex_unlock(&dev
->mutex
);
612 VLOG_ERR("%s: Too big 'n_rxq' (%d > %d).",
613 netdev_get_name(netdev
), new_n_rxq
, MAX_XSKQ
);
617 str_xdp_mode
= smap_get_def(args
, "xdp-mode", "best-effort");
618 for (xdp_mode
= OVS_AF_XDP_MODE_BEST_EFFORT
;
619 xdp_mode
< OVS_AF_XDP_MODE_MAX
;
621 if (!strcasecmp(str_xdp_mode
, xdp_modes
[xdp_mode
].name
)) {
625 if (xdp_mode
== OVS_AF_XDP_MODE_MAX
) {
626 VLOG_ERR("%s: Incorrect xdp-mode (%s).",
627 netdev_get_name(netdev
), str_xdp_mode
);
628 ovs_mutex_unlock(&dev
->mutex
);
632 need_wakeup
= smap_get_bool(args
, "use-need-wakeup", NEED_WAKEUP_DEFAULT
);
633 #ifndef HAVE_XDP_NEED_WAKEUP
635 VLOG_WARN("XDP need_wakeup is not supported in libbpf.");
640 if (dev
->requested_n_rxq
!= new_n_rxq
641 || dev
->requested_xdp_mode
!= xdp_mode
642 || dev
->requested_need_wakeup
!= need_wakeup
) {
643 dev
->requested_n_rxq
= new_n_rxq
;
644 dev
->requested_xdp_mode
= xdp_mode
;
645 dev
->requested_need_wakeup
= need_wakeup
;
646 netdev_request_reconfigure(netdev
);
648 ovs_mutex_unlock(&dev
->mutex
);
653 netdev_afxdp_get_config(const struct netdev
*netdev
, struct smap
*args
)
655 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
657 ovs_mutex_lock(&dev
->mutex
);
658 smap_add_format(args
, "n_rxq", "%d", netdev
->n_rxq
);
659 smap_add_format(args
, "xdp-mode", "%s", xdp_modes
[dev
->xdp_mode
].name
);
660 smap_add_format(args
, "xdp-mode-in-use", "%s",
661 xdp_modes
[dev
->xdp_mode_in_use
].name
);
662 smap_add_format(args
, "use-need-wakeup", "%s",
663 dev
->use_need_wakeup
? "true" : "false");
664 ovs_mutex_unlock(&dev
->mutex
);
669 netdev_afxdp_reconfigure(struct netdev
*netdev
)
671 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
672 struct rlimit r
= {RLIM_INFINITY
, RLIM_INFINITY
};
673 struct bitmask
*old_bm
= NULL
;
674 int old_policy
, numa_id
;
677 /* Allocate all the xsk related memory in the netdev's NUMA domain. */
678 if (numa_available() != -1 && ovs_numa_get_n_numas() > 1) {
679 numa_id
= netdev_get_numa_id(netdev
);
680 if (numa_id
!= NETDEV_NUMA_UNSPEC
) {
681 old_bm
= numa_allocate_nodemask();
682 if (get_mempolicy(&old_policy
, old_bm
->maskp
, old_bm
->size
+ 1,
684 VLOG_INFO("Failed to get NUMA memory policy: %s.",
685 ovs_strerror(errno
));
686 numa_bitmask_free(old_bm
);
689 numa_set_preferred(numa_id
);
694 ovs_mutex_lock(&dev
->mutex
);
696 if (netdev
->n_rxq
== dev
->requested_n_rxq
697 && dev
->xdp_mode
== dev
->requested_xdp_mode
698 && dev
->use_need_wakeup
== dev
->requested_need_wakeup
703 xsk_destroy_all(netdev
);
705 netdev
->n_rxq
= dev
->requested_n_rxq
;
706 netdev
->n_txq
= netdev
->n_rxq
;
708 dev
->xdp_mode
= dev
->requested_xdp_mode
;
709 VLOG_INFO("%s: Setting XDP mode to %s.", netdev_get_name(netdev
),
710 xdp_modes
[dev
->xdp_mode
].name
);
712 if (setrlimit(RLIMIT_MEMLOCK
, &r
)) {
713 VLOG_ERR("setrlimit(RLIMIT_MEMLOCK) failed: %s", ovs_strerror(errno
));
715 dev
->use_need_wakeup
= dev
->requested_need_wakeup
;
717 err
= xsk_configure_all(netdev
);
719 VLOG_ERR("%s: AF_XDP device reconfiguration failed.",
720 netdev_get_name(netdev
));
722 netdev_change_seq_changed(netdev
);
724 ovs_mutex_unlock(&dev
->mutex
);
726 if (set_mempolicy(old_policy
, old_bm
->maskp
, old_bm
->size
+ 1)) {
727 VLOG_WARN("Failed to restore NUMA memory policy: %s.",
728 ovs_strerror(errno
));
729 /* Can't restore correctly. Try to use localalloc as the most
730 * likely default memory policy. */
731 numa_set_localalloc();
733 numa_bitmask_free(old_bm
);
739 xsk_remove_xdp_program(uint32_t ifindex
, enum afxdp_mode mode
)
741 uint32_t flags
= xdp_modes
[mode
].xdp_flags
| XDP_FLAGS_UPDATE_IF_NOEXIST
;
742 uint32_t ret
, prog_id
= 0;
744 /* Check whether XDP program is loaded. */
745 ret
= bpf_get_link_xdp_id(ifindex
, &prog_id
, flags
);
747 VLOG_ERR("Failed to get XDP prog id (%s)", ovs_strerror(errno
));
752 VLOG_INFO("No XDP program is loaded at ifindex %d", ifindex
);
756 bpf_set_link_xdp_fd(ifindex
, -1, flags
);
760 signal_remove_xdp(struct netdev
*netdev
)
762 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
765 ifindex
= linux_get_ifindex(netdev_get_name(netdev
));
767 VLOG_WARN("Force removing xdp program.");
768 xsk_remove_xdp_program(ifindex
, dev
->xdp_mode_in_use
);
771 static struct dp_packet_afxdp
*
772 dp_packet_cast_afxdp(const struct dp_packet
*d
)
774 ovs_assert(d
->source
== DPBUF_AFXDP
);
775 return CONTAINER_OF(d
, struct dp_packet_afxdp
, packet
);
779 prepare_fill_queue(struct xsk_socket_info
*xsk_info
)
781 struct xsk_umem_info
*umem
;
782 void *elems
[BATCH_SIZE
];
786 umem
= xsk_info
->umem
;
788 if (xsk_prod_nb_free(&umem
->fq
, BATCH_SIZE
) < BATCH_SIZE
) {
792 ret
= umem_elem_pop_n(&umem
->mpool
, BATCH_SIZE
, elems
);
793 if (OVS_UNLIKELY(ret
)) {
797 if (!xsk_ring_prod__reserve(&umem
->fq
, BATCH_SIZE
, &idx_fq
)) {
798 umem_elem_push_n(&umem
->mpool
, BATCH_SIZE
, elems
);
799 COVERAGE_INC(afxdp_fq_full
);
803 for (i
= 0; i
< BATCH_SIZE
; i
++) {
808 index
= (uint64_t)((char *)elem
- (char *)umem
->buffer
);
809 ovs_assert((index
& FRAME_SHIFT_MASK
) == 0);
810 *xsk_ring_prod__fill_addr(&umem
->fq
, idx_fq
) = index
;
814 xsk_ring_prod__submit(&umem
->fq
, BATCH_SIZE
);
815 xsk_info
->available_rx
+= BATCH_SIZE
;
819 netdev_afxdp_rxq_recv(struct netdev_rxq
*rxq_
, struct dp_packet_batch
*batch
,
822 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
823 struct netdev
*netdev
= rx
->up
.netdev
;
824 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
825 struct xsk_socket_info
*xsk_info
;
826 struct xsk_umem_info
*umem
;
828 int qid
= rxq_
->queue_id
;
829 unsigned int rcvd
, i
;
831 xsk_info
= dev
->xsks
[qid
];
832 if (!xsk_info
|| !xsk_info
->xsk
) {
836 prepare_fill_queue(xsk_info
);
838 umem
= xsk_info
->umem
;
839 rx
->fd
= xsk_socket__fd(xsk_info
->xsk
);
841 rcvd
= xsk_ring_cons__peek(&xsk_info
->rx
, BATCH_SIZE
, &idx_rx
);
843 xsk_rx_wakeup_if_needed(umem
, netdev
, rx
->fd
);
847 /* Setup a dp_packet batch from descriptors in RX queue. */
848 for (i
= 0; i
< rcvd
; i
++) {
849 struct dp_packet_afxdp
*xpacket
;
850 const struct xdp_desc
*desc
;
851 struct dp_packet
*packet
;
852 uint64_t addr
, index
;
856 desc
= xsk_ring_cons__rx_desc(&xsk_info
->rx
, idx_rx
);
860 pkt
= xsk_umem__get_data(umem
->buffer
, addr
);
861 index
= addr
>> FRAME_SHIFT
;
862 xpacket
= &umem
->xpool
.array
[index
];
863 packet
= &xpacket
->packet
;
865 /* Initialize the struct dp_packet. */
866 dp_packet_use_afxdp(packet
, pkt
,
867 FRAME_SIZE
- FRAME_HEADROOM
,
869 dp_packet_set_size(packet
, len
);
871 /* Add packet into batch, increase batch->count. */
872 dp_packet_batch_add(batch
, packet
);
876 /* Release the RX queue. */
877 xsk_ring_cons__release(&xsk_info
->rx
, rcvd
);
878 xsk_info
->available_rx
-= rcvd
;
881 /* TODO: return the number of remaining packets in the queue. */
888 kick_tx(struct xsk_socket_info
*xsk_info
, enum afxdp_mode mode
,
889 bool use_need_wakeup
)
892 static const int KERNEL_TX_BATCH_SIZE
= 16;
894 if (use_need_wakeup
&& !xsk_tx_need_wakeup(xsk_info
)) {
898 /* In all modes except native-with-zerocopy packet transmission is
899 * synchronous, and the kernel xmits only TX_BATCH_SIZE(16) packets for a
900 * single sendmsg syscall.
901 * So, we have to kick the kernel (n_packets / 16) times to be sure that
902 * all packets are transmitted. */
903 retries
= (mode
!= OVS_AF_XDP_MODE_NATIVE_ZC
)
904 ? xsk_info
->outstanding_tx
/ KERNEL_TX_BATCH_SIZE
907 /* This causes system call into kernel's xsk_sendmsg, and xsk_generic_xmit
908 * (generic and native modes) or xsk_zc_xmit (native-with-zerocopy mode).
910 ret
= sendto(xsk_socket__fd(xsk_info
->xsk
), NULL
, 0, MSG_DONTWAIT
,
913 if (retries
-- && errno
== EAGAIN
) {
916 if (errno
== ENXIO
|| errno
== ENOBUFS
|| errno
== EOPNOTSUPP
) {
920 /* No error, or EBUSY, or too many retries on EAGAIN. */
925 free_afxdp_buf(struct dp_packet
*p
)
927 struct dp_packet_afxdp
*xpacket
;
930 xpacket
= dp_packet_cast_afxdp(p
);
931 if (xpacket
->mpool
) {
932 void *base
= dp_packet_base(p
);
934 addr
= (uintptr_t)base
& (~FRAME_SHIFT_MASK
);
935 umem_elem_push(xpacket
->mpool
, (void *)addr
);
940 free_afxdp_buf_batch(struct dp_packet_batch
*batch
)
942 struct dp_packet_afxdp
*xpacket
= NULL
;
943 struct dp_packet
*packet
;
944 void *elems
[BATCH_SIZE
];
947 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
950 xpacket
= dp_packet_cast_afxdp(packet
);
951 base
= dp_packet_base(packet
);
952 addr
= (uintptr_t)base
& (~FRAME_SHIFT_MASK
);
953 elems
[i
] = (void *)addr
;
955 umem_elem_push_n(xpacket
->mpool
, dp_packet_batch_size(batch
), elems
);
956 dp_packet_batch_init(batch
);
960 check_free_batch(struct dp_packet_batch
*batch
)
962 struct umem_pool
*first_mpool
= NULL
;
963 struct dp_packet_afxdp
*xpacket
;
964 struct dp_packet
*packet
;
966 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
967 if (packet
->source
!= DPBUF_AFXDP
) {
970 xpacket
= dp_packet_cast_afxdp(packet
);
972 first_mpool
= xpacket
->mpool
;
975 if (xpacket
->mpool
!= first_mpool
) {
979 /* All packets are DPBUF_AFXDP and from the same mpool. */
984 afxdp_complete_tx(struct xsk_socket_info
*xsk_info
)
986 void *elems_push
[BATCH_SIZE
];
987 struct xsk_umem_info
*umem
;
992 umem
= xsk_info
->umem
;
993 tx_done
= xsk_ring_cons__peek(&umem
->cq
, CONS_NUM_DESCS
, &idx_cq
);
995 /* Recycle back to umem pool. */
996 for (j
= 0; j
< tx_done
; j
++) {
1000 addr
= (uint64_t *)xsk_ring_cons__comp_addr(&umem
->cq
, idx_cq
++);
1001 if (*addr
!= UINT64_MAX
) {
1002 elem
= ALIGNED_CAST(void *, (char *)umem
->buffer
+ *addr
);
1003 elems_push
[tx_to_free
] = elem
;
1004 *addr
= UINT64_MAX
; /* Mark as pushed. */
1007 /* The elem has been pushed already. */
1008 COVERAGE_INC(afxdp_cq_skip
);
1011 if (tx_to_free
== BATCH_SIZE
|| j
== tx_done
- 1) {
1012 umem_elem_push_n(&umem
->mpool
, tx_to_free
, elems_push
);
1013 xsk_info
->outstanding_tx
-= tx_to_free
;
1019 xsk_ring_cons__release(&umem
->cq
, tx_done
);
1021 COVERAGE_INC(afxdp_cq_empty
);
1026 __netdev_afxdp_batch_send(struct netdev
*netdev
, int qid
,
1027 struct dp_packet_batch
*batch
)
1029 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1030 struct xsk_socket_info
*xsk_info
;
1031 void *elems_pop
[BATCH_SIZE
];
1032 struct xsk_umem_info
*umem
;
1033 struct dp_packet
*packet
;
1034 bool free_batch
= false;
1040 xsk_info
= dev
->xsks
[qid
];
1041 if (!xsk_info
|| !xsk_info
->xsk
) {
1045 afxdp_complete_tx(xsk_info
);
1047 free_batch
= check_free_batch(batch
);
1049 umem
= xsk_info
->umem
;
1050 ret
= umem_elem_pop_n(&umem
->mpool
, dp_packet_batch_size(batch
),
1052 if (OVS_UNLIKELY(ret
)) {
1053 atomic_add_relaxed(&xsk_info
->tx_dropped
, dp_packet_batch_size(batch
),
1055 VLOG_WARN_RL(&rl
, "%s: send failed due to exhausted memory pool.",
1056 netdev_get_name(netdev
));
1061 /* Make sure we have enough TX descs. */
1062 ret
= xsk_ring_prod__reserve(&xsk_info
->tx
, dp_packet_batch_size(batch
),
1064 if (OVS_UNLIKELY(ret
== 0)) {
1065 umem_elem_push_n(&umem
->mpool
, dp_packet_batch_size(batch
), elems_pop
);
1066 atomic_add_relaxed(&xsk_info
->tx_dropped
, dp_packet_batch_size(batch
),
1068 COVERAGE_INC(afxdp_tx_full
);
1069 afxdp_complete_tx(xsk_info
);
1070 kick_tx(xsk_info
, dev
->xdp_mode_in_use
, dev
->use_need_wakeup
);
1075 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1079 elem
= elems_pop
[i
];
1080 /* Copy the packet to the umem we just pop from umem pool.
1081 * TODO: avoid this copy if the packet and the pop umem
1082 * are located in the same umem.
1084 memcpy(elem
, dp_packet_data(packet
), dp_packet_size(packet
));
1086 index
= (uint64_t)((char *)elem
- (char *)umem
->buffer
);
1087 xsk_ring_prod__tx_desc(&xsk_info
->tx
, idx
+ i
)->addr
= index
;
1088 xsk_ring_prod__tx_desc(&xsk_info
->tx
, idx
+ i
)->len
1089 = dp_packet_size(packet
);
1091 xsk_ring_prod__submit(&xsk_info
->tx
, dp_packet_batch_size(batch
));
1092 xsk_info
->outstanding_tx
+= dp_packet_batch_size(batch
);
1094 ret
= kick_tx(xsk_info
, dev
->xdp_mode_in_use
, dev
->use_need_wakeup
);
1095 if (OVS_UNLIKELY(ret
)) {
1096 VLOG_WARN_RL(&rl
, "%s: error sending AF_XDP packet: %s.",
1097 netdev_get_name(netdev
), ovs_strerror(ret
));
1102 free_afxdp_buf_batch(batch
);
1104 dp_packet_delete_batch(batch
, true);
1111 netdev_afxdp_batch_send(struct netdev
*netdev
, int qid
,
1112 struct dp_packet_batch
*batch
,
1113 bool concurrent_txq
)
1115 struct netdev_linux
*dev
;
1118 if (concurrent_txq
) {
1119 dev
= netdev_linux_cast(netdev
);
1120 qid
= qid
% netdev_n_txq(netdev
);
1122 ovs_spin_lock(&dev
->tx_locks
[qid
].lock
);
1123 ret
= __netdev_afxdp_batch_send(netdev
, qid
, batch
);
1124 ovs_spin_unlock(&dev
->tx_locks
[qid
].lock
);
1126 ret
= __netdev_afxdp_batch_send(netdev
, qid
, batch
);
1133 netdev_afxdp_rxq_construct(struct netdev_rxq
*rxq_ OVS_UNUSED
)
1135 /* Done at reconfigure. */
1140 netdev_afxdp_rxq_destruct(struct netdev_rxq
*rxq_ OVS_UNUSED
)
1146 libbpf_print(enum libbpf_print_level level
,
1147 const char *format
, va_list args
)
1149 if (level
== LIBBPF_WARN
) {
1150 vlog_valist(&this_module
, VLL_WARN
, format
, args
);
1151 } else if (level
== LIBBPF_INFO
) {
1152 vlog_valist(&this_module
, VLL_INFO
, format
, args
);
1154 vlog_valist(&this_module
, VLL_DBG
, format
, args
);
1159 int netdev_afxdp_init(void)
1161 libbpf_set_print(libbpf_print
);
1166 netdev_afxdp_construct(struct netdev
*netdev
)
1168 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1171 /* Configure common netdev-linux first. */
1172 ret
= netdev_linux_construct(netdev
);
1177 /* Queues should not be used before the first reconfiguration. Clearing. */
1180 dev
->xdp_mode
= OVS_AF_XDP_MODE_UNSPEC
;
1181 dev
->xdp_mode_in_use
= OVS_AF_XDP_MODE_UNSPEC
;
1183 dev
->requested_n_rxq
= NR_QUEUE
;
1184 dev
->requested_xdp_mode
= OVS_AF_XDP_MODE_BEST_EFFORT
;
1185 dev
->requested_need_wakeup
= NEED_WAKEUP_DEFAULT
;
1188 dev
->tx_locks
= NULL
;
1190 netdev_request_reconfigure(netdev
);
1195 netdev_afxdp_destruct(struct netdev
*netdev
)
1197 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
1198 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1200 if (ovsthread_once_start(&once
)) {
1201 fatal_signal_add_hook(netdev_afxdp_sweep_unused_pools
,
1203 ovsthread_once_done(&once
);
1206 /* Note: tc is by-passed when using drv-mode, but when using
1207 * skb-mode, we might need to clean up tc. */
1209 xsk_destroy_all(netdev
);
1210 ovs_mutex_destroy(&dev
->mutex
);
1214 netdev_afxdp_verify_mtu_size(const struct netdev
*netdev OVS_UNUSED
, int mtu
)
1217 * If a device is used in xdpmode skb, no driver-specific MTU size is
1218 * checked and any value is allowed resulting in packet drops.
1219 * This check will verify the maximum supported value based on the
1220 * buffer size allocated and the additional headroom required.
1222 if (mtu
> (FRAME_SIZE
- OVS_XDP_HEADROOM
-
1223 XDP_PACKET_HEADROOM
- VLAN_ETH_HEADER_LEN
)) {
1231 netdev_afxdp_get_custom_stats(const struct netdev
*netdev
,
1232 struct netdev_custom_stats
*custom_stats
)
1234 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1235 struct xsk_socket_info
*xsk_info
;
1236 struct xdp_statistics stat
;
1240 ovs_mutex_lock(&dev
->mutex
);
1242 #define XDP_CSTATS \
1243 XDP_CSTAT(rx_dropped) \
1244 XDP_CSTAT(rx_invalid_descs) \
1245 XDP_CSTAT(tx_invalid_descs)
1247 #define XDP_CSTAT(NAME) + 1
1248 enum { N_XDP_CSTATS
= XDP_CSTATS
};
1251 custom_stats
->counters
= xcalloc(netdev_n_rxq(netdev
) * N_XDP_CSTATS
,
1252 sizeof *custom_stats
->counters
);
1254 /* Account the stats for each xsk. */
1255 for (i
= 0; i
< netdev_n_rxq(netdev
); i
++) {
1256 xsk_info
= dev
->xsks
[i
];
1257 optlen
= sizeof stat
;
1259 if (xsk_info
&& !getsockopt(xsk_socket__fd(xsk_info
->xsk
), SOL_XDP
,
1260 XDP_STATISTICS
, &stat
, &optlen
)) {
1261 #define XDP_CSTAT(NAME) \
1262 snprintf(custom_stats->counters[c].name, \
1263 NETDEV_CUSTOM_STATS_NAME_SIZE, \
1264 "xsk_queue_%d_" #NAME, i); \
1265 custom_stats->counters[c++].value = stat.NAME;
1270 custom_stats
->size
= c
;
1271 ovs_mutex_unlock(&dev
->mutex
);
1277 netdev_afxdp_get_stats(const struct netdev
*netdev
,
1278 struct netdev_stats
*stats
)
1280 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1281 struct xsk_socket_info
*xsk_info
;
1282 struct netdev_stats dev_stats
;
1285 ovs_mutex_lock(&dev
->mutex
);
1287 error
= get_stats_via_netlink(netdev
, &dev_stats
);
1289 VLOG_WARN_RL(&rl
, "%s: Error getting AF_XDP statistics.",
1290 netdev_get_name(netdev
));
1292 /* Use kernel netdev's packet and byte counts. */
1293 stats
->rx_packets
= dev_stats
.rx_packets
;
1294 stats
->rx_bytes
= dev_stats
.rx_bytes
;
1295 stats
->tx_packets
= dev_stats
.tx_packets
;
1296 stats
->tx_bytes
= dev_stats
.tx_bytes
;
1298 stats
->rx_errors
+= dev_stats
.rx_errors
;
1299 stats
->tx_errors
+= dev_stats
.tx_errors
;
1300 stats
->rx_dropped
+= dev_stats
.rx_dropped
;
1301 stats
->tx_dropped
+= dev_stats
.tx_dropped
;
1302 stats
->multicast
+= dev_stats
.multicast
;
1303 stats
->collisions
+= dev_stats
.collisions
;
1304 stats
->rx_length_errors
+= dev_stats
.rx_length_errors
;
1305 stats
->rx_over_errors
+= dev_stats
.rx_over_errors
;
1306 stats
->rx_crc_errors
+= dev_stats
.rx_crc_errors
;
1307 stats
->rx_frame_errors
+= dev_stats
.rx_frame_errors
;
1308 stats
->rx_fifo_errors
+= dev_stats
.rx_fifo_errors
;
1309 stats
->rx_missed_errors
+= dev_stats
.rx_missed_errors
;
1310 stats
->tx_aborted_errors
+= dev_stats
.tx_aborted_errors
;
1311 stats
->tx_carrier_errors
+= dev_stats
.tx_carrier_errors
;
1312 stats
->tx_fifo_errors
+= dev_stats
.tx_fifo_errors
;
1313 stats
->tx_heartbeat_errors
+= dev_stats
.tx_heartbeat_errors
;
1314 stats
->tx_window_errors
+= dev_stats
.tx_window_errors
;
1316 /* Account the dropped in each xsk. */
1317 for (i
= 0; i
< netdev_n_rxq(netdev
); i
++) {
1318 xsk_info
= dev
->xsks
[i
];
1320 uint64_t tx_dropped
;
1322 atomic_read_relaxed(&xsk_info
->tx_dropped
, &tx_dropped
);
1323 stats
->tx_dropped
+= tx_dropped
;
1327 ovs_mutex_unlock(&dev
->mutex
);