1 // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
/*
 * AF_XDP user-space access library.
 *
 * Copyright(c) 2018 - 2019 Intel Corporation.
 *
 * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
 */
15 #include <arpa/inet.h>
16 #include <asm/barrier.h>
17 #include <linux/compiler.h>
18 #include <linux/ethtool.h>
19 #include <linux/filter.h>
20 #include <linux/if_ether.h>
21 #include <linux/if_packet.h>
22 #include <linux/if_xdp.h>
23 #include <linux/sockios.h>
25 #include <sys/ioctl.h>
27 #include <sys/socket.h>
28 #include <sys/types.h>
32 #include "libbpf_util.h"
48 struct xsk_ring_prod
*fill
;
49 struct xsk_ring_cons
*comp
;
51 struct xsk_umem_config config
;
57 struct xsk_ring_cons
*rx
;
58 struct xsk_ring_prod
*tx
;
60 struct xsk_umem
*umem
;
61 struct xsk_socket_config config
;
69 char ifname
[IFNAMSIZ
];
73 bool xdp_prog_attached
;
/* For 32-bit systems, we need to use mmap2 as the offsets are 64-bit.
 * Unfortunately, it is not part of glibc, so issue the raw syscall and
 * convert the byte offset into a page offset ourselves.
 */
static inline void *xsk_mmap(void *addr, size_t length, int prot, int flags,
			     int fd, __u64 offset)
{
#ifdef __NR_mmap2
	unsigned int page_shift = __builtin_ffs(getpagesize()) - 1;
	long ret = syscall(__NR_mmap2, addr, length, prot, flags, fd,
			   (off_t)(offset >> page_shift));

	return (void *)ret;
#else
	return mmap(addr, length, prot, flags, fd, offset);
#endif
}
95 int xsk_umem__fd(const struct xsk_umem
*umem
)
97 return umem
? umem
->fd
: -EINVAL
;
100 int xsk_socket__fd(const struct xsk_socket
*xsk
)
102 return xsk
? xsk
->fd
: -EINVAL
;
/* True iff @buffer starts on a page boundary. */
static bool xsk_page_aligned(void *buffer)
{
	uintptr_t addr = (uintptr_t)buffer;

	/* Page size is a power of two, so the low bits must all be zero. */
	return (addr % (uintptr_t)getpagesize()) == 0;
}
112 static void xsk_set_umem_config(struct xsk_umem_config
*cfg
,
113 const struct xsk_umem_config
*usr_cfg
)
116 cfg
->fill_size
= XSK_RING_PROD__DEFAULT_NUM_DESCS
;
117 cfg
->comp_size
= XSK_RING_CONS__DEFAULT_NUM_DESCS
;
118 cfg
->frame_size
= XSK_UMEM__DEFAULT_FRAME_SIZE
;
119 cfg
->frame_headroom
= XSK_UMEM__DEFAULT_FRAME_HEADROOM
;
123 cfg
->fill_size
= usr_cfg
->fill_size
;
124 cfg
->comp_size
= usr_cfg
->comp_size
;
125 cfg
->frame_size
= usr_cfg
->frame_size
;
126 cfg
->frame_headroom
= usr_cfg
->frame_headroom
;
129 static void xsk_set_xdp_socket_config(struct xsk_socket_config
*cfg
,
130 const struct xsk_socket_config
*usr_cfg
)
133 cfg
->rx_size
= XSK_RING_CONS__DEFAULT_NUM_DESCS
;
134 cfg
->tx_size
= XSK_RING_PROD__DEFAULT_NUM_DESCS
;
135 cfg
->libbpf_flags
= 0;
141 cfg
->rx_size
= usr_cfg
->rx_size
;
142 cfg
->tx_size
= usr_cfg
->tx_size
;
143 cfg
->libbpf_flags
= usr_cfg
->libbpf_flags
;
144 cfg
->xdp_flags
= usr_cfg
->xdp_flags
;
145 cfg
->bind_flags
= usr_cfg
->bind_flags
;
148 int xsk_umem__create(struct xsk_umem
**umem_ptr
, void *umem_area
, __u64 size
,
149 struct xsk_ring_prod
*fill
, struct xsk_ring_cons
*comp
,
150 const struct xsk_umem_config
*usr_config
)
152 struct xdp_mmap_offsets off
;
153 struct xdp_umem_reg mr
;
154 struct xsk_umem
*umem
;
159 if (!umem_area
|| !umem_ptr
|| !fill
|| !comp
)
161 if (!size
&& !xsk_page_aligned(umem_area
))
164 umem
= calloc(1, sizeof(*umem
));
168 umem
->fd
= socket(AF_XDP
, SOCK_RAW
, 0);
174 umem
->umem_area
= umem_area
;
175 xsk_set_umem_config(&umem
->config
, usr_config
);
177 mr
.addr
= (uintptr_t)umem_area
;
179 mr
.chunk_size
= umem
->config
.frame_size
;
180 mr
.headroom
= umem
->config
.frame_headroom
;
182 err
= setsockopt(umem
->fd
, SOL_XDP
, XDP_UMEM_REG
, &mr
, sizeof(mr
));
187 err
= setsockopt(umem
->fd
, SOL_XDP
, XDP_UMEM_FILL_RING
,
188 &umem
->config
.fill_size
,
189 sizeof(umem
->config
.fill_size
));
194 err
= setsockopt(umem
->fd
, SOL_XDP
, XDP_UMEM_COMPLETION_RING
,
195 &umem
->config
.comp_size
,
196 sizeof(umem
->config
.comp_size
));
202 optlen
= sizeof(off
);
203 err
= getsockopt(umem
->fd
, SOL_XDP
, XDP_MMAP_OFFSETS
, &off
, &optlen
);
209 map
= xsk_mmap(NULL
, off
.fr
.desc
+
210 umem
->config
.fill_size
* sizeof(__u64
),
211 PROT_READ
| PROT_WRITE
, MAP_SHARED
| MAP_POPULATE
,
212 umem
->fd
, XDP_UMEM_PGOFF_FILL_RING
);
213 if (map
== MAP_FAILED
) {
219 fill
->mask
= umem
->config
.fill_size
- 1;
220 fill
->size
= umem
->config
.fill_size
;
221 fill
->producer
= map
+ off
.fr
.producer
;
222 fill
->consumer
= map
+ off
.fr
.consumer
;
223 fill
->ring
= map
+ off
.fr
.desc
;
224 fill
->cached_cons
= umem
->config
.fill_size
;
227 off
.cr
.desc
+ umem
->config
.comp_size
* sizeof(__u64
),
228 PROT_READ
| PROT_WRITE
, MAP_SHARED
| MAP_POPULATE
,
229 umem
->fd
, XDP_UMEM_PGOFF_COMPLETION_RING
);
230 if (map
== MAP_FAILED
) {
236 comp
->mask
= umem
->config
.comp_size
- 1;
237 comp
->size
= umem
->config
.comp_size
;
238 comp
->producer
= map
+ off
.cr
.producer
;
239 comp
->consumer
= map
+ off
.cr
.consumer
;
240 comp
->ring
= map
+ off
.cr
.desc
;
247 off
.fr
.desc
+ umem
->config
.fill_size
* sizeof(__u64
));
255 static int xsk_load_xdp_prog(struct xsk_socket
*xsk
)
257 char bpf_log_buf
[BPF_LOG_BUF_SIZE
];
260 /* This is the C-program:
261 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
263 * int *qidconf, index = ctx->rx_queue_index;
265 * // A set entry here means that the correspnding queue_id
266 * // has an active AF_XDP socket bound to it.
267 * qidconf = bpf_map_lookup_elem(&qidconf_map, &index);
269 * return XDP_ABORTED;
272 * return bpf_redirect_map(&xsks_map, index, 0);
277 struct bpf_insn prog
[] = {
278 /* r1 = *(u32 *)(r1 + 16) */
279 BPF_LDX_MEM(BPF_W
, BPF_REG_1
, BPF_REG_1
, 16),
280 /* *(u32 *)(r10 - 4) = r1 */
281 BPF_STX_MEM(BPF_W
, BPF_REG_10
, BPF_REG_1
, -4),
282 BPF_MOV64_REG(BPF_REG_2
, BPF_REG_10
),
283 BPF_ALU64_IMM(BPF_ADD
, BPF_REG_2
, -4),
284 BPF_LD_MAP_FD(BPF_REG_1
, xsk
->qidconf_map_fd
),
285 BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem
),
286 BPF_MOV64_REG(BPF_REG_1
, BPF_REG_0
),
287 BPF_MOV32_IMM(BPF_REG_0
, 0),
288 /* if r1 == 0 goto +8 */
289 BPF_JMP_IMM(BPF_JEQ
, BPF_REG_1
, 0, 8),
290 BPF_MOV32_IMM(BPF_REG_0
, 2),
291 /* r1 = *(u32 *)(r1 + 0) */
292 BPF_LDX_MEM(BPF_W
, BPF_REG_1
, BPF_REG_1
, 0),
293 /* if r1 == 0 goto +5 */
294 BPF_JMP_IMM(BPF_JEQ
, BPF_REG_1
, 0, 5),
295 /* r2 = *(u32 *)(r10 - 4) */
296 BPF_LD_MAP_FD(BPF_REG_1
, xsk
->xsks_map_fd
),
297 BPF_LDX_MEM(BPF_W
, BPF_REG_2
, BPF_REG_10
, -4),
298 BPF_MOV32_IMM(BPF_REG_3
, 0),
299 BPF_EMIT_CALL(BPF_FUNC_redirect_map
),
300 /* The jumps are to this instruction */
303 size_t insns_cnt
= sizeof(prog
) / sizeof(struct bpf_insn
);
305 prog_fd
= bpf_load_program(BPF_PROG_TYPE_XDP
, prog
, insns_cnt
,
306 "LGPL-2.1 or BSD-2-Clause", 0, bpf_log_buf
,
309 pr_warning("BPF log buffer:\n%s", bpf_log_buf
);
313 err
= bpf_set_link_xdp_fd(xsk
->ifindex
, prog_fd
, xsk
->config
.xdp_flags
);
319 xsk
->prog_fd
= prog_fd
;
323 static int xsk_get_max_queues(struct xsk_socket
*xsk
)
325 struct ethtool_channels channels
;
329 fd
= socket(AF_INET
, SOCK_DGRAM
, 0);
333 channels
.cmd
= ETHTOOL_GCHANNELS
;
334 ifr
.ifr_data
= (void *)&channels
;
335 strncpy(ifr
.ifr_name
, xsk
->ifname
, IFNAMSIZ
);
336 err
= ioctl(fd
, SIOCETHTOOL
, &ifr
);
337 if (err
&& errno
!= EOPNOTSUPP
) {
342 if (channels
.max_combined
== 0 || errno
== EOPNOTSUPP
)
343 /* If the device says it has no channels, then all traffic
344 * is sent to a single stream, so max queues = 1.
348 ret
= channels
.max_combined
;
355 static int xsk_create_bpf_maps(struct xsk_socket
*xsk
)
360 max_queues
= xsk_get_max_queues(xsk
);
364 fd
= bpf_create_map_name(BPF_MAP_TYPE_ARRAY
, "qidconf_map",
365 sizeof(int), sizeof(int), max_queues
, 0);
368 xsk
->qidconf_map_fd
= fd
;
370 fd
= bpf_create_map_name(BPF_MAP_TYPE_XSKMAP
, "xsks_map",
371 sizeof(int), sizeof(int), max_queues
, 0);
373 close(xsk
->qidconf_map_fd
);
376 xsk
->xsks_map_fd
= fd
;
381 static void xsk_delete_bpf_maps(struct xsk_socket
*xsk
)
383 close(xsk
->qidconf_map_fd
);
384 close(xsk
->xsks_map_fd
);
387 static int xsk_update_bpf_maps(struct xsk_socket
*xsk
, int qidconf_value
,
390 bool qidconf_map_updated
= false, xsks_map_updated
= false;
391 struct bpf_prog_info prog_info
= {};
392 __u32 prog_len
= sizeof(prog_info
);
393 struct bpf_map_info map_info
;
394 __u32 map_len
= sizeof(map_info
);
401 err
= bpf_obj_get_info_by_fd(xsk
->prog_fd
, &prog_info
, &prog_len
);
405 num_maps
= prog_info
.nr_map_ids
;
407 map_ids
= calloc(prog_info
.nr_map_ids
, sizeof(*map_ids
));
411 memset(&prog_info
, 0, prog_len
);
412 prog_info
.nr_map_ids
= num_maps
;
413 prog_info
.map_ids
= (__u64
)(unsigned long)map_ids
;
415 err
= bpf_obj_get_info_by_fd(xsk
->prog_fd
, &prog_info
, &prog_len
);
419 for (i
= 0; i
< prog_info
.nr_map_ids
; i
++) {
422 fd
= bpf_map_get_fd_by_id(map_ids
[i
]);
428 err
= bpf_obj_get_info_by_fd(fd
, &map_info
, &map_len
);
432 if (!strcmp(map_info
.name
, "qidconf_map")) {
433 err
= bpf_map_update_elem(fd
, &xsk
->queue_id
,
437 qidconf_map_updated
= true;
438 xsk
->qidconf_map_fd
= fd
;
439 } else if (!strcmp(map_info
.name
, "xsks_map")) {
440 err
= bpf_map_update_elem(fd
, &xsk
->queue_id
,
444 xsks_map_updated
= true;
445 xsk
->xsks_map_fd
= fd
;
448 if (qidconf_map_updated
&& xsks_map_updated
)
452 if (!(qidconf_map_updated
&& xsks_map_updated
)) {
461 if (qidconf_map_updated
)
462 (void)bpf_map_update_elem(xsk
->qidconf_map_fd
, &xsk
->queue_id
,
464 if (xsks_map_updated
)
465 (void)bpf_map_update_elem(xsk
->xsks_map_fd
, &xsk
->queue_id
,
468 if (qidconf_map_updated
)
469 close(xsk
->qidconf_map_fd
);
470 if (xsks_map_updated
)
471 close(xsk
->xsks_map_fd
);
477 static int xsk_setup_xdp_prog(struct xsk_socket
*xsk
)
479 bool prog_attached
= false;
483 err
= bpf_get_link_xdp_id(xsk
->ifindex
, &prog_id
,
484 xsk
->config
.xdp_flags
);
489 prog_attached
= true;
490 err
= xsk_create_bpf_maps(xsk
);
494 err
= xsk_load_xdp_prog(xsk
);
498 xsk
->prog_fd
= bpf_prog_get_fd_by_id(prog_id
);
501 err
= xsk_update_bpf_maps(xsk
, true, xsk
->fd
);
512 xsk_delete_bpf_maps(xsk
);
516 int xsk_socket__create(struct xsk_socket
**xsk_ptr
, const char *ifname
,
517 __u32 queue_id
, struct xsk_umem
*umem
,
518 struct xsk_ring_cons
*rx
, struct xsk_ring_prod
*tx
,
519 const struct xsk_socket_config
*usr_config
)
521 struct sockaddr_xdp sxdp
= {};
522 struct xdp_mmap_offsets off
;
523 struct xsk_socket
*xsk
;
528 if (!umem
|| !xsk_ptr
|| !rx
|| !tx
)
531 if (umem
->refcount
) {
532 pr_warning("Error: shared umems not supported by libbpf.\n");
536 xsk
= calloc(1, sizeof(*xsk
));
540 if (umem
->refcount
++ > 0) {
541 xsk
->fd
= socket(AF_XDP
, SOCK_RAW
, 0);
550 xsk
->outstanding_tx
= 0;
551 xsk
->queue_id
= queue_id
;
553 xsk
->ifindex
= if_nametoindex(ifname
);
558 strncpy(xsk
->ifname
, ifname
, IFNAMSIZ
);
560 xsk_set_xdp_socket_config(&xsk
->config
, usr_config
);
563 err
= setsockopt(xsk
->fd
, SOL_XDP
, XDP_RX_RING
,
564 &xsk
->config
.rx_size
,
565 sizeof(xsk
->config
.rx_size
));
572 err
= setsockopt(xsk
->fd
, SOL_XDP
, XDP_TX_RING
,
573 &xsk
->config
.tx_size
,
574 sizeof(xsk
->config
.tx_size
));
581 optlen
= sizeof(off
);
582 err
= getsockopt(xsk
->fd
, SOL_XDP
, XDP_MMAP_OFFSETS
, &off
, &optlen
);
589 map
= xsk_mmap(NULL
, off
.rx
.desc
+
590 xsk
->config
.rx_size
* sizeof(struct xdp_desc
),
591 PROT_READ
| PROT_WRITE
,
592 MAP_SHARED
| MAP_POPULATE
,
593 xsk
->fd
, XDP_PGOFF_RX_RING
);
594 if (map
== MAP_FAILED
) {
599 rx
->mask
= xsk
->config
.rx_size
- 1;
600 rx
->size
= xsk
->config
.rx_size
;
601 rx
->producer
= map
+ off
.rx
.producer
;
602 rx
->consumer
= map
+ off
.rx
.consumer
;
603 rx
->ring
= map
+ off
.rx
.desc
;
608 map
= xsk_mmap(NULL
, off
.tx
.desc
+
609 xsk
->config
.tx_size
* sizeof(struct xdp_desc
),
610 PROT_READ
| PROT_WRITE
,
611 MAP_SHARED
| MAP_POPULATE
,
612 xsk
->fd
, XDP_PGOFF_TX_RING
);
613 if (map
== MAP_FAILED
) {
618 tx
->mask
= xsk
->config
.tx_size
- 1;
619 tx
->size
= xsk
->config
.tx_size
;
620 tx
->producer
= map
+ off
.tx
.producer
;
621 tx
->consumer
= map
+ off
.tx
.consumer
;
622 tx
->ring
= map
+ off
.tx
.desc
;
623 tx
->cached_cons
= xsk
->config
.tx_size
;
627 sxdp
.sxdp_family
= PF_XDP
;
628 sxdp
.sxdp_ifindex
= xsk
->ifindex
;
629 sxdp
.sxdp_queue_id
= xsk
->queue_id
;
630 sxdp
.sxdp_flags
= xsk
->config
.bind_flags
;
632 err
= bind(xsk
->fd
, (struct sockaddr
*)&sxdp
, sizeof(sxdp
));
638 if (!(xsk
->config
.libbpf_flags
& XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD
)) {
639 err
= xsk_setup_xdp_prog(xsk
);
651 xsk
->config
.tx_size
* sizeof(struct xdp_desc
));
656 xsk
->config
.rx_size
* sizeof(struct xdp_desc
));
658 if (--umem
->refcount
)
665 int xsk_umem__delete(struct xsk_umem
*umem
)
667 struct xdp_mmap_offsets off
;
677 optlen
= sizeof(off
);
678 err
= getsockopt(umem
->fd
, SOL_XDP
, XDP_MMAP_OFFSETS
, &off
, &optlen
);
680 munmap(umem
->fill
->ring
,
681 off
.fr
.desc
+ umem
->config
.fill_size
* sizeof(__u64
));
682 munmap(umem
->comp
->ring
,
683 off
.cr
.desc
+ umem
->config
.comp_size
* sizeof(__u64
));
692 void xsk_socket__delete(struct xsk_socket
*xsk
)
694 struct xdp_mmap_offsets off
;
701 (void)xsk_update_bpf_maps(xsk
, 0, 0);
703 optlen
= sizeof(off
);
704 err
= getsockopt(xsk
->fd
, SOL_XDP
, XDP_MMAP_OFFSETS
, &off
, &optlen
);
707 munmap(xsk
->rx
->ring
,
709 xsk
->config
.rx_size
* sizeof(struct xdp_desc
));
711 munmap(xsk
->tx
->ring
,
713 xsk
->config
.tx_size
* sizeof(struct xdp_desc
));
716 xsk
->umem
->refcount
--;
717 /* Do not close an fd that also has an associated umem connected
720 if (xsk
->fd
!= xsk
->umem
->fd
)