// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#define TX_BATCH_SIZE 16
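/*
 * For orientation, a minimal sketch of how userspace is expected to drive
 * the handlers below (socket creation, UMEM registration, ring creation,
 * mmap and bind). Illustrative only and not part of the kernel build; the
 * UMEM size, chunk size, ring sizes, ifindex and queue id are made-up
 * example values.
 *
 *	int fd = socket(AF_XDP, SOCK_RAW, 0);
 *
 *	void *area;
 *	posix_memalign(&area, getpagesize(), 2048 * 4096);
 *	struct xdp_umem_reg mr = {
 *		.addr = (__u64)(unsigned long)area,
 *		.len = 2048 * 4096,
 *		.chunk_size = 2048,
 *		.headroom = 0,
 *	};
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *
 *	int entries = 1024;
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &entries, sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &entries, sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_RX_RING, &entries, sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_TX_RING, &entries, sizeof(entries));
 *
 *	struct xdp_mmap_offsets off;
 *	socklen_t optlen = sizeof(off);
 *	getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
 *	void *rx_ring = mmap(NULL, off.rx.desc + entries * sizeof(struct xdp_desc),
 *			     PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			     fd, XDP_PGOFF_RX_RING);
 *
 *	struct sockaddr_xdp sxdp = {
 *		.sxdp_family = AF_XDP,
 *		.sxdp_ifindex = 2,		// example interface
 *		.sxdp_queue_id = 0,
 *	};
 *	bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 */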
static struct xdp_sock *xdp_sk(struct sock *sk)
{
	return (struct xdp_sock *)sk;
}
bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
	return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
		READ_ONCE(xs->umem->fq);
}
u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
	return xskq_peek_addr(umem->fq, addr);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);
void xsk_umem_discard_addr(struct xdp_umem *umem)
{
	xskq_discard_addr(umem->fq);
}
EXPORT_SYMBOL(xsk_umem_discard_addr);
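/* Copy-mode receive helper: copy the frame (preceded by its metadata, if
 * any) into a free UMEM chunk taken from the fill queue and queue an Rx
 * descriptor pointing at it.
 */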
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	void *to_buf, *from_buf;
	u32 metalen;
	u64 addr;
	int err;

	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	addr += xs->umem->headroom;

	if (unlikely(xdp_data_meta_unsupported(xdp))) {
		from_buf = xdp->data;
		metalen = 0;
	} else {
		from_buf = xdp->data_meta;
		metalen = xdp->data - xdp->data_meta;
	}

	to_buf = xdp_umem_get_data(xs->umem, addr);
	memcpy(to_buf, from_buf, len + metalen);
	addr += metalen;
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (!err) {
		xskq_discard_addr(xs->umem->fq);
		xdp_return_buff(xdp);
		return 0;
	}

	xs->rx_dropped++;
	return err;
}
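/* Zero-copy receive helper: the frame already sits in the UMEM, so only an
 * Rx descriptor referencing the buffer's UMEM handle is queued.
 */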
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);

	if (err)
		xs->rx_dropped++;

	return err;
}
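/* Receive entry point for frames redirected from a driver's XDP path;
 * dispatches to the zero-copy or copy helper based on the rxq memory type.
 */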
int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 len;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	len = xdp->data_end - xdp->data;

	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
}
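/* Make all queued Rx descriptors visible to userspace and wake up any
 * process sleeping on the socket.
 */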
void xsk_flush(struct xdp_sock *xs)
{
	xskq_produce_flush_desc(xs->rx);
	xs->sk.sk_data_ready(&xs->sk);
}
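/* Receive entry point for generic (XDP_SKB) mode: copy the frame into the
 * UMEM and flush immediately so userspace sees it right away.
 */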
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 metalen = xdp->data - xdp->data_meta;
	u32 len = xdp->data_end - xdp->data;
	void *buffer;
	u64 addr;
	int err;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	addr += xs->umem->headroom;

	buffer = xdp_umem_get_data(xs->umem, addr);
	memcpy(buffer, xdp->data_meta, len + metalen);
	addr += metalen;
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (!err) {
		xskq_discard_addr(xs->umem->fq);
		xsk_flush(xs);
		return 0;
	}

	xs->rx_dropped++;
	return err;
}
void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
	xskq_produce_flush_addr_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);
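/* Called by drivers once Tx completions have been placed on the completion
 * ring; notifies all sockets sharing the UMEM that write space is available
 * again.
 */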
void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);
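/* Driver-side Tx helper: pick the next descriptor queued by any socket
 * sharing this UMEM, reserve a completion entry for it, and hand back its
 * DMA address and length. Returns false when no descriptor is available.
 */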
bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
{
	struct xdp_desc desc;
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		if (!xskq_peek_desc(xs->tx, &desc))
			continue;

		if (xskq_produce_addr_lazy(umem->cq, desc.addr))
			goto out;

		*dma = xdp_umem_get_dma(umem, desc.addr);
		*len = desc.len;

		xskq_discard_desc(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);
static int xsk_zc_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev = xs->dev;

	return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
}
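/* skb destructor for copy-mode Tx: once the skb is released by the stack,
 * report the frame's UMEM address on the completion ring so userspace can
 * reuse the buffer.
 */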
static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->tx_completion_lock, flags);
	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

	sock_wfree(skb);
}
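/* Copy-mode Tx: for up to TX_BATCH_SIZE descriptors from the Tx ring, build
 * an skb, copy the frame out of the UMEM and send it directly on the bound
 * device queue with dev_direct_xmit().
 */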
static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
			    size_t total_len)
{
	u32 max_batch = TX_BATCH_SIZE;
	struct xdp_sock *xs = xdp_sk(sk);
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	mutex_lock(&xs->mutex);

	while (xskq_peek_desc(xs->tx, &desc)) {
		char *buffer;
		u64 addr;
		u32 len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		if (xskq_reserve_addr(xs->umem->cq))
			goto out;

		if (xs->queue_id >= xs->dev->real_num_tx_queues)
			goto out;

		len = desc.len;
		skb = sock_alloc_send_skb(sk, len, 1, &err);
		if (unlikely(!skb)) {
			err = -EAGAIN;
			goto out;
		}

		skb_put(skb, len);
		addr = desc.addr;
		buffer = xdp_umem_get_data(xs->umem, addr);
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
		skb->destructor = xsk_destruct_skb;

		err = dev_direct_xmit(skb, xs->queue_id);
		xskq_discard_desc(xs->tx);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
			/* SKB completed but not sent */
			err = -EBUSY;
			goto out;
		}

		sent_frame = true;
	}

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}
static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xs->dev))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->tx))
		return -ENOBUFS;
	if (need_wait)
		return -EOPNOTSUPP;

	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
}
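/* POLLIN is reported while completed descriptors sit on the Rx ring,
 * POLLOUT while there is still room on the Tx ring.
 */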
static unsigned int xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	unsigned int mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (xs->rx && !xskq_empty_desc(xs->rx))
		mask |= POLLIN | POLLRDNORM;
	if (xs->tx && !xskq_full_desc(xs->tx))
		mask |= POLLOUT | POLLWRNORM;

	return mask;
}
static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	*queue = q;
	return 0;
}
static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	if (xs->dev) {
		struct net_device *dev = xs->dev;

		/* Wait for driver to stop using the xdp socket. */
		xdp_del_sk_umem(xs->umem, xs);
		xs->dev = NULL;
		synchronize_net();
		dev_put(dev);
	}

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}
static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}
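/* Bind the socket to a <device, queue id> pair. With XDP_SHARED_UMEM the
 * UMEM is inherited from an already bound socket on the same device/queue;
 * otherwise the socket's own UMEM is assigned to the device.
 */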
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	mutex_lock(&xs->mutex);
	if (xs->dev) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;
	flags = sxdp->sxdp_flags;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We have already our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!umem_xs->umem) {
			/* No umem to inherit. */
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
			err = -EINVAL;
			sockfd_put(sock);
			goto out_unlock;
		}

		xdp_get_umem(umem_xs->umem);
		xs->umem = umem_xs->umem;
		sockfd_put(sock);
	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xskq_set_umem(xs->umem->fq, xs->umem->size,
			      xs->umem->chunk_mask);
		xskq_set_umem(xs->umem->cq, xs->umem->size,
			      xs->umem->chunk_mask);

		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
		if (err)
			goto out_unlock;
	}

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
	xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
	xdp_add_sk_umem(xs->umem, xs);

out_unlock:
	if (err)
		dev_put(dev);
out_release:
	mutex_unlock(&xs->mutex);
	return err;
}
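/* All socket options live at SOL_XDP: Rx/Tx ring creation, UMEM registration
 * and fill/completion ring creation.
 */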
static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		struct xdp_umem_reg mr;
		struct xdp_umem *umem;

		if (copy_from_user(&mr, optval, sizeof(mr)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		xs->umem = umem;
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (!xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EINVAL;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
			&xs->umem->cq;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}
static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats;

		if (len < sizeof(stats))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, sizeof(stats)))
			return -EFAULT;
		if (put_user(sizeof(stats), optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;

		if (len < sizeof(off))
			return -EINVAL;

		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.rx.desc	= offsetof(struct xdp_rxtx_ring, desc);
		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.tx.desc	= offsetof(struct xdp_rxtx_ring, desc);

		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.fr.desc	= offsetof(struct xdp_umem_ring, desc);
		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.cr.desc	= offsetof(struct xdp_umem_ring, desc);

		len = sizeof(off);
		if (copy_to_user(optval, &off, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}
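/* Map one of the four rings into userspace; vma->vm_pgoff selects the ring
 * and XDP_MMAP_OFFSETS (getsockopt) describes the layout inside the mapping.
 */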
static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	struct xdp_umem *umem;
	unsigned long pfn;
	struct page *qpg;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		umem = READ_ONCE(xs->umem);
		if (!umem)
			return -EINVAL;

		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(umem->fq);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(umem->cq);
	}

	if (!q)
		return -EINVAL;

	qpg = virt_to_head_page(q->ring);
	if (size > (PAGE_SIZE << compound_order(qpg)))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}
static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};
static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};
static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	xdp_put_umem(xs->umem);

	sk_refcnt_debug_dec(sk);
}
static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->tx_completion_lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}
static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};
static int __init xsk_init(void)
{
	int err;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	return 0;

out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);