net/xdp/xsk.c (blame snapshot at commit "xsk: proper fill queue descriptor validation")

// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *            Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"

#define TX_BATCH_SIZE 16

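/*
 * Rough userspace setup sequence for the interface implemented here
 * (names as defined below and in <linux/if_xdp.h>): create the socket
 * with socket(AF_XDP, SOCK_RAW, 0); register packet memory with the
 * XDP_UMEM_REG setsockopt; size the fill and completion rings with
 * XDP_UMEM_FILL_RING and XDP_UMEM_COMPLETION_RING; size the RX and/or
 * TX rings with XDP_RX_RING and XDP_TX_RING; mmap() each ring at the
 * offsets reported by the XDP_MMAP_OFFSETS getsockopt; finally bind()
 * a struct sockaddr_xdp to a device and queue id.
 */
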
static struct xdp_sock *xdp_sk(struct sock *sk)
{
        return (struct xdp_sock *)sk;
}

bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
        return !!xs->rx;
}

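/* Generic receive path: copy the packet into a umem frame taken from
 * the fill queue and post a descriptor for it on the RX ring. The fill
 * queue entry is only consumed once the RX descriptor has been produced
 * successfully.
 */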
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        u32 id, len = xdp->data_end - xdp->data;
        void *buffer;
        int err;

        if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
                return -EINVAL;

        if (!xskq_peek_id(xs->umem->fq, &id))
                return -ENOSPC;

        buffer = xdp_umem_get_data_with_headroom(xs->umem, id);
        memcpy(buffer, xdp->data, len);
        err = xskq_produce_batch_desc(xs->rx, id, len,
                                      xs->umem->frame_headroom);
        if (!err)
                xskq_discard_id(xs->umem->fq);

        return err;
}

int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        int err;

        err = __xsk_rcv(xs, xdp);
        if (likely(!err))
                xdp_return_buff(xdp);
        else
                xs->rx_dropped++;

        return err;
}

void xsk_flush(struct xdp_sock *xs)
{
        xskq_produce_flush_desc(xs->rx);
        xs->sk.sk_data_ready(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        int err;

        err = __xsk_rcv(xs, xdp);
        if (!err)
                xsk_flush(xs);
        else
                xs->rx_dropped++;

        return err;
}

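/* TX completion: once the driver releases the skb, hand the frame id
 * back to userspace on the completion ring so the umem frame can be
 * reused.
 */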
static void xsk_destruct_skb(struct sk_buff *skb)
{
        u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg;
        struct xdp_sock *xs = xdp_sk(skb->sk);

        WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id));

        sock_wfree(skb);
}

static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
                            size_t total_len)
{
        bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
        u32 max_batch = TX_BATCH_SIZE;
        struct xdp_sock *xs = xdp_sk(sk);
        bool sent_frame = false;
        struct xdp_desc desc;
        struct sk_buff *skb;
        int err = 0;

        if (unlikely(!xs->tx))
                return -ENOBUFS;
        if (need_wait)
                return -EOPNOTSUPP;

        mutex_lock(&xs->mutex);

        while (xskq_peek_desc(xs->tx, &desc)) {
                char *buffer;
                u32 id, len;

                if (max_batch-- == 0) {
                        err = -EAGAIN;
                        goto out;
                }

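                /* Reserve completion ring space up front so the skb
                 * destructor can always post the completion when the
                 * frame has been sent.
                 */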
                if (xskq_reserve_id(xs->umem->cq)) {
                        err = -EAGAIN;
                        goto out;
                }

                len = desc.len;
                if (unlikely(len > xs->dev->mtu)) {
                        err = -EMSGSIZE;
                        goto out;
                }

                if (xs->queue_id >= xs->dev->real_num_tx_queues) {
                        err = -ENXIO;
                        goto out;
                }

                skb = sock_alloc_send_skb(sk, len, !need_wait, &err);
                if (unlikely(!skb)) {
                        err = -EAGAIN;
                        goto out;
                }

                skb_put(skb, len);
                id = desc.idx;
                buffer = xdp_umem_get_data(xs->umem, id) + desc.offset;
                err = skb_store_bits(skb, 0, buffer, len);
                if (unlikely(err)) {
                        kfree_skb(skb);
                        goto out;
                }

                skb->dev = xs->dev;
                skb->priority = sk->sk_priority;
                skb->mark = sk->sk_mark;
                skb_shinfo(skb)->destructor_arg = (void *)(long)id;
                skb->destructor = xsk_destruct_skb;

                err = dev_direct_xmit(skb, xs->queue_id);
                /* Ignore NET_XMIT_CN as packet might have been sent */
                if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
                        err = -EAGAIN;
                        /* SKB consumed by dev_direct_xmit() */
                        goto out;
                }

                sent_frame = true;
                xskq_discard_desc(xs->tx);
        }

out:
        if (sent_frame)
                sk->sk_write_space(sk);

        mutex_unlock(&xs->mutex);
        return err;
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);

        if (unlikely(!xs->dev))
                return -ENXIO;
        if (unlikely(!(xs->dev->flags & IFF_UP)))
                return -ENETDOWN;

        return xsk_generic_xmit(sk, m, total_len);
}

static unsigned int xsk_poll(struct file *file, struct socket *sock,
                             struct poll_table_struct *wait)
{
        unsigned int mask = datagram_poll(file, sock, wait);
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);

        if (xs->rx && !xskq_empty_desc(xs->rx))
                mask |= POLLIN | POLLRDNORM;
        if (xs->tx && !xskq_full_desc(xs->tx))
                mask |= POLLOUT | POLLWRNORM;

        return mask;
}

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
                          bool umem_queue)
{
        struct xsk_queue *q;

        if (entries == 0 || *queue || !is_power_of_2(entries))
                return -EINVAL;

        q = xskq_create(entries, umem_queue);
        if (!q)
                return -ENOMEM;

        /* Make sure queue is ready before it can be seen by others */
        smp_wmb();
        *queue = q;
        return 0;
}

static int xsk_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct net *net;

        if (!sk)
                return 0;

        net = sock_net(sk);

        local_bh_disable();
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        local_bh_enable();

        if (xs->dev) {
                /* Wait for driver to stop using the xdp socket. */
                synchronize_net();
                dev_put(xs->dev);
                xs->dev = NULL;
        }

        sock_orphan(sk);
        sock->sk = NULL;

        sk_refcnt_debug_release(sk);
        sock_put(sk);

        return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
        struct socket *sock;
        int err;

        sock = sockfd_lookup(fd, &err);
        if (!sock)
                return ERR_PTR(-ENOTSOCK);

        if (sock->sk->sk_family != PF_XDP) {
                sockfd_put(sock);
                return ERR_PTR(-ENOPROTOOPT);
        }

        return sock;
}

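/* Bind the socket to a device and queue id. With XDP_SHARED_UMEM set in
 * sxdp_flags, the socket inherits the umem of an already bound socket
 * on the same device and queue instead of supplying its own.
 */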
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
        struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct net_device *dev;
        int err = 0;

        if (addr_len < sizeof(struct sockaddr_xdp))
                return -EINVAL;
        if (sxdp->sxdp_family != AF_XDP)
                return -EINVAL;

        mutex_lock(&xs->mutex);
        if (xs->dev) {
                err = -EBUSY;
                goto out_release;
        }

        dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
        if (!dev) {
                err = -ENODEV;
                goto out_release;
        }

        if (!xs->rx && !xs->tx) {
                err = -EINVAL;
                goto out_unlock;
        }

        if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) ||
            (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) {
                err = -EINVAL;
                goto out_unlock;
        }

        if (sxdp->sxdp_flags & XDP_SHARED_UMEM) {
                struct xdp_sock *umem_xs;
                struct socket *sock;

                if (xs->umem) {
                        /* We already have our own. */
                        err = -EINVAL;
                        goto out_unlock;
                }

                sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
                if (IS_ERR(sock)) {
                        err = PTR_ERR(sock);
                        goto out_unlock;
                }

                umem_xs = xdp_sk(sock->sk);
                if (!umem_xs->umem) {
                        /* No umem to inherit. */
                        err = -EBADF;
                        sockfd_put(sock);
                        goto out_unlock;
                } else if (umem_xs->dev != dev ||
                           umem_xs->queue_id != sxdp->sxdp_queue_id) {
                        err = -EINVAL;
                        sockfd_put(sock);
                        goto out_unlock;
                }

                xdp_get_umem(umem_xs->umem);
                xs->umem = umem_xs->umem;
                sockfd_put(sock);
        } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
                err = -EINVAL;
                goto out_unlock;
        } else {
                /* This xsk has its own umem. */
                xskq_set_umem(xs->umem->fq, &xs->umem->props);
                xskq_set_umem(xs->umem->cq, &xs->umem->props);
        }

        xs->dev = dev;
        xs->queue_id = sxdp->sxdp_queue_id;

        xskq_set_umem(xs->rx, &xs->umem->props);
        xskq_set_umem(xs->tx, &xs->umem->props);

out_unlock:
        if (err)
                dev_put(dev);
out_release:
        mutex_unlock(&xs->mutex);
        return err;
}

static int xsk_setsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        int err;

        if (level != SOL_XDP)
                return -ENOPROTOOPT;

        switch (optname) {
        case XDP_RX_RING:
        case XDP_TX_RING:
        {
                struct xsk_queue **q;
                int entries;

                if (optlen < sizeof(entries))
                        return -EINVAL;
                if (copy_from_user(&entries, optval, sizeof(entries)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
                err = xsk_init_queue(entries, q, false);
                mutex_unlock(&xs->mutex);
                return err;
        }
        case XDP_UMEM_REG:
        {
                struct xdp_umem_reg mr;
                struct xdp_umem *umem;

                if (copy_from_user(&mr, optval, sizeof(mr)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (xs->umem) {
                        mutex_unlock(&xs->mutex);
                        return -EBUSY;
                }

                umem = xdp_umem_create(&mr);
                if (IS_ERR(umem)) {
                        mutex_unlock(&xs->mutex);
                        return PTR_ERR(umem);
                }

                /* Make sure umem is ready before it can be seen by others */
                smp_wmb();
                xs->umem = umem;
                mutex_unlock(&xs->mutex);
                return 0;
        }
        case XDP_UMEM_FILL_RING:
        case XDP_UMEM_COMPLETION_RING:
        {
                struct xsk_queue **q;
                int entries;

                if (copy_from_user(&entries, optval, sizeof(entries)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (!xs->umem) {
                        mutex_unlock(&xs->mutex);
                        return -EINVAL;
                }

                q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
                        &xs->umem->cq;
                err = xsk_init_queue(entries, q, true);
                mutex_unlock(&xs->mutex);
                return err;
        }
        default:
                break;
        }

        return -ENOPROTOOPT;
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        int len;

        if (level != SOL_XDP)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case XDP_STATISTICS:
        {
                struct xdp_statistics stats;

                if (len < sizeof(stats))
                        return -EINVAL;

                mutex_lock(&xs->mutex);
                stats.rx_dropped = xs->rx_dropped;
                stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
                stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
                mutex_unlock(&xs->mutex);

                if (copy_to_user(optval, &stats, sizeof(stats)))
                        return -EFAULT;
                if (put_user(sizeof(stats), optlen))
                        return -EFAULT;

                return 0;
        }
        case XDP_MMAP_OFFSETS:
        {
                struct xdp_mmap_offsets off;

                if (len < sizeof(off))
                        return -EINVAL;

                off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
                off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
                off.rx.desc = offsetof(struct xdp_rxtx_ring, desc);
                off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
                off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
                off.tx.desc = offsetof(struct xdp_rxtx_ring, desc);

                off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
                off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
                off.fr.desc = offsetof(struct xdp_umem_ring, desc);
                off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
                off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
                off.cr.desc = offsetof(struct xdp_umem_ring, desc);

                len = sizeof(off);
                if (copy_to_user(optval, &off, len))
                        return -EFAULT;
                if (put_user(len, optlen))
                        return -EFAULT;

                return 0;
        }
        default:
                break;
        }

        return -EOPNOTSUPP;
}

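/* Rings are published by xsk_setsockopt() with an smp_wmb() after they
 * have been fully initialized; read them with READ_ONCE() here so that
 * a ring created concurrently is never mapped half-built.
 */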
static int xsk_mmap(struct file *file, struct socket *sock,
                    struct vm_area_struct *vma)
{
        unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
        unsigned long size = vma->vm_end - vma->vm_start;
        struct xdp_sock *xs = xdp_sk(sock->sk);
        struct xsk_queue *q = NULL;
        struct xdp_umem *umem;
        unsigned long pfn;
        struct page *qpg;

        if (offset == XDP_PGOFF_RX_RING) {
                q = READ_ONCE(xs->rx);
        } else if (offset == XDP_PGOFF_TX_RING) {
                q = READ_ONCE(xs->tx);
        } else {
                umem = READ_ONCE(xs->umem);
                if (!umem)
                        return -EINVAL;

                if (offset == XDP_UMEM_PGOFF_FILL_RING)
                        q = READ_ONCE(umem->fq);
                else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
                        q = READ_ONCE(umem->cq);
        }

        if (!q)
                return -EINVAL;

        qpg = virt_to_head_page(q->ring);
        if (size > (PAGE_SIZE << compound_order(qpg)))
                return -EINVAL;

        pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
        return remap_pfn_range(vma, vma->vm_start, pfn,
                               size, vma->vm_page_prot);
}

static struct proto xsk_proto = {
        .name     = "XDP",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
        .family     = PF_XDP,
        .owner      = THIS_MODULE,
        .release    = xsk_release,
        .bind       = xsk_bind,
        .connect    = sock_no_connect,
        .socketpair = sock_no_socketpair,
        .accept     = sock_no_accept,
        .getname    = sock_no_getname,
        .poll       = xsk_poll,
        .ioctl      = sock_no_ioctl,
        .listen     = sock_no_listen,
        .shutdown   = sock_no_shutdown,
        .setsockopt = xsk_setsockopt,
        .getsockopt = xsk_getsockopt,
        .sendmsg    = xsk_sendmsg,
        .recvmsg    = sock_no_recvmsg,
        .mmap       = xsk_mmap,
        .sendpage   = sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
        struct xdp_sock *xs = xdp_sk(sk);

        if (!sock_flag(sk, SOCK_DEAD))
                return;

        xskq_destroy(xs->rx);
        xskq_destroy(xs->tx);
        xdp_put_umem(xs->umem);

        sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
                      int kern)
{
        struct sock *sk;
        struct xdp_sock *xs;

        if (!ns_capable(net->user_ns, CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_RAW)
                return -ESOCKTNOSUPPORT;

        if (protocol)
                return -EPROTONOSUPPORT;

        sock->state = SS_UNCONNECTED;

        sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
        if (!sk)
                return -ENOBUFS;

        sock->ops = &xsk_proto_ops;

        sock_init_data(sock, sk);

        sk->sk_family = PF_XDP;

        sk->sk_destruct = xsk_destruct;
        sk_refcnt_debug_inc(sk);

        xs = xdp_sk(sk);
        mutex_init(&xs->mutex);

        local_bh_disable();
        sock_prot_inuse_add(net, &xsk_proto, 1);
        local_bh_enable();

        return 0;
}

static const struct net_proto_family xsk_family_ops = {
        .family = PF_XDP,
        .create = xsk_create,
        .owner  = THIS_MODULE,
};

static int __init xsk_init(void)
{
        int err;

        err = proto_register(&xsk_proto, 0 /* no slab */);
        if (err)
                goto out;

        err = sock_register(&xsk_family_ops);
        if (err)
                goto out_proto;

        return 0;

out_proto:
        proto_unregister(&xsk_proto);
out:
        return err;
}

fs_initcall(xsk_init);