1 // SPDX-License-Identifier: GPL-2.0
2 /* XDP sockets
3 *
4 * AF_XDP sockets allow a channel between XDP programs and userspace
5 * applications.
6 * Copyright(c) 2018 Intel Corporation.
7 *
8 * Author(s): Björn Töpel <bjorn.topel@intel.com>
9 * Magnus Karlsson <magnus.karlsson@intel.com>
10 */
11
12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13
14 #include <linux/if_xdp.h>
15 #include <linux/init.h>
16 #include <linux/sched/mm.h>
17 #include <linux/sched/signal.h>
18 #include <linux/sched/task.h>
19 #include <linux/socket.h>
20 #include <linux/file.h>
21 #include <linux/uaccess.h>
22 #include <linux/net.h>
23 #include <linux/netdevice.h>
24 #include <linux/rculist.h>
25 #include <net/xdp_sock_drv.h>
26 #include <net/busy_poll.h>
27 #include <net/xdp.h>
28
29 #include "xsk_queue.h"
30 #include "xdp_umem.h"
31 #include "xsk.h"
32
33 #define TX_BATCH_SIZE 16
34
35 static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
36
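/* Driver-facing helpers for the XDP_USE_NEED_WAKEUP optimization. A
 * zero-copy driver sets XDP_RING_NEED_WAKEUP on the fill ring (Rx) or on
 * every Tx ring bound to the pool when it needs userspace to kick it via
 * poll()/sendto() before more packets can be received or sent, and clears
 * the flag while it can make progress on its own. cached_need_wakeup
 * mirrors the state so the rings shared with userspace are only written
 * when the state actually changes.
 *
 * Illustrative userspace counterpart (not part of this file; assumes the
 * libbpf xsk helpers):
 *
 *	if (xsk_ring_prod__needs_wakeup(&tx_ring))
 *		sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
 */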
37 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
38 {
39 if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
40 return;
41
42 pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
43 pool->cached_need_wakeup |= XDP_WAKEUP_RX;
44 }
45 EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
46
47 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
48 {
49 struct xdp_sock *xs;
50
51 if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
52 return;
53
54 rcu_read_lock();
55 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
56 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
57 }
58 rcu_read_unlock();
59
60 pool->cached_need_wakeup |= XDP_WAKEUP_TX;
61 }
62 EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
63
64 void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
65 {
66 if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
67 return;
68
69 pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
70 pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
71 }
72 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
73
74 void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
75 {
76 struct xdp_sock *xs;
77
78 if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
79 return;
80
81 rcu_read_lock();
82 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
83 xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
84 }
85 rcu_read_unlock();
86
87 pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
88 }
89 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
90
91 bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
92 {
93 return pool->uses_need_wakeup;
94 }
95 EXPORT_SYMBOL(xsk_uses_need_wakeup);
96
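/* Look up the buffer pool registered for @queue_id on @dev, checking the
 * Rx queue range first and then Tx. Returns NULL if no pool is registered.
 */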
97 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
98 u16 queue_id)
99 {
100 if (queue_id < dev->real_num_rx_queues)
101 return dev->_rx[queue_id].pool;
102 if (queue_id < dev->real_num_tx_queues)
103 return dev->_tx[queue_id].pool;
104
105 return NULL;
106 }
107 EXPORT_SYMBOL(xsk_get_pool_from_qid);
108
109 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
110 {
111 if (queue_id < dev->num_rx_queues)
112 dev->_rx[queue_id].pool = NULL;
113 if (queue_id < dev->num_tx_queues)
114 dev->_tx[queue_id].pool = NULL;
115 }
116
117 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do
118 * not know if the device has more tx queues than rx, or the opposite.
119 * This might also change during run time.
120 */
121 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
122 u16 queue_id)
123 {
124 if (queue_id >= max_t(unsigned int,
125 dev->real_num_rx_queues,
126 dev->real_num_tx_queues))
127 return -EINVAL;
128
129 if (queue_id < dev->real_num_rx_queues)
130 dev->_rx[queue_id].pool = pool;
131 if (queue_id < dev->real_num_tx_queues)
132 dev->_tx[queue_id].pool = pool;
133
134 return 0;
135 }
136
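/* Return an xdp_buff_xsk wrapper to the pool's free_heads stack. */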
137 void xp_release(struct xdp_buff_xsk *xskb)
138 {
139 xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
140 }
141
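/* Encode the address userspace will see in the Rx descriptor: the chunk's
 * original address plus the current offset of data from the start of the
 * buffer (including the pool headroom). In unaligned mode the offset is
 * stored in the upper bits (XSK_UNALIGNED_BUF_OFFSET_SHIFT) so the base
 * address is preserved.
 */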
142 static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
143 {
144 u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
145
146 offset += xskb->pool->headroom;
147 if (!xskb->pool->unaligned)
148 return xskb->orig_addr + offset;
149 return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
150 }
151
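/* Zero-copy receive: post the buffer's handle and length to the socket's
 * Rx ring. On a full ring the rx_queue_full counter is bumped and the
 * error is propagated; on success the xdp_buff_xsk wrapper is released
 * back to the pool, since the address in the descriptor is all that
 * userspace needs.
 */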
152 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
153 {
154 struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
155 u64 addr;
156 int err;
157
158 addr = xp_get_handle(xskb);
159 err = xskq_prod_reserve_desc(xs->rx, addr, len);
160 if (err) {
161 xs->rx_queue_full++;
162 return err;
163 }
164
165 xp_release(xskb);
166 return 0;
167 }
168
169 static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
170 {
171 void *from_buf, *to_buf;
172 u32 metalen;
173
174 if (unlikely(xdp_data_meta_unsupported(from))) {
175 from_buf = from->data;
176 to_buf = to->data;
177 metalen = 0;
178 } else {
179 from_buf = from->data_meta;
180 metalen = from->data - from->data_meta;
181 to_buf = to->data - metalen;
182 }
183
184 memcpy(to_buf, from_buf, len + metalen);
185 }
186
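/* Copy-mode receive: drop frames larger than the pool's Rx frame size,
 * allocate a buffer from the pool, copy the packet (including metadata)
 * into it and complete it through the zero-copy path. The source buffer is
 * returned here only when explicit_free is set (the XDP_REDIRECT path);
 * otherwise the caller keeps ownership of it.
 */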
187 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
188 bool explicit_free)
189 {
190 struct xdp_buff *xsk_xdp;
191 int err;
192
193 if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
194 xs->rx_dropped++;
195 return -ENOSPC;
196 }
197
198 xsk_xdp = xsk_buff_alloc(xs->pool);
199 if (!xsk_xdp) {
200 xs->rx_dropped++;
201 return -ENOSPC;
202 }
203
204 xsk_copy_xdp(xsk_xdp, xdp, len);
205 err = __xsk_rcv_zc(xs, xsk_xdp, len);
206 if (err) {
207 xsk_buff_free(xsk_xdp);
208 return err;
209 }
210 if (explicit_free)
211 xdp_return_buff(xdp);
212 return 0;
213 }
214
215 static bool xsk_tx_writeable(struct xdp_sock *xs)
216 {
217 if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
218 return false;
219
220 return true;
221 }
222
223 static bool xsk_is_bound(struct xdp_sock *xs)
224 {
225 if (READ_ONCE(xs->state) == XSK_BOUND) {
226 /* Matches smp_wmb() in bind(). */
227 smp_rmb();
228 return true;
229 }
230 return false;
231 }
232
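/* Common receive entry point: the packet is only accepted if the socket is
 * bound to the device and queue it arrived on. The NAPI id is recorded
 * once for busy polling, then the zero-copy or copy path is chosen based
 * on the memory type of the receiving queue.
 */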
233 static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
234 bool explicit_free)
235 {
236 u32 len;
237
238 if (!xsk_is_bound(xs))
239 return -EINVAL;
240
241 if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
242 return -EINVAL;
243
244 sk_mark_napi_id_once_xdp(&xs->sk, xdp);
245 len = xdp->data_end - xdp->data;
246
247 return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
248 __xsk_rcv_zc(xs, xdp, len) :
249 __xsk_rcv(xs, xdp, len, explicit_free);
250 }
251
252 static void xsk_flush(struct xdp_sock *xs)
253 {
254 xskq_prod_submit(xs->rx);
255 __xskq_cons_release(xs->pool->fq);
256 sock_def_readable(&xs->sk);
257 }
258
259 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
260 {
261 int err;
262
263 spin_lock_bh(&xs->rx_lock);
264 err = xsk_rcv(xs, xdp, false);
265 xsk_flush(xs);
266 spin_unlock_bh(&xs->rx_lock);
267 return err;
268 }
269
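/* Called for XDP_REDIRECT into an XSKMAP. The packet is queued to the
 * socket and the socket is put on the per-cpu flush list so that
 * __xsk_map_flush(), run at the end of the NAPI poll, can publish the Rx
 * entries and wake the reader once per batch.
 */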
270 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
271 {
272 struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
273 int err;
274
275 err = xsk_rcv(xs, xdp, true);
276 if (err)
277 return err;
278
279 if (!xs->flush_node.prev)
280 list_add(&xs->flush_node, flush_list);
281
282 return 0;
283 }
284
285 void __xsk_map_flush(void)
286 {
287 struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
288 struct xdp_sock *xs, *tmp;
289
290 list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
291 xsk_flush(xs);
292 __list_del_clearprev(&xs->flush_node);
293 }
294 }
295
296 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
297 {
298 xskq_prod_submit_n(pool->cq, nb_entries);
299 }
300 EXPORT_SYMBOL(xsk_tx_completed);
301
302 void xsk_tx_release(struct xsk_buff_pool *pool)
303 {
304 struct xdp_sock *xs;
305
306 rcu_read_lock();
307 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
308 __xskq_cons_release(xs->tx);
309 if (xsk_tx_writeable(xs))
310 xs->sk.sk_write_space(&xs->sk);
311 }
312 rcu_read_unlock();
313 }
314 EXPORT_SYMBOL(xsk_tx_release);
315
316 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
317 {
318 struct xdp_sock *xs;
319
320 rcu_read_lock();
321 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
322 if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
323 xs->tx->queue_empty_descs++;
324 continue;
325 }
326
327 /* This is the backpressure mechanism for the Tx path.
328 * Reserve space in the completion queue and only proceed
329 * if there is space in it. This avoids having to implement
330 * any buffering in the Tx path.
331 */
332 if (xskq_prod_reserve_addr(pool->cq, desc->addr))
333 goto out;
334
335 xskq_cons_release(xs->tx);
336 rcu_read_unlock();
337 return true;
338 }
339
340 out:
341 rcu_read_unlock();
342 return false;
343 }
344 EXPORT_SYMBOL(xsk_tx_peek_desc);
345
346 static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs,
347 u32 max_entries)
348 {
349 u32 nb_pkts = 0;
350
351 while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
352 nb_pkts++;
353
354 xsk_tx_release(pool);
355 return nb_pkts;
356 }
357
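/* Batched Tx descriptor fetch for zero-copy drivers. The batched fast path
 * is only taken when exactly one socket transmits on the pool; with
 * multiple sockets it falls back to peeking one descriptor at a time via
 * xsk_tx_peek_desc().
 */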
358 u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs,
359 u32 max_entries)
360 {
361 struct xdp_sock *xs;
362 u32 nb_pkts;
363
364 rcu_read_lock();
365 if (!list_is_singular(&pool->xsk_tx_list)) {
366 /* Fallback to the non-batched version */
367 rcu_read_unlock();
368 return xsk_tx_peek_release_fallback(pool, descs, max_entries);
369 }
370
371 xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
372 if (!xs) {
373 nb_pkts = 0;
374 goto out;
375 }
376
377 nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries);
378 if (!nb_pkts) {
379 xs->tx->queue_empty_descs++;
380 goto out;
381 }
382
383 /* This is the backpressure mechanism for the Tx path. Try to
384 * reserve space in the completion queue for all packets, but
385 * if there are fewer slots available, just process that many
386 * packets. This avoids having to implement any buffering in
387 * the Tx path.
388 */
389 nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts);
390 if (!nb_pkts)
391 goto out;
392
393 xskq_cons_release_n(xs->tx, nb_pkts);
394 __xskq_cons_release(xs->tx);
395 xs->sk.sk_write_space(&xs->sk);
396
397 out:
398 rcu_read_unlock();
399 return nb_pkts;
400 }
401 EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
402
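/* Ask the driver to process the rings for this socket's queue. Used by
 * zero-copy sockets from sendmsg/recvmsg/poll instead of doing the work in
 * syscall context.
 */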
403 static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
404 {
405 struct net_device *dev = xs->dev;
406 int err;
407
408 rcu_read_lock();
409 err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
410 rcu_read_unlock();
411
412 return err;
413 }
414
415 static int xsk_zc_xmit(struct xdp_sock *xs)
416 {
417 return xsk_wakeup(xs, XDP_WAKEUP_TX);
418 }
419
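/* skb destructor for copy-mode Tx: once the skb has been consumed, submit
 * the descriptor address stashed in destructor_arg to the completion queue
 * so userspace can reuse the buffer, then release the socket memory.
 */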
420 static void xsk_destruct_skb(struct sk_buff *skb)
421 {
422 u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
423 struct xdp_sock *xs = xdp_sk(skb->sk);
424 unsigned long flags;
425
426 spin_lock_irqsave(&xs->pool->cq_lock, flags);
427 xskq_prod_submit_addr(xs->pool->cq, addr);
428 spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
429
430 sock_wfree(skb);
431 }
432
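/* Copy-mode transmit: pull up to TX_BATCH_SIZE descriptors from the Tx
 * ring, copy each one into a freshly allocated skb and send it with
 * __dev_direct_xmit() on the bound queue. A completion queue slot is
 * reserved under cq_lock before the send as backpressure; it is cancelled
 * again if the driver reports NETDEV_TX_BUSY.
 */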
433 static int xsk_generic_xmit(struct sock *sk)
434 {
435 struct xdp_sock *xs = xdp_sk(sk);
436 u32 max_batch = TX_BATCH_SIZE;
437 bool sent_frame = false;
438 struct xdp_desc desc;
439 struct sk_buff *skb;
440 unsigned long flags;
441 int err = 0;
442 u32 hr, tr;
443
444 mutex_lock(&xs->mutex);
445
446 if (xs->queue_id >= xs->dev->real_num_tx_queues)
447 goto out;
448
449 hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
450 tr = xs->dev->needed_tailroom;
451
452 while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
453 char *buffer;
454 u64 addr;
455 u32 len;
456
457 if (max_batch-- == 0) {
458 err = -EAGAIN;
459 goto out;
460 }
461
462 len = desc.len;
463 skb = sock_alloc_send_skb(sk, hr + len + tr, 1, &err);
464 if (unlikely(!skb))
465 goto out;
466
467 skb_reserve(skb, hr);
468 skb_put(skb, len);
469
470 addr = desc.addr;
471 buffer = xsk_buff_raw_get_data(xs->pool, addr);
472 err = skb_store_bits(skb, 0, buffer, len);
473 /* This is the backpressure mechanism for the Tx path.
474 * Reserve space in the completion queue and only proceed
475 * if there is space in it. This avoids having to implement
476 * any buffering in the Tx path.
477 */
478 spin_lock_irqsave(&xs->pool->cq_lock, flags);
479 if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
480 spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
481 kfree_skb(skb);
482 goto out;
483 }
484 spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
485
486 skb->dev = xs->dev;
487 skb->priority = sk->sk_priority;
488 skb->mark = sk->sk_mark;
489 skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
490 skb->destructor = xsk_destruct_skb;
491
492 err = __dev_direct_xmit(skb, xs->queue_id);
493 if (err == NETDEV_TX_BUSY) {
494 /* Tell user-space to retry the send */
495 skb->destructor = sock_wfree;
496 spin_lock_irqsave(&xs->pool->cq_lock, flags);
497 xskq_prod_cancel(xs->pool->cq);
498 spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
499 /* Free skb without triggering the perf drop trace */
500 consume_skb(skb);
501 err = -EAGAIN;
502 goto out;
503 }
504
505 xskq_cons_release(xs->tx);
506 /* Ignore NET_XMIT_CN as packet might have been sent */
507 if (err == NET_XMIT_DROP) {
508 /* SKB completed but not sent */
509 err = -EBUSY;
510 goto out;
511 }
512
513 sent_frame = true;
514 }
515
516 xs->tx->queue_empty_descs++;
517
518 out:
519 if (sent_frame)
520 if (xsk_tx_writeable(xs))
521 sk->sk_write_space(sk);
522
523 mutex_unlock(&xs->mutex);
524 return err;
525 }
526
527 static int __xsk_sendmsg(struct sock *sk)
528 {
529 struct xdp_sock *xs = xdp_sk(sk);
530
531 if (unlikely(!(xs->dev->flags & IFF_UP)))
532 return -ENETDOWN;
533 if (unlikely(!xs->tx))
534 return -ENOBUFS;
535
536 return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
537 }
538
539 static bool xsk_no_wakeup(struct sock *sk)
540 {
541 #ifdef CONFIG_NET_RX_BUSY_POLL
542 /* Prefer busy-polling, skip the wakeup. */
543 return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
544 READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
545 #else
546 return false;
547 #endif
548 }
549
550 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
551 {
552 bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
553 struct sock *sk = sock->sk;
554 struct xdp_sock *xs = xdp_sk(sk);
555 struct xsk_buff_pool *pool;
556
557 if (unlikely(!xsk_is_bound(xs)))
558 return -ENXIO;
559 if (unlikely(need_wait))
560 return -EOPNOTSUPP;
561
562 if (sk_can_busy_loop(sk))
563 sk_busy_loop(sk, 1); /* only support non-blocking sockets */
564
565 if (xsk_no_wakeup(sk))
566 return 0;
567
568 pool = xs->pool;
569 if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
570 return __xsk_sendmsg(sk);
571 return 0;
572 }
573
574 static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
575 {
576 bool need_wait = !(flags & MSG_DONTWAIT);
577 struct sock *sk = sock->sk;
578 struct xdp_sock *xs = xdp_sk(sk);
579
580 if (unlikely(!xsk_is_bound(xs)))
581 return -ENXIO;
582 if (unlikely(!(xs->dev->flags & IFF_UP)))
583 return -ENETDOWN;
584 if (unlikely(!xs->rx))
585 return -ENOBUFS;
586 if (unlikely(need_wait))
587 return -EOPNOTSUPP;
588
589 if (sk_can_busy_loop(sk))
590 sk_busy_loop(sk, 1); /* only support non-blocking sockets */
591
592 if (xsk_no_wakeup(sk))
593 return 0;
594
595 if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
596 return xsk_wakeup(xs, XDP_WAKEUP_RX);
597 return 0;
598 }
599
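/* poll() also acts as a kick: if the driver has asked for a wakeup, either
 * call into it (zero-copy) or drive the generic Tx path (copy mode) before
 * reporting whether the Rx ring has entries or the Tx ring has room.
 */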
600 static __poll_t xsk_poll(struct file *file, struct socket *sock,
601 struct poll_table_struct *wait)
602 {
603 __poll_t mask = 0;
604 struct sock *sk = sock->sk;
605 struct xdp_sock *xs = xdp_sk(sk);
606 struct xsk_buff_pool *pool;
607
608 sock_poll_wait(file, sock, wait);
609
610 if (unlikely(!xsk_is_bound(xs)))
611 return mask;
612
613 pool = xs->pool;
614
615 if (pool->cached_need_wakeup) {
616 if (xs->zc)
617 xsk_wakeup(xs, pool->cached_need_wakeup);
618 else
619 /* Poll needs to drive Tx also in copy mode */
620 __xsk_sendmsg(sk);
621 }
622
623 if (xs->rx && !xskq_prod_is_empty(xs->rx))
624 mask |= EPOLLIN | EPOLLRDNORM;
625 if (xs->tx && xsk_tx_writeable(xs))
626 mask |= EPOLLOUT | EPOLLWRNORM;
627
628 return mask;
629 }
630
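/* Allocate one of the four descriptor rings. The number of entries must be
 * a non-zero power of two, and a ring can only be created once.
 */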
631 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
632 bool umem_queue)
633 {
634 struct xsk_queue *q;
635
636 if (entries == 0 || *queue || !is_power_of_2(entries))
637 return -EINVAL;
638
639 q = xskq_create(entries, umem_queue);
640 if (!q)
641 return -ENOMEM;
642
643 /* Make sure queue is ready before it can be seen by others */
644 smp_wmb();
645 WRITE_ONCE(*queue, q);
646 return 0;
647 }
648
649 static void xsk_unbind_dev(struct xdp_sock *xs)
650 {
651 struct net_device *dev = xs->dev;
652
653 if (xs->state != XSK_BOUND)
654 return;
655 WRITE_ONCE(xs->state, XSK_UNBOUND);
656
657 /* Wait for driver to stop using the xdp socket. */
658 xp_del_xsk(xs->pool, xs);
659 xs->dev = NULL;
660 synchronize_net();
661 dev_put(dev);
662 }
663
664 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
665 struct xdp_sock ***map_entry)
666 {
667 struct xsk_map *map = NULL;
668 struct xsk_map_node *node;
669
670 *map_entry = NULL;
671
672 spin_lock_bh(&xs->map_list_lock);
673 node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
674 node);
675 if (node) {
676 bpf_map_inc(&node->map->map);
677 map = node->map;
678 *map_entry = node->map_entry;
679 }
680 spin_unlock_bh(&xs->map_list_lock);
681 return map;
682 }
683
684 static void xsk_delete_from_maps(struct xdp_sock *xs)
685 {
686 /* This function removes the current XDP socket from all the
687 * maps it resides in. We need to take extra care here, due to
688 * the two locks involved. Each map has a lock synchronizing
689 * updates to the entries, and each socket has a lock that
690 * synchronizes access to the list of maps (map_list). For
691 * deadlock avoidance the locks need to be taken in the order
692 * "map lock"->"socket map list lock". We start off by
693 * accessing the socket map list, and take a reference to the
694 * map to guarantee existence between the
695 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
696 * calls. Then we ask the map to remove the socket, which
697 * tries to remove the socket from the map. Note that there
698 * might be updates to the map between
699 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
700 */
701 struct xdp_sock **map_entry = NULL;
702 struct xsk_map *map;
703
704 while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
705 xsk_map_try_sock_delete(map, xs, map_entry);
706 bpf_map_put(&map->map);
707 }
708 }
709
710 static int xsk_release(struct socket *sock)
711 {
712 struct sock *sk = sock->sk;
713 struct xdp_sock *xs = xdp_sk(sk);
714 struct net *net;
715
716 if (!sk)
717 return 0;
718
719 net = sock_net(sk);
720
721 mutex_lock(&net->xdp.lock);
722 sk_del_node_init_rcu(sk);
723 mutex_unlock(&net->xdp.lock);
724
725 local_bh_disable();
726 sock_prot_inuse_add(net, sk->sk_prot, -1);
727 local_bh_enable();
728
729 xsk_delete_from_maps(xs);
730 mutex_lock(&xs->mutex);
731 xsk_unbind_dev(xs);
732 mutex_unlock(&xs->mutex);
733
734 xskq_destroy(xs->rx);
735 xskq_destroy(xs->tx);
736 xskq_destroy(xs->fq_tmp);
737 xskq_destroy(xs->cq_tmp);
738
739 sock_orphan(sk);
740 sock->sk = NULL;
741
742 sk_refcnt_debug_release(sk);
743 sock_put(sk);
744
745 return 0;
746 }
747
748 static struct socket *xsk_lookup_xsk_from_fd(int fd)
749 {
750 struct socket *sock;
751 int err;
752
753 sock = sockfd_lookup(fd, &err);
754 if (!sock)
755 return ERR_PTR(-ENOTSOCK);
756
757 if (sock->sk->sk_family != PF_XDP) {
758 sockfd_put(sock);
759 return ERR_PTR(-ENOPROTOOPT);
760 }
761
762 return sock;
763 }
764
765 static bool xsk_validate_queues(struct xdp_sock *xs)
766 {
767 return xs->fq_tmp && xs->cq_tmp;
768 }
769
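/* Bind the socket to a device and queue id. With XDP_SHARED_UMEM the umem
 * of the socket given by sxdp_shared_umem_fd is reused: if that socket is
 * bound to the same device and queue, its buffer pool is shared as well,
 * otherwise a new pool is created on top of the shared umem. Without the
 * flag the socket must have registered its own umem and created both a
 * fill and a completion ring. The transition from XSK_READY to XSK_BOUND
 * happens under xs->mutex with the rtnl lock held.
 */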
770 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
771 {
772 struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
773 struct sock *sk = sock->sk;
774 struct xdp_sock *xs = xdp_sk(sk);
775 struct net_device *dev;
776 u32 flags, qid;
777 int err = 0;
778
779 if (addr_len < sizeof(struct sockaddr_xdp))
780 return -EINVAL;
781 if (sxdp->sxdp_family != AF_XDP)
782 return -EINVAL;
783
784 flags = sxdp->sxdp_flags;
785 if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
786 XDP_USE_NEED_WAKEUP))
787 return -EINVAL;
788
789 rtnl_lock();
790 mutex_lock(&xs->mutex);
791 if (xs->state != XSK_READY) {
792 err = -EBUSY;
793 goto out_release;
794 }
795
796 dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
797 if (!dev) {
798 err = -ENODEV;
799 goto out_release;
800 }
801
802 if (!xs->rx && !xs->tx) {
803 err = -EINVAL;
804 goto out_unlock;
805 }
806
807 qid = sxdp->sxdp_queue_id;
808
809 if (flags & XDP_SHARED_UMEM) {
810 struct xdp_sock *umem_xs;
811 struct socket *sock;
812
813 if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
814 (flags & XDP_USE_NEED_WAKEUP)) {
815 /* Cannot specify flags for shared sockets. */
816 err = -EINVAL;
817 goto out_unlock;
818 }
819
820 if (xs->umem) {
821 /* We already have our own umem. */
822 err = -EINVAL;
823 goto out_unlock;
824 }
825
826 sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
827 if (IS_ERR(sock)) {
828 err = PTR_ERR(sock);
829 goto out_unlock;
830 }
831
832 umem_xs = xdp_sk(sock->sk);
833 if (!xsk_is_bound(umem_xs)) {
834 err = -EBADF;
835 sockfd_put(sock);
836 goto out_unlock;
837 }
838
839 if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
840 /* Share the umem with another socket on another qid
841 * and/or device.
842 */
843 xs->pool = xp_create_and_assign_umem(xs,
844 umem_xs->umem);
845 if (!xs->pool) {
846 err = -ENOMEM;
847 sockfd_put(sock);
848 goto out_unlock;
849 }
850
851 err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
852 dev, qid);
853 if (err) {
854 xp_destroy(xs->pool);
855 xs->pool = NULL;
856 sockfd_put(sock);
857 goto out_unlock;
858 }
859 } else {
860 /* Share the buffer pool with the other socket. */
861 if (xs->fq_tmp || xs->cq_tmp) {
862 /* Do not allow setting your own fq or cq. */
863 err = -EINVAL;
864 sockfd_put(sock);
865 goto out_unlock;
866 }
867
868 xp_get_pool(umem_xs->pool);
869 xs->pool = umem_xs->pool;
870 }
871
872 xdp_get_umem(umem_xs->umem);
873 WRITE_ONCE(xs->umem, umem_xs->umem);
874 sockfd_put(sock);
875 } else if (!xs->umem || !xsk_validate_queues(xs)) {
876 err = -EINVAL;
877 goto out_unlock;
878 } else {
879 /* This xsk has its own umem. */
880 xs->pool = xp_create_and_assign_umem(xs, xs->umem);
881 if (!xs->pool) {
882 err = -ENOMEM;
883 goto out_unlock;
884 }
885
886 err = xp_assign_dev(xs->pool, dev, qid, flags);
887 if (err) {
888 xp_destroy(xs->pool);
889 xs->pool = NULL;
890 goto out_unlock;
891 }
892 }
893
894 /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
895 xs->fq_tmp = NULL;
896 xs->cq_tmp = NULL;
897
898 xs->dev = dev;
899 xs->zc = xs->umem->zc;
900 xs->queue_id = qid;
901 xp_add_xsk(xs->pool, xs);
902
903 out_unlock:
904 if (err) {
905 dev_put(dev);
906 } else {
907 /* Matches smp_rmb() in bind() for shared umem
908 * sockets, and xsk_is_bound().
909 */
910 smp_wmb();
911 WRITE_ONCE(xs->state, XSK_BOUND);
912 }
913 out_release:
914 mutex_unlock(&xs->mutex);
915 rtnl_unlock();
916 return err;
917 }
918
919 struct xdp_umem_reg_v1 {
920 __u64 addr; /* Start of packet data area */
921 __u64 len; /* Length of packet data area */
922 __u32 chunk_size;
923 __u32 headroom;
924 };
925
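/* Socket options: create the Rx/Tx rings, register the umem and create the
 * fill/completion rings. All of these are only permitted while the socket
 * is in the XSK_READY state, i.e. before a successful bind().
 */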
926 static int xsk_setsockopt(struct socket *sock, int level, int optname,
927 sockptr_t optval, unsigned int optlen)
928 {
929 struct sock *sk = sock->sk;
930 struct xdp_sock *xs = xdp_sk(sk);
931 int err;
932
933 if (level != SOL_XDP)
934 return -ENOPROTOOPT;
935
936 switch (optname) {
937 case XDP_RX_RING:
938 case XDP_TX_RING:
939 {
940 struct xsk_queue **q;
941 int entries;
942
943 if (optlen < sizeof(entries))
944 return -EINVAL;
945 if (copy_from_sockptr(&entries, optval, sizeof(entries)))
946 return -EFAULT;
947
948 mutex_lock(&xs->mutex);
949 if (xs->state != XSK_READY) {
950 mutex_unlock(&xs->mutex);
951 return -EBUSY;
952 }
953 q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
954 err = xsk_init_queue(entries, q, false);
955 if (!err && optname == XDP_TX_RING)
956 /* Tx needs to be explicitly woken up the first time */
957 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
958 mutex_unlock(&xs->mutex);
959 return err;
960 }
961 case XDP_UMEM_REG:
962 {
963 size_t mr_size = sizeof(struct xdp_umem_reg);
964 struct xdp_umem_reg mr = {};
965 struct xdp_umem *umem;
966
967 if (optlen < sizeof(struct xdp_umem_reg_v1))
968 return -EINVAL;
969 else if (optlen < sizeof(mr))
970 mr_size = sizeof(struct xdp_umem_reg_v1);
971
972 if (copy_from_sockptr(&mr, optval, mr_size))
973 return -EFAULT;
974
975 mutex_lock(&xs->mutex);
976 if (xs->state != XSK_READY || xs->umem) {
977 mutex_unlock(&xs->mutex);
978 return -EBUSY;
979 }
980
981 umem = xdp_umem_create(&mr);
982 if (IS_ERR(umem)) {
983 mutex_unlock(&xs->mutex);
984 return PTR_ERR(umem);
985 }
986
987 /* Make sure umem is ready before it can be seen by others */
988 smp_wmb();
989 WRITE_ONCE(xs->umem, umem);
990 mutex_unlock(&xs->mutex);
991 return 0;
992 }
993 case XDP_UMEM_FILL_RING:
994 case XDP_UMEM_COMPLETION_RING:
995 {
996 struct xsk_queue **q;
997 int entries;
998
999 if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1000 return -EFAULT;
1001
1002 mutex_lock(&xs->mutex);
1003 if (xs->state != XSK_READY) {
1004 mutex_unlock(&xs->mutex);
1005 return -EBUSY;
1006 }
1007
1008 q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
1009 &xs->cq_tmp;
1010 err = xsk_init_queue(entries, q, true);
1011 mutex_unlock(&xs->mutex);
1012 return err;
1013 }
1014 default:
1015 break;
1016 }
1017
1018 return -ENOPROTOOPT;
1019 }
1020
1021 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
1022 {
1023 ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
1024 ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
1025 ring->desc = offsetof(struct xdp_rxtx_ring, desc);
1026 }
1027
1028 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
1029 {
1030 ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
1031 ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
1032 ring->desc = offsetof(struct xdp_umem_ring, desc);
1033 }
1034
1035 struct xdp_statistics_v1 {
1036 __u64 rx_dropped;
1037 __u64 rx_invalid_descs;
1038 __u64 tx_invalid_descs;
1039 };
1040
1041 static int xsk_getsockopt(struct socket *sock, int level, int optname,
1042 char __user *optval, int __user *optlen)
1043 {
1044 struct sock *sk = sock->sk;
1045 struct xdp_sock *xs = xdp_sk(sk);
1046 int len;
1047
1048 if (level != SOL_XDP)
1049 return -ENOPROTOOPT;
1050
1051 if (get_user(len, optlen))
1052 return -EFAULT;
1053 if (len < 0)
1054 return -EINVAL;
1055
1056 switch (optname) {
1057 case XDP_STATISTICS:
1058 {
1059 struct xdp_statistics stats = {};
1060 bool extra_stats = true;
1061 size_t stats_size;
1062
1063 if (len < sizeof(struct xdp_statistics_v1)) {
1064 return -EINVAL;
1065 } else if (len < sizeof(stats)) {
1066 extra_stats = false;
1067 stats_size = sizeof(struct xdp_statistics_v1);
1068 } else {
1069 stats_size = sizeof(stats);
1070 }
1071
1072 mutex_lock(&xs->mutex);
1073 stats.rx_dropped = xs->rx_dropped;
1074 if (extra_stats) {
1075 stats.rx_ring_full = xs->rx_queue_full;
1076 stats.rx_fill_ring_empty_descs =
1077 xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
1078 stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
1079 } else {
1080 stats.rx_dropped += xs->rx_queue_full;
1081 }
1082 stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
1083 stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
1084 mutex_unlock(&xs->mutex);
1085
1086 if (copy_to_user(optval, &stats, stats_size))
1087 return -EFAULT;
1088 if (put_user(stats_size, optlen))
1089 return -EFAULT;
1090
1091 return 0;
1092 }
1093 case XDP_MMAP_OFFSETS:
1094 {
1095 struct xdp_mmap_offsets off;
1096 struct xdp_mmap_offsets_v1 off_v1;
1097 bool flags_supported = true;
1098 void *to_copy;
1099
1100 if (len < sizeof(off_v1))
1101 return -EINVAL;
1102 else if (len < sizeof(off))
1103 flags_supported = false;
1104
1105 if (flags_supported) {
1106 /* xdp_ring_offset is identical to xdp_ring_offset_v1
1107 * except for the flags field added to the end.
1108 */
1109 xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1110 &off.rx);
1111 xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1112 &off.tx);
1113 xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1114 &off.fr);
1115 xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1116 &off.cr);
1117 off.rx.flags = offsetof(struct xdp_rxtx_ring,
1118 ptrs.flags);
1119 off.tx.flags = offsetof(struct xdp_rxtx_ring,
1120 ptrs.flags);
1121 off.fr.flags = offsetof(struct xdp_umem_ring,
1122 ptrs.flags);
1123 off.cr.flags = offsetof(struct xdp_umem_ring,
1124 ptrs.flags);
1125
1126 len = sizeof(off);
1127 to_copy = &off;
1128 } else {
1129 xsk_enter_rxtx_offsets(&off_v1.rx);
1130 xsk_enter_rxtx_offsets(&off_v1.tx);
1131 xsk_enter_umem_offsets(&off_v1.fr);
1132 xsk_enter_umem_offsets(&off_v1.cr);
1133
1134 len = sizeof(off_v1);
1135 to_copy = &off_v1;
1136 }
1137
1138 if (copy_to_user(optval, to_copy, len))
1139 return -EFAULT;
1140 if (put_user(len, optlen))
1141 return -EFAULT;
1142
1143 return 0;
1144 }
1145 case XDP_OPTIONS:
1146 {
1147 struct xdp_options opts = {};
1148
1149 if (len < sizeof(opts))
1150 return -EINVAL;
1151
1152 mutex_lock(&xs->mutex);
1153 if (xs->zc)
1154 opts.flags |= XDP_OPTIONS_ZEROCOPY;
1155 mutex_unlock(&xs->mutex);
1156
1157 len = sizeof(opts);
1158 if (copy_to_user(optval, &opts, len))
1159 return -EFAULT;
1160 if (put_user(len, optlen))
1161 return -EFAULT;
1162
1163 return 0;
1164 }
1165 default:
1166 break;
1167 }
1168
1169 return -EOPNOTSUPP;
1170 }
1171
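/* Map one of the four rings into userspace. The ring is selected by the
 * page offset passed to mmap() (the XDP_PGOFF_* and XDP_UMEM_PGOFF_*
 * constants) and mapping is only allowed while the socket is still in the
 * XSK_READY state.
 */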
1172 static int xsk_mmap(struct file *file, struct socket *sock,
1173 struct vm_area_struct *vma)
1174 {
1175 loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1176 unsigned long size = vma->vm_end - vma->vm_start;
1177 struct xdp_sock *xs = xdp_sk(sock->sk);
1178 struct xsk_queue *q = NULL;
1179 unsigned long pfn;
1180 struct page *qpg;
1181
1182 if (READ_ONCE(xs->state) != XSK_READY)
1183 return -EBUSY;
1184
1185 if (offset == XDP_PGOFF_RX_RING) {
1186 q = READ_ONCE(xs->rx);
1187 } else if (offset == XDP_PGOFF_TX_RING) {
1188 q = READ_ONCE(xs->tx);
1189 } else {
1190 /* Matches the smp_wmb() in XDP_UMEM_REG */
1191 smp_rmb();
1192 if (offset == XDP_UMEM_PGOFF_FILL_RING)
1193 q = READ_ONCE(xs->fq_tmp);
1194 else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
1195 q = READ_ONCE(xs->cq_tmp);
1196 }
1197
1198 if (!q)
1199 return -EINVAL;
1200
1201 /* Matches the smp_wmb() in xsk_init_queue */
1202 smp_rmb();
1203 qpg = virt_to_head_page(q->ring);
1204 if (size > page_size(qpg))
1205 return -EINVAL;
1206
1207 pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
1208 return remap_pfn_range(vma, vma->vm_start, pfn,
1209 size, vma->vm_page_prot);
1210 }
1211
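/* Netdev notifier: when a device is unregistered, every socket bound to it
 * gets sk_err set to ENETDOWN, is unbound and has the device references in
 * its buffer pool cleared.
 */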
1212 static int xsk_notifier(struct notifier_block *this,
1213 unsigned long msg, void *ptr)
1214 {
1215 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1216 struct net *net = dev_net(dev);
1217 struct sock *sk;
1218
1219 switch (msg) {
1220 case NETDEV_UNREGISTER:
1221 mutex_lock(&net->xdp.lock);
1222 sk_for_each(sk, &net->xdp.list) {
1223 struct xdp_sock *xs = xdp_sk(sk);
1224
1225 mutex_lock(&xs->mutex);
1226 if (xs->dev == dev) {
1227 sk->sk_err = ENETDOWN;
1228 if (!sock_flag(sk, SOCK_DEAD))
1229 sk->sk_error_report(sk);
1230
1231 xsk_unbind_dev(xs);
1232
1233 /* Clear device references. */
1234 xp_clear_dev(xs->pool);
1235 }
1236 mutex_unlock(&xs->mutex);
1237 }
1238 mutex_unlock(&net->xdp.lock);
1239 break;
1240 }
1241 return NOTIFY_DONE;
1242 }
1243
1244 static struct proto xsk_proto = {
1245 .name = "XDP",
1246 .owner = THIS_MODULE,
1247 .obj_size = sizeof(struct xdp_sock),
1248 };
1249
1250 static const struct proto_ops xsk_proto_ops = {
1251 .family = PF_XDP,
1252 .owner = THIS_MODULE,
1253 .release = xsk_release,
1254 .bind = xsk_bind,
1255 .connect = sock_no_connect,
1256 .socketpair = sock_no_socketpair,
1257 .accept = sock_no_accept,
1258 .getname = sock_no_getname,
1259 .poll = xsk_poll,
1260 .ioctl = sock_no_ioctl,
1261 .listen = sock_no_listen,
1262 .shutdown = sock_no_shutdown,
1263 .setsockopt = xsk_setsockopt,
1264 .getsockopt = xsk_getsockopt,
1265 .sendmsg = xsk_sendmsg,
1266 .recvmsg = xsk_recvmsg,
1267 .mmap = xsk_mmap,
1268 .sendpage = sock_no_sendpage,
1269 };
1270
1271 static void xsk_destruct(struct sock *sk)
1272 {
1273 struct xdp_sock *xs = xdp_sk(sk);
1274
1275 if (!sock_flag(sk, SOCK_DEAD))
1276 return;
1277
1278 if (!xp_put_pool(xs->pool))
1279 xdp_put_umem(xs->umem, !xs->pool);
1280
1281 sk_refcnt_debug_dec(sk);
1282 }
1283
1284 static int xsk_create(struct net *net, struct socket *sock, int protocol,
1285 int kern)
1286 {
1287 struct xdp_sock *xs;
1288 struct sock *sk;
1289
1290 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1291 return -EPERM;
1292 if (sock->type != SOCK_RAW)
1293 return -ESOCKTNOSUPPORT;
1294
1295 if (protocol)
1296 return -EPROTONOSUPPORT;
1297
1298 sock->state = SS_UNCONNECTED;
1299
1300 sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1301 if (!sk)
1302 return -ENOBUFS;
1303
1304 sock->ops = &xsk_proto_ops;
1305
1306 sock_init_data(sock, sk);
1307
1308 sk->sk_family = PF_XDP;
1309
1310 sk->sk_destruct = xsk_destruct;
1311 sk_refcnt_debug_inc(sk);
1312
1313 sock_set_flag(sk, SOCK_RCU_FREE);
1314
1315 xs = xdp_sk(sk);
1316 xs->state = XSK_READY;
1317 mutex_init(&xs->mutex);
1318 spin_lock_init(&xs->rx_lock);
1319
1320 INIT_LIST_HEAD(&xs->map_list);
1321 spin_lock_init(&xs->map_list_lock);
1322
1323 mutex_lock(&net->xdp.lock);
1324 sk_add_node_rcu(sk, &net->xdp.list);
1325 mutex_unlock(&net->xdp.lock);
1326
1327 local_bh_disable();
1328 sock_prot_inuse_add(net, &xsk_proto, 1);
1329 local_bh_enable();
1330
1331 return 0;
1332 }
1333
1334 static const struct net_proto_family xsk_family_ops = {
1335 .family = PF_XDP,
1336 .create = xsk_create,
1337 .owner = THIS_MODULE,
1338 };
1339
1340 static struct notifier_block xsk_netdev_notifier = {
1341 .notifier_call = xsk_notifier,
1342 };
1343
1344 static int __net_init xsk_net_init(struct net *net)
1345 {
1346 mutex_init(&net->xdp.lock);
1347 INIT_HLIST_HEAD(&net->xdp.list);
1348 return 0;
1349 }
1350
1351 static void __net_exit xsk_net_exit(struct net *net)
1352 {
1353 WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1354 }
1355
1356 static struct pernet_operations xsk_net_ops = {
1357 .init = xsk_net_init,
1358 .exit = xsk_net_exit,
1359 };
1360
1361 static int __init xsk_init(void)
1362 {
1363 int err, cpu;
1364
1365 err = proto_register(&xsk_proto, 0 /* no slab */);
1366 if (err)
1367 goto out;
1368
1369 err = sock_register(&xsk_family_ops);
1370 if (err)
1371 goto out_proto;
1372
1373 err = register_pernet_subsys(&xsk_net_ops);
1374 if (err)
1375 goto out_sk;
1376
1377 err = register_netdevice_notifier(&xsk_netdev_notifier);
1378 if (err)
1379 goto out_pernet;
1380
1381 for_each_possible_cpu(cpu)
1382 INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
1383 return 0;
1384
1385 out_pernet:
1386 unregister_pernet_subsys(&xsk_net_ops);
1387 out_sk:
1388 sock_unregister(PF_XDP);
1389 out_proto:
1390 proto_unregister(&xsk_proto);
1391 out:
1392 return err;
1393 }
1394
1395 fs_initcall(xsk_init);