]>
Commit | Line | Data |
---|---|---|
c0c77d8f BT |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* XDP sockets | |
3 | * | |
4 | * AF_XDP sockets allows a channel between XDP programs and userspace | |
5 | * applications. | |
6 | * Copyright(c) 2018 Intel Corporation. | |
7 | * | |
8 | * This program is free software; you can redistribute it and/or modify it | |
9 | * under the terms and conditions of the GNU General Public License, | |
10 | * version 2, as published by the Free Software Foundation. | |
11 | * | |
12 | * This program is distributed in the hope it will be useful, but WITHOUT | |
13 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
14 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | |
15 | * more details. | |
16 | * | |
17 | * Author(s): Björn Töpel <bjorn.topel@intel.com> | |
18 | * Magnus Karlsson <magnus.karlsson@intel.com> | |
19 | */ | |
20 | ||
21 | #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ | |
22 | ||
23 | #include <linux/if_xdp.h> | |
24 | #include <linux/init.h> | |
25 | #include <linux/sched/mm.h> | |
26 | #include <linux/sched/signal.h> | |
27 | #include <linux/sched/task.h> | |
28 | #include <linux/socket.h> | |
29 | #include <linux/file.h> | |
30 | #include <linux/uaccess.h> | |
31 | #include <linux/net.h> | |
32 | #include <linux/netdevice.h> | |
33 | #include <net/xdp_sock.h> | |
b9b6b68e | 34 | #include <net/xdp.h> |
c0c77d8f | 35 | |
423f3832 | 36 | #include "xsk_queue.h" |
c0c77d8f BT |
37 | #include "xdp_umem.h" |
38 | ||
/* View a generic struct sock pointer as the AF_XDP socket type (plain cast). */
static struct xdp_sock *xdp_sk(struct sock *sk)
{
	struct xdp_sock *xs = (struct xdp_sock *)sk;

	return xs;
}
43 | ||
fbfc504a BT |
44 | bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) |
45 | { | |
46 | return !!xs->rx; | |
47 | } | |
48 | ||
c497176c BT |
49 | static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) |
50 | { | |
51 | u32 *id, len = xdp->data_end - xdp->data; | |
52 | void *buffer; | |
53 | int err = 0; | |
54 | ||
55 | if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) | |
56 | return -EINVAL; | |
57 | ||
58 | id = xskq_peek_id(xs->umem->fq); | |
59 | if (!id) | |
60 | return -ENOSPC; | |
61 | ||
62 | buffer = xdp_umem_get_data_with_headroom(xs->umem, *id); | |
63 | memcpy(buffer, xdp->data, len); | |
64 | err = xskq_produce_batch_desc(xs->rx, *id, len, | |
65 | xs->umem->frame_headroom); | |
66 | if (!err) | |
67 | xskq_discard_id(xs->umem->fq); | |
68 | ||
69 | return err; | |
70 | } | |
71 | ||
72 | int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) | |
73 | { | |
74 | int err; | |
75 | ||
76 | err = __xsk_rcv(xs, xdp); | |
77 | if (likely(!err)) | |
78 | xdp_return_buff(xdp); | |
79 | else | |
80 | xs->rx_dropped++; | |
81 | ||
82 | return err; | |
83 | } | |
84 | ||
85 | void xsk_flush(struct xdp_sock *xs) | |
86 | { | |
87 | xskq_produce_flush_desc(xs->rx); | |
88 | xs->sk.sk_data_ready(&xs->sk); | |
89 | } | |
90 | ||
91 | int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) | |
92 | { | |
93 | int err; | |
94 | ||
95 | err = __xsk_rcv(xs, xdp); | |
96 | if (!err) | |
97 | xsk_flush(xs); | |
98 | else | |
99 | xs->rx_dropped++; | |
100 | ||
101 | return err; | |
102 | } | |
103 | ||
104 | static unsigned int xsk_poll(struct file *file, struct socket *sock, | |
105 | struct poll_table_struct *wait) | |
106 | { | |
107 | unsigned int mask = datagram_poll(file, sock, wait); | |
108 | struct sock *sk = sock->sk; | |
109 | struct xdp_sock *xs = xdp_sk(sk); | |
110 | ||
111 | if (xs->rx && !xskq_empty_desc(xs->rx)) | |
112 | mask |= POLLIN | POLLRDNORM; | |
113 | ||
114 | return mask; | |
115 | } | |
116 | ||
b9b6b68e BT |
117 | static int xsk_init_queue(u32 entries, struct xsk_queue **queue, |
118 | bool umem_queue) | |
423f3832 MK |
119 | { |
120 | struct xsk_queue *q; | |
121 | ||
122 | if (entries == 0 || *queue || !is_power_of_2(entries)) | |
123 | return -EINVAL; | |
124 | ||
b9b6b68e | 125 | q = xskq_create(entries, umem_queue); |
423f3832 MK |
126 | if (!q) |
127 | return -ENOMEM; | |
128 | ||
129 | *queue = q; | |
130 | return 0; | |
131 | } | |
132 | ||
965a9909 MK |
133 | static void __xsk_release(struct xdp_sock *xs) |
134 | { | |
135 | /* Wait for driver to stop using the xdp socket. */ | |
136 | synchronize_net(); | |
137 | ||
138 | dev_put(xs->dev); | |
139 | } | |
140 | ||
c0c77d8f BT |
141 | static int xsk_release(struct socket *sock) |
142 | { | |
143 | struct sock *sk = sock->sk; | |
965a9909 | 144 | struct xdp_sock *xs = xdp_sk(sk); |
c0c77d8f BT |
145 | struct net *net; |
146 | ||
147 | if (!sk) | |
148 | return 0; | |
149 | ||
150 | net = sock_net(sk); | |
151 | ||
152 | local_bh_disable(); | |
153 | sock_prot_inuse_add(net, sk->sk_prot, -1); | |
154 | local_bh_enable(); | |
155 | ||
965a9909 MK |
156 | if (xs->dev) { |
157 | __xsk_release(xs); | |
158 | xs->dev = NULL; | |
159 | } | |
160 | ||
c0c77d8f BT |
161 | sock_orphan(sk); |
162 | sock->sk = NULL; | |
163 | ||
164 | sk_refcnt_debug_release(sk); | |
165 | sock_put(sk); | |
166 | ||
167 | return 0; | |
168 | } | |
169 | ||
965a9909 MK |
170 | static struct socket *xsk_lookup_xsk_from_fd(int fd) |
171 | { | |
172 | struct socket *sock; | |
173 | int err; | |
174 | ||
175 | sock = sockfd_lookup(fd, &err); | |
176 | if (!sock) | |
177 | return ERR_PTR(-ENOTSOCK); | |
178 | ||
179 | if (sock->sk->sk_family != PF_XDP) { | |
180 | sockfd_put(sock); | |
181 | return ERR_PTR(-ENOPROTOOPT); | |
182 | } | |
183 | ||
184 | return sock; | |
185 | } | |
186 | ||
187 | static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) | |
188 | { | |
189 | struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; | |
190 | struct sock *sk = sock->sk; | |
191 | struct net_device *dev, *dev_curr; | |
192 | struct xdp_sock *xs = xdp_sk(sk); | |
193 | struct xdp_umem *old_umem = NULL; | |
194 | int err = 0; | |
195 | ||
196 | if (addr_len < sizeof(struct sockaddr_xdp)) | |
197 | return -EINVAL; | |
198 | if (sxdp->sxdp_family != AF_XDP) | |
199 | return -EINVAL; | |
200 | ||
201 | mutex_lock(&xs->mutex); | |
202 | dev_curr = xs->dev; | |
203 | dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); | |
204 | if (!dev) { | |
205 | err = -ENODEV; | |
206 | goto out_release; | |
207 | } | |
208 | ||
209 | if (!xs->rx) { | |
210 | err = -EINVAL; | |
211 | goto out_unlock; | |
212 | } | |
213 | ||
214 | if (sxdp->sxdp_queue_id >= dev->num_rx_queues) { | |
215 | err = -EINVAL; | |
216 | goto out_unlock; | |
217 | } | |
218 | ||
219 | if (sxdp->sxdp_flags & XDP_SHARED_UMEM) { | |
220 | struct xdp_sock *umem_xs; | |
221 | struct socket *sock; | |
222 | ||
223 | if (xs->umem) { | |
224 | /* We have already our own. */ | |
225 | err = -EINVAL; | |
226 | goto out_unlock; | |
227 | } | |
228 | ||
229 | sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd); | |
230 | if (IS_ERR(sock)) { | |
231 | err = PTR_ERR(sock); | |
232 | goto out_unlock; | |
233 | } | |
234 | ||
235 | umem_xs = xdp_sk(sock->sk); | |
236 | if (!umem_xs->umem) { | |
237 | /* No umem to inherit. */ | |
238 | err = -EBADF; | |
239 | sockfd_put(sock); | |
240 | goto out_unlock; | |
241 | } else if (umem_xs->dev != dev || | |
242 | umem_xs->queue_id != sxdp->sxdp_queue_id) { | |
243 | err = -EINVAL; | |
244 | sockfd_put(sock); | |
245 | goto out_unlock; | |
246 | } | |
247 | ||
248 | xdp_get_umem(umem_xs->umem); | |
249 | old_umem = xs->umem; | |
250 | xs->umem = umem_xs->umem; | |
251 | sockfd_put(sock); | |
252 | } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { | |
253 | err = -EINVAL; | |
254 | goto out_unlock; | |
c497176c BT |
255 | } else { |
256 | /* This xsk has its own umem. */ | |
257 | xskq_set_umem(xs->umem->fq, &xs->umem->props); | |
fe230832 | 258 | xskq_set_umem(xs->umem->cq, &xs->umem->props); |
965a9909 MK |
259 | } |
260 | ||
261 | /* Rebind? */ | |
262 | if (dev_curr && (dev_curr != dev || | |
263 | xs->queue_id != sxdp->sxdp_queue_id)) { | |
264 | __xsk_release(xs); | |
265 | if (old_umem) | |
266 | xdp_put_umem(old_umem); | |
267 | } | |
268 | ||
269 | xs->dev = dev; | |
270 | xs->queue_id = sxdp->sxdp_queue_id; | |
271 | ||
272 | xskq_set_umem(xs->rx, &xs->umem->props); | |
273 | ||
274 | out_unlock: | |
275 | if (err) | |
276 | dev_put(dev); | |
277 | out_release: | |
278 | mutex_unlock(&xs->mutex); | |
279 | return err; | |
280 | } | |
281 | ||
c0c77d8f BT |
282 | static int xsk_setsockopt(struct socket *sock, int level, int optname, |
283 | char __user *optval, unsigned int optlen) | |
284 | { | |
285 | struct sock *sk = sock->sk; | |
286 | struct xdp_sock *xs = xdp_sk(sk); | |
287 | int err; | |
288 | ||
289 | if (level != SOL_XDP) | |
290 | return -ENOPROTOOPT; | |
291 | ||
292 | switch (optname) { | |
b9b6b68e BT |
293 | case XDP_RX_RING: |
294 | { | |
295 | struct xsk_queue **q; | |
296 | int entries; | |
297 | ||
298 | if (optlen < sizeof(entries)) | |
299 | return -EINVAL; | |
300 | if (copy_from_user(&entries, optval, sizeof(entries))) | |
301 | return -EFAULT; | |
302 | ||
303 | mutex_lock(&xs->mutex); | |
304 | q = &xs->rx; | |
305 | err = xsk_init_queue(entries, q, false); | |
306 | mutex_unlock(&xs->mutex); | |
307 | return err; | |
308 | } | |
c0c77d8f BT |
309 | case XDP_UMEM_REG: |
310 | { | |
311 | struct xdp_umem_reg mr; | |
312 | struct xdp_umem *umem; | |
313 | ||
314 | if (xs->umem) | |
315 | return -EBUSY; | |
316 | ||
317 | if (copy_from_user(&mr, optval, sizeof(mr))) | |
318 | return -EFAULT; | |
319 | ||
320 | mutex_lock(&xs->mutex); | |
321 | err = xdp_umem_create(&umem); | |
322 | ||
323 | err = xdp_umem_reg(umem, &mr); | |
324 | if (err) { | |
325 | kfree(umem); | |
326 | mutex_unlock(&xs->mutex); | |
327 | return err; | |
328 | } | |
329 | ||
330 | /* Make sure umem is ready before it can be seen by others */ | |
331 | smp_wmb(); | |
332 | ||
333 | xs->umem = umem; | |
334 | mutex_unlock(&xs->mutex); | |
335 | return 0; | |
336 | } | |
423f3832 | 337 | case XDP_UMEM_FILL_RING: |
fe230832 | 338 | case XDP_UMEM_COMPLETION_RING: |
423f3832 MK |
339 | { |
340 | struct xsk_queue **q; | |
341 | int entries; | |
342 | ||
343 | if (!xs->umem) | |
344 | return -EINVAL; | |
345 | ||
346 | if (copy_from_user(&entries, optval, sizeof(entries))) | |
347 | return -EFAULT; | |
348 | ||
349 | mutex_lock(&xs->mutex); | |
fe230832 MK |
350 | q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : |
351 | &xs->umem->cq; | |
b9b6b68e | 352 | err = xsk_init_queue(entries, q, true); |
423f3832 MK |
353 | mutex_unlock(&xs->mutex); |
354 | return err; | |
355 | } | |
c0c77d8f BT |
356 | default: |
357 | break; | |
358 | } | |
359 | ||
360 | return -ENOPROTOOPT; | |
361 | } | |
362 | ||
423f3832 MK |
363 | static int xsk_mmap(struct file *file, struct socket *sock, |
364 | struct vm_area_struct *vma) | |
365 | { | |
366 | unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; | |
367 | unsigned long size = vma->vm_end - vma->vm_start; | |
368 | struct xdp_sock *xs = xdp_sk(sock->sk); | |
369 | struct xsk_queue *q = NULL; | |
370 | unsigned long pfn; | |
371 | struct page *qpg; | |
372 | ||
b9b6b68e BT |
373 | if (offset == XDP_PGOFF_RX_RING) { |
374 | q = xs->rx; | |
375 | } else { | |
376 | if (!xs->umem) | |
377 | return -EINVAL; | |
423f3832 | 378 | |
b9b6b68e BT |
379 | if (offset == XDP_UMEM_PGOFF_FILL_RING) |
380 | q = xs->umem->fq; | |
fe230832 MK |
381 | else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) |
382 | q = xs->umem->cq; | |
b9b6b68e BT |
383 | else |
384 | return -EINVAL; | |
385 | } | |
423f3832 MK |
386 | |
387 | if (!q) | |
388 | return -EINVAL; | |
389 | ||
390 | qpg = virt_to_head_page(q->ring); | |
391 | if (size > (PAGE_SIZE << compound_order(qpg))) | |
392 | return -EINVAL; | |
393 | ||
394 | pfn = virt_to_phys(q->ring) >> PAGE_SHIFT; | |
395 | return remap_pfn_range(vma, vma->vm_start, pfn, | |
396 | size, vma->vm_page_prot); | |
397 | } | |
398 | ||
c0c77d8f BT |
399 | static struct proto xsk_proto = { |
400 | .name = "XDP", | |
401 | .owner = THIS_MODULE, | |
402 | .obj_size = sizeof(struct xdp_sock), | |
403 | }; | |
404 | ||
405 | static const struct proto_ops xsk_proto_ops = { | |
406 | .family = PF_XDP, | |
407 | .owner = THIS_MODULE, | |
408 | .release = xsk_release, | |
965a9909 | 409 | .bind = xsk_bind, |
c0c77d8f BT |
410 | .connect = sock_no_connect, |
411 | .socketpair = sock_no_socketpair, | |
412 | .accept = sock_no_accept, | |
413 | .getname = sock_no_getname, | |
c497176c | 414 | .poll = xsk_poll, |
c0c77d8f BT |
415 | .ioctl = sock_no_ioctl, |
416 | .listen = sock_no_listen, | |
417 | .shutdown = sock_no_shutdown, | |
418 | .setsockopt = xsk_setsockopt, | |
419 | .getsockopt = sock_no_getsockopt, | |
420 | .sendmsg = sock_no_sendmsg, | |
421 | .recvmsg = sock_no_recvmsg, | |
423f3832 | 422 | .mmap = xsk_mmap, |
c0c77d8f BT |
423 | .sendpage = sock_no_sendpage, |
424 | }; | |
425 | ||
426 | static void xsk_destruct(struct sock *sk) | |
427 | { | |
428 | struct xdp_sock *xs = xdp_sk(sk); | |
429 | ||
430 | if (!sock_flag(sk, SOCK_DEAD)) | |
431 | return; | |
432 | ||
b9b6b68e | 433 | xskq_destroy(xs->rx); |
c0c77d8f BT |
434 | xdp_put_umem(xs->umem); |
435 | ||
436 | sk_refcnt_debug_dec(sk); | |
437 | } | |
438 | ||
439 | static int xsk_create(struct net *net, struct socket *sock, int protocol, | |
440 | int kern) | |
441 | { | |
442 | struct sock *sk; | |
443 | struct xdp_sock *xs; | |
444 | ||
445 | if (!ns_capable(net->user_ns, CAP_NET_RAW)) | |
446 | return -EPERM; | |
447 | if (sock->type != SOCK_RAW) | |
448 | return -ESOCKTNOSUPPORT; | |
449 | ||
450 | if (protocol) | |
451 | return -EPROTONOSUPPORT; | |
452 | ||
453 | sock->state = SS_UNCONNECTED; | |
454 | ||
455 | sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); | |
456 | if (!sk) | |
457 | return -ENOBUFS; | |
458 | ||
459 | sock->ops = &xsk_proto_ops; | |
460 | ||
461 | sock_init_data(sock, sk); | |
462 | ||
463 | sk->sk_family = PF_XDP; | |
464 | ||
465 | sk->sk_destruct = xsk_destruct; | |
466 | sk_refcnt_debug_inc(sk); | |
467 | ||
468 | xs = xdp_sk(sk); | |
469 | mutex_init(&xs->mutex); | |
470 | ||
471 | local_bh_disable(); | |
472 | sock_prot_inuse_add(net, &xsk_proto, 1); | |
473 | local_bh_enable(); | |
474 | ||
475 | return 0; | |
476 | } | |
477 | ||
478 | static const struct net_proto_family xsk_family_ops = { | |
479 | .family = PF_XDP, | |
480 | .create = xsk_create, | |
481 | .owner = THIS_MODULE, | |
482 | }; | |
483 | ||
484 | static int __init xsk_init(void) | |
485 | { | |
486 | int err; | |
487 | ||
488 | err = proto_register(&xsk_proto, 0 /* no slab */); | |
489 | if (err) | |
490 | goto out; | |
491 | ||
492 | err = sock_register(&xsk_family_ops); | |
493 | if (err) | |
494 | goto out_proto; | |
495 | ||
496 | return 0; | |
497 | ||
498 | out_proto: | |
499 | proto_unregister(&xsk_proto); | |
500 | out: | |
501 | return err; | |
502 | } | |
503 | ||
504 | fs_initcall(xsk_init); |