// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */
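
/* Minimal userspace setup sketch (illustrative only; bufs, size, ifindex
 * and entries are placeholders, the option payloads are defined in
 * <linux/if_xdp.h>, and error handling is omitted):
 *
 *	int fd = socket(AF_XDP, SOCK_RAW, 0);
 *	struct xdp_umem_reg mr = { .addr = (__u64)bufs, .len = size,
 *				   .chunk_size = 2048, .headroom = 0 };
 *
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &entries, sizeof(entries));
 *	setsockopt(fd, SOL_XDP, XDP_RX_RING, &entries, sizeof(entries));
 *
 *	struct sockaddr_xdp sxdp = { .sxdp_family = AF_XDP,
 *				     .sxdp_ifindex = ifindex,
 *				     .sxdp_queue_id = 0 };
 *
 *	bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 */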

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"

#define TX_BATCH_SIZE 16

static struct xdp_sock *xdp_sk(struct sock *sk)
{
	return (struct xdp_sock *)sk;
}

bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
	return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
		READ_ONCE(xs->umem->fq);
}

u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
	return xskq_peek_addr(umem->fq, addr);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);

void xsk_umem_discard_addr(struct xdp_umem *umem)
{
	xskq_discard_addr(umem->fq);
}
EXPORT_SYMBOL(xsk_umem_discard_addr);
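
/* Hedged sketch of how a zero-copy driver is expected to drain the fill
 * ring with the two exports above (the HW-posting step is a placeholder,
 * not an API defined here):
 *
 *	u64 addr;
 *
 *	while (xsk_umem_peek_addr(umem, &addr)) {
 *		// post the umem frame at 'addr' to the HW RX ring
 *		xsk_umem_discard_addr(umem);
 *	}
 */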

static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	void *to_buf, *from_buf;
	u32 metalen;
	u64 addr;
	int err;

	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	addr += xs->umem->headroom;

	if (unlikely(xdp_data_meta_unsupported(xdp))) {
		from_buf = xdp->data;
		metalen = 0;
	} else {
		from_buf = xdp->data_meta;
		metalen = xdp->data - xdp->data_meta;
	}

	to_buf = xdp_umem_get_data(xs->umem, addr);
	memcpy(to_buf, from_buf, len + metalen);
	addr += metalen;
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (!err) {
		xskq_discard_addr(xs->umem->fq);
		xdp_return_buff(xdp);
		return 0;
	}

	xs->rx_dropped++;
	return err;
}

static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);

	if (err)
		xs->rx_dropped++;

	return err;
}

int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 len;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	len = xdp->data_end - xdp->data;

	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
}

void xsk_flush(struct xdp_sock *xs)
{
	xskq_produce_flush_desc(xs->rx);
	xs->sk.sk_data_ready(&xs->sk);
}
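
/* Note on the RX paths above: on the copy path, __xsk_rcv() only stages
 * descriptors with xskq_produce_batch_desc(); they become visible to
 * userspace when xsk_flush() publishes them and sk_data_ready() wakes
 * any poll()er. On the zero-copy path, __xsk_rcv_zc() hands back the
 * umem handle the driver attached to the xdp_buff, so no copy is made.
 */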

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 metalen = xdp->data - xdp->data_meta;
	u32 len = xdp->data_end - xdp->data;
	void *buffer;
	u64 addr;
	int err;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	addr += xs->umem->headroom;

	buffer = xdp_umem_get_data(xs->umem, addr);
	memcpy(buffer, xdp->data_meta, len + metalen);
	addr += metalen;
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (!err) {
		xskq_discard_addr(xs->umem->fq);
		xsk_flush(xs);
		return 0;
	}

	xs->rx_dropped++;
	return err;
}

void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
	xskq_produce_flush_addr_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);

void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);

bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
{
	struct xdp_desc desc;
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		if (!xskq_peek_desc(xs->tx, &desc))
			continue;

		if (xskq_produce_addr_lazy(umem->cq, desc.addr))
			goto out;

		*dma = xdp_umem_get_dma(umem, desc.addr);
		*len = desc.len;

		xskq_discard_desc(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);
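
/* Expected driver-side zero-copy TX flow built from the three exports
 * above (a hedged sketch; hw_ring_has_room() and the HW-posting step are
 * placeholders, not APIs defined here):
 *
 *	dma_addr_t dma;
 *	u32 len, sent = 0;
 *
 *	while (hw_ring_has_room() && xsk_umem_consume_tx(umem, &dma, &len)) {
 *		// post (dma, len) to the HW TX ring
 *		sent++;
 *	}
 *	// later, when the HW reports the frames as transmitted:
 *	xsk_umem_complete_tx(umem, sent);
 *	xsk_umem_consume_tx_done(umem);
 */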

static int xsk_zc_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev = xs->dev;

	return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
}

static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->tx_completion_lock, flags);
	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

	sock_wfree(skb);
}

static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
			    size_t total_len)
{
	u32 max_batch = TX_BATCH_SIZE;
	struct xdp_sock *xs = xdp_sk(sk);
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	mutex_lock(&xs->mutex);

	while (xskq_peek_desc(xs->tx, &desc)) {
		char *buffer;
		u64 addr;
		u32 len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		if (xskq_reserve_addr(xs->umem->cq))
			goto out;

		if (xs->queue_id >= xs->dev->real_num_tx_queues)
			goto out;

		len = desc.len;
		skb = sock_alloc_send_skb(sk, len, 1, &err);
		if (unlikely(!skb)) {
			err = -EAGAIN;
			goto out;
		}

		skb_put(skb, len);
		addr = desc.addr;
		buffer = xdp_umem_get_data(xs->umem, addr);
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
		skb->destructor = xsk_destruct_skb;

		err = dev_direct_xmit(skb, xs->queue_id);
		xskq_discard_desc(xs->tx);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
			/* SKB completed but not sent */
			err = -EBUSY;
			goto out;
		}

		sent_frame = true;
	}

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}
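
/* In the copy-mode TX path above, completion is deferred: space in the
 * completion ring is reserved up front with xskq_reserve_addr(), and the
 * entry itself is produced by xsk_destruct_skb() when the skb is freed.
 * The umem frame thus stays owned by the kernel until the stack is
 * genuinely done with the frame.
 */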

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xs->dev))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->tx))
		return -ENOBUFS;
	if (need_wait)
		return -EOPNOTSUPP;

	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
}

static unsigned int xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	unsigned int mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (xs->rx && !xskq_empty_desc(xs->rx))
		mask |= POLLIN | POLLRDNORM;
	if (xs->tx && !xskq_full_desc(xs->tx))
		mask |= POLLOUT | POLLWRNORM;

	return mask;
}

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	*queue = q;
	return 0;
}
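
/* The smp_wmb() above is intended to pair with the READ_ONCE() loads in
 * readers such as xsk_is_setup_for_bpf_map() and xsk_mmap(): a reader
 * that observes a non-NULL queue pointer must also observe the queue's
 * initialized contents.
 */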

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	if (xs->dev) {
		struct net_device *dev = xs->dev;

		/* Wait for driver to stop using the xdp socket. */
		xdp_del_sk_umem(xs->umem, xs);
		xs->dev = NULL;
		synchronize_net();
		dev_put(dev);
	}

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);
	xdp_put_umem(xs->umem);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	mutex_lock(&xs->mutex);
	if (xs->dev) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;
	flags = sxdp->sxdp_flags;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We already have our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!umem_xs->umem) {
			/* No umem to inherit. */
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
			err = -EINVAL;
			sockfd_put(sock);
			goto out_unlock;
		}

		xdp_get_umem(umem_xs->umem);
		xs->umem = umem_xs->umem;
		sockfd_put(sock);
	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xskq_set_umem(xs->umem->fq, xs->umem->size,
			      xs->umem->chunk_mask);
		xskq_set_umem(xs->umem->cq, xs->umem->size,
			      xs->umem->chunk_mask);

		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
		if (err)
			goto out_unlock;
	}

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
	xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
	xdp_add_sk_umem(xs->umem, xs);

out_unlock:
	if (err)
		dev_put(dev);
out_release:
	mutex_unlock(&xs->mutex);
	return err;
}

static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		struct xdp_umem_reg mr;
		struct xdp_umem *umem;

		if (copy_from_user(&mr, optval, sizeof(mr)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		xs->umem = umem;
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (!xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EINVAL;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
			&xs->umem->cq;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats;

		if (len < sizeof(stats))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, sizeof(stats)))
			return -EFAULT;
		if (put_user(sizeof(stats), optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;

		if (len < sizeof(off))
			return -EINVAL;

		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.rx.desc = offsetof(struct xdp_rxtx_ring, desc);
		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.tx.desc = offsetof(struct xdp_rxtx_ring, desc);

		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.fr.desc = offsetof(struct xdp_umem_ring, desc);
		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.cr.desc = offsetof(struct xdp_umem_ring, desc);

		len = sizeof(off);
		if (copy_to_user(optval, &off, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}
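
/* Userspace maps the rings by combining XDP_MMAP_OFFSETS above with the
 * fixed XDP_PGOFF_* offsets handled by xsk_mmap() below. A hedged sketch
 * for the RX ring (entries is a placeholder, error handling omitted):
 *
 *	struct xdp_mmap_offsets off;
 *	socklen_t optlen = sizeof(off);
 *
 *	getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);
 *	char *rx = mmap(NULL, off.rx.desc + entries * sizeof(struct xdp_desc),
 *			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			fd, XDP_PGOFF_RX_RING);
 *	__u32 *rx_prod = (__u32 *)(rx + off.rx.producer);
 *	struct xdp_desc *rx_ring = (struct xdp_desc *)(rx + off.rx.desc);
 */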

static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	struct xdp_umem *umem;
	unsigned long pfn;
	struct page *qpg;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		umem = READ_ONCE(xs->umem);
		if (!umem)
			return -EINVAL;

		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(umem->fq);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(umem->cq);
	}

	if (!q)
		return -EINVAL;

	qpg = virt_to_head_page(q->ring);
	if (size > (PAGE_SIZE << compound_order(qpg)))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

static struct proto xsk_proto = {
	.name = "XDP",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family = PF_XDP,
	.owner = THIS_MODULE,
	.release = xsk_release,
	.bind = xsk_bind,
	.connect = sock_no_connect,
	.socketpair = sock_no_socketpair,
	.accept = sock_no_accept,
	.getname = sock_no_getname,
	.poll = xsk_poll,
	.ioctl = sock_no_ioctl,
	.listen = sock_no_listen,
	.shutdown = sock_no_shutdown,
	.setsockopt = xsk_setsockopt,
	.getsockopt = xsk_getsockopt,
	.sendmsg = xsk_sendmsg,
	.recvmsg = sock_no_recvmsg,
	.mmap = xsk_mmap,
	.sendpage = sock_no_sendpage,
};

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->tx_completion_lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner = THIS_MODULE,
};

static int __init xsk_init(void)
{
	int err;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	return 0;

out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);