/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
/* A BPF sock_map is used to store sock objects. This is primarily used
 * for doing socket redirect with BPF helper routines.
 *
 * A sock map may have BPF programs attached to it, currently a program
 * used to parse packets and a program to provide a verdict and redirect
 * decision on the packet are supported. Any programs attached to a sock
 * map are inherited by sock objects when they are added to the map. If
 * no BPF programs are attached the sock object may only be used for sock
 * redirect.
 *
 * A sock object may be in multiple maps, but can only inherit a single
 * parse or verdict program. If adding a sock object to a map would result
 * in having multiple parsing programs the update will return an EBUSY error.
 *
 * For reference this program is similar to devmap used in XDP context
 * reviewing these together may be useful. For an example please review
 * ./samples/bpf/sockmap/.
 */
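/* Illustrative sketch, not used by this file: a minimal SK_SKB stream
 * verdict program of the kind described above. It assumes a sockmap named
 * sock_map_tx created by user space and redirects every skb to the socket
 * stored at index 0 via the bpf_sk_redirect_map() helper. See
 * ./samples/bpf/sockmap/ for complete, buildable programs.
 *
 *	struct bpf_map_def SEC("maps") sock_map_tx = {
 *		.type		= BPF_MAP_TYPE_SOCKMAP,
 *		.key_size	= sizeof(int),
 *		.value_size	= sizeof(int),
 *		.max_entries	= 20,
 *	};
 *
 *	SEC("sk_skb/stream_verdict")
 *	int bpf_prog_verdict(struct __sk_buff *skb)
 *	{
 *		return bpf_sk_redirect_map(skb, &sock_map_tx, 0, 0);
 *	}
 */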
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <net/strparser.h>
#define SOCK_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
struct bpf_stab {
	struct bpf_map map;
	struct sock **sock_map;
	struct bpf_prog *bpf_tx_msg;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
};
enum smap_psock_state {
	SMAP_TX_RUNNING,
};

struct smap_psock_map_entry {
	struct list_head list;
	struct sock **entry;
};
struct smap_psock {
	struct rcu_head	rcu;
	refcount_t refcnt;

	/* datapath variables */
	struct sk_buff_head rxqueue;
	bool strp_enabled;

	/* datapath error path cache across tx work invocations */
	int save_rem;
	int save_off;
	struct sk_buff *save_skb;

	/* datapath variables for tx_msg ULP */
	struct sock *sk_redir;
	int apply_bytes;
	int cork_bytes;
	int sg_size;
	int eval;
	struct sk_msg_buff *cork;

	struct strparser strp;
	struct bpf_prog *bpf_tx_msg;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
	struct list_head maps;

	/* Back reference used when sock callbacks trigger sockmap operations */
	struct sock *sock;
	unsigned long state;

	struct work_struct tx_work;
	struct work_struct gc_work;

	struct proto *sk_proto;
	void (*save_close)(struct sock *sk, long timeout);
	void (*save_data_ready)(struct sock *sk);
	void (*save_write_space)(struct sock *sk);
};
static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
			    int offset, size_t size, int flags);
static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
{
	return rcu_dereference_sk_user_data(sk);
}
static struct proto tcp_bpf_proto;
static int bpf_tcp_init(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock)) {
		rcu_read_unlock();
		return -EINVAL;
	}

	if (unlikely(psock->sk_proto)) {
		rcu_read_unlock();
		return -EBUSY;
	}

	psock->save_close = sk->sk_prot->close;
	psock->sk_proto = sk->sk_prot;

	if (psock->bpf_tx_msg) {
		tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
		tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
	}

	sk->sk_prot = &tcp_bpf_proto;
	rcu_read_unlock();
	return 0;
}
static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
static int free_start_sg(struct sock *sk, struct sk_msg_buff *md);
static void bpf_tcp_release(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock))
		goto out;

	if (psock->cork) {
		free_start_sg(psock->sock, psock->cork);
		kfree(psock->cork);
		psock->cork = NULL;
	}

	sk->sk_prot = psock->sk_proto;
	psock->sk_proto = NULL;
out:
	rcu_read_unlock();
}
static void bpf_tcp_close(struct sock *sk, long timeout)
{
	void (*close_fun)(struct sock *sk, long timeout);
	struct smap_psock_map_entry *e, *tmp;
	struct smap_psock *psock;
	struct sock *osk;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock)) {
		rcu_read_unlock();
		return sk->sk_prot->close(sk, timeout);
	}

	/* The psock may be destroyed anytime after exiting the RCU critical
	 * section so by the time we use close_fun the psock may no longer
	 * be valid. However, bpf_tcp_close is called with the sock lock
	 * held so the close hook and sk are still valid.
	 */
	close_fun = psock->save_close;

	write_lock_bh(&sk->sk_callback_lock);
	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		osk = cmpxchg(e->entry, sk, NULL);
		if (osk == sk) {
			list_del(&e->list);
			smap_release_sock(psock, sk);
		}
	}
	write_unlock_bh(&sk->sk_callback_lock);
	rcu_read_unlock();
	close_fun(sk, timeout);
}
static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
	.name		= "bpf_tcp",
	.uid		= TCP_ULP_BPF,
	.user_visible	= false,
	.owner		= NULL,
	.init		= bpf_tcp_init,
	.release	= bpf_tcp_release,
};
static int memcopy_from_iter(struct sock *sk,
			     struct sk_msg_buff *md,
			     struct iov_iter *from, int bytes)
{
	struct scatterlist *sg = md->sg_data;
	int i = md->sg_curr, rc = -ENOSPC;

	do {
		int copy;
		char *to;

		if (md->sg_copybreak >= sg[i].length) {
			md->sg_copybreak = 0;

			if (++i == MAX_SKB_FRAGS)
				i = 0;

			if (i == md->sg_end)
				break;
		}

		copy = sg[i].length - md->sg_copybreak;
		to = sg_virt(&sg[i]) + md->sg_copybreak;
		md->sg_copybreak += copy;

		if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
			rc = copy_from_iter_nocache(to, copy, from);
		else
			rc = copy_from_iter(to, copy, from);

		if (rc != copy) {
			rc = -EFAULT;
			goto out;
		}

		bytes -= copy;
		if (!bytes)
			break;

		md->sg_copybreak = 0;
		if (++i == MAX_SKB_FRAGS)
			i = 0;
	} while (i != md->sg_end);
out:
	md->sg_curr = i;
	return rc;
}
static int bpf_tcp_push(struct sock *sk, int apply_bytes,
			struct sk_msg_buff *md,
			int flags, bool uncharge)
{
	bool apply = apply_bytes;
	struct scatterlist *sg;
	int offset, ret = 0;
	struct page *p;
	size_t size;

	while (1) {
		sg = md->sg_data + md->sg_start;
		size = (apply && apply_bytes < sg->length) ?
			apply_bytes : sg->length;
		offset = sg->offset;

		tcp_rate_check_app_limited(sk);
		p = sg_page(sg);
retry:
		ret = do_tcp_sendpages(sk, p, offset, size, flags);
		if (ret != size) {
			if (ret > 0) {
				if (apply)
					apply_bytes -= ret;
				size -= ret;
				offset += ret;
				if (uncharge)
					sk_mem_uncharge(sk, ret);
				goto retry;
			}

			sg->length = size;
			sg->offset = offset;
			return ret;
		}

		if (apply)
			apply_bytes -= ret;
		sg->offset += ret;
		sg->length -= ret;
		if (uncharge)
			sk_mem_uncharge(sk, ret);

		if (!sg->length) {
			put_page(p);
			md->sg_start++;
			if (md->sg_start == MAX_SKB_FRAGS)
				md->sg_start = 0;
			memset(sg, 0, sizeof(*sg));

			if (md->sg_start == md->sg_end)
				break;
		}

		if (apply && !apply_bytes)
			break;
	}
	return 0;
}
static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md)
{
	struct scatterlist *sg = md->sg_data + md->sg_start;

	if (md->sg_copy[md->sg_start]) {
		md->data = md->data_end = 0;
	} else {
		md->data = sg_virt(sg);
		md->data_end = md->data + sg->length;
	}
}
static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
{
	struct scatterlist *sg = md->sg_data;
	int i = md->sg_start;

	do {
		int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length;

		sk_mem_uncharge(sk, uncharge);
		bytes -= uncharge;
		if (!bytes)
			break;
		i++;
		if (i == MAX_SKB_FRAGS)
			i = 0;
	} while (i != md->sg_end);
}
static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
{
	struct scatterlist *sg = md->sg_data;
	int i = md->sg_start, free;

	while (bytes && sg[i].length) {
		free = sg[i].length;
		if (bytes < free) {
			sg[i].length -= bytes;
			sg[i].offset += bytes;
			sk_mem_uncharge(sk, bytes);
			break;
		}

		sk_mem_uncharge(sk, sg[i].length);
		put_page(sg_page(&sg[i]));
		bytes -= sg[i].length;
		sg[i].length = 0;
		sg[i].page_link = 0;
		sg[i].offset = 0;
		i++;

		if (i == MAX_SKB_FRAGS)
			i = 0;
	}
}
static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
{
	struct scatterlist *sg = md->sg_data;
	int i = start, free = 0;

	while (sg[i].length) {
		free += sg[i].length;
		sk_mem_uncharge(sk, sg[i].length);
		put_page(sg_page(&sg[i]));
		sg[i].length = 0;
		sg[i].page_link = 0;
		sg[i].offset = 0;
		i++;

		if (i == MAX_SKB_FRAGS)
			i = 0;
	}

	return free;
}
static int free_start_sg(struct sock *sk, struct sk_msg_buff *md)
{
	int free = free_sg(sk, md->sg_start, md);

	md->sg_start = md->sg_end;
	return free;
}
static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
{
	return free_sg(sk, md->sg_curr, md);
}
static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
{
	return ((_rc == SK_PASS) ?
		(md->map ? __SK_REDIRECT : __SK_PASS) :
		__SK_DROP);
}
static unsigned int smap_do_tx_msg(struct sock *sk,
				   struct smap_psock *psock,
				   struct sk_msg_buff *md)
{
	struct bpf_prog *prog;
	unsigned int rc, _rc;

	preempt_disable();
	rcu_read_lock();

	/* If the policy was removed mid-send then default to 'accept' */
	prog = READ_ONCE(psock->bpf_tx_msg);
	if (unlikely(!prog)) {
		_rc = __SK_PASS;
		goto verdict;
	}

	bpf_compute_data_pointers_sg(md);
	rc = (*prog->bpf_func)(md, prog->insnsi);
	psock->apply_bytes = md->apply_bytes;

	/* Moving return codes from UAPI namespace into internal namespace */
	_rc = bpf_map_msg_verdict(rc, md);

	/* The psock has a refcount on the sock but not on the map and because
	 * we need to drop rcu read lock here it's possible the map could be
	 * removed between here and when we need it to execute the sock
	 * redirect. So do the map lookup now for future use.
	 */
	if (_rc == __SK_REDIRECT) {
		if (psock->sk_redir)
			sock_put(psock->sk_redir);
		psock->sk_redir = do_msg_redirect_map(md);
		if (!psock->sk_redir) {
			_rc = __SK_DROP;
			goto verdict;
		}
		sock_hold(psock->sk_redir);
	}
verdict:
	rcu_read_unlock();
	preempt_enable();
	return _rc;
}
static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
				       struct sk_msg_buff *md,
				       int flags)
{
	struct smap_psock *psock;
	struct scatterlist *sg;
	int i, err, free = 0;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock))
		goto out_rcu;

	if (!refcount_inc_not_zero(&psock->refcnt))
		goto out_rcu;

	rcu_read_unlock();
	lock_sock(sk);
	err = bpf_tcp_push(sk, send, md, flags, false);
	release_sock(sk);
	smap_release_sock(psock, sk);
	if (unlikely(err))
		goto out;
	return 0;
out_rcu:
	rcu_read_unlock();
out:
	i = md->sg_start;
	sg = md->sg_data;
	while (sg[i].length) {
		free += sg[i].length;
		put_page(sg_page(&sg[i]));
		sg[i].length = 0;
		i++;
		if (i == MAX_SKB_FRAGS)
			i = 0;
	}
	return free;
}
static inline void bpf_md_init(struct smap_psock *psock)
{
	if (!psock->apply_bytes) {
		psock->eval = __SK_NONE;
		if (psock->sk_redir) {
			sock_put(psock->sk_redir);
			psock->sk_redir = NULL;
		}
	}
}
static void apply_bytes_dec(struct smap_psock *psock, int i)
{
	if (psock->apply_bytes) {
		if (psock->apply_bytes < i)
			psock->apply_bytes = 0;
		else
			psock->apply_bytes -= i;
	}
}
static int bpf_exec_tx_verdict(struct smap_psock *psock,
			       struct sk_msg_buff *m,
			       struct sock *sk,
			       int *copied, int flags)
{
	bool cork = false, enospc = (m->sg_start == m->sg_end);
	struct sock *redir;
	int err = 0;
	int send;

more_data:
	if (psock->eval == __SK_NONE)
		psock->eval = smap_do_tx_msg(sk, psock, m);

	if (m->cork_bytes &&
	    m->cork_bytes > psock->sg_size && !enospc) {
		psock->cork_bytes = m->cork_bytes - psock->sg_size;
		if (!psock->cork) {
			psock->cork = kcalloc(1,
					      sizeof(struct sk_msg_buff),
					      GFP_ATOMIC | __GFP_NOWARN);

			if (!psock->cork) {
				err = -ENOMEM;
				goto out_err;
			}
		}
		memcpy(psock->cork, m, sizeof(*m));
		goto out_err;
	}

	send = psock->sg_size;
	if (psock->apply_bytes && psock->apply_bytes < send)
		send = psock->apply_bytes;

	switch (psock->eval) {
	case __SK_PASS:
		err = bpf_tcp_push(sk, send, m, flags, true);
		if (unlikely(err)) {
			*copied -= free_start_sg(sk, m);
			break;
		}

		apply_bytes_dec(psock, send);
		psock->sg_size -= send;
		break;
	case __SK_REDIRECT:
		redir = psock->sk_redir;
		apply_bytes_dec(psock, send);

		if (psock->cork) {
			cork = true;
			psock->cork = NULL;
		}

		return_mem_sg(sk, send, m);
		release_sock(sk);

		err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
		lock_sock(sk);

		if (cork) {
			free_start_sg(sk, m);
			kfree(m);
			m = NULL;
		}

		if (unlikely(err))
			*copied -= err;
		psock->sg_size -= send;
		break;
	case __SK_DROP:
	default:
		free_bytes_sg(sk, send, m);
		apply_bytes_dec(psock, send);
		*copied -= send;
		psock->sg_size -= send;
		err = -EACCES;
		break;
	}

	if (likely(!err)) {
		bpf_md_init(psock);
		if (m &&
		    m->sg_data[m->sg_start].page_link &&
		    m->sg_data[m->sg_start].length)
			goto more_data;
	}

out_err:
	return err;
}
static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
	struct sk_msg_buff md = {0};
	unsigned int sg_copy = 0;
	struct smap_psock *psock;
	int copied = 0, err = 0;
	struct scatterlist *sg;
	long timeo;

	/* It's possible a sock event or user removed the psock _but_ the ops
	 * have not been reprogrammed yet so we get here. In this case fallback
	 * to tcp_sendmsg. Note this only works because we _only_ ever allow
	 * a single ULP there is no hierarchy here.
	 */
	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock)) {
		rcu_read_unlock();
		return tcp_sendmsg(sk, msg, size);
	}

	/* Increment the psock refcnt to ensure it's not released while sending a
	 * message. Required because sk lookup and bpf programs are used in
	 * separate rcu critical sections. It's OK if we lose the map entry
	 * but we can't lose the sock reference.
	 */
	if (!refcount_inc_not_zero(&psock->refcnt)) {
		rcu_read_unlock();
		return tcp_sendmsg(sk, msg, size);
	}

	sg = md.sg_data;
	sg_init_table(sg, MAX_SKB_FRAGS);
	rcu_read_unlock();

	lock_sock(sk);
	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	while (msg_data_left(msg)) {
		struct sk_msg_buff *m;
		bool enospc = false;
		int copy;

		if (sk->sk_err) {
			err = sk->sk_err;
			goto out_err;
		}

		copy = msg_data_left(msg);
		if (!sk_stream_memory_free(sk))
			goto wait_for_sndbuf;

		m = psock->cork_bytes ? psock->cork : &md;
		m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end;
		err = sk_alloc_sg(sk, copy, m->sg_data,
				  m->sg_start, &m->sg_end, &sg_copy,
				  m->sg_end - 1);
		if (err) {
			if (err != -ENOSPC)
				goto wait_for_memory;
			enospc = true;
			copy = sg_copy;
		}

		err = memcopy_from_iter(sk, m, &msg->msg_iter, copy);
		if (err < 0) {
			free_curr_sg(sk, m);
			goto out_err;
		}

		psock->sg_size += copy;
		copied += copy;
		sg_copy = 0;

		/* When bytes are being corked skip running BPF program and
		 * applying verdict unless there is no more buffer space. In
		 * the ENOSPC case simply run BPF program with currently
		 * accumulated data. We don't have much choice at this point
		 * we could try extending the page frags or chaining complex
		 * frags but even in these cases _eventually_ we will hit an
		 * OOM scenario. More complex recovery schemes may be
		 * implemented in the future, but BPF programs must handle
		 * the case where apply_cork requests are not honored. The
		 * canonical method to verify this is to check data length.
		 */
		if (psock->cork_bytes) {
			if (copy > psock->cork_bytes)
				psock->cork_bytes = 0;
			else
				psock->cork_bytes -= copy;

			if (psock->cork_bytes && !enospc)
				goto out_cork;

			/* All cork bytes accounted for re-run filter */
			psock->eval = __SK_NONE;
			psock->cork_bytes = 0;
		}

		err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
		if (unlikely(err < 0))
			goto out_err;
		continue;
wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		err = sk_stream_wait_memory(sk, &timeo);
		if (err)
			goto out_err;
	}
out_err:
	if (err < 0)
		err = sk_stream_error(sk, msg->msg_flags, err);
out_cork:
	release_sock(sk);
	smap_release_sock(psock, sk);
	return copied ? copied : err;
}
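/* Illustrative sketch, not used by this file: an SK_MSG program of the kind
 * run by smap_do_tx_msg() and bpf_exec_tx_verdict() above. It assumes a
 * sockmap named sock_map_redir created by user space, asks for 512 bytes to
 * be corked before the verdict is re-run, applies the verdict to at most
 * 1024 bytes, and redirects the data to the socket at index 0:
 *
 *	SEC("sk_msg")
 *	int bpf_prog_msg_verdict(struct sk_msg_md *msg)
 *	{
 *		bpf_msg_cork_bytes(msg, 512);
 *		bpf_msg_apply_bytes(msg, 1024);
 *		return bpf_msg_redirect_map(msg, &sock_map_redir, 0, 0);
 *	}
 *
 * As noted in bpf_tcp_sendmsg() above, programs must tolerate cork requests
 * that cannot be honored (e.g. ENOSPC) by checking the data length on the
 * next run.
 */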
static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
			    int offset, size_t size, int flags)
{
	struct sk_msg_buff md = {0}, *m = NULL;
	int err = 0, copied = 0;
	struct smap_psock *psock;
	struct scatterlist *sg;
	bool enospc = false;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock))
		goto accept;

	if (!refcount_inc_not_zero(&psock->refcnt))
		goto accept;
	rcu_read_unlock();

	lock_sock(sk);

	if (psock->cork_bytes)
		m = psock->cork;
	else
		m = &md;

	/* Catch case where ring is full and sendpage is stalled. */
	if (unlikely(m->sg_end == m->sg_start &&
	    m->sg_data[m->sg_end].length))
		goto out_err;

	psock->sg_size += size;
	sg = &m->sg_data[m->sg_end];
	sg_set_page(sg, page, size, offset);
	get_page(page);
	m->sg_copy[m->sg_end] = true;
	sk_mem_charge(sk, size);
	m->sg_end++;
	copied = size;

	if (m->sg_end == MAX_SKB_FRAGS)
		m->sg_end = 0;

	if (m->sg_end == m->sg_start)
		enospc = true;

	if (psock->cork_bytes) {
		if (size > psock->cork_bytes)
			psock->cork_bytes = 0;
		else
			psock->cork_bytes -= size;

		if (psock->cork_bytes && !enospc)
			goto out_err;

		/* All cork bytes accounted for re-run filter */
		psock->eval = __SK_NONE;
		psock->cork_bytes = 0;
	}

	err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
out_err:
	release_sock(sk);
	smap_release_sock(psock, sk);
	return copied ? copied : err;
accept:
	rcu_read_unlock();
	return tcp_sendpage(sk, page, offset, size, flags);
}
static void bpf_tcp_msg_add(struct smap_psock *psock,
			    struct sock *sk,
			    struct bpf_prog *tx_msg)
{
	struct bpf_prog *orig_tx_msg;

	orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg);
	if (orig_tx_msg)
		bpf_prog_put(orig_tx_msg);
}
static int bpf_tcp_ulp_register(void)
{
	tcp_bpf_proto = tcp_prot;
	tcp_bpf_proto.close = bpf_tcp_close;
	/* Once BPF TX ULP is registered it is never unregistered. It
	 * will be in the ULP list for the lifetime of the system. Doing
	 * duplicate registers is not a problem.
	 */
	return tcp_register_ulp(&bpf_tcp_ulp_ops);
}
static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
{
	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
	int rc;

	if (unlikely(!prog))
		return __SK_DROP;

	skb_orphan(skb);
	/* We need to ensure that BPF metadata for maps is also cleared
	 * when we orphan the skb so that we don't have the possibility
	 * to reference a stale map.
	 */
	TCP_SKB_CB(skb)->bpf.map = NULL;
	skb->sk = psock->sock;
	bpf_compute_data_pointers(skb);
	rc = (*prog->bpf_func)(skb, prog->insnsi);
	skb->sk = NULL;

	/* Moving return codes from UAPI namespace into internal namespace */
	return rc == SK_PASS ?
		(TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) :
		__SK_DROP;
}
static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
{
	struct sock *sk;
	int rc;

	rc = smap_verdict_func(psock, skb);
	switch (rc) {
	case __SK_REDIRECT:
		sk = do_sk_redirect_map(skb);
		if (likely(sk)) {
			struct smap_psock *peer = smap_psock_sk(sk);

			if (likely(peer &&
				   test_bit(SMAP_TX_RUNNING, &peer->state) &&
				   !sock_flag(sk, SOCK_DEAD) &&
				   sock_writeable(sk))) {
				skb_set_owner_w(skb, sk);
				skb_queue_tail(&peer->rxqueue, skb);
				schedule_work(&peer->tx_work);
				break;
			}
		}
		/* Fall through and free skb otherwise */
	case __SK_DROP:
	default:
		kfree_skb(skb);
	}
}
static void smap_report_sk_error(struct smap_psock *psock, int err)
{
	struct sock *sk = psock->sock;

	sk->sk_err = err;
	sk->sk_error_report(sk);
}
static void smap_read_sock_strparser(struct strparser *strp,
				     struct sk_buff *skb)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = container_of(strp, struct smap_psock, strp);
	smap_do_verdict(psock, skb);
	rcu_read_unlock();
}
/* Called with lock held on socket */
static void smap_data_ready(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (likely(psock)) {
		write_lock_bh(&sk->sk_callback_lock);
		strp_data_ready(&psock->strp);
		write_unlock_bh(&sk->sk_callback_lock);
	}
	rcu_read_unlock();
}
static void smap_tx_work(struct work_struct *w)
{
	struct smap_psock *psock;
	struct sk_buff *skb;
	int rem, off, n;

	psock = container_of(w, struct smap_psock, tx_work);

	/* lock sock to avoid losing sk_socket at some point during loop */
	lock_sock(psock->sock);
	if (psock->save_skb) {
		skb = psock->save_skb;
		rem = psock->save_rem;
		off = psock->save_off;
		psock->save_skb = NULL;
		goto start;
	}

	while ((skb = skb_dequeue(&psock->rxqueue))) {
		rem = skb->len;
		off = 0;
start:
		do {
			if (likely(psock->sock->sk_socket))
				n = skb_send_sock_locked(psock->sock,
							 skb, off, rem);
			else
				n = -EINVAL;
			if (n <= 0) {
				if (n == -EAGAIN) {
					/* Retry when space is available */
					psock->save_skb = skb;
					psock->save_rem = rem;
					psock->save_off = off;
					goto out;
				}
				/* Hard errors break pipe and stop xmit */
				smap_report_sk_error(psock, n ? -n : EPIPE);
				clear_bit(SMAP_TX_RUNNING, &psock->state);
				kfree_skb(skb);
				goto out;
			}
			rem -= n;
			off += n;
		} while (rem);
		kfree_skb(skb);
	}
out:
	release_sock(psock->sock);
}
static void smap_write_space(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
		schedule_work(&psock->tx_work);
	rcu_read_unlock();
}
static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
{
	if (!psock->strp_enabled)
		return;
	sk->sk_data_ready = psock->save_data_ready;
	sk->sk_write_space = psock->save_write_space;
	psock->save_data_ready = NULL;
	psock->save_write_space = NULL;
	strp_stop(&psock->strp);
	psock->strp_enabled = false;
}
static void smap_destroy_psock(struct rcu_head *rcu)
{
	struct smap_psock *psock = container_of(rcu,
						struct smap_psock, rcu);

	/* Now that a grace period has passed there is no longer
	 * any reference to this sock in the sockmap so we can
	 * destroy the psock, strparser, and bpf programs. But,
	 * because we use workqueue sync operations we can not
	 * do it in rcu context
	 */
	schedule_work(&psock->gc_work);
}
static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
{
	if (refcount_dec_and_test(&psock->refcnt)) {
		tcp_cleanup_ulp(sock);
		smap_stop_sock(psock, sock);
		clear_bit(SMAP_TX_RUNNING, &psock->state);
		rcu_assign_sk_user_data(sock, NULL);
		call_rcu_sched(&psock->rcu, smap_destroy_psock);
	}
}
static int smap_parse_func_strparser(struct strparser *strp,
				     struct sk_buff *skb)
{
	struct smap_psock *psock;
	struct bpf_prog *prog;
	int rc;

	rcu_read_lock();
	psock = container_of(strp, struct smap_psock, strp);
	prog = READ_ONCE(psock->bpf_parse);

	if (unlikely(!prog)) {
		rcu_read_unlock();
		return skb->len;
	}

	/* Attach socket for bpf program to use if needed we can do this
	 * because strparser clones the skb before handing it to an upper
	 * layer, meaning skb_orphan has been called. We NULL sk on the
	 * way out to ensure we don't trigger a BUG_ON in skb/sk operations
	 * later and because we are not charging the memory of this skb to
	 * any socket yet.
	 */
	skb->sk = psock->sock;
	bpf_compute_data_pointers(skb);
	rc = (*prog->bpf_func)(skb, prog->insnsi);
	skb->sk = NULL;
	rcu_read_unlock();
	return rc;
}
static int smap_read_sock_done(struct strparser *strp, int err)
{
	return err;
}
static int smap_init_sock(struct smap_psock *psock,
			  struct sock *sk)
{
	static const struct strp_callbacks cb = {
		.rcv_msg = smap_read_sock_strparser,
		.parse_msg = smap_parse_func_strparser,
		.read_sock_done = smap_read_sock_done,
	};

	return strp_init(&psock->strp, sk, &cb);
}
static void smap_init_progs(struct smap_psock *psock,
			    struct bpf_stab *stab,
			    struct bpf_prog *verdict,
			    struct bpf_prog *parse)
{
	struct bpf_prog *orig_parse, *orig_verdict;

	orig_parse = xchg(&psock->bpf_parse, parse);
	orig_verdict = xchg(&psock->bpf_verdict, verdict);

	if (orig_verdict)
		bpf_prog_put(orig_verdict);
	if (orig_parse)
		bpf_prog_put(orig_parse);
}
static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
{
	if (sk->sk_data_ready == smap_data_ready)
		return;
	psock->save_data_ready = sk->sk_data_ready;
	psock->save_write_space = sk->sk_write_space;
	sk->sk_data_ready = smap_data_ready;
	sk->sk_write_space = smap_write_space;
	psock->strp_enabled = true;
}
static void sock_map_remove_complete(struct bpf_stab *stab)
{
	bpf_map_area_free(stab->sock_map);
	kfree(stab);
}
static void smap_gc_work(struct work_struct *w)
{
	struct smap_psock_map_entry *e, *tmp;
	struct smap_psock *psock;

	psock = container_of(w, struct smap_psock, gc_work);

	/* no callback lock needed because we already detached sockmap ops */
	if (psock->strp_enabled)
		strp_done(&psock->strp);

	cancel_work_sync(&psock->tx_work);
	__skb_queue_purge(&psock->rxqueue);

	/* At this point all strparser and xmit work must be complete */
	if (psock->bpf_parse)
		bpf_prog_put(psock->bpf_parse);
	if (psock->bpf_verdict)
		bpf_prog_put(psock->bpf_verdict);
	if (psock->bpf_tx_msg)
		bpf_prog_put(psock->bpf_tx_msg);

	if (psock->cork) {
		free_start_sg(psock->sock, psock->cork);
		kfree(psock->cork);
	}

	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		list_del(&e->list);
		kfree(e);
	}

	if (psock->sk_redir)
		sock_put(psock->sk_redir);

	sock_put(psock->sock);
	kfree(psock);
}
static struct smap_psock *smap_init_psock(struct sock *sock,
					  struct bpf_stab *stab)
{
	struct smap_psock *psock;

	psock = kzalloc_node(sizeof(struct smap_psock),
			     GFP_ATOMIC | __GFP_NOWARN,
			     stab->map.numa_node);
	if (!psock)
		return ERR_PTR(-ENOMEM);

	psock->eval = __SK_NONE;
	psock->sock = sock;
	skb_queue_head_init(&psock->rxqueue);
	INIT_WORK(&psock->tx_work, smap_tx_work);
	INIT_WORK(&psock->gc_work, smap_gc_work);
	INIT_LIST_HEAD(&psock->maps);
	refcount_set(&psock->refcnt, 1);

	rcu_assign_sk_user_data(sock, psock);
	sock_hold(sock);
	return psock;
}
static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
	struct bpf_stab *stab;
	u64 cost;
	int err;

	if (!capable(CAP_NET_ADMIN))
		return ERR_PTR(-EPERM);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	if (attr->value_size > KMALLOC_MAX_SIZE)
		return ERR_PTR(-E2BIG);

	err = bpf_tcp_ulp_register();
	if (err && err != -EEXIST)
		return ERR_PTR(err);

	stab = kzalloc(sizeof(*stab), GFP_USER);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&stab->map, attr);

	/* make sure page count doesn't overflow */
	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
	err = -EINVAL;
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_stab;

	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(stab->map.pages);
	if (err)
		goto free_stab;

	err = -ENOMEM;
	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
					    sizeof(struct sock *),
					    stab->map.numa_node);
	if (!stab->sock_map)
		goto free_stab;

	return &stab->map;
free_stab:
	kfree(stab);
	return ERR_PTR(err);
}
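/* Illustrative user-space sketch (assumes the tools/lib/bpf helpers): a
 * sockmap is created like any other BPF map, with the 4-byte keys and
 * 4-byte values enforced by sock_map_alloc() above:
 *
 *	int map_fd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP,
 *				    sizeof(int), sizeof(int), 20, 0);
 *	if (map_fd < 0)
 *		perror("bpf_create_map");
 */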
static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
{
	struct smap_psock_map_entry *e, *tmp;

	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		if (e->entry == entry) {
			list_del(&e->list);
			break;
		}
	}
}
static void sock_map_free(struct bpf_map *map)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	int i;

	synchronize_rcu();

	/* At this point no update, lookup or delete operations can happen.
	 * However, be aware we can still get socket state event updates,
	 * and data ready callbacks that reference the psock from sk_user_data.
	 * Also psock worker threads are still in-flight. So smap_release_sock
	 * will only free the psock after cancel_sync on the worker threads
	 * and a grace period expires to ensure psock is really safe to remove.
	 */
	rcu_read_lock();
	for (i = 0; i < stab->map.max_entries; i++) {
		struct smap_psock *psock;
		struct sock *sock;

		sock = xchg(&stab->sock_map[i], NULL);
		if (!sock)
			continue;

		write_lock_bh(&sock->sk_callback_lock);
		psock = smap_psock_sk(sock);
		/* This check handles a racing sock event that can get the
		 * sk_callback_lock before this case but after xchg happens
		 * causing the refcnt to hit zero and sock user data (psock)
		 * to be null and queued for garbage collection.
		 */
		if (likely(psock)) {
			smap_list_remove(psock, &stab->sock_map[i]);
			smap_release_sock(psock, sock);
		}
		write_unlock_bh(&sock->sk_callback_lock);
	}
	rcu_read_unlock();

	sock_map_remove_complete(stab);
}
static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	u32 i = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (i >= stab->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (i == stab->map.max_entries - 1)
		return -ENOENT;
	*next = i + 1;
	return 0;
}
struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);

	if (key >= map->max_entries)
		return NULL;

	return READ_ONCE(stab->sock_map[key]);
}
static int sock_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct smap_psock *psock;
	int k = *(u32 *)key;
	struct sock *sock;

	if (k >= map->max_entries)
		return -EINVAL;

	sock = xchg(&stab->sock_map[k], NULL);
	if (!sock)
		return -EINVAL;

	write_lock_bh(&sock->sk_callback_lock);
	psock = smap_psock_sk(sock);
	if (!psock)
		goto out;

	if (psock->bpf_parse)
		smap_stop_sock(psock, sock);
	smap_list_remove(psock, &stab->sock_map[k]);
	smap_release_sock(psock, sock);
out:
	write_unlock_bh(&sock->sk_callback_lock);
	return 0;
}
/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
 * done inside rcu critical sections. This ensures on updates that the psock
 * will not be released via smap_release_sock() until concurrent updates/deletes
 * complete. All operations operate on sock_map using cmpxchg and xchg
 * operations to ensure we do not get stale references. Any reads into the
 * map must be done with READ_ONCE() because of this.
 *
 * A psock is destroyed via call_rcu and after any worker threads are cancelled
 * and synced, so we are certain all references from the update/lookup/delete
 * operations as well as references in the data path are no longer in use.
 *
 * Psocks may exist in multiple maps, but only a single set of parse/verdict
 * programs may be inherited from the maps it belongs to. A reference count
 * is kept with the total number of references to the psock from all maps. The
 * psock will not be released until this reaches zero. The psock and sock
 * user data use the sk_callback_lock to protect critical data structures
 * from concurrent access. This allows us to avoid two updates from modifying
 * the user data in sock and the lock is required anyway for modifying
 * callbacks, we simply increase its scope slightly.
 *
 * Rules to follow,
 *  - psock must always be read inside RCU critical section
 *  - sk_user_data must only be modified inside sk_callback_lock and read
 *    inside RCU critical section.
 *  - psock->maps list must only be read & modified inside sk_callback_lock
 *  - sock_map must use READ_ONCE and (cmp)xchg operations
 *  - BPF verdict/parse programs must use READ_ONCE and xchg operations
 */
static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
				    struct bpf_map *map,
				    void *key, u64 flags)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct smap_psock_map_entry *e = NULL;
	struct bpf_prog *verdict, *parse, *tx_msg;
	struct sock *osock, *sock;
	struct smap_psock *psock;
	u32 i = *(u32 *)key;
	bool new = false;
	int err;

	if (unlikely(flags > BPF_EXIST))
		return -EINVAL;

	if (unlikely(i >= stab->map.max_entries))
		return -E2BIG;

	sock = READ_ONCE(stab->sock_map[i]);
	if (flags == BPF_EXIST && !sock)
		return -ENOENT;
	else if (flags == BPF_NOEXIST && sock)
		return -EEXIST;

	sock = skops->sk;

	/* 1. If sock map has BPF programs those will be inherited by the
	 * sock being added. If the sock is already attached to BPF programs
	 * this results in an error.
	 */
	verdict = READ_ONCE(stab->bpf_verdict);
	parse = READ_ONCE(stab->bpf_parse);
	tx_msg = READ_ONCE(stab->bpf_tx_msg);

	if (parse && verdict) {
		/* bpf prog refcnt may be zero if a concurrent attach operation
		 * removes the program after the above READ_ONCE() but before
		 * we increment the refcnt. If this is the case abort with an
		 * error.
		 */
		verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
		if (IS_ERR(verdict))
			return PTR_ERR(verdict);

		parse = bpf_prog_inc_not_zero(stab->bpf_parse);
		if (IS_ERR(parse)) {
			bpf_prog_put(verdict);
			return PTR_ERR(parse);
		}
	}

	if (tx_msg) {
		tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg);
		if (IS_ERR(tx_msg)) {
			if (verdict)
				bpf_prog_put(verdict);
			if (parse)
				bpf_prog_put(parse);
			return PTR_ERR(tx_msg);
		}
	}

	write_lock_bh(&sock->sk_callback_lock);
	psock = smap_psock_sk(sock);

	/* 2. Do not allow inheriting programs if psock exists and has
	 * already inherited programs. This would create confusion on
	 * which parser/verdict program is running. If no psock exists
	 * create one. Inside sk_callback_lock to ensure concurrent create
	 * doesn't update user data.
	 */
	if (psock) {
		if (READ_ONCE(psock->bpf_parse) && parse) {
			err = -EBUSY;
			goto out_progs;
		}
		if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) {
			err = -EBUSY;
			goto out_progs;
		}
		if (!refcount_inc_not_zero(&psock->refcnt)) {
			err = -EAGAIN;
			goto out_progs;
		}
	} else {
		psock = smap_init_psock(sock, stab);
		if (IS_ERR(psock)) {
			err = PTR_ERR(psock);
			goto out_progs;
		}

		set_bit(SMAP_TX_RUNNING, &psock->state);
		new = true;
	}

	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
	if (!e) {
		err = -ENOMEM;
		goto out_progs;
	}
	e->entry = &stab->sock_map[i];

	/* 3. At this point we have a reference to a valid psock that is
	 * running. Attach any BPF programs needed.
	 */
	if (tx_msg)
		bpf_tcp_msg_add(psock, sock, tx_msg);
	if (new) {
		err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
		if (err)
			goto out_free;
	}

	if (parse && verdict && !psock->strp_enabled) {
		err = smap_init_sock(psock, sock);
		if (err)
			goto out_free;
		smap_init_progs(psock, stab, verdict, parse);
		smap_start_sock(psock, sock);
	}

	/* 4. Place psock in sockmap for use and stop any programs on
	 * the old sock assuming it's not the same sock we are replacing
	 * it with. Because we can only have a single set of programs if
	 * old_sock has a strp we can stop it.
	 */
	list_add_tail(&e->list, &psock->maps);
	write_unlock_bh(&sock->sk_callback_lock);

	osock = xchg(&stab->sock_map[i], sock);
	if (osock) {
		struct smap_psock *opsock = smap_psock_sk(osock);

		write_lock_bh(&osock->sk_callback_lock);
		smap_list_remove(opsock, &stab->sock_map[i]);
		smap_release_sock(opsock, osock);
		write_unlock_bh(&osock->sk_callback_lock);
	}
	return 0;
out_free:
	smap_release_sock(psock, sock);
out_progs:
	if (verdict)
		bpf_prog_put(verdict);
	if (parse)
		bpf_prog_put(parse);
	if (tx_msg)
		bpf_prog_put(tx_msg);
	write_unlock_bh(&sock->sk_callback_lock);
	kfree(e);
	return err;
}
int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct bpf_prog *orig;

	if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
		return -EINVAL;

	switch (type) {
	case BPF_SK_MSG_VERDICT:
		orig = xchg(&stab->bpf_tx_msg, prog);
		break;
	case BPF_SK_SKB_STREAM_PARSER:
		orig = xchg(&stab->bpf_parse, prog);
		break;
	case BPF_SK_SKB_STREAM_VERDICT:
		orig = xchg(&stab->bpf_verdict, prog);
		break;
	default:
		return -EOPNOTSUPP;
	}

	if (orig)
		bpf_prog_put(orig);

	return 0;
}
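/* Illustrative user-space sketch (assumes the tools/lib/bpf helpers): the
 * parser, verdict and msg programs handled by sock_map_prog() above are
 * attached with the sockmap fd as the attach target:
 *
 *	bpf_prog_attach(parse_fd, map_fd, BPF_SK_SKB_STREAM_PARSER, 0);
 *	bpf_prog_attach(verdict_fd, map_fd, BPF_SK_SKB_STREAM_VERDICT, 0);
 *	bpf_prog_attach(msg_fd, map_fd, BPF_SK_MSG_VERDICT, 0);
 */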
static void *sock_map_lookup(struct bpf_map *map, void *key)
{
	return NULL;
}
static int sock_map_update_elem(struct bpf_map *map,
				void *key, void *value, u64 flags)
{
	struct bpf_sock_ops_kern skops;
	u32 fd = *(u32 *)value;
	struct socket *socket;
	int err;

	socket = sockfd_lookup(fd, &err);
	if (!socket)
		return err;

	skops.sk = socket->sk;
	if (!skops.sk) {
		fput(socket->file);
		return -EINVAL;
	}

	if (skops.sk->sk_type != SOCK_STREAM ||
	    skops.sk->sk_protocol != IPPROTO_TCP) {
		fput(socket->file);
		return -EOPNOTSUPP;
	}

	err = sock_map_ctx_update_elem(&skops, map, key, flags);
	fput(socket->file);
	return err;
}
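/* Illustrative user-space sketch (assumes the tools/lib/bpf helpers): a
 * connected TCP socket is added to the map by storing its fd as the value,
 * which sock_map_update_elem() above resolves to a struct sock:
 *
 *	int key = 0;
 *	int err = bpf_map_update_elem(map_fd, &key, &tcp_sock_fd, BPF_ANY);
 *	if (err)
 *		perror("bpf_map_update_elem");
 */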
static void sock_map_release(struct bpf_map *map, struct file *map_file)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct bpf_prog *orig;

	orig = xchg(&stab->bpf_parse, NULL);
	if (orig)
		bpf_prog_put(orig);
	orig = xchg(&stab->bpf_verdict, NULL);
	if (orig)
		bpf_prog_put(orig);

	orig = xchg(&stab->bpf_tx_msg, NULL);
	if (orig)
		bpf_prog_put(orig);
}
const struct bpf_map_ops sock_map_ops = {
	.map_alloc = sock_map_alloc,
	.map_free = sock_map_free,
	.map_lookup_elem = sock_map_lookup,
	.map_get_next_key = sock_map_get_next_key,
	.map_update_elem = sock_map_update_elem,
	.map_delete_elem = sock_map_delete_elem,
	.map_release = sock_map_release,
};
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
	   struct bpf_map *, map, void *, key, u64, flags)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
}
const struct bpf_func_proto bpf_sock_map_update_proto = {
	.func		= bpf_sock_map_update,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_PTR_TO_MAP_KEY,
	.arg4_type	= ARG_ANYTHING,
};