// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2021, Red Hat.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <net/sock.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include "protocol.h"
17 static struct sock
*__mptcp_tcp_fallback(struct mptcp_sock
*msk
)
19 sock_owned_by_me((const struct sock
*)msk
);
21 if (likely(!__mptcp_check_fallback(msk
)))
27 static u32
sockopt_seq_reset(const struct sock
*sk
)
31 /* Highbits contain state. Allows to distinguish sockopt_seq
32 * of listener and established:
34 * sockopt(s0) - seq is 1
35 * s1 = accept(s0) - s1 inherits seq 1 if listener sk (s0)
36 * sockopt(s0) - seq increments to 2 on s0
37 * sockopt(s1) // seq increments to 2 on s1 (different option)
38 * new ssk completes join, inherits options from s0 // seq 2
39 * Needs sync from mptcp join logic, but ssk->seq == msk->seq
41 * Set High order bits to sk_state so ssk->seq == msk->seq test
45 return (u32
)sk
->sk_state
<< 24u;
48 static void sockopt_seq_inc(struct mptcp_sock
*msk
)
50 u32 seq
= (msk
->setsockopt_seq
+ 1) & 0x00ffffff;
52 msk
->setsockopt_seq
= sockopt_seq_reset((struct sock
*)msk
) + seq
;
55 static int mptcp_get_int_option(struct mptcp_sock
*msk
, sockptr_t optval
,
56 unsigned int optlen
, int *val
)
58 if (optlen
< sizeof(int))
61 if (copy_from_sockptr(val
, optval
, sizeof(*val
)))
67 static void mptcp_sol_socket_sync_intval(struct mptcp_sock
*msk
, int optname
, int val
)
69 struct mptcp_subflow_context
*subflow
;
70 struct sock
*sk
= (struct sock
*)msk
;
75 mptcp_for_each_subflow(msk
, subflow
) {
76 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
77 bool slow
= lock_sock_fast(ssk
);
81 sock_valbool_flag(ssk
, SOCK_DBG
, !!val
);
84 if (ssk
->sk_prot
->keepalive
)
85 ssk
->sk_prot
->keepalive(ssk
, !!val
);
86 sock_valbool_flag(ssk
, SOCK_KEEPOPEN
, !!val
);
89 ssk
->sk_priority
= val
;
93 ssk
->sk_userlocks
|= SOCK_SNDBUF_LOCK
;
94 WRITE_ONCE(ssk
->sk_sndbuf
, sk
->sk_sndbuf
);
98 ssk
->sk_userlocks
|= SOCK_RCVBUF_LOCK
;
99 WRITE_ONCE(ssk
->sk_rcvbuf
, sk
->sk_rcvbuf
);
102 if (READ_ONCE(ssk
->sk_mark
) != sk
->sk_mark
) {
103 ssk
->sk_mark
= sk
->sk_mark
;
107 case SO_INCOMING_CPU
:
108 WRITE_ONCE(ssk
->sk_incoming_cpu
, val
);
112 subflow
->setsockopt_seq
= msk
->setsockopt_seq
;
113 unlock_sock_fast(ssk
, slow
);
119 static int mptcp_sol_socket_intval(struct mptcp_sock
*msk
, int optname
, int val
)
121 sockptr_t optval
= KERNEL_SOCKPTR(&val
);
122 struct sock
*sk
= (struct sock
*)msk
;
125 ret
= sock_setsockopt(sk
->sk_socket
, SOL_SOCKET
, optname
,
126 optval
, sizeof(val
));
130 mptcp_sol_socket_sync_intval(msk
, optname
, val
);
134 static void mptcp_so_incoming_cpu(struct mptcp_sock
*msk
, int val
)
136 struct sock
*sk
= (struct sock
*)msk
;
138 WRITE_ONCE(sk
->sk_incoming_cpu
, val
);
140 mptcp_sol_socket_sync_intval(msk
, SO_INCOMING_CPU
, val
);
143 static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock
*msk
, int optname
, int val
)
145 sockptr_t optval
= KERNEL_SOCKPTR(&val
);
146 struct mptcp_subflow_context
*subflow
;
147 struct sock
*sk
= (struct sock
*)msk
;
150 ret
= sock_setsockopt(sk
->sk_socket
, SOL_SOCKET
, optname
,
151 optval
, sizeof(val
));
156 mptcp_for_each_subflow(msk
, subflow
) {
157 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
158 bool slow
= lock_sock_fast(ssk
);
160 sock_set_timestamp(sk
, optname
, !!val
);
161 unlock_sock_fast(ssk
, slow
);
168 static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock
*msk
, int optname
,
174 ret
= mptcp_get_int_option(msk
, optval
, optlen
, &val
);
180 mptcp_sol_socket_sync_intval(msk
, optname
, val
);
189 return mptcp_sol_socket_intval(msk
, optname
, val
);
190 case SO_INCOMING_CPU
:
191 mptcp_so_incoming_cpu(msk
, val
);
193 case SO_TIMESTAMP_OLD
:
194 case SO_TIMESTAMP_NEW
:
195 case SO_TIMESTAMPNS_OLD
:
196 case SO_TIMESTAMPNS_NEW
:
197 return mptcp_setsockopt_sol_socket_tstamp(msk
, optname
, val
);
203 static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock
*msk
,
208 struct mptcp_subflow_context
*subflow
;
209 struct sock
*sk
= (struct sock
*)msk
;
210 struct so_timestamping timestamping
;
213 if (optlen
== sizeof(timestamping
)) {
214 if (copy_from_sockptr(×tamping
, optval
,
215 sizeof(timestamping
)))
217 } else if (optlen
== sizeof(int)) {
218 memset(×tamping
, 0, sizeof(timestamping
));
220 if (copy_from_sockptr(×tamping
.flags
, optval
, sizeof(int)))
226 ret
= sock_setsockopt(sk
->sk_socket
, SOL_SOCKET
, optname
,
227 KERNEL_SOCKPTR(×tamping
),
228 sizeof(timestamping
));
234 mptcp_for_each_subflow(msk
, subflow
) {
235 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
236 bool slow
= lock_sock_fast(ssk
);
238 sock_set_timestamping(sk
, optname
, timestamping
);
239 unlock_sock_fast(ssk
, slow
);
247 static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock
*msk
, sockptr_t optval
,
250 struct mptcp_subflow_context
*subflow
;
251 struct sock
*sk
= (struct sock
*)msk
;
256 if (optlen
< sizeof(ling
))
259 if (copy_from_sockptr(&ling
, optval
, sizeof(ling
)))
262 kopt
= KERNEL_SOCKPTR(&ling
);
263 ret
= sock_setsockopt(sk
->sk_socket
, SOL_SOCKET
, SO_LINGER
, kopt
, sizeof(ling
));
268 sockopt_seq_inc(msk
);
269 mptcp_for_each_subflow(msk
, subflow
) {
270 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
271 bool slow
= lock_sock_fast(ssk
);
274 sock_reset_flag(ssk
, SOCK_LINGER
);
276 ssk
->sk_lingertime
= sk
->sk_lingertime
;
277 sock_set_flag(ssk
, SOCK_LINGER
);
280 subflow
->setsockopt_seq
= msk
->setsockopt_seq
;
281 unlock_sock_fast(ssk
, slow
);
288 static int mptcp_setsockopt_sol_socket(struct mptcp_sock
*msk
, int optname
,
289 sockptr_t optval
, unsigned int optlen
)
291 struct sock
*sk
= (struct sock
*)msk
;
292 struct socket
*ssock
;
298 case SO_BINDTODEVICE
:
299 case SO_BINDTOIFINDEX
:
301 ssock
= __mptcp_nmpc_socket(msk
);
307 ret
= sock_setsockopt(ssock
, SOL_SOCKET
, optname
, optval
, optlen
);
309 if (optname
== SO_REUSEPORT
)
310 sk
->sk_reuseport
= ssock
->sk
->sk_reuseport
;
311 else if (optname
== SO_REUSEADDR
)
312 sk
->sk_reuse
= ssock
->sk
->sk_reuse
;
313 else if (optname
== SO_BINDTODEVICE
)
314 sk
->sk_bound_dev_if
= ssock
->sk
->sk_bound_dev_if
;
315 else if (optname
== SO_BINDTOIFINDEX
)
316 sk
->sk_bound_dev_if
= ssock
->sk
->sk_bound_dev_if
;
327 case SO_INCOMING_CPU
:
329 case SO_TIMESTAMP_OLD
:
330 case SO_TIMESTAMP_NEW
:
331 case SO_TIMESTAMPNS_OLD
:
332 case SO_TIMESTAMPNS_NEW
:
333 return mptcp_setsockopt_sol_socket_int(msk
, optname
, optval
,
335 case SO_TIMESTAMPING_OLD
:
336 case SO_TIMESTAMPING_NEW
:
337 return mptcp_setsockopt_sol_socket_timestamping(msk
, optname
,
340 return mptcp_setsockopt_sol_socket_linger(msk
, optval
, optlen
);
342 case SO_RCVTIMEO_OLD
:
343 case SO_RCVTIMEO_NEW
:
345 case SO_PREFER_BUSY_POLL
:
346 case SO_BUSY_POLL_BUDGET
:
347 /* No need to copy: only relevant for msk */
348 return sock_setsockopt(sk
->sk_socket
, SOL_SOCKET
, optname
, optval
, optlen
);
358 case SO_SELECT_ERR_QUEUE
:
362 /* SO_OOBINLINE is not supported, let's avoid the related mess
363 * SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF,
364 * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER,
365 * we must be careful with subflows
367 * SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks
368 * explicitly the sk_protocol field
370 * SO_PEEK_OFF is unsupported, as it is for plain TCP
371 * SO_MAX_PACING_RATE is unsupported, we must be careful with subflows
372 * SO_CNX_ADVICE is currently unsupported, could possibly be relevant,
373 * but likely needs careful design
375 * SO_ZEROCOPY is currently unsupported, TODO in sndmsg
376 * SO_TXTIME is currently unsupported
382 static int mptcp_setsockopt_v6(struct mptcp_sock
*msk
, int optname
,
383 sockptr_t optval
, unsigned int optlen
)
385 struct sock
*sk
= (struct sock
*)msk
;
386 int ret
= -EOPNOTSUPP
;
387 struct socket
*ssock
;
392 ssock
= __mptcp_nmpc_socket(msk
);
398 ret
= tcp_setsockopt(ssock
->sk
, SOL_IPV6
, optname
, optval
, optlen
);
400 sk
->sk_ipv6only
= ssock
->sk
->sk_ipv6only
;
409 static bool mptcp_supported_sockopt(int level
, int optname
)
411 if (level
== SOL_IP
) {
413 /* should work fine */
417 /* the following are control cmsg related */
424 case IP_RECVORIGDSTADDR
:
426 case IP_RECVFRAGSIZE
:
428 /* common stuff that need some love */
431 case IP_BIND_ADDRESS_NO_PORT
:
432 case IP_MTU_DISCOVER
:
435 /* possibly less common may deserve some love */
438 /* the following is apparently a no-op for plain TCP */
439 case IP_RECVERR_RFC4884
:
443 /* IP_OPTIONS is not supported, needs subflow care */
444 /* IP_HDRINCL, IP_NODEFRAG are not supported, RAW specific */
445 /* IP_MULTICAST_TTL, IP_MULTICAST_LOOP, IP_UNICAST_IF,
446 * IP_ADD_MEMBERSHIP, IP_ADD_SOURCE_MEMBERSHIP, IP_DROP_MEMBERSHIP,
447 * IP_DROP_SOURCE_MEMBERSHIP, IP_BLOCK_SOURCE, IP_UNBLOCK_SOURCE,
448 * MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP MCAST_JOIN_SOURCE_GROUP,
449 * MCAST_LEAVE_SOURCE_GROUP, MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE,
450 * MCAST_MSFILTER, IP_MULTICAST_ALL are not supported, better not deal
453 /* IP_IPSEC_POLICY, IP_XFRM_POLICY are nut supported, unrelated here */
456 if (level
== SOL_IPV6
) {
460 /* the following are control cmsg related */
461 case IPV6_RECVPKTINFO
:
462 case IPV6_2292PKTINFO
:
463 case IPV6_RECVHOPLIMIT
:
464 case IPV6_2292HOPLIMIT
:
467 case IPV6_RECVHOPOPTS
:
468 case IPV6_2292HOPOPTS
:
469 case IPV6_RECVDSTOPTS
:
470 case IPV6_2292DSTOPTS
:
471 case IPV6_RECVTCLASS
:
473 case IPV6_RECVPATHMTU
:
474 case IPV6_RECVORIGDSTADDR
:
475 case IPV6_RECVFRAGSIZE
:
477 /* the following ones need some love but are quite common */
479 case IPV6_TRANSPARENT
:
482 case IPV6_2292PKTOPTIONS
:
483 case IPV6_UNICAST_HOPS
:
484 case IPV6_MTU_DISCOVER
:
487 case IPV6_FLOWINFO_SEND
:
488 case IPV6_FLOWLABEL_MGR
:
489 case IPV6_MINHOPCOUNT
:
491 case IPV6_AUTOFLOWLABEL
:
493 /* the following one is a no-op for plain TCP */
494 case IPV6_RECVERR_RFC4884
:
498 /* IPV6_HOPOPTS, IPV6_RTHDRDSTOPTS, IPV6_RTHDR, IPV6_DSTOPTS are
501 /* IPV6_MULTICAST_HOPS, IPV6_MULTICAST_LOOP, IPV6_UNICAST_IF,
502 * IPV6_MULTICAST_IF, IPV6_ADDRFORM,
503 * IPV6_ADD_MEMBERSHIP, IPV6_DROP_MEMBERSHIP, IPV6_JOIN_ANYCAST,
504 * IPV6_LEAVE_ANYCAST, IPV6_MULTICAST_ALL, MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP,
505 * MCAST_JOIN_SOURCE_GROUP, MCAST_LEAVE_SOURCE_GROUP,
506 * MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, MCAST_MSFILTER
507 * are not supported better not deal with mcast
509 /* IPV6_ROUTER_ALERT, IPV6_ROUTER_ALERT_ISOLATE are not supported, since are evil */
511 /* IPV6_IPSEC_POLICY, IPV6_XFRM_POLICY are not supported */
512 /* IPV6_ADDR_PREFERENCES is not supported, we must be careful with subflows */
515 if (level
== SOL_TCP
) {
517 /* the following are no-op or should work just fine */
518 case TCP_THIN_DUPACK
:
519 case TCP_DEFER_ACCEPT
:
521 /* the following need some love */
524 case TCP_THIN_LINEAR_TIMEOUTS
:
534 case TCP_WINDOW_CLAMP
:
536 case TCP_USER_TIMEOUT
:
538 case TCP_NOTSENT_LOWAT
:
543 /* TCP_MD5SIG, TCP_MD5SIG_EXT are not supported, MD5 is not compatible with MPTCP */
545 /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS,
546 * TCP_REPAIR_WINDOW are not supported, better avoid this mess
548 /* TCP_FASTOPEN_KEY, TCP_FASTOPEN TCP_FASTOPEN_CONNECT, TCP_FASTOPEN_NO_COOKIE,
549 * are not supported fastopen is currently unsupported
551 /* TCP_INQ is currently unsupported, needs some recvmsg work */
556 static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock
*msk
, sockptr_t optval
,
559 struct mptcp_subflow_context
*subflow
;
560 struct sock
*sk
= (struct sock
*)msk
;
561 char name
[TCP_CA_NAME_MAX
];
568 ret
= strncpy_from_sockptr(name
, optval
,
569 min_t(long, TCP_CA_NAME_MAX
- 1, optlen
));
575 cap_net_admin
= ns_capable(sock_net(sk
)->user_ns
, CAP_NET_ADMIN
);
579 sockopt_seq_inc(msk
);
580 mptcp_for_each_subflow(msk
, subflow
) {
581 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
585 err
= tcp_set_congestion_control(ssk
, name
, true, cap_net_admin
);
586 if (err
< 0 && ret
== 0)
588 subflow
->setsockopt_seq
= msk
->setsockopt_seq
;
593 strcpy(msk
->ca_name
, name
);
599 static int mptcp_setsockopt_sol_tcp(struct mptcp_sock
*msk
, int optname
,
600 sockptr_t optval
, unsigned int optlen
)
606 return mptcp_setsockopt_sol_tcp_congestion(msk
, optval
, optlen
);
612 int mptcp_setsockopt(struct sock
*sk
, int level
, int optname
,
613 sockptr_t optval
, unsigned int optlen
)
615 struct mptcp_sock
*msk
= mptcp_sk(sk
);
618 pr_debug("msk=%p", msk
);
620 if (level
== SOL_SOCKET
)
621 return mptcp_setsockopt_sol_socket(msk
, optname
, optval
, optlen
);
623 if (!mptcp_supported_sockopt(level
, optname
))
626 /* @@ the meaning of setsockopt() when the socket is connected and
627 * there are multiple subflows is not yet defined. It is up to the
628 * MPTCP-level socket to configure the subflows until the subflow
629 * is in TCP fallback, when TCP socket options are passed through
630 * to the one remaining subflow.
633 ssk
= __mptcp_tcp_fallback(msk
);
636 return tcp_setsockopt(ssk
, level
, optname
, optval
, optlen
);
638 if (level
== SOL_IPV6
)
639 return mptcp_setsockopt_v6(msk
, optname
, optval
, optlen
);
641 if (level
== SOL_TCP
)
642 return mptcp_setsockopt_sol_tcp(msk
, optname
, optval
, optlen
);
647 static int mptcp_getsockopt_first_sf_only(struct mptcp_sock
*msk
, int level
, int optname
,
648 char __user
*optval
, int __user
*optlen
)
650 struct sock
*sk
= (struct sock
*)msk
;
651 struct socket
*ssock
;
658 ret
= tcp_getsockopt(ssk
, level
, optname
, optval
, optlen
);
662 ssock
= __mptcp_nmpc_socket(msk
);
666 ret
= tcp_getsockopt(ssock
->sk
, level
, optname
, optval
, optlen
);
673 static int mptcp_getsockopt_sol_tcp(struct mptcp_sock
*msk
, int optname
,
674 char __user
*optval
, int __user
*optlen
)
681 return mptcp_getsockopt_first_sf_only(msk
, SOL_TCP
, optname
,
687 int mptcp_getsockopt(struct sock
*sk
, int level
, int optname
,
688 char __user
*optval
, int __user
*option
)
690 struct mptcp_sock
*msk
= mptcp_sk(sk
);
693 pr_debug("msk=%p", msk
);
695 /* @@ the meaning of setsockopt() when the socket is connected and
696 * there are multiple subflows is not yet defined. It is up to the
697 * MPTCP-level socket to configure the subflows until the subflow
698 * is in TCP fallback, when socket options are passed through
699 * to the one remaining subflow.
702 ssk
= __mptcp_tcp_fallback(msk
);
705 return tcp_getsockopt(ssk
, level
, optname
, optval
, option
);
707 if (level
== SOL_TCP
)
708 return mptcp_getsockopt_sol_tcp(msk
, optname
, optval
, option
);
712 static void sync_socket_options(struct mptcp_sock
*msk
, struct sock
*ssk
)
714 static const unsigned int tx_rx_locks
= SOCK_RCVBUF_LOCK
| SOCK_SNDBUF_LOCK
;
715 struct sock
*sk
= (struct sock
*)msk
;
717 if (ssk
->sk_prot
->keepalive
) {
718 if (sock_flag(sk
, SOCK_KEEPOPEN
))
719 ssk
->sk_prot
->keepalive(ssk
, 1);
721 ssk
->sk_prot
->keepalive(ssk
, 0);
724 ssk
->sk_priority
= sk
->sk_priority
;
725 ssk
->sk_bound_dev_if
= sk
->sk_bound_dev_if
;
726 ssk
->sk_incoming_cpu
= sk
->sk_incoming_cpu
;
728 if (sk
->sk_userlocks
& tx_rx_locks
) {
729 ssk
->sk_userlocks
|= sk
->sk_userlocks
& tx_rx_locks
;
730 if (sk
->sk_userlocks
& SOCK_SNDBUF_LOCK
)
731 WRITE_ONCE(ssk
->sk_sndbuf
, sk
->sk_sndbuf
);
732 if (sk
->sk_userlocks
& SOCK_RCVBUF_LOCK
)
733 WRITE_ONCE(ssk
->sk_rcvbuf
, sk
->sk_rcvbuf
);
736 if (sock_flag(sk
, SOCK_LINGER
)) {
737 ssk
->sk_lingertime
= sk
->sk_lingertime
;
738 sock_set_flag(ssk
, SOCK_LINGER
);
740 sock_reset_flag(ssk
, SOCK_LINGER
);
743 if (sk
->sk_mark
!= ssk
->sk_mark
) {
744 ssk
->sk_mark
= sk
->sk_mark
;
748 sock_valbool_flag(ssk
, SOCK_DBG
, sock_flag(sk
, SOCK_DBG
));
750 if (inet_csk(sk
)->icsk_ca_ops
!= inet_csk(ssk
)->icsk_ca_ops
)
751 tcp_set_congestion_control(ssk
, msk
->ca_name
, false, true);
754 static void __mptcp_sockopt_sync(struct mptcp_sock
*msk
, struct sock
*ssk
)
756 bool slow
= lock_sock_fast(ssk
);
758 sync_socket_options(msk
, ssk
);
760 unlock_sock_fast(ssk
, slow
);
763 void mptcp_sockopt_sync(struct mptcp_sock
*msk
, struct sock
*ssk
)
765 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(ssk
);
767 msk_owned_by_me(msk
);
769 if (READ_ONCE(subflow
->setsockopt_seq
) != msk
->setsockopt_seq
) {
770 __mptcp_sockopt_sync(msk
, ssk
);
772 subflow
->setsockopt_seq
= msk
->setsockopt_seq
;
776 void mptcp_sockopt_sync_all(struct mptcp_sock
*msk
)
778 struct mptcp_subflow_context
*subflow
;
779 struct sock
*sk
= (struct sock
*)msk
;
782 seq
= sockopt_seq_reset(sk
);
784 mptcp_for_each_subflow(msk
, subflow
) {
785 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
786 u32 sseq
= READ_ONCE(subflow
->setsockopt_seq
);
788 if (sseq
!= msk
->setsockopt_seq
) {
789 __mptcp_sockopt_sync(msk
, ssk
);
790 WRITE_ONCE(subflow
->setsockopt_seq
, seq
);
791 } else if (sseq
!= seq
) {
792 WRITE_ONCE(subflow
->setsockopt_seq
, seq
);
798 msk
->setsockopt_seq
= seq
;