]>
git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blob - net/mptcp/sockopt.c
1 // SPDX-License-Identifier: GPL-2.0
4 * Copyright (c) 2021, Red Hat.
7 #define pr_fmt(fmt) "MPTCP: " fmt
9 #include <linux/kernel.h>
10 #include <linux/module.h>
12 #include <net/protocol.h>
14 #include <net/mptcp.h>
17 static struct sock
*__mptcp_tcp_fallback(struct mptcp_sock
*msk
)
19 sock_owned_by_me((const struct sock
*)msk
);
21 if (likely(!__mptcp_check_fallback(msk
)))
27 static u32
sockopt_seq_reset(const struct sock
*sk
)
31 /* Highbits contain state. Allows to distinguish sockopt_seq
32 * of listener and established:
34 * sockopt(s0) - seq is 1
35 * s1 = accept(s0) - s1 inherits seq 1 if listener sk (s0)
36 * sockopt(s0) - seq increments to 2 on s0
37 * sockopt(s1) // seq increments to 2 on s1 (different option)
38 * new ssk completes join, inherits options from s0 // seq 2
39 * Needs sync from mptcp join logic, but ssk->seq == msk->seq
41 * Set High order bits to sk_state so ssk->seq == msk->seq test
45 return (u32
)sk
->sk_state
<< 24u;
48 static void sockopt_seq_inc(struct mptcp_sock
*msk
)
50 u32 seq
= (msk
->setsockopt_seq
+ 1) & 0x00ffffff;
52 msk
->setsockopt_seq
= sockopt_seq_reset((struct sock
*)msk
) + seq
;
55 static int mptcp_get_int_option(struct mptcp_sock
*msk
, sockptr_t optval
,
56 unsigned int optlen
, int *val
)
58 if (optlen
< sizeof(int))
61 if (copy_from_sockptr(val
, optval
, sizeof(*val
)))
67 static void mptcp_sol_socket_sync_intval(struct mptcp_sock
*msk
, int optname
, int val
)
69 struct mptcp_subflow_context
*subflow
;
70 struct sock
*sk
= (struct sock
*)msk
;
75 mptcp_for_each_subflow(msk
, subflow
) {
76 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
77 bool slow
= lock_sock_fast(ssk
);
81 sock_valbool_flag(ssk
, SOCK_DBG
, !!val
);
84 if (ssk
->sk_prot
->keepalive
)
85 ssk
->sk_prot
->keepalive(ssk
, !!val
);
86 sock_valbool_flag(ssk
, SOCK_KEEPOPEN
, !!val
);
89 ssk
->sk_priority
= val
;
93 ssk
->sk_userlocks
|= SOCK_SNDBUF_LOCK
;
94 WRITE_ONCE(ssk
->sk_sndbuf
, sk
->sk_sndbuf
);
98 ssk
->sk_userlocks
|= SOCK_RCVBUF_LOCK
;
99 WRITE_ONCE(ssk
->sk_rcvbuf
, sk
->sk_rcvbuf
);
102 if (READ_ONCE(ssk
->sk_mark
) != sk
->sk_mark
) {
103 ssk
->sk_mark
= sk
->sk_mark
;
107 case SO_INCOMING_CPU
:
108 WRITE_ONCE(ssk
->sk_incoming_cpu
, val
);
112 subflow
->setsockopt_seq
= msk
->setsockopt_seq
;
113 unlock_sock_fast(ssk
, slow
);
119 static int mptcp_sol_socket_intval(struct mptcp_sock
*msk
, int optname
, int val
)
121 sockptr_t optval
= KERNEL_SOCKPTR(&val
);
122 struct sock
*sk
= (struct sock
*)msk
;
125 ret
= sock_setsockopt(sk
->sk_socket
, SOL_SOCKET
, optname
,
126 optval
, sizeof(val
));
130 mptcp_sol_socket_sync_intval(msk
, optname
, val
);
134 static void mptcp_so_incoming_cpu(struct mptcp_sock
*msk
, int val
)
136 struct sock
*sk
= (struct sock
*)msk
;
138 WRITE_ONCE(sk
->sk_incoming_cpu
, val
);
140 mptcp_sol_socket_sync_intval(msk
, SO_INCOMING_CPU
, val
);
143 static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock
*msk
, int optname
, int val
)
145 sockptr_t optval
= KERNEL_SOCKPTR(&val
);
146 struct mptcp_subflow_context
*subflow
;
147 struct sock
*sk
= (struct sock
*)msk
;
150 ret
= sock_setsockopt(sk
->sk_socket
, SOL_SOCKET
, optname
,
151 optval
, sizeof(val
));
156 mptcp_for_each_subflow(msk
, subflow
) {
157 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
158 bool slow
= lock_sock_fast(ssk
);
161 case SO_TIMESTAMP_OLD
:
162 case SO_TIMESTAMP_NEW
:
163 case SO_TIMESTAMPNS_OLD
:
164 case SO_TIMESTAMPNS_NEW
:
165 sock_set_timestamp(sk
, optname
, !!val
);
167 case SO_TIMESTAMPING_NEW
:
168 case SO_TIMESTAMPING_OLD
:
169 sock_set_timestamping(sk
, optname
, val
);
173 unlock_sock_fast(ssk
, slow
);
180 static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock
*msk
, int optname
,
181 sockptr_t optval
, unsigned int optlen
)
185 ret
= mptcp_get_int_option(msk
, optval
, optlen
, &val
);
191 mptcp_sol_socket_sync_intval(msk
, optname
, val
);
200 return mptcp_sol_socket_intval(msk
, optname
, val
);
201 case SO_INCOMING_CPU
:
202 mptcp_so_incoming_cpu(msk
, val
);
204 case SO_TIMESTAMP_OLD
:
205 case SO_TIMESTAMP_NEW
:
206 case SO_TIMESTAMPNS_OLD
:
207 case SO_TIMESTAMPNS_NEW
:
208 case SO_TIMESTAMPING_OLD
:
209 case SO_TIMESTAMPING_NEW
:
210 return mptcp_setsockopt_sol_socket_tstamp(msk
, optname
, val
);
216 static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock
*msk
, sockptr_t optval
,
219 struct mptcp_subflow_context
*subflow
;
220 struct sock
*sk
= (struct sock
*)msk
;
225 if (optlen
< sizeof(ling
))
228 if (copy_from_sockptr(&ling
, optval
, sizeof(ling
)))
231 kopt
= KERNEL_SOCKPTR(&ling
);
232 ret
= sock_setsockopt(sk
->sk_socket
, SOL_SOCKET
, SO_LINGER
, kopt
, sizeof(ling
));
237 sockopt_seq_inc(msk
);
238 mptcp_for_each_subflow(msk
, subflow
) {
239 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
240 bool slow
= lock_sock_fast(ssk
);
243 sock_reset_flag(ssk
, SOCK_LINGER
);
245 ssk
->sk_lingertime
= sk
->sk_lingertime
;
246 sock_set_flag(ssk
, SOCK_LINGER
);
249 subflow
->setsockopt_seq
= msk
->setsockopt_seq
;
250 unlock_sock_fast(ssk
, slow
);
257 static int mptcp_setsockopt_sol_socket(struct mptcp_sock
*msk
, int optname
,
258 sockptr_t optval
, unsigned int optlen
)
260 struct sock
*sk
= (struct sock
*)msk
;
261 struct socket
*ssock
;
267 case SO_BINDTODEVICE
:
268 case SO_BINDTOIFINDEX
:
270 ssock
= __mptcp_nmpc_socket(msk
);
276 ret
= sock_setsockopt(ssock
, SOL_SOCKET
, optname
, optval
, optlen
);
278 if (optname
== SO_REUSEPORT
)
279 sk
->sk_reuseport
= ssock
->sk
->sk_reuseport
;
280 else if (optname
== SO_REUSEADDR
)
281 sk
->sk_reuse
= ssock
->sk
->sk_reuse
;
282 else if (optname
== SO_BINDTODEVICE
)
283 sk
->sk_bound_dev_if
= ssock
->sk
->sk_bound_dev_if
;
284 else if (optname
== SO_BINDTOIFINDEX
)
285 sk
->sk_bound_dev_if
= ssock
->sk
->sk_bound_dev_if
;
296 case SO_INCOMING_CPU
:
298 case SO_TIMESTAMP_OLD
:
299 case SO_TIMESTAMP_NEW
:
300 case SO_TIMESTAMPNS_OLD
:
301 case SO_TIMESTAMPNS_NEW
:
302 case SO_TIMESTAMPING_OLD
:
303 case SO_TIMESTAMPING_NEW
:
304 return mptcp_setsockopt_sol_socket_int(msk
, optname
, optval
, optlen
);
306 return mptcp_setsockopt_sol_socket_linger(msk
, optval
, optlen
);
308 case SO_RCVTIMEO_OLD
:
309 case SO_RCVTIMEO_NEW
:
311 case SO_PREFER_BUSY_POLL
:
312 case SO_BUSY_POLL_BUDGET
:
313 /* No need to copy: only relevant for msk */
314 return sock_setsockopt(sk
->sk_socket
, SOL_SOCKET
, optname
, optval
, optlen
);
324 case SO_SELECT_ERR_QUEUE
:
328 /* SO_OOBINLINE is not supported, let's avoid the related mess
329 * SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF,
330 * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER,
331 * we must be careful with subflows
333 * SO_ATTACH_REUSEPORT_EBPF is not supported, as it checks
334 * explicitly the sk_protocol field
336 * SO_PEEK_OFF is unsupported, as it is for plain TCP
337 * SO_MAX_PACING_RATE is unsupported, we must be careful with subflows
338 * SO_CNX_ADVICE is currently unsupported, could possibly be relevant,
339 * but likely needs careful design
341 * SO_ZEROCOPY is currently unsupported, TODO in sndmsg
342 * SO_TXTIME is currently unsupported
348 static int mptcp_setsockopt_v6(struct mptcp_sock
*msk
, int optname
,
349 sockptr_t optval
, unsigned int optlen
)
351 struct sock
*sk
= (struct sock
*)msk
;
352 int ret
= -EOPNOTSUPP
;
353 struct socket
*ssock
;
358 ssock
= __mptcp_nmpc_socket(msk
);
364 ret
= tcp_setsockopt(ssock
->sk
, SOL_IPV6
, optname
, optval
, optlen
);
366 sk
->sk_ipv6only
= ssock
->sk
->sk_ipv6only
;
375 static bool mptcp_supported_sockopt(int level
, int optname
)
377 if (level
== SOL_IP
) {
379 /* should work fine */
383 /* the following are control cmsg related */
390 case IP_RECVORIGDSTADDR
:
392 case IP_RECVFRAGSIZE
:
394 /* common stuff that need some love */
397 case IP_BIND_ADDRESS_NO_PORT
:
398 case IP_MTU_DISCOVER
:
401 /* possibly less common may deserve some love */
404 /* the following is apparently a no-op for plain TCP */
405 case IP_RECVERR_RFC4884
:
409 /* IP_OPTIONS is not supported, needs subflow care */
410 /* IP_HDRINCL, IP_NODEFRAG are not supported, RAW specific */
411 /* IP_MULTICAST_TTL, IP_MULTICAST_LOOP, IP_UNICAST_IF,
412 * IP_ADD_MEMBERSHIP, IP_ADD_SOURCE_MEMBERSHIP, IP_DROP_MEMBERSHIP,
413 * IP_DROP_SOURCE_MEMBERSHIP, IP_BLOCK_SOURCE, IP_UNBLOCK_SOURCE,
414 * MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP, MCAST_JOIN_SOURCE_GROUP,
415 * MCAST_LEAVE_SOURCE_GROUP, MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE,
416 * MCAST_MSFILTER, IP_MULTICAST_ALL are not supported, better not deal
419 /* IP_IPSEC_POLICY, IP_XFRM_POLICY are not supported, unrelated here */
422 if (level
== SOL_IPV6
) {
426 /* the following are control cmsg related */
427 case IPV6_RECVPKTINFO
:
428 case IPV6_2292PKTINFO
:
429 case IPV6_RECVHOPLIMIT
:
430 case IPV6_2292HOPLIMIT
:
433 case IPV6_RECVHOPOPTS
:
434 case IPV6_2292HOPOPTS
:
435 case IPV6_RECVDSTOPTS
:
436 case IPV6_2292DSTOPTS
:
437 case IPV6_RECVTCLASS
:
439 case IPV6_RECVPATHMTU
:
440 case IPV6_RECVORIGDSTADDR
:
441 case IPV6_RECVFRAGSIZE
:
443 /* the following ones need some love but are quite common */
445 case IPV6_TRANSPARENT
:
448 case IPV6_2292PKTOPTIONS
:
449 case IPV6_UNICAST_HOPS
:
450 case IPV6_MTU_DISCOVER
:
453 case IPV6_FLOWINFO_SEND
:
454 case IPV6_FLOWLABEL_MGR
:
455 case IPV6_MINHOPCOUNT
:
457 case IPV6_AUTOFLOWLABEL
:
459 /* the following one is a no-op for plain TCP */
460 case IPV6_RECVERR_RFC4884
:
464 /* IPV6_HOPOPTS, IPV6_RTHDRDSTOPTS, IPV6_RTHDR, IPV6_DSTOPTS are
467 /* IPV6_MULTICAST_HOPS, IPV6_MULTICAST_LOOP, IPV6_UNICAST_IF,
468 * IPV6_MULTICAST_IF, IPV6_ADDRFORM,
469 * IPV6_ADD_MEMBERSHIP, IPV6_DROP_MEMBERSHIP, IPV6_JOIN_ANYCAST,
470 * IPV6_LEAVE_ANYCAST, IPV6_MULTICAST_ALL, MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP,
471 * MCAST_JOIN_SOURCE_GROUP, MCAST_LEAVE_SOURCE_GROUP,
472 * MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, MCAST_MSFILTER
473 * are not supported, better not deal with mcast
475 /* IPV6_ROUTER_ALERT, IPV6_ROUTER_ALERT_ISOLATE are not supported, since they are evil */
477 /* IPV6_IPSEC_POLICY, IPV6_XFRM_POLICY are not supported */
478 /* IPV6_ADDR_PREFERENCES is not supported, we must be careful with subflows */
481 if (level
== SOL_TCP
) {
483 /* the following are no-op or should work just fine */
484 case TCP_THIN_DUPACK
:
485 case TCP_DEFER_ACCEPT
:
487 /* the following need some love */
490 case TCP_THIN_LINEAR_TIMEOUTS
:
500 case TCP_WINDOW_CLAMP
:
502 case TCP_USER_TIMEOUT
:
504 case TCP_NOTSENT_LOWAT
:
509 /* TCP_MD5SIG, TCP_MD5SIG_EXT are not supported, MD5 is not compatible with MPTCP */
511 /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS,
512 * TCP_REPAIR_WINDOW are not supported, better avoid this mess
514 /* TCP_FASTOPEN_KEY, TCP_FASTOPEN, TCP_FASTOPEN_CONNECT, TCP_FASTOPEN_NO_COOKIE
515 * are not supported: fastopen is currently unsupported
517 /* TCP_INQ is currently unsupported, needs some recvmsg work */
522 static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock
*msk
, sockptr_t optval
,
525 struct mptcp_subflow_context
*subflow
;
526 struct sock
*sk
= (struct sock
*)msk
;
527 char name
[TCP_CA_NAME_MAX
];
534 ret
= strncpy_from_sockptr(name
, optval
,
535 min_t(long, TCP_CA_NAME_MAX
- 1, optlen
));
541 cap_net_admin
= ns_capable(sock_net(sk
)->user_ns
, CAP_NET_ADMIN
);
545 sockopt_seq_inc(msk
);
546 mptcp_for_each_subflow(msk
, subflow
) {
547 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
551 err
= tcp_set_congestion_control(ssk
, name
, true, cap_net_admin
);
552 if (err
< 0 && ret
== 0)
554 subflow
->setsockopt_seq
= msk
->setsockopt_seq
;
559 strcpy(msk
->ca_name
, name
);
565 static int mptcp_setsockopt_sol_tcp(struct mptcp_sock
*msk
, int optname
,
566 sockptr_t optval
, unsigned int optlen
)
572 return mptcp_setsockopt_sol_tcp_congestion(msk
, optval
, optlen
);
578 int mptcp_setsockopt(struct sock
*sk
, int level
, int optname
,
579 sockptr_t optval
, unsigned int optlen
)
581 struct mptcp_sock
*msk
= mptcp_sk(sk
);
584 pr_debug("msk=%p", msk
);
586 if (level
== SOL_SOCKET
)
587 return mptcp_setsockopt_sol_socket(msk
, optname
, optval
, optlen
);
589 if (!mptcp_supported_sockopt(level
, optname
))
592 /* @@ the meaning of setsockopt() when the socket is connected and
593 * there are multiple subflows is not yet defined. It is up to the
594 * MPTCP-level socket to configure the subflows until the subflow
595 * is in TCP fallback, when TCP socket options are passed through
596 * to the one remaining subflow.
599 ssk
= __mptcp_tcp_fallback(msk
);
602 return tcp_setsockopt(ssk
, level
, optname
, optval
, optlen
);
604 if (level
== SOL_IPV6
)
605 return mptcp_setsockopt_v6(msk
, optname
, optval
, optlen
);
607 if (level
== SOL_TCP
)
608 return mptcp_setsockopt_sol_tcp(msk
, optname
, optval
, optlen
);
613 static int mptcp_getsockopt_first_sf_only(struct mptcp_sock
*msk
, int level
, int optname
,
614 char __user
*optval
, int __user
*optlen
)
616 struct sock
*sk
= (struct sock
*)msk
;
617 struct socket
*ssock
;
624 ret
= tcp_getsockopt(ssk
, level
, optname
, optval
, optlen
);
628 ssock
= __mptcp_nmpc_socket(msk
);
632 ret
= tcp_getsockopt(ssock
->sk
, level
, optname
, optval
, optlen
);
639 static int mptcp_getsockopt_sol_tcp(struct mptcp_sock
*msk
, int optname
,
640 char __user
*optval
, int __user
*optlen
)
647 return mptcp_getsockopt_first_sf_only(msk
, SOL_TCP
, optname
,
653 int mptcp_getsockopt(struct sock
*sk
, int level
, int optname
,
654 char __user
*optval
, int __user
*option
)
656 struct mptcp_sock
*msk
= mptcp_sk(sk
);
659 pr_debug("msk=%p", msk
);
661 /* @@ the meaning of setsockopt() when the socket is connected and
662 * there are multiple subflows is not yet defined. It is up to the
663 * MPTCP-level socket to configure the subflows until the subflow
664 * is in TCP fallback, when socket options are passed through
665 * to the one remaining subflow.
668 ssk
= __mptcp_tcp_fallback(msk
);
671 return tcp_getsockopt(ssk
, level
, optname
, optval
, option
);
673 if (level
== SOL_TCP
)
674 return mptcp_getsockopt_sol_tcp(msk
, optname
, optval
, option
);
678 static void sync_socket_options(struct mptcp_sock
*msk
, struct sock
*ssk
)
680 static const unsigned int tx_rx_locks
= SOCK_RCVBUF_LOCK
| SOCK_SNDBUF_LOCK
;
681 struct sock
*sk
= (struct sock
*)msk
;
683 if (ssk
->sk_prot
->keepalive
) {
684 if (sock_flag(sk
, SOCK_KEEPOPEN
))
685 ssk
->sk_prot
->keepalive(ssk
, 1);
687 ssk
->sk_prot
->keepalive(ssk
, 0);
690 ssk
->sk_priority
= sk
->sk_priority
;
691 ssk
->sk_bound_dev_if
= sk
->sk_bound_dev_if
;
692 ssk
->sk_incoming_cpu
= sk
->sk_incoming_cpu
;
694 if (sk
->sk_userlocks
& tx_rx_locks
) {
695 ssk
->sk_userlocks
|= sk
->sk_userlocks
& tx_rx_locks
;
696 if (sk
->sk_userlocks
& SOCK_SNDBUF_LOCK
)
697 WRITE_ONCE(ssk
->sk_sndbuf
, sk
->sk_sndbuf
);
698 if (sk
->sk_userlocks
& SOCK_RCVBUF_LOCK
)
699 WRITE_ONCE(ssk
->sk_rcvbuf
, sk
->sk_rcvbuf
);
702 if (sock_flag(sk
, SOCK_LINGER
)) {
703 ssk
->sk_lingertime
= sk
->sk_lingertime
;
704 sock_set_flag(ssk
, SOCK_LINGER
);
706 sock_reset_flag(ssk
, SOCK_LINGER
);
709 if (sk
->sk_mark
!= ssk
->sk_mark
) {
710 ssk
->sk_mark
= sk
->sk_mark
;
714 sock_valbool_flag(ssk
, SOCK_DBG
, sock_flag(sk
, SOCK_DBG
));
716 if (inet_csk(sk
)->icsk_ca_ops
!= inet_csk(ssk
)->icsk_ca_ops
)
717 tcp_set_congestion_control(ssk
, msk
->ca_name
, false, true);
720 static void __mptcp_sockopt_sync(struct mptcp_sock
*msk
, struct sock
*ssk
)
722 bool slow
= lock_sock_fast(ssk
);
724 sync_socket_options(msk
, ssk
);
726 unlock_sock_fast(ssk
, slow
);
729 void mptcp_sockopt_sync(struct mptcp_sock
*msk
, struct sock
*ssk
)
731 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(ssk
);
733 msk_owned_by_me(msk
);
735 if (READ_ONCE(subflow
->setsockopt_seq
) != msk
->setsockopt_seq
) {
736 __mptcp_sockopt_sync(msk
, ssk
);
738 subflow
->setsockopt_seq
= msk
->setsockopt_seq
;
742 void mptcp_sockopt_sync_all(struct mptcp_sock
*msk
)
744 struct mptcp_subflow_context
*subflow
;
745 struct sock
*sk
= (struct sock
*)msk
;
748 seq
= sockopt_seq_reset(sk
);
750 mptcp_for_each_subflow(msk
, subflow
) {
751 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
752 u32 sseq
= READ_ONCE(subflow
->setsockopt_seq
);
754 if (sseq
!= msk
->setsockopt_seq
) {
755 __mptcp_sockopt_sync(msk
, ssk
);
756 WRITE_ONCE(subflow
->setsockopt_seq
, seq
);
757 } else if (sseq
!= seq
) {
758 WRITE_ONCE(subflow
->setsockopt_seq
, seq
);
764 msk
->setsockopt_seq
= seq
;