// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/tcp_states.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"

#define MPTCP_SAME_STATE TCP_MAX_STATES

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct mptcp6_sock {
	struct mptcp_sock msk;
	struct ipv6_pinfo np;
};
#endif

struct mptcp_skb_cb {
	u32 offset;
};

#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
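
/* MPTCP keeps its per-skb metadata in the skb control block once an skb has
 * been moved to the msk receive queue; the BUILD_BUG_ON() in
 * mptcp_proto_init() below checks that struct mptcp_skb_cb fits in skb->cb.
 */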

static struct percpu_counter mptcp_sockets_allocated;

/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
 * completed yet or has failed, return the subflow socket.
 * Otherwise return NULL.
 */
static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
{
	if (!msk->subflow || READ_ONCE(msk->can_ack))
		return NULL;

	return msk->subflow;
}

static bool mptcp_is_tcpsk(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;

	if (unlikely(sk->sk_prot == &tcp_prot)) {
		/* we are being invoked after mptcp_accept() has
		 * accepted a non-mp-capable flow: sk is a tcp_sk,
		 * not an mptcp one.
		 *
		 * Hand the socket over to tcp so all further socket ops
		 * bypass mptcp.
		 */
		sock->ops = &inet_stream_ops;
		return true;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	} else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
		sock->ops = &inet6_stream_ops;
		return true;
#endif
	}

	return false;
}

static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
	sock_owned_by_me((const struct sock *)msk);

	if (likely(!__mptcp_check_fallback(msk)))
		return NULL;

	return msk->first;
}

static int __mptcp_socket_create(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int err;

	err = mptcp_subflow_create_socket(sk, &ssock);
	if (err)
		return err;

	msk->first = ssock->sk;
	msk->subflow = ssock;
	subflow = mptcp_subflow_ctx(ssock->sk);
	list_add(&subflow->node, &msk->conn_list);
	subflow->request_mptcp = 1;

	/* accept() will wait on first subflow sk_wq, and we always wake up
	 * via msk->sk_socket
	 */
	RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);

	return 0;
}

static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
			     struct sk_buff *skb,
			     unsigned int offset, size_t copy_len)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *tail;

	__skb_unlink(skb, &ssk->sk_receive_queue);

	skb_ext_reset(skb);
	skb_orphan(skb);
	WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);

	tail = skb_peek_tail(&sk->sk_receive_queue);
	if (offset == 0 && tail) {
		bool fragstolen;
		int delta;

		if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
			kfree_skb_partial(skb, fragstolen);
			atomic_add(delta, &sk->sk_rmem_alloc);
			sk_mem_charge(sk, delta);
			return;
		}
	}

	skb_set_owner_r(skb, sk);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	MPTCP_SKB_CB(skb)->offset = offset;
}
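
/* After __mptcp_move_skb() the skb is owned by the msk: it is charged
 * against the MPTCP-level receive buffer, msk->ack_seq has advanced by
 * copy_len, and an in-order skb may have been coalesced with the queue
 * tail to reduce truesize overhead.
 */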

static void mptcp_stop_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
	mptcp_sk(sk)->timer_ival = 0;
}

/* both sockets must be locked */
static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk,
				    struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u64 dsn = mptcp_subflow_get_mapped_dsn(subflow);

	/* revalidate data sequence number.
	 *
	 * mptcp_subflow_data_available() is usually called
	 * without msk lock. It's unlikely (but possible)
	 * that msk->ack_seq has been advanced since the last
	 * call found in-sequence data.
	 */
	if (likely(dsn == msk->ack_seq))
		return true;

	subflow->data_avail = 0;
	return mptcp_subflow_data_available(ssk);
}

static void mptcp_check_data_fin_ack(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (__mptcp_check_fallback(msk))
		return;

	/* Look for an acknowledged DATA_FIN */
	if (((1 << sk->sk_state) &
	     (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
	    msk->write_seq == atomic64_read(&msk->snd_una)) {
		mptcp_stop_timer(sk);

		WRITE_ONCE(msk->snd_data_fin_enable, 0);

		switch (sk->sk_state) {
		case TCP_FIN_WAIT1:
			inet_sk_state_store(sk, TCP_FIN_WAIT2);
			sk->sk_state_change(sk);
			break;
		case TCP_CLOSING:
		case TCP_LAST_ACK:
			inet_sk_state_store(sk, TCP_CLOSE);
			sk->sk_state_change(sk);
			break;
		}

		if (sk->sk_shutdown == SHUTDOWN_MASK ||
		    sk->sk_state == TCP_CLOSE)
			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
		else
			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	}
}

static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (READ_ONCE(msk->rcv_data_fin) &&
	    ((1 << sk->sk_state) &
	     (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) {
		u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq);

		if (msk->ack_seq == rcv_data_fin_seq) {
			if (seq)
				*seq = rcv_data_fin_seq;

			return true;
		}
	}

	return false;
}

static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
{
	long tout = ssk && inet_csk(ssk)->icsk_pending ?
				      inet_csk(ssk)->icsk_timeout - jiffies : 0;

	if (tout <= 0)
		tout = mptcp_sk(sk)->timer_ival;
	mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}
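
/* Timeout selection above: prefer the time left on the subflow's own
 * retransmit timer, fall back to the last cached interval, and never arm
 * the MPTCP-level timer with less than TCP_RTO_MIN.
 */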

static void mptcp_check_data_fin(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	u64 rcv_data_fin_seq;

	if (__mptcp_check_fallback(msk) || !msk->first)
		return;

	/* Need to ack a DATA_FIN received from a peer while this side
	 * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
	 * msk->rcv_data_fin was set when parsing the incoming options
	 * at the subflow level and the msk lock was not held, so this
	 * is the first opportunity to act on the DATA_FIN and change
	 * the msk state.
	 *
	 * If we are caught up to the sequence number of the incoming
	 * DATA_FIN, send the DATA_ACK now and do state transition. If
	 * not caught up, do nothing and let the recv code send DATA_ACK
	 * when catching up.
	 */

	if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) {
		struct mptcp_subflow_context *subflow;

		WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1);
		WRITE_ONCE(msk->rcv_data_fin, 0);

		sk->sk_shutdown |= RCV_SHUTDOWN;
		smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
		set_bit(MPTCP_DATA_READY, &msk->flags);

		switch (sk->sk_state) {
		case TCP_ESTABLISHED:
			inet_sk_state_store(sk, TCP_CLOSE_WAIT);
			break;
		case TCP_FIN_WAIT1:
			inet_sk_state_store(sk, TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			inet_sk_state_store(sk, TCP_CLOSE);
			// @@ Close subflows now?
			break;
		default:
			/* Other states not expected */
			WARN_ON_ONCE(1);
			break;
		}

		mptcp_set_timeout(sk, NULL);
		mptcp_for_each_subflow(msk, subflow) {
			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

			lock_sock(ssk);
			tcp_send_ack(ssk);
			release_sock(ssk);
		}

		sk->sk_state_change(sk);

		if (sk->sk_shutdown == SHUTDOWN_MASK ||
		    sk->sk_state == TCP_CLOSE)
			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
		else
			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	}
}
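
/* The switch above maps the received DATA_FIN onto the regular TCP state
 * diagram: ESTABLISHED moves to CLOSE_WAIT, FIN_WAIT1 (our own DATA_FIN
 * still in flight) moves to CLOSING, and FIN_WAIT2 completes the close.
 */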

static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
					   struct sock *ssk,
					   unsigned int *bytes)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;
	bool more_data_avail;
	struct tcp_sock *tp;
	bool done = false;

	if (!mptcp_subflow_dsn_valid(msk, ssk)) {
		*bytes = 0;
		return false;
	}

	tp = tcp_sk(ssk);
	do {
		u32 map_remaining, offset;
		u32 seq = tp->copied_seq;
		struct sk_buff *skb;
		bool fin;

		/* try to move as much data as available */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);

		skb = skb_peek(&ssk->sk_receive_queue);
		if (!skb)
			break;

		if (__mptcp_check_fallback(msk)) {
			/* if we are running under the workqueue, TCP could have
			 * collapsed skbs between dummy map creation and now;
			 * be sure to adjust the size
			 */
			map_remaining = skb->len;
			subflow->map_data_len = skb->len;
		}

		offset = seq - TCP_SKB_CB(skb)->seq;
		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
		if (fin) {
			done = true;
			seq++;
		}

		if (offset < skb->len) {
			size_t len = skb->len - offset;

			if (tp->urg_data)
				done = true;

			__mptcp_move_skb(msk, ssk, skb, offset, len);
			seq += len;
			moved += len;

			if (WARN_ON_ONCE(map_remaining < len))
				break;
		} else {
			WARN_ON_ONCE(!fin);
			sk_eat_skb(ssk, skb);
			done = true;
		}

		WRITE_ONCE(tp->copied_seq, seq);
		more_data_avail = mptcp_subflow_data_available(ssk);

		if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) {
			done = true;
			break;
		}
	} while (more_data_avail);

	*bytes = moved;

	/* If the moves have caught up with the DATA_FIN sequence number
	 * it's time to ack the DATA_FIN and change socket state, but
	 * this is not a good place to change state. Let the workqueue
	 * do it.
	 */
	if (mptcp_pending_data_fin(sk, NULL) &&
	    schedule_work(&msk->work))
		sock_hold(sk);

	return done;
}

/* In most cases we will be able to lock the mptcp socket. If it's already
 * owned, we need to defer to the work queue to avoid ABBA deadlock.
 */
static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
	struct sock *sk = (struct sock *)msk;
	unsigned int moved = 0;

	if (READ_ONCE(sk->sk_lock.owned))
		return false;

	if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock)))
		return false;

	/* must re-check after taking the lock */
	if (!READ_ONCE(sk->sk_lock.owned))
		__mptcp_move_skbs_from_subflow(msk, ssk, &moved);

	spin_unlock_bh(&sk->sk_lock.slock);

	return moved > 0;
}

void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	set_bit(MPTCP_DATA_READY, &msk->flags);

	if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) &&
	    move_skbs_to_msk(msk, ssk))
		goto wake;

	/* don't schedule if mptcp sk is (still) over limit */
	if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
		goto wake;

	/* mptcp socket is owned, release_cb should retry */
	if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
			      &sk->sk_tsq_flags)) {
		sock_hold(sk);

		/* need to try again, it's possible release_cb() has already
		 * been called after the test_and_set_bit() above.
		 */
		move_skbs_to_msk(msk, ssk);
	}

wake:
	sk->sk_data_ready(sk);
}
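
/* When the msk is owned by user context the trylock in move_skbs_to_msk()
 * fails; the TCP_DELACK_TIMER_DEFERRED bit set above lets mptcp_release_cb()
 * pick the pending receive work back up once the socket lock is released.
 */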

static void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
	if (likely(list_empty(&msk->join_list)))
		return;

	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);
}

static bool mptcp_timer_pending(struct sock *sk)
{
	return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
}

static void mptcp_reset_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned long tout;

	/* should never be called with mptcp level timer cleared */
	tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
	if (WARN_ON_ONCE(!tout))
		tout = TCP_RTO_MIN;
	sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
}

void mptcp_data_acked(struct sock *sk)
{
	mptcp_reset_timer(sk);

	if ((!sk_stream_is_writeable(sk) ||
	     (inet_sk_state_load(sk) != TCP_ESTABLISHED)) &&
	    schedule_work(&mptcp_sk(sk)->work))
		sock_hold(sk);
}

void mptcp_subflow_eof(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) &&
	    schedule_work(&msk->work))
		sock_hold(sk);
}

static void mptcp_check_for_eof(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	int receivers = 0;

	mptcp_for_each_subflow(msk, subflow)
		receivers += !subflow->rx_eof;

	if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		/* hopefully temporary hack: propagate shutdown status
		 * to msk, when all subflows agree on it
		 */
		sk->sk_shutdown |= RCV_SHUTDOWN;

		smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
		set_bit(MPTCP_DATA_READY, &msk->flags);
		sk->sk_data_ready(sk);
	}
}

static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
{
	const struct sock *sk = (const struct sock *)msk;

	if (!msk->cached_ext)
		msk->cached_ext = __skb_ext_alloc(sk->sk_allocation);

	return !!msk->cached_ext;
}

static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;

	sock_owned_by_me(sk);

	mptcp_for_each_subflow(msk, subflow) {
		if (subflow->data_avail)
			return mptcp_subflow_tcp_sock(subflow);
	}

	return NULL;
}

static bool mptcp_skb_can_collapse_to(u64 write_seq,
				      const struct sk_buff *skb,
				      const struct mptcp_ext *mpext)
{
	if (!tcp_skb_can_collapse_to(skb))
		return false;

	/* can collapse only if MPTCP level sequence is in order */
	return mpext && mpext->data_seq + mpext->data_len == write_seq;
}

static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
				       const struct page_frag *pfrag,
				       const struct mptcp_data_frag *df)
{
	return df && pfrag->page == df->page &&
		df->data_seq + df->data_len == msk->write_seq;
}

static void dfrag_uncharge(struct sock *sk, int len)
{
	sk_mem_uncharge(sk, len);
	sk_wmem_queued_add(sk, -len);
}

static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
{
	int len = dfrag->data_len + dfrag->overhead;

	list_del(&dfrag->list);
	dfrag_uncharge(sk, len);
	put_page(dfrag->page);
}

static void mptcp_clean_una(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_data_frag *dtmp, *dfrag;
	bool cleaned = false;
	u64 snd_una;

	/* on fallback we just need to ignore snd_una, as this is really
	 * plain TCP
	 */
	if (__mptcp_check_fallback(msk))
		atomic64_set(&msk->snd_una, msk->write_seq);
	snd_una = atomic64_read(&msk->snd_una);

	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
		if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
			break;

		dfrag_clear(sk, dfrag);
		cleaned = true;
	}

	dfrag = mptcp_rtx_head(sk);
	if (dfrag && after64(snd_una, dfrag->data_seq)) {
		u64 delta = snd_una - dfrag->data_seq;

		if (WARN_ON_ONCE(delta > dfrag->data_len))
			goto out;

		dfrag->data_seq += delta;
		dfrag->offset += delta;
		dfrag->data_len -= delta;

		dfrag_uncharge(sk, delta);
		cleaned = true;
	}

out:
	if (cleaned) {
		sk_mem_reclaim_partial(sk);

		/* Only wake up writers if a subflow is ready */
		if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
			sk_stream_write_space(sk);
	}
}

/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
 * data
 */
static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
					pfrag, sk->sk_allocation)))
		return true;

	sk->sk_prot->enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}

static struct mptcp_data_frag *
mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
		      int orig_offset)
{
	int offset = ALIGN(orig_offset, sizeof(long));
	struct mptcp_data_frag *dfrag;

	dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
	dfrag->data_len = 0;
	dfrag->data_seq = msk->write_seq;
	dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
	dfrag->offset = offset + sizeof(struct mptcp_data_frag);
	dfrag->page = pfrag->page;

	return dfrag;
}
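
/* Layout of a carved fragment inside the page (offsets from page start):
 *
 *   orig_offset      offset = ALIGN(orig_offset, sizeof(long))
 *        |           |
 *        v           v
 *   --...+--padding--+--struct mptcp_data_frag--+--user data...
 *
 * dfrag->offset points at the user data, and dfrag->overhead accounts for
 * the alignment padding plus the metadata header itself.
 */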

static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
			      struct msghdr *msg, struct mptcp_data_frag *dfrag,
			      long *timeo, int *pmss_now,
			      int *ps_goal)
{
	int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
	bool dfrag_collapsed, can_collapse = false;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_ext *mpext = NULL;
	bool retransmission = !!dfrag;
	struct sk_buff *skb, *tail;
	struct page_frag *pfrag;
	struct page *page;
	u64 *write_seq;
	size_t psize;

	/* use the mptcp page cache so that we can easily move the data
	 * from one substream to another, but do per subflow memory accounting.
	 * Note: pfrag is used only when !retransmission, but the compiler is
	 * fooled into a warning if we don't init here
	 */
	pfrag = sk_page_frag(sk);
	if (!retransmission) {
		write_seq = &msk->write_seq;
		page = pfrag->page;
	} else {
		write_seq = &dfrag->data_seq;
		page = dfrag->page;
	}

	/* compute copy limit */
	mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
	*pmss_now = mss_now;
	*ps_goal = size_goal;
	avail_size = size_goal;
	skb = tcp_write_queue_tail(ssk);
	if (skb) {
		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);

		/* Limit the write to the size available in the
		 * current skb, if any, so that we create at most a new skb.
		 * Explicitly tells TCP internals to avoid collapsing on later
		 * queue management operation, to avoid breaking the ext <->
		 * SSN association set here
		 */
		can_collapse = (size_goal - skb->len > 0) &&
			      mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
		if (!can_collapse)
			TCP_SKB_CB(skb)->eor = 1;
		else
			avail_size = size_goal - skb->len;
	}

	if (!retransmission) {
		/* reuse tail pfrag, if possible, or carve a new one from the
		 * page allocator
		 */
		dfrag = mptcp_rtx_tail(sk);
		offset = pfrag->offset;
		dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
		if (!dfrag_collapsed) {
			dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
			offset = dfrag->offset;
			frag_truesize = dfrag->overhead;
		}
		psize = min_t(size_t, pfrag->size - offset, avail_size);

		/* Copy to page */
		pr_debug("left=%zu", msg_data_left(msg));
		psize = copy_page_from_iter(pfrag->page, offset,
					    min_t(size_t, msg_data_left(msg),
						  psize),
					    &msg->msg_iter);
		pr_debug("left=%zu", msg_data_left(msg));
		if (!psize)
			return -EINVAL;

		if (!sk_wmem_schedule(sk, psize + dfrag->overhead)) {
			iov_iter_revert(&msg->msg_iter, psize);
			return -ENOMEM;
		}
	} else {
		offset = dfrag->offset;
		psize = min_t(size_t, dfrag->data_len, avail_size);
	}

	/* tell the TCP stack to delay the push so that we can safely
	 * access the skb after the sendpages call
	 */
	ret = do_tcp_sendpages(ssk, page, offset, psize,
			       msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT);
	if (ret <= 0) {
		if (!retransmission)
			iov_iter_revert(&msg->msg_iter, psize);
		return ret;
	}

	frag_truesize += ret;
	if (!retransmission) {
		if (unlikely(ret < psize))
			iov_iter_revert(&msg->msg_iter, psize - ret);

		/* send successful, keep track of sent data for mptcp-level
		 * retransmission
		 */
		dfrag->data_len += ret;
		if (!dfrag_collapsed) {
			get_page(dfrag->page);
			list_add_tail(&dfrag->list, &msk->rtx_queue);
			sk_wmem_queued_add(sk, frag_truesize);
		} else {
			sk_wmem_queued_add(sk, ret);
		}

		/* charge data on mptcp rtx queue to the master socket
		 * Note: we charge such data both to sk and ssk
		 */
		sk->sk_forward_alloc -= frag_truesize;
	}

	/* if the tail skb extension is still the cached one, collapsing
	 * really happened. Note: we can't check for 'same skb' as the sk_buff
	 * hdr on tail can be transmitted, freed and re-allocated by the
	 * do_tcp_sendpages() call
	 */
	tail = tcp_write_queue_tail(ssk);
	if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) {
		WARN_ON_ONCE(!can_collapse);
		mpext->data_len += ret;
		goto out;
	}

	skb = tcp_write_queue_tail(ssk);
	mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
	msk->cached_ext = NULL;

	memset(mpext, 0, sizeof(*mpext));
	mpext->data_seq = *write_seq;
	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
	mpext->data_len = ret;
	mpext->use_map = 1;
	mpext->dsn64 = 1;

	pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
		 mpext->data_seq, mpext->subflow_seq, mpext->data_len,
		 mpext->dsn64);

out:
	if (!retransmission)
		pfrag->offset += frag_truesize;
	WRITE_ONCE(*write_seq, *write_seq + ret);
	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;

	return ret;
}
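
/* A successful call leaves the TCP write queue tail carrying an MPTCP
 * extension that describes the DSS mapping, conceptually:
 *
 *   mpext->data_seq    = MPTCP-level sequence of the first byte sent
 *   mpext->subflow_seq = relative subflow sequence of that byte
 *   mpext->data_len    = length covered by this mapping
 *
 * When collapsing into an existing mapping only data_len grows, keeping a
 * single DSS option per skb.
 */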

static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
{
	clear_bit(MPTCP_SEND_SPACE, &msk->flags);
	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */

	/* enables sk->write_space() callbacks */
	set_bit(SOCK_NOSPACE, &sock->flags);
}

static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *backup = NULL;

	sock_owned_by_me((const struct sock *)msk);

	if (!mptcp_ext_cache_refill(msk))
		return NULL;

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (!sk_stream_memory_free(ssk)) {
			struct socket *sock = ssk->sk_socket;

			if (sock)
				mptcp_nospace(msk, sock);

			return NULL;
		}

		if (subflow->backup) {
			if (!backup)
				backup = ssk;

			continue;
		}

		return ssk;
	}

	return backup;
}
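
/* Scheduler sketch: the first non-backup subflow with free send memory wins;
 * a backup subflow is remembered and returned only when no regular subflow
 * is usable. A subflow with no memory left triggers mptcp_nospace() so
 * write_space callbacks can wake the sender later.
 */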

static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
{
	struct socket *sock;

	if (likely(sk_stream_is_writeable(ssk)))
		return;

	sock = READ_ONCE(ssk->sk_socket);
	if (sock)
		mptcp_nospace(msk, sock);
}

static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	int mss_now = 0, size_goal = 0, ret = 0;
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct page_frag *pfrag;
	size_t copied = 0;
	struct sock *ssk;
	bool tx_ok;
	long timeo;

	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
		return -EOPNOTSUPP;

	lock_sock(sk);

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
		ret = sk_stream_wait_connect(sk, &timeo);
		if (ret)
			goto out;
	}

	pfrag = sk_page_frag(sk);
restart:
	mptcp_clean_una(sk);

	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
		ret = -EPIPE;
		goto out;
	}

	__mptcp_flush_join_list(msk);
	ssk = mptcp_subflow_get_send(msk);
	while (!sk_stream_memory_free(sk) ||
	       !ssk ||
	       !mptcp_page_frag_refill(ssk, pfrag)) {
		if (ssk) {
			/* make sure retransmit timer is
			 * running before we wait for memory.
			 *
			 * The retransmit timer might be needed
			 * to make the peer send an up-to-date
			 * MPTCP Ack.
			 */
			mptcp_set_timeout(sk, ssk);
			if (!mptcp_timer_pending(sk))
				mptcp_reset_timer(sk);
		}

		ret = sk_stream_wait_memory(sk, &timeo);
		if (ret)
			goto out;

		mptcp_clean_una(sk);

		ssk = mptcp_subflow_get_send(msk);
		if (list_empty(&msk->conn_list)) {
			ret = -ENOTCONN;
			goto out;
		}
	}

	pr_debug("conn_list->subflow=%p", ssk);

	lock_sock(ssk);
	tx_ok = msg_data_left(msg);
	while (tx_ok) {
		ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
					 &size_goal);
		if (ret < 0) {
			if (ret == -EAGAIN && timeo > 0) {
				mptcp_set_timeout(sk, ssk);
				release_sock(ssk);
				goto restart;
			}
			break;
		}

		copied += ret;

		tx_ok = msg_data_left(msg);
		if (!tx_ok)
			break;

		if (!sk_stream_memory_free(ssk) ||
		    !mptcp_page_frag_refill(ssk, pfrag) ||
		    !mptcp_ext_cache_refill(msk)) {
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			tcp_push(ssk, msg->msg_flags, mss_now,
				 tcp_sk(ssk)->nonagle, size_goal);
			mptcp_set_timeout(sk, ssk);
			release_sock(ssk);
			goto restart;
		}

		/* memory is charged to mptcp level socket as well, i.e.
		 * if msg is very large, mptcp socket may run out of buffer
		 * space. mptcp_clean_una() will release data that has
		 * been acked at mptcp level in the mean time, so there is
		 * a good chance we can continue sending data right away.
		 *
		 * Normally, when the tcp subflow can accept more data, then
		 * so can the MPTCP socket. However, we need to cope with
		 * peers that might lag behind in their MPTCP-level
		 * acknowledgements, i.e. data might have been acked at
		 * tcp level only. So, we must also check the MPTCP socket
		 * limits before we send more data.
		 */
		if (unlikely(!sk_stream_memory_free(sk))) {
			tcp_push(ssk, msg->msg_flags, mss_now,
				 tcp_sk(ssk)->nonagle, size_goal);
			mptcp_clean_una(sk);
			if (!sk_stream_memory_free(sk)) {
				/* can't send more for now, need to wait for
				 * MPTCP-level ACKs from peer.
				 *
				 * Wakeup will happen via mptcp_clean_una().
				 */
				mptcp_set_timeout(sk, ssk);
				release_sock(ssk);
				goto restart;
			}
		}
	}

	mptcp_set_timeout(sk, ssk);
	if (copied) {
		tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
			 size_goal);

		/* start the timer, if it's not pending */
		if (!mptcp_timer_pending(sk))
			mptcp_reset_timer(sk);
	}

	ssk_check_wmem(msk, ssk);
	release_sock(ssk);
out:
	release_sock(sk);
	return copied ? : ret;
}

static void mptcp_wait_data(struct sock *sk, long *timeo)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct mptcp_sock *msk = mptcp_sk(sk);

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);

	sk_wait_event(sk, timeo,
		      test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);

	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
}

static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
				struct msghdr *msg,
				size_t len)
{
	struct sock *sk = (struct sock *)msk;
	struct sk_buff *skb;
	int copied = 0;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		u32 offset = MPTCP_SKB_CB(skb)->offset;
		u32 data_len = skb->len - offset;
		u32 count = min_t(size_t, len - copied, data_len);
		int err;

		err = skb_copy_datagram_msg(skb, offset, msg, count);
		if (unlikely(err < 0)) {
			if (!copied)
				return err;
			break;
		}

		copied += count;

		if (count < data_len) {
			MPTCP_SKB_CB(skb)->offset += count;
			break;
		}

		__skb_unlink(skb, &sk->sk_receive_queue);
		__kfree_skb(skb);

		if (copied >= len)
			break;
	}

	return copied;
}

/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
 *
 * Only difference: Use highest rtt estimate of the subflows in use.
 */
static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	u32 time, advmss = 1;
	u64 rtt_us, mstamp;

	sock_owned_by_me(sk);

	if (copied <= 0)
		return;

	msk->rcvq_space.copied += copied;

	mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
	time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);

	rtt_us = msk->rcvq_space.rtt_us;
	if (rtt_us && time < (rtt_us >> 3))
		return;

	rtt_us = 0;
	mptcp_for_each_subflow(msk, subflow) {
		const struct tcp_sock *tp;
		u64 sf_rtt_us;
		u32 sf_advmss;

		tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));

		sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
		sf_advmss = READ_ONCE(tp->advmss);

		rtt_us = max(sf_rtt_us, rtt_us);
		advmss = max(sf_advmss, advmss);
	}

	msk->rcvq_space.rtt_us = rtt_us;
	if (time < (rtt_us >> 3) || rtt_us == 0)
		return;

	if (msk->rcvq_space.copied <= msk->rcvq_space.space)
		goto new_measure;

	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int rcvmem, rcvbuf;
		u64 rcvwin, grow;

		rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;

		grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);

		do_div(grow, msk->rcvq_space.space);
		rcvwin += (grow << 1);

		rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
		while (tcp_win_from_space(sk, rcvmem) < advmss)
			rcvmem += 128;

		do_div(rcvwin, advmss);
		rcvbuf = min_t(u64, rcvwin * rcvmem,
			       sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);

		if (rcvbuf > sk->sk_rcvbuf) {
			u32 window_clamp;

			window_clamp = tcp_win_from_space(sk, rcvbuf);
			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

			/* Make subflows follow along. If we do not do this, we
			 * get drops at subflow level if skbs can't be moved to
			 * the mptcp rx queue fast enough (announced rcv_win can
			 * exceed ssk->sk_rcvbuf).
			 */
			mptcp_for_each_subflow(msk, subflow) {
				struct sock *ssk;

				ssk = mptcp_subflow_tcp_sock(subflow);
				WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
				tcp_sk(ssk)->window_clamp = window_clamp;
			}
		}
	}

	msk->rcvq_space.space = msk->rcvq_space.copied;
new_measure:
	msk->rcvq_space.copied = 0;
	msk->rcvq_space.time = mstamp;
}
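
/* Autotuning sketch: with subflows reporting rtt_us of, say, 30 ms and
 * 50 ms, the 50 ms maximum drives the measurement window. Once a window's
 * worth of data has been copied, the candidate receive window is
 * rcvwin = 2 * copied + 16 * advmss, inflated by the relative growth over
 * the previous sample, then converted into a rcvbuf size and propagated to
 * every subflow so their announced windows keep up.
 */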

static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
	unsigned int moved = 0;
	bool done;

	do {
		struct sock *ssk = mptcp_subflow_recv_lookup(msk);

		if (!ssk)
			break;

		lock_sock(ssk);
		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
		release_sock(ssk);
	} while (!done);

	return moved > 0;
}

static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			 int nonblock, int flags, int *addr_len)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	int copied = 0;
	int target;
	long timeo;

	if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
		return -EOPNOTSUPP;

	lock_sock(sk);
	timeo = sock_rcvtimeo(sk, nonblock);

	len = min_t(size_t, len, INT_MAX);
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
	__mptcp_flush_join_list(msk);

	while (len > (size_t)copied) {
		int bytes_read;

		bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
		if (unlikely(bytes_read < 0)) {
			if (!copied)
				copied = bytes_read;
			goto out_err;
		}

		copied += bytes_read;

		if (skb_queue_empty(&sk->sk_receive_queue) &&
		    __mptcp_move_skbs(msk))
			continue;

		/* only the master socket status is relevant here. The exit
		 * conditions mirror closely tcp_recvmsg()
		 */
		if (copied >= target)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
				mptcp_check_for_eof(msk);

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				copied = -ENOTCONN;
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		pr_debug("block timeout %ld", timeo);
		mptcp_wait_data(sk, &timeo);
	}

	if (skb_queue_empty(&sk->sk_receive_queue)) {
		/* entire backlog drained, clear DATA_READY. */
		clear_bit(MPTCP_DATA_READY, &msk->flags);

		/* .. race-breaker: ssk might have gotten new data
		 * after last __mptcp_move_skbs() returned false.
		 */
		if (unlikely(__mptcp_move_skbs(msk)))
			set_bit(MPTCP_DATA_READY, &msk->flags);
	} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
		/* data to read but mptcp_wait_data() cleared DATA_READY */
		set_bit(MPTCP_DATA_READY, &msk->flags);
	}
out_err:
	mptcp_rcv_space_adjust(msk, copied);

	release_sock(sk);
	return copied;
}

static void mptcp_retransmit_handler(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->write_seq)) {
		mptcp_stop_timer(sk);
	} else {
		set_bit(MPTCP_WORK_RTX, &msk->flags);
		if (schedule_work(&msk->work))
			sock_hold(sk);
	}
}

static void mptcp_retransmit_timer(struct timer_list *t)
{
	struct inet_connection_sock *icsk = from_timer(icsk, t,
						       icsk_retransmit_timer);
	struct sock *sk = &icsk->icsk_inet.sk;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		mptcp_retransmit_handler(sk);
	} else {
		/* delegate our work to tcp_release_cb() */
		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED,
				      &sk->sk_tsq_flags))
			sock_hold(sk);
	}
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* Find an idle subflow. Return NULL if there is unacked data at tcp
 * level.
 *
 * A backup subflow is returned only if that is the only kind available.
 */
static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *backup = NULL;

	sock_owned_by_me((const struct sock *)msk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		/* still data outstanding at TCP level? Don't retransmit. */
		if (!tcp_write_queue_empty(ssk))
			return NULL;

		if (subflow->backup) {
			if (!backup)
				backup = ssk;
			continue;
		}

		return ssk;
	}

	return backup;
}

/* subflow sockets can be either outgoing (connect) or incoming
 * (accept).
 *
 * Outgoing subflows use in-kernel sockets.
 * Incoming subflows do not have their own 'struct socket' allocated,
 * so we need to use tcp_close() after detaching them from the mptcp
 * parent socket.
 */
static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
			      struct mptcp_subflow_context *subflow,
			      long timeout)
{
	struct socket *sock = READ_ONCE(ssk->sk_socket);

	list_del(&subflow->node);

	if (sock && sock != sk->sk_socket) {
		/* outgoing subflow */
		sock_release(sock);
	} else {
		/* incoming subflow */
		tcp_close(ssk, timeout);
	}
}

static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
{
	return 0;
}

static void pm_work(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;

	spin_lock_bh(&msk->pm.lock);

	pr_debug("msk=%p status=%x", msk, pm->status);
	if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
		pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
		mptcp_pm_nl_add_addr_received(msk);
	}
	if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
		pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
		mptcp_pm_nl_fully_established(msk);
	}
	if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
		pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
		mptcp_pm_nl_subflow_established(msk);
	}

	spin_unlock_bh(&msk->pm.lock);
}

static void __mptcp_close_subflow(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow, *tmp;

	list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (inet_sk_state_load(ssk) != TCP_CLOSE)
			continue;

		__mptcp_close_ssk((struct sock *)msk, ssk, subflow, 0);
	}
}

static void mptcp_worker(struct work_struct *work)
{
	struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
	struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
	int orig_len, orig_offset, mss_now = 0, size_goal = 0;
	struct mptcp_data_frag *dfrag;
	u64 orig_write_seq;
	size_t copied = 0;
	struct msghdr msg = {
		.msg_flags = MSG_DONTWAIT,
	};
	long timeo = 0;

	lock_sock(sk);
	mptcp_clean_una(sk);
	mptcp_check_data_fin_ack(sk);
	__mptcp_flush_join_list(msk);
	if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
		__mptcp_close_subflow(msk);

	__mptcp_move_skbs(msk);

	if (msk->pm.status)
		pm_work(msk);

	if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
		mptcp_check_for_eof(msk);

	mptcp_check_data_fin(sk);

	if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
		goto unlock;

	dfrag = mptcp_rtx_head(sk);
	if (!dfrag)
		goto unlock;

	if (!mptcp_ext_cache_refill(msk))
		goto reset_unlock;

	ssk = mptcp_subflow_get_retrans(msk);
	if (!ssk)
		goto reset_unlock;

	lock_sock(ssk);

	orig_len = dfrag->data_len;
	orig_offset = dfrag->offset;
	orig_write_seq = dfrag->data_seq;
	while (dfrag->data_len > 0) {
		int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo,
					     &mss_now, &size_goal);
		if (ret < 0)
			break;

		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
		copied += ret;
		dfrag->data_len -= ret;
		dfrag->offset += ret;

		if (!mptcp_ext_cache_refill(msk))
			break;
	}
	if (copied)
		tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
			 size_goal);

	dfrag->data_seq = orig_write_seq;
	dfrag->offset = orig_offset;
	dfrag->data_len = orig_len;

	mptcp_set_timeout(sk, ssk);
	release_sock(ssk);

reset_unlock:
	if (!mptcp_timer_pending(sk))
		mptcp_reset_timer(sk);

unlock:
	release_sock(sk);
	sock_put(sk);
}

static int __mptcp_init_sock(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	spin_lock_init(&msk->join_list_lock);

	INIT_LIST_HEAD(&msk->conn_list);
	INIT_LIST_HEAD(&msk->join_list);
	INIT_LIST_HEAD(&msk->rtx_queue);
	__set_bit(MPTCP_SEND_SPACE, &msk->flags);
	INIT_WORK(&msk->work, mptcp_worker);

	msk->first = NULL;
	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;

	mptcp_pm_data_init(msk);

	/* re-use the csk retrans timer for MPTCP-level retrans */
	timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);

	return 0;
}

static int mptcp_init_sock(struct sock *sk)
{
	struct net *net = sock_net(sk);
	int ret;

	if (!mptcp_is_enabled(net))
		return -ENOPROTOOPT;

	if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
		return -ENOMEM;

	ret = __mptcp_init_sock(sk);
	if (ret)
		return ret;

	ret = __mptcp_socket_create(mptcp_sk(sk));
	if (ret)
		return ret;

	sk_sockets_allocated_inc(sk);
	sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
	sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];

	return 0;
}

static void __mptcp_clear_xmit(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_data_frag *dtmp, *dfrag;

	sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);

	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
		dfrag_clear(sk, dfrag);
}

static void mptcp_cancel_work(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	if (cancel_work_sync(&msk->work))
		sock_put(sk);
}
*sk
, struct sock
*ssk
, int how
)
1555 switch (ssk
->sk_state
) {
1557 if (!(how
& RCV_SHUTDOWN
))
1561 tcp_disconnect(ssk
, O_NONBLOCK
);
1564 if (__mptcp_check_fallback(mptcp_sk(sk
))) {
1565 pr_debug("Fallback");
1566 ssk
->sk_shutdown
|= how
;
1567 tcp_shutdown(ssk
, how
);
1569 pr_debug("Sending DATA_FIN on subflow %p", ssk
);
1570 mptcp_set_timeout(sk
, ssk
);
1579 static const unsigned char new_state
[16] = {
1580 /* current state: new state: action: */
1581 [0 /* (Invalid) */] = TCP_CLOSE
,
1582 [TCP_ESTABLISHED
] = TCP_FIN_WAIT1
| TCP_ACTION_FIN
,
1583 [TCP_SYN_SENT
] = TCP_CLOSE
,
1584 [TCP_SYN_RECV
] = TCP_FIN_WAIT1
| TCP_ACTION_FIN
,
1585 [TCP_FIN_WAIT1
] = TCP_FIN_WAIT1
,
1586 [TCP_FIN_WAIT2
] = TCP_FIN_WAIT2
,
1587 [TCP_TIME_WAIT
] = TCP_CLOSE
, /* should not happen ! */
1588 [TCP_CLOSE
] = TCP_CLOSE
,
1589 [TCP_CLOSE_WAIT
] = TCP_LAST_ACK
| TCP_ACTION_FIN
,
1590 [TCP_LAST_ACK
] = TCP_LAST_ACK
,
1591 [TCP_LISTEN
] = TCP_CLOSE
,
1592 [TCP_CLOSING
] = TCP_CLOSING
,
1593 [TCP_NEW_SYN_RECV
] = TCP_CLOSE
, /* should not happen ! */
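
/* This table mirrors the one used by tcp_close_state(): the low bits hold
 * the next socket state, and TCP_ACTION_FIN flags that a DATA_FIN must be
 * sent as part of the transition.
 */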

static int mptcp_close_state(struct sock *sk)
{
	int next = (int)new_state[sk->sk_state];
	int ns = next & TCP_STATE_MASK;

	inet_sk_state_store(sk, ns);

	return next & TCP_ACTION_FIN;
}

static void mptcp_close(struct sock *sk, long timeout)
{
	struct mptcp_subflow_context *subflow, *tmp;
	struct mptcp_sock *msk = mptcp_sk(sk);
	LIST_HEAD(conn_list);

	lock_sock(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == TCP_LISTEN) {
		inet_sk_state_store(sk, TCP_CLOSE);
		goto cleanup;
	} else if (sk->sk_state == TCP_CLOSE) {
		goto cleanup;
	}

	if (__mptcp_check_fallback(msk)) {
		goto update_state;
	} else if (mptcp_close_state(sk)) {
		pr_debug("Sending DATA_FIN sk=%p", sk);
		WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
		WRITE_ONCE(msk->snd_data_fin_enable, 1);

		mptcp_for_each_subflow(msk, subflow) {
			struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

			mptcp_subflow_shutdown(sk, tcp_sk, SHUTDOWN_MASK);
		}
	}

	sk_stream_wait_close(sk, timeout);

update_state:
	inet_sk_state_store(sk, TCP_CLOSE);

cleanup:
	/* be sure to always acquire the join list lock, to sync vs
	 * mptcp_finish_join().
	 */
	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);
	list_splice_init(&msk->conn_list, &conn_list);

	__mptcp_clear_xmit(sk);

	release_sock(sk);

	list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		__mptcp_close_ssk(sk, ssk, subflow, timeout);
	}

	mptcp_cancel_work(sk);

	__skb_queue_purge(&sk->sk_receive_queue);

	sk_common_release(sk);
}

static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
	struct ipv6_pinfo *msk6 = inet6_sk(msk);

	msk->sk_v6_daddr = ssk->sk_v6_daddr;
	msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;

	if (msk6 && ssk6) {
		msk6->saddr = ssk6->saddr;
		msk6->flow_label = ssk6->flow_label;
	}
#endif

	inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
	inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
	inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
	inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
	inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
	inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
}

static int mptcp_disconnect(struct sock *sk, int flags)
{
	/* Should never be called.
	 * inet_stream_connect() calls ->disconnect, but that
	 * refers to the subflow socket, not the mptcp one.
	 */
	WARN_ON_ONCE(1);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
	unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);

	return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}
#endif

struct sock *mptcp_sk_clone(const struct sock *sk,
			    const struct mptcp_options_received *mp_opt,
			    struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
	struct mptcp_sock *msk;
	u64 ack_seq;

	if (!nsk)
		return NULL;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (nsk->sk_family == AF_INET6)
		inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
#endif

	__mptcp_init_sock(nsk);

	msk = mptcp_sk(nsk);
	msk->local_key = subflow_req->local_key;
	msk->token = subflow_req->token;
	msk->subflow = NULL;
	WRITE_ONCE(msk->fully_established, false);

	msk->write_seq = subflow_req->idsn + 1;
	atomic64_set(&msk->snd_una, msk->write_seq);
	if (mp_opt->mp_capable) {
		msk->can_ack = true;
		msk->remote_key = mp_opt->sndr_key;
		mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
		ack_seq++;
		WRITE_ONCE(msk->ack_seq, ack_seq);
	}

	sock_reset_flag(nsk, SOCK_RCU_FREE);
	/* will be fully established after successful MPC subflow creation */
	inet_sk_state_store(nsk, TCP_SYN_RECV);
	bh_unlock_sock(nsk);

	/* keep a single reference */
	__sock_put(nsk);
	return nsk;
}
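
/* Note: the initial MPTCP-level ack_seq above is not taken from the SYN; it
 * is derived from the peer's key (per RFC 8684 the IDSN comes from the
 * SHA-256 hash of the key), so both ends agree on the initial data sequence
 * space without ever transmitting it.
 */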

void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
{
	const struct tcp_sock *tp = tcp_sk(ssk);

	msk->rcvq_space.copied = 0;
	msk->rcvq_space.rtt_us = 0;

	msk->rcvq_space.time = tp->tcp_mstamp;

	/* initial rcv_space offering made to peer */
	msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
				      TCP_INIT_CWND * tp->advmss);
	if (msk->rcvq_space.space == 0)
		msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
}

static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
				 bool kern)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *listener;
	struct sock *newsk;

	listener = __mptcp_nmpc_socket(msk);
	if (WARN_ON_ONCE(!listener)) {
		*err = -EINVAL;
		return NULL;
	}

	pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
	newsk = inet_csk_accept(listener->sk, flags, err, kern);
	if (!newsk)
		return NULL;

	pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
	if (sk_is_mptcp(newsk)) {
		struct mptcp_subflow_context *subflow;
		struct sock *new_mptcp_sock;
		struct sock *ssk = newsk;

		subflow = mptcp_subflow_ctx(newsk);
		new_mptcp_sock = subflow->conn;

		/* is_mptcp should be false if subflow->conn is missing, see
		 * subflow_syn_recv_sock()
		 */
		if (WARN_ON_ONCE(!new_mptcp_sock)) {
			tcp_sk(newsk)->is_mptcp = 0;
			return newsk;
		}

		/* acquire the 2nd reference for the owning socket */
		sock_hold(new_mptcp_sock);

		local_bh_disable();
		bh_lock_sock(new_mptcp_sock);
		msk = mptcp_sk(new_mptcp_sock);
		msk->first = newsk;

		newsk = new_mptcp_sock;
		mptcp_copy_inaddrs(newsk, ssk);
		list_add(&subflow->node, &msk->conn_list);

		mptcp_rcv_space_init(msk, ssk);
		bh_unlock_sock(new_mptcp_sock);

		__MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
		local_bh_enable();
	} else {
		MPTCP_INC_STATS(sock_net(sk),
				MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
	}

	return newsk;
}

static void mptcp_destroy(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	mptcp_token_destroy(msk);
	if (msk->cached_ext)
		__skb_ext_put(msk->cached_ext);

	sk_sockets_allocated_dec(sk);
}

static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
				       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct socket *ssock;
	int ret;

	switch (optname) {
	case SO_REUSEPORT:
	case SO_REUSEADDR:
		lock_sock(sk);
		ssock = __mptcp_nmpc_socket(msk);
		if (!ssock) {
			release_sock(sk);
			return -EINVAL;
		}

		ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
		if (ret == 0) {
			if (optname == SO_REUSEPORT)
				sk->sk_reuseport = ssock->sk->sk_reuseport;
			else if (optname == SO_REUSEADDR)
				sk->sk_reuse = ssock->sk->sk_reuse;
		}
		release_sock(sk);
		return ret;
	}

	return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen);
}

static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
			       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	int ret = -EOPNOTSUPP;
	struct socket *ssock;

	switch (optname) {
	case IPV6_V6ONLY:
		lock_sock(sk);
		ssock = __mptcp_nmpc_socket(msk);
		if (!ssock) {
			release_sock(sk);
			return -EINVAL;
		}

		ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
		if (ret == 0)
			sk->sk_ipv6only = ssock->sk->sk_ipv6only;

		release_sock(sk);
		break;
	}

	return ret;
}

static int mptcp_setsockopt(struct sock *sk, int level, int optname,
			    sockptr_t optval, unsigned int optlen)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk;

	pr_debug("msk=%p", msk);

	if (level == SOL_SOCKET)
		return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);

	/* @@ the meaning of setsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when TCP socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (ssk)
		return tcp_setsockopt(ssk, level, optname, optval, optlen);

	if (level == SOL_IPV6)
		return mptcp_setsockopt_v6(msk, optname, optval, optlen);

	return -EOPNOTSUPP;
}

static int mptcp_getsockopt(struct sock *sk, int level, int optname,
			    char __user *optval, int __user *option)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk;

	pr_debug("msk=%p", msk);

	/* @@ the meaning of setsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (ssk)
		return tcp_getsockopt(ssk, level, optname, optval, option);

	return -EOPNOTSUPP;
}

#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \
			    TCPF_WRITE_TIMER_DEFERRED)

/* this closely mirrors tcp_release_cb(), but we must handle a different
 * set of events
 */
static void mptcp_release_cb(struct sock *sk)
{
	unsigned long flags, nflags;

	do {
		flags = sk->sk_tsq_flags;
		if (!(flags & MPTCP_DEFERRED_ALL))
			return;
		nflags = flags & ~MPTCP_DEFERRED_ALL;
	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);

	sock_release_ownership(sk);

	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
		struct mptcp_sock *msk = mptcp_sk(sk);
		struct sock *ssk;

		ssk = mptcp_subflow_recv_lookup(msk);
		if (!ssk || !schedule_work(&msk->work))
			__sock_put(sk);
	}

	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
		mptcp_retransmit_handler(sk);
		__sock_put(sk);
	}
}

static int mptcp_hash(struct sock *sk)
{
	/* should never be called,
	 * we hash the TCP subflows not the master socket
	 */
	WARN_ON_ONCE(1);
	return 0;
}

static void mptcp_unhash(struct sock *sk)
{
	/* called from sk_common_release(), but nothing to do here */
}

static int mptcp_get_port(struct sock *sk, unsigned short snum)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct socket *ssock;

	ssock = __mptcp_nmpc_socket(msk);
	pr_debug("msk=%p, subflow=%p", msk, ssock);
	if (WARN_ON_ONCE(!ssock))
		return -EINVAL;

	return inet_csk_get_port(ssock->sk, snum);
}

void mptcp_finish_connect(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sock *msk;
	struct sock *sk;
	u64 ack_seq;

	subflow = mptcp_subflow_ctx(ssk);
	sk = subflow->conn;
	msk = mptcp_sk(sk);

	pr_debug("msk=%p, token=%u", sk, subflow->token);

	mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
	ack_seq++;
	subflow->map_seq = ack_seq;
	subflow->map_subflow_seq = 1;

	/* the socket is not connected yet, no msk/subflow ops can access/race
	 * accessing the field below
	 */
	WRITE_ONCE(msk->remote_key, subflow->remote_key);
	WRITE_ONCE(msk->local_key, subflow->local_key);
	WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
	WRITE_ONCE(msk->ack_seq, ack_seq);
	WRITE_ONCE(msk->can_ack, 1);
	atomic64_set(&msk->snd_una, msk->write_seq);

	mptcp_pm_new_connection(msk, 0);

	mptcp_rcv_space_init(msk, ssk);
}

static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
{
	write_lock_bh(&sk->sk_callback_lock);
	rcu_assign_pointer(sk->sk_wq, &parent->wq);
	sk_set_socket(sk, parent);
	sk->sk_uid = SOCK_INODE(parent)->i_uid;
	write_unlock_bh(&sk->sk_callback_lock);
}

bool mptcp_finish_join(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	struct sock *parent = (void *)msk;
	struct socket *parent_sock;
	bool ret;

	pr_debug("msk=%p, subflow=%p", msk, subflow);

	/* mptcp socket already closing? */
	if (!mptcp_is_fully_established(parent))
		return false;

	if (!msk->pm.server_side)
		return true;

	if (!mptcp_pm_allow_new_subflow(msk))
		return false;

	/* active connections are already on conn_list, and we can't acquire
	 * the msk lock here.
	 * use the join list lock as synchronization point and double-check
	 * msk status to avoid racing with mptcp_close()
	 */
	spin_lock_bh(&msk->join_list_lock);
	ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
	if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node)))
		list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);
	if (!ret)
		return false;

	/* attach to msk socket only after we are sure it will deal with us
	 * at close time
	 */
	parent_sock = READ_ONCE(parent->sk_socket);
	if (parent_sock && !sk->sk_socket)
		mptcp_sock_graft(sk, parent_sock);
	subflow->map_seq = READ_ONCE(msk->ack_seq);
	return true;
}

static bool mptcp_memory_free(const struct sock *sk, int wake)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
}

static struct proto mptcp_prot = {
	.name		= "MPTCP",
	.owner		= THIS_MODULE,
	.init		= mptcp_init_sock,
	.disconnect	= mptcp_disconnect,
	.close		= mptcp_close,
	.accept		= mptcp_accept,
	.setsockopt	= mptcp_setsockopt,
	.getsockopt	= mptcp_getsockopt,
	.shutdown	= tcp_shutdown,
	.destroy	= mptcp_destroy,
	.sendmsg	= mptcp_sendmsg,
	.recvmsg	= mptcp_recvmsg,
	.release_cb	= mptcp_release_cb,
	.hash		= mptcp_hash,
	.unhash		= mptcp_unhash,
	.get_port	= mptcp_get_port,
	.sockets_allocated	= &mptcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.stream_memory_free	= mptcp_memory_free,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_mem	= sysctl_tcp_mem,
	.obj_size	= sizeof(struct mptcp_sock),
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
	.no_autobind	= true,
};

static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	lock_sock(sock->sk);
	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock) {
		err = -EINVAL;
		goto unlock;
	}

	err = ssock->ops->bind(ssock, uaddr, addr_len);
	if (!err)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);

unlock:
	release_sock(sock->sk);
	return err;
}
*msk
,
2154 struct mptcp_subflow_context
*subflow
)
2156 subflow
->request_mptcp
= 0;
2157 __mptcp_do_fallback(msk
);

static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
				int addr_len, int flags)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct mptcp_subflow_context *subflow;
	struct socket *ssock;
	int err;

	lock_sock(sock->sk);
	if (sock->state != SS_UNCONNECTED && msk->subflow) {
		/* pending connection or invalid state, let existing subflow
		 * cope with that
		 */
		ssock = msk->subflow;
		goto do_connect;
	}

	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock) {
		err = -EINVAL;
		goto unlock;
	}

	mptcp_token_destroy(msk);
	inet_sk_state_store(sock->sk, TCP_SYN_SENT);
	subflow = mptcp_subflow_ctx(ssock->sk);
#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
		mptcp_subflow_early_fallback(msk, subflow);
#endif
	if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk))
		mptcp_subflow_early_fallback(msk, subflow);

do_connect:
	err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
	sock->state = ssock->state;

	/* on successful connect, the msk state will be moved to established by
	 * subflow_finish_connect()
	 */
	if (!err || err == -EINPROGRESS)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);
	else
		inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));

unlock:
	release_sock(sock->sk);
	return err;
}

static int mptcp_listen(struct socket *sock, int backlog)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	pr_debug("msk=%p", msk);

	lock_sock(sock->sk);
	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock) {
		err = -EINVAL;
		goto unlock;
	}

	mptcp_token_destroy(msk);
	inet_sk_state_store(sock->sk, TCP_LISTEN);
	sock_set_flag(sock->sk, SOCK_RCU_FREE);

	err = ssock->ops->listen(ssock, backlog);
	inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
	if (!err)
		mptcp_copy_inaddrs(sock->sk, ssock->sk);

unlock:
	release_sock(sock->sk);
	return err;
}

static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
			       int flags, bool kern)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct socket *ssock;
	int err;

	pr_debug("msk=%p", msk);

	lock_sock(sock->sk);
	if (sock->sk->sk_state != TCP_LISTEN)
		goto unlock_fail;

	ssock = __mptcp_nmpc_socket(msk);
	if (!ssock)
		goto unlock_fail;

	clear_bit(MPTCP_DATA_READY, &msk->flags);
	sock_hold(ssock->sk);
	release_sock(sock->sk);

	err = ssock->ops->accept(sock, newsock, flags, kern);
	if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
		struct mptcp_sock *msk = mptcp_sk(newsock->sk);
		struct mptcp_subflow_context *subflow;

		/* set ssk->sk_socket of accept()ed flows to mptcp socket.
		 * This is needed so NOSPACE flag can be set from tcp stack.
		 */
		__mptcp_flush_join_list(msk);
		mptcp_for_each_subflow(msk, subflow) {
			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

			if (!ssk->sk_socket)
				mptcp_sock_graft(ssk, newsock);
		}
	}

	if (inet_csk_listen_poll(ssock->sk))
		set_bit(MPTCP_DATA_READY, &msk->flags);
	sock_put(ssock->sk);
	return err;

unlock_fail:
	release_sock(sock->sk);
	return -EINVAL;
}

static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
{
	return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM :
	       0;
}

static __poll_t mptcp_poll(struct file *file, struct socket *sock,
			   struct poll_table_struct *wait)
{
	struct sock *sk = sock->sk;
	struct mptcp_sock *msk;
	__poll_t mask = 0;
	int state;

	msk = mptcp_sk(sk);
	sock_poll_wait(file, sock, wait);

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		return mptcp_check_readable(msk);

	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
		mask |= mptcp_check_readable(msk);
		if (sk_stream_is_writeable(sk) &&
		    test_bit(MPTCP_SEND_SPACE, &msk->flags))
			mask |= EPOLLOUT | EPOLLWRNORM;
	}
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	return mask;
}

static int mptcp_shutdown(struct socket *sock, int how)
{
	struct mptcp_sock *msk = mptcp_sk(sock->sk);
	struct mptcp_subflow_context *subflow;
	int ret = 0;

	pr_debug("sk=%p, how=%d", msk, how);

	lock_sock(sock->sk);

	how++;
	if ((how & ~SHUTDOWN_MASK) || !how) {
		ret = -EINVAL;
		goto out_unlock;
	}

	if (sock->state == SS_CONNECTING) {
		if ((1 << sock->sk->sk_state) &
		    (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
			sock->state = SS_DISCONNECTING;
		else
			sock->state = SS_CONNECTED;
	}

	/* If we've already sent a FIN, or it's a closed state, skip this. */
	if (__mptcp_check_fallback(msk)) {
		if (how == SHUT_WR || how == SHUT_RDWR)
			inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);

		mptcp_for_each_subflow(msk, subflow) {
			struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

			mptcp_subflow_shutdown(sock->sk, tcp_sk, how);
		}
	} else if ((how & SEND_SHUTDOWN) &&
		   ((1 << sock->sk->sk_state) &
		    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
		     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) &&
		   mptcp_close_state(sock->sk)) {
		__mptcp_flush_join_list(msk);

		WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
		WRITE_ONCE(msk->snd_data_fin_enable, 1);

		mptcp_for_each_subflow(msk, subflow) {
			struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

			mptcp_subflow_shutdown(sock->sk, tcp_sk, how);
		}
	}

	/* Wake up anyone sleeping in poll. */
	sock->sk->sk_state_change(sock->sk);

out_unlock:
	release_sock(sock->sk);

	return ret;
}

static const struct proto_ops mptcp_stream_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = mptcp_bind,
	.connect	   = mptcp_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = mptcp_stream_accept,
	.getname	   = inet_getname,
	.poll		   = mptcp_poll,
	.ioctl		   = inet_ioctl,
	.gettstamp	   = sock_gettstamp,
	.listen		   = mptcp_listen,
	.shutdown	   = mptcp_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
};

static struct inet_protosw mptcp_protosw = {
	.type		= SOCK_STREAM,
	.protocol	= IPPROTO_MPTCP,
	.prot		= &mptcp_prot,
	.ops		= &mptcp_stream_ops,
	.flags		= INET_PROTOSW_ICSK,
};
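
/* Userspace reaches this protosw via the protocol argument of socket(2),
 * e.g. (illustrative):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
 *
 * The resulting socket behaves like TCP but negotiates MP_CAPABLE and may
 * carry additional subflows set up by the path manager.
 */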

void __init mptcp_proto_init(void)
{
	mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;

	if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
		panic("Failed to allocate MPTCP pcpu counter\n");

	mptcp_subflow_init();
	mptcp_pm_init();
	mptcp_token_init();

	if (proto_register(&mptcp_prot, 1) != 0)
		panic("Failed to register MPTCP proto.\n");

	inet_register_protosw(&mptcp_protosw);

	BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static const struct proto_ops mptcp_v6_stream_ops = {
	.family		   = PF_INET6,
	.owner		   = THIS_MODULE,
	.release	   = inet6_release,
	.bind		   = mptcp_bind,
	.connect	   = mptcp_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = mptcp_stream_accept,
	.getname	   = inet6_getname,
	.poll		   = mptcp_poll,
	.ioctl		   = inet6_ioctl,
	.gettstamp	   = sock_gettstamp,
	.listen		   = mptcp_listen,
	.shutdown	   = mptcp_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet6_sendmsg,
	.recvmsg	   = inet6_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
#ifdef CONFIG_COMPAT
	.compat_ioctl	   = inet6_compat_ioctl,
#endif
};

static struct proto mptcp_v6_prot;

static void mptcp_v6_destroy(struct sock *sk)
{
	mptcp_destroy(sk);
	inet6_destroy_sock(sk);
}

static struct inet_protosw mptcp_v6_protosw = {
	.type		= SOCK_STREAM,
	.protocol	= IPPROTO_MPTCP,
	.prot		= &mptcp_v6_prot,
	.ops		= &mptcp_v6_stream_ops,
	.flags		= INET_PROTOSW_ICSK,
};

int __init mptcp_proto_v6_init(void)
{
	int err;

	mptcp_v6_prot = mptcp_prot;
	strcpy(mptcp_v6_prot.name, "MPTCPv6");
	mptcp_v6_prot.slab = NULL;
	mptcp_v6_prot.destroy = mptcp_v6_destroy;
	mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);

	err = proto_register(&mptcp_v6_prot, 1);
	if (err)
		return err;

	err = inet6_register_protosw(&mptcp_v6_protosw);
	if (err)
		proto_unregister(&mptcp_v6_prot);

	return err;
}
#endif