1 // SPDX-License-Identifier: GPL-2.0
4 * Copyright (c) 2017 - 2019, Intel Corporation.
7 #define pr_fmt(fmt) "MPTCP: " fmt
9 #include <linux/kernel.h>
10 #include <linux/module.h>
11 #include <linux/netdevice.h>
12 #include <linux/sched/signal.h>
13 #include <linux/atomic.h>
14 #include <linux/igmp.h>
16 #include <net/inet_common.h>
17 #include <net/inet_hashtables.h>
18 #include <net/protocol.h>
20 #include <net/tcp_states.h>
21 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
22 #include <net/transp_v6.h>
23 #include <net/addrconf.h>
25 #include <net/mptcp.h>
30 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
32 struct mptcp_sock msk
;
43 #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
45 static struct percpu_counter mptcp_sockets_allocated
;
47 static void __mptcp_destroy_sock(struct sock
*sk
);
48 static void __mptcp_check_send_data_fin(struct sock
*sk
);
50 /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
51 * completed yet or has failed, return the subflow socket.
52 * Otherwise return NULL.
54 static struct socket
*__mptcp_nmpc_socket(const struct mptcp_sock
*msk
)
56 if (!msk
->subflow
|| READ_ONCE(msk
->can_ack
))
62 /* Returns end sequence number of the receiver's advertised window */
63 static u64
mptcp_wnd_end(const struct mptcp_sock
*msk
)
65 return READ_ONCE(msk
->wnd_end
);
68 static bool mptcp_is_tcpsk(struct sock
*sk
)
70 struct socket
*sock
= sk
->sk_socket
;
72 if (unlikely(sk
->sk_prot
== &tcp_prot
)) {
73 /* we are being invoked after mptcp_accept() has
74 * accepted a non-mp-capable flow: sk is a tcp_sk,
77 * Hand the socket over to tcp so all further socket ops
80 sock
->ops
= &inet_stream_ops
;
82 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
83 } else if (unlikely(sk
->sk_prot
== &tcpv6_prot
)) {
84 sock
->ops
= &inet6_stream_ops
;
92 static struct sock
*__mptcp_tcp_fallback(struct mptcp_sock
*msk
)
94 sock_owned_by_me((const struct sock
*)msk
);
96 if (likely(!__mptcp_check_fallback(msk
)))
102 static int __mptcp_socket_create(struct mptcp_sock
*msk
)
104 struct mptcp_subflow_context
*subflow
;
105 struct sock
*sk
= (struct sock
*)msk
;
106 struct socket
*ssock
;
109 err
= mptcp_subflow_create_socket(sk
, &ssock
);
113 msk
->first
= ssock
->sk
;
114 msk
->subflow
= ssock
;
115 subflow
= mptcp_subflow_ctx(ssock
->sk
);
116 list_add(&subflow
->node
, &msk
->conn_list
);
117 sock_hold(ssock
->sk
);
118 subflow
->request_mptcp
= 1;
119 mptcp_sock_graft(msk
->first
, sk
->sk_socket
);
124 static void mptcp_drop(struct sock
*sk
, struct sk_buff
*skb
)
126 sk_drops_add(sk
, skb
);
130 static bool mptcp_try_coalesce(struct sock
*sk
, struct sk_buff
*to
,
131 struct sk_buff
*from
)
136 if (MPTCP_SKB_CB(from
)->offset
||
137 !skb_try_coalesce(to
, from
, &fragstolen
, &delta
))
140 pr_debug("colesced seq %llx into %llx new len %d new end seq %llx",
141 MPTCP_SKB_CB(from
)->map_seq
, MPTCP_SKB_CB(to
)->map_seq
,
142 to
->len
, MPTCP_SKB_CB(from
)->end_seq
);
143 MPTCP_SKB_CB(to
)->end_seq
= MPTCP_SKB_CB(from
)->end_seq
;
144 kfree_skb_partial(from
, fragstolen
);
145 atomic_add(delta
, &sk
->sk_rmem_alloc
);
146 sk_mem_charge(sk
, delta
);
150 static bool mptcp_ooo_try_coalesce(struct mptcp_sock
*msk
, struct sk_buff
*to
,
151 struct sk_buff
*from
)
153 if (MPTCP_SKB_CB(from
)->map_seq
!= MPTCP_SKB_CB(to
)->end_seq
)
156 return mptcp_try_coalesce((struct sock
*)msk
, to
, from
);
159 /* "inspired" by tcp_data_queue_ofo(), main differences:
161 * - don't cope with sacks
163 static void mptcp_data_queue_ofo(struct mptcp_sock
*msk
, struct sk_buff
*skb
)
165 struct sock
*sk
= (struct sock
*)msk
;
166 struct rb_node
**p
, *parent
;
167 u64 seq
, end_seq
, max_seq
;
168 struct sk_buff
*skb1
;
170 seq
= MPTCP_SKB_CB(skb
)->map_seq
;
171 end_seq
= MPTCP_SKB_CB(skb
)->end_seq
;
172 max_seq
= READ_ONCE(msk
->rcv_wnd_sent
);
174 pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk
, seq
, max_seq
,
175 RB_EMPTY_ROOT(&msk
->out_of_order_queue
));
176 if (after64(end_seq
, max_seq
)) {
179 pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
180 (unsigned long long)end_seq
- (unsigned long)max_seq
,
181 (unsigned long long)msk
->rcv_wnd_sent
);
182 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_NODSSWINDOW
);
186 p
= &msk
->out_of_order_queue
.rb_node
;
187 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_OFOQUEUE
);
188 if (RB_EMPTY_ROOT(&msk
->out_of_order_queue
)) {
189 rb_link_node(&skb
->rbnode
, NULL
, p
);
190 rb_insert_color(&skb
->rbnode
, &msk
->out_of_order_queue
);
191 msk
->ooo_last_skb
= skb
;
195 /* with 2 subflows, adding at end of ooo queue is quite likely
196 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
198 if (mptcp_ooo_try_coalesce(msk
, msk
->ooo_last_skb
, skb
)) {
199 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_OFOMERGE
);
200 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_OFOQUEUETAIL
);
204 /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
205 if (!before64(seq
, MPTCP_SKB_CB(msk
->ooo_last_skb
)->end_seq
)) {
206 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_OFOQUEUETAIL
);
207 parent
= &msk
->ooo_last_skb
->rbnode
;
208 p
= &parent
->rb_right
;
212 /* Find place to insert this segment. Handle overlaps on the way. */
216 skb1
= rb_to_skb(parent
);
217 if (before64(seq
, MPTCP_SKB_CB(skb1
)->map_seq
)) {
218 p
= &parent
->rb_left
;
221 if (before64(seq
, MPTCP_SKB_CB(skb1
)->end_seq
)) {
222 if (!after64(end_seq
, MPTCP_SKB_CB(skb1
)->end_seq
)) {
223 /* All the bits are present. Drop. */
225 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_DUPDATA
);
228 if (after64(seq
, MPTCP_SKB_CB(skb1
)->map_seq
)) {
232 * continue traversing
235 /* skb's seq == skb1's seq and skb covers skb1.
236 * Replace skb1 with skb.
238 rb_replace_node(&skb1
->rbnode
, &skb
->rbnode
,
239 &msk
->out_of_order_queue
);
240 mptcp_drop(sk
, skb1
);
241 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_DUPDATA
);
244 } else if (mptcp_ooo_try_coalesce(msk
, skb1
, skb
)) {
245 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_OFOMERGE
);
248 p
= &parent
->rb_right
;
252 /* Insert segment into RB tree. */
253 rb_link_node(&skb
->rbnode
, parent
, p
);
254 rb_insert_color(&skb
->rbnode
, &msk
->out_of_order_queue
);
257 /* Remove other segments covered by skb. */
258 while ((skb1
= skb_rb_next(skb
)) != NULL
) {
259 if (before64(end_seq
, MPTCP_SKB_CB(skb1
)->end_seq
))
261 rb_erase(&skb1
->rbnode
, &msk
->out_of_order_queue
);
262 mptcp_drop(sk
, skb1
);
263 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_DUPDATA
);
265 /* If there is no skb after us, we are the last_skb ! */
267 msk
->ooo_last_skb
= skb
;
271 skb_set_owner_r(skb
, sk
);
274 static bool __mptcp_move_skb(struct mptcp_sock
*msk
, struct sock
*ssk
,
275 struct sk_buff
*skb
, unsigned int offset
,
278 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(ssk
);
279 struct sock
*sk
= (struct sock
*)msk
;
280 struct sk_buff
*tail
;
282 __skb_unlink(skb
, &ssk
->sk_receive_queue
);
287 /* try to fetch required memory from subflow */
288 if (!sk_rmem_schedule(sk
, skb
, skb
->truesize
)) {
289 if (ssk
->sk_forward_alloc
< skb
->truesize
)
291 __sk_mem_reclaim(ssk
, skb
->truesize
);
292 if (!sk_rmem_schedule(sk
, skb
, skb
->truesize
))
296 /* the skb map_seq accounts for the skb offset:
297 * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
300 MPTCP_SKB_CB(skb
)->map_seq
= mptcp_subflow_get_mapped_dsn(subflow
);
301 MPTCP_SKB_CB(skb
)->end_seq
= MPTCP_SKB_CB(skb
)->map_seq
+ copy_len
;
302 MPTCP_SKB_CB(skb
)->offset
= offset
;
304 if (MPTCP_SKB_CB(skb
)->map_seq
== msk
->ack_seq
) {
306 WRITE_ONCE(msk
->ack_seq
, msk
->ack_seq
+ copy_len
);
307 tail
= skb_peek_tail(&sk
->sk_receive_queue
);
308 if (tail
&& mptcp_try_coalesce(sk
, tail
, skb
))
311 skb_set_owner_r(skb
, sk
);
312 __skb_queue_tail(&sk
->sk_receive_queue
, skb
);
314 } else if (after64(MPTCP_SKB_CB(skb
)->map_seq
, msk
->ack_seq
)) {
315 mptcp_data_queue_ofo(msk
, skb
);
319 /* old data, keep it simple and drop the whole pkt, sender
320 * will retransmit as needed, if needed.
322 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_DUPDATA
);
328 static void mptcp_stop_timer(struct sock
*sk
)
330 struct inet_connection_sock
*icsk
= inet_csk(sk
);
332 sk_stop_timer(sk
, &icsk
->icsk_retransmit_timer
);
333 mptcp_sk(sk
)->timer_ival
= 0;
336 static void mptcp_close_wake_up(struct sock
*sk
)
338 if (sock_flag(sk
, SOCK_DEAD
))
341 sk
->sk_state_change(sk
);
342 if (sk
->sk_shutdown
== SHUTDOWN_MASK
||
343 sk
->sk_state
== TCP_CLOSE
)
344 sk_wake_async(sk
, SOCK_WAKE_WAITD
, POLL_HUP
);
346 sk_wake_async(sk
, SOCK_WAKE_WAITD
, POLL_IN
);
349 static bool mptcp_pending_data_fin_ack(struct sock
*sk
)
351 struct mptcp_sock
*msk
= mptcp_sk(sk
);
353 return !__mptcp_check_fallback(msk
) &&
354 ((1 << sk
->sk_state
) &
355 (TCPF_FIN_WAIT1
| TCPF_CLOSING
| TCPF_LAST_ACK
)) &&
356 msk
->write_seq
== READ_ONCE(msk
->snd_una
);
359 static void mptcp_check_data_fin_ack(struct sock
*sk
)
361 struct mptcp_sock
*msk
= mptcp_sk(sk
);
363 /* Look for an acknowledged DATA_FIN */
364 if (mptcp_pending_data_fin_ack(sk
)) {
365 WRITE_ONCE(msk
->snd_data_fin_enable
, 0);
367 switch (sk
->sk_state
) {
369 inet_sk_state_store(sk
, TCP_FIN_WAIT2
);
373 inet_sk_state_store(sk
, TCP_CLOSE
);
377 mptcp_close_wake_up(sk
);
381 static bool mptcp_pending_data_fin(struct sock
*sk
, u64
*seq
)
383 struct mptcp_sock
*msk
= mptcp_sk(sk
);
385 if (READ_ONCE(msk
->rcv_data_fin
) &&
386 ((1 << sk
->sk_state
) &
387 (TCPF_ESTABLISHED
| TCPF_FIN_WAIT1
| TCPF_FIN_WAIT2
))) {
388 u64 rcv_data_fin_seq
= READ_ONCE(msk
->rcv_data_fin_seq
);
390 if (msk
->ack_seq
== rcv_data_fin_seq
) {
392 *seq
= rcv_data_fin_seq
;
401 static void mptcp_set_timeout(const struct sock
*sk
, const struct sock
*ssk
)
403 long tout
= ssk
&& inet_csk(ssk
)->icsk_pending
?
404 inet_csk(ssk
)->icsk_timeout
- jiffies
: 0;
407 tout
= mptcp_sk(sk
)->timer_ival
;
408 mptcp_sk(sk
)->timer_ival
= tout
> 0 ? tout
: TCP_RTO_MIN
;
411 static bool mptcp_subflow_active(struct mptcp_subflow_context
*subflow
)
413 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
415 /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
416 if (subflow
->request_join
&& !subflow
->fully_established
)
419 /* only send if our side has not closed yet */
420 return ((1 << ssk
->sk_state
) & (TCPF_ESTABLISHED
| TCPF_CLOSE_WAIT
));
423 static bool tcp_can_send_ack(const struct sock
*ssk
)
425 return !((1 << inet_sk_state_load(ssk
)) &
426 (TCPF_SYN_SENT
| TCPF_SYN_RECV
| TCPF_TIME_WAIT
| TCPF_CLOSE
| TCPF_LISTEN
));
429 static void mptcp_send_ack(struct mptcp_sock
*msk
)
431 struct mptcp_subflow_context
*subflow
;
433 mptcp_for_each_subflow(msk
, subflow
) {
434 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
437 if (tcp_can_send_ack(ssk
))
443 static bool mptcp_subflow_cleanup_rbuf(struct sock
*ssk
)
448 ret
= tcp_can_send_ack(ssk
);
450 tcp_cleanup_rbuf(ssk
, 1);
455 static void mptcp_cleanup_rbuf(struct mptcp_sock
*msk
)
457 struct sock
*ack_hint
= READ_ONCE(msk
->ack_hint
);
458 struct mptcp_subflow_context
*subflow
;
460 /* if the hinted ssk is still active, try to use it */
461 if (likely(ack_hint
)) {
462 mptcp_for_each_subflow(msk
, subflow
) {
463 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
465 if (ack_hint
== ssk
&& mptcp_subflow_cleanup_rbuf(ssk
))
470 /* otherwise pick the first active subflow */
471 mptcp_for_each_subflow(msk
, subflow
)
472 if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow
)))
476 static bool mptcp_check_data_fin(struct sock
*sk
)
478 struct mptcp_sock
*msk
= mptcp_sk(sk
);
479 u64 rcv_data_fin_seq
;
482 if (__mptcp_check_fallback(msk
) || !msk
->first
)
485 /* Need to ack a DATA_FIN received from a peer while this side
486 * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
487 * msk->rcv_data_fin was set when parsing the incoming options
488 * at the subflow level and the msk lock was not held, so this
489 * is the first opportunity to act on the DATA_FIN and change
492 * If we are caught up to the sequence number of the incoming
493 * DATA_FIN, send the DATA_ACK now and do state transition. If
494 * not caught up, do nothing and let the recv code send DATA_ACK
498 if (mptcp_pending_data_fin(sk
, &rcv_data_fin_seq
)) {
499 WRITE_ONCE(msk
->ack_seq
, msk
->ack_seq
+ 1);
500 WRITE_ONCE(msk
->rcv_data_fin
, 0);
502 sk
->sk_shutdown
|= RCV_SHUTDOWN
;
503 smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
504 set_bit(MPTCP_DATA_READY
, &msk
->flags
);
506 switch (sk
->sk_state
) {
507 case TCP_ESTABLISHED
:
508 inet_sk_state_store(sk
, TCP_CLOSE_WAIT
);
511 inet_sk_state_store(sk
, TCP_CLOSING
);
514 inet_sk_state_store(sk
, TCP_CLOSE
);
517 /* Other states not expected */
523 mptcp_set_timeout(sk
, NULL
);
525 mptcp_close_wake_up(sk
);
530 static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock
*msk
,
534 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(ssk
);
535 struct sock
*sk
= (struct sock
*)msk
;
536 unsigned int moved
= 0;
537 bool more_data_avail
;
542 sk_rbuf
= READ_ONCE(sk
->sk_rcvbuf
);
544 if (!(sk
->sk_userlocks
& SOCK_RCVBUF_LOCK
)) {
545 int ssk_rbuf
= READ_ONCE(ssk
->sk_rcvbuf
);
547 if (unlikely(ssk_rbuf
> sk_rbuf
)) {
548 WRITE_ONCE(sk
->sk_rcvbuf
, ssk_rbuf
);
553 pr_debug("msk=%p ssk=%p", msk
, ssk
);
556 u32 map_remaining
, offset
;
557 u32 seq
= tp
->copied_seq
;
561 /* try to move as much data as available */
562 map_remaining
= subflow
->map_data_len
-
563 mptcp_subflow_get_map_offset(subflow
);
565 skb
= skb_peek(&ssk
->sk_receive_queue
);
567 /* if no data is found, a racing workqueue/recvmsg
568 * already processed the new data, stop here or we
569 * can enter an infinite loop
576 if (__mptcp_check_fallback(msk
)) {
577 /* if we are running under the workqueue, TCP could have
578 * collapsed skbs between dummy map creation and now
579 * be sure to adjust the size
581 map_remaining
= skb
->len
;
582 subflow
->map_data_len
= skb
->len
;
585 offset
= seq
- TCP_SKB_CB(skb
)->seq
;
586 fin
= TCP_SKB_CB(skb
)->tcp_flags
& TCPHDR_FIN
;
592 if (offset
< skb
->len
) {
593 size_t len
= skb
->len
- offset
;
598 if (__mptcp_move_skb(msk
, ssk
, skb
, offset
, len
))
602 if (WARN_ON_ONCE(map_remaining
< len
))
606 sk_eat_skb(ssk
, skb
);
610 WRITE_ONCE(tp
->copied_seq
, seq
);
611 more_data_avail
= mptcp_subflow_data_available(ssk
);
613 if (atomic_read(&sk
->sk_rmem_alloc
) > sk_rbuf
) {
617 } while (more_data_avail
);
618 WRITE_ONCE(msk
->ack_hint
, ssk
);
624 static bool __mptcp_ofo_queue(struct mptcp_sock
*msk
)
626 struct sock
*sk
= (struct sock
*)msk
;
627 struct sk_buff
*skb
, *tail
;
632 p
= rb_first(&msk
->out_of_order_queue
);
633 pr_debug("msk=%p empty=%d", msk
, RB_EMPTY_ROOT(&msk
->out_of_order_queue
));
636 if (after64(MPTCP_SKB_CB(skb
)->map_seq
, msk
->ack_seq
))
640 rb_erase(&skb
->rbnode
, &msk
->out_of_order_queue
);
642 if (unlikely(!after64(MPTCP_SKB_CB(skb
)->end_seq
,
645 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_DUPDATA
);
649 end_seq
= MPTCP_SKB_CB(skb
)->end_seq
;
650 tail
= skb_peek_tail(&sk
->sk_receive_queue
);
651 if (!tail
|| !mptcp_ooo_try_coalesce(msk
, tail
, skb
)) {
652 int delta
= msk
->ack_seq
- MPTCP_SKB_CB(skb
)->map_seq
;
654 /* skip overlapping data, if any */
655 pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
656 MPTCP_SKB_CB(skb
)->map_seq
, msk
->ack_seq
,
658 MPTCP_SKB_CB(skb
)->offset
+= delta
;
659 __skb_queue_tail(&sk
->sk_receive_queue
, skb
);
661 msk
->ack_seq
= end_seq
;
667 /* In most cases we will be able to lock the mptcp socket. If its already
668 * owned, we need to defer to the work queue to avoid ABBA deadlock.
670 static void move_skbs_to_msk(struct mptcp_sock
*msk
, struct sock
*ssk
)
672 struct sock
*sk
= (struct sock
*)msk
;
673 unsigned int moved
= 0;
675 if (inet_sk_state_load(sk
) == TCP_CLOSE
)
680 __mptcp_move_skbs_from_subflow(msk
, ssk
, &moved
);
681 __mptcp_ofo_queue(msk
);
683 /* If the moves have caught up with the DATA_FIN sequence number
684 * it's time to ack the DATA_FIN and change socket state, but
685 * this is not a good place to change state. Let the workqueue
688 if (mptcp_pending_data_fin(sk
, NULL
))
689 mptcp_schedule_work(sk
);
690 mptcp_data_unlock(sk
);
693 void mptcp_data_ready(struct sock
*sk
, struct sock
*ssk
)
695 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(ssk
);
696 struct mptcp_sock
*msk
= mptcp_sk(sk
);
697 int sk_rbuf
, ssk_rbuf
;
700 /* The peer can send data while we are shutting down this
701 * subflow at msk destruction time, but we must avoid enqueuing
702 * more data to the msk receive queue
704 if (unlikely(subflow
->disposable
))
707 /* move_skbs_to_msk below can legitly clear the data_avail flag,
708 * but we will need later to properly woke the reader, cache its
711 wake
= subflow
->data_avail
== MPTCP_SUBFLOW_DATA_AVAIL
;
713 set_bit(MPTCP_DATA_READY
, &msk
->flags
);
715 ssk_rbuf
= READ_ONCE(ssk
->sk_rcvbuf
);
716 sk_rbuf
= READ_ONCE(sk
->sk_rcvbuf
);
717 if (unlikely(ssk_rbuf
> sk_rbuf
))
720 /* over limit? can't append more skbs to msk */
721 if (atomic_read(&sk
->sk_rmem_alloc
) > sk_rbuf
)
724 move_skbs_to_msk(msk
, ssk
);
728 sk
->sk_data_ready(sk
);
731 void __mptcp_flush_join_list(struct mptcp_sock
*msk
)
733 if (likely(list_empty(&msk
->join_list
)))
736 spin_lock_bh(&msk
->join_list_lock
);
737 list_splice_tail_init(&msk
->join_list
, &msk
->conn_list
);
738 spin_unlock_bh(&msk
->join_list_lock
);
741 static bool mptcp_timer_pending(struct sock
*sk
)
743 return timer_pending(&inet_csk(sk
)->icsk_retransmit_timer
);
746 static void mptcp_reset_timer(struct sock
*sk
)
748 struct inet_connection_sock
*icsk
= inet_csk(sk
);
751 /* prevent rescheduling on close */
752 if (unlikely(inet_sk_state_load(sk
) == TCP_CLOSE
))
755 /* should never be called with mptcp level timer cleared */
756 tout
= READ_ONCE(mptcp_sk(sk
)->timer_ival
);
757 if (WARN_ON_ONCE(!tout
))
759 sk_reset_timer(sk
, &icsk
->icsk_retransmit_timer
, jiffies
+ tout
);
762 bool mptcp_schedule_work(struct sock
*sk
)
764 if (inet_sk_state_load(sk
) != TCP_CLOSE
&&
765 schedule_work(&mptcp_sk(sk
)->work
)) {
766 /* each subflow already holds a reference to the sk, and the
767 * workqueue is invoked by a subflow, so sk can't go away here.
775 void mptcp_subflow_eof(struct sock
*sk
)
777 if (!test_and_set_bit(MPTCP_WORK_EOF
, &mptcp_sk(sk
)->flags
))
778 mptcp_schedule_work(sk
);
781 static void mptcp_check_for_eof(struct mptcp_sock
*msk
)
783 struct mptcp_subflow_context
*subflow
;
784 struct sock
*sk
= (struct sock
*)msk
;
787 mptcp_for_each_subflow(msk
, subflow
)
788 receivers
+= !subflow
->rx_eof
;
792 if (!(sk
->sk_shutdown
& RCV_SHUTDOWN
)) {
793 /* hopefully temporary hack: propagate shutdown status
794 * to msk, when all subflows agree on it
796 sk
->sk_shutdown
|= RCV_SHUTDOWN
;
798 smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
799 set_bit(MPTCP_DATA_READY
, &msk
->flags
);
800 sk
->sk_data_ready(sk
);
803 switch (sk
->sk_state
) {
804 case TCP_ESTABLISHED
:
805 inet_sk_state_store(sk
, TCP_CLOSE_WAIT
);
808 inet_sk_state_store(sk
, TCP_CLOSING
);
811 inet_sk_state_store(sk
, TCP_CLOSE
);
816 mptcp_close_wake_up(sk
);
819 static struct sock
*mptcp_subflow_recv_lookup(const struct mptcp_sock
*msk
)
821 struct mptcp_subflow_context
*subflow
;
822 struct sock
*sk
= (struct sock
*)msk
;
824 sock_owned_by_me(sk
);
826 mptcp_for_each_subflow(msk
, subflow
) {
827 if (subflow
->data_avail
)
828 return mptcp_subflow_tcp_sock(subflow
);
834 static bool mptcp_skb_can_collapse_to(u64 write_seq
,
835 const struct sk_buff
*skb
,
836 const struct mptcp_ext
*mpext
)
838 if (!tcp_skb_can_collapse_to(skb
))
841 /* can collapse only if MPTCP level sequence is in order and this
842 * mapping has not been xmitted yet
844 return mpext
&& mpext
->data_seq
+ mpext
->data_len
== write_seq
&&
848 static bool mptcp_frag_can_collapse_to(const struct mptcp_sock
*msk
,
849 const struct page_frag
*pfrag
,
850 const struct mptcp_data_frag
*df
)
852 return df
&& pfrag
->page
== df
->page
&&
853 pfrag
->size
- pfrag
->offset
> 0 &&
854 df
->data_seq
+ df
->data_len
== msk
->write_seq
;
857 static int mptcp_wmem_with_overhead(struct sock
*sk
, int size
)
859 struct mptcp_sock
*msk
= mptcp_sk(sk
);
862 ret
= size
+ ((sizeof(struct mptcp_data_frag
) * size
) >> PAGE_SHIFT
);
863 skbs
= (msk
->tx_pending_data
+ size
) / msk
->size_goal_cache
;
864 if (skbs
< msk
->skb_tx_cache
.qlen
)
867 return ret
+ (skbs
- msk
->skb_tx_cache
.qlen
) * SKB_TRUESIZE(MAX_TCP_HEADER
);
870 static void __mptcp_wmem_reserve(struct sock
*sk
, int size
)
872 int amount
= mptcp_wmem_with_overhead(sk
, size
);
873 struct mptcp_sock
*msk
= mptcp_sk(sk
);
875 WARN_ON_ONCE(msk
->wmem_reserved
);
876 if (WARN_ON_ONCE(amount
< 0))
879 if (amount
<= sk
->sk_forward_alloc
)
882 /* under memory pressure try to reserve at most a single page
883 * otherwise try to reserve the full estimate and fallback
884 * to a single page before entering the error path
886 if ((tcp_under_memory_pressure(sk
) && amount
> PAGE_SIZE
) ||
887 !sk_wmem_schedule(sk
, amount
)) {
888 if (amount
<= PAGE_SIZE
)
892 if (!sk_wmem_schedule(sk
, amount
))
897 msk
->wmem_reserved
= amount
;
898 sk
->sk_forward_alloc
-= amount
;
902 /* we will wait for memory on next allocation */
903 msk
->wmem_reserved
= -1;
906 static void __mptcp_update_wmem(struct sock
*sk
)
908 struct mptcp_sock
*msk
= mptcp_sk(sk
);
910 if (!msk
->wmem_reserved
)
913 if (msk
->wmem_reserved
< 0)
914 msk
->wmem_reserved
= 0;
915 if (msk
->wmem_reserved
> 0) {
916 sk
->sk_forward_alloc
+= msk
->wmem_reserved
;
917 msk
->wmem_reserved
= 0;
921 static bool mptcp_wmem_alloc(struct sock
*sk
, int size
)
923 struct mptcp_sock
*msk
= mptcp_sk(sk
);
925 /* check for pre-existing error condition */
926 if (msk
->wmem_reserved
< 0)
929 if (msk
->wmem_reserved
>= size
)
933 if (!sk_wmem_schedule(sk
, size
)) {
934 mptcp_data_unlock(sk
);
938 sk
->sk_forward_alloc
-= size
;
939 msk
->wmem_reserved
+= size
;
940 mptcp_data_unlock(sk
);
943 msk
->wmem_reserved
-= size
;
947 static void mptcp_wmem_uncharge(struct sock
*sk
, int size
)
949 struct mptcp_sock
*msk
= mptcp_sk(sk
);
951 if (msk
->wmem_reserved
< 0)
952 msk
->wmem_reserved
= 0;
953 msk
->wmem_reserved
+= size
;
956 static void mptcp_mem_reclaim_partial(struct sock
*sk
)
958 struct mptcp_sock
*msk
= mptcp_sk(sk
);
960 /* if we are experiencing a transint allocation error,
961 * the forward allocation memory has been already
964 if (msk
->wmem_reserved
< 0)
968 sk
->sk_forward_alloc
+= msk
->wmem_reserved
;
969 sk_mem_reclaim_partial(sk
);
970 msk
->wmem_reserved
= sk
->sk_forward_alloc
;
971 sk
->sk_forward_alloc
= 0;
972 mptcp_data_unlock(sk
);
975 static void dfrag_uncharge(struct sock
*sk
, int len
)
977 sk_mem_uncharge(sk
, len
);
978 sk_wmem_queued_add(sk
, -len
);
981 static void dfrag_clear(struct sock
*sk
, struct mptcp_data_frag
*dfrag
)
983 int len
= dfrag
->data_len
+ dfrag
->overhead
;
985 list_del(&dfrag
->list
);
986 dfrag_uncharge(sk
, len
);
987 put_page(dfrag
->page
);
990 static void __mptcp_clean_una(struct sock
*sk
)
992 struct mptcp_sock
*msk
= mptcp_sk(sk
);
993 struct mptcp_data_frag
*dtmp
, *dfrag
;
994 bool cleaned
= false;
997 /* on fallback we just need to ignore snd_una, as this is really
1000 if (__mptcp_check_fallback(msk
))
1001 msk
->snd_una
= READ_ONCE(msk
->snd_nxt
);
1003 snd_una
= msk
->snd_una
;
1004 list_for_each_entry_safe(dfrag
, dtmp
, &msk
->rtx_queue
, list
) {
1005 if (after64(dfrag
->data_seq
+ dfrag
->data_len
, snd_una
))
1008 if (WARN_ON_ONCE(dfrag
== msk
->first_pending
))
1010 dfrag_clear(sk
, dfrag
);
1014 dfrag
= mptcp_rtx_head(sk
);
1015 if (dfrag
&& after64(snd_una
, dfrag
->data_seq
)) {
1016 u64 delta
= snd_una
- dfrag
->data_seq
;
1018 if (WARN_ON_ONCE(delta
> dfrag
->already_sent
))
1021 dfrag
->data_seq
+= delta
;
1022 dfrag
->offset
+= delta
;
1023 dfrag
->data_len
-= delta
;
1024 dfrag
->already_sent
-= delta
;
1026 dfrag_uncharge(sk
, delta
);
1032 if (tcp_under_memory_pressure(sk
)) {
1033 __mptcp_update_wmem(sk
);
1034 sk_mem_reclaim_partial(sk
);
1037 if (sk_stream_is_writeable(sk
)) {
1038 /* pairs with memory barrier in mptcp_poll */
1040 if (test_and_clear_bit(MPTCP_NOSPACE
, &msk
->flags
))
1041 sk_stream_write_space(sk
);
1045 if (snd_una
== READ_ONCE(msk
->snd_nxt
)) {
1046 if (msk
->timer_ival
)
1047 mptcp_stop_timer(sk
);
1049 mptcp_reset_timer(sk
);
1053 static void mptcp_enter_memory_pressure(struct sock
*sk
)
1055 struct mptcp_subflow_context
*subflow
;
1056 struct mptcp_sock
*msk
= mptcp_sk(sk
);
1059 sk_stream_moderate_sndbuf(sk
);
1060 mptcp_for_each_subflow(msk
, subflow
) {
1061 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
1064 tcp_enter_memory_pressure(ssk
);
1065 sk_stream_moderate_sndbuf(ssk
);
1070 /* ensure we get enough memory for the frag hdr, beyond some minimal amount of
1073 static bool mptcp_page_frag_refill(struct sock
*sk
, struct page_frag
*pfrag
)
1075 if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag
),
1076 pfrag
, sk
->sk_allocation
)))
1079 mptcp_enter_memory_pressure(sk
);
1083 static struct mptcp_data_frag
*
1084 mptcp_carve_data_frag(const struct mptcp_sock
*msk
, struct page_frag
*pfrag
,
1087 int offset
= ALIGN(orig_offset
, sizeof(long));
1088 struct mptcp_data_frag
*dfrag
;
1090 dfrag
= (struct mptcp_data_frag
*)(page_to_virt(pfrag
->page
) + offset
);
1091 dfrag
->data_len
= 0;
1092 dfrag
->data_seq
= msk
->write_seq
;
1093 dfrag
->overhead
= offset
- orig_offset
+ sizeof(struct mptcp_data_frag
);
1094 dfrag
->offset
= offset
+ sizeof(struct mptcp_data_frag
);
1095 dfrag
->already_sent
= 0;
1096 dfrag
->page
= pfrag
->page
;
1101 struct mptcp_sendmsg_info
{
1109 static int mptcp_check_allowed_size(struct mptcp_sock
*msk
, u64 data_seq
,
1112 u64 window_end
= mptcp_wnd_end(msk
);
1114 if (__mptcp_check_fallback(msk
))
1117 if (!before64(data_seq
+ avail_size
, window_end
)) {
1118 u64 allowed_size
= window_end
- data_seq
;
1120 return min_t(unsigned int, allowed_size
, avail_size
);
1126 static bool __mptcp_add_ext(struct sk_buff
*skb
, gfp_t gfp
)
1128 struct skb_ext
*mpext
= __skb_ext_alloc(gfp
);
1132 __skb_ext_set(skb
, SKB_EXT_MPTCP
, mpext
);
1136 static struct sk_buff
*__mptcp_do_alloc_tx_skb(struct sock
*sk
, gfp_t gfp
)
1138 struct sk_buff
*skb
;
1140 skb
= alloc_skb_fclone(MAX_TCP_HEADER
, gfp
);
1142 if (likely(__mptcp_add_ext(skb
, gfp
))) {
1143 skb_reserve(skb
, MAX_TCP_HEADER
);
1144 skb
->reserved_tailroom
= skb
->end
- skb
->tail
;
1149 mptcp_enter_memory_pressure(sk
);
1154 static bool mptcp_tx_cache_refill(struct sock
*sk
, int size
,
1155 struct sk_buff_head
*skbs
, int *total_ts
)
1157 struct mptcp_sock
*msk
= mptcp_sk(sk
);
1158 struct sk_buff
*skb
;
1161 if (unlikely(tcp_under_memory_pressure(sk
))) {
1162 mptcp_mem_reclaim_partial(sk
);
1164 /* under pressure pre-allocate at most a single skb */
1165 if (msk
->skb_tx_cache
.qlen
)
1167 space_needed
= msk
->size_goal_cache
;
1169 space_needed
= msk
->tx_pending_data
+ size
-
1170 msk
->skb_tx_cache
.qlen
* msk
->size_goal_cache
;
1173 while (space_needed
> 0) {
1174 skb
= __mptcp_do_alloc_tx_skb(sk
, sk
->sk_allocation
);
1175 if (unlikely(!skb
)) {
1176 /* under memory pressure, try to pass the caller a
1177 * single skb to allow forward progress
1179 while (skbs
->qlen
> 1) {
1180 skb
= __skb_dequeue_tail(skbs
);
1181 *total_ts
-= skb
->truesize
;
1184 return skbs
->qlen
> 0;
1187 *total_ts
+= skb
->truesize
;
1188 __skb_queue_tail(skbs
, skb
);
1189 space_needed
-= msk
->size_goal_cache
;
1194 static bool __mptcp_alloc_tx_skb(struct sock
*sk
, struct sock
*ssk
, gfp_t gfp
)
1196 struct mptcp_sock
*msk
= mptcp_sk(sk
);
1197 struct sk_buff
*skb
;
1199 if (ssk
->sk_tx_skb_cache
) {
1200 skb
= ssk
->sk_tx_skb_cache
;
1201 if (unlikely(!skb_ext_find(skb
, SKB_EXT_MPTCP
) &&
1202 !__mptcp_add_ext(skb
, gfp
)))
1207 skb
= skb_peek(&msk
->skb_tx_cache
);
1209 if (likely(sk_wmem_schedule(ssk
, skb
->truesize
))) {
1210 skb
= __skb_dequeue(&msk
->skb_tx_cache
);
1211 if (WARN_ON_ONCE(!skb
))
1214 mptcp_wmem_uncharge(sk
, skb
->truesize
);
1215 ssk
->sk_tx_skb_cache
= skb
;
1219 /* over memory limit, no point to try to allocate a new skb */
1223 skb
= __mptcp_do_alloc_tx_skb(sk
, gfp
);
1227 if (likely(sk_wmem_schedule(ssk
, skb
->truesize
))) {
1228 ssk
->sk_tx_skb_cache
= skb
;
1235 static bool mptcp_must_reclaim_memory(struct sock
*sk
, struct sock
*ssk
)
1237 return !ssk
->sk_tx_skb_cache
&&
1238 !skb_peek(&mptcp_sk(sk
)->skb_tx_cache
) &&
1239 tcp_under_memory_pressure(sk
);
1242 static bool mptcp_alloc_tx_skb(struct sock
*sk
, struct sock
*ssk
)
1244 if (unlikely(mptcp_must_reclaim_memory(sk
, ssk
)))
1245 mptcp_mem_reclaim_partial(sk
);
1246 return __mptcp_alloc_tx_skb(sk
, ssk
, sk
->sk_allocation
);
1249 static int mptcp_sendmsg_frag(struct sock
*sk
, struct sock
*ssk
,
1250 struct mptcp_data_frag
*dfrag
,
1251 struct mptcp_sendmsg_info
*info
)
1253 u64 data_seq
= dfrag
->data_seq
+ info
->sent
;
1254 struct mptcp_sock
*msk
= mptcp_sk(sk
);
1255 bool zero_window_probe
= false;
1256 struct mptcp_ext
*mpext
= NULL
;
1257 struct sk_buff
*skb
, *tail
;
1258 bool can_collapse
= false;
1263 pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d",
1264 msk
, ssk
, dfrag
->data_seq
, dfrag
->data_len
, info
->sent
);
1266 /* compute send limit */
1267 info
->mss_now
= tcp_send_mss(ssk
, &info
->size_goal
, info
->flags
);
1268 avail_size
= info
->size_goal
;
1269 msk
->size_goal_cache
= info
->size_goal
;
1270 skb
= tcp_write_queue_tail(ssk
);
1272 /* Limit the write to the size available in the
1273 * current skb, if any, so that we create at most a new skb.
1274 * Explicitly tells TCP internals to avoid collapsing on later
1275 * queue management operation, to avoid breaking the ext <->
1276 * SSN association set here
1278 mpext
= skb_ext_find(skb
, SKB_EXT_MPTCP
);
1279 can_collapse
= (info
->size_goal
- skb
->len
> 0) &&
1280 mptcp_skb_can_collapse_to(data_seq
, skb
, mpext
);
1281 if (!can_collapse
) {
1282 TCP_SKB_CB(skb
)->eor
= 1;
1284 size_bias
= skb
->len
;
1285 avail_size
= info
->size_goal
- skb
->len
;
1289 /* Zero window and all data acked? Probe. */
1290 avail_size
= mptcp_check_allowed_size(msk
, data_seq
, avail_size
);
1291 if (avail_size
== 0) {
1292 u64 snd_una
= READ_ONCE(msk
->snd_una
);
1294 if (skb
|| snd_una
!= msk
->snd_nxt
)
1296 zero_window_probe
= true;
1297 data_seq
= snd_una
- 1;
1301 if (WARN_ON_ONCE(info
->sent
> info
->limit
||
1302 info
->limit
> dfrag
->data_len
))
1305 ret
= info
->limit
- info
->sent
;
1306 tail
= tcp_build_frag(ssk
, avail_size
+ size_bias
, info
->flags
,
1307 dfrag
->page
, dfrag
->offset
+ info
->sent
, &ret
);
1309 tcp_remove_empty_skb(sk
, tcp_write_queue_tail(ssk
));
1313 /* if the tail skb is still the cached one, collapsing really happened.
1316 TCP_SKB_CB(tail
)->tcp_flags
&= ~TCPHDR_PSH
;
1317 mpext
->data_len
+= ret
;
1318 WARN_ON_ONCE(!can_collapse
);
1319 WARN_ON_ONCE(zero_window_probe
);
1323 mpext
= skb_ext_find(tail
, SKB_EXT_MPTCP
);
1324 if (WARN_ON_ONCE(!mpext
)) {
1325 /* should never reach here, stream corrupted */
1329 memset(mpext
, 0, sizeof(*mpext
));
1330 mpext
->data_seq
= data_seq
;
1331 mpext
->subflow_seq
= mptcp_subflow_ctx(ssk
)->rel_write_seq
;
1332 mpext
->data_len
= ret
;
1336 pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
1337 mpext
->data_seq
, mpext
->subflow_seq
, mpext
->data_len
,
1340 if (zero_window_probe
) {
1341 mptcp_subflow_ctx(ssk
)->rel_write_seq
+= ret
;
1344 tcp_push_pending_frames(ssk
);
1347 mptcp_subflow_ctx(ssk
)->rel_write_seq
+= ret
;
1351 #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \
1352 sizeof(struct tcphdr) - \
1353 MAX_TCP_OPTION_SPACE - \
1354 sizeof(struct ipv6hdr) - \
1355 sizeof(struct frag_hdr))
1357 struct subflow_send_info
{
1362 static struct sock
*mptcp_subflow_get_send(struct mptcp_sock
*msk
,
1365 struct subflow_send_info send_info
[2];
1366 struct mptcp_subflow_context
*subflow
;
1367 int i
, nr_active
= 0;
1372 sock_owned_by_me((struct sock
*)msk
);
1375 if (__mptcp_check_fallback(msk
)) {
1378 *sndbuf
= msk
->first
->sk_sndbuf
;
1379 return sk_stream_memory_free(msk
->first
) ? msk
->first
: NULL
;
1382 /* re-use last subflow, if the burst allow that */
1383 if (msk
->last_snd
&& msk
->snd_burst
> 0 &&
1384 sk_stream_memory_free(msk
->last_snd
) &&
1385 mptcp_subflow_active(mptcp_subflow_ctx(msk
->last_snd
))) {
1386 mptcp_for_each_subflow(msk
, subflow
) {
1387 ssk
= mptcp_subflow_tcp_sock(subflow
);
1388 *sndbuf
= max(tcp_sk(ssk
)->snd_wnd
, *sndbuf
);
1390 return msk
->last_snd
;
1393 /* pick the subflow with the lower wmem/wspace ratio */
1394 for (i
= 0; i
< 2; ++i
) {
1395 send_info
[i
].ssk
= NULL
;
1396 send_info
[i
].ratio
= -1;
1398 mptcp_for_each_subflow(msk
, subflow
) {
1399 ssk
= mptcp_subflow_tcp_sock(subflow
);
1400 if (!mptcp_subflow_active(subflow
))
1403 nr_active
+= !subflow
->backup
;
1404 *sndbuf
= max(tcp_sk(ssk
)->snd_wnd
, *sndbuf
);
1405 if (!sk_stream_memory_free(subflow
->tcp_sock
))
1408 pace
= READ_ONCE(ssk
->sk_pacing_rate
);
1412 ratio
= div_u64((u64
)READ_ONCE(ssk
->sk_wmem_queued
) << 32,
1414 if (ratio
< send_info
[subflow
->backup
].ratio
) {
1415 send_info
[subflow
->backup
].ssk
= ssk
;
1416 send_info
[subflow
->backup
].ratio
= ratio
;
1420 pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
1421 msk
, nr_active
, send_info
[0].ssk
, send_info
[0].ratio
,
1422 send_info
[1].ssk
, send_info
[1].ratio
);
1424 /* pick the best backup if no other subflow is active */
1426 send_info
[0].ssk
= send_info
[1].ssk
;
1428 if (send_info
[0].ssk
) {
1429 msk
->last_snd
= send_info
[0].ssk
;
1430 msk
->snd_burst
= min_t(int, MPTCP_SEND_BURST_SIZE
,
1431 sk_stream_wspace(msk
->last_snd
));
1432 return msk
->last_snd
;
1437 static void mptcp_push_release(struct sock
*sk
, struct sock
*ssk
,
1438 struct mptcp_sendmsg_info
*info
)
1440 mptcp_set_timeout(sk
, ssk
);
1441 tcp_push(ssk
, 0, info
->mss_now
, tcp_sk(ssk
)->nonagle
, info
->size_goal
);
1445 static void __mptcp_push_pending(struct sock
*sk
, unsigned int flags
)
1447 struct sock
*prev_ssk
= NULL
, *ssk
= NULL
;
1448 struct mptcp_sock
*msk
= mptcp_sk(sk
);
1449 struct mptcp_sendmsg_info info
= {
1452 struct mptcp_data_frag
*dfrag
;
1453 int len
, copied
= 0;
1456 while ((dfrag
= mptcp_send_head(sk
))) {
1457 info
.sent
= dfrag
->already_sent
;
1458 info
.limit
= dfrag
->data_len
;
1459 len
= dfrag
->data_len
- dfrag
->already_sent
;
1464 __mptcp_flush_join_list(msk
);
1465 ssk
= mptcp_subflow_get_send(msk
, &sndbuf
);
1467 /* do auto tuning */
1468 if (!(sk
->sk_userlocks
& SOCK_SNDBUF_LOCK
) &&
1469 sndbuf
> READ_ONCE(sk
->sk_sndbuf
))
1470 WRITE_ONCE(sk
->sk_sndbuf
, sndbuf
);
1472 /* try to keep the subflow socket lock across
1473 * consecutive xmit on the same socket
1475 if (ssk
!= prev_ssk
&& prev_ssk
)
1476 mptcp_push_release(sk
, prev_ssk
, &info
);
1480 if (ssk
!= prev_ssk
|| !prev_ssk
)
1483 /* keep it simple and always provide a new skb for the
1484 * subflow, even if we will not use it when collapsing
1485 * on the pending one
1487 if (!mptcp_alloc_tx_skb(sk
, ssk
)) {
1488 mptcp_push_release(sk
, ssk
, &info
);
1492 ret
= mptcp_sendmsg_frag(sk
, ssk
, dfrag
, &info
);
1494 mptcp_push_release(sk
, ssk
, &info
);
1499 dfrag
->already_sent
+= ret
;
1500 msk
->snd_nxt
+= ret
;
1501 msk
->snd_burst
-= ret
;
1502 msk
->tx_pending_data
-= ret
;
1506 WRITE_ONCE(msk
->first_pending
, mptcp_send_next(sk
));
1509 /* at this point we held the socket lock for the last subflow we used */
1511 mptcp_push_release(sk
, ssk
, &info
);
1515 /* start the timer, if it's not pending */
1516 if (!mptcp_timer_pending(sk
))
1517 mptcp_reset_timer(sk
);
1518 __mptcp_check_send_data_fin(sk
);
1522 static void __mptcp_subflow_push_pending(struct sock
*sk
, struct sock
*ssk
)
1524 struct mptcp_sock
*msk
= mptcp_sk(sk
);
1525 struct mptcp_sendmsg_info info
;
1526 struct mptcp_data_frag
*dfrag
;
1527 int len
, copied
= 0;
1530 while ((dfrag
= mptcp_send_head(sk
))) {
1531 info
.sent
= dfrag
->already_sent
;
1532 info
.limit
= dfrag
->data_len
;
1533 len
= dfrag
->data_len
- dfrag
->already_sent
;
1537 /* do auto tuning */
1538 if (!(sk
->sk_userlocks
& SOCK_SNDBUF_LOCK
) &&
1539 ssk
->sk_sndbuf
> READ_ONCE(sk
->sk_sndbuf
))
1540 WRITE_ONCE(sk
->sk_sndbuf
, ssk
->sk_sndbuf
);
1542 if (unlikely(mptcp_must_reclaim_memory(sk
, ssk
))) {
1543 __mptcp_update_wmem(sk
);
1544 sk_mem_reclaim_partial(sk
);
1546 if (!__mptcp_alloc_tx_skb(sk
, ssk
, GFP_ATOMIC
))
1549 ret
= mptcp_sendmsg_frag(sk
, ssk
, dfrag
, &info
);
1554 dfrag
->already_sent
+= ret
;
1555 msk
->snd_nxt
+= ret
;
1556 msk
->snd_burst
-= ret
;
1557 msk
->tx_pending_data
-= ret
;
1561 WRITE_ONCE(msk
->first_pending
, mptcp_send_next(sk
));
1565 /* __mptcp_alloc_tx_skb could have released some wmem and we are
1566 * not going to flush it via release_sock()
1568 __mptcp_update_wmem(sk
);
1570 mptcp_set_timeout(sk
, ssk
);
1571 tcp_push(ssk
, 0, info
.mss_now
, tcp_sk(ssk
)->nonagle
,
1573 if (!mptcp_timer_pending(sk
))
1574 mptcp_reset_timer(sk
);
1576 if (msk
->snd_data_fin_enable
&&
1577 msk
->snd_nxt
+ 1 == msk
->write_seq
)
1578 mptcp_schedule_work(sk
);
1582 static int mptcp_sendmsg(struct sock
*sk
, struct msghdr
*msg
, size_t len
)
1584 struct mptcp_sock
*msk
= mptcp_sk(sk
);
1585 struct page_frag
*pfrag
;
1590 if (msg
->msg_flags
& ~(MSG_MORE
| MSG_DONTWAIT
| MSG_NOSIGNAL
))
1593 mptcp_lock_sock(sk
, __mptcp_wmem_reserve(sk
, min_t(size_t, 1 << 20, len
)));
1595 timeo
= sock_sndtimeo(sk
, msg
->msg_flags
& MSG_DONTWAIT
);
1597 if ((1 << sk
->sk_state
) & ~(TCPF_ESTABLISHED
| TCPF_CLOSE_WAIT
)) {
1598 ret
= sk_stream_wait_connect(sk
, &timeo
);
1603 pfrag
= sk_page_frag(sk
);
1605 while (msg_data_left(msg
)) {
1606 int total_ts
, frag_truesize
= 0;
1607 struct mptcp_data_frag
*dfrag
;
1608 struct sk_buff_head skbs
;
1609 bool dfrag_collapsed
;
1610 size_t psize
, offset
;
1612 if (sk
->sk_err
|| (sk
->sk_shutdown
& SEND_SHUTDOWN
)) {
1617 /* reuse tail pfrag, if possible, or carve a new one from the
1620 dfrag
= mptcp_pending_tail(sk
);
1621 dfrag_collapsed
= mptcp_frag_can_collapse_to(msk
, pfrag
, dfrag
);
1622 if (!dfrag_collapsed
) {
1623 if (!sk_stream_memory_free(sk
))
1624 goto wait_for_memory
;
1626 if (!mptcp_page_frag_refill(sk
, pfrag
))
1627 goto wait_for_memory
;
1629 dfrag
= mptcp_carve_data_frag(msk
, pfrag
, pfrag
->offset
);
1630 frag_truesize
= dfrag
->overhead
;
1633 /* we do not bound vs wspace, to allow a single packet.
1634 * memory accounting will prevent execessive memory usage
1637 offset
= dfrag
->offset
+ dfrag
->data_len
;
1638 psize
= pfrag
->size
- offset
;
1639 psize
= min_t(size_t, psize
, msg_data_left(msg
));
1640 total_ts
= psize
+ frag_truesize
;
1641 __skb_queue_head_init(&skbs
);
1642 if (!mptcp_tx_cache_refill(sk
, psize
, &skbs
, &total_ts
))
1643 goto wait_for_memory
;
1645 if (!mptcp_wmem_alloc(sk
, total_ts
)) {
1646 __skb_queue_purge(&skbs
);
1647 goto wait_for_memory
;
1650 skb_queue_splice_tail(&skbs
, &msk
->skb_tx_cache
);
1651 if (copy_page_from_iter(dfrag
->page
, offset
, psize
,
1652 &msg
->msg_iter
) != psize
) {
1653 mptcp_wmem_uncharge(sk
, psize
+ frag_truesize
);
1658 /* data successfully copied into the write queue */
1660 dfrag
->data_len
+= psize
;
1661 frag_truesize
+= psize
;
1662 pfrag
->offset
+= frag_truesize
;
1663 WRITE_ONCE(msk
->write_seq
, msk
->write_seq
+ psize
);
1664 msk
->tx_pending_data
+= psize
;
1666 /* charge data on mptcp pending queue to the msk socket
1667 * Note: we charge such data both to sk and ssk
1669 sk_wmem_queued_add(sk
, frag_truesize
);
1670 if (!dfrag_collapsed
) {
1671 get_page(dfrag
->page
);
1672 list_add_tail(&dfrag
->list
, &msk
->rtx_queue
);
1673 if (!msk
->first_pending
)
1674 WRITE_ONCE(msk
->first_pending
, dfrag
);
1676 pr_debug("msk=%p dfrag at seq=%lld len=%d sent=%d new=%d", msk
,
1677 dfrag
->data_seq
, dfrag
->data_len
, dfrag
->already_sent
,
1683 set_bit(MPTCP_NOSPACE
, &msk
->flags
);
1684 __mptcp_push_pending(sk
, msg
->msg_flags
);
1685 ret
= sk_stream_wait_memory(sk
, &timeo
);
1691 __mptcp_push_pending(sk
, msg
->msg_flags
);
1695 return copied
? : ret
;
1698 static void mptcp_wait_data(struct sock
*sk
, long *timeo
)
1700 DEFINE_WAIT_FUNC(wait
, woken_wake_function
);
1701 struct mptcp_sock
*msk
= mptcp_sk(sk
);
1703 add_wait_queue(sk_sleep(sk
), &wait
);
1704 sk_set_bit(SOCKWQ_ASYNC_WAITDATA
, sk
);
1706 sk_wait_event(sk
, timeo
,
1707 test_and_clear_bit(MPTCP_DATA_READY
, &msk
->flags
), &wait
);
1709 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA
, sk
);
1710 remove_wait_queue(sk_sleep(sk
), &wait
);
1713 static int __mptcp_recvmsg_mskq(struct mptcp_sock
*msk
,
1717 struct sk_buff
*skb
;
1720 while ((skb
= skb_peek(&msk
->receive_queue
)) != NULL
) {
1721 u32 offset
= MPTCP_SKB_CB(skb
)->offset
;
1722 u32 data_len
= skb
->len
- offset
;
1723 u32 count
= min_t(size_t, len
- copied
, data_len
);
1726 err
= skb_copy_datagram_msg(skb
, offset
, msg
, count
);
1727 if (unlikely(err
< 0)) {
1735 if (count
< data_len
) {
1736 MPTCP_SKB_CB(skb
)->offset
+= count
;
1740 /* we will bulk release the skb memory later */
1741 skb
->destructor
= NULL
;
1742 msk
->rmem_released
+= skb
->truesize
;
1743 __skb_unlink(skb
, &msk
->receive_queue
);
1753 /* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
1755 * Only difference: Use highest rtt estimate of the subflows in use.
1757 static void mptcp_rcv_space_adjust(struct mptcp_sock
*msk
, int copied
)
1759 struct mptcp_subflow_context
*subflow
;
1760 struct sock
*sk
= (struct sock
*)msk
;
1761 u32 time
, advmss
= 1;
1764 sock_owned_by_me(sk
);
1769 msk
->rcvq_space
.copied
+= copied
;
1771 mstamp
= div_u64(tcp_clock_ns(), NSEC_PER_USEC
);
1772 time
= tcp_stamp_us_delta(mstamp
, msk
->rcvq_space
.time
);
1774 rtt_us
= msk
->rcvq_space
.rtt_us
;
1775 if (rtt_us
&& time
< (rtt_us
>> 3))
1779 mptcp_for_each_subflow(msk
, subflow
) {
1780 const struct tcp_sock
*tp
;
1784 tp
= tcp_sk(mptcp_subflow_tcp_sock(subflow
));
1786 sf_rtt_us
= READ_ONCE(tp
->rcv_rtt_est
.rtt_us
);
1787 sf_advmss
= READ_ONCE(tp
->advmss
);
1789 rtt_us
= max(sf_rtt_us
, rtt_us
);
1790 advmss
= max(sf_advmss
, advmss
);
1793 msk
->rcvq_space
.rtt_us
= rtt_us
;
1794 if (time
< (rtt_us
>> 3) || rtt_us
== 0)
1797 if (msk
->rcvq_space
.copied
<= msk
->rcvq_space
.space
)
1800 if (sock_net(sk
)->ipv4
.sysctl_tcp_moderate_rcvbuf
&&
1801 !(sk
->sk_userlocks
& SOCK_RCVBUF_LOCK
)) {
1805 rcvwin
= ((u64
)msk
->rcvq_space
.copied
<< 1) + 16 * advmss
;
1807 grow
= rcvwin
* (msk
->rcvq_space
.copied
- msk
->rcvq_space
.space
);
1809 do_div(grow
, msk
->rcvq_space
.space
);
1810 rcvwin
+= (grow
<< 1);
1812 rcvmem
= SKB_TRUESIZE(advmss
+ MAX_TCP_HEADER
);
1813 while (tcp_win_from_space(sk
, rcvmem
) < advmss
)
1816 do_div(rcvwin
, advmss
);
1817 rcvbuf
= min_t(u64
, rcvwin
* rcvmem
,
1818 sock_net(sk
)->ipv4
.sysctl_tcp_rmem
[2]);
1820 if (rcvbuf
> sk
->sk_rcvbuf
) {
1823 window_clamp
= tcp_win_from_space(sk
, rcvbuf
);
1824 WRITE_ONCE(sk
->sk_rcvbuf
, rcvbuf
);
1826 /* Make subflows follow along. If we do not do this, we
1827 * get drops at subflow level if skbs can't be moved to
1828 * the mptcp rx queue fast enough (announced rcv_win can
1829 * exceed ssk->sk_rcvbuf).
1831 mptcp_for_each_subflow(msk
, subflow
) {
1835 ssk
= mptcp_subflow_tcp_sock(subflow
);
1836 slow
= lock_sock_fast(ssk
);
1837 WRITE_ONCE(ssk
->sk_rcvbuf
, rcvbuf
);
1838 tcp_sk(ssk
)->window_clamp
= window_clamp
;
1839 tcp_cleanup_rbuf(ssk
, 1);
1840 unlock_sock_fast(ssk
, slow
);
1845 msk
->rcvq_space
.space
= msk
->rcvq_space
.copied
;
1847 msk
->rcvq_space
.copied
= 0;
1848 msk
->rcvq_space
.time
= mstamp
;
1851 static void __mptcp_update_rmem(struct sock
*sk
)
1853 struct mptcp_sock
*msk
= mptcp_sk(sk
);
1855 if (!msk
->rmem_released
)
1858 atomic_sub(msk
->rmem_released
, &sk
->sk_rmem_alloc
);
1859 sk_mem_uncharge(sk
, msk
->rmem_released
);
1860 msk
->rmem_released
= 0;
1863 static void __mptcp_splice_receive_queue(struct sock
*sk
)
1865 struct mptcp_sock
*msk
= mptcp_sk(sk
);
1867 skb_queue_splice_tail_init(&sk
->sk_receive_queue
, &msk
->receive_queue
);
1870 static bool __mptcp_move_skbs(struct mptcp_sock
*msk
, unsigned int rcv
)
1872 struct sock
*sk
= (struct sock
*)msk
;
1873 unsigned int moved
= 0;
1876 __mptcp_flush_join_list(msk
);
1878 struct sock
*ssk
= mptcp_subflow_recv_lookup(msk
);
1881 /* we can have data pending in the subflows only if the msk
1882 * receive buffer was full at subflow_data_ready() time,
1883 * that is an unlikely slow path.
1888 slowpath
= lock_sock_fast(ssk
);
1889 mptcp_data_lock(sk
);
1890 done
= __mptcp_move_skbs_from_subflow(msk
, ssk
, &moved
);
1891 mptcp_data_unlock(sk
);
1893 WRITE_ONCE(msk
->rmem_pending
, min(rcv
, moved
));
1894 tcp_cleanup_rbuf(ssk
, 1);
1895 WRITE_ONCE(msk
->rmem_pending
, 0);
1897 unlock_sock_fast(ssk
, slowpath
);
1900 /* acquire the data lock only if some input data is pending */
1902 if (!RB_EMPTY_ROOT(&msk
->out_of_order_queue
) ||
1903 !skb_queue_empty_lockless(&sk
->sk_receive_queue
)) {
1904 mptcp_data_lock(sk
);
1905 __mptcp_update_rmem(sk
);
1906 ret
|= __mptcp_ofo_queue(msk
);
1907 __mptcp_splice_receive_queue(sk
);
1908 mptcp_data_unlock(sk
);
1911 mptcp_check_data_fin((struct sock
*)msk
);
1912 return !skb_queue_empty(&msk
->receive_queue
);
1915 static int mptcp_recvmsg(struct sock
*sk
, struct msghdr
*msg
, size_t len
,
1916 int nonblock
, int flags
, int *addr_len
)
1918 struct mptcp_sock
*msk
= mptcp_sk(sk
);
1923 if (msg
->msg_flags
& ~(MSG_WAITALL
| MSG_DONTWAIT
))
1926 mptcp_lock_sock(sk
, __mptcp_splice_receive_queue(sk
));
1927 if (unlikely(sk
->sk_state
== TCP_LISTEN
)) {
1932 timeo
= sock_rcvtimeo(sk
, nonblock
);
1934 len
= min_t(size_t, len
, INT_MAX
);
1935 target
= sock_rcvlowat(sk
, flags
& MSG_WAITALL
, len
);
1937 while (copied
< len
) {
1938 int bytes_read
, old_space
;
1940 bytes_read
= __mptcp_recvmsg_mskq(msk
, msg
, len
- copied
);
1941 if (unlikely(bytes_read
< 0)) {
1943 copied
= bytes_read
;
1947 copied
+= bytes_read
;
1949 if (skb_queue_empty(&msk
->receive_queue
) &&
1950 __mptcp_move_skbs(msk
, len
- copied
))
1953 /* be sure to advertise window change */
1954 old_space
= READ_ONCE(msk
->old_wspace
);
1955 if ((tcp_space(sk
) - old_space
) >= old_space
)
1956 mptcp_cleanup_rbuf(msk
);
1958 /* only the master socket status is relevant here. The exit
1959 * conditions mirror closely tcp_recvmsg()
1961 if (copied
>= target
)
1966 sk
->sk_state
== TCP_CLOSE
||
1967 (sk
->sk_shutdown
& RCV_SHUTDOWN
) ||
1969 signal_pending(current
))
1973 copied
= sock_error(sk
);
1977 if (test_and_clear_bit(MPTCP_WORK_EOF
, &msk
->flags
))
1978 mptcp_check_for_eof(msk
);
1980 if (sk
->sk_shutdown
& RCV_SHUTDOWN
) {
1981 /* race breaker: the shutdown could be after the
1982 * previous receive queue check
1984 if (__mptcp_move_skbs(msk
, len
- copied
))
1989 if (sk
->sk_state
== TCP_CLOSE
) {
1999 if (signal_pending(current
)) {
2000 copied
= sock_intr_errno(timeo
);
2005 pr_debug("block timeout %ld", timeo
);
2006 mptcp_wait_data(sk
, &timeo
);
2009 if (skb_queue_empty_lockless(&sk
->sk_receive_queue
) &&
2010 skb_queue_empty(&msk
->receive_queue
)) {
2011 /* entire backlog drained, clear DATA_READY. */
2012 clear_bit(MPTCP_DATA_READY
, &msk
->flags
);
2014 /* .. race-breaker: ssk might have gotten new data
2015 * after last __mptcp_move_skbs() returned false.
2017 if (unlikely(__mptcp_move_skbs(msk
, 0)))
2018 set_bit(MPTCP_DATA_READY
, &msk
->flags
);
2019 } else if (unlikely(!test_bit(MPTCP_DATA_READY
, &msk
->flags
))) {
2020 /* data to read but mptcp_wait_data() cleared DATA_READY */
2021 set_bit(MPTCP_DATA_READY
, &msk
->flags
);
2024 pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d",
2025 msk
, test_bit(MPTCP_DATA_READY
, &msk
->flags
),
2026 skb_queue_empty_lockless(&sk
->sk_receive_queue
), copied
);
2027 mptcp_rcv_space_adjust(msk
, copied
);
2033 static void mptcp_retransmit_handler(struct sock
*sk
)
2035 struct mptcp_sock
*msk
= mptcp_sk(sk
);
2037 set_bit(MPTCP_WORK_RTX
, &msk
->flags
);
2038 mptcp_schedule_work(sk
);
2041 static void mptcp_retransmit_timer(struct timer_list
*t
)
2043 struct inet_connection_sock
*icsk
= from_timer(icsk
, t
,
2044 icsk_retransmit_timer
);
2045 struct sock
*sk
= &icsk
->icsk_inet
.sk
;
2048 if (!sock_owned_by_user(sk
)) {
2049 mptcp_retransmit_handler(sk
);
2051 /* delegate our work to tcp_release_cb() */
2052 if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED
,
2060 static void mptcp_timeout_timer(struct timer_list
*t
)
2062 struct sock
*sk
= from_timer(sk
, t
, sk_timer
);
2064 mptcp_schedule_work(sk
);
2068 /* Find an idle subflow. Return NULL if there is unacked data at tcp
2071 * A backup subflow is returned only if that is the only kind available.
2073 static struct sock
*mptcp_subflow_get_retrans(const struct mptcp_sock
*msk
)
2075 struct mptcp_subflow_context
*subflow
;
2076 struct sock
*backup
= NULL
;
2078 sock_owned_by_me((const struct sock
*)msk
);
2080 if (__mptcp_check_fallback(msk
))
2083 mptcp_for_each_subflow(msk
, subflow
) {
2084 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
2086 if (!mptcp_subflow_active(subflow
))
2089 /* still data outstanding at TCP level? Don't retransmit. */
2090 if (!tcp_write_queue_empty(ssk
)) {
2091 if (inet_csk(ssk
)->icsk_ca_state
>= TCP_CA_Loss
)
2096 if (subflow
->backup
) {
2108 static void mptcp_dispose_initial_subflow(struct mptcp_sock
*msk
)
2111 iput(SOCK_INODE(msk
->subflow
));
2112 msk
->subflow
= NULL
;
2116 /* subflow sockets can be either outgoing (connect) or incoming
2119 * Outgoing subflows use in-kernel sockets.
2120 * Incoming subflows do not have their own 'struct socket' allocated,
2121 * so we need to use tcp_close() after detaching them from the mptcp
2124 void __mptcp_close_ssk(struct sock
*sk
, struct sock
*ssk
,
2125 struct mptcp_subflow_context
*subflow
)
2127 struct mptcp_sock
*msk
= mptcp_sk(sk
);
2129 list_del(&subflow
->node
);
2131 lock_sock_nested(ssk
, SINGLE_DEPTH_NESTING
);
2133 /* if we are invoked by the msk cleanup code, the subflow is
2139 subflow
->disposable
= 1;
2141 /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
2142 * the ssk has been already destroyed, we just need to release the
2143 * reference owned by msk;
2145 if (!inet_csk(ssk
)->icsk_ulp_ops
) {
2146 kfree_rcu(subflow
, rcu
);
2148 /* otherwise tcp will dispose of the ssk and subflow ctx */
2149 __tcp_close(ssk
, 0);
2151 /* close acquired an extra ref */
2158 if (ssk
== msk
->last_snd
)
2159 msk
->last_snd
= NULL
;
2161 if (msk
->subflow
&& ssk
== msk
->subflow
->sk
)
2162 mptcp_dispose_initial_subflow(msk
);
2165 static unsigned int mptcp_sync_mss(struct sock
*sk
, u32 pmtu
)
2170 static void pm_work(struct mptcp_sock
*msk
)
2172 struct mptcp_pm_data
*pm
= &msk
->pm
;
2174 spin_lock_bh(&msk
->pm
.lock
);
2176 pr_debug("msk=%p status=%x", msk
, pm
->status
);
2177 if (pm
->status
& BIT(MPTCP_PM_ADD_ADDR_RECEIVED
)) {
2178 pm
->status
&= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED
);
2179 mptcp_pm_nl_add_addr_received(msk
);
2181 if (pm
->status
& BIT(MPTCP_PM_ADD_ADDR_SEND_ACK
)) {
2182 pm
->status
&= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK
);
2183 mptcp_pm_nl_add_addr_send_ack(msk
);
2185 if (pm
->status
& BIT(MPTCP_PM_RM_ADDR_RECEIVED
)) {
2186 pm
->status
&= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED
);
2187 mptcp_pm_nl_rm_addr_received(msk
);
2189 if (pm
->status
& BIT(MPTCP_PM_ESTABLISHED
)) {
2190 pm
->status
&= ~BIT(MPTCP_PM_ESTABLISHED
);
2191 mptcp_pm_nl_fully_established(msk
);
2193 if (pm
->status
& BIT(MPTCP_PM_SUBFLOW_ESTABLISHED
)) {
2194 pm
->status
&= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED
);
2195 mptcp_pm_nl_subflow_established(msk
);
2198 spin_unlock_bh(&msk
->pm
.lock
);
2201 static void __mptcp_close_subflow(struct mptcp_sock
*msk
)
2203 struct mptcp_subflow_context
*subflow
, *tmp
;
2207 list_for_each_entry_safe(subflow
, tmp
, &msk
->conn_list
, node
) {
2208 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
2210 if (inet_sk_state_load(ssk
) != TCP_CLOSE
)
2213 __mptcp_close_ssk((struct sock
*)msk
, ssk
, subflow
);
2217 static bool mptcp_check_close_timeout(const struct sock
*sk
)
2219 s32 delta
= tcp_jiffies32
- inet_csk(sk
)->icsk_mtup
.probe_timestamp
;
2220 struct mptcp_subflow_context
*subflow
;
2222 if (delta
>= TCP_TIMEWAIT_LEN
)
2225 /* if all subflows are in closed status don't bother with additional
2228 mptcp_for_each_subflow(mptcp_sk(sk
), subflow
) {
2229 if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow
)) !=
2236 static void mptcp_check_fastclose(struct mptcp_sock
*msk
)
2238 struct mptcp_subflow_context
*subflow
, *tmp
;
2239 struct sock
*sk
= &msk
->sk
.icsk_inet
.sk
;
2241 if (likely(!READ_ONCE(msk
->rcv_fastclose
)))
2244 mptcp_token_destroy(msk
);
2246 list_for_each_entry_safe(subflow
, tmp
, &msk
->conn_list
, node
) {
2247 struct sock
*tcp_sk
= mptcp_subflow_tcp_sock(subflow
);
2250 if (tcp_sk
->sk_state
!= TCP_CLOSE
) {
2251 tcp_send_active_reset(tcp_sk
, GFP_ATOMIC
);
2252 tcp_set_state(tcp_sk
, TCP_CLOSE
);
2254 release_sock(tcp_sk
);
2257 inet_sk_state_store(sk
, TCP_CLOSE
);
2258 sk
->sk_shutdown
= SHUTDOWN_MASK
;
2259 smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
2260 set_bit(MPTCP_DATA_READY
, &msk
->flags
);
2261 set_bit(MPTCP_WORK_CLOSE_SUBFLOW
, &msk
->flags
);
2263 mptcp_close_wake_up(sk
);
2266 static void mptcp_worker(struct work_struct
*work
)
2268 struct mptcp_sock
*msk
= container_of(work
, struct mptcp_sock
, work
);
2269 struct sock
*ssk
, *sk
= &msk
->sk
.icsk_inet
.sk
;
2270 struct mptcp_sendmsg_info info
= {};
2271 struct mptcp_data_frag
*dfrag
;
2276 state
= sk
->sk_state
;
2277 if (unlikely(state
== TCP_CLOSE
))
2280 mptcp_check_data_fin_ack(sk
);
2281 __mptcp_flush_join_list(msk
);
2283 mptcp_check_fastclose(msk
);
2285 if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW
, &msk
->flags
))
2286 __mptcp_close_subflow(msk
);
2291 if (test_and_clear_bit(MPTCP_WORK_EOF
, &msk
->flags
))
2292 mptcp_check_for_eof(msk
);
2294 __mptcp_check_send_data_fin(sk
);
2295 mptcp_check_data_fin(sk
);
2297 /* There is no point in keeping around an orphaned sk timedout or
2298 * closed, but we need the msk around to reply to incoming DATA_FIN,
2299 * even if it is orphaned and in FIN_WAIT2 state
2301 if (sock_flag(sk
, SOCK_DEAD
) &&
2302 (mptcp_check_close_timeout(sk
) || sk
->sk_state
== TCP_CLOSE
)) {
2303 inet_sk_state_store(sk
, TCP_CLOSE
);
2304 __mptcp_destroy_sock(sk
);
2308 if (!test_and_clear_bit(MPTCP_WORK_RTX
, &msk
->flags
))
2311 __mptcp_clean_una(sk
);
2312 dfrag
= mptcp_rtx_head(sk
);
2316 ssk
= mptcp_subflow_get_retrans(msk
);
2322 /* limit retransmission to the bytes already sent on some subflows */
2324 info
.limit
= dfrag
->already_sent
;
2325 while (info
.sent
< dfrag
->already_sent
) {
2326 if (!mptcp_alloc_tx_skb(sk
, ssk
))
2329 ret
= mptcp_sendmsg_frag(sk
, ssk
, dfrag
, &info
);
2333 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_RETRANSSEGS
);
2338 tcp_push(ssk
, 0, info
.mss_now
, tcp_sk(ssk
)->nonagle
,
2341 mptcp_set_timeout(sk
, ssk
);
2345 if (!mptcp_timer_pending(sk
))
2346 mptcp_reset_timer(sk
);
2353 static int __mptcp_init_sock(struct sock
*sk
)
2355 struct mptcp_sock
*msk
= mptcp_sk(sk
);
2357 spin_lock_init(&msk
->join_list_lock
);
2359 INIT_LIST_HEAD(&msk
->conn_list
);
2360 INIT_LIST_HEAD(&msk
->join_list
);
2361 INIT_LIST_HEAD(&msk
->rtx_queue
);
2362 INIT_WORK(&msk
->work
, mptcp_worker
);
2363 __skb_queue_head_init(&msk
->receive_queue
);
2364 __skb_queue_head_init(&msk
->skb_tx_cache
);
2365 msk
->out_of_order_queue
= RB_ROOT
;
2366 msk
->first_pending
= NULL
;
2367 msk
->wmem_reserved
= 0;
2368 msk
->rmem_released
= 0;
2369 msk
->tx_pending_data
= 0;
2370 msk
->size_goal_cache
= TCP_BASE_MSS
;
2372 msk
->ack_hint
= NULL
;
2374 inet_csk(sk
)->icsk_sync_mss
= mptcp_sync_mss
;
2376 mptcp_pm_data_init(msk
);
2378 /* re-use the csk retrans timer for MPTCP-level retrans */
2379 timer_setup(&msk
->sk
.icsk_retransmit_timer
, mptcp_retransmit_timer
, 0);
2380 timer_setup(&sk
->sk_timer
, mptcp_timeout_timer
, 0);
2384 static int mptcp_init_sock(struct sock
*sk
)
2386 struct net
*net
= sock_net(sk
);
2389 ret
= __mptcp_init_sock(sk
);
2393 if (!mptcp_is_enabled(net
))
2394 return -ENOPROTOOPT
;
2396 if (unlikely(!net
->mib
.mptcp_statistics
) && !mptcp_mib_alloc(net
))
2399 ret
= __mptcp_socket_create(mptcp_sk(sk
));
2403 sk_sockets_allocated_inc(sk
);
2404 sk
->sk_rcvbuf
= sock_net(sk
)->ipv4
.sysctl_tcp_rmem
[1];
2405 sk
->sk_sndbuf
= sock_net(sk
)->ipv4
.sysctl_tcp_wmem
[1];
2410 static void __mptcp_clear_xmit(struct sock
*sk
)
2412 struct mptcp_sock
*msk
= mptcp_sk(sk
);
2413 struct mptcp_data_frag
*dtmp
, *dfrag
;
2414 struct sk_buff
*skb
;
2416 WRITE_ONCE(msk
->first_pending
, NULL
);
2417 list_for_each_entry_safe(dfrag
, dtmp
, &msk
->rtx_queue
, list
)
2418 dfrag_clear(sk
, dfrag
);
2419 while ((skb
= __skb_dequeue(&msk
->skb_tx_cache
)) != NULL
) {
2420 sk
->sk_forward_alloc
+= skb
->truesize
;
2425 static void mptcp_cancel_work(struct sock
*sk
)
2427 struct mptcp_sock
*msk
= mptcp_sk(sk
);
2429 if (cancel_work_sync(&msk
->work
))
void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
{
        lock_sock(ssk);

        switch (ssk->sk_state) {
        case TCP_LISTEN:
                if (!(how & RCV_SHUTDOWN))
                        break;
                fallthrough;
        case TCP_SYN_SENT:
                tcp_disconnect(ssk, O_NONBLOCK);
                break;
        default:
                if (__mptcp_check_fallback(mptcp_sk(sk))) {
                        pr_debug("Fallback");
                        ssk->sk_shutdown |= how;
                        tcp_shutdown(ssk, how);
                } else {
                        pr_debug("Sending DATA_FIN on subflow %p", ssk);
                        mptcp_set_timeout(sk, ssk);
                        tcp_send_ack(ssk);
                }
                break;
        }

        release_sock(ssk);
}
static const unsigned char new_state[16] = {
        /* current state:     new state:      action:      */
        [0 /* (Invalid) */] = TCP_CLOSE,
        [TCP_ESTABLISHED]   = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
        [TCP_SYN_SENT]      = TCP_CLOSE,
        [TCP_SYN_RECV]      = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
        [TCP_FIN_WAIT1]     = TCP_FIN_WAIT1,
        [TCP_FIN_WAIT2]     = TCP_FIN_WAIT2,
        [TCP_TIME_WAIT]     = TCP_CLOSE,        /* should not happen ! */
        [TCP_CLOSE]         = TCP_CLOSE,
        [TCP_CLOSE_WAIT]    = TCP_LAST_ACK  | TCP_ACTION_FIN,
        [TCP_LAST_ACK]      = TCP_LAST_ACK,
        [TCP_LISTEN]        = TCP_CLOSE,
        [TCP_CLOSING]       = TCP_CLOSING,
        [TCP_NEW_SYN_RECV]  = TCP_CLOSE,        /* should not happen ! */
};
static int mptcp_close_state(struct sock *sk)
{
        int next = (int)new_state[sk->sk_state];
        int ns = next & TCP_STATE_MASK;

        inet_sk_state_store(sk, ns);

        return next & TCP_ACTION_FIN;
}
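/* Note: the new_state[] table above mirrors tcp_close_state(): the
 * TCP_STATE_MASK bits of each entry hold the next msk state, while the
 * TCP_ACTION_FIN flag tells the caller that a DATA_FIN must be sent when
 * leaving the current state.
 */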
static void __mptcp_check_send_data_fin(struct sock *sk)
{
        struct mptcp_subflow_context *subflow;
        struct mptcp_sock *msk = mptcp_sk(sk);

        pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu",
                 msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk),
                 msk->snd_nxt, msk->write_seq);

        /* we still need to enqueue subflows or not really shutting down,
         * skip this
         */
        if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq ||
            mptcp_send_head(sk))
                return;

        WRITE_ONCE(msk->snd_nxt, msk->write_seq);

        /* fallback socket will not get data_fin/ack, can move to the next
         * state now
         */
        if (__mptcp_check_fallback(msk)) {
                if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
                        inet_sk_state_store(sk, TCP_CLOSE);
                        mptcp_close_wake_up(sk);
                } else if (sk->sk_state == TCP_FIN_WAIT1) {
                        inet_sk_state_store(sk, TCP_FIN_WAIT2);
                }
                return;
        }

        __mptcp_flush_join_list(msk);
        mptcp_for_each_subflow(msk, subflow) {
                struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

                mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN);
        }
}
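/* DATA_FIN sequencing: __mptcp_wr_shutdown() below accounts for the DATA_FIN
 * by bumping write_seq, so "snd_nxt + 1 == write_seq" with no pending data is
 * the signal that everything else has already been pushed and the DATA_FIN
 * itself can now be scheduled on every subflow.
 */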
static void __mptcp_wr_shutdown(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d",
                 msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state,
                 !!mptcp_send_head(sk));

        /* will be ignored by fallback sockets */
        WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
        WRITE_ONCE(msk->snd_data_fin_enable, 1);

        __mptcp_check_send_data_fin(sk);
}
static void __mptcp_destroy_sock(struct sock *sk)
{
        struct mptcp_subflow_context *subflow, *tmp;
        struct mptcp_sock *msk = mptcp_sk(sk);
        LIST_HEAD(conn_list);

        pr_debug("msk=%p", msk);

        /* be sure to always acquire the join list lock, to sync vs
         * mptcp_finish_join().
         */
        spin_lock_bh(&msk->join_list_lock);
        list_splice_tail_init(&msk->join_list, &msk->conn_list);
        spin_unlock_bh(&msk->join_list_lock);
        list_splice_init(&msk->conn_list, &conn_list);

        sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
        sk_stop_timer(sk, &sk->sk_timer);

        list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

                __mptcp_close_ssk(sk, ssk, subflow);
        }

        sk->sk_prot->destroy(sk);

        WARN_ON_ONCE(msk->wmem_reserved);
        WARN_ON_ONCE(msk->rmem_released);
        sk_stream_kill_queues(sk);
        xfrm_sk_free_policy(sk);
        sk_refcnt_debug_release(sk);
        mptcp_dispose_initial_subflow(msk);
        sock_put(sk);
}
static void mptcp_close(struct sock *sk, long timeout)
{
        struct mptcp_subflow_context *subflow;
        bool do_cancel_work = false;

        lock_sock(sk);
        sk->sk_shutdown = SHUTDOWN_MASK;

        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
                inet_sk_state_store(sk, TCP_CLOSE);
                goto cleanup;
        }

        if (mptcp_close_state(sk))
                __mptcp_wr_shutdown(sk);

        sk_stream_wait_close(sk, timeout);

cleanup:
        /* orphan all the subflows */
        inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
        list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
                bool slow = lock_sock_fast(ssk);

                sock_orphan(ssk);
                unlock_sock_fast(ssk, slow);
        }
        sock_orphan(sk);

        sock_hold(sk);
        pr_debug("msk=%p state=%d", sk, sk->sk_state);
        if (sk->sk_state == TCP_CLOSE) {
                __mptcp_destroy_sock(sk);
                do_cancel_work = true;
        } else {
                sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN);
        }
        release_sock(sk);
        if (do_cancel_work)
                mptcp_cancel_work(sk);
        sock_put(sk);
}
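/* Close handling sketch: if the msk already reached TCP_CLOSE the socket is
 * torn down immediately; otherwise sk_timer is armed with TCP_TIMEWAIT_LEN so
 * the orphaned msk survives long enough to answer a retransmitted DATA_FIN,
 * and the worker (or the timeout) performs the final __mptcp_destroy_sock().
 */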
static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
        struct ipv6_pinfo *msk6 = inet6_sk(msk);

        msk->sk_v6_daddr = ssk->sk_v6_daddr;
        msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;

        if (msk6 && ssk6) {
                msk6->saddr = ssk6->saddr;
                msk6->flow_label = ssk6->flow_label;
        }
#endif

        inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
        inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
        inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
        inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
        inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
        inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
}
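/* Keeping the msk addressing fields in sync with the subflow lets the generic
 * inet{,6}_getname() paths used by the stream ops below report the expected
 * local/remote addresses on the MPTCP socket itself.
 */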
static int mptcp_disconnect(struct sock *sk, int flags)
{
        struct mptcp_subflow_context *subflow;
        struct mptcp_sock *msk = mptcp_sk(sk);

        __mptcp_flush_join_list(msk);
        mptcp_for_each_subflow(msk, subflow) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

                lock_sock(ssk);
                tcp_disconnect(ssk, flags);
                release_sock(ssk);
        }
        return 0;
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
        unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);

        return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}
#endif
struct sock *mptcp_sk_clone(const struct sock *sk,
                            const struct mptcp_options_received *mp_opt,
                            struct request_sock *req)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
        struct mptcp_sock *msk;
        u64 ack_seq;

        if (!nsk)
                return NULL;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        if (nsk->sk_family == AF_INET6)
                inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
#endif

        __mptcp_init_sock(nsk);

        msk = mptcp_sk(nsk);
        msk->local_key = subflow_req->local_key;
        msk->token = subflow_req->token;
        msk->subflow = NULL;
        WRITE_ONCE(msk->fully_established, false);

        msk->write_seq = subflow_req->idsn + 1;
        msk->snd_nxt = msk->write_seq;
        msk->snd_una = msk->write_seq;
        msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;

        if (mp_opt->mp_capable) {
                msk->can_ack = true;
                msk->remote_key = mp_opt->sndr_key;
                mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
                ack_seq++;
                WRITE_ONCE(msk->ack_seq, ack_seq);
                WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
        }

        sock_reset_flag(nsk, SOCK_RCU_FREE);
        /* will be fully established after successful MPC subflow creation */
        inet_sk_state_store(nsk, TCP_SYN_RECV);

        security_inet_csk_clone(nsk, req);
        bh_unlock_sock(nsk);

        /* keep a single reference */
        __sock_put(nsk);
        return nsk;
}
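/* Initial sequence numbers for the passive socket: the data-level write_seq
 * starts from the locally generated IDSN (subflow_req->idsn + 1), while the
 * expected peer sequence (ack_seq) is derived from the SHA of the peer key
 * carried by the MP_CAPABLE option, matching mptcp_finish_connect() below.
 */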
void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
{
        const struct tcp_sock *tp = tcp_sk(ssk);

        msk->rcvq_space.copied = 0;
        msk->rcvq_space.rtt_us = 0;

        msk->rcvq_space.time = tp->tcp_mstamp;

        /* initial rcv_space offering made to peer */
        msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
                                      TCP_INIT_CWND * tp->advmss);
        if (msk->rcvq_space.space == 0)
                msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;

        WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
}
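/* The receive-space state seeded here deliberately mirrors
 * tcp_init_buffer_space(): start from the initial window offered on the first
 * subflow and let the MPTCP-level receive autotuning refine it as data is
 * copied to user space.
 */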
static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
                                 bool kern)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct socket *listener;
        struct sock *newsk;

        listener = __mptcp_nmpc_socket(msk);
        if (WARN_ON_ONCE(!listener)) {
                *err = -EINVAL;
                return NULL;
        }

        pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
        newsk = inet_csk_accept(listener->sk, flags, err, kern);
        if (!newsk)
                return NULL;

        pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
        if (sk_is_mptcp(newsk)) {
                struct mptcp_subflow_context *subflow;
                struct sock *new_mptcp_sock;

                subflow = mptcp_subflow_ctx(newsk);
                new_mptcp_sock = subflow->conn;

                /* is_mptcp should be false if subflow->conn is missing, see
                 * subflow_syn_recv_sock()
                 */
                if (WARN_ON_ONCE(!new_mptcp_sock)) {
                        tcp_sk(newsk)->is_mptcp = 0;
                        return newsk;
                }

                /* acquire the 2nd reference for the owning socket */
                sock_hold(new_mptcp_sock);
                newsk = new_mptcp_sock;
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
        } else {
                MPTCP_INC_STATS(sock_net(sk),
                                MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
        }

        return newsk;
}
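/* A passive connection that completed the MP_CAPABLE handshake is returned as
 * the owning MPTCP socket (with the extra reference taken above); anything
 * else stays a plain TCP socket and is only accounted as
 * MPCAPABLEPASSIVEFALLBACK.
 */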
void mptcp_destroy_common(struct mptcp_sock *msk)
{
        struct sock *sk = (struct sock *)msk;

        __mptcp_clear_xmit(sk);

        /* move to sk_receive_queue, sk_stream_kill_queues will purge it */
        skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);

        skb_rbtree_purge(&msk->out_of_order_queue);
        mptcp_token_destroy(msk);
        mptcp_pm_free_anno_list(msk);
}

static void mptcp_destroy(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        mptcp_destroy_common(msk);
        sk_sockets_allocated_dec(sk);
}
static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
                                       sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = (struct sock *)msk;
        struct socket *ssock;
        int ret;

        switch (optname) {
        case SO_REUSEPORT:
        case SO_REUSEADDR:
                lock_sock(sk);
                ssock = __mptcp_nmpc_socket(msk);
                if (!ssock) {
                        release_sock(sk);
                        return -EINVAL;
                }

                ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
                if (ret == 0) {
                        if (optname == SO_REUSEPORT)
                                sk->sk_reuseport = ssock->sk->sk_reuseport;
                        else if (optname == SO_REUSEADDR)
                                sk->sk_reuse = ssock->sk->sk_reuse;
                }
                release_sock(sk);
                return ret;
        }

        return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen);
}
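/* SO_REUSEADDR/SO_REUSEPORT are applied to the first subflow (the future
 * listener) and mirrored on the MPTCP socket, so that both sockets see a
 * consistent value for later bind()/listen() conflict checks.
 */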
static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
                               sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = (struct sock *)msk;
        int ret = -EOPNOTSUPP;
        struct socket *ssock;

        switch (optname) {
        case IPV6_V6ONLY:
                lock_sock(sk);
                ssock = __mptcp_nmpc_socket(msk);
                if (!ssock) {
                        release_sock(sk);
                        return -EINVAL;
                }

                ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
                if (ret == 0)
                        sk->sk_ipv6only = ssock->sk->sk_ipv6only;

                release_sock(sk);
                break;
        }

        return ret;
}
static int mptcp_setsockopt(struct sock *sk, int level, int optname,
                            sockptr_t optval, unsigned int optlen)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct sock *ssk;

        pr_debug("msk=%p", msk);

        if (level == SOL_SOCKET)
                return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);

        /* @@ the meaning of setsockopt() when the socket is connected and
         * there are multiple subflows is not yet defined. It is up to the
         * MPTCP-level socket to configure the subflows until the subflow
         * is in TCP fallback, when TCP socket options are passed through
         * to the one remaining subflow.
         */
        lock_sock(sk);
        ssk = __mptcp_tcp_fallback(msk);
        release_sock(sk);
        if (ssk)
                return tcp_setsockopt(ssk, level, optname, optval, optlen);

        if (level == SOL_IPV6)
                return mptcp_setsockopt_v6(msk, optname, optval, optlen);

        return -EOPNOTSUPP;
}
static int mptcp_getsockopt(struct sock *sk, int level, int optname,
                            char __user *optval, int __user *option)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct sock *ssk;

        pr_debug("msk=%p", msk);

        /* @@ the meaning of setsockopt() when the socket is connected and
         * there are multiple subflows is not yet defined. It is up to the
         * MPTCP-level socket to configure the subflows until the subflow
         * is in TCP fallback, when socket options are passed through
         * to the one remaining subflow.
         */
        lock_sock(sk);
        ssk = __mptcp_tcp_fallback(msk);
        release_sock(sk);
        if (ssk)
                return tcp_getsockopt(ssk, level, optname, optval, option);

        return -EOPNOTSUPP;
}
void __mptcp_data_acked(struct sock *sk)
{
        if (!sock_owned_by_user(sk))
                __mptcp_clean_una(sk);
        else
                set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags);

        if (mptcp_pending_data_fin_ack(sk))
                mptcp_schedule_work(sk);
}
void __mptcp_check_push(struct sock *sk, struct sock *ssk)
{
        if (!mptcp_send_head(sk))
                return;

        if (!sock_owned_by_user(sk))
                __mptcp_subflow_push_pending(sk, ssk);
        else
                set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
}
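/* Both helpers above follow the same pattern: do the work right away when the
 * msk lock is not owned by user context, otherwise record a flag and let
 * mptcp_release_cb() below run the deferred action once the lock is released.
 */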
#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)

/* processes deferred events and flush wmem */
static void mptcp_release_cb(struct sock *sk)
{
        unsigned long flags, nflags;

        for (;;) {
                flags = 0;
                if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags))
                        flags |= MPTCP_PUSH_PENDING;
                if (!flags)
                        break;

                /* the following actions acquire the subflow socket lock
                 *
                 * 1) can't be invoked in atomic scope
                 * 2) must avoid ABBA deadlock with msk socket spinlock: the RX
                 *    datapath acquires the msk socket spinlock while helding
                 *    the subflow socket lock
                 */

                spin_unlock_bh(&sk->sk_lock.slock);
                if (flags & MPTCP_PUSH_PENDING)
                        __mptcp_push_pending(sk, 0);

                spin_lock_bh(&sk->sk_lock.slock);
        }

        if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags))
                __mptcp_clean_una(sk);
        if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags))
                __mptcp_error_report(sk);

        /* push_pending may touch wmem_reserved, ensure we do the cleanup
         * before
         */
        __mptcp_update_wmem(sk);
        __mptcp_update_rmem(sk);

        do {
                flags = sk->sk_tsq_flags;
                if (!(flags & MPTCP_DEFERRED_ALL))
                        return;
                nflags = flags & ~MPTCP_DEFERRED_ALL;
        } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);

        sock_release_ownership(sk);

        if (flags & TCPF_WRITE_TIMER_DEFERRED) {
                mptcp_retransmit_handler(sk);
                __sock_put(sk);
        }
}
static int mptcp_hash(struct sock *sk)
{
        /* should never be called,
         * we hash the TCP subflows not the master socket
         */
        WARN_ON_ONCE(1);
        return 0;
}

static void mptcp_unhash(struct sock *sk)
{
        /* called from sk_common_release(), but nothing to do here */
}
static int mptcp_get_port(struct sock *sk, unsigned short snum)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct socket *ssock;

        ssock = __mptcp_nmpc_socket(msk);
        pr_debug("msk=%p, subflow=%p", msk, ssock);
        if (WARN_ON_ONCE(!ssock))
                return -EINVAL;

        return inet_csk_get_port(ssock->sk, snum);
}
void mptcp_finish_connect(struct sock *ssk)
{
        struct mptcp_subflow_context *subflow;
        struct mptcp_sock *msk;
        struct sock *sk;
        u64 ack_seq;

        subflow = mptcp_subflow_ctx(ssk);
        sk = subflow->conn;
        msk = mptcp_sk(sk);

        pr_debug("msk=%p, token=%u", sk, subflow->token);

        mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
        ack_seq++;
        subflow->map_seq = ack_seq;
        subflow->map_subflow_seq = 1;

        /* the socket is not connected yet, no msk/subflow ops can access/race
         * accessing the field below
         */
        WRITE_ONCE(msk->remote_key, subflow->remote_key);
        WRITE_ONCE(msk->local_key, subflow->local_key);
        WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
        WRITE_ONCE(msk->snd_nxt, msk->write_seq);
        WRITE_ONCE(msk->ack_seq, ack_seq);
        WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
        WRITE_ONCE(msk->can_ack, 1);
        WRITE_ONCE(msk->snd_una, msk->write_seq);

        mptcp_pm_new_connection(msk, 0);

        mptcp_rcv_space_init(msk, ssk);
}
void mptcp_sock_graft(struct sock *sk, struct socket *parent)
{
        write_lock_bh(&sk->sk_callback_lock);
        rcu_assign_pointer(sk->sk_wq, &parent->wq);
        sk_set_socket(sk, parent);
        sk->sk_uid = SOCK_INODE(parent)->i_uid;
        write_unlock_bh(&sk->sk_callback_lock);
}
bool mptcp_finish_join(struct sock *ssk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);
        struct sock *parent = (void *)msk;
        struct socket *parent_sock;
        bool ret;

        pr_debug("msk=%p, subflow=%p", msk, subflow);

        /* mptcp socket already closing? */
        if (!mptcp_is_fully_established(parent))
                return false;

        if (!msk->pm.server_side)
                return true;

        if (!mptcp_pm_allow_new_subflow(msk))
                return false;

        /* active connections are already on conn_list, and we can't acquire
         * msk lock here.
         * use the join list lock as synchronization point and double-check
         * msk status to avoid racing with __mptcp_destroy_sock()
         */
        spin_lock_bh(&msk->join_list_lock);
        ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
        if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) {
                list_add_tail(&subflow->node, &msk->join_list);
                sock_hold(ssk);
        }
        spin_unlock_bh(&msk->join_list_lock);
        if (!ret)
                return false;

        /* attach to msk socket only after we are sure he will deal with us
         * at close time
         */
        parent_sock = READ_ONCE(parent->sk_socket);
        if (parent_sock && !ssk->sk_socket)
                mptcp_sock_graft(ssk, parent_sock);
        subflow->map_seq = READ_ONCE(msk->ack_seq);
        return true;
}
static void mptcp_shutdown(struct sock *sk, int how)
{
        pr_debug("sk=%p, how=%d", sk, how);

        if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk))
                __mptcp_wr_shutdown(sk);
}
static struct proto mptcp_prot = {
        .name           = "MPTCP",
        .owner          = THIS_MODULE,
        .init           = mptcp_init_sock,
        .disconnect     = mptcp_disconnect,
        .close          = mptcp_close,
        .accept         = mptcp_accept,
        .setsockopt     = mptcp_setsockopt,
        .getsockopt     = mptcp_getsockopt,
        .shutdown       = mptcp_shutdown,
        .destroy        = mptcp_destroy,
        .sendmsg        = mptcp_sendmsg,
        .recvmsg        = mptcp_recvmsg,
        .release_cb     = mptcp_release_cb,
        .hash           = mptcp_hash,
        .unhash         = mptcp_unhash,
        .get_port       = mptcp_get_port,
        .sockets_allocated      = &mptcp_sockets_allocated,
        .memory_allocated       = &tcp_memory_allocated,
        .memory_pressure        = &tcp_memory_pressure,
        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
        .sysctl_mem     = sysctl_tcp_mem,
        .obj_size       = sizeof(struct mptcp_sock),
        .slab_flags     = SLAB_TYPESAFE_BY_RCU,
        .no_autobind    = true,
};
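/* Note that mptcp_prot shares TCP's memory accounting: the allocated/pressure
 * counters and the tcp_mem/tcp_rmem/tcp_wmem sysctls all come from the TCP
 * stack, only the per-protocol socket counter is MPTCP specific.
 */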
static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct mptcp_sock *msk = mptcp_sk(sock->sk);
        struct socket *ssock;
        int err;

        lock_sock(sock->sk);
        ssock = __mptcp_nmpc_socket(msk);
        if (!ssock) {
                err = -EINVAL;
                goto unlock;
        }

        err = ssock->ops->bind(ssock, uaddr, addr_len);
        if (!err)
                mptcp_copy_inaddrs(sock->sk, ssock->sk);

unlock:
        release_sock(sock->sk);
        return err;
}
static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
                                         struct mptcp_subflow_context *subflow)
{
        subflow->request_mptcp = 0;
        __mptcp_do_fallback(msk);
}
static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                                int addr_len, int flags)
{
        struct mptcp_sock *msk = mptcp_sk(sock->sk);
        struct mptcp_subflow_context *subflow;
        struct socket *ssock;
        int err;

        lock_sock(sock->sk);
        if (sock->state != SS_UNCONNECTED && msk->subflow) {
                /* pending connection or invalid state, let existing subflow
                 * cope with that
                 */
                ssock = msk->subflow;
                goto do_connect;
        }

        ssock = __mptcp_nmpc_socket(msk);
        if (!ssock) {
                err = -EINVAL;
                goto unlock;
        }

        mptcp_token_destroy(msk);
        inet_sk_state_store(sock->sk, TCP_SYN_SENT);
        subflow = mptcp_subflow_ctx(ssock->sk);
#ifdef CONFIG_TCP_MD5SIG
        /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
         * TCP option space.
         */
        if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
                mptcp_subflow_early_fallback(msk, subflow);
#endif
        if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk))
                mptcp_subflow_early_fallback(msk, subflow);

do_connect:
        err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
        sock->state = ssock->state;

        /* on successful connect, the msk state will be moved to established by
         * subflow_finish_connect()
         */
        if (!err || err == -EINPROGRESS)
                mptcp_copy_inaddrs(sock->sk, ssock->sk);
        else
                inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));

unlock:
        release_sock(sock->sk);
        return err;
}
static int mptcp_listen(struct socket *sock, int backlog)
{
        struct mptcp_sock *msk = mptcp_sk(sock->sk);
        struct socket *ssock;
        int err;

        pr_debug("msk=%p", msk);

        lock_sock(sock->sk);
        ssock = __mptcp_nmpc_socket(msk);
        if (!ssock) {
                err = -EINVAL;
                goto unlock;
        }

        mptcp_token_destroy(msk);
        inet_sk_state_store(sock->sk, TCP_LISTEN);
        sock_set_flag(sock->sk, SOCK_RCU_FREE);

        err = ssock->ops->listen(ssock, backlog);
        inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
        if (!err)
                mptcp_copy_inaddrs(sock->sk, ssock->sk);

unlock:
        release_sock(sock->sk);
        return err;
}
static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
                               int flags, bool kern)
{
        struct mptcp_sock *msk = mptcp_sk(sock->sk);
        struct socket *ssock;
        int err;

        pr_debug("msk=%p", msk);

        lock_sock(sock->sk);
        if (sock->sk->sk_state != TCP_LISTEN)
                goto unlock_fail;

        ssock = __mptcp_nmpc_socket(msk);
        if (!ssock)
                goto unlock_fail;

        clear_bit(MPTCP_DATA_READY, &msk->flags);
        sock_hold(ssock->sk);
        release_sock(sock->sk);

        err = ssock->ops->accept(sock, newsock, flags, kern);
        if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
                struct mptcp_sock *msk = mptcp_sk(newsock->sk);
                struct mptcp_subflow_context *subflow;
                struct sock *newsk = newsock->sk;
                bool slowpath;

                slowpath = lock_sock_fast(newsk);

                /* PM/worker can now acquire the first subflow socket
                 * lock without racing with listener queue cleanup,
                 * we can notify it, if needed.
                 */
                subflow = mptcp_subflow_ctx(msk->first);
                list_add(&subflow->node, &msk->conn_list);
                sock_hold(msk->first);
                if (mptcp_is_fully_established(newsk))
                        mptcp_pm_fully_established(msk);

                mptcp_copy_inaddrs(newsk, msk->first);
                mptcp_rcv_space_init(msk, msk->first);

                /* set ssk->sk_socket of accept()ed flows to mptcp socket.
                 * This is needed so NOSPACE flag can be set from tcp stack.
                 */
                __mptcp_flush_join_list(msk);
                mptcp_for_each_subflow(msk, subflow) {
                        struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

                        if (!ssk->sk_socket)
                                mptcp_sock_graft(ssk, newsock);
                }
                unlock_sock_fast(newsk, slowpath);
        }

        if (inet_csk_listen_poll(ssock->sk))
                set_bit(MPTCP_DATA_READY, &msk->flags);
        sock_put(ssock->sk);
        return err;

unlock_fail:
        release_sock(sock->sk);
        return -EINVAL;
}
static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
{
        return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM :
               0;
}
static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
{
        struct sock *sk = (struct sock *)msk;

        if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
                return EPOLLOUT | EPOLLWRNORM;

        if (sk_stream_is_writeable(sk))
                return EPOLLOUT | EPOLLWRNORM;

        set_bit(MPTCP_NOSPACE, &msk->flags);
        smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
        if (sk_stream_is_writeable(sk))
                return EPOLLOUT | EPOLLWRNORM;

        return 0;
}
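/* The MPTCP_NOSPACE bit plus the barrier above pair with the subflow
 * write_space callback: set the bit first, then re-check writeability, so a
 * wakeup racing with this poll cannot be lost.
 */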
static __poll_t mptcp_poll(struct file *file, struct socket *sock,
                           struct poll_table_struct *wait)
{
        struct sock *sk = sock->sk;
        struct mptcp_sock *msk;
        __poll_t mask = 0;
        int state;

        msk = mptcp_sk(sk);
        sock_poll_wait(file, sock, wait);

        state = inet_sk_state_load(sk);
        pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
        if (state == TCP_LISTEN)
                return mptcp_check_readable(msk);

        if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
                mask |= mptcp_check_readable(msk);
                mask |= mptcp_check_writeable(msk);
        }
        if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
                mask |= EPOLLHUP;
        if (sk->sk_shutdown & RCV_SHUTDOWN)
                mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

        /* This barrier is coupled with smp_wmb() in tcp_reset() */
        smp_rmb();
        if (sk->sk_err)
                mask |= EPOLLERR;

        return mask;
}
static int mptcp_release(struct socket *sock)
{
        struct mptcp_subflow_context *subflow;
        struct sock *sk = sock->sk;
        struct mptcp_sock *msk;

        if (!sk)
                return 0;

        lock_sock(sk);

        msk = mptcp_sk(sk);

        mptcp_for_each_subflow(msk, subflow) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

                ip_mc_drop_socket(ssk);
        }

        release_sock(sk);

        return inet_release(sock);
}
static const struct proto_ops mptcp_stream_ops = {
        .family            = PF_INET,
        .owner             = THIS_MODULE,
        .release           = mptcp_release,
        .bind              = mptcp_bind,
        .connect           = mptcp_stream_connect,
        .socketpair        = sock_no_socketpair,
        .accept            = mptcp_stream_accept,
        .getname           = inet_getname,
        .poll              = mptcp_poll,
        .ioctl             = inet_ioctl,
        .gettstamp         = sock_gettstamp,
        .listen            = mptcp_listen,
        .shutdown          = inet_shutdown,
        .setsockopt        = sock_common_setsockopt,
        .getsockopt        = sock_common_getsockopt,
        .sendmsg           = inet_sendmsg,
        .recvmsg           = inet_recvmsg,
        .mmap              = sock_no_mmap,
        .sendpage          = inet_sendpage,
};
static struct inet_protosw mptcp_protosw = {
        .type           = SOCK_STREAM,
        .protocol       = IPPROTO_MPTCP,
        .prot           = &mptcp_prot,
        .ops            = &mptcp_stream_ops,
        .flags          = INET_PROTOSW_ICSK,
};
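/* Userspace reaches this protosw by asking for the MPTCP protocol explicitly;
 * a minimal sketch (error handling omitted, "addr" is a placeholder):
 *
 *      int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
 *      connect(fd, (struct sockaddr *)&addr, sizeof(addr));
 *
 * If the peer does not support MPTCP the connection transparently falls back
 * to regular TCP.
 */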
void __init mptcp_proto_init(void)
{
        mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;

        if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
                panic("Failed to allocate MPTCP pcpu counter\n");

        mptcp_subflow_init();

        if (proto_register(&mptcp_prot, 1) != 0)
                panic("Failed to register MPTCP proto.\n");

        inet_register_protosw(&mptcp_protosw);

        BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static int mptcp6_release(struct socket *sock)
{
        struct mptcp_subflow_context *subflow;
        struct mptcp_sock *msk;
        struct sock *sk = sock->sk;

        if (!sk)
                return 0;

        lock_sock(sk);

        msk = mptcp_sk(sk);

        mptcp_for_each_subflow(msk, subflow) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

                ip_mc_drop_socket(ssk);
                ipv6_sock_mc_close(ssk);
                ipv6_sock_ac_close(ssk);
        }

        release_sock(sk);
        return inet6_release(sock);
}
static const struct proto_ops mptcp_v6_stream_ops = {
        .family            = PF_INET6,
        .owner             = THIS_MODULE,
        .release           = mptcp6_release,
        .bind              = mptcp_bind,
        .connect           = mptcp_stream_connect,
        .socketpair        = sock_no_socketpair,
        .accept            = mptcp_stream_accept,
        .getname           = inet6_getname,
        .poll              = mptcp_poll,
        .ioctl             = inet6_ioctl,
        .gettstamp         = sock_gettstamp,
        .listen            = mptcp_listen,
        .shutdown          = inet_shutdown,
        .setsockopt        = sock_common_setsockopt,
        .getsockopt        = sock_common_getsockopt,
        .sendmsg           = inet6_sendmsg,
        .recvmsg           = inet6_recvmsg,
        .mmap              = sock_no_mmap,
        .sendpage          = inet_sendpage,
#ifdef CONFIG_COMPAT
        .compat_ioctl      = inet6_compat_ioctl,
#endif
};
static struct proto mptcp_v6_prot;

static void mptcp_v6_destroy(struct sock *sk)
{
        mptcp_destroy(sk);
        inet6_destroy_sock(sk);
}
static struct inet_protosw mptcp_v6_protosw = {
        .type           = SOCK_STREAM,
        .protocol       = IPPROTO_MPTCP,
        .prot           = &mptcp_v6_prot,
        .ops            = &mptcp_v6_stream_ops,
        .flags          = INET_PROTOSW_ICSK,
};
int __init mptcp_proto_v6_init(void)
{
        int err;

        mptcp_v6_prot = mptcp_prot;
        strcpy(mptcp_v6_prot.name, "MPTCPv6");
        mptcp_v6_prot.slab = NULL;
        mptcp_v6_prot.destroy = mptcp_v6_destroy;
        mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);

        err = proto_register(&mptcp_v6_prot, 1);
        if (err)
                return err;

        err = inet6_register_protosw(&mptcp_v6_protosw);
        if (err)
                proto_unregister(&mptcp_v6_prot);

        return err;
}
#endif