1 // SPDX-License-Identifier: GPL-2.0
4 * Copyright (c) 2017 - 2019, Intel Corporation.
7 #define pr_fmt(fmt) "MPTCP: " fmt
9 #include <linux/kernel.h>
10 #include <linux/module.h>
11 #include <linux/netdevice.h>
12 #include <crypto/algapi.h>
13 #include <crypto/sha2.h>
15 #include <net/inet_common.h>
16 #include <net/inet_hashtables.h>
17 #include <net/protocol.h>
19 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
20 #include <net/ip6_route.h>
21 #include <net/transp_v6.h>
23 #include <net/mptcp.h>
24 #include <uapi/linux/mptcp.h>
28 static void mptcp_subflow_ops_undo_override(struct sock
*ssk
);
30 static void SUBFLOW_REQ_INC_STATS(struct request_sock
*req
,
31 enum linux_mptcp_mib_field field
)
33 MPTCP_INC_STATS(sock_net(req_to_sk(req
)), field
);
36 static void subflow_req_destructor(struct request_sock
*req
)
38 struct mptcp_subflow_request_sock
*subflow_req
= mptcp_subflow_rsk(req
);
40 pr_debug("subflow_req=%p", subflow_req
);
43 sock_put((struct sock
*)subflow_req
->msk
);
45 mptcp_token_destroy_request(req
);
46 tcp_request_sock_ops
.destructor(req
);
49 static void subflow_generate_hmac(u64 key1
, u64 key2
, u32 nonce1
, u32 nonce2
,
54 put_unaligned_be32(nonce1
, &msg
[0]);
55 put_unaligned_be32(nonce2
, &msg
[4]);
57 mptcp_crypto_hmac_sha(key1
, key2
, msg
, 8, hmac
);
60 static bool mptcp_can_accept_new_subflow(const struct mptcp_sock
*msk
)
62 return mptcp_is_fully_established((void *)msk
) &&
63 READ_ONCE(msk
->pm
.accept_subflow
);
66 /* validate received token and create truncated hmac and nonce for SYN-ACK */
67 static void subflow_req_create_thmac(struct mptcp_subflow_request_sock
*subflow_req
)
69 struct mptcp_sock
*msk
= subflow_req
->msk
;
70 u8 hmac
[SHA256_DIGEST_SIZE
];
72 get_random_bytes(&subflow_req
->local_nonce
, sizeof(u32
));
74 subflow_generate_hmac(msk
->local_key
, msk
->remote_key
,
75 subflow_req
->local_nonce
,
76 subflow_req
->remote_nonce
, hmac
);
78 subflow_req
->thmac
= get_unaligned_be64(hmac
);
81 static struct mptcp_sock
*subflow_token_join_request(struct request_sock
*req
)
83 struct mptcp_subflow_request_sock
*subflow_req
= mptcp_subflow_rsk(req
);
84 struct mptcp_sock
*msk
;
87 msk
= mptcp_token_get_sock(subflow_req
->token
);
89 SUBFLOW_REQ_INC_STATS(req
, MPTCP_MIB_JOINNOTOKEN
);
93 local_id
= mptcp_pm_get_local_id(msk
, (struct sock_common
*)req
);
95 sock_put((struct sock
*)msk
);
98 subflow_req
->local_id
= local_id
;
103 static void subflow_init_req(struct request_sock
*req
, const struct sock
*sk_listener
)
105 struct mptcp_subflow_request_sock
*subflow_req
= mptcp_subflow_rsk(req
);
107 subflow_req
->mp_capable
= 0;
108 subflow_req
->mp_join
= 0;
109 subflow_req
->msk
= NULL
;
110 mptcp_token_init_request(req
);
113 static bool subflow_use_different_sport(struct mptcp_sock
*msk
, const struct sock
*sk
)
115 return inet_sk(sk
)->inet_sport
!= inet_sk((struct sock
*)msk
)->inet_sport
;
118 /* Init mptcp request socket.
120 * Returns an error code if a JOIN has failed and a TCP reset
123 static int subflow_check_req(struct request_sock
*req
,
124 const struct sock
*sk_listener
,
127 struct mptcp_subflow_context
*listener
= mptcp_subflow_ctx(sk_listener
);
128 struct mptcp_subflow_request_sock
*subflow_req
= mptcp_subflow_rsk(req
);
129 struct mptcp_options_received mp_opt
;
131 pr_debug("subflow_req=%p, listener=%p", subflow_req
, listener
);
133 #ifdef CONFIG_TCP_MD5SIG
134 /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
137 if (rcu_access_pointer(tcp_sk(sk_listener
)->md5sig_info
))
141 mptcp_get_options(skb
, &mp_opt
);
143 if (mp_opt
.mp_capable
) {
144 SUBFLOW_REQ_INC_STATS(req
, MPTCP_MIB_MPCAPABLEPASSIVE
);
148 } else if (mp_opt
.mp_join
) {
149 SUBFLOW_REQ_INC_STATS(req
, MPTCP_MIB_JOINSYNRX
);
152 if (mp_opt
.mp_capable
&& listener
->request_mptcp
) {
153 int err
, retries
= 4;
155 subflow_req
->ssn_offset
= TCP_SKB_CB(skb
)->seq
;
158 get_random_bytes(&subflow_req
->local_key
, sizeof(subflow_req
->local_key
));
159 } while (subflow_req
->local_key
== 0);
161 if (unlikely(req
->syncookie
)) {
162 mptcp_crypto_key_sha(subflow_req
->local_key
,
165 if (mptcp_token_exists(subflow_req
->token
)) {
169 subflow_req
->mp_capable
= 1;
174 err
= mptcp_token_new_request(req
);
176 subflow_req
->mp_capable
= 1;
177 else if (retries
-- > 0)
180 } else if (mp_opt
.mp_join
&& listener
->request_mptcp
) {
181 subflow_req
->ssn_offset
= TCP_SKB_CB(skb
)->seq
;
182 subflow_req
->mp_join
= 1;
183 subflow_req
->backup
= mp_opt
.backup
;
184 subflow_req
->remote_id
= mp_opt
.join_id
;
185 subflow_req
->token
= mp_opt
.token
;
186 subflow_req
->remote_nonce
= mp_opt
.nonce
;
187 subflow_req
->msk
= subflow_token_join_request(req
);
189 /* Can't fall back to TCP in this case. */
190 if (!subflow_req
->msk
)
193 if (subflow_use_different_sport(subflow_req
->msk
, sk_listener
)) {
194 pr_debug("syn inet_sport=%d %d",
195 ntohs(inet_sk(sk_listener
)->inet_sport
),
196 ntohs(inet_sk((struct sock
*)subflow_req
->msk
)->inet_sport
));
197 if (!mptcp_pm_sport_in_anno_list(subflow_req
->msk
, sk_listener
)) {
198 sock_put((struct sock
*)subflow_req
->msk
);
199 mptcp_token_destroy_request(req
);
200 tcp_request_sock_ops
.destructor(req
);
201 subflow_req
->msk
= NULL
;
202 subflow_req
->mp_join
= 0;
203 SUBFLOW_REQ_INC_STATS(req
, MPTCP_MIB_MISMATCHPORTSYNRX
);
206 SUBFLOW_REQ_INC_STATS(req
, MPTCP_MIB_JOINPORTSYNRX
);
209 subflow_req_create_thmac(subflow_req
);
211 if (unlikely(req
->syncookie
)) {
212 if (mptcp_can_accept_new_subflow(subflow_req
->msk
))
213 subflow_init_req_cookie_join_save(subflow_req
, skb
);
216 pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req
->token
,
217 subflow_req
->remote_nonce
, subflow_req
->msk
);
223 int mptcp_subflow_init_cookie_req(struct request_sock
*req
,
224 const struct sock
*sk_listener
,
227 struct mptcp_subflow_context
*listener
= mptcp_subflow_ctx(sk_listener
);
228 struct mptcp_subflow_request_sock
*subflow_req
= mptcp_subflow_rsk(req
);
229 struct mptcp_options_received mp_opt
;
232 subflow_init_req(req
, sk_listener
);
233 mptcp_get_options(skb
, &mp_opt
);
235 if (mp_opt
.mp_capable
&& mp_opt
.mp_join
)
238 if (mp_opt
.mp_capable
&& listener
->request_mptcp
) {
239 if (mp_opt
.sndr_key
== 0)
242 subflow_req
->local_key
= mp_opt
.rcvr_key
;
243 err
= mptcp_token_new_request(req
);
247 subflow_req
->mp_capable
= 1;
248 subflow_req
->ssn_offset
= TCP_SKB_CB(skb
)->seq
- 1;
249 } else if (mp_opt
.mp_join
&& listener
->request_mptcp
) {
250 if (!mptcp_token_join_cookie_init_state(subflow_req
, skb
))
253 if (mptcp_can_accept_new_subflow(subflow_req
->msk
))
254 subflow_req
->mp_join
= 1;
256 subflow_req
->ssn_offset
= TCP_SKB_CB(skb
)->seq
- 1;
261 EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req
);
263 static struct dst_entry
*subflow_v4_route_req(const struct sock
*sk
,
266 struct request_sock
*req
)
268 struct dst_entry
*dst
;
271 tcp_rsk(req
)->is_mptcp
= 1;
272 subflow_init_req(req
, sk
);
274 dst
= tcp_request_sock_ipv4_ops
.route_req(sk
, skb
, fl
, req
);
278 err
= subflow_check_req(req
, sk
, skb
);
284 tcp_request_sock_ops
.send_reset(sk
, skb
);
288 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
289 static struct dst_entry
*subflow_v6_route_req(const struct sock
*sk
,
292 struct request_sock
*req
)
294 struct dst_entry
*dst
;
297 tcp_rsk(req
)->is_mptcp
= 1;
298 subflow_init_req(req
, sk
);
300 dst
= tcp_request_sock_ipv6_ops
.route_req(sk
, skb
, fl
, req
);
304 err
= subflow_check_req(req
, sk
, skb
);
310 tcp6_request_sock_ops
.send_reset(sk
, skb
);
315 /* validate received truncated hmac and create hmac for third ACK */
316 static bool subflow_thmac_valid(struct mptcp_subflow_context
*subflow
)
318 u8 hmac
[SHA256_DIGEST_SIZE
];
321 subflow_generate_hmac(subflow
->remote_key
, subflow
->local_key
,
322 subflow
->remote_nonce
, subflow
->local_nonce
,
325 thmac
= get_unaligned_be64(hmac
);
326 pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
327 subflow
, subflow
->token
,
328 (unsigned long long)thmac
,
329 (unsigned long long)subflow
->thmac
);
331 return thmac
== subflow
->thmac
;
334 void mptcp_subflow_reset(struct sock
*ssk
)
336 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(ssk
);
337 struct sock
*sk
= subflow
->conn
;
339 /* must hold: tcp_done() could drop last reference on parent */
342 tcp_set_state(ssk
, TCP_CLOSE
);
343 tcp_send_active_reset(ssk
, GFP_ATOMIC
);
345 if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW
, &mptcp_sk(sk
)->flags
) &&
346 schedule_work(&mptcp_sk(sk
)->work
))
347 return; /* worker will put sk for us */
352 static bool subflow_use_different_dport(struct mptcp_sock
*msk
, const struct sock
*sk
)
354 return inet_sk(sk
)->inet_dport
!= inet_sk((struct sock
*)msk
)->inet_dport
;
357 static void subflow_finish_connect(struct sock
*sk
, const struct sk_buff
*skb
)
359 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(sk
);
360 struct mptcp_options_received mp_opt
;
361 struct sock
*parent
= subflow
->conn
;
363 subflow
->icsk_af_ops
->sk_rx_dst_set(sk
, skb
);
365 if (inet_sk_state_load(parent
) == TCP_SYN_SENT
) {
366 inet_sk_state_store(parent
, TCP_ESTABLISHED
);
367 parent
->sk_state_change(parent
);
370 /* be sure no special action on any packet other than syn-ack */
371 if (subflow
->conn_finished
)
374 mptcp_propagate_sndbuf(parent
, sk
);
375 subflow
->rel_write_seq
= 1;
376 subflow
->conn_finished
= 1;
377 subflow
->ssn_offset
= TCP_SKB_CB(skb
)->seq
;
378 pr_debug("subflow=%p synack seq=%x", subflow
, subflow
->ssn_offset
);
380 mptcp_get_options(skb
, &mp_opt
);
381 if (subflow
->request_mptcp
) {
382 if (!mp_opt
.mp_capable
) {
383 MPTCP_INC_STATS(sock_net(sk
),
384 MPTCP_MIB_MPCAPABLEACTIVEFALLBACK
);
385 mptcp_do_fallback(sk
);
386 pr_fallback(mptcp_sk(subflow
->conn
));
390 subflow
->mp_capable
= 1;
391 subflow
->can_ack
= 1;
392 subflow
->remote_key
= mp_opt
.sndr_key
;
393 pr_debug("subflow=%p, remote_key=%llu", subflow
,
394 subflow
->remote_key
);
395 mptcp_finish_connect(sk
);
396 } else if (subflow
->request_join
) {
397 u8 hmac
[SHA256_DIGEST_SIZE
];
402 subflow
->thmac
= mp_opt
.thmac
;
403 subflow
->remote_nonce
= mp_opt
.nonce
;
404 pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow
,
405 subflow
->thmac
, subflow
->remote_nonce
);
407 if (!subflow_thmac_valid(subflow
)) {
408 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_JOINACKMAC
);
412 subflow_generate_hmac(subflow
->local_key
, subflow
->remote_key
,
413 subflow
->local_nonce
,
414 subflow
->remote_nonce
,
416 memcpy(subflow
->hmac
, hmac
, MPTCPOPT_HMAC_LEN
);
418 if (!mptcp_finish_join(sk
))
421 subflow
->mp_join
= 1;
422 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_JOINSYNACKRX
);
424 if (subflow_use_different_dport(mptcp_sk(parent
), sk
)) {
425 pr_debug("synack inet_dport=%d %d",
426 ntohs(inet_sk(sk
)->inet_dport
),
427 ntohs(inet_sk(parent
)->inet_dport
));
428 MPTCP_INC_STATS(sock_net(sk
), MPTCP_MIB_JOINPORTSYNACKRX
);
430 } else if (mptcp_check_fallback(sk
)) {
432 mptcp_rcv_space_init(mptcp_sk(parent
), sk
);
437 mptcp_subflow_reset(sk
);
440 struct request_sock_ops mptcp_subflow_request_sock_ops
;
441 EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops
);
442 static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops
;
444 static int subflow_v4_conn_request(struct sock
*sk
, struct sk_buff
*skb
)
446 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(sk
);
448 pr_debug("subflow=%p", subflow
);
450 /* Never answer to SYNs sent to broadcast or multicast */
451 if (skb_rtable(skb
)->rt_flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
))
454 return tcp_conn_request(&mptcp_subflow_request_sock_ops
,
455 &subflow_request_sock_ipv4_ops
,
462 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
463 static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops
;
464 static struct inet_connection_sock_af_ops subflow_v6_specific
;
465 static struct inet_connection_sock_af_ops subflow_v6m_specific
;
466 static struct proto tcpv6_prot_override
;
468 static int subflow_v6_conn_request(struct sock
*sk
, struct sk_buff
*skb
)
470 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(sk
);
472 pr_debug("subflow=%p", subflow
);
474 if (skb
->protocol
== htons(ETH_P_IP
))
475 return subflow_v4_conn_request(sk
, skb
);
477 if (!ipv6_unicast_destination(skb
))
480 return tcp_conn_request(&mptcp_subflow_request_sock_ops
,
481 &subflow_request_sock_ipv6_ops
, sk
, skb
);
485 return 0; /* don't send reset */
489 /* validate hmac received in third ACK */
490 static bool subflow_hmac_valid(const struct request_sock
*req
,
491 const struct mptcp_options_received
*mp_opt
)
493 const struct mptcp_subflow_request_sock
*subflow_req
;
494 u8 hmac
[SHA256_DIGEST_SIZE
];
495 struct mptcp_sock
*msk
;
497 subflow_req
= mptcp_subflow_rsk(req
);
498 msk
= subflow_req
->msk
;
502 subflow_generate_hmac(msk
->remote_key
, msk
->local_key
,
503 subflow_req
->remote_nonce
,
504 subflow_req
->local_nonce
, hmac
);
506 return !crypto_memneq(hmac
, mp_opt
->hmac
, MPTCPOPT_HMAC_LEN
);
509 static void mptcp_sock_destruct(struct sock
*sk
)
511 /* if new mptcp socket isn't accepted, it is free'd
512 * from the tcp listener sockets request queue, linked
513 * from req->sk. The tcp socket is released.
514 * This calls the ULP release function which will
515 * also remove the mptcp socket, via
516 * sock_put(ctx->conn).
518 * Problem is that the mptcp socket will be in
519 * ESTABLISHED state and will not have the SOCK_DEAD flag.
520 * Both result in warnings from inet_sock_destruct.
523 if (sk
->sk_state
== TCP_ESTABLISHED
) {
524 sk
->sk_state
= TCP_CLOSE
;
525 WARN_ON_ONCE(sk
->sk_socket
);
529 mptcp_destroy_common(mptcp_sk(sk
));
530 inet_sock_destruct(sk
);
533 static void mptcp_force_close(struct sock
*sk
)
535 inet_sk_state_store(sk
, TCP_CLOSE
);
536 sk_common_release(sk
);
539 static void subflow_ulp_fallback(struct sock
*sk
,
540 struct mptcp_subflow_context
*old_ctx
)
542 struct inet_connection_sock
*icsk
= inet_csk(sk
);
544 mptcp_subflow_tcp_fallback(sk
, old_ctx
);
545 icsk
->icsk_ulp_ops
= NULL
;
546 rcu_assign_pointer(icsk
->icsk_ulp_data
, NULL
);
547 tcp_sk(sk
)->is_mptcp
= 0;
549 mptcp_subflow_ops_undo_override(sk
);
552 static void subflow_drop_ctx(struct sock
*ssk
)
554 struct mptcp_subflow_context
*ctx
= mptcp_subflow_ctx(ssk
);
559 subflow_ulp_fallback(ssk
, ctx
);
566 void mptcp_subflow_fully_established(struct mptcp_subflow_context
*subflow
,
567 struct mptcp_options_received
*mp_opt
)
569 struct mptcp_sock
*msk
= mptcp_sk(subflow
->conn
);
571 subflow
->remote_key
= mp_opt
->sndr_key
;
572 subflow
->fully_established
= 1;
573 subflow
->can_ack
= 1;
574 WRITE_ONCE(msk
->fully_established
, true);
577 static struct sock
*subflow_syn_recv_sock(const struct sock
*sk
,
579 struct request_sock
*req
,
580 struct dst_entry
*dst
,
581 struct request_sock
*req_unhash
,
584 struct mptcp_subflow_context
*listener
= mptcp_subflow_ctx(sk
);
585 struct mptcp_subflow_request_sock
*subflow_req
;
586 struct mptcp_options_received mp_opt
;
587 bool fallback
, fallback_is_fatal
;
588 struct sock
*new_msk
= NULL
;
591 pr_debug("listener=%p, req=%p, conn=%p", listener
, req
, listener
->conn
);
593 /* After child creation we must look for 'mp_capable' even when options
596 mp_opt
.mp_capable
= 0;
598 /* hopefully temporary handling for MP_JOIN+syncookie */
599 subflow_req
= mptcp_subflow_rsk(req
);
600 fallback_is_fatal
= tcp_rsk(req
)->is_mptcp
&& subflow_req
->mp_join
;
601 fallback
= !tcp_rsk(req
)->is_mptcp
;
605 /* if the sk is MP_CAPABLE, we try to fetch the client key */
606 if (subflow_req
->mp_capable
) {
607 if (TCP_SKB_CB(skb
)->seq
!= subflow_req
->ssn_offset
+ 1) {
608 /* here we can receive and accept an in-window,
609 * out-of-order pkt, which will not carry the MP_CAPABLE
610 * opt even on mptcp enabled paths
615 mptcp_get_options(skb
, &mp_opt
);
616 if (!mp_opt
.mp_capable
) {
622 new_msk
= mptcp_sk_clone(listener
->conn
, &mp_opt
, req
);
625 } else if (subflow_req
->mp_join
) {
626 mptcp_get_options(skb
, &mp_opt
);
627 if (!mp_opt
.mp_join
|| !subflow_hmac_valid(req
, &mp_opt
) ||
628 !mptcp_can_accept_new_subflow(subflow_req
->msk
)) {
629 SUBFLOW_REQ_INC_STATS(req
, MPTCP_MIB_JOINACKMAC
);
635 child
= listener
->icsk_af_ops
->syn_recv_sock(sk
, skb
, req
, dst
,
636 req_unhash
, own_req
);
638 if (child
&& *own_req
) {
639 struct mptcp_subflow_context
*ctx
= mptcp_subflow_ctx(child
);
641 tcp_rsk(req
)->drop_req
= false;
643 /* we need to fallback on ctx allocation failure and on pre-reqs
644 * checking above. In the latter scenario we additionally need
645 * to reset the context to non MPTCP status.
647 if (!ctx
|| fallback
) {
648 if (fallback_is_fatal
)
651 subflow_drop_ctx(child
);
655 if (ctx
->mp_capable
) {
656 /* this can't race with mptcp_close(), as the msk is
657 * not yet exposted to user-space
659 inet_sk_state_store((void *)new_msk
, TCP_ESTABLISHED
);
661 /* record the newly created socket as the first msk
662 * subflow, but don't link it yet into conn_list
664 WRITE_ONCE(mptcp_sk(new_msk
)->first
, child
);
666 /* new mpc subflow takes ownership of the newly
667 * created mptcp socket
669 new_msk
->sk_destruct
= mptcp_sock_destruct
;
670 mptcp_pm_new_connection(mptcp_sk(new_msk
), child
, 1);
671 mptcp_token_accept(subflow_req
, mptcp_sk(new_msk
));
675 /* with OoO packets we can reach here without ingress
678 if (mp_opt
.mp_capable
)
679 mptcp_subflow_fully_established(ctx
, &mp_opt
);
680 } else if (ctx
->mp_join
) {
681 struct mptcp_sock
*owner
;
683 owner
= subflow_req
->msk
;
687 /* move the msk reference ownership to the subflow */
688 subflow_req
->msk
= NULL
;
689 ctx
->conn
= (struct sock
*)owner
;
690 if (!mptcp_finish_join(child
))
693 SUBFLOW_REQ_INC_STATS(req
, MPTCP_MIB_JOINACKRX
);
694 tcp_rsk(req
)->drop_req
= true;
696 if (subflow_use_different_sport(owner
, sk
)) {
697 pr_debug("ack inet_sport=%d %d",
698 ntohs(inet_sk(sk
)->inet_sport
),
699 ntohs(inet_sk((struct sock
*)owner
)->inet_sport
));
700 if (!mptcp_pm_sport_in_anno_list(owner
, sk
)) {
701 SUBFLOW_REQ_INC_STATS(req
, MPTCP_MIB_MISMATCHPORTACKRX
);
704 SUBFLOW_REQ_INC_STATS(req
, MPTCP_MIB_JOINPORTACKRX
);
710 /* dispose of the left over mptcp master, if any */
711 if (unlikely(new_msk
))
712 mptcp_force_close(new_msk
);
714 /* check for expected invariant - should never trigger, just help
715 * catching eariler subtle bugs
717 WARN_ON_ONCE(child
&& *own_req
&& tcp_sk(child
)->is_mptcp
&&
718 (!mptcp_subflow_ctx(child
) ||
719 !mptcp_subflow_ctx(child
)->conn
));
723 subflow_drop_ctx(child
);
724 tcp_rsk(req
)->drop_req
= true;
725 inet_csk_prepare_for_destroy_sock(child
);
727 req
->rsk_ops
->send_reset(sk
, skb
);
729 /* The last child reference will be released by the caller */
733 static struct inet_connection_sock_af_ops subflow_specific
;
734 static struct proto tcp_prot_override
;
736 enum mapping_status
{
744 static u64
expand_seq(u64 old_seq
, u16 old_data_len
, u64 seq
)
746 if ((u32
)seq
== (u32
)old_seq
)
749 /* Assume map covers data not mapped yet. */
750 return seq
| ((old_seq
+ old_data_len
+ 1) & GENMASK_ULL(63, 32));
753 static void warn_bad_map(struct mptcp_subflow_context
*subflow
, u32 ssn
)
755 WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
756 ssn
, subflow
->map_subflow_seq
, subflow
->map_data_len
);
759 static bool skb_is_fully_mapped(struct sock
*ssk
, struct sk_buff
*skb
)
761 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(ssk
);
762 unsigned int skb_consumed
;
764 skb_consumed
= tcp_sk(ssk
)->copied_seq
- TCP_SKB_CB(skb
)->seq
;
765 if (WARN_ON_ONCE(skb_consumed
>= skb
->len
))
768 return skb
->len
- skb_consumed
<= subflow
->map_data_len
-
769 mptcp_subflow_get_map_offset(subflow
);
772 static bool validate_mapping(struct sock
*ssk
, struct sk_buff
*skb
)
774 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(ssk
);
775 u32 ssn
= tcp_sk(ssk
)->copied_seq
- subflow
->ssn_offset
;
777 if (unlikely(before(ssn
, subflow
->map_subflow_seq
))) {
778 /* Mapping covers data later in the subflow stream,
779 * currently unsupported.
781 warn_bad_map(subflow
, ssn
);
784 if (unlikely(!before(ssn
, subflow
->map_subflow_seq
+
785 subflow
->map_data_len
))) {
786 /* Mapping does covers past subflow data, invalid */
787 warn_bad_map(subflow
, ssn
+ skb
->len
);
793 static enum mapping_status
get_mapping_status(struct sock
*ssk
,
794 struct mptcp_sock
*msk
)
796 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(ssk
);
797 struct mptcp_ext
*mpext
;
802 skb
= skb_peek(&ssk
->sk_receive_queue
);
804 return MAPPING_EMPTY
;
806 if (mptcp_check_fallback(ssk
))
807 return MAPPING_DUMMY
;
809 mpext
= mptcp_get_ext(skb
);
810 if (!mpext
|| !mpext
->use_map
) {
811 if (!subflow
->map_valid
&& !skb
->len
) {
812 /* the TCP stack deliver 0 len FIN pkt to the receive
813 * queue, that is the only 0len pkts ever expected here,
814 * and we can admit no mapping only for 0 len pkts
816 if (!(TCP_SKB_CB(skb
)->tcp_flags
& TCPHDR_FIN
))
817 WARN_ONCE(1, "0len seq %d:%d flags %x",
818 TCP_SKB_CB(skb
)->seq
,
819 TCP_SKB_CB(skb
)->end_seq
,
820 TCP_SKB_CB(skb
)->tcp_flags
);
821 sk_eat_skb(ssk
, skb
);
822 return MAPPING_EMPTY
;
825 if (!subflow
->map_valid
)
826 return MAPPING_INVALID
;
831 pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
832 mpext
->data_seq
, mpext
->dsn64
, mpext
->subflow_seq
,
833 mpext
->data_len
, mpext
->data_fin
);
835 data_len
= mpext
->data_len
;
837 pr_err("Infinite mapping not handled");
838 MPTCP_INC_STATS(sock_net(ssk
), MPTCP_MIB_INFINITEMAPRX
);
839 return MAPPING_INVALID
;
842 if (mpext
->data_fin
== 1) {
844 bool updated
= mptcp_update_rcv_data_fin(msk
, mpext
->data_seq
,
846 pr_debug("DATA_FIN with no payload seq=%llu", mpext
->data_seq
);
847 if (subflow
->map_valid
) {
848 /* A DATA_FIN might arrive in a DSS
849 * option before the previous mapping
850 * has been fully consumed. Continue
851 * handling the existing mapping.
853 skb_ext_del(skb
, SKB_EXT_MPTCP
);
856 if (updated
&& schedule_work(&msk
->work
))
857 sock_hold((struct sock
*)msk
);
859 return MAPPING_DATA_FIN
;
862 u64 data_fin_seq
= mpext
->data_seq
+ data_len
- 1;
864 /* If mpext->data_seq is a 32-bit value, data_fin_seq
865 * must also be limited to 32 bits.
868 data_fin_seq
&= GENMASK_ULL(31, 0);
870 mptcp_update_rcv_data_fin(msk
, data_fin_seq
, mpext
->dsn64
);
871 pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d",
872 data_fin_seq
, mpext
->dsn64
);
875 /* Adjust for DATA_FIN using 1 byte of sequence space */
880 map_seq
= expand_seq(subflow
->map_seq
, subflow
->map_data_len
,
882 pr_debug("expanded seq=%llu", subflow
->map_seq
);
884 map_seq
= mpext
->data_seq
;
886 WRITE_ONCE(mptcp_sk(subflow
->conn
)->use_64bit_ack
, !!mpext
->dsn64
);
888 if (subflow
->map_valid
) {
889 /* Allow replacing only with an identical map */
890 if (subflow
->map_seq
== map_seq
&&
891 subflow
->map_subflow_seq
== mpext
->subflow_seq
&&
892 subflow
->map_data_len
== data_len
) {
893 skb_ext_del(skb
, SKB_EXT_MPTCP
);
897 /* If this skb data are fully covered by the current mapping,
898 * the new map would need caching, which is not supported
900 if (skb_is_fully_mapped(ssk
, skb
)) {
901 MPTCP_INC_STATS(sock_net(ssk
), MPTCP_MIB_DSSNOMATCH
);
902 return MAPPING_INVALID
;
905 /* will validate the next map after consuming the current one */
909 subflow
->map_seq
= map_seq
;
910 subflow
->map_subflow_seq
= mpext
->subflow_seq
;
911 subflow
->map_data_len
= data_len
;
912 subflow
->map_valid
= 1;
913 subflow
->mpc_map
= mpext
->mpc_map
;
914 pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
915 subflow
->map_seq
, subflow
->map_subflow_seq
,
916 subflow
->map_data_len
);
919 /* we revalidate valid mapping on new skb, because we must ensure
920 * the current skb is completely covered by the available mapping
922 if (!validate_mapping(ssk
, skb
))
923 return MAPPING_INVALID
;
925 skb_ext_del(skb
, SKB_EXT_MPTCP
);
929 static void mptcp_subflow_discard_data(struct sock
*ssk
, struct sk_buff
*skb
,
932 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(ssk
);
933 bool fin
= TCP_SKB_CB(skb
)->tcp_flags
& TCPHDR_FIN
;
936 incr
= limit
>= skb
->len
? skb
->len
+ fin
: limit
;
938 pr_debug("discarding=%d len=%d seq=%d", incr
, skb
->len
,
939 subflow
->map_subflow_seq
);
940 MPTCP_INC_STATS(sock_net(ssk
), MPTCP_MIB_DUPDATA
);
941 tcp_sk(ssk
)->copied_seq
+= incr
;
942 if (!before(tcp_sk(ssk
)->copied_seq
, TCP_SKB_CB(skb
)->end_seq
))
943 sk_eat_skb(ssk
, skb
);
944 if (mptcp_subflow_get_map_offset(subflow
) >= subflow
->map_data_len
)
945 subflow
->map_valid
= 0;
948 /* sched mptcp worker to remove the subflow if no more data is pending */
949 static void subflow_sched_work_if_closed(struct mptcp_sock
*msk
, struct sock
*ssk
)
951 struct sock
*sk
= (struct sock
*)msk
;
953 if (likely(ssk
->sk_state
!= TCP_CLOSE
))
956 if (skb_queue_empty(&ssk
->sk_receive_queue
) &&
957 !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW
, &msk
->flags
)) {
959 if (!schedule_work(&msk
->work
))
964 static bool subflow_check_data_avail(struct sock
*ssk
)
966 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(ssk
);
967 enum mapping_status status
;
968 struct mptcp_sock
*msk
;
971 pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow
->conn
, ssk
,
972 subflow
->data_avail
, skb_peek(&ssk
->sk_receive_queue
));
973 if (!skb_peek(&ssk
->sk_receive_queue
))
974 subflow
->data_avail
= 0;
975 if (subflow
->data_avail
)
978 msk
= mptcp_sk(subflow
->conn
);
983 status
= get_mapping_status(ssk
, msk
);
984 pr_debug("msk=%p ssk=%p status=%d", msk
, ssk
, status
);
985 if (status
== MAPPING_INVALID
) {
986 ssk
->sk_err
= EBADMSG
;
989 if (status
== MAPPING_DUMMY
) {
990 __mptcp_do_fallback(msk
);
991 skb
= skb_peek(&ssk
->sk_receive_queue
);
992 subflow
->map_valid
= 1;
993 subflow
->map_seq
= READ_ONCE(msk
->ack_seq
);
994 subflow
->map_data_len
= skb
->len
;
995 subflow
->map_subflow_seq
= tcp_sk(ssk
)->copied_seq
-
997 subflow
->data_avail
= MPTCP_SUBFLOW_DATA_AVAIL
;
1001 if (status
!= MAPPING_OK
)
1004 skb
= skb_peek(&ssk
->sk_receive_queue
);
1005 if (WARN_ON_ONCE(!skb
))
1008 /* if msk lacks the remote key, this subflow must provide an
1009 * MP_CAPABLE-based mapping
1011 if (unlikely(!READ_ONCE(msk
->can_ack
))) {
1012 if (!subflow
->mpc_map
) {
1013 ssk
->sk_err
= EBADMSG
;
1016 WRITE_ONCE(msk
->remote_key
, subflow
->remote_key
);
1017 WRITE_ONCE(msk
->ack_seq
, subflow
->map_seq
);
1018 WRITE_ONCE(msk
->can_ack
, true);
1021 old_ack
= READ_ONCE(msk
->ack_seq
);
1022 ack_seq
= mptcp_subflow_get_mapped_dsn(subflow
);
1023 pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack
,
1025 if (ack_seq
== old_ack
) {
1026 subflow
->data_avail
= MPTCP_SUBFLOW_DATA_AVAIL
;
1028 } else if (after64(ack_seq
, old_ack
)) {
1029 subflow
->data_avail
= MPTCP_SUBFLOW_OOO_DATA
;
1033 /* only accept in-sequence mapping. Old values are spurious
1036 mptcp_subflow_discard_data(ssk
, skb
, old_ack
- ack_seq
);
1041 subflow_sched_work_if_closed(msk
, ssk
);
1044 /* fatal protocol error, close the socket */
1045 /* This barrier is coupled with smp_rmb() in tcp_poll() */
1047 ssk
->sk_error_report(ssk
);
1048 tcp_set_state(ssk
, TCP_CLOSE
);
1049 tcp_send_active_reset(ssk
, GFP_ATOMIC
);
1050 subflow
->data_avail
= 0;
1054 bool mptcp_subflow_data_available(struct sock
*sk
)
1056 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(sk
);
1058 /* check if current mapping is still valid */
1059 if (subflow
->map_valid
&&
1060 mptcp_subflow_get_map_offset(subflow
) >= subflow
->map_data_len
) {
1061 subflow
->map_valid
= 0;
1062 subflow
->data_avail
= 0;
1064 pr_debug("Done with mapping: seq=%u data_len=%u",
1065 subflow
->map_subflow_seq
,
1066 subflow
->map_data_len
);
1069 return subflow_check_data_avail(sk
);
1072 /* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
1075 * In mptcp, rwin is about the mptcp-level connection data.
1077 * Data that is still on the ssk rx queue can thus be ignored,
1078 * as far as mptcp peer is concerened that data is still inflight.
1079 * DSS ACK is updated when skb is moved to the mptcp rx queue.
1081 void mptcp_space(const struct sock
*ssk
, int *space
, int *full_space
)
1083 const struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(ssk
);
1084 const struct sock
*sk
= subflow
->conn
;
1086 *space
= __mptcp_space(sk
);
1087 *full_space
= tcp_full_space(sk
);
1090 static void subflow_data_ready(struct sock
*sk
)
1092 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(sk
);
1093 u16 state
= 1 << inet_sk_state_load(sk
);
1094 struct sock
*parent
= subflow
->conn
;
1095 struct mptcp_sock
*msk
;
1097 msk
= mptcp_sk(parent
);
1098 if (state
& TCPF_LISTEN
) {
1099 /* MPJ subflow are removed from accept queue before reaching here,
1100 * avoid stray wakeups
1102 if (reqsk_queue_empty(&inet_csk(sk
)->icsk_accept_queue
))
1105 set_bit(MPTCP_DATA_READY
, &msk
->flags
);
1106 parent
->sk_data_ready(parent
);
1110 WARN_ON_ONCE(!__mptcp_check_fallback(msk
) && !subflow
->mp_capable
&&
1111 !subflow
->mp_join
&& !(state
& TCPF_CLOSE
));
1113 if (mptcp_subflow_data_available(sk
))
1114 mptcp_data_ready(parent
, sk
);
1117 static void subflow_write_space(struct sock
*ssk
)
1119 struct sock
*sk
= mptcp_subflow_ctx(ssk
)->conn
;
1121 mptcp_propagate_sndbuf(sk
, ssk
);
1122 mptcp_write_space(sk
);
1125 void __mptcp_error_report(struct sock
*sk
)
1127 struct mptcp_subflow_context
*subflow
;
1128 struct mptcp_sock
*msk
= mptcp_sk(sk
);
1130 mptcp_for_each_subflow(msk
, subflow
) {
1131 struct sock
*ssk
= mptcp_subflow_tcp_sock(subflow
);
1132 int err
= sock_error(ssk
);
1137 /* only propagate errors on fallen-back sockets or
1140 if (sk
->sk_state
!= TCP_SYN_SENT
&& !__mptcp_check_fallback(msk
))
1143 inet_sk_state_store(sk
, inet_sk_state_load(ssk
));
1146 /* This barrier is coupled with smp_rmb() in mptcp_poll() */
1148 sk
->sk_error_report(sk
);
1153 static void subflow_error_report(struct sock
*ssk
)
1155 struct sock
*sk
= mptcp_subflow_ctx(ssk
)->conn
;
1157 mptcp_data_lock(sk
);
1158 if (!sock_owned_by_user(sk
))
1159 __mptcp_error_report(sk
);
1161 set_bit(MPTCP_ERROR_REPORT
, &mptcp_sk(sk
)->flags
);
1162 mptcp_data_unlock(sk
);
1165 static struct inet_connection_sock_af_ops
*
1166 subflow_default_af_ops(struct sock
*sk
)
1168 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1169 if (sk
->sk_family
== AF_INET6
)
1170 return &subflow_v6_specific
;
1172 return &subflow_specific
;
1175 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1176 void mptcpv6_handle_mapped(struct sock
*sk
, bool mapped
)
1178 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(sk
);
1179 struct inet_connection_sock
*icsk
= inet_csk(sk
);
1180 struct inet_connection_sock_af_ops
*target
;
1182 target
= mapped
? &subflow_v6m_specific
: subflow_default_af_ops(sk
);
1184 pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
1185 subflow
, sk
->sk_family
, icsk
->icsk_af_ops
, target
, mapped
);
1187 if (likely(icsk
->icsk_af_ops
== target
))
1190 subflow
->icsk_af_ops
= icsk
->icsk_af_ops
;
1191 icsk
->icsk_af_ops
= target
;
1195 void mptcp_info2sockaddr(const struct mptcp_addr_info
*info
,
1196 struct sockaddr_storage
*addr
,
1197 unsigned short family
)
1199 memset(addr
, 0, sizeof(*addr
));
1200 addr
->ss_family
= family
;
1201 if (addr
->ss_family
== AF_INET
) {
1202 struct sockaddr_in
*in_addr
= (struct sockaddr_in
*)addr
;
1204 if (info
->family
== AF_INET
)
1205 in_addr
->sin_addr
= info
->addr
;
1206 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1207 else if (ipv6_addr_v4mapped(&info
->addr6
))
1208 in_addr
->sin_addr
.s_addr
= info
->addr6
.s6_addr32
[3];
1210 in_addr
->sin_port
= info
->port
;
1212 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1213 else if (addr
->ss_family
== AF_INET6
) {
1214 struct sockaddr_in6
*in6_addr
= (struct sockaddr_in6
*)addr
;
1216 if (info
->family
== AF_INET
)
1217 ipv6_addr_set_v4mapped(info
->addr
.s_addr
,
1218 &in6_addr
->sin6_addr
);
1220 in6_addr
->sin6_addr
= info
->addr6
;
1221 in6_addr
->sin6_port
= info
->port
;
1226 int __mptcp_subflow_connect(struct sock
*sk
, const struct mptcp_addr_info
*loc
,
1227 const struct mptcp_addr_info
*remote
)
1229 struct mptcp_sock
*msk
= mptcp_sk(sk
);
1230 struct mptcp_subflow_context
*subflow
;
1231 struct sockaddr_storage addr
;
1232 int remote_id
= remote
->id
;
1233 int local_id
= loc
->id
;
1240 if (!mptcp_is_fully_established(sk
))
1243 err
= mptcp_subflow_create_socket(sk
, &sf
);
1248 subflow
= mptcp_subflow_ctx(ssk
);
1250 get_random_bytes(&subflow
->local_nonce
, sizeof(u32
));
1251 } while (!subflow
->local_nonce
);
1254 err
= mptcp_pm_get_local_id(msk
, (struct sock_common
*)ssk
);
1261 subflow
->remote_key
= msk
->remote_key
;
1262 subflow
->local_key
= msk
->local_key
;
1263 subflow
->token
= msk
->token
;
1264 mptcp_info2sockaddr(loc
, &addr
, ssk
->sk_family
);
1266 addrlen
= sizeof(struct sockaddr_in
);
1267 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1268 if (addr
.ss_family
== AF_INET6
)
1269 addrlen
= sizeof(struct sockaddr_in6
);
1271 ssk
->sk_bound_dev_if
= loc
->ifindex
;
1272 err
= kernel_bind(sf
, (struct sockaddr
*)&addr
, addrlen
);
1276 mptcp_crypto_key_sha(subflow
->remote_key
, &remote_token
, NULL
);
1277 pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk
,
1278 remote_token
, local_id
, remote_id
);
1279 subflow
->remote_token
= remote_token
;
1280 subflow
->local_id
= local_id
;
1281 subflow
->remote_id
= remote_id
;
1282 subflow
->request_join
= 1;
1283 subflow
->request_bkup
= !!(loc
->flags
& MPTCP_PM_ADDR_FLAG_BACKUP
);
1284 mptcp_info2sockaddr(remote
, &addr
, ssk
->sk_family
);
1286 mptcp_add_pending_subflow(msk
, subflow
);
1287 err
= kernel_connect(sf
, (struct sockaddr
*)&addr
, addrlen
, O_NONBLOCK
);
1288 if (err
&& err
!= -EINPROGRESS
)
1291 /* discard the subflow socket */
1292 mptcp_sock_graft(ssk
, sk
->sk_socket
);
1293 iput(SOCK_INODE(sf
));
1297 spin_lock_bh(&msk
->join_list_lock
);
1298 list_del(&subflow
->node
);
1299 spin_unlock_bh(&msk
->join_list_lock
);
1302 subflow
->disposable
= 1;
/* Move @child (a kworker-created subflow socket) into the same cgroup and
 * memcg as @parent, so accounting matches the owning MPTCP socket.
 */
static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
	struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
				*child_skcd = &child->sk_cgrp_data;

	/* only the additional subflows created by kworkers have to be modified */
	if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
	    cgroup_id(sock_cgroup_ptr(child_skcd))) {
#ifdef CONFIG_MEMCG
		struct mem_cgroup *memcg = parent->sk_memcg;

		/* re-charge the child to the parent's memcg */
		mem_cgroup_sk_free(child);
		if (memcg && css_tryget(&memcg->css))
			child->sk_memcg = memcg;
#endif /* CONFIG_MEMCG */

		cgroup_sk_free(child_skcd);
		*child_skcd = *parent_skcd;
		cgroup_sk_clone(child_skcd);
	}
#endif /* CONFIG_SOCK_CGROUP_DATA */
}
1331 static void mptcp_subflow_ops_override(struct sock
*ssk
)
1333 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1334 if (ssk
->sk_prot
== &tcpv6_prot
)
1335 ssk
->sk_prot
= &tcpv6_prot_override
;
1338 ssk
->sk_prot
= &tcp_prot_override
;
1341 static void mptcp_subflow_ops_undo_override(struct sock
*ssk
)
1343 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1344 if (ssk
->sk_prot
== &tcpv6_prot_override
)
1345 ssk
->sk_prot
= &tcpv6_prot
;
1348 ssk
->sk_prot
= &tcp_prot
;
1350 int mptcp_subflow_create_socket(struct sock
*sk
, struct socket
**new_sock
)
1352 struct mptcp_subflow_context
*subflow
;
1353 struct net
*net
= sock_net(sk
);
1357 /* un-accepted server sockets can reach here - on bad configuration
1358 * bail early to avoid greater trouble later
1360 if (unlikely(!sk
->sk_socket
))
1363 err
= sock_create_kern(net
, sk
->sk_family
, SOCK_STREAM
, IPPROTO_TCP
,
1370 /* the newly created socket has to be in the same cgroup as its parent */
1371 mptcp_attach_cgroup(sk
, sf
->sk
);
1373 /* kernel sockets do not by default acquire net ref, but TCP timer
1376 sf
->sk
->sk_net_refcnt
= 1;
1378 #ifdef CONFIG_PROC_FS
1379 this_cpu_add(*net
->core
.sock_inuse
, 1);
1381 err
= tcp_set_ulp(sf
->sk
, "mptcp");
1382 release_sock(sf
->sk
);
1389 /* the newly created socket really belongs to the owning MPTCP master
1390 * socket, even if for additional subflows the allocation is performed
1391 * by a kernel workqueue. Adjust inode references, so that the
1392 * procfs/diag interaces really show this one belonging to the correct
1395 SOCK_INODE(sf
)->i_ino
= SOCK_INODE(sk
->sk_socket
)->i_ino
;
1396 SOCK_INODE(sf
)->i_uid
= SOCK_INODE(sk
->sk_socket
)->i_uid
;
1397 SOCK_INODE(sf
)->i_gid
= SOCK_INODE(sk
->sk_socket
)->i_gid
;
1399 subflow
= mptcp_subflow_ctx(sf
->sk
);
1400 pr_debug("subflow=%p", subflow
);
1405 mptcp_subflow_ops_override(sf
->sk
);
1410 static struct mptcp_subflow_context
*subflow_create_ctx(struct sock
*sk
,
1413 struct inet_connection_sock
*icsk
= inet_csk(sk
);
1414 struct mptcp_subflow_context
*ctx
;
1416 ctx
= kzalloc(sizeof(*ctx
), priority
);
1420 rcu_assign_pointer(icsk
->icsk_ulp_data
, ctx
);
1421 INIT_LIST_HEAD(&ctx
->node
);
1422 INIT_LIST_HEAD(&ctx
->delegated_node
);
1424 pr_debug("subflow=%p", ctx
);
1431 static void __subflow_state_change(struct sock
*sk
)
1433 struct socket_wq
*wq
;
1436 wq
= rcu_dereference(sk
->sk_wq
);
1437 if (skwq_has_sleeper(wq
))
1438 wake_up_interruptible_all(&wq
->wait
);
1442 static bool subflow_is_done(const struct sock
*sk
)
1444 return sk
->sk_shutdown
& RCV_SHUTDOWN
|| sk
->sk_state
== TCP_CLOSE
;
1447 static void subflow_state_change(struct sock
*sk
)
1449 struct mptcp_subflow_context
*subflow
= mptcp_subflow_ctx(sk
);
1450 struct sock
*parent
= subflow
->conn
;
1452 __subflow_state_change(sk
);
1454 if (subflow_simultaneous_connect(sk
)) {
1455 mptcp_propagate_sndbuf(parent
, sk
);
1456 mptcp_do_fallback(sk
);
1457 mptcp_rcv_space_init(mptcp_sk(parent
), sk
);
1458 pr_fallback(mptcp_sk(parent
));
1459 subflow
->conn_finished
= 1;
1460 if (inet_sk_state_load(parent
) == TCP_SYN_SENT
) {
1461 inet_sk_state_store(parent
, TCP_ESTABLISHED
);
1462 parent
->sk_state_change(parent
);
1466 /* as recvmsg() does not acquire the subflow socket for ssk selection
1467 * a fin packet carrying a DSS can be unnoticed if we don't trigger
1468 * the data available machinery here.
1470 if (mptcp_subflow_data_available(sk
))
1471 mptcp_data_ready(parent
, sk
);
1473 subflow_sched_work_if_closed(mptcp_sk(parent
), sk
);
1475 if (__mptcp_check_fallback(mptcp_sk(parent
)) &&
1476 !subflow
->rx_eof
&& subflow_is_done(sk
)) {
1477 subflow
->rx_eof
= 1;
1478 mptcp_subflow_eof(parent
);
1482 static int subflow_ulp_init(struct sock
*sk
)
1484 struct inet_connection_sock
*icsk
= inet_csk(sk
);
1485 struct mptcp_subflow_context
*ctx
;
1486 struct tcp_sock
*tp
= tcp_sk(sk
);
1489 /* disallow attaching ULP to a socket unless it has been
1490 * created with sock_create_kern()
1492 if (!sk
->sk_kern_sock
) {
1497 ctx
= subflow_create_ctx(sk
, GFP_KERNEL
);
1503 pr_debug("subflow=%p, family=%d", ctx
, sk
->sk_family
);
1506 ctx
->icsk_af_ops
= icsk
->icsk_af_ops
;
1507 icsk
->icsk_af_ops
= subflow_default_af_ops(sk
);
1508 ctx
->tcp_data_ready
= sk
->sk_data_ready
;
1509 ctx
->tcp_state_change
= sk
->sk_state_change
;
1510 ctx
->tcp_write_space
= sk
->sk_write_space
;
1511 ctx
->tcp_error_report
= sk
->sk_error_report
;
1512 sk
->sk_data_ready
= subflow_data_ready
;
1513 sk
->sk_write_space
= subflow_write_space
;
1514 sk
->sk_state_change
= subflow_state_change
;
1515 sk
->sk_error_report
= subflow_error_report
;
1520 static void subflow_ulp_release(struct sock
*ssk
)
1522 struct mptcp_subflow_context
*ctx
= mptcp_subflow_ctx(ssk
);
1523 bool release
= true;
1531 /* if the msk has been orphaned, keep the ctx
1532 * alive, will be freed by __mptcp_close_ssk(),
1533 * when the subflow is still unaccepted
1535 release
= ctx
->disposable
|| list_empty(&ctx
->node
);
1539 mptcp_subflow_ops_undo_override(ssk
);
1541 kfree_rcu(ctx
, rcu
);
1544 static void subflow_ulp_clone(const struct request_sock
*req
,
1546 const gfp_t priority
)
1548 struct mptcp_subflow_request_sock
*subflow_req
= mptcp_subflow_rsk(req
);
1549 struct mptcp_subflow_context
*old_ctx
= mptcp_subflow_ctx(newsk
);
1550 struct mptcp_subflow_context
*new_ctx
;
1552 if (!tcp_rsk(req
)->is_mptcp
||
1553 (!subflow_req
->mp_capable
&& !subflow_req
->mp_join
)) {
1554 subflow_ulp_fallback(newsk
, old_ctx
);
1558 new_ctx
= subflow_create_ctx(newsk
, priority
);
1560 subflow_ulp_fallback(newsk
, old_ctx
);
1564 new_ctx
->conn_finished
= 1;
1565 new_ctx
->icsk_af_ops
= old_ctx
->icsk_af_ops
;
1566 new_ctx
->tcp_data_ready
= old_ctx
->tcp_data_ready
;
1567 new_ctx
->tcp_state_change
= old_ctx
->tcp_state_change
;
1568 new_ctx
->tcp_write_space
= old_ctx
->tcp_write_space
;
1569 new_ctx
->tcp_error_report
= old_ctx
->tcp_error_report
;
1570 new_ctx
->rel_write_seq
= 1;
1571 new_ctx
->tcp_sock
= newsk
;
1573 if (subflow_req
->mp_capable
) {
1574 /* see comments in subflow_syn_recv_sock(), MPTCP connection
1575 * is fully established only after we receive the remote key
1577 new_ctx
->mp_capable
= 1;
1578 new_ctx
->local_key
= subflow_req
->local_key
;
1579 new_ctx
->token
= subflow_req
->token
;
1580 new_ctx
->ssn_offset
= subflow_req
->ssn_offset
;
1581 new_ctx
->idsn
= subflow_req
->idsn
;
1582 } else if (subflow_req
->mp_join
) {
1583 new_ctx
->ssn_offset
= subflow_req
->ssn_offset
;
1584 new_ctx
->mp_join
= 1;
1585 new_ctx
->fully_established
= 1;
1586 new_ctx
->backup
= subflow_req
->backup
;
1587 new_ctx
->local_id
= subflow_req
->local_id
;
1588 new_ctx
->remote_id
= subflow_req
->remote_id
;
1589 new_ctx
->token
= subflow_req
->token
;
1590 new_ctx
->thmac
= subflow_req
->thmac
;
/* release_cb override for subflow sockets: run any pending delegated MPTCP
 * action before the regular TCP release callback.
 */
static void tcp_release_cb_override(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);

	if (mptcp_subflow_has_delegated_action(subflow))
		mptcp_subflow_process_delegated(ssk);

	tcp_release_cb(ssk);
}
1604 static struct tcp_ulp_ops subflow_ulp_ops __read_mostly
= {
1606 .owner
= THIS_MODULE
,
1607 .init
= subflow_ulp_init
,
1608 .release
= subflow_ulp_release
,
1609 .clone
= subflow_ulp_clone
,
1612 static int subflow_ops_init(struct request_sock_ops
*subflow_ops
)
1614 subflow_ops
->obj_size
= sizeof(struct mptcp_subflow_request_sock
);
1615 subflow_ops
->slab_name
= "request_sock_subflow";
1617 subflow_ops
->slab
= kmem_cache_create(subflow_ops
->slab_name
,
1618 subflow_ops
->obj_size
, 0,
1620 SLAB_TYPESAFE_BY_RCU
,
1622 if (!subflow_ops
->slab
)
1625 subflow_ops
->destructor
= subflow_req_destructor
;
1630 void __init
mptcp_subflow_init(void)
1632 mptcp_subflow_request_sock_ops
= tcp_request_sock_ops
;
1633 if (subflow_ops_init(&mptcp_subflow_request_sock_ops
) != 0)
1634 panic("MPTCP: failed to init subflow request sock ops\n");
1636 subflow_request_sock_ipv4_ops
= tcp_request_sock_ipv4_ops
;
1637 subflow_request_sock_ipv4_ops
.route_req
= subflow_v4_route_req
;
1639 subflow_specific
= ipv4_specific
;
1640 subflow_specific
.conn_request
= subflow_v4_conn_request
;
1641 subflow_specific
.syn_recv_sock
= subflow_syn_recv_sock
;
1642 subflow_specific
.sk_rx_dst_set
= subflow_finish_connect
;
1644 tcp_prot_override
= tcp_prot
;
1645 tcp_prot_override
.release_cb
= tcp_release_cb_override
;
1647 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1648 subflow_request_sock_ipv6_ops
= tcp_request_sock_ipv6_ops
;
1649 subflow_request_sock_ipv6_ops
.route_req
= subflow_v6_route_req
;
1651 subflow_v6_specific
= ipv6_specific
;
1652 subflow_v6_specific
.conn_request
= subflow_v6_conn_request
;
1653 subflow_v6_specific
.syn_recv_sock
= subflow_syn_recv_sock
;
1654 subflow_v6_specific
.sk_rx_dst_set
= subflow_finish_connect
;
1656 subflow_v6m_specific
= subflow_v6_specific
;
1657 subflow_v6m_specific
.queue_xmit
= ipv4_specific
.queue_xmit
;
1658 subflow_v6m_specific
.send_check
= ipv4_specific
.send_check
;
1659 subflow_v6m_specific
.net_header_len
= ipv4_specific
.net_header_len
;
1660 subflow_v6m_specific
.mtu_reduced
= ipv4_specific
.mtu_reduced
;
1661 subflow_v6m_specific
.net_frag_header_len
= 0;
1663 tcpv6_prot_override
= tcpv6_prot
;
1664 tcpv6_prot_override
.release_cb
= tcp_release_cb_override
;
1667 mptcp_diag_subflow_init(&subflow_ulp_ops
);
1669 if (tcp_register_ulp(&subflow_ulp_ops
) != 0)
1670 panic("MPTCP: failed to register subflows to ULP\n");