// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <crypto/sha.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include <uapi/linux/mptcp.h>
#include "protocol.h"
#include "mib.h"

static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
                                  enum linux_mptcp_mib_field field)
{
        MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}

static void subflow_req_destructor(struct request_sock *req)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

        pr_debug("subflow_req=%p", subflow_req);

        if (subflow_req->msk)
                sock_put((struct sock *)subflow_req->msk);

        mptcp_token_destroy_request(req);
        tcp_request_sock_ops.destructor(req);
}

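/* MP_JOIN authentication, roughly as described by RFC 8684: both peers
 * compute an HMAC-SHA256 keyed with the pair of connection keys over the
 * two exchanged 32-bit nonces. The SYN-ACK carries only the high-order
 * 64 bits of the result ("thmac"), while the third ACK carries a
 * MPTCPOPT_HMAC_LEN (160-bit) truncation. The helper below builds the
 * 8-byte big-endian nonce message and delegates the digest computation
 * to mptcp_crypto_hmac_sha().
 */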
static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
                                  void *hmac)
{
        u8 msg[8];

        put_unaligned_be32(nonce1, &msg[0]);
        put_unaligned_be32(nonce2, &msg[4]);

        mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}

static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
{
        return mptcp_is_fully_established((void *)msk) &&
               READ_ONCE(msk->pm.accept_subflow);
}

/* validate received token and create truncated hmac and nonce for SYN-ACK */
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
                                                     const struct sk_buff *skb)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        u8 hmac[SHA256_DIGEST_SIZE];
        struct mptcp_sock *msk;
        int local_id;

        msk = mptcp_token_get_sock(subflow_req->token);
        if (!msk) {
                SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
                return NULL;
        }

        local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
        if (local_id < 0) {
                sock_put((struct sock *)msk);
                return NULL;
        }
        subflow_req->local_id = local_id;

        get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

        subflow_generate_hmac(msk->local_key, msk->remote_key,
                              subflow_req->local_nonce,
                              subflow_req->remote_nonce, hmac);

        subflow_req->thmac = get_unaligned_be64(hmac);
        return msk;
}

static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

        subflow_req->mp_capable = 0;
        subflow_req->mp_join = 0;
        subflow_req->msk = NULL;
        mptcp_token_init_request(req);

#ifdef CONFIG_TCP_MD5SIG
        /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
         * TCP option space.
         */
        if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
                return -EINVAL;
#endif

        return 0;
}

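/* Rough flow of passive-open (SYN) handling below: parse the MPTCP
 * options, then either pick a fresh local key/token for an MP_CAPABLE
 * request (retrying a few times on token collision), or, for an MP_JOIN,
 * record the token and nonce from the peer and look up the owning msk so
 * the truncated hmac for the SYN-ACK can be pre-computed.
 */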
static void subflow_init_req(struct request_sock *req,
                             const struct sock *sk_listener,
                             struct sk_buff *skb)
{
        struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        struct mptcp_options_received mp_opt;
        int ret;

        pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

        ret = __subflow_init_req(req, sk_listener);
        if (ret)
                return;

        mptcp_get_options(skb, &mp_opt);

        if (mp_opt.mp_capable) {
                SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

                if (mp_opt.mp_join)
                        return;
        } else if (mp_opt.mp_join) {
                SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
        }

        if (mp_opt.mp_capable && listener->request_mptcp) {
                int err, retries = 4;

                subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
again:
                do {
                        get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
                } while (subflow_req->local_key == 0);

                if (unlikely(req->syncookie)) {
                        mptcp_crypto_key_sha(subflow_req->local_key,
                                             &subflow_req->token,
                                             &subflow_req->idsn);
                        if (mptcp_token_exists(subflow_req->token)) {
                                if (retries-- > 0)
                                        goto again;
                        } else {
                                subflow_req->mp_capable = 1;
                        }
                        return;
                }

                err = mptcp_token_new_request(req);
                if (err == 0)
                        subflow_req->mp_capable = 1;
                else if (retries-- > 0)
                        goto again;

        } else if (mp_opt.mp_join && listener->request_mptcp) {
                subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
                subflow_req->mp_join = 1;
                subflow_req->backup = mp_opt.backup;
                subflow_req->remote_id = mp_opt.join_id;
                subflow_req->token = mp_opt.token;
                subflow_req->remote_nonce = mp_opt.nonce;
                subflow_req->msk = subflow_token_join_request(req, skb);

                if (unlikely(req->syncookie) && subflow_req->msk) {
                        if (mptcp_can_accept_new_subflow(subflow_req->msk))
                                subflow_init_req_cookie_join_save(subflow_req, skb);
                }

                pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
                         subflow_req->remote_nonce, subflow_req->msk);
        }
}

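/* Syncookie path: no request socket survived the SYN, so the MPTCP
 * request state must be rebuilt here from the cookie-validated ACK,
 * mirroring what subflow_init_req() above does for the regular path.
 */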
int mptcp_subflow_init_cookie_req(struct request_sock *req,
                                  const struct sock *sk_listener,
                                  struct sk_buff *skb)
{
        struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        struct mptcp_options_received mp_opt;
        int err;

        err = __subflow_init_req(req, sk_listener);
        if (err)
                return err;

        mptcp_get_options(skb, &mp_opt);

        if (mp_opt.mp_capable && mp_opt.mp_join)
                return -EINVAL;

        if (mp_opt.mp_capable && listener->request_mptcp) {
                if (mp_opt.sndr_key == 0)
                        return -EINVAL;

                subflow_req->local_key = mp_opt.rcvr_key;
                err = mptcp_token_new_request(req);
                if (err)
                        return err;

                subflow_req->mp_capable = 1;
                subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
        } else if (mp_opt.mp_join && listener->request_mptcp) {
                if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
                        return -EINVAL;

                if (mptcp_can_accept_new_subflow(subflow_req->msk))
                        subflow_req->mp_join = 1;

                subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);

static void subflow_v4_init_req(struct request_sock *req,
                                const struct sock *sk_listener,
                                struct sk_buff *skb)
{
        tcp_rsk(req)->is_mptcp = 1;

        tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);

        subflow_init_req(req, sk_listener, skb);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static void subflow_v6_init_req(struct request_sock *req,
                                const struct sock *sk_listener,
                                struct sk_buff *skb)
{
        tcp_rsk(req)->is_mptcp = 1;

        tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);

        subflow_init_req(req, sk_listener, skb);
}
#endif

/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
        u8 hmac[SHA256_DIGEST_SIZE];
        u64 thmac;

        subflow_generate_hmac(subflow->remote_key, subflow->local_key,
                              subflow->remote_nonce, subflow->local_nonce,
                              hmac);

        thmac = get_unaligned_be64(hmac);
        pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
                 subflow, subflow->token,
                 (unsigned long long)thmac,
                 (unsigned long long)subflow->thmac);

        return thmac == subflow->thmac;
}

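/* SYN-ACK processing on an active open, in short: one of three outcomes.
 * Either MP_CAPABLE is confirmed (keys exchanged, the connection becomes
 * MPTCP), or an MP_JOIN SYN-ACK is validated via subflow_thmac_valid()
 * and our own hmac is cached for the third ACK, or the subflow falls
 * back to plain TCP. A failed join cannot fall back and resets the
 * subflow instead.
 */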
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct mptcp_options_received mp_opt;
        struct sock *parent = subflow->conn;

        subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

        if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
                inet_sk_state_store(parent, TCP_ESTABLISHED);
                parent->sk_state_change(parent);
        }

        /* be sure no special action on any packet other than syn-ack */
        if (subflow->conn_finished)
                return;

        subflow->rel_write_seq = 1;
        subflow->conn_finished = 1;
        subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
        pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);

        mptcp_get_options(skb, &mp_opt);
        if (subflow->request_mptcp) {
                if (!mp_opt.mp_capable) {
                        MPTCP_INC_STATS(sock_net(sk),
                                        MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
                        mptcp_do_fallback(sk);
                        pr_fallback(mptcp_sk(subflow->conn));
                        goto fallback;
                }

                subflow->mp_capable = 1;
                subflow->can_ack = 1;
                subflow->remote_key = mp_opt.sndr_key;
                pr_debug("subflow=%p, remote_key=%llu", subflow,
                         subflow->remote_key);
                mptcp_finish_connect(sk);
        } else if (subflow->request_join) {
                u8 hmac[SHA256_DIGEST_SIZE];

                if (!mp_opt.mp_join)
                        goto do_reset;

                subflow->thmac = mp_opt.thmac;
                subflow->remote_nonce = mp_opt.nonce;
                pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
                         subflow->thmac, subflow->remote_nonce);

                if (!subflow_thmac_valid(subflow)) {
                        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
                        goto do_reset;
                }

                subflow_generate_hmac(subflow->local_key, subflow->remote_key,
                                      subflow->local_nonce,
                                      subflow->remote_nonce,
                                      hmac);
                memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);

                if (!mptcp_finish_join(sk))
                        goto do_reset;

                subflow->mp_join = 1;
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
        } else if (mptcp_check_fallback(sk)) {
fallback:
                mptcp_rcv_space_init(mptcp_sk(parent), sk);
        }
        return;

do_reset:
        tcp_send_active_reset(sk, GFP_ATOMIC);
        tcp_done(sk);
}

struct request_sock_ops mptcp_subflow_request_sock_ops;
EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops);

static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;

static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

        pr_debug("subflow=%p", subflow);

        /* Never answer to SYNs sent to broadcast or multicast */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        return tcp_conn_request(&mptcp_subflow_request_sock_ops,
                                &subflow_request_sock_ipv4_ops,
                                sk, skb);
drop:
        tcp_listendrop(sk);
        return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

        pr_debug("subflow=%p", subflow);

        if (skb->protocol == htons(ETH_P_IP))
                return subflow_v4_conn_request(sk, skb);

        if (!ipv6_unicast_destination(skb))
                goto drop;

        return tcp_conn_request(&mptcp_subflow_request_sock_ops,
                                &subflow_request_sock_ipv6_ops, sk, skb);

drop:
        tcp_listendrop(sk);
        return 0; /* don't send reset */
}
#endif

/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
                               const struct mptcp_options_received *mp_opt)
{
        const struct mptcp_subflow_request_sock *subflow_req;
        u8 hmac[SHA256_DIGEST_SIZE];
        struct mptcp_sock *msk;

        subflow_req = mptcp_subflow_rsk(req);
        msk = subflow_req->msk;
        if (!msk)
                return false;

        subflow_generate_hmac(msk->remote_key, msk->local_key,
                              subflow_req->remote_nonce,
                              subflow_req->local_nonce, hmac);

        return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}

static void mptcp_sock_destruct(struct sock *sk)
{
        /* if new mptcp socket isn't accepted, it is free'd
         * from the tcp listener sockets request queue, linked
         * from req->sk.  The tcp socket is released.
         * This calls the ULP release function which will
         * also remove the mptcp socket, via
         * sock_put(ctx->conn).
         *
         * Problem is that the mptcp socket will be in
         * ESTABLISHED state and will not have the SOCK_DEAD flag.
         * Both result in warnings from inet_sock_destruct.
         */

        if (sk->sk_state == TCP_ESTABLISHED) {
                sk->sk_state = TCP_CLOSE;
                WARN_ON_ONCE(sk->sk_socket);
                sock_orphan(sk);
        }

        mptcp_destroy_common(mptcp_sk(sk));
        inet_sock_destruct(sk);
}

static void mptcp_force_close(struct sock *sk)
{
        inet_sk_state_store(sk, TCP_CLOSE);
        sk_common_release(sk);
}

static void subflow_ulp_fallback(struct sock *sk,
                                 struct mptcp_subflow_context *old_ctx)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        mptcp_subflow_tcp_fallback(sk, old_ctx);
        icsk->icsk_ulp_ops = NULL;
        rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
        tcp_sk(sk)->is_mptcp = 0;
}

static void subflow_drop_ctx(struct sock *ssk)
{
        struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);

        if (!ctx)
                return;

        subflow_ulp_fallback(ssk, ctx);
        if (ctx->conn)
                sock_put(ctx->conn);

        kfree_rcu(ctx, rcu);
}

void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
                                     struct mptcp_options_received *mp_opt)
{
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);

        subflow->remote_key = mp_opt->sndr_key;
        subflow->fully_established = 1;
        subflow->can_ack = 1;
        WRITE_ONCE(msk->fully_established, true);
}

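/* Child socket creation for passive opens, in short: decide whether the
 * child can still be MPTCP (fallback to TCP is allowed for MP_CAPABLE
 * but fatal for MP_JOIN), clone the msk for a new MP_CAPABLE connection
 * before the child exists, and for MP_JOIN hand the pre-validated msk
 * reference over to the new subflow context.
 */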
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
                                          struct sk_buff *skb,
                                          struct request_sock *req,
                                          struct dst_entry *dst,
                                          struct request_sock *req_unhash,
                                          bool *own_req)
{
        struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
        struct mptcp_subflow_request_sock *subflow_req;
        struct mptcp_options_received mp_opt;
        bool fallback, fallback_is_fatal;
        struct sock *new_msk = NULL;
        struct sock *child;

        pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

        /* After child creation we must look for 'mp_capable' even when options
         * are not parsed
         */
        mp_opt.mp_capable = 0;

        /* hopefully temporary handling for MP_JOIN+syncookie */
        subflow_req = mptcp_subflow_rsk(req);
        fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
        fallback = !tcp_rsk(req)->is_mptcp;
        if (fallback)
                goto create_child;

        /* if the sk is MP_CAPABLE, we try to fetch the client key */
        if (subflow_req->mp_capable) {
                if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
                        /* here we can receive and accept an in-window,
                         * out-of-order pkt, which will not carry the MP_CAPABLE
                         * opt even on mptcp enabled paths
                         */
                        goto create_msk;
                }

                mptcp_get_options(skb, &mp_opt);
                if (!mp_opt.mp_capable) {
                        fallback = true;
                        goto create_child;
                }

create_msk:
                new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
                if (!new_msk)
                        fallback = true;
        } else if (subflow_req->mp_join) {
                mptcp_get_options(skb, &mp_opt);
                if (!mp_opt.mp_join ||
                    !mptcp_can_accept_new_subflow(subflow_req->msk) ||
                    !subflow_hmac_valid(req, &mp_opt)) {
                        SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
                        fallback = true;
                }
        }

create_child:
        child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
                                                     req_unhash, own_req);

        if (child && *own_req) {
                struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

                tcp_rsk(req)->drop_req = false;

                /* we need to fallback on ctx allocation failure and on pre-reqs
                 * checking above. In the latter scenario we additionally need
                 * to reset the context to non MPTCP status.
                 */
                if (!ctx || fallback) {
                        if (fallback_is_fatal)
                                goto dispose_child;

                        subflow_drop_ctx(child);
                        goto out;
                }

                if (ctx->mp_capable) {
                        /* this can't race with mptcp_close(), as the msk is
                         * not yet exposed to user-space
                         */
                        inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);

                        /* new mpc subflow takes ownership of the newly
                         * created mptcp socket
                         */
                        new_msk->sk_destruct = mptcp_sock_destruct;
                        mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
                        mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
                        ctx->conn = new_msk;
                        new_msk = NULL;

                        /* with OoO packets we can reach here without ingress
                         * mpc option
                         */
                        if (mp_opt.mp_capable)
                                mptcp_subflow_fully_established(ctx, &mp_opt);
                } else if (ctx->mp_join) {
                        struct mptcp_sock *owner;

                        owner = subflow_req->msk;
                        if (!owner)
                                goto dispose_child;

                        /* move the msk reference ownership to the subflow */
                        subflow_req->msk = NULL;
                        ctx->conn = (struct sock *)owner;
                        if (!mptcp_finish_join(child))
                                goto dispose_child;

                        SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
                        tcp_rsk(req)->drop_req = true;
                }
        }

out:
        /* dispose of the left over mptcp master, if any */
        if (unlikely(new_msk))
                mptcp_force_close(new_msk);

        /* check for expected invariant - should never trigger, just help
         * catching earlier subtle bugs
         */
        WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
                     (!mptcp_subflow_ctx(child) ||
                      !mptcp_subflow_ctx(child)->conn));
        return child;

dispose_child:
        subflow_drop_ctx(child);
        tcp_rsk(req)->drop_req = true;
        inet_csk_prepare_for_destroy_sock(child);
        tcp_done(child);
        req->rsk_ops->send_reset(sk, skb);

        /* The last child reference will be released by the caller */
        return child;
}

static struct inet_connection_sock_af_ops subflow_specific;

enum mapping_status {
        MAPPING_OK,
        MAPPING_INVALID,
        MAPPING_EMPTY,
        MAPPING_DATA_FIN,
        MAPPING_DUMMY
};

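/* A DSS mapping may carry only the lower 32 bits of the data sequence
 * number; expand_seq() rebuilds the 64-bit value from the previous map.
 * Worked example (hypothetical values), with the old map ending right
 * after a 32-bit wrap:
 *   old_seq = 0x00000001fffffff0, old_data_len = 0x20, seq = 0x10
 *   old_seq + old_data_len + 1  = 0x0000000200000011
 *   upper 32 bits of that value = 0x0000000200000000
 *   expanded seq                = 0x0000000200000010
 */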
static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
        if ((u32)seq == (u32)old_seq)
                return old_seq;

        /* Assume map covers data not mapped yet. */
        return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}

static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
        WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
                  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        unsigned int skb_consumed;

        skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
        if (WARN_ON_ONCE(skb_consumed >= skb->len))
                return true;

        return skb->len - skb_consumed <= subflow->map_data_len -
                                          mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

        if (unlikely(before(ssn, subflow->map_subflow_seq))) {
                /* Mapping covers data later in the subflow stream,
                 * currently unsupported.
                 */
                warn_bad_map(subflow, ssn);
                return false;
        }
        if (unlikely(!before(ssn, subflow->map_subflow_seq +
                                  subflow->map_data_len))) {
                /* Mapping covers past subflow data, invalid */
                warn_bad_map(subflow, ssn + skb->len);
                return false;
        }
        return true;
}

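/* Core of MPTCP reassembly: every skb must be covered by a valid DSS
 * mapping (data_seq/subflow_seq/data_len). get_mapping_status() below
 * peeks at the receive queue head and either adopts a new mapping, keeps
 * the current one, flags a DATA_FIN, or declares the stream broken
 * (MAPPING_INVALID); a zero data_len would be an "infinite mapping",
 * which this implementation does not handle.
 */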
static enum mapping_status get_mapping_status(struct sock *ssk,
                                              struct mptcp_sock *msk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        struct mptcp_ext *mpext;
        struct sk_buff *skb;
        u16 data_len;
        u64 map_seq;

        skb = skb_peek(&ssk->sk_receive_queue);
        if (!skb)
                return MAPPING_EMPTY;

        if (mptcp_check_fallback(ssk))
                return MAPPING_DUMMY;

        mpext = mptcp_get_ext(skb);
        if (!mpext || !mpext->use_map) {
                if (!subflow->map_valid && !skb->len) {
                        /* the TCP stack delivers 0 len FIN pkts to the receive
                         * queue, those are the only 0len pkts ever expected here,
                         * and we can admit no mapping only for 0 len pkts
                         */
                        if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
                                WARN_ONCE(1, "0len seq %d:%d flags %x",
                                          TCP_SKB_CB(skb)->seq,
                                          TCP_SKB_CB(skb)->end_seq,
                                          TCP_SKB_CB(skb)->tcp_flags);
                        sk_eat_skb(ssk, skb);
                        return MAPPING_EMPTY;
                }

                if (!subflow->map_valid)
                        return MAPPING_INVALID;

                goto validate_seq;
        }

        pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
                 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
                 mpext->data_len, mpext->data_fin);

        data_len = mpext->data_len;
        if (data_len == 0) {
                pr_err("Infinite mapping not handled");
                MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
                return MAPPING_INVALID;
        }

        if (mpext->data_fin == 1) {
                if (data_len == 1) {
                        mptcp_update_rcv_data_fin(msk, mpext->data_seq);
                        pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
                        if (subflow->map_valid) {
                                /* A DATA_FIN might arrive in a DSS
                                 * option before the previous mapping
                                 * has been fully consumed. Continue
                                 * handling the existing mapping.
                                 */
                                skb_ext_del(skb, SKB_EXT_MPTCP);
                                return MAPPING_OK;
                        } else {
                                return MAPPING_DATA_FIN;
                        }
                } else {
                        mptcp_update_rcv_data_fin(msk, mpext->data_seq + data_len);
                        pr_debug("DATA_FIN with mapping seq=%llu", mpext->data_seq + data_len);
                }

                /* Adjust for DATA_FIN using 1 byte of sequence space */
                data_len--;
        }

        if (!mpext->dsn64) {
                map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
                                     mpext->data_seq);
                subflow->use_64bit_ack = 0;
                pr_debug("expanded seq=%llu", subflow->map_seq);
        } else {
                map_seq = mpext->data_seq;
                subflow->use_64bit_ack = 1;
        }

        if (subflow->map_valid) {
                /* Allow replacing only with an identical map */
                if (subflow->map_seq == map_seq &&
                    subflow->map_subflow_seq == mpext->subflow_seq &&
                    subflow->map_data_len == data_len) {
                        skb_ext_del(skb, SKB_EXT_MPTCP);
                        return MAPPING_OK;
                }

                /* If this skb data are fully covered by the current mapping,
                 * the new map would need caching, which is not supported
                 */
                if (skb_is_fully_mapped(ssk, skb)) {
                        MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
                        return MAPPING_INVALID;
                }

                /* will validate the next map after consuming the current one */
                return MAPPING_OK;
        }

        subflow->map_seq = map_seq;
        subflow->map_subflow_seq = mpext->subflow_seq;
        subflow->map_data_len = data_len;
        subflow->map_valid = 1;
        subflow->mpc_map = mpext->mpc_map;
        pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
                 subflow->map_seq, subflow->map_subflow_seq,
                 subflow->map_data_len);

validate_seq:
        /* we revalidate valid mapping on new skb, because we must ensure
         * the current skb is completely covered by the available mapping
         */
        if (!validate_mapping(ssk, skb))
                return MAPPING_INVALID;

        skb_ext_del(skb, SKB_EXT_MPTCP);
        return MAPPING_OK;
}

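/* Data below the current msk-level ack is a spurious retransmission at
 * the MPTCP level; the helper below advances copied_seq to skip up to
 * "limit" bytes (plus a FIN, if any) without handing them to the msk.
 */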
static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
                                       u64 limit)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
        u32 incr;

        incr = limit >= skb->len ? skb->len + fin : limit;

        pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
                 subflow->map_subflow_seq);
        MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
        tcp_sk(ssk)->copied_seq += incr;
        if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
                sk_eat_skb(ssk, skb);
        if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
                subflow->map_valid = 0;
        if (incr)
                tcp_cleanup_rbuf(ssk, incr);
}

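/* Map the subflow-level data at the head of the receive queue onto the
 * msk-level sequence space: loop until the queue head is in-sequence
 * (DATA_AVAIL), newer than expected (OOO_DATA), or discarded as a dup;
 * protocol errors reset the subflow.
 */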
static bool subflow_check_data_avail(struct sock *ssk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        enum mapping_status status;
        struct mptcp_sock *msk;
        struct sk_buff *skb;

        pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
                 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
        if (!skb_peek(&ssk->sk_receive_queue))
                subflow->data_avail = 0;
        if (subflow->data_avail)
                return true;

        msk = mptcp_sk(subflow->conn);
        for (;;) {
                u64 ack_seq;
                u64 old_ack;

                status = get_mapping_status(ssk, msk);
                pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
                if (status == MAPPING_INVALID) {
                        ssk->sk_err = EBADMSG;
                        goto fatal;
                }
                if (status == MAPPING_DUMMY) {
                        __mptcp_do_fallback(msk);
                        skb = skb_peek(&ssk->sk_receive_queue);
                        subflow->map_valid = 1;
                        subflow->map_seq = READ_ONCE(msk->ack_seq);
                        subflow->map_data_len = skb->len;
                        subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
                                                   subflow->ssn_offset;
                        subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
                        return true;
                }

                if (status != MAPPING_OK)
                        return false;

                skb = skb_peek(&ssk->sk_receive_queue);
                if (WARN_ON_ONCE(!skb))
                        return false;

                /* if msk lacks the remote key, this subflow must provide an
                 * MP_CAPABLE-based mapping
                 */
                if (unlikely(!READ_ONCE(msk->can_ack))) {
                        if (!subflow->mpc_map) {
                                ssk->sk_err = EBADMSG;
                                goto fatal;
                        }
                        WRITE_ONCE(msk->remote_key, subflow->remote_key);
                        WRITE_ONCE(msk->ack_seq, subflow->map_seq);
                        WRITE_ONCE(msk->can_ack, true);
                }

                old_ack = READ_ONCE(msk->ack_seq);
                ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
                pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
                         ack_seq);
                if (ack_seq == old_ack) {
                        subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
                        break;
                } else if (after64(ack_seq, old_ack)) {
                        subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
                        break;
                }

                /* only accept in-sequence mapping. Old values are spurious
                 * retransmissions
                 */
                mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
        }
        return true;

fatal:
        /* fatal protocol error, close the socket */
        /* This barrier is coupled with smp_rmb() in tcp_poll() */
        smp_wmb();
        ssk->sk_error_report(ssk);
        tcp_set_state(ssk, TCP_CLOSE);
        tcp_send_active_reset(ssk, GFP_ATOMIC);
        subflow->data_avail = 0;
        return false;
}

bool mptcp_subflow_data_available(struct sock *sk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

        /* check if current mapping is still valid */
        if (subflow->map_valid &&
            mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
                subflow->map_valid = 0;
                subflow->data_avail = 0;

                pr_debug("Done with mapping: seq=%u data_len=%u",
                         subflow->map_subflow_seq,
                         subflow->map_data_len);
        }

        return subflow_check_data_avail(sk);
}

/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
 * not the ssk one.
 *
 * In mptcp, rwin is about the mptcp-level connection data.
 *
 * Data that is still on the ssk rx queue can thus be ignored,
 * as far as the mptcp peer is concerned that data is still inflight.
 * DSS ACK is updated when the skb is moved to the mptcp rx queue.
 */
void mptcp_space(const struct sock *ssk, int *space, int *full_space)
{
        const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        const struct sock *sk = subflow->conn;

        *space = tcp_space(sk);
        *full_space = tcp_full_space(sk);
}

static void subflow_data_ready(struct sock *sk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        u16 state = 1 << inet_sk_state_load(sk);
        struct sock *parent = subflow->conn;
        struct mptcp_sock *msk;

        msk = mptcp_sk(parent);
        if (state & TCPF_LISTEN) {
                set_bit(MPTCP_DATA_READY, &msk->flags);
                parent->sk_data_ready(parent);
                return;
        }

        WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
                     !subflow->mp_join && !(state & TCPF_CLOSE));

        if (mptcp_subflow_data_available(sk))
                mptcp_data_ready(parent, sk);
}

static void subflow_write_space(struct sock *sk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct sock *parent = subflow->conn;

        if (!sk_stream_is_writeable(sk))
                return;

        if (sk_stream_is_writeable(parent)) {
                set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
                smp_mb__after_atomic();
                /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
                sk_stream_write_space(parent);
        }
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        if (sk->sk_family == AF_INET6)
                return &subflow_v6_specific;
#endif
        return &subflow_specific;
}

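/* For v4-mapped-v6 subflows the socket is AF_INET6 but the traffic is
 * IPv4: swap in an af_ops table that mixes the v6 socket handling with
 * the v4 transmit paths (see the subflow_v6m_specific setup in
 * mptcp_subflow_init()).
 */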
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct inet_connection_sock_af_ops *target;

        target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

        pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
                 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

        if (likely(icsk->icsk_af_ops == target))
                return;

        subflow->icsk_af_ops = icsk->icsk_af_ops;
        icsk->icsk_af_ops = target;
}
#endif

static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
                                struct sockaddr_storage *addr)
{
        memset(addr, 0, sizeof(*addr));
        addr->ss_family = info->family;
        if (addr->ss_family == AF_INET) {
                struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

                in_addr->sin_addr = info->addr;
                in_addr->sin_port = info->port;
        }
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        else if (addr->ss_family == AF_INET6) {
                struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

                in6_addr->sin6_addr = info->addr6;
                in6_addr->sin6_port = info->port;
        }
#endif
}

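/* Active creation of an additional (MP_JOIN) subflow, roughly: create a
 * kernel TCP socket with the "mptcp" ULP attached, seed a fresh local
 * nonce, resolve or reuse the local address id, copy keys and token from
 * the msk, bind to the chosen local address, and connect non-blocking to
 * the remote; the SYN will carry MP_JOIN because request_join is set.
 */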
int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
                            const struct mptcp_addr_info *remote)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct mptcp_subflow_context *subflow;
        struct sockaddr_storage addr;
        int remote_id = remote->id;
        int local_id = loc->id;
        struct socket *sf;
        struct sock *ssk;
        u32 remote_token;
        int addrlen;
        int err;

        if (!mptcp_is_fully_established(sk))
                return -ENOTCONN;

        err = mptcp_subflow_create_socket(sk, &sf);
        if (err)
                return err;

        ssk = sf->sk;
        subflow = mptcp_subflow_ctx(ssk);
        do {
                get_random_bytes(&subflow->local_nonce, sizeof(u32));
        } while (!subflow->local_nonce);

        if (!local_id) {
                err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk);
                if (err < 0)
                        goto failed;

                local_id = err;
        }

        subflow->remote_key = msk->remote_key;
        subflow->local_key = msk->local_key;
        subflow->token = msk->token;
        mptcp_info2sockaddr(loc, &addr);

        addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        if (loc->family == AF_INET6)
                addrlen = sizeof(struct sockaddr_in6);
#endif
        ssk->sk_bound_dev_if = loc->ifindex;
        err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
        if (err)
                goto failed;

        mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
        pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
                 remote_token, local_id, remote_id);
        subflow->remote_token = remote_token;
        subflow->local_id = local_id;
        subflow->remote_id = remote_id;
        subflow->request_join = 1;
        subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
        mptcp_info2sockaddr(remote, &addr);

        err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
        if (err && err != -EINPROGRESS)
                goto failed;

        spin_lock_bh(&msk->join_list_lock);
        list_add_tail(&subflow->node, &msk->join_list);
        spin_unlock_bh(&msk->join_list_lock);

        return err;

failed:
        sock_release(sf);
        return err;
}

int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
        struct mptcp_subflow_context *subflow;
        struct net *net = sock_net(sk);
        struct socket *sf;
        int err;

        /* un-accepted server sockets can reach here - on bad configuration
         * bail early to avoid greater trouble later
         */
        if (unlikely(!sk->sk_socket))
                return -EINVAL;

        err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
                               &sf);
        if (err)
                return err;

        lock_sock(sf->sk);

        /* kernel sockets do not by default acquire net ref, but TCP timer
         * needs it.
         */
        sf->sk->sk_net_refcnt = 1;
        get_net(net);
#ifdef CONFIG_PROC_FS
        this_cpu_add(*net->core.sock_inuse, 1);
#endif
        err = tcp_set_ulp(sf->sk, "mptcp");
        release_sock(sf->sk);

        if (err) {
                sock_release(sf);
                return err;
        }

        /* the newly created socket really belongs to the owning MPTCP master
         * socket, even if for additional subflows the allocation is performed
         * by a kernel workqueue. Adjust inode references, so that the
         * procfs/diag interfaces really show this one belonging to the correct
         * owner
         */
        SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
        SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
        SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

        subflow = mptcp_subflow_ctx(sf->sk);
        pr_debug("subflow=%p", subflow);

        *new_sock = sf;
        sock_hold(sk);
        subflow->conn = sk;

        return 0;
}

static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
                                                        gfp_t priority)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct mptcp_subflow_context *ctx;

        ctx = kzalloc(sizeof(*ctx), priority);
        if (!ctx)
                return NULL;

        rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
        INIT_LIST_HEAD(&ctx->node);

        pr_debug("subflow=%p", ctx);

        ctx->tcp_sock = sk;

        return ctx;
}

static void __subflow_state_change(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_all(&wq->wait);
        rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
        return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

static void subflow_state_change(struct sock *sk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct sock *parent = subflow->conn;

        __subflow_state_change(sk);

        if (subflow_simultaneous_connect(sk)) {
                mptcp_do_fallback(sk);
                mptcp_rcv_space_init(mptcp_sk(parent), sk);
                pr_fallback(mptcp_sk(parent));
                subflow->conn_finished = 1;
                if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
                        inet_sk_state_store(parent, TCP_ESTABLISHED);
                        parent->sk_state_change(parent);
                }
        }

        /* as recvmsg() does not acquire the subflow socket for ssk selection
         * a fin packet carrying a DSS can be unnoticed if we don't trigger
         * the data available machinery here.
         */
        if (mptcp_subflow_data_available(sk))
                mptcp_data_ready(parent, sk);

        if (__mptcp_check_fallback(mptcp_sk(parent)) &&
            !(parent->sk_shutdown & RCV_SHUTDOWN) &&
            !subflow->rx_eof && subflow_is_done(sk)) {
                subflow->rx_eof = 1;
                mptcp_subflow_eof(parent);
        }
}

static int subflow_ulp_init(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct mptcp_subflow_context *ctx;
        struct tcp_sock *tp = tcp_sk(sk);
        int err = 0;

        /* disallow attaching ULP to a socket unless it has been
         * created with sock_create_kern()
         */
        if (!sk->sk_kern_sock) {
                err = -EOPNOTSUPP;
                goto out;
        }

        ctx = subflow_create_ctx(sk, GFP_KERNEL);
        if (!ctx) {
                err = -ENOMEM;
                goto out;
        }

        pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

        tp->is_mptcp = 1;
        ctx->icsk_af_ops = icsk->icsk_af_ops;
        icsk->icsk_af_ops = subflow_default_af_ops(sk);
        ctx->tcp_data_ready = sk->sk_data_ready;
        ctx->tcp_state_change = sk->sk_state_change;
        ctx->tcp_write_space = sk->sk_write_space;
        sk->sk_data_ready = subflow_data_ready;
        sk->sk_write_space = subflow_write_space;
        sk->sk_state_change = subflow_state_change;
out:
        return err;
}

static void subflow_ulp_release(struct sock *sk)
{
        struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);

        if (!ctx)
                return;

        if (ctx->conn)
                sock_put(ctx->conn);

        kfree_rcu(ctx, rcu);
}

static void subflow_ulp_clone(const struct request_sock *req,
                              struct sock *newsk,
                              const gfp_t priority)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
        struct mptcp_subflow_context *new_ctx;

        if (!tcp_rsk(req)->is_mptcp ||
            (!subflow_req->mp_capable && !subflow_req->mp_join)) {
                subflow_ulp_fallback(newsk, old_ctx);
                return;
        }

        new_ctx = subflow_create_ctx(newsk, priority);
        if (!new_ctx) {
                subflow_ulp_fallback(newsk, old_ctx);
                return;
        }

        new_ctx->conn_finished = 1;
        new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
        new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
        new_ctx->tcp_state_change = old_ctx->tcp_state_change;
        new_ctx->tcp_write_space = old_ctx->tcp_write_space;
        new_ctx->rel_write_seq = 1;
        new_ctx->tcp_sock = newsk;

        if (subflow_req->mp_capable) {
                /* see comments in subflow_syn_recv_sock(), MPTCP connection
                 * is fully established only after we receive the remote key
                 */
                new_ctx->mp_capable = 1;
                new_ctx->local_key = subflow_req->local_key;
                new_ctx->token = subflow_req->token;
                new_ctx->ssn_offset = subflow_req->ssn_offset;
                new_ctx->idsn = subflow_req->idsn;
        } else if (subflow_req->mp_join) {
                new_ctx->ssn_offset = subflow_req->ssn_offset;
                new_ctx->mp_join = 1;
                new_ctx->fully_established = 1;
                new_ctx->backup = subflow_req->backup;
                new_ctx->local_id = subflow_req->local_id;
                new_ctx->remote_id = subflow_req->remote_id;
                new_ctx->token = subflow_req->token;
                new_ctx->thmac = subflow_req->thmac;
        }
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
        .name           = "mptcp",
        .owner          = THIS_MODULE,
        .init           = subflow_ulp_init,
        .release        = subflow_ulp_release,
        .clone          = subflow_ulp_clone,
};

static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
        subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
        subflow_ops->slab_name = "request_sock_subflow";

        subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
                                              subflow_ops->obj_size, 0,
                                              SLAB_ACCOUNT |
                                              SLAB_TYPESAFE_BY_RCU,
                                              NULL);
        if (!subflow_ops->slab)
                return -ENOMEM;

        subflow_ops->destructor = subflow_req_destructor;

        return 0;
}

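/* Boot-time wiring: start from the plain TCP request/af ops tables,
 * override only the hooks MPTCP needs (conn_request, syn_recv_sock,
 * sk_rx_dst_set, init_req), and register the "mptcp" ULP so subflow
 * sockets can be created via tcp_set_ulp().
 */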
void __init mptcp_subflow_init(void)
{
        mptcp_subflow_request_sock_ops = tcp_request_sock_ops;
        if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0)
                panic("MPTCP: failed to init subflow request sock ops\n");

        subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
        subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;

        subflow_specific = ipv4_specific;
        subflow_specific.conn_request = subflow_v4_conn_request;
        subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
        subflow_specific.sk_rx_dst_set = subflow_finish_connect;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
        subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;

        subflow_v6_specific = ipv6_specific;
        subflow_v6_specific.conn_request = subflow_v6_conn_request;
        subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
        subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;

        subflow_v6m_specific = subflow_v6_specific;
        subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
        subflow_v6m_specific.send_check = ipv4_specific.send_check;
        subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
        subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
        subflow_v6m_specific.net_frag_header_len = 0;
#endif

        mptcp_diag_subflow_init(&subflow_ulp_ops);

        if (tcp_register_ulp(&subflow_ulp_ops) != 0)
                panic("MPTCP: failed to register subflows to ULP\n");
}