// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <crypto/sha.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"
static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}
static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->msk)
		sock_put((struct sock *)subflow_req->msk);

	mptcp_token_destroy_request(req);
	tcp_request_sock_ops.destructor(req);
}
static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}
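
/* The 8-byte message hashed above is just the two nonces back to back,
 * big-endian:
 *
 *	msg[0..3] = be32(nonce1)
 *	msg[4..7] = be32(nonce2)
 *
 * matching the HMAC input RFC 8684 defines for the MP_JOIN handshake
 * (R-A || R-B, keyed with Key-A || Key-B).
 */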
static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
{
	return mptcp_is_fully_established((void *)msk) &&
	       READ_ONCE(msk->pm.accept_subflow);
}
/* validate received token and create truncated hmac and nonce for SYN-ACK */
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
						     const struct sk_buff *skb)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return NULL;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return NULL;
	}
	subflow_req->local_id = local_id;

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);
	return msk;
}
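
/* Note that only the 64 most significant bits of the HMAC are kept:
 * get_unaligned_be64(hmac) above takes the first 8 bytes of the SHA256
 * digest, i.e. the truncated HMAC that RFC 8684 carries in the MP_JOIN
 * SYN/ACK.
 */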
static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;
	subflow_req->msk = NULL;
	mptcp_token_init_request(req);

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return -EINVAL;
#endif

	return 0;
}
static void subflow_init_req(struct request_sock *req,
			     const struct sock *sk_listener,
			     struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	int ret;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

	ret = __subflow_init_req(req, sk_listener);
	if (ret)
		return;

	mptcp_get_options(skb, &mp_opt);

	if (mp_opt.mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (mp_opt.mp_join)
			return;
	} else if (mp_opt.mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
	}

	if (mp_opt.mp_capable && listener->request_mptcp) {
		int err, retries = 4;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
again:
		do {
			get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
		} while (subflow_req->local_key == 0);

		if (unlikely(req->syncookie)) {
			mptcp_crypto_key_sha(subflow_req->local_key,
					     &subflow_req->token,
					     &subflow_req->idsn);
			if (mptcp_token_exists(subflow_req->token)) {
				if (retries-- > 0)
					goto again;
			} else {
				subflow_req->mp_capable = 1;
			}
			return;
		}

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;
		else if (retries-- > 0)
			goto again;

	} else if (mp_opt.mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = mp_opt.backup;
		subflow_req->remote_id = mp_opt.join_id;
		subflow_req->token = mp_opt.token;
		subflow_req->remote_nonce = mp_opt.nonce;
		subflow_req->msk = subflow_token_join_request(req, skb);

		if (unlikely(req->syncookie) && subflow_req->msk) {
			if (mptcp_can_accept_new_subflow(subflow_req->msk))
				subflow_init_req_cookie_join_save(subflow_req, skb);
		}

		pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
			 subflow_req->remote_nonce, subflow_req->msk);
	}
}
int mptcp_subflow_init_cookie_req(struct request_sock *req,
				  const struct sock *sk_listener,
				  struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	int err;

	err = __subflow_init_req(req, sk_listener);
	if (err)
		return err;

	mptcp_get_options(skb, &mp_opt);

	if (mp_opt.mp_capable && mp_opt.mp_join)
		return -EINVAL;

	if (mp_opt.mp_capable && listener->request_mptcp) {
		if (mp_opt.sndr_key == 0)
			return -EINVAL;

		subflow_req->local_key = mp_opt.rcvr_key;
		err = mptcp_token_new_request(req);
		if (err)
			return err;

		subflow_req->mp_capable = 1;
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	} else if (mp_opt.mp_join && listener->request_mptcp) {
		if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
			return -EINVAL;

		if (mptcp_can_accept_new_subflow(subflow_req->msk))
			subflow_req->mp_join = 1;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);
static void subflow_v4_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static void subflow_v6_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}
#endif
/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[SHA256_DIGEST_SIZE];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token,
		 (unsigned long long)thmac,
		 (unsigned long long)subflow->thmac);

	return thmac == subflow->thmac;
}
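
/* Note the mirrored argument order above: the peer computed this HMAC
 * with the key and nonce roles swapped, so validation passes
 * remote_key/remote_nonce first, while the HMAC we emit in the third
 * ACK (see subflow_finish_connect()) uses local_key/local_nonce first.
 */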
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_options_received mp_opt;
	struct sock *parent = subflow->conn;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
		inet_sk_state_store(parent, TCP_ESTABLISHED);
		parent->sk_state_change(parent);
	}

	/* be sure no special action on any packet other than syn-ack */
	if (subflow->conn_finished)
		return;

	subflow->rel_write_seq = 1;
	subflow->conn_finished = 1;
	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);

	mptcp_get_options(skb, &mp_opt);
	if (subflow->request_mptcp) {
		if (!mp_opt.mp_capable) {
			MPTCP_INC_STATS(sock_net(sk),
					MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
			mptcp_do_fallback(sk);
			pr_fallback(mptcp_sk(subflow->conn));
			goto fallback;
		}

		subflow->mp_capable = 1;
		subflow->can_ack = 1;
		subflow->remote_key = mp_opt.sndr_key;
		pr_debug("subflow=%p, remote_key=%llu", subflow,
			 subflow->remote_key);
		mptcp_finish_connect(sk);
	} else if (subflow->request_join) {
		u8 hmac[SHA256_DIGEST_SIZE];

		if (!mp_opt.mp_join)
			goto do_reset;

		subflow->thmac = mp_opt.thmac;
		subflow->remote_nonce = mp_opt.nonce;
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
			 subflow->thmac, subflow->remote_nonce);

		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
			goto do_reset;
		}

		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      hmac);
		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);

		if (!mptcp_finish_join(sk))
			goto do_reset;

		subflow->mp_join = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
	} else if (mptcp_check_fallback(sk)) {
fallback:
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
	}
	return;

do_reset:
	tcp_send_active_reset(sk, GFP_ATOMIC);
	tcp_done(sk);
}
struct request_sock_ops mptcp_subflow_request_sock_ops;
EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops);
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;
static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
#endif
/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct mptcp_options_received *mp_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;

	subflow_req = mptcp_subflow_rsk(req);
	msk = subflow_req->msk;
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

	return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}
static void mptcp_sock_destruct(struct sock *sk)
{
	/* if new mptcp socket isn't accepted, it is free'd
	 * from the tcp listener sockets request queue, linked
	 * from req->sk.  The tcp socket is released.
	 * This calls the ULP release function which will
	 * also remove the mptcp socket, via
	 * sock_put(ctx->conn).
	 *
	 * Problem is that the mptcp socket will be in
	 * ESTABLISHED state and will not have the SOCK_DEAD flag.
	 * Both result in warnings from inet_sock_destruct.
	 */

	if (sk->sk_state == TCP_ESTABLISHED) {
		sk->sk_state = TCP_CLOSE;
		WARN_ON_ONCE(sk->sk_socket);
		sock_orphan(sk);
	}

	mptcp_token_destroy(mptcp_sk(sk));
	inet_sock_destruct(sk);
}
static void mptcp_force_close(struct sock *sk)
{
	inet_sk_state_store(sk, TCP_CLOSE);
	sk_common_release(sk);
}
static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;
}
static void subflow_drop_ctx(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);

	if (!ctx)
		return;

	subflow_ulp_fallback(ssk, ctx);
	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
				     struct mptcp_options_received *mp_opt)
{
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

	subflow->remote_key = mp_opt->sndr_key;
	subflow->fully_established = 1;
	subflow->can_ack = 1;
	WRITE_ONCE(msk->fully_established, true);
}
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct mptcp_options_received mp_opt;
	bool fallback, fallback_is_fatal;
	struct sock *new_msk = NULL;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	/* After child creation we must look for 'mp_capable' even when options
	 * are not parsed
	 */
	mp_opt.mp_capable = 0;

	/* hopefully temporary handling for MP_JOIN+syncookie */
	subflow_req = mptcp_subflow_rsk(req);
	fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
	fallback = !tcp_rsk(req)->is_mptcp;
	if (fallback)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	if (subflow_req->mp_capable) {
		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
			/* here we can receive and accept an in-window,
			 * out-of-order pkt, which will not carry the MP_CAPABLE
			 * opt even on mptcp enabled paths
			 */
			goto create_msk;
		}

		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_capable) {
			fallback = true;
			goto create_child;
		}

create_msk:
		new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
		if (!new_msk)
			fallback = true;
	} else if (subflow_req->mp_join) {
		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_join ||
		    !mptcp_can_accept_new_subflow(subflow_req->msk) ||
		    !subflow_hmac_valid(req, &mp_opt)) {
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
			fallback = true;
		}
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		tcp_rsk(req)->drop_req = false;

		/* we need to fallback on ctx allocation failure and on pre-reqs
		 * checking above. In the latter scenario we additionally need
		 * to reset the context to non MPTCP status.
		 */
		if (!ctx || fallback) {
			if (fallback_is_fatal)
				goto dispose_child;

			subflow_drop_ctx(child);
			goto out;
		}

		if (ctx->mp_capable) {
			/* this can't race with mptcp_close(), as the msk is
			 * not yet exposed to user-space
			 */
			inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);

			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			new_msk->sk_destruct = mptcp_sock_destruct;
			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
			mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
			ctx->conn = new_msk;
			new_msk = NULL;

			/* with OoO packets we can reach here without ingress
			 * mpc option
			 */
			if (mp_opt.mp_capable)
				mptcp_subflow_fully_established(ctx, &mp_opt);
		} else if (ctx->mp_join) {
			struct mptcp_sock *owner;

			owner = subflow_req->msk;
			if (!owner)
				goto dispose_child;

			/* move the msk reference ownership to the subflow */
			subflow_req->msk = NULL;
			ctx->conn = (struct sock *)owner;
			if (!mptcp_finish_join(child))
				goto dispose_child;

			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
			tcp_rsk(req)->drop_req = true;
		}
	}

out:
	/* dispose of the left over mptcp master, if any */
	if (unlikely(new_msk))
		mptcp_force_close(new_msk);

	/* check for expected invariant - should never trigger, just help
	 * catching earlier subtle bugs
	 */
	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
		     (!mptcp_subflow_ctx(child) ||
		      !mptcp_subflow_ctx(child)->conn));
	return child;

dispose_child:
	subflow_drop_ctx(child);
	tcp_rsk(req)->drop_req = true;
	inet_csk_prepare_for_destroy_sock(child);
	tcp_done(child);
	req->rsk_ops->send_reset(sk, skb);

	/* The last child reference will be released by the caller */
	return child;
}
static struct inet_connection_sock_af_ops subflow_specific;
enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN,
	MAPPING_DUMMY
};
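
/* How the statuses above are consumed by subflow_check_data_avail():
 * - MAPPING_OK: a valid mapping covers the skb at the queue head
 * - MAPPING_INVALID: protocol violation, the subflow will be reset
 * - MAPPING_EMPTY: nothing queued (or only a 0-len FIN) on this subflow
 * - MAPPING_DATA_FIN: DATA_FIN without payload, no mapping in progress
 * - MAPPING_DUMMY: the connection fell back to plain TCP, the caller
 *   synthesizes a fake mapping covering the queued data
 */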
static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
	if ((u32)seq == (u32)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}
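
/* Worked example, with made-up numbers: given a current map at
 * old_seq = 0x00000001fffff000 with old_data_len = 0x2000, a new 32-bit
 * DSN seq = 0x00000100 must sit after the 32-bit wrap:
 * old_seq + old_data_len + 1 = 0x0000000200001001, whose upper 32 bits
 * are 0x00000002, so the expanded sequence is 0x0000000200000100.
 */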
static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}
static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}
static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		warn_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
				  subflow->map_data_len))) {
		/* Mapping covers past subflow data, invalid */
		warn_bad_map(subflow, ssn + skb->len);
		return false;
	}
	return true;
}
static enum mapping_status get_mapping_status(struct sock *ssk,
					      struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	if (mptcp_check_fallback(ssk))
		return MAPPING_DUMMY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0 len FIN pkt to the receive
			 * queue, that is the only 0len pkts ever expected here,
			 * and we can admit no mapping only for 0 len pkts
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
		 mpext->data_len, mpext->data_fin);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_err("Infinite mapping not handled");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			mptcp_update_rcv_data_fin(msk, mpext->data_seq);
			pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			}

			return MAPPING_DATA_FIN;
		}

		mptcp_update_rcv_data_fin(msk, mpext->data_seq + data_len);
		pr_debug("DATA_FIN with mapping seq=%llu", mpext->data_seq + data_len);

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	if (!mpext->dsn64) {
		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
				     mpext->data_seq);
		subflow->use_64bit_ack = 0;
		pr_debug("expanded seq=%llu", subflow->map_seq);
	} else {
		map_seq = mpext->data_seq;
		subflow->use_64bit_ack = 1;
	}

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			return MAPPING_OK;
		}

		/* If this skb data are fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb)) {
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
			return MAPPING_INVALID;
		}

		/* will validate the next map after consuming the current one */
		return MAPPING_OK;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->mpc_map = mpext->mpc_map;
	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb))
		return MAPPING_INVALID;

	skb_ext_del(skb, SKB_EXT_MPTCP);
	return MAPPING_OK;
}
static int subflow_read_actor(read_descriptor_t *desc,
			      struct sk_buff *skb,
			      unsigned int offset, size_t len)
{
	size_t copy_len = min(desc->count, len);

	desc->count -= copy_len;

	pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
	return copy_len;
}
static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u32 map_remaining;
		size_t delta;
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk, msk);
		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
		if (status == MAPPING_INVALID) {
			ssk->sk_err = EBADMSG;
			goto fatal;
		}
		if (status == MAPPING_DUMMY) {
			__mptcp_do_fallback(msk);
			skb = skb_peek(&ssk->sk_receive_queue);
			subflow->map_valid = 1;
			subflow->map_seq = READ_ONCE(msk->ack_seq);
			subflow->map_data_len = skb->len;
			subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
						   subflow->ssn_offset;
			return true;
		}

		if (status != MAPPING_OK)
			return false;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			return false;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map) {
				ssk->sk_err = EBADMSG;
				goto fatal;
			}
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (ack_seq == old_ack)
			break;

		/* only accept in-sequence mapping. Old values are spurious
		 * retransmission; we can hit "future" values on active backup
		 * subflow switch, we rely on retransmissions to get
		 * in-sequence data.
		 * Concurrent subflows support will require subflow data
		 * buffering
		 */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);
		if (before64(ack_seq, old_ack))
			delta = min_t(size_t, old_ack - ack_seq, map_remaining);
		else
			delta = min_t(size_t, ack_seq - old_ack, map_remaining);

		/* discard mapped data */
		pr_debug("discarding %zu bytes, current map len=%d", delta,
			 map_remaining);
		if (delta) {
			read_descriptor_t desc = {
				.count = delta,
			};
			int ret;

			ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
			if (ret < 0) {
				ssk->sk_err = -ret;
				goto fatal;
			}
			if (ret < delta)
				return false;
			if (delta == map_remaining)
				subflow->map_valid = 0;
		}
	}
	return true;

fatal:
	/* fatal protocol error, close the socket */
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();
	ssk->sk_error_report(ssk);
	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	return false;
}
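
/* The discard arithmetic above, with made-up numbers: if the msk level
 * old_ack is 1000 while this subflow's mapped DSN ack_seq is 900 and
 * map_remaining is 300, then delta = min(1000 - 900, 300) = 100: those
 * 100 bytes were already acked at the MPTCP level (e.g. a spurious
 * retransmission) and are drained via tcp_read_sock() without ever
 * reaching the msk receive queue.
 */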
bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sk_buff *skb;

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		subflow->data_avail = 0;

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	if (!subflow_check_data_avail(sk)) {
		subflow->data_avail = 0;
		return false;
	}

	skb = skb_peek(&sk->sk_receive_queue);
	subflow->data_avail = skb &&
		       before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
	return subflow->data_avail;
}
/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
 * not the ssk one.
 *
 * In mptcp, rwin is about the mptcp-level connection data.
 *
 * Data that is still on the ssk rx queue can thus be ignored,
 * as far as the mptcp peer is concerned that data is still inflight.
 * DSS ACK is updated when skb is moved to the mptcp rx queue.
 */
void mptcp_space(const struct sock *ssk, int *space, int *full_space)
{
	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	const struct sock *sk = subflow->conn;

	*space = tcp_space(sk);
	*full_space = tcp_full_space(sk);
}
static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	u16 state = 1 << inet_sk_state_load(sk);
	struct sock *parent = subflow->conn;
	struct mptcp_sock *msk;

	msk = mptcp_sk(parent);
	if (state & TCPF_LISTEN) {
		set_bit(MPTCP_DATA_READY, &msk->flags);
		parent->sk_data_ready(parent);
		return;
	}

	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
		     !subflow->mp_join && !(state & TCPF_CLOSE));

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
}
static void subflow_write_space(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	sk_stream_write_space(sk);
	if (sk_stream_is_writeable(sk)) {
		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
		smp_mb__after_atomic();
		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
		sk_stream_write_space(parent);
	}
}
static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif
static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
				struct sockaddr_storage *addr)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = info->family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		in_addr->sin_addr = info->addr;
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}
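
/* Typical usage, as in __mptcp_subflow_connect() below: convert the
 * path manager's mptcp_addr_info into a sockaddr_storage that can be
 * passed straight to kernel_bind()/kernel_connect(), with addrlen
 * picked according to the address family.
 */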
int __mptcp_subflow_connect(struct sock *sk, int ifindex,
			    const struct mptcp_addr_info *loc,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	struct sockaddr_storage addr;
	int remote_id = remote->id;
	int local_id = loc->id;
	struct socket *sf;
	struct sock *ssk;
	u32 remote_token;
	int addrlen;
	int err;

	if (!mptcp_is_fully_established(sk))
		return -ENOTCONN;

	err = mptcp_subflow_create_socket(sk, &sf);
	if (err)
		return err;

	ssk = sf->sk;
	subflow = mptcp_subflow_ctx(ssk);
	do {
		get_random_bytes(&subflow->local_nonce, sizeof(u32));
	} while (!subflow->local_nonce);

	if (!local_id) {
		err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk);
		if (err < 0)
			goto failed;

		local_id = err;
	}

	subflow->remote_key = msk->remote_key;
	subflow->local_key = msk->local_key;
	subflow->token = msk->token;
	mptcp_info2sockaddr(loc, &addr);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (loc->family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	ssk->sk_bound_dev_if = ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err)
		goto failed;

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
		 remote_token, local_id, remote_id);
	subflow->remote_token = remote_token;
	subflow->local_id = local_id;
	subflow->remote_id = remote_id;
	subflow->request_join = 1;
	subflow->request_bkup = 1;
	mptcp_info2sockaddr(remote, &addr);

	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed;

	spin_lock_bh(&msk->join_list_lock);
	list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);

	return err;

failed:
	sock_release(sf);
	return err;
}
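
/* Caller sketch (simplified, loosely modeled on the in-kernel path
 * manager; names and setup here are illustrative only):
 *
 *	struct mptcp_addr_info local = { .family = AF_INET };
 *	struct mptcp_addr_info remote;
 *
 *	remote = ...;	// as announced by the peer's ADD_ADDR
 *	__mptcp_subflow_connect((struct sock *)msk, 0, &local, &remote);
 *
 * On success the new subflow sits on msk->join_list and completes the
 * MP_JOIN handshake asynchronously via subflow_finish_connect().
 */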
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	/* un-accepted server sockets can reach here - on bad configuration
	 * bail early to avoid greater trouble later
	 */
	if (unlikely(!sk->sk_socket))
		return -EINVAL;

	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
			       &sf);
	if (err)
		return err;

	lock_sock(sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net(net);
#ifdef CONFIG_PROC_FS
	this_cpu_add(*net->core.sock_inuse, 1);
#endif
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err) {
		sock_release(sf);
		return err;
	}

	/* the newly created socket really belongs to the owning MPTCP master
	 * socket, even if for additional subflows the allocation is performed
	 * by a kernel workqueue. Adjust inode references, so that the
	 * procfs/diag interfaces really show this one belonging to the correct
	 * owner
	 */
	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;

	return 0;
}
static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}
static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}
static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}
static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	__subflow_state_change(sk);

	if (subflow_simultaneous_connect(sk)) {
		mptcp_do_fallback(sk);
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
		pr_fallback(mptcp_sk(parent));
		subflow->conn_finished = 1;
		if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
			inet_sk_state_store(parent, TCP_ESTABLISHED);
			parent->sk_state_change(parent);
		}
	}

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);

	if (__mptcp_check_fallback(mptcp_sk(parent)) &&
	    !(parent->sk_shutdown & RCV_SHUTDOWN) &&
	    !subflow->rx_eof && subflow_is_done(sk)) {
		subflow->rx_eof = 1;
		mptcp_subflow_eof(parent);
	}
}
static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_data_ready = sk->sk_data_ready;
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_write_space = sk->sk_write_space;
	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
out:
	return err;
}
static void subflow_ulp_release(struct sock *sk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);

	if (!ctx)
		return;

	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}
static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		new_ctx->fully_established = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->local_id = subflow_req->local_id;
		new_ctx->remote_id = subflow_req->remote_id;
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;
	}
}
static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};
static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab_name = "request_sock_subflow";

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	subflow_ops->destructor = subflow_req_destructor;

	return 0;
}
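
/* SLAB_TYPESAFE_BY_RCU matches the flags TCP uses for its own request
 * sock cache: request socks can be looked up under RCU, so freed
 * objects may be recycled immediately but the memory must stay
 * type-stable for the duration of an RCU grace period.
 */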
void __init mptcp_subflow_init(void)
{
	mptcp_subflow_request_sock_ops = tcp_request_sock_ops;
	if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}