// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"

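/* Allocate the MPTCP connection token for an outgoing MP_CAPABLE connection,
 * if not already done, before delegating to the address-family specific
 * rebuild_header().
 */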
static int subflow_rebuild_header(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	int err = 0;

	if (subflow->request_mptcp && !subflow->token) {
		pr_debug("subflow=%p", sk);
		err = mptcp_token_new_connect(sk);
	}

	if (err)
		return err;

	return subflow->icsk_af_ops->rebuild_header(sk);
}

static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->mp_capable)
		mptcp_token_destroy_request(subflow_req->token);
	tcp_request_sock_ops.destructor(req);
}

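/* Initialize the MPTCP state of a request sock: parse the options carried by
 * the incoming SYN and, when both peers are MP_CAPABLE, reserve a token and
 * record the initial subflow sequence offset.
 */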
static void subflow_init_req(struct request_sock *req,
			     const struct sock *sk_listener,
			     struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct tcp_options_received rx_opt;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

	memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp));
	mptcp_get_options(skb, &rx_opt);

	subflow_req->mp_capable = 0;
	subflow_req->remote_key_valid = 0;

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return;
#endif

	if (rx_opt.mptcp.mp_capable && listener->request_mptcp) {
		int err;

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
	}
}

static void subflow_v4_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static void subflow_v6_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}
#endif

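/* Installed as the sk_rx_dst_set hook, so it runs when the SYN-ACK of an
 * outgoing connection is processed: complete the MPTCP handshake towards the
 * parent socket and record the subflow sequence offset from the SYN-ACK.
 */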
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	if (subflow->conn && !subflow->conn_finished) {
		pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
			 subflow->remote_key);
		mptcp_finish_connect(sk);
		subflow->conn_finished = 1;

		if (skb) {
			pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
			subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
		}
	}
}

static struct request_sock_ops subflow_request_sock_ops;
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;

static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
#endif

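/* Build the child socket for a passive connection. For MP_CAPABLE requests,
 * try to fetch the peer key from the incoming packet and pre-allocate the
 * MPTCP master socket (msk) that the new mp_capable subflow will own.
 */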
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct tcp_options_received opt_rx;
	struct sock *new_msk = NULL;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	if (tcp_rsk(req)->is_mptcp == 0)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	subflow_req = mptcp_subflow_rsk(req);
	if (subflow_req->mp_capable) {
		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
			/* here we can receive and accept an in-window,
			 * out-of-order pkt, which will not carry the MP_CAPABLE
			 * opt even on mptcp enabled paths
			 */
			goto create_msk;
		}

		opt_rx.mptcp.mp_capable = 0;
		mptcp_get_options(skb, &opt_rx);
		if (opt_rx.mptcp.mp_capable) {
			subflow_req->remote_key = opt_rx.mptcp.sndr_key;
			subflow_req->remote_key_valid = 1;
		} else {
			subflow_req->mp_capable = 0;
			goto create_child;
		}

create_msk:
		new_msk = mptcp_sk_clone(listener->conn, req);
		if (!new_msk)
			subflow_req->mp_capable = 0;
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		/* we have null ctx on TCP fallback, not fatal on MPC
		 * handshake
		 */
		if (!ctx)
			goto out;

		if (ctx->mp_capable) {
			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			ctx->conn = new_msk;
			new_msk = NULL;
		}
	}

out:
	/* dispose of the leftover mptcp master, if any */
	if (unlikely(new_msk))
		sock_put(new_msk);
	return child;
}

static struct inet_connection_sock_af_ops subflow_specific;

enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN
};

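/* Expand the 32-bit data sequence number carried by the DSS option into a
 * full 64-bit DSN, assuming the new mapping covers data not mapped yet.
 */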
static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
	if ((u32)seq == (u32)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}

static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		warn_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
				  subflow->map_data_len))) {
		/* Mapping only covers past subflow data, invalid */
		warn_bad_map(subflow, ssn + skb->len);
		return false;
	}
	return true;
}

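/* Extract and validate the DSS mapping carried by the skb at the head of the
 * subflow receive queue, updating the per-subflow current mapping when a new,
 * valid one is announced.
 */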
static enum mapping_status get_mapping_status(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0-len FIN packets to the
			 * receive queue; those are the only 0-len packets
			 * ever expected here, and the only ones allowed to
			 * carry no mapping
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
		 mpext->data_len, mpext->data_fin);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_err("Infinite mapping not handled");
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			pr_debug("DATA_FIN with no payload");
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			} else {
				return MAPPING_DATA_FIN;
			}
		}

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	if (!mpext->dsn64) {
		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
				     mpext->data_seq);
		pr_debug("expanded seq=%llu", subflow->map_seq);
	} else {
		map_seq = mpext->data_seq;
	}

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			return MAPPING_OK;
		}

		/* If this skb's data is fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb))
			return MAPPING_INVALID;

		/* will validate the next map after consuming the current one */
		return MAPPING_OK;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->mpc_map = mpext->mpc_map;
	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb))
		return MAPPING_INVALID;

	skb_ext_del(skb, SKB_EXT_MPTCP);
	return MAPPING_OK;
}

static int subflow_read_actor(read_descriptor_t *desc,
			      struct sk_buff *skb,
			      unsigned int offset, size_t len)
{
	size_t copy_len = min(desc->count, len);

	desc->count -= copy_len;

	pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
	return copy_len;
}

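/* Check whether in-sequence MPTCP-level data is available on this subflow:
 * fetch the current mapping and discard data that does not start at the
 * msk-level ack_seq (spurious retransmissions, or "future" data that will be
 * delivered in sequence via retransmissions).
 */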
static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
	if (subflow->data_avail)
		return true;

	if (!subflow->conn)
		return false;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u32 map_remaining;
		size_t delta;
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk);
		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
		if (status == MAPPING_INVALID) {
			ssk->sk_err = EBADMSG;
			goto fatal;
		}

		if (status != MAPPING_OK)
			return false;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			return false;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map) {
				ssk->sk_err = EBADMSG;
				goto fatal;
			}
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (ack_seq == old_ack)
			break;

		/* only accept in-sequence mappings. Old values are spurious
		 * retransmissions; we can hit "future" values on active
		 * backup subflow switch, and we rely on retransmissions to
		 * get in-sequence data.
		 * Concurrent subflows support will require subflow data
		 * reordering
		 */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);
		if (before64(ack_seq, old_ack))
			delta = min_t(size_t, old_ack - ack_seq, map_remaining);
		else
			delta = min_t(size_t, ack_seq - old_ack, map_remaining);

		/* discard mapped data */
		pr_debug("discarding %zu bytes, current map len=%d", delta,
			 map_remaining);
		if (delta) {
			read_descriptor_t desc = {
				.count = delta,
			};
			int ret;

			ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
			if (ret < 0) {
				ssk->sk_err = -ret;
				goto fatal;
			}
			if (ret < delta)
				return false;
			if (delta == map_remaining)
				subflow->map_valid = 0;
		}
	}
	return true;

fatal:
	/* fatal protocol error, close the socket */
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();
	ssk->sk_error_report(ssk);
	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	return false;
}

bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sk_buff *skb;

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		subflow->data_avail = 0;

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	if (!subflow_check_data_avail(sk)) {
		subflow->data_avail = 0;
		return false;
	}

	skb = skb_peek(&sk->sk_receive_queue);
	subflow->data_avail = skb &&
		       before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
	return subflow->data_avail;
}

static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	if (!parent || !subflow->mp_capable) {
		subflow->tcp_data_ready(sk);

		if (parent)
			parent->sk_data_ready(parent);
		return;
	}

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
}

static void subflow_write_space(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	sk_stream_write_space(sk);
	if (parent && sk_stream_is_writeable(sk)) {
		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
		smp_mb__after_atomic();
		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
		sk_stream_write_space(parent);
	}
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif

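/* Create a kernel TCP socket to be used as a subflow, attach the "mptcp" ULP
 * to it and link it back to the parent MPTCP socket.
 */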
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
			       &sf);
	if (err)
		return err;

	lock_sock(sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net(net);
#ifdef CONFIG_PROC_FS
	this_cpu_add(*net->core.sock_inuse, 1);
#endif
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err)
		return err;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;

	return 0;
}

static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}

static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = READ_ONCE(subflow->conn);

	__subflow_state_change(sk);

	/* as recvmsg() does not acquire the subflow socket for ssk selection,
	 * a fin packet carrying a DSS can go unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (parent && subflow->mp_capable && mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);

	if (parent && !(parent->sk_shutdown & RCV_SHUTDOWN) &&
	    !subflow->rx_eof && subflow_is_done(sk)) {
		subflow->rx_eof = 1;
		parent->sk_shutdown |= RCV_SHUTDOWN;
		__subflow_state_change(parent);
	}
}

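/* ULP init hook: attach a subflow context to a kernel-created TCP socket,
 * override its af_ops and take over the data_ready/write_space/state_change
 * callbacks.
 */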
static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_data_ready = sk->sk_data_ready;
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_write_space = sk->sk_write_space;
	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
out:
	return err;
}

static void subflow_ulp_release(struct sock *sk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);

	if (!ctx)
		return;

	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}

static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;
}

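/* ULP clone hook for sockets created from a listener: fall back to plain TCP
 * when MPTCP was not negotiated, otherwise attach a fresh context inheriting
 * keys, token and sequence offsets from the request sock.
 */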
static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp || !subflow_req->mp_capable) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	/* see comments in subflow_syn_recv_sock(); the MPTCP connection is
	 * fully established only after we receive the remote key
	 */
	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	new_ctx->mp_capable = 1;
	new_ctx->fourth_ack = subflow_req->remote_key_valid;
	new_ctx->can_ack = subflow_req->remote_key_valid;
	new_ctx->remote_key = subflow_req->remote_key;
	new_ctx->local_key = subflow_req->local_key;
	new_ctx->token = subflow_req->token;
	new_ctx->ssn_offset = subflow_req->ssn_offset;
	new_ctx->idsn = subflow_req->idsn;
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};

static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab_name = "request_sock_subflow";

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	subflow_ops->destructor = subflow_req_destructor;

	return 0;
}

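/* Module init: clone the TCP request_sock and af-specific ops, override the
 * hooks needed to intercept the MPTCP handshake, and register the "mptcp"
 * ULP.
 */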
void mptcp_subflow_init(void)
{
	subflow_request_sock_ops = tcp_request_sock_ops;
	if (subflow_ops_init(&subflow_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_specific.rebuild_header = subflow_rebuild_header;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_v6_specific.rebuild_header = subflow_rebuild_header;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;
#endif

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}