// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <crypto/sha.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"

static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}

static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->msk)
		sock_put((struct sock *)subflow_req->msk);

	mptcp_token_destroy_request(req);
	tcp_request_sock_ops.destructor(req);
}

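/* Note on the MP_JOIN HMAC below: the two peers' 64-bit keys seed the
 * HMAC-SHA256 computation (see mptcp_crypto_hmac_sha()), and the message is
 * the two 32-bit nonces packed big-endian into an 8-byte buffer. Callers
 * that need the truncated 64-bit form (thmac) keep only the first 8 bytes
 * of the digest via get_unaligned_be64(hmac).
 */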
static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}

/* validate received token and create truncated hmac and nonce for SYN-ACK */
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
						     const struct sk_buff *skb)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return NULL;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return NULL;
	}
	subflow_req->local_id = local_id;

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);
	return msk;
}

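/* Parse the options carried by an incoming SYN and stash the result in the
 * request socket. A SYN carrying both MP_CAPABLE and MP_JOIN is invalid and
 * bails out early below; a plain MP_CAPABLE SYN allocates a new token, while
 * an MP_JOIN SYN looks up the existing msk by token via
 * subflow_token_join_request().
 */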
static void subflow_init_req(struct request_sock *req,
			     const struct sock *sk_listener,
			     struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

	mptcp_get_options(skb, &mp_opt);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;
	subflow_req->msk = NULL;
	mptcp_token_init_request(req);

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return;
#endif

	if (mp_opt.mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (mp_opt.mp_join)
			return;
	} else if (mp_opt.mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
	}

	if (mp_opt.mp_capable && listener->request_mptcp) {
		int err;

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
	} else if (mp_opt.mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = mp_opt.backup;
		subflow_req->remote_id = mp_opt.join_id;
		subflow_req->token = mp_opt.token;
		subflow_req->remote_nonce = mp_opt.nonce;
		subflow_req->msk = subflow_token_join_request(req, skb);
		pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
			 subflow_req->remote_nonce, subflow_req->msk);
	}
}

static void subflow_v4_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static void subflow_v6_init_req(struct request_sock *req,
				const struct sock *sk_listener,
				struct sk_buff *skb)
{
	tcp_rsk(req)->is_mptcp = 1;

	tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);

	subflow_init_req(req, sk_listener, skb);
}
#endif

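/* Note: the key/nonce order below is the mirror image of
 * subflow_token_join_request() above. From our point of view the peer
 * computed its HMAC over (remote_key, local_key) and
 * (remote_nonce, local_nonce), so validation must swap the order used when
 * generating our own HMAC.
 */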
/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[SHA256_DIGEST_SIZE];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token,
		 (unsigned long long)thmac,
		 (unsigned long long)subflow->thmac);

	return thmac == subflow->thmac;
}

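/* Called when the SYN-ACK is received on an actively opened subflow: either
 * complete the MP_CAPABLE handshake, stash the MP_JOIN thmac/nonce for the
 * third ACK, or fall back to plain TCP when the peer echoed neither option.
 */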
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_options_received mp_opt;
	struct sock *parent = subflow->conn;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
		inet_sk_state_store(parent, TCP_ESTABLISHED);
		parent->sk_state_change(parent);
	}

	/* be sure no special action on any packet other than syn-ack */
	if (subflow->conn_finished)
		return;

	subflow->rel_write_seq = 1;
	subflow->conn_finished = 1;
	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);

	mptcp_get_options(skb, &mp_opt);
	if (subflow->request_mptcp && mp_opt.mp_capable) {
		subflow->mp_capable = 1;
		subflow->can_ack = 1;
		subflow->remote_key = mp_opt.sndr_key;
		pr_debug("subflow=%p, remote_key=%llu", subflow,
			 subflow->remote_key);
	} else if (subflow->request_join && mp_opt.mp_join) {
		subflow->mp_join = 1;
		subflow->thmac = mp_opt.thmac;
		subflow->remote_nonce = mp_opt.nonce;
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
			 subflow->thmac, subflow->remote_nonce);
	} else {
		if (subflow->request_mptcp)
			MPTCP_INC_STATS(sock_net(sk),
					MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
		mptcp_do_fallback(sk);
		pr_fallback(mptcp_sk(subflow->conn));
	}

	if (mptcp_check_fallback(sk)) {
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
		return;
	}

	if (subflow->mp_capable) {
		pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
			 subflow->remote_key);
		mptcp_finish_connect(sk);
	} else if (subflow->mp_join) {
		u8 hmac[SHA256_DIGEST_SIZE];

		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
			 subflow, subflow->thmac,
			 subflow->remote_nonce);
		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
			subflow->mp_join = 0;
			goto do_reset;
		}

		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      hmac);

		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);

		if (!mptcp_finish_join(sk))
			goto do_reset;

		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
	} else {
do_reset:
		tcp_send_active_reset(sk, GFP_ATOMIC);
		tcp_done(sk);
	}
}

static struct request_sock_ops subflow_request_sock_ops;
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;

static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	return tcp_conn_request(&subflow_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
#endif

/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct mptcp_options_received *mp_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;

	subflow_req = mptcp_subflow_rsk(req);
	msk = subflow_req->msk;
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

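	/* crypto_memneq() compares in constant time, avoiding a timing
	 * side channel on the HMAC check
	 */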
	return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}

static void mptcp_sock_destruct(struct sock *sk)
{
	/* if the new mptcp socket isn't accepted, it is freed
	 * from the tcp listener socket's request queue, linked
	 * from req->sk. The tcp socket is released.
	 * This calls the ULP release function which will
	 * also remove the mptcp socket, via
	 * sock_put(ctx->conn).
	 *
	 * Problem is that the mptcp socket will not be in
	 * SYN_RECV state and doesn't have the SOCK_DEAD flag.
	 * Both result in warnings from inet_sock_destruct.
	 */

	if (sk->sk_state == TCP_SYN_RECV) {
		sk->sk_state = TCP_CLOSE;
		WARN_ON_ONCE(sk->sk_socket);
		sock_orphan(sk);
	}

	mptcp_token_destroy(mptcp_sk(sk));
	inet_sock_destruct(sk);
}

static void mptcp_force_close(struct sock *sk)
{
	inet_sk_state_store(sk, TCP_CLOSE);
	sk_common_release(sk);
}

static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;
}

static void subflow_drop_ctx(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);

	if (!ctx)
		return;

	subflow_ulp_fallback(ssk, ctx);
	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}

void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
				     struct mptcp_options_received *mp_opt)
{
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

	subflow->remote_key = mp_opt->sndr_key;
	subflow->fully_established = 1;
	subflow->can_ack = 1;
	WRITE_ONCE(msk->fully_established, true);
}

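/* Build the child socket for a completed passive handshake. Three outcomes
 * are possible: a plain TCP fallback (fatal only for MP_JOIN requests), a
 * new MP_CAPABLE subflow that takes ownership of a freshly cloned msk, or an
 * MP_JOIN subflow attached to the existing msk looked up at SYN time.
 */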
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct mptcp_options_received mp_opt;
	bool fallback, fallback_is_fatal;
	struct sock *new_msk = NULL;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	/* After child creation we must look for 'mp_capable' even when options
	 * are not parsed
	 */
	mp_opt.mp_capable = 0;

	/* hopefully temporary handling for MP_JOIN+syncookie */
	subflow_req = mptcp_subflow_rsk(req);
	fallback_is_fatal = subflow_req->mp_join;
	fallback = !tcp_rsk(req)->is_mptcp;
	if (fallback)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	if (subflow_req->mp_capable) {
		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
			/* here we can receive and accept an in-window,
			 * out-of-order pkt, which will not carry the MP_CAPABLE
			 * opt even on mptcp enabled paths
			 */
			goto create_msk;
		}

		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_capable) {
			fallback = true;
			goto create_child;
		}

create_msk:
		new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
		if (!new_msk)
			fallback = true;
	} else if (subflow_req->mp_join) {
		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_join ||
		    !subflow_hmac_valid(req, &mp_opt)) {
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
			fallback = true;
		}
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		tcp_rsk(req)->drop_req = false;

		/* we need to fallback on ctx allocation failure and on pre-reqs
		 * checking above. In the latter scenario we additionally need
		 * to reset the context to non MPTCP status.
		 */
		if (!ctx || fallback) {
			if (fallback_is_fatal)
				goto dispose_child;

			subflow_drop_ctx(child);
			goto out;
		}

		if (ctx->mp_capable) {
			/* this can't race with mptcp_close(), as the msk is
			 * not yet exposed to user-space
			 */
			inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);

			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			new_msk->sk_destruct = mptcp_sock_destruct;
			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
			mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
			ctx->conn = new_msk;
			new_msk = NULL;

			/* with OoO packets we can reach here without ingress
			 * mpc option
			 */
			if (mp_opt.mp_capable)
				mptcp_subflow_fully_established(ctx, &mp_opt);
		} else if (ctx->mp_join) {
			struct mptcp_sock *owner;

			owner = subflow_req->msk;
			if (!owner)
				goto dispose_child;

			/* move the msk reference ownership to the subflow */
			subflow_req->msk = NULL;
			ctx->conn = (struct sock *)owner;
			if (!mptcp_finish_join(child))
				goto dispose_child;

			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
			tcp_rsk(req)->drop_req = true;
		}
	}

out:
	/* dispose of the left over mptcp master, if any */
	if (unlikely(new_msk))
		mptcp_force_close(new_msk);

	/* check for expected invariant - should never trigger, just help
	 * catching earlier subtle bugs
	 */
	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
		     (!mptcp_subflow_ctx(child) ||
		      !mptcp_subflow_ctx(child)->conn));
	return child;

dispose_child:
	subflow_drop_ctx(child);
	tcp_rsk(req)->drop_req = true;
	tcp_send_active_reset(child, GFP_ATOMIC);
	inet_csk_prepare_for_destroy_sock(child);
	tcp_done(child);

	/* The last child reference will be released by the caller */
	return child;
}

static struct inet_connection_sock_af_ops subflow_specific;

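/* Outcome of parsing the DSS mapping carried by the skb at the head of the
 * subflow receive queue: a usable mapping (OK), a protocol violation that
 * will reset the subflow (INVALID), no queued data (EMPTY), a DATA_FIN with
 * no payload (DATA_FIN), or a fallback socket where an implicit mapping must
 * be synthesized (DUMMY).
 */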
enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN,
	MAPPING_DUMMY
};

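/* Expand a 32-bit DSS sequence number to 64 bits, assuming the new mapping
 * follows the previous one. A hypothetical example: with old_seq
 * 0x1fffff000 and old_data_len 0x2000, a received 32-bit seq 0x1000 expands
 * to 0x200001000, picking up the carry into the upper 32-bit word.
 */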
static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
	if ((u32)seq == (u32)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}

static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		warn_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
			     subflow->map_data_len))) {
		/* Mapping covers only past subflow data, invalid */
		warn_bad_map(subflow, ssn + skb->len);
		return false;
	}
	return true;
}

static enum mapping_status get_mapping_status(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	if (mptcp_check_fallback(ssk))
		return MAPPING_DUMMY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0-len FIN pkts to the
			 * receive queue; those are the only 0-len pkts
			 * ever expected here, and we can admit no mapping
			 * only for 0-len pkts
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
		 mpext->data_len, mpext->data_fin);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_err("Infinite mapping not handled");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			pr_debug("DATA_FIN with no payload");
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			} else {
				return MAPPING_DATA_FIN;
			}
		}

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	if (!mpext->dsn64) {
		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
				     mpext->data_seq);
		subflow->use_64bit_ack = 0;
		pr_debug("expanded seq=%llu", subflow->map_seq);
	} else {
		map_seq = mpext->data_seq;
		subflow->use_64bit_ack = 1;
	}

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			return MAPPING_OK;
		}

		/* If this skb data are fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb)) {
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
			return MAPPING_INVALID;
		}

		/* will validate the next map after consuming the current one */
		return MAPPING_OK;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->mpc_map = mpext->mpc_map;
	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb))
		return MAPPING_INVALID;

	skb_ext_del(skb, SKB_EXT_MPTCP);
	return MAPPING_OK;
}

static int subflow_read_actor(read_descriptor_t *desc,
			      struct sk_buff *skb,
			      unsigned int offset, size_t len)
{
	size_t copy_len = min(desc->count, len);

	desc->count -= copy_len;

	pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
	return copy_len;
}

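/* Advance the subflow receive queue to in-sequence, MPTCP-level data: fetch
 * the mapping for the skb at the head of the queue, initialize the msk key
 * and ack_seq from an MP_CAPABLE mapping if still missing, and silently
 * drop (via tcp_read_sock() and subflow_read_actor()) data that the mapping
 * shows was already received at the MPTCP level.
 */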
static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u32 map_remaining;
		size_t delta;
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk);
		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
		if (status == MAPPING_INVALID) {
			ssk->sk_err = EBADMSG;
			goto fatal;
		}
		if (status == MAPPING_DUMMY) {
			__mptcp_do_fallback(msk);
			skb = skb_peek(&ssk->sk_receive_queue);
			subflow->map_valid = 1;
			subflow->map_seq = READ_ONCE(msk->ack_seq);
			subflow->map_data_len = skb->len;
			subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
						   subflow->ssn_offset;
			return true;
		}

		if (status != MAPPING_OK)
			return false;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			return false;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map) {
				ssk->sk_err = EBADMSG;
				goto fatal;
			}
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (ack_seq == old_ack)
			break;

		/* only accept in-sequence mapping. Old values are spurious
		 * retransmissions; we can hit "future" values on active backup
		 * subflow switch, we rely on retransmissions to get
		 * in-sequence data.
		 * Concurrent subflows support will require subflow data
		 * reordering
		 */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);
		if (before64(ack_seq, old_ack))
			delta = min_t(size_t, old_ack - ack_seq, map_remaining);
		else
			delta = min_t(size_t, ack_seq - old_ack, map_remaining);

		/* discard mapped data */
		pr_debug("discarding %zu bytes, current map len=%d", delta,
			 map_remaining);
		if (delta) {
			read_descriptor_t desc = {
				.count = delta,
			};
			int ret;

			ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
			if (ret < 0) {
				ssk->sk_err = -ret;
				goto fatal;
			}
			if (ret < delta)
				return false;
			if (delta == map_remaining)
				subflow->map_valid = 0;
		}
	}
	return true;

fatal:
	/* fatal protocol error, close the socket */
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();
	ssk->sk_error_report(ssk);
	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	return false;
}

bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sk_buff *skb;

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		subflow->data_avail = 0;

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	if (!subflow_check_data_avail(sk)) {
		subflow->data_avail = 0;
		return false;
	}

	skb = skb_peek(&sk->sk_receive_queue);
	subflow->data_avail = skb &&
		       before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
	return subflow->data_avail;
}

/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
 * not the ssk one.
 *
 * In mptcp, rwin is about the mptcp-level connection data.
 *
 * Data that is still on the ssk rx queue can thus be ignored:
 * as far as the mptcp peer is concerned, that data is still inflight.
 * DSS ACK is updated when the skb is moved to the mptcp rx queue.
 */
void mptcp_space(const struct sock *ssk, int *space, int *full_space)
{
	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	const struct sock *sk = subflow->conn;

	*space = tcp_space(sk);
	*full_space = tcp_full_space(sk);
}

static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	u16 state = 1 << inet_sk_state_load(sk);
	struct sock *parent = subflow->conn;
	struct mptcp_sock *msk;

	msk = mptcp_sk(parent);
	if (state & TCPF_LISTEN) {
		set_bit(MPTCP_DATA_READY, &msk->flags);
		parent->sk_data_ready(parent);
		return;
	}

	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
		     !subflow->mp_join && !(state & TCPF_CLOSE));

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
}

static void subflow_write_space(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	sk_stream_write_space(sk);
	if (sk_stream_is_writeable(sk)) {
		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
		smp_mb__after_atomic();
		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
		sk_stream_write_space(parent);
	}
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif

static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
				struct sockaddr_storage *addr)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = info->family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		in_addr->sin_addr = info->addr;
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}

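/* Kick off an additional subflow towards the given remote address: create a
 * kernel TCP socket, inherit the msk keys and token, pick a fresh non-zero
 * local nonce, then bind and issue a non-blocking connect with request_join
 * set so that the SYN carries MP_JOIN. The handshake completes
 * asynchronously in subflow_finish_connect().
 */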
int __mptcp_subflow_connect(struct sock *sk, int ifindex,
			    const struct mptcp_addr_info *loc,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	struct sockaddr_storage addr;
	int local_id = loc->id;
	struct socket *sf;
	struct sock *ssk;
	u32 remote_token;
	int addrlen;
	int err;

	if (!mptcp_is_fully_established(sk))
		return -ENOTCONN;

	err = mptcp_subflow_create_socket(sk, &sf);
	if (err)
		return err;

	ssk = sf->sk;
	subflow = mptcp_subflow_ctx(ssk);
	do {
		get_random_bytes(&subflow->local_nonce, sizeof(u32));
	} while (!subflow->local_nonce);

	if (!local_id) {
		err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk);
		if (err < 0)
			goto failed;

		local_id = err;
	}

	subflow->remote_key = msk->remote_key;
	subflow->local_key = msk->local_key;
	subflow->token = msk->token;
	mptcp_info2sockaddr(loc, &addr);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (loc->family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	ssk->sk_bound_dev_if = ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err)
		goto failed;

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u local_id=%d", msk, remote_token,
		 local_id);
	subflow->remote_token = remote_token;
	subflow->local_id = local_id;
	subflow->request_join = 1;
	subflow->request_bkup = 1;
	mptcp_info2sockaddr(remote, &addr);

	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed;

	spin_lock_bh(&msk->join_list_lock);
	list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);

	return err;

failed:
	sock_release(sf);
	return err;
}

int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
			       &sf);
	if (err)
		return err;

	lock_sock(sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net(net);
#ifdef CONFIG_PROC_FS
	this_cpu_add(*net->core.sock_inuse, 1);
#endif
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err) {
		sock_release(sf);
		return err;
	}

	/* the newly created socket really belongs to the owning MPTCP master
	 * socket, even if for additional subflows the allocation is performed
	 * by a kernel workqueue. Adjust inode references, so that the
	 * procfs/diag interfaces really show this one belonging to the correct
	 * user.
	 */
	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;

	return 0;
}

static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}

static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	__subflow_state_change(sk);

	if (subflow_simultaneous_connect(sk)) {
		mptcp_do_fallback(sk);
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
		pr_fallback(mptcp_sk(parent));
		subflow->conn_finished = 1;
		if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
			inet_sk_state_store(parent, TCP_ESTABLISHED);
			parent->sk_state_change(parent);
		}
	}

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);

	if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
	    !subflow->rx_eof && subflow_is_done(sk)) {
		subflow->rx_eof = 1;
		mptcp_subflow_eof(parent);
	}
}

static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_data_ready = sk->sk_data_ready;
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_write_space = sk->sk_write_space;
	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
out:
	return err;
}

static void subflow_ulp_release(struct sock *sk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);

	if (!ctx)
		return;

	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}

static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		new_ctx->fully_established = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->local_id = subflow_req->local_id;
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;
	}
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};

static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab_name = "request_sock_subflow";

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	subflow_ops->destructor = subflow_req_destructor;

	return 0;
}

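/* Wire MPTCP into the TCP stack: clone the TCP request sock and af_ops
 * tables, override the entry points needed for the MPTCP handshake, and
 * register the "mptcp" ULP. The subflow_v6m_specific table reuses the IPv4
 * transmit paths, apparently for v4-mapped-v6 subflows handled by
 * mptcpv6_handle_mapped() above.
 */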
void __init mptcp_subflow_init(void)
{
	subflow_request_sock_ops = tcp_request_sock_ops;
	if (subflow_ops_init(&subflow_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}