// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <crypto/sha.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#endif
#include <net/mptcp.h>
#include <uapi/linux/mptcp.h>
#include "protocol.h"
#include "mib.h"

static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}

static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->msk)
		sock_put((struct sock *)subflow_req->msk);

	mptcp_token_destroy_request(req);
	tcp_request_sock_ops.destructor(req);
}

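/* HMAC the pair of 32-bit nonces with the local/remote key pair; MP_JOIN
 * uses the first 64 bits of this digest as the truncated HMAC (thmac)
 * carried in the SYN/ACK.
 */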
static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}

static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
{
	return mptcp_is_fully_established((void *)msk) &&
	       READ_ONCE(msk->pm.accept_subflow);
}

/* validate received token and create truncated hmac and nonce for SYN-ACK */
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
						     const struct sk_buff *skb)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return NULL;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return NULL;
	}
	subflow_req->local_id = local_id;

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);
	return msk;
}

static int __subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;
	subflow_req->msk = NULL;
	mptcp_token_init_request(req);

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return -EINVAL;
#endif

	return 0;
}

/* Init mptcp request socket.
 *
 * Returns an error code if a JOIN has failed and a TCP reset
 * should be sent.
 */
static int subflow_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	int ret;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

	ret = __subflow_init_req(req, sk_listener);
	if (ret)
		return 0;

	mptcp_get_options(skb, &mp_opt);

	if (mp_opt.mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (mp_opt.mp_join)
			return 0;
	} else if (mp_opt.mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
	}

	if (mp_opt.mp_capable && listener->request_mptcp) {
		int err, retries = 4;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
again:
		do {
			get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
		} while (subflow_req->local_key == 0);

		if (unlikely(req->syncookie)) {
			mptcp_crypto_key_sha(subflow_req->local_key,
					     &subflow_req->token,
					     &subflow_req->idsn);
			if (mptcp_token_exists(subflow_req->token)) {
				if (retries-- > 0)
					goto again;
			} else {
				subflow_req->mp_capable = 1;
			}
			return 0;
		}

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;
		else if (retries-- > 0)
			goto again;

	} else if (mp_opt.mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = mp_opt.backup;
		subflow_req->remote_id = mp_opt.join_id;
		subflow_req->token = mp_opt.token;
		subflow_req->remote_nonce = mp_opt.nonce;
		subflow_req->msk = subflow_token_join_request(req, skb);

		/* Can't fall back to TCP in this case. */
		if (!subflow_req->msk)
			return -EPERM;

		if (unlikely(req->syncookie)) {
			if (mptcp_can_accept_new_subflow(subflow_req->msk))
				subflow_init_req_cookie_join_save(subflow_req, skb);
		}

		pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
			 subflow_req->remote_nonce, subflow_req->msk);
	}

	return 0;
}

int mptcp_subflow_init_cookie_req(struct request_sock *req,
				  const struct sock *sk_listener,
				  struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	int err;

	err = __subflow_init_req(req, sk_listener);
	if (err)
		return err;

	mptcp_get_options(skb, &mp_opt);

	if (mp_opt.mp_capable && mp_opt.mp_join)
		return -EINVAL;

	if (mp_opt.mp_capable && listener->request_mptcp) {
		if (mp_opt.sndr_key == 0)
			return -EINVAL;

		subflow_req->local_key = mp_opt.rcvr_key;
		err = mptcp_token_new_request(req);
		if (err)
			return err;

		subflow_req->mp_capable = 1;
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	} else if (mp_opt.mp_join && listener->request_mptcp) {
		if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
			return -EINVAL;

		if (mptcp_can_accept_new_subflow(subflow_req->msk))
			subflow_req->mp_join = 1;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);

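/* route_req hook: mark the request as MPTCP before routing, and undo the
 * route (sending a reset, unless syncookies are in use) when MPTCP-level
 * initialization fails.
 */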
static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;

	dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
	if (!dst)
		return NULL;

	err = subflow_init_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp_request_sock_ops.send_reset(sk, skb);
	return NULL;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;

	dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
	if (!dst)
		return NULL;

	err = subflow_init_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp6_request_sock_ops.send_reset(sk, skb);
	return NULL;
}
#endif

/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[SHA256_DIGEST_SIZE];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token,
		 (unsigned long long)thmac,
		 (unsigned long long)subflow->thmac);

	return thmac == subflow->thmac;
}

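/* Abort the subflow with an active reset and defer the remaining cleanup
 * to the MPTCP worker, scheduled on the parent msk.
 */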
void mptcp_subflow_reset(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = subflow->conn;

	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	tcp_done(ssk);
	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
	    schedule_work(&mptcp_sk(sk)->work))
		sock_hold(sk);
}

static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_options_received mp_opt;
	struct sock *parent = subflow->conn;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
		inet_sk_state_store(parent, TCP_ESTABLISHED);
		parent->sk_state_change(parent);
	}

	/* be sure no special action on any packet other than syn-ack */
	if (subflow->conn_finished)
		return;

	subflow->rel_write_seq = 1;
	subflow->conn_finished = 1;
	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);

	mptcp_get_options(skb, &mp_opt);
	if (subflow->request_mptcp) {
		if (!mp_opt.mp_capable) {
			MPTCP_INC_STATS(sock_net(sk),
					MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
			mptcp_do_fallback(sk);
			pr_fallback(mptcp_sk(subflow->conn));
			goto fallback;
		}

		subflow->mp_capable = 1;
		subflow->can_ack = 1;
		subflow->remote_key = mp_opt.sndr_key;
		pr_debug("subflow=%p, remote_key=%llu", subflow,
			 subflow->remote_key);
		mptcp_finish_connect(sk);
	} else if (subflow->request_join) {
		u8 hmac[SHA256_DIGEST_SIZE];

		if (!mp_opt.mp_join)
			goto do_reset;

		subflow->thmac = mp_opt.thmac;
		subflow->remote_nonce = mp_opt.nonce;
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
			 subflow->thmac, subflow->remote_nonce);

		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
			goto do_reset;
		}

		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      hmac);
		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);

		if (!mptcp_finish_join(sk))
			goto do_reset;

		subflow->mp_join = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
	} else if (mptcp_check_fallback(sk)) {
fallback:
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
	}
	return;

do_reset:
	mptcp_subflow_reset(sk);
}

struct request_sock_ops mptcp_subflow_request_sock_ops;
EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops);
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;

static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}
#endif

/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct mptcp_options_received *mp_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;

	subflow_req = mptcp_subflow_rsk(req);
	msk = subflow_req->msk;
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

	return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}

static void mptcp_sock_destruct(struct sock *sk)
{
	/* if new mptcp socket isn't accepted, it is free'd
	 * from the tcp listener sockets request queue, linked
	 * from req->sk. The tcp socket is released.
	 * This calls the ULP release function which will
	 * also remove the mptcp socket, via
	 * sock_put(ctx->conn).
	 *
	 * Problem is that the mptcp socket will be in
	 * ESTABLISHED state and will not have the SOCK_DEAD flag.
	 * Both result in warnings from inet_sock_destruct.
	 */

	if (sk->sk_state == TCP_ESTABLISHED) {
		sk->sk_state = TCP_CLOSE;
		WARN_ON_ONCE(sk->sk_socket);
		sock_orphan(sk);
	}

	mptcp_destroy_common(mptcp_sk(sk));
	inet_sock_destruct(sk);
}

static void mptcp_force_close(struct sock *sk)
{
	inet_sk_state_store(sk, TCP_CLOSE);
	sk_common_release(sk);
}

static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;
}

static void subflow_drop_ctx(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);

	if (!ctx)
		return;

	subflow_ulp_fallback(ssk, ctx);
	if (ctx->conn)
		sock_put(ctx->conn);

	kfree_rcu(ctx, rcu);
}

void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
				     struct mptcp_options_received *mp_opt)
{
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

	subflow->remote_key = mp_opt->sndr_key;
	subflow->fully_established = 1;
	subflow->can_ack = 1;
	WRITE_ONCE(msk->fully_established, true);
}

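/* Passive open: create the child subflow, cloning a new msk for MP_CAPABLE
 * connections, validating the third-ACK HMAC for MP_JOIN ones, and falling
 * back to plain TCP where allowed.
 */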
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct mptcp_options_received mp_opt;
	bool fallback, fallback_is_fatal;
	struct sock *new_msk = NULL;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	/* After child creation we must look for 'mp_capable' even when options
	 * are not parsed
	 */
	mp_opt.mp_capable = 0;

	/* hopefully temporary handling for MP_JOIN+syncookie */
	subflow_req = mptcp_subflow_rsk(req);
	fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
	fallback = !tcp_rsk(req)->is_mptcp;
	if (fallback)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	if (subflow_req->mp_capable) {
		if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
			/* here we can receive and accept an in-window,
			 * out-of-order pkt, which will not carry the MP_CAPABLE
			 * opt even on mptcp enabled paths
			 */
			goto create_msk;
		}

		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_capable) {
			fallback = true;
			goto create_child;
		}

create_msk:
		new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
		if (!new_msk)
			fallback = true;
	} else if (subflow_req->mp_join) {
		mptcp_get_options(skb, &mp_opt);
		if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) ||
		    !mptcp_can_accept_new_subflow(subflow_req->msk)) {
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
			fallback = true;
		}
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		tcp_rsk(req)->drop_req = false;

		/* we need to fallback on ctx allocation failure and on pre-reqs
		 * checking above. In the latter scenario we additionally need
		 * to reset the context to non MPTCP status.
		 */
		if (!ctx || fallback) {
			if (fallback_is_fatal)
				goto dispose_child;

			subflow_drop_ctx(child);
			goto out;
		}

		if (ctx->mp_capable) {
			/* this can't race with mptcp_close(), as the msk is
			 * not yet exposed to user-space
			 */
			inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);

			/* record the newly created socket as the first msk
			 * subflow, but don't link it yet into conn_list
			 */
			WRITE_ONCE(mptcp_sk(new_msk)->first, child);

			/* new mpc subflow takes ownership of the newly
			 * created mptcp socket
			 */
			new_msk->sk_destruct = mptcp_sock_destruct;
			mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
			mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
			ctx->conn = new_msk;
			new_msk = NULL;

			/* with OoO packets we can reach here without ingress
			 * mpc option
			 */
			if (mp_opt.mp_capable)
				mptcp_subflow_fully_established(ctx, &mp_opt);
		} else if (ctx->mp_join) {
			struct mptcp_sock *owner;

			owner = subflow_req->msk;
			if (!owner)
				goto dispose_child;

			/* move the msk reference ownership to the subflow */
			subflow_req->msk = NULL;
			ctx->conn = (struct sock *)owner;
			if (!mptcp_finish_join(child))
				goto dispose_child;

			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
			tcp_rsk(req)->drop_req = true;
		}
	}

out:
	/* dispose of the left over mptcp master, if any */
	if (unlikely(new_msk))
		mptcp_force_close(new_msk);

	/* check for expected invariant - should never trigger, just help
	 * catching earlier subtle bugs
	 */
	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
		     (!mptcp_subflow_ctx(child) ||
		      !mptcp_subflow_ctx(child)->conn));
	return child;

dispose_child:
	subflow_drop_ctx(child);
	tcp_rsk(req)->drop_req = true;
	inet_csk_prepare_for_destroy_sock(child);
	tcp_done(child);
	req->rsk_ops->send_reset(sk, skb);

	/* The last child reference will be released by the caller */
	return child;
}

static struct inet_connection_sock_af_ops subflow_specific;

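/* Result of validating the DSS mapping for the skb at the head of the
 * subflow receive queue; MAPPING_DUMMY is used on fallback connections,
 * where no mapping is expected.
 */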
enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN,
	MAPPING_DUMMY
};

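/* Expand the 32-bit DSS sequence number carried by the option to a full
 * 64-bit one, assuming the new map follows the data mapped so far: e.g.
 * with old_seq=0x1fffffff00 and old_data_len=0x100 (map ending at
 * 0x2000000000), a 32-bit seq of 0 expands to 0x2000000000.
 */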
static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
{
	if ((u32)seq == (u32)old_seq)
		return old_seq;

	/* Assume map covers data not mapped yet. */
	return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
}

static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		  ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		warn_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
				  subflow->map_data_len))) {
		/* Mapping covers only past subflow data, invalid */
		warn_bad_map(subflow, ssn + skb->len);
		return false;
	}
	return true;
}

static enum mapping_status get_mapping_status(struct sock *ssk,
					      struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	if (mptcp_check_fallback(ssk))
		return MAPPING_DUMMY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0-len FIN pkts to the receive
			 * queue, those are the only 0-len pkts ever expected
			 * here, and we can admit no mapping only for 0-len pkts
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);
			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
		 mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
		 mpext->data_len, mpext->data_fin);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_err("Infinite mapping not handled");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
								 mpext->dsn64);
			pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			} else {
				if (updated && schedule_work(&msk->work))
					sock_hold((struct sock *)msk);

				return MAPPING_DATA_FIN;
			}
		} else {
			u64 data_fin_seq = mpext->data_seq + data_len - 1;

			/* If mpext->data_seq is a 32-bit value, data_fin_seq
			 * must also be limited to 32 bits.
			 */
			if (!mpext->dsn64)
				data_fin_seq &= GENMASK_ULL(31, 0);

			mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
			pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d",
				 data_fin_seq, mpext->dsn64);
		}

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	if (!mpext->dsn64) {
		map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
				     mpext->data_seq);
		pr_debug("expanded seq=%llu", subflow->map_seq);
	} else {
		map_seq = mpext->data_seq;
	}
	WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			return MAPPING_OK;
		}

		/* If this skb data are fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb)) {
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
			return MAPPING_INVALID;
		}

		/* will validate the next map after consuming the current one */
		return MAPPING_OK;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->mpc_map = mpext->mpc_map;
	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb))
		return MAPPING_INVALID;

	skb_ext_del(skb, SKB_EXT_MPTCP);
	return MAPPING_OK;
}

static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
				       u64 limit)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
	u32 incr;

	incr = limit >= skb->len ? skb->len + fin : limit;

	pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
		 subflow->map_subflow_seq);
	MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
	tcp_sk(ssk)->copied_seq += incr;
	if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
		sk_eat_skb(ssk, skb);
	if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
		subflow->map_valid = 0;
}

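/* Validate mappings until in-sequence data is found, discarding duplicate
 * data from spurious retransmissions; a fatal mapping error resets the
 * subflow.
 */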
static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
		 subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
	if (!skb_peek(&ssk->sk_receive_queue))
		subflow->data_avail = 0;
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk, msk);
		pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
		if (status == MAPPING_INVALID) {
			ssk->sk_err = EBADMSG;
			goto fatal;
		}
		if (status == MAPPING_DUMMY) {
			__mptcp_do_fallback(msk);
			skb = skb_peek(&ssk->sk_receive_queue);
			subflow->map_valid = 1;
			subflow->map_seq = READ_ONCE(msk->ack_seq);
			subflow->map_data_len = skb->len;
			subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq -
						   subflow->ssn_offset;
			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
			return true;
		}

		if (status != MAPPING_OK)
			return false;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			return false;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map) {
				ssk->sk_err = EBADMSG;
				goto fatal;
			}
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (ack_seq == old_ack) {
			subflow->data_avail = MPTCP_SUBFLOW_DATA_AVAIL;
			break;
		} else if (after64(ack_seq, old_ack)) {
			subflow->data_avail = MPTCP_SUBFLOW_OOO_DATA;
			break;
		}

		/* only accept in-sequence mapping. Old values are spurious
		 * retransmission
		 */
		mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
	}
	return true;

fatal:
	/* fatal protocol error, close the socket */
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();
	ssk->sk_error_report(ssk);
	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	subflow->data_avail = 0;
	return false;
}

bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		subflow->data_avail = 0;

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	return subflow_check_data_avail(sk);
}

/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
 * not the ssk one.
 *
 * In mptcp, rwin is about the mptcp-level connection data.
 *
 * Data that is still on the ssk rx queue can thus be ignored,
 * as far as the mptcp peer is concerned that data is still inflight.
 * DSS ACK is updated when skb is moved to the mptcp rx queue.
 */
void mptcp_space(const struct sock *ssk, int *space, int *full_space)
{
	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	const struct sock *sk = subflow->conn;

	*space = __mptcp_space(sk);
	*full_space = tcp_full_space(sk);
}

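/* Installed as the subflow's sk_data_ready callback: wake the accepting
 * msk for listener sockets, otherwise propagate the event to the
 * MPTCP-level receive path.
 */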
static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	u16 state = 1 << inet_sk_state_load(sk);
	struct sock *parent = subflow->conn;
	struct mptcp_sock *msk;

	msk = mptcp_sk(parent);
	if (state & TCPF_LISTEN) {
		set_bit(MPTCP_DATA_READY, &msk->flags);
		parent->sk_data_ready(parent);
		return;
	}

	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
		     !subflow->mp_join && !(state & TCPF_CLOSE));

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
}

static void subflow_write_space(struct sock *ssk)
{
	/* we take action in __mptcp_clean_una() */
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif

static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
				struct sockaddr_storage *addr)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = info->family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		in_addr->sin_addr = info->addr;
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}

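/* Create and connect an additional subflow for an established MPTCP
 * connection, using @loc as the local endpoint and @remote as the peer.
 */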
int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	struct sockaddr_storage addr;
	int remote_id = remote->id;
	int local_id = loc->id;
	struct socket *sf;
	struct sock *ssk;
	u32 remote_token;
	int addrlen;
	int err;

	if (!mptcp_is_fully_established(sk))
		return -ENOTCONN;

	err = mptcp_subflow_create_socket(sk, &sf);
	if (err)
		return err;

	ssk = sf->sk;
	subflow = mptcp_subflow_ctx(ssk);
	do {
		get_random_bytes(&subflow->local_nonce, sizeof(u32));
	} while (!subflow->local_nonce);

	if (!local_id) {
		err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk);
		if (err < 0)
			goto failed;

		local_id = err;
	}

	subflow->remote_key = msk->remote_key;
	subflow->local_key = msk->local_key;
	subflow->token = msk->token;
	mptcp_info2sockaddr(loc, &addr);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (loc->family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	ssk->sk_bound_dev_if = loc->ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err)
		goto failed;

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
		 remote_token, local_id, remote_id);
	subflow->remote_token = remote_token;
	subflow->local_id = local_id;
	subflow->remote_id = remote_id;
	subflow->request_join = 1;
	subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
	mptcp_info2sockaddr(remote, &addr);

	mptcp_add_pending_subflow(msk, subflow);
	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed_unlink;

	return err;

failed_unlink:
	spin_lock_bh(&msk->join_list_lock);
	list_del(&subflow->node);
	spin_unlock_bh(&msk->join_list_lock);

failed:
	subflow->disposable = 1;
	sock_release(sf);
	return err;
}

static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
	struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
				*child_skcd = &child->sk_cgrp_data;

	/* only the additional subflows created by kworkers have to be modified */
	if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
	    cgroup_id(sock_cgroup_ptr(child_skcd))) {
#ifdef CONFIG_MEMCG
		struct mem_cgroup *memcg = parent->sk_memcg;

		mem_cgroup_sk_free(child);
		if (memcg && css_tryget(&memcg->css))
			child->sk_memcg = memcg;
#endif /* CONFIG_MEMCG */

		cgroup_sk_free(child_skcd);
		*child_skcd = *parent_skcd;
		cgroup_sk_clone(child_skcd);
	}
#endif /* CONFIG_SOCK_CGROUP_DATA */
}

int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	/* un-accepted server sockets can reach here - on bad configuration
	 * bail early to avoid greater trouble later
	 */
	if (unlikely(!sk->sk_socket))
		return -EINVAL;

	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
			       &sf);
	if (err)
		return err;

	lock_sock(sf->sk);

	/* the newly created socket has to be in the same cgroup as its parent */
	mptcp_attach_cgroup(sk, sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net(net);
#ifdef CONFIG_PROC_FS
	this_cpu_add(*net->core.sock_inuse, 1);
#endif
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err) {
		sock_release(sf);
		return err;
	}

	/* the newly created socket really belongs to the owning MPTCP master
	 * socket, even if for additional subflows the allocation is performed
	 * by a kernel workqueue. Adjust inode references, so that the
	 * procfs/diag interfaces really show this one belonging to the correct
	 * user.
	 */
	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;

	return 0;
}

static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}

static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	__subflow_state_change(sk);

	if (subflow_simultaneous_connect(sk)) {
		mptcp_do_fallback(sk);
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
		pr_fallback(mptcp_sk(parent));
		subflow->conn_finished = 1;
		if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
			inet_sk_state_store(parent, TCP_ESTABLISHED);
			parent->sk_state_change(parent);
		}
	}

	/* as recvmsg() does not acquire the subflow socket for ssk selection
	 * a fin packet carrying a DSS can be unnoticed if we don't trigger
	 * the data available machinery here.
	 */
	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);

	if (__mptcp_check_fallback(mptcp_sk(parent)) &&
	    !subflow->rx_eof && subflow_is_done(sk)) {
		subflow->rx_eof = 1;
		mptcp_subflow_eof(parent);
	}
}

static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_data_ready = sk->sk_data_ready;
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_write_space = sk->sk_write_space;
	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
out:
	return err;
}

static void subflow_ulp_release(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
	bool release = true;
	struct sock *sk;

	if (!ctx)
		return;

	sk = ctx->conn;
	if (sk) {
		/* if the msk has been orphaned, keep the ctx
		 * alive, will be freed by __mptcp_close_ssk(),
		 * when the subflow is still unaccepted
		 */
		release = ctx->disposable || list_empty(&ctx->node);
		sock_put(sk);
	}

	if (release)
		kfree_rcu(ctx, rcu);
}

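/* Called when a new TCP socket is cloned from a listener request: copy the
 * MPTCP state from the request socket into the fresh subflow context, or
 * fall back to plain TCP when the request carried no MPTCP option.
 */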
static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_write_space = old_ctx->tcp_write_space;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		new_ctx->fully_established = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->local_id = subflow_req->local_id;
		new_ctx->remote_id = subflow_req->remote_id;
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;
	}
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};

static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
	subflow_ops->slab_name = "request_sock_subflow";

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	subflow_ops->destructor = subflow_req_destructor;

	return 0;
}

void __init mptcp_subflow_init(void)
{
	mptcp_subflow_request_sock_ops = tcp_request_sock_ops;
	if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}