]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame - net/mptcp/options.c
Merge tag 'mlx5-updates-2020-03-17' of git://git.kernel.org/pub/scm/linux/kernel...
[mirror_ubuntu-jammy-kernel.git] / net / mptcp / options.c
CommitLineData
eda7acdd
PK
1// SPDX-License-Identifier: GPL-2.0
2/* Multipath TCP
3 *
4 * Copyright (c) 2017 - 2019, Intel Corporation.
5 */
6
7#include <linux/kernel.h>
8#include <net/tcp.h>
9#include <net/mptcp.h>
10#include "protocol.h"
11
65492c5a
PA
12static bool mptcp_cap_flag_sha256(u8 flags)
13{
14 return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256;
15}
16
cc7972ea
CP
17void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
18 int opsize, struct tcp_options_received *opt_rx)
eda7acdd
PK
19{
20 struct mptcp_options_received *mp_opt = &opt_rx->mptcp;
21 u8 subtype = *ptr >> 4;
648ef4b8 22 int expected_opsize;
eda7acdd
PK
23 u8 version;
24 u8 flags;
25
26 switch (subtype) {
27 case MPTCPOPT_MP_CAPABLE:
cc7972ea
CP
28 /* strict size checking */
29 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
30 if (skb->len > tcp_hdr(skb)->doff << 2)
31 expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA;
32 else
33 expected_opsize = TCPOLEN_MPTCP_MPC_ACK;
34 } else {
35 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)
36 expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK;
37 else
38 expected_opsize = TCPOLEN_MPTCP_MPC_SYN;
39 }
40 if (opsize != expected_opsize)
eda7acdd
PK
41 break;
42
cc7972ea 43 /* try to be gentle vs future versions on the initial syn */
eda7acdd 44 version = *ptr++ & MPTCP_VERSION_MASK;
cc7972ea
CP
45 if (opsize != TCPOLEN_MPTCP_MPC_SYN) {
46 if (version != MPTCP_SUPPORTED_VERSION)
47 break;
48 } else if (version < MPTCP_SUPPORTED_VERSION) {
eda7acdd 49 break;
cc7972ea 50 }
eda7acdd
PK
51
52 flags = *ptr++;
65492c5a 53 if (!mptcp_cap_flag_sha256(flags) ||
eda7acdd
PK
54 (flags & MPTCP_CAP_EXTENSIBILITY))
55 break;
56
57 /* RFC 6824, Section 3.1:
58 * "For the Checksum Required bit (labeled "A"), if either
59 * host requires the use of checksums, checksums MUST be used.
60 * In other words, the only way for checksums not to be used
61 * is if both hosts in their SYNs set A=0."
62 *
63 * Section 3.3.0:
64 * "If a checksum is not present when its use has been
65 * negotiated, the receiver MUST close the subflow with a RST as
66 * it is considered broken."
67 *
68 * We don't implement DSS checksum - fall back to TCP.
69 */
70 if (flags & MPTCP_CAP_CHECKSUM_REQD)
71 break;
72
73 mp_opt->mp_capable = 1;
cc7972ea
CP
74 if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) {
75 mp_opt->sndr_key = get_unaligned_be64(ptr);
76 ptr += 8;
77 }
78 if (opsize >= TCPOLEN_MPTCP_MPC_ACK) {
eda7acdd
PK
79 mp_opt->rcvr_key = get_unaligned_be64(ptr);
80 ptr += 8;
eda7acdd 81 }
cc7972ea
CP
82 if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) {
83 /* Section 3.1.:
84 * "the data parameters in a MP_CAPABLE are semantically
85 * equivalent to those in a DSS option and can be used
86 * interchangeably."
87 */
88 mp_opt->dss = 1;
89 mp_opt->use_map = 1;
90 mp_opt->mpc_map = 1;
91 mp_opt->data_len = get_unaligned_be16(ptr);
92 ptr += 2;
93 }
94 pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d",
95 version, flags, opsize, mp_opt->sndr_key,
96 mp_opt->rcvr_key, mp_opt->data_len);
eda7acdd
PK
97 break;
98
99 case MPTCPOPT_DSS:
100 pr_debug("DSS");
648ef4b8
MM
101 ptr++;
102
cc7972ea
CP
103 /* we must clear 'mpc_map' be able to detect MP_CAPABLE
104 * map vs DSS map in mptcp_incoming_options(), and reconstruct
105 * map info accordingly
106 */
107 mp_opt->mpc_map = 0;
648ef4b8
MM
108 flags = (*ptr++) & MPTCP_DSS_FLAG_MASK;
109 mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0;
110 mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0;
111 mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0;
112 mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0;
113 mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK);
114
115 pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
116 mp_opt->data_fin, mp_opt->dsn64,
117 mp_opt->use_map, mp_opt->ack64,
118 mp_opt->use_ack);
119
120 expected_opsize = TCPOLEN_MPTCP_DSS_BASE;
121
122 if (mp_opt->use_ack) {
123 if (mp_opt->ack64)
124 expected_opsize += TCPOLEN_MPTCP_DSS_ACK64;
125 else
126 expected_opsize += TCPOLEN_MPTCP_DSS_ACK32;
127 }
128
129 if (mp_opt->use_map) {
130 if (mp_opt->dsn64)
131 expected_opsize += TCPOLEN_MPTCP_DSS_MAP64;
132 else
133 expected_opsize += TCPOLEN_MPTCP_DSS_MAP32;
134 }
135
136 /* RFC 6824, Section 3.3:
137 * If a checksum is present, but its use had
138 * not been negotiated in the MP_CAPABLE handshake,
139 * the checksum field MUST be ignored.
140 */
141 if (opsize != expected_opsize &&
142 opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM)
143 break;
144
eda7acdd 145 mp_opt->dss = 1;
648ef4b8
MM
146
147 if (mp_opt->use_ack) {
148 if (mp_opt->ack64) {
149 mp_opt->data_ack = get_unaligned_be64(ptr);
150 ptr += 8;
151 } else {
152 mp_opt->data_ack = get_unaligned_be32(ptr);
153 ptr += 4;
154 }
155
156 pr_debug("data_ack=%llu", mp_opt->data_ack);
157 }
158
159 if (mp_opt->use_map) {
160 if (mp_opt->dsn64) {
161 mp_opt->data_seq = get_unaligned_be64(ptr);
162 ptr += 8;
163 } else {
164 mp_opt->data_seq = get_unaligned_be32(ptr);
165 ptr += 4;
166 }
167
168 mp_opt->subflow_seq = get_unaligned_be32(ptr);
169 ptr += 4;
170
171 mp_opt->data_len = get_unaligned_be16(ptr);
172 ptr += 2;
173
174 pr_debug("data_seq=%llu subflow_seq=%u data_len=%u",
175 mp_opt->data_seq, mp_opt->subflow_seq,
176 mp_opt->data_len);
177 }
178
eda7acdd
PK
179 break;
180
181 default:
182 break;
183 }
184}
185
cec37a6e
PK
186void mptcp_get_options(const struct sk_buff *skb,
187 struct tcp_options_received *opt_rx)
188{
189 const unsigned char *ptr;
190 const struct tcphdr *th = tcp_hdr(skb);
191 int length = (th->doff * 4) - sizeof(struct tcphdr);
192
193 ptr = (const unsigned char *)(th + 1);
194
195 while (length > 0) {
196 int opcode = *ptr++;
197 int opsize;
198
199 switch (opcode) {
200 case TCPOPT_EOL:
201 return;
202 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
203 length--;
204 continue;
205 default:
206 opsize = *ptr++;
207 if (opsize < 2) /* "silly options" */
208 return;
209 if (opsize > length)
210 return; /* don't parse partial options */
211 if (opcode == TCPOPT_MPTCP)
cc7972ea 212 mptcp_parse_option(skb, ptr, opsize, opt_rx);
cec37a6e
PK
213 ptr += opsize - 2;
214 length -= opsize;
215 }
216 }
217}
218
cc7972ea
CP
219bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
220 unsigned int *size, struct mptcp_out_options *opts)
cec37a6e
PK
221{
222 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
223
cc7972ea
CP
224 /* we will use snd_isn to detect first pkt [re]transmission
225 * in mptcp_established_options_mp()
226 */
227 subflow->snd_isn = TCP_SKB_CB(skb)->end_seq;
cec37a6e
PK
228 if (subflow->request_mptcp) {
229 pr_debug("local_key=%llu", subflow->local_key);
230 opts->suboptions = OPTION_MPTCP_MPC_SYN;
231 opts->sndr_key = subflow->local_key;
232 *size = TCPOLEN_MPTCP_MPC_SYN;
233 return true;
234 }
235 return false;
236}
237
238void mptcp_rcv_synsent(struct sock *sk)
239{
240 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
241 struct tcp_sock *tp = tcp_sk(sk);
242
243 pr_debug("subflow=%p", subflow);
244 if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
245 subflow->mp_capable = 1;
d22f4988 246 subflow->can_ack = 1;
cec37a6e
PK
247 subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
248 } else {
249 tcp_sk(sk)->is_mptcp = 0;
250 }
251}
252
cc7972ea
CP
253static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
254 unsigned int *size,
6d0060f6
MM
255 unsigned int remaining,
256 struct mptcp_out_options *opts)
cec37a6e
PK
257{
258 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
cc7972ea
CP
259 struct mptcp_ext *mpext;
260 unsigned int data_len;
261
262 pr_debug("subflow=%p fourth_ack=%d seq=%x:%x remaining=%d", subflow,
263 subflow->fourth_ack, subflow->snd_isn,
264 skb ? TCP_SKB_CB(skb)->seq : 0, remaining);
265
266 if (subflow->mp_capable && !subflow->fourth_ack && skb &&
267 subflow->snd_isn == TCP_SKB_CB(skb)->seq) {
268 /* When skb is not available, we better over-estimate the
269 * emitted options len. A full DSS option is longer than
270 * TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit
271 * that.
272 */
273 mpext = mptcp_get_ext(skb);
274 data_len = mpext ? mpext->data_len : 0;
cec37a6e 275
cc7972ea
CP
276 /* we will check ext_copy.data_len in mptcp_write_options() to
277 * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
278 * TCPOLEN_MPTCP_MPC_ACK
279 */
280 opts->ext_copy.data_len = data_len;
cec37a6e
PK
281 opts->suboptions = OPTION_MPTCP_MPC_ACK;
282 opts->sndr_key = subflow->local_key;
283 opts->rcvr_key = subflow->remote_key;
cc7972ea
CP
284
285 /* Section 3.1.
286 * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK
287 * packets that start the first subflow of an MPTCP connection,
288 * as well as the first packet that carries data
289 */
290 if (data_len > 0)
291 *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4);
292 else
293 *size = TCPOLEN_MPTCP_MPC_ACK;
294
295 pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
296 subflow, subflow->local_key, subflow->remote_key,
297 data_len);
298
cec37a6e
PK
299 return true;
300 }
301 return false;
302}
303
6d0060f6
MM
304static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
305 struct mptcp_ext *ext)
306{
6d0060f6
MM
307 if (!ext->use_map) {
308 /* RFC6824 requires a DSS mapping with specific values
309 * if DATA_FIN is set but no data payload is mapped
310 */
6d37a0b8 311 ext->data_fin = 1;
6d0060f6
MM
312 ext->use_map = 1;
313 ext->dsn64 = 1;
76c42a29 314 ext->data_seq = subflow->data_fin_tx_seq;
6d0060f6
MM
315 ext->subflow_seq = 0;
316 ext->data_len = 1;
6d37a0b8
MM
317 } else if (ext->data_seq + ext->data_len == subflow->data_fin_tx_seq) {
318 /* If there's an existing DSS mapping and it is the
319 * final mapping, DATA_FIN consumes 1 additional byte of
320 * mapping space.
6d0060f6 321 */
6d37a0b8 322 ext->data_fin = 1;
6d0060f6
MM
323 ext->data_len++;
324 }
325}
326
327static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
328 unsigned int *size,
329 unsigned int remaining,
330 struct mptcp_out_options *opts)
331{
332 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
333 unsigned int dss_size = 0;
334 struct mptcp_ext *mpext;
335 struct mptcp_sock *msk;
336 unsigned int ack_size;
d22f4988 337 bool ret = false;
2398e399 338 bool can_ack;
6d0060f6
MM
339 u8 tcp_fin;
340
341 if (skb) {
342 mpext = mptcp_get_ext(skb);
343 tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
344 } else {
345 mpext = NULL;
346 tcp_fin = 0;
347 }
348
349 if (!skb || (mpext && mpext->use_map) || tcp_fin) {
350 unsigned int map_size;
351
352 map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
353
354 remaining -= map_size;
355 dss_size = map_size;
356 if (mpext)
357 opts->ext_copy = *mpext;
358
76c42a29 359 if (skb && tcp_fin && subflow->data_fin_tx_enable)
6d0060f6 360 mptcp_write_data_fin(subflow, &opts->ext_copy);
d22f4988
CP
361 ret = true;
362 }
363
2398e399
PA
364 /* passive sockets msk will set the 'can_ack' after accept(), even
365 * if the first subflow may have the already the remote key handy
366 */
367 can_ack = true;
d22f4988
CP
368 opts->ext_copy.use_ack = 0;
369 msk = mptcp_sk(subflow->conn);
dc093db5 370 if (!READ_ONCE(msk->can_ack)) {
d22f4988
CP
371 *size = ALIGN(dss_size, 4);
372 return ret;
6d0060f6
MM
373 }
374
375 ack_size = TCPOLEN_MPTCP_DSS_ACK64;
376
377 /* Add kind/length/subtype/flag overhead if mapping is not populated */
378 if (dss_size == 0)
379 ack_size += TCPOLEN_MPTCP_DSS_BASE;
380
381 dss_size += ack_size;
382
dc093db5 383 opts->ext_copy.data_ack = msk->ack_seq;
6d0060f6
MM
384 opts->ext_copy.ack64 = 1;
385 opts->ext_copy.use_ack = 1;
386
387 *size = ALIGN(dss_size, 4);
388 return true;
389}
390
391bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
392 unsigned int *size, unsigned int remaining,
393 struct mptcp_out_options *opts)
394{
395 unsigned int opt_size = 0;
396 bool ret = false;
397
cc7972ea 398 if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
6d0060f6
MM
399 ret = true;
400 else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
401 opts))
402 ret = true;
403
404 /* we reserved enough space for the above options, and exceeding the
405 * TCP option space would be fatal
406 */
407 if (WARN_ON_ONCE(opt_size > remaining))
408 return false;
409
410 *size += opt_size;
411 remaining -= opt_size;
412
413 return ret;
414}
415
cec37a6e
PK
416bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
417 struct mptcp_out_options *opts)
418{
419 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
420
421 if (subflow_req->mp_capable) {
422 opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
423 opts->sndr_key = subflow_req->local_key;
424 *size = TCPOLEN_MPTCP_MPC_SYNACK;
425 pr_debug("subflow_req=%p, local_key=%llu",
426 subflow_req, subflow_req->local_key);
427 return true;
428 }
429 return false;
430}
431
d22f4988
CP
432static bool check_fourth_ack(struct mptcp_subflow_context *subflow,
433 struct sk_buff *skb,
434 struct mptcp_options_received *mp_opt)
435{
436 /* here we can process OoO, in-window pkts, only in-sequence 4th ack
437 * are relevant
438 */
439 if (likely(subflow->fourth_ack ||
440 TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1))
441 return true;
442
443 if (mp_opt->use_ack)
444 subflow->fourth_ack = 1;
445
446 if (subflow->can_ack)
447 return true;
448
449 /* If the first established packet does not contain MP_CAPABLE + data
450 * then fallback to TCP
451 */
452 if (!mp_opt->mp_capable) {
453 subflow->mp_capable = 0;
454 tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0;
455 return false;
456 }
457 subflow->remote_key = mp_opt->sndr_key;
458 subflow->can_ack = 1;
459 return true;
460}
461
648ef4b8
MM
462void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
463 struct tcp_options_received *opt_rx)
464{
d22f4988 465 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
648ef4b8
MM
466 struct mptcp_options_received *mp_opt;
467 struct mptcp_ext *mpext;
468
469 mp_opt = &opt_rx->mptcp;
d22f4988
CP
470 if (!check_fourth_ack(subflow, skb, mp_opt))
471 return;
648ef4b8
MM
472
473 if (!mp_opt->dss)
474 return;
475
476 mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
477 if (!mpext)
478 return;
479
480 memset(mpext, 0, sizeof(*mpext));
481
482 if (mp_opt->use_map) {
cc7972ea 483 if (mp_opt->mpc_map) {
cc7972ea
CP
484 /* this is an MP_CAPABLE carrying MPTCP data
485 * we know this map the first chunk of data
486 */
487 mptcp_crypto_key_sha(subflow->remote_key, NULL,
488 &mpext->data_seq);
489 mpext->data_seq++;
490 mpext->subflow_seq = 1;
491 mpext->dsn64 = 1;
492 mpext->mpc_map = 1;
493 } else {
494 mpext->data_seq = mp_opt->data_seq;
495 mpext->subflow_seq = mp_opt->subflow_seq;
496 mpext->dsn64 = mp_opt->dsn64;
497 }
648ef4b8
MM
498 mpext->data_len = mp_opt->data_len;
499 mpext->use_map = 1;
648ef4b8
MM
500 }
501
502 if (mp_opt->use_ack) {
503 mpext->data_ack = mp_opt->data_ack;
504 mpext->use_ack = 1;
505 mpext->ack64 = mp_opt->ack64;
506 }
507
508 mpext->data_fin = mp_opt->data_fin;
509}
510
eda7acdd
PK
511void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
512{
cc7972ea 513 if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
eda7acdd
PK
514 OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
515 u8 len;
516
517 if (OPTION_MPTCP_MPC_SYN & opts->suboptions)
518 len = TCPOLEN_MPTCP_MPC_SYN;
cec37a6e
PK
519 else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
520 len = TCPOLEN_MPTCP_MPC_SYNACK;
cc7972ea
CP
521 else if (opts->ext_copy.data_len)
522 len = TCPOLEN_MPTCP_MPC_ACK_DATA;
eda7acdd
PK
523 else
524 len = TCPOLEN_MPTCP_MPC_ACK;
525
526 *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) |
527 (MPTCPOPT_MP_CAPABLE << 12) |
528 (MPTCP_SUPPORTED_VERSION << 8) |
65492c5a 529 MPTCP_CAP_HMAC_SHA256);
cc7972ea
CP
530
531 if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
532 opts->suboptions))
533 goto mp_capable_done;
534
eda7acdd
PK
535 put_unaligned_be64(opts->sndr_key, ptr);
536 ptr += 2;
cc7972ea
CP
537 if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions))
538 goto mp_capable_done;
539
540 put_unaligned_be64(opts->rcvr_key, ptr);
541 ptr += 2;
542 if (!opts->ext_copy.data_len)
543 goto mp_capable_done;
544
545 put_unaligned_be32(opts->ext_copy.data_len << 16 |
546 TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
547 ptr += 1;
eda7acdd 548 }
6d0060f6 549
cc7972ea 550mp_capable_done:
6d0060f6
MM
551 if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
552 struct mptcp_ext *mpext = &opts->ext_copy;
553 u8 len = TCPOLEN_MPTCP_DSS_BASE;
554 u8 flags = 0;
555
556 if (mpext->use_ack) {
557 len += TCPOLEN_MPTCP_DSS_ACK64;
558 flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
559 }
560
561 if (mpext->use_map) {
562 len += TCPOLEN_MPTCP_DSS_MAP64;
563
564 /* Use only 64-bit mapping flags for now, add
565 * support for optional 32-bit mappings later.
566 */
567 flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
568 if (mpext->data_fin)
569 flags |= MPTCP_DSS_DATA_FIN;
570 }
571
572 *ptr++ = htonl((TCPOPT_MPTCP << 24) |
573 (len << 16) |
574 (MPTCPOPT_DSS << 12) |
575 (flags));
576
577 if (mpext->use_ack) {
578 put_unaligned_be64(mpext->data_ack, ptr);
579 ptr += 2;
580 }
581
582 if (mpext->use_map) {
583 put_unaligned_be64(mpext->data_seq, ptr);
584 ptr += 2;
585 put_unaligned_be32(mpext->subflow_seq, ptr);
586 ptr += 1;
587 put_unaligned_be32(mpext->data_len << 16 |
588 TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
589 }
590 }
eda7acdd 591}