net/ipv4/tcp_ipv4.c
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19/*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
60236fdd 32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
caa20d9a 35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
afd46503 48#define pr_fmt(fmt) "TCP: " fmt
1da177e4 49
eb4dea58 50#include <linux/bottom_half.h>
51#include <linux/types.h>
52#include <linux/fcntl.h>
53#include <linux/module.h>
54#include <linux/random.h>
55#include <linux/cache.h>
56#include <linux/jhash.h>
57#include <linux/init.h>
58#include <linux/times.h>
5a0e3ad6 59#include <linux/slab.h>
1da177e4 60
457c4cbc 61#include <net/net_namespace.h>
1da177e4 62#include <net/icmp.h>
304a1618 63#include <net/inet_hashtables.h>
1da177e4 64#include <net/tcp.h>
20380731 65#include <net/transp_v6.h>
66#include <net/ipv6.h>
67#include <net/inet_common.h>
6d6ee43e 68#include <net/timewait_sock.h>
1da177e4 69#include <net/xfrm.h>
6e5714ea 70#include <net/secure_seq.h>
076bb0c8 71#include <net/busy_poll.h>
72
73#include <linux/inet.h>
74#include <linux/ipv6.h>
75#include <linux/stddef.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
6797318e 78#include <linux/inetdevice.h>
951cf368 79#include <linux/btf_ids.h>
1da177e4 80
cf80e0e4 81#include <crypto/hash.h>
82#include <linux/scatterlist.h>
83
84#include <trace/events/tcp.h>
85
cfb6eeb4 86#ifdef CONFIG_TCP_MD5SIG
a915da9b 87static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
318cf7aa 88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
89#endif
90
5caea4ea 91struct inet_hashinfo tcp_hashinfo;
4bc2f18b 92EXPORT_SYMBOL(tcp_hashinfo);
1da177e4 93
84b114b9 94static u32 tcp_v4_init_seq(const struct sk_buff *skb)
1da177e4 95{
96 return secure_tcp_seq(ip_hdr(skb)->daddr,
97 ip_hdr(skb)->saddr,
98 tcp_hdr(skb)->dest,
99 tcp_hdr(skb)->source);
100}
101
5d2ed052 102static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
84b114b9 103{
5d2ed052 104 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105}
106
107int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108{
79e9fed4 109 const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
112 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113
114 if (reuse == 2) {
115 /* Still does not detect *everything* that goes through
116 * lo, since we require a loopback src or dst address
117 * or direct binding to 'lo' interface.
118 */
119 bool loopback = false;
120 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 loopback = true;
122#if IS_ENABLED(CONFIG_IPV6)
123 if (tw->tw_family == AF_INET6) {
124 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
be2644aa 125 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
79e9fed4 126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
be2644aa 127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 loopback = true;
129 } else
130#endif
131 {
132 if (ipv4_is_loopback(tw->tw_daddr) ||
133 ipv4_is_loopback(tw->tw_rcv_saddr))
134 loopback = true;
135 }
136 if (!loopback)
137 reuse = 0;
138 }
139
140 /* With PAWS, it is safe from the viewpoint
141 of data integrity. Even without PAWS it is safe provided sequence
142 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143
144 Actually, the idea is close to VJ's one, only timestamp cache is
145 held not per host, but per port pair and TW bucket is used as state
146 holder.
147
148 If TW bucket has been already destroyed we fall back to VJ's scheme
149 and use initial timestamp retrieved from peer table.
150 */
151 if (tcptw->tw_ts_recent_stamp &&
152 (!twp || (reuse && time_after32(ktime_get_seconds(),
153 tcptw->tw_ts_recent_stamp)))) {
154 /* In case of repair and re-using TIME-WAIT sockets we still
155 * want to be sure that it is safe as above but honor the
156 * sequence numbers and time stamps set as part of the repair
157 * process.
158 *
159 * Without this check re-using a TIME-WAIT socket with TCP
160 * repair would accumulate a -1 on the repair assigned
161 * sequence number. The first time it is reused the sequence
162 * is -1, the second time -2, etc. This fixes that issue
163 * without appearing to create any others.
164 */
165 if (likely(!tp->repair)) {
166 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168 if (!seq)
169 seq = 1;
170 WRITE_ONCE(tp->write_seq, seq);
171 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
172 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 }
174 sock_hold(sktw);
175 return 1;
176 }
177
178 return 0;
179}
180EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181
182static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 int addr_len)
184{
185 /* This check is replicated from tcp_v4_connect() and intended to
186 * prevent BPF program called below from accessing bytes that are out
187 * of the bound specified by user in addr_len.
188 */
189 if (addr_len < sizeof(struct sockaddr_in))
190 return -EINVAL;
191
192 sock_owned_by_me(sk);
193
194 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195}
196
197/* This will initiate an outgoing connection. */
198int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199{
2d7192d6 200 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 struct inet_sock *inet = inet_sk(sk);
202 struct tcp_sock *tp = tcp_sk(sk);
dca8b089 203 __be16 orig_sport, orig_dport;
bada8adc 204 __be32 daddr, nexthop;
da905bd1 205 struct flowi4 *fl4;
2d7192d6 206 struct rtable *rt;
1da177e4 207 int err;
f6d8bd05 208 struct ip_options_rcu *inet_opt;
1946e672 209 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210
211 if (addr_len < sizeof(struct sockaddr_in))
212 return -EINVAL;
213
214 if (usin->sin_family != AF_INET)
215 return -EAFNOSUPPORT;
216
217 nexthop = daddr = usin->sin_addr.s_addr;
f6d8bd05 218 inet_opt = rcu_dereference_protected(inet->inet_opt,
1e1d04e6 219 lockdep_sock_is_held(sk));
f6d8bd05 220 if (inet_opt && inet_opt->opt.srr) {
221 if (!daddr)
222 return -EINVAL;
f6d8bd05 223 nexthop = inet_opt->opt.faddr;
224 }
225
226 orig_sport = inet->inet_sport;
227 orig_dport = usin->sin_port;
228 fl4 = &inet->cork.fl.u.ip4;
229 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 IPPROTO_TCP,
0e0d44ab 232 orig_sport, orig_dport, sk);
233 if (IS_ERR(rt)) {
234 err = PTR_ERR(rt);
235 if (err == -ENETUNREACH)
f1d8cba6 236 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
b23dd4fe 237 return err;
584bdf8c 238 }
239
240 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 ip_rt_put(rt);
242 return -ENETUNREACH;
243 }
244
f6d8bd05 245 if (!inet_opt || !inet_opt->opt.srr)
da905bd1 246 daddr = fl4->daddr;
1da177e4 247
c720c7e8 248 if (!inet->inet_saddr)
da905bd1 249 inet->inet_saddr = fl4->saddr;
d1e559d0 250 sk_rcv_saddr_set(sk, inet->inet_saddr);
1da177e4 251
c720c7e8 252 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 /* Reset inherited state */
254 tp->rx_opt.ts_recent = 0;
255 tp->rx_opt.ts_recent_stamp = 0;
ee995283 256 if (likely(!tp->repair))
0f317464 257 WRITE_ONCE(tp->write_seq, 0);
258 }
259
c720c7e8 260 inet->inet_dport = usin->sin_port;
d1e559d0 261 sk_daddr_set(sk, daddr);
1da177e4 262
d83d8461 263 inet_csk(sk)->icsk_ext_hdr_len = 0;
264 if (inet_opt)
265 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1da177e4 266
bee7ca9e 267 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268
269 /* Socket identity is still unknown (sport may be zero).
270 * However we set state to SYN-SENT and not releasing socket
271 * lock select source port, enter ourselves into the hash tables and
272 * complete initialization after this.
273 */
274 tcp_set_state(sk, TCP_SYN_SENT);
1946e672 275 err = inet_hash_connect(tcp_death_row, sk);
276 if (err)
277 goto failure;
278
877d1f62 279 sk_set_txhash(sk);
9e7ceb06 280
da905bd1 281 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 inet->inet_sport, inet->inet_dport, sk);
283 if (IS_ERR(rt)) {
284 err = PTR_ERR(rt);
285 rt = NULL;
1da177e4 286 goto failure;
b23dd4fe 287 }
1da177e4 288 /* OK, now commit destination to socket. */
bcd76111 289 sk->sk_gso_type = SKB_GSO_TCPV4;
d8d1f30b 290 sk_setup_caps(sk, &rt->dst);
19f6d3f3 291 rt = NULL;
1da177e4 292
00355fa5 293 if (likely(!tp->repair)) {
00355fa5 294 if (!tp->write_seq)
295 WRITE_ONCE(tp->write_seq,
296 secure_tcp_seq(inet->inet_saddr,
297 inet->inet_daddr,
298 inet->inet_sport,
299 usin->sin_port));
300 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 inet->inet_saddr,
84b114b9 302 inet->inet_daddr);
00355fa5 303 }
1da177e4 304
a904a069 305 inet->inet_id = prandom_u32();
1da177e4 306
307 if (tcp_fastopen_defer_connect(sk, &err))
308 return err;
309 if (err)
310 goto failure;
311
2b916477 312 err = tcp_connect(sk);
ee995283 313
314 if (err)
315 goto failure;
316
317 return 0;
318
319failure:
320 /*
321 * This unhashes the socket and releases the local port,
322 * if necessary.
323 */
324 tcp_set_state(sk, TCP_CLOSE);
325 ip_rt_put(rt);
326 sk->sk_route_caps = 0;
c720c7e8 327 inet->inet_dport = 0;
328 return err;
329}
4bc2f18b 330EXPORT_SYMBOL(tcp_v4_connect);
1da177e4 331
1da177e4 332/*
333 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334 * It can be called through tcp_release_cb() if socket was owned by user
335 * at the time tcp_v4_err() was called to handle ICMP message.
1da177e4 336 */
4fab9071 337void tcp_v4_mtu_reduced(struct sock *sk)
1da177e4 338{
1da177e4 339 struct inet_sock *inet = inet_sk(sk);
340 struct dst_entry *dst;
341 u32 mtu;
1da177e4 342
343 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 return;
561022ac 345 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346 dst = inet_csk_update_pmtu(sk, mtu);
347 if (!dst)
348 return;
349
350 /* Something is about to be wrong... Remember soft error
351 * for the case, if this connection will not able to recover.
352 */
353 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 sk->sk_err_soft = EMSGSIZE;
355
356 mtu = dst_mtu(dst);
357
358 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
482fc609 359 ip_sk_accept_pmtu(sk) &&
d83d8461 360 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 tcp_sync_mss(sk, mtu);
362
363 /* Resend the TCP packet because it's
364 * clear that the old packet has been
365 * dropped. This is the new "fast" path mtu
366 * discovery.
367 */
368 tcp_simple_retransmit(sk);
369 } /* else let the usual retransmit timer handle it */
370}
4fab9071 371EXPORT_SYMBOL(tcp_v4_mtu_reduced);
1da177e4 372
373static void do_redirect(struct sk_buff *skb, struct sock *sk)
374{
375 struct dst_entry *dst = __sk_dst_check(sk, 0);
376
1ed5c48f 377 if (dst)
6700c270 378 dst->ops->redirect(dst, sk, skb);
379}
380
381
382/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
9cf74903 383void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384{
385 struct request_sock *req = inet_reqsk(sk);
386 struct net *net = sock_net(sk);
387
388 /* ICMPs are not backlogged, hence we cannot get
389 * an established socket here.
390 */
26e37360 391 if (seq != tcp_rsk(req)->snt_isn) {
02a1d6e7 392 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
9cf74903 393 } else if (abort) {
394 /*
395 * Still in SYN_RECV, just remove it silently.
396 * There is no good way to pass the error to the newly
397 * created socket, and POSIX does not want network
398 * errors returned from accept().
399 */
c6973669 400 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
9caad864 401 tcp_listendrop(req->rsk_listener);
26e37360 402 }
ef84d8ce 403 reqsk_put(req);
404}
405EXPORT_SYMBOL(tcp_req_err);
406
f7456642 407/* TCP-LD (RFC 6069) logic */
d2924569 408void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409{
410 struct inet_connection_sock *icsk = inet_csk(sk);
411 struct tcp_sock *tp = tcp_sk(sk);
412 struct sk_buff *skb;
413 s32 remaining;
414 u32 delta_us;
415
416 if (sock_owned_by_user(sk))
417 return;
418
419 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
420 !icsk->icsk_backoff)
421 return;
422
423 skb = tcp_rtx_queue_head(sk);
424 if (WARN_ON_ONCE(!skb))
425 return;
426
427 icsk->icsk_backoff--;
428 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430
431 tcp_mstamp_refresh(tp);
432 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434
435 if (remaining > 0) {
436 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 remaining, TCP_RTO_MAX);
438 } else {
439 /* RTO revert clocked out retransmission.
440 * Will retransmit now.
441 */
442 tcp_retransmit_timer(sk);
443 }
444}
d2924569 445EXPORT_SYMBOL(tcp_ld_RTO_revert);
f7456642 446
447/*
448 * This routine is called by the ICMP module when it gets some
449 * sort of error condition. If err < 0 then the socket should
450 * be closed and the error returned to the user. If err > 0
451 * it's just the icmp type << 8 | icmp code. After adjustment
452 * header points to the first 8 bytes of the tcp header. We need
453 * to find the appropriate port.
454 *
455 * The locking strategy used here is very "optimistic". When
456 * someone else accesses the socket the ICMP is just dropped
457 * and for some paths there is no check at all.
458 * A more general error queue to queue errors for later handling
459 * is probably better.
460 *
461 */
462
a12daf13 463int tcp_v4_err(struct sk_buff *skb, u32 info)
1da177e4 464{
465 const struct iphdr *iph = (const struct iphdr *)skb->data;
466 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467 struct tcp_sock *tp;
468 struct inet_sock *inet;
469 const int type = icmp_hdr(skb)->type;
470 const int code = icmp_hdr(skb)->code;
1da177e4 471 struct sock *sk;
0a672f74 472 struct request_sock *fastopen;
9a568de4 473 u32 seq, snd_una;
1da177e4 474 int err;
a12daf13 475 struct net *net = dev_net(skb->dev);
1da177e4 476
477 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 th->dest, iph->saddr, ntohs(th->source),
a12daf13 479 inet_iif(skb), 0);
1da177e4 480 if (!sk) {
5d3848bc 481 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
32bbd879 482 return -ENOENT;
483 }
484 if (sk->sk_state == TCP_TIME_WAIT) {
9469c7b4 485 inet_twsk_put(inet_twsk(sk));
32bbd879 486 return 0;
1da177e4 487 }
26e37360 488 seq = ntohl(th->seq);
489 if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 type == ICMP_TIME_EXCEEDED ||
492 (type == ICMP_DEST_UNREACH &&
493 (code == ICMP_NET_UNREACH ||
494 code == ICMP_HOST_UNREACH)));
495 return 0;
496 }
497
498 bh_lock_sock(sk);
499 /* If too many ICMPs get dropped on busy
500 * servers this needs to be solved differently.
501 * We do take care of PMTU discovery (RFC1191) special case :
502 * we can receive locally generated ICMP messages while socket is held.
1da177e4 503 */
504 if (sock_owned_by_user(sk)) {
505 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
02a1d6e7 506 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
b74aa930 507 }
508 if (sk->sk_state == TCP_CLOSE)
509 goto out;
510
97e3ecd1 511 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
02a1d6e7 512 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
97e3ecd1 513 goto out;
514 }
515
1da177e4 516 tp = tcp_sk(sk);
0a672f74 517 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
d983ea6f 518 fastopen = rcu_dereference(tp->fastopen_rsk);
0a672f74 519 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
1da177e4 520 if (sk->sk_state != TCP_LISTEN &&
0a672f74 521 !between(seq, snd_una, tp->snd_nxt)) {
02a1d6e7 522 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523 goto out;
524 }
525
526 switch (type) {
55be7a9c 527 case ICMP_REDIRECT:
45caeaa5 528 if (!sock_owned_by_user(sk))
a12daf13 529 do_redirect(skb, sk);
55be7a9c 530 goto out;
531 case ICMP_SOURCE_QUENCH:
532 /* Just silently ignore these. */
533 goto out;
534 case ICMP_PARAMETERPROB:
535 err = EPROTO;
536 break;
537 case ICMP_DEST_UNREACH:
538 if (code > NR_ICMP_UNREACH)
539 goto out;
540
541 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542 /* We are not interested in TCP_LISTEN and open_requests
543 * (SYN-ACKs send out by Linux are always <576bytes so
544 * they should go through unfragmented).
545 */
546 if (sk->sk_state == TCP_LISTEN)
547 goto out;
548
561022ac 549 WRITE_ONCE(tp->mtu_info, info);
144d56e9 550 if (!sock_owned_by_user(sk)) {
563d34d0 551 tcp_v4_mtu_reduced(sk);
144d56e9 552 } else {
7aa5470c 553 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554 sock_hold(sk);
555 }
556 goto out;
557 }
558
559 err = icmp_err_convert[code].errno;
560 /* check if this ICMP message allows revert of backoff.
561 * (see RFC 6069)
562 */
563 if (!fastopen &&
564 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 tcp_ld_RTO_revert(sk, seq);
566 break;
567 case ICMP_TIME_EXCEEDED:
568 err = EHOSTUNREACH;
569 break;
570 default:
571 goto out;
572 }
573
574 switch (sk->sk_state) {
1da177e4 575 case TCP_SYN_SENT:
576 case TCP_SYN_RECV:
577 /* Only in fast or simultaneous open. If a fast open socket is
2bdcc73c 578 * already accepted it is treated as a connected one below.
0a672f74 579 */
51456b29 580 if (fastopen && !fastopen->sk)
581 break;
582
a12daf13 583 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
45af29ca 584
1da177e4 585 if (!sock_owned_by_user(sk)) {
586 sk->sk_err = err;
587
e3ae2365 588 sk_error_report(sk);
589
590 tcp_done(sk);
591 } else {
592 sk->sk_err_soft = err;
593 }
594 goto out;
595 }
596
597 /* If we've already connected we will keep trying
598 * until we time out, or the user gives up.
599 *
600 * rfc1122 4.2.3.9 allows to consider as hard errors
601 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602 * but it is obsoleted by pmtu discovery).
603 *
604 * Note, that in modern internet, where routing is unreliable
605 * and in each dark corner broken firewalls sit, sending random
 606 * errors ordered by their masters even these two messages finally lose
607 * their original sense (even Linux sends invalid PORT_UNREACHs)
608 *
609 * Now we are in compliance with RFCs.
610 * --ANK (980905)
611 */
612
613 inet = inet_sk(sk);
614 if (!sock_owned_by_user(sk) && inet->recverr) {
615 sk->sk_err = err;
e3ae2365 616 sk_error_report(sk);
617 } else { /* Only an error on timeout */
618 sk->sk_err_soft = err;
619 }
620
621out:
622 bh_unlock_sock(sk);
623 sock_put(sk);
32bbd879 624 return 0;
625}
626
28850dc7 627void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1da177e4 628{
aa8223c7 629 struct tcphdr *th = tcp_hdr(skb);
1da177e4 630
631 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 skb->csum_start = skb_transport_header(skb) - skb->head;
633 skb->csum_offset = offsetof(struct tcphdr, check);
634}
635
419f9f89 636/* This routine computes an IPv4 TCP checksum. */
bb296246 637void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
419f9f89 638{
cf533ea5 639 const struct inet_sock *inet = inet_sk(sk);
640
641 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642}
4bc2f18b 643EXPORT_SYMBOL(tcp_v4_send_check);
419f9f89 644
645/*
646 * This routine will send an RST to the other tcp.
647 *
648 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
649 * for reset.
650 * Answer: if a packet caused RST, it is not for a socket
651 * existing in our system, if it is matched to a socket,
652 * it is just duplicate segment or bug in other side's TCP.
653 * So that we build reply only basing on parameters
654 * arrived with segment.
655 * Exception: precedence violation. We do not implement it in any case.
656 */
657
658#ifdef CONFIG_TCP_MD5SIG
659#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
660#else
661#define OPTION_BYTES sizeof(__be32)
662#endif
663
a00e7444 664static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
1da177e4 665{
cf533ea5 666 const struct tcphdr *th = tcp_hdr(skb);
667 struct {
668 struct tcphdr th;
dc87efdb 669 __be32 opt[OPTION_BYTES / sizeof(__be32)];
cfb6eeb4 670 } rep;
1da177e4 671 struct ip_reply_arg arg;
cfb6eeb4 672#ifdef CONFIG_TCP_MD5SIG
e46787f0 673 struct tcp_md5sig_key *key = NULL;
674 const __u8 *hash_location = NULL;
675 unsigned char newhash[16];
676 int genhash;
677 struct sock *sk1 = NULL;
cfb6eeb4 678#endif
d6fb396c 679 u64 transmit_time = 0;
00483690 680 struct sock *ctl_sk;
d6fb396c 681 struct net *net;
682
683 /* Never send a reset in response to a reset. */
684 if (th->rst)
685 return;
686
687 /* If sk not NULL, it means we did a successful lookup and incoming
688 * route had to be correct. prequeue might have dropped our dst.
689 */
690 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
691 return;
692
693 /* Swap the send and the receive. */
694 memset(&rep, 0, sizeof(rep));
695 rep.th.dest = th->source;
696 rep.th.source = th->dest;
697 rep.th.doff = sizeof(struct tcphdr) / 4;
698 rep.th.rst = 1;
699
700 if (th->ack) {
cfb6eeb4 701 rep.th.seq = th->ack_seq;
1da177e4 702 } else {
703 rep.th.ack = 1;
704 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
705 skb->len - (th->doff << 2));
706 }
707
7174259e 708 memset(&arg, 0, sizeof(arg));
709 arg.iov[0].iov_base = (unsigned char *)&rep;
710 arg.iov[0].iov_len = sizeof(rep.th);
711
0f85feae 712 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
cfb6eeb4 713#ifdef CONFIG_TCP_MD5SIG
3b24d854 714 rcu_read_lock();
658ddaaf 715 hash_location = tcp_parse_md5sig_option(th);
271c3b9b 716 if (sk && sk_fullsock(sk)) {
cea97609 717 const union tcp_md5_addr *addr;
dea53bb8 718 int l3index;
cea97609 719
720 /* sdif set, means packet ingressed via a device
721 * in an L3 domain and inet_iif is set to it.
722 */
723 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
cea97609 724 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
dea53bb8 725 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
e46787f0 726 } else if (hash_location) {
cea97609 727 const union tcp_md5_addr *addr;
728 int sdif = tcp_v4_sdif(skb);
729 int dif = inet_iif(skb);
dea53bb8 730 int l3index;
cea97609 731
732 /*
733 * active side is lost. Try to find listening socket through
734 * source port, and then find md5 key through listening socket.
 735 * we are not loosening security here:
736 * Incoming packet is checked with md5 hash with finding key,
737 * no RST generated if md5 hash doesn't match.
738 */
739 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
740 ip_hdr(skb)->saddr,
da5e3630 741 th->source, ip_hdr(skb)->daddr,
534322ca 742 ntohs(th->source), dif, sdif);
743 /* don't send rst if it can't find key */
744 if (!sk1)
745 goto out;
746
747 /* sdif set, means packet ingressed via a device
748 * in an L3 domain and dif is set to it.
749 */
750 l3index = sdif ? dif : 0;
cea97609 751 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
dea53bb8 752 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
658ddaaf 753 if (!key)
754 goto out;
755
658ddaaf 756
39f8e58e 757 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
658ddaaf 758 if (genhash || memcmp(hash_location, newhash, 16) != 0)
759 goto out;
760
761 }
762
763 if (key) {
764 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
765 (TCPOPT_NOP << 16) |
766 (TCPOPT_MD5SIG << 8) |
767 TCPOLEN_MD5SIG);
768 /* Update length and the length the header thinks exists */
769 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
770 rep.th.doff = arg.iov[0].iov_len / 4;
771
49a72dfb 772 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
773 key, ip_hdr(skb)->saddr,
774 ip_hdr(skb)->daddr, &rep.th);
775 }
776#endif
777 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
778 if (rep.opt[0] == 0) {
779 __be32 mrst = mptcp_reset_option(skb);
780
781 if (mrst) {
782 rep.opt[0] = mrst;
783 arg.iov[0].iov_len += sizeof(mrst);
784 rep.th.doff = arg.iov[0].iov_len / 4;
785 }
786 }
787
788 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
789 ip_hdr(skb)->saddr, /* XXX */
52cd5750 790 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1da177e4 791 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
792 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
793
e2446eaa 794 /* When socket is gone, all binding information is lost.
795 * routing might fail in this case. No choice here, if we choose to force
796 * input interface, we will misroute in case of asymmetric route.
e2446eaa 797 */
c24b14c4 798 if (sk) {
4c675258 799 arg.bound_dev_if = sk->sk_bound_dev_if;
800 if (sk_fullsock(sk))
801 trace_tcp_send_reset(sk, skb);
c24b14c4 802 }
1da177e4 803
804 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
805 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
806
66b13d99 807 arg.tos = ip_hdr(skb)->tos;
e2d118a1 808 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
47dcc20a 809 local_bh_disable();
5472c3c6 810 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
a842fe14 811 if (sk) {
812 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
813 inet_twsk(sk)->tw_mark : sk->sk_mark;
814 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
815 inet_twsk(sk)->tw_priority : sk->sk_priority;
d6fb396c 816 transmit_time = tcp_transmit_time(sk);
a842fe14 817 }
00483690 818 ip_send_unicast_reply(ctl_sk,
bdbbb852 819 skb, &TCP_SKB_CB(skb)->header.h4.opt,
24a2d43d 820 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
821 &arg, arg.iov[0].iov_len,
822 transmit_time);
1da177e4 823
00483690 824 ctl_sk->sk_mark = 0;
825 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
826 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
47dcc20a 827 local_bh_enable();
828
829#ifdef CONFIG_TCP_MD5SIG
830out:
831 rcu_read_unlock();
658ddaaf 832#endif
833}
834
835/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
836 outside socket context is ugly, certainly. What can I do?
837 */
838
e2d118a1 839static void tcp_v4_send_ack(const struct sock *sk,
e62a123b 840 struct sk_buff *skb, u32 seq, u32 ack,
ee684b6f 841 u32 win, u32 tsval, u32 tsecr, int oif,
88ef4a5a 842 struct tcp_md5sig_key *key,
66b13d99 843 int reply_flags, u8 tos)
1da177e4 844{
cf533ea5 845 const struct tcphdr *th = tcp_hdr(skb);
846 struct {
847 struct tcphdr th;
714e85be 848 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
cfb6eeb4 849#ifdef CONFIG_TCP_MD5SIG
714e85be 850 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
851#endif
852 ];
1da177e4 853 } rep;
e2d118a1 854 struct net *net = sock_net(sk);
1da177e4 855 struct ip_reply_arg arg;
00483690 856 struct sock *ctl_sk;
d6fb396c 857 u64 transmit_time;
858
859 memset(&rep.th, 0, sizeof(struct tcphdr));
7174259e 860 memset(&arg, 0, sizeof(arg));
861
862 arg.iov[0].iov_base = (unsigned char *)&rep;
863 arg.iov[0].iov_len = sizeof(rep.th);
ee684b6f 864 if (tsecr) {
865 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
866 (TCPOPT_TIMESTAMP << 8) |
867 TCPOLEN_TIMESTAMP);
868 rep.opt[1] = htonl(tsval);
869 rep.opt[2] = htonl(tsecr);
cb48cfe8 870 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
871 }
872
873 /* Swap the send and the receive. */
874 rep.th.dest = th->source;
875 rep.th.source = th->dest;
876 rep.th.doff = arg.iov[0].iov_len / 4;
877 rep.th.seq = htonl(seq);
878 rep.th.ack_seq = htonl(ack);
879 rep.th.ack = 1;
880 rep.th.window = htons(win);
881
cfb6eeb4 882#ifdef CONFIG_TCP_MD5SIG
cfb6eeb4 883 if (key) {
ee684b6f 884 int offset = (tsecr) ? 3 : 0;
885
886 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
887 (TCPOPT_NOP << 16) |
888 (TCPOPT_MD5SIG << 8) |
889 TCPOLEN_MD5SIG);
890 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
891 rep.th.doff = arg.iov[0].iov_len/4;
892
49a72dfb 893 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
894 key, ip_hdr(skb)->saddr,
895 ip_hdr(skb)->daddr, &rep.th);
896 }
897#endif
88ef4a5a 898 arg.flags = reply_flags;
899 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
900 ip_hdr(skb)->saddr, /* XXX */
901 arg.iov[0].iov_len, IPPROTO_TCP, 0);
902 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
903 if (oif)
904 arg.bound_dev_if = oif;
66b13d99 905 arg.tos = tos;
e2d118a1 906 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
47dcc20a 907 local_bh_disable();
5472c3c6 908 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
909 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
910 inet_twsk(sk)->tw_mark : sk->sk_mark;
911 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
912 inet_twsk(sk)->tw_priority : sk->sk_priority;
d6fb396c 913 transmit_time = tcp_transmit_time(sk);
00483690 914 ip_send_unicast_reply(ctl_sk,
bdbbb852 915 skb, &TCP_SKB_CB(skb)->header.h4.opt,
24a2d43d 916 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
917 &arg, arg.iov[0].iov_len,
918 transmit_time);
1da177e4 919
00483690 920 ctl_sk->sk_mark = 0;
90bbcc60 921 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
47dcc20a 922 local_bh_enable();
923}
924
925static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
926{
8feaf0c0 927 struct inet_timewait_sock *tw = inet_twsk(sk);
cfb6eeb4 928 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1da177e4 929
e2d118a1 930 tcp_v4_send_ack(sk, skb,
e62a123b 931 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
7174259e 932 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
9a568de4 933 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
934 tcptw->tw_ts_recent,
935 tw->tw_bound_dev_if,
88ef4a5a 936 tcp_twsk_md5_key(tcptw),
937 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
938 tw->tw_tos
9501f972 939 );
1da177e4 940
8feaf0c0 941 inet_twsk_put(tw);
942}
943
a00e7444 944static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
7174259e 945 struct request_sock *req)
1da177e4 946{
cea97609 947 const union tcp_md5_addr *addr;
dea53bb8 948 int l3index;
cea97609 949
950 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
951 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
952 */
953 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
954 tcp_sk(sk)->snd_nxt;
955
956 /* RFC 7323 2.3
957 * The window field (SEG.WND) of every outgoing segment, with the
958 * exception of <SYN> segments, MUST be right-shifted by
959 * Rcv.Wind.Shift bits:
960 */
cea97609 961 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
dea53bb8 962 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
e2d118a1 963 tcp_v4_send_ack(sk, skb, seq,
964 tcp_rsk(req)->rcv_nxt,
965 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
9a568de4 966 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
967 req->ts_recent,
968 0,
dea53bb8 969 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
970 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
971 ip_hdr(skb)->tos);
972}
973
1da177e4 974/*
9bf1d83e 975 * Send a SYN-ACK after having received a SYN.
60236fdd 976 * This still operates on a request_sock only, not on a big
977 * socket.
978 */
0f935dbe 979static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
d6274bd8 980 struct flowi *fl,
72659ecc 981 struct request_sock *req,
ca6fb065 982 struct tcp_fastopen_cookie *foc,
983 enum tcp_synack_type synack_type,
984 struct sk_buff *syn_skb)
1da177e4 985{
2e6599cb 986 const struct inet_request_sock *ireq = inet_rsk(req);
6bd023f3 987 struct flowi4 fl4;
1da177e4 988 int err = -1;
d41db5af 989 struct sk_buff *skb;
ac8f1710 990 u8 tos;
991
992 /* First, grab a route. */
ba3f7f04 993 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
fd80eb94 994 return -1;
1da177e4 995
331fca43 996 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
997
998 if (skb) {
634fb979 999 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1da177e4 1000
407c85c7 1001 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1002 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1003 (inet_sk(sk)->tos & INET_ECN_MASK) :
1004 inet_sk(sk)->tos;
1005
1006 if (!INET_ECN_is_capable(tos) &&
1007 tcp_bpf_ca_needs_ecn((struct sock *)req))
1008 tos |= INET_ECN_ECT_0;
1009
2ab2ddd3 1010 rcu_read_lock();
1011 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1012 ireq->ir_rmt_addr,
de033b7d 1013 rcu_dereference(ireq->ireq_opt),
861602b5 1014 tos);
2ab2ddd3 1015 rcu_read_unlock();
b9df3cb8 1016 err = net_xmit_eval(err);
1017 }
1018
1019 return err;
1020}
1021
1022/*
60236fdd 1023 * IPv4 request_sock destructor.
1da177e4 1024 */
60236fdd 1025static void tcp_v4_reqsk_destructor(struct request_sock *req)
1da177e4 1026{
c92e8c02 1027 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1028}
1029
1030#ifdef CONFIG_TCP_MD5SIG
1031/*
1032 * RFC2385 MD5 checksumming requires a mapping of
1033 * IP address->MD5 Key.
1034 * We need to maintain these in the sk structure.
1035 */
1036
921f9a0f 1037DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1038EXPORT_SYMBOL(tcp_md5_needed);
1039
1040static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1041{
1042 if (!old)
1043 return true;
1044
1045 /* l3index always overrides non-l3index */
1046 if (old->l3index && new->l3index == 0)
1047 return false;
1048 if (old->l3index == 0 && new->l3index)
1049 return true;
1050
1051 return old->prefixlen < new->prefixlen;
1052}
1053
cfb6eeb4 1054/* Find the Key structure for an address. */
dea53bb8 1055struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1056 const union tcp_md5_addr *addr,
1057 int family)
cfb6eeb4 1058{
fd3a154a 1059 const struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1060 struct tcp_md5sig_key *key;
fd3a154a 1061 const struct tcp_md5sig_info *md5sig;
1062 __be32 mask;
1063 struct tcp_md5sig_key *best_match = NULL;
1064 bool match;
cfb6eeb4 1065
1066 /* caller either holds rcu_read_lock() or socket lock */
1067 md5sig = rcu_dereference_check(tp->md5sig_info,
1e1d04e6 1068 lockdep_sock_is_held(sk));
a8afca03 1069 if (!md5sig)
cfb6eeb4 1070 return NULL;
083a0326 1071
1072 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1073 lockdep_sock_is_held(sk)) {
1074 if (key->family != family)
1075 continue;
a76c2315 1076 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
dea53bb8 1077 continue;
1078 if (family == AF_INET) {
1079 mask = inet_make_mask(key->prefixlen);
1080 match = (key->addr.a4.s_addr & mask) ==
1081 (addr->a4.s_addr & mask);
1082#if IS_ENABLED(CONFIG_IPV6)
1083 } else if (family == AF_INET6) {
1084 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1085 key->prefixlen);
1086#endif
1087 } else {
1088 match = false;
1089 }
1090
86f1e3a8 1091 if (match && better_md5_match(best_match, key))
1092 best_match = key;
1093 }
1094 return best_match;
1095}
6015c71e 1096EXPORT_SYMBOL(__tcp_md5_do_lookup);
6797318e 1097
1098static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1099 const union tcp_md5_addr *addr,
dea53bb8 1100 int family, u8 prefixlen,
a76c2315 1101 int l3index, u8 flags)
1102{
1103 const struct tcp_sock *tp = tcp_sk(sk);
1104 struct tcp_md5sig_key *key;
1105 unsigned int size = sizeof(struct in_addr);
1106 const struct tcp_md5sig_info *md5sig;
1107
1108 /* caller either holds rcu_read_lock() or socket lock */
1109 md5sig = rcu_dereference_check(tp->md5sig_info,
1110 lockdep_sock_is_held(sk));
1111 if (!md5sig)
1112 return NULL;
1113#if IS_ENABLED(CONFIG_IPV6)
1114 if (family == AF_INET6)
1115 size = sizeof(struct in6_addr);
1116#endif
1117 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1118 lockdep_sock_is_held(sk)) {
1119 if (key->family != family)
1120 continue;
1121 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1122 continue;
86f1e3a8 1123 if (key->l3index != l3index)
dea53bb8 1124 continue;
1125 if (!memcmp(&key->addr, addr, size) &&
1126 key->prefixlen == prefixlen)
a915da9b 1127 return key;
1128 }
1129 return NULL;
1130}
1131
b83e3deb 1132struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
fd3a154a 1133 const struct sock *addr_sk)
cfb6eeb4 1134{
b52e6921 1135 const union tcp_md5_addr *addr;
dea53bb8 1136 int l3index;
a915da9b 1137
dea53bb8
DA
1138 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1139 addr_sk->sk_bound_dev_if);
b52e6921 1140 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
dea53bb8 1141 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
cfb6eeb4 1142}
1143EXPORT_SYMBOL(tcp_v4_md5_lookup);
1144
cfb6eeb4 1145/* This can be called on a newly created socket, from other files */
a915da9b 1146int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
a76c2315 1147 int family, u8 prefixlen, int l3index, u8 flags,
dea53bb8 1148 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1149{
1150 /* Add Key to the list */
b0a713e9 1151 struct tcp_md5sig_key *key;
cfb6eeb4 1152 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1153 struct tcp_md5sig_info *md5sig;
cfb6eeb4 1154
a76c2315 1155 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
cfb6eeb4 1156 if (key) {
1157 /* Pre-existing entry - just update that one.
1158 * Note that the key might be used concurrently.
1159 * data_race() is telling kcsan that we do not care of
1160 * key mismatches, since changing MD5 key on live flows
1161 * can lead to packet drops.
1162 */
1163 data_race(memcpy(key->key, newkey, newkeylen));
6a2febec 1164
1165 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1166 * Also note that a reader could catch new key->keylen value
1167 * but old key->key[], this is the reason we use __GFP_ZERO
1168 * at sock_kmalloc() time below these lines.
1169 */
1170 WRITE_ONCE(key->keylen, newkeylen);
6a2febec 1171
1172 return 0;
1173 }
260fcbeb 1174
a8afca03 1175 md5sig = rcu_dereference_protected(tp->md5sig_info,
1e1d04e6 1176 lockdep_sock_is_held(sk));
1177 if (!md5sig) {
1178 md5sig = kmalloc(sizeof(*md5sig), gfp);
1179 if (!md5sig)
cfb6eeb4 1180 return -ENOMEM;
cfb6eeb4 1181
1182 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1183 INIT_HLIST_HEAD(&md5sig->head);
a8afca03 1184 rcu_assign_pointer(tp->md5sig_info, md5sig);
a915da9b 1185 }
cfb6eeb4 1186
e6ced831 1187 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1188 if (!key)
1189 return -ENOMEM;
71cea17e 1190 if (!tcp_alloc_md5sig_pool()) {
5f3d9cb2 1191 sock_kfree_s(sk, key, sizeof(*key));
a915da9b 1192 return -ENOMEM;
cfb6eeb4 1193 }
1194
1195 memcpy(key->key, newkey, newkeylen);
1196 key->keylen = newkeylen;
1197 key->family = family;
6797318e 1198 key->prefixlen = prefixlen;
dea53bb8 1199 key->l3index = l3index;
a76c2315 1200 key->flags = flags;
1201 memcpy(&key->addr, addr,
1202 (family == AF_INET6) ? sizeof(struct in6_addr) :
1203 sizeof(struct in_addr));
1204 hlist_add_head_rcu(&key->node, &md5sig->head);
1205 return 0;
1206}
a915da9b 1207EXPORT_SYMBOL(tcp_md5_do_add);
cfb6eeb4 1208
6797318e 1209int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
a76c2315 1210 u8 prefixlen, int l3index, u8 flags)
cfb6eeb4 1211{
1212 struct tcp_md5sig_key *key;
1213
a76c2315 1214 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1215 if (!key)
1216 return -ENOENT;
1217 hlist_del_rcu(&key->node);
5f3d9cb2 1218 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
a915da9b 1219 kfree_rcu(key, rcu);
a915da9b 1220 return 0;
cfb6eeb4 1221}
a915da9b 1222EXPORT_SYMBOL(tcp_md5_do_del);
cfb6eeb4 1223
e0683e70 1224static void tcp_clear_md5_list(struct sock *sk)
1225{
1226 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1227 struct tcp_md5sig_key *key;
b67bfe0d 1228 struct hlist_node *n;
a8afca03 1229 struct tcp_md5sig_info *md5sig;
cfb6eeb4 1230
1231 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1232
b67bfe0d 1233 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
a915da9b 1234 hlist_del_rcu(&key->node);
5f3d9cb2 1235 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
a915da9b 1236 kfree_rcu(key, rcu);
1237 }
1238}
1239
8917a777 1240static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
d4c19c49 1241 sockptr_t optval, int optlen)
1242{
1243 struct tcp_md5sig cmd;
1244 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
cea97609 1245 const union tcp_md5_addr *addr;
8917a777 1246 u8 prefixlen = 32;
dea53bb8 1247 int l3index = 0;
a76c2315 1248 u8 flags;
1249
1250 if (optlen < sizeof(cmd))
1251 return -EINVAL;
1252
d4c19c49 1253 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1254 return -EFAULT;
1255
1256 if (sin->sin_family != AF_INET)
1257 return -EINVAL;
1258
1259 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1260
8917a777
ID
1261 if (optname == TCP_MD5SIG_EXT &&
1262 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1263 prefixlen = cmd.tcpm_prefixlen;
1264 if (prefixlen > 32)
1265 return -EINVAL;
1266 }
1267
a76c2315 1268 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1269 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1270 struct net_device *dev;
1271
1272 rcu_read_lock();
1273 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1274 if (dev && netif_is_l3_master(dev))
1275 l3index = dev->ifindex;
1276
1277 rcu_read_unlock();
1278
1279 /* ok to reference set/not set outside of rcu;
1280 * right now device MUST be an L3 master
1281 */
1282 if (!dev || !l3index)
1283 return -EINVAL;
1284 }
1285
1286 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1287
64a124ed 1288 if (!cmd.tcpm_keylen)
a76c2315 1289 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1290
1291 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1292 return -EINVAL;
1293
a76c2315 1294 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
cea97609 1295 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1296}
1297
1298static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1299 __be32 daddr, __be32 saddr,
1300 const struct tcphdr *th, int nbytes)
cfb6eeb4 1301{
cfb6eeb4 1302 struct tcp4_pseudohdr *bp;
49a72dfb 1303 struct scatterlist sg;
19689e38 1304 struct tcphdr *_th;
cfb6eeb4 1305
19689e38 1306 bp = hp->scratch;
1307 bp->saddr = saddr;
1308 bp->daddr = daddr;
1309 bp->pad = 0;
076fb722 1310 bp->protocol = IPPROTO_TCP;
49a72dfb 1311 bp->len = cpu_to_be16(nbytes);
c7da57a1 1312
1313 _th = (struct tcphdr *)(bp + 1);
1314 memcpy(_th, th, sizeof(*th));
1315 _th->check = 0;
1316
1317 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1318 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1319 sizeof(*bp) + sizeof(*th));
cf80e0e4 1320 return crypto_ahash_update(hp->md5_req);
1321}
1322
a915da9b 1323static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
318cf7aa 1324 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1325{
1326 struct tcp_md5sig_pool *hp;
cf80e0e4 1327 struct ahash_request *req;
1328
1329 hp = tcp_get_md5sig_pool();
1330 if (!hp)
1331 goto clear_hash_noput;
cf80e0e4 1332 req = hp->md5_req;
49a72dfb 1333
cf80e0e4 1334 if (crypto_ahash_init(req))
49a72dfb 1335 goto clear_hash;
19689e38 1336 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1337 goto clear_hash;
1338 if (tcp_md5_hash_key(hp, key))
1339 goto clear_hash;
1340 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1341 if (crypto_ahash_final(req))
1342 goto clear_hash;
1343
cfb6eeb4 1344 tcp_put_md5sig_pool();
cfb6eeb4 1345 return 0;
49a72dfb 1346
1347clear_hash:
1348 tcp_put_md5sig_pool();
1349clear_hash_noput:
1350 memset(md5_hash, 0, 16);
49a72dfb 1351 return 1;
1352}
1353
1354int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1355 const struct sock *sk,
318cf7aa 1356 const struct sk_buff *skb)
cfb6eeb4 1357{
49a72dfb 1358 struct tcp_md5sig_pool *hp;
cf80e0e4 1359 struct ahash_request *req;
318cf7aa 1360 const struct tcphdr *th = tcp_hdr(skb);
1361 __be32 saddr, daddr;
1362
1363 if (sk) { /* valid for establish/request sockets */
1364 saddr = sk->sk_rcv_saddr;
1365 daddr = sk->sk_daddr;
cfb6eeb4 1366 } else {
1367 const struct iphdr *iph = ip_hdr(skb);
1368 saddr = iph->saddr;
1369 daddr = iph->daddr;
cfb6eeb4 1370 }
1371
1372 hp = tcp_get_md5sig_pool();
1373 if (!hp)
1374 goto clear_hash_noput;
cf80e0e4 1375 req = hp->md5_req;
49a72dfb 1376
cf80e0e4 1377 if (crypto_ahash_init(req))
1378 goto clear_hash;
1379
19689e38 1380 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1381 goto clear_hash;
1382 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1383 goto clear_hash;
1384 if (tcp_md5_hash_key(hp, key))
1385 goto clear_hash;
1386 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1387 if (crypto_ahash_final(req))
1388 goto clear_hash;
1389
1390 tcp_put_md5sig_pool();
1391 return 0;
1392
1393clear_hash:
1394 tcp_put_md5sig_pool();
1395clear_hash_noput:
1396 memset(md5_hash, 0, 16);
1397 return 1;
cfb6eeb4 1398}
49a72dfb 1399EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
cfb6eeb4 1400
1401#endif
1402
ff74e23f 1403/* Called with rcu_read_lock() */
ba8e275a 1404static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1405 const struct sk_buff *skb,
1406 int dif, int sdif)
cfb6eeb4 1407{
ba8e275a 1408#ifdef CONFIG_TCP_MD5SIG
1409 /*
1410 * This gets called for each TCP segment that arrives
1411 * so we want to be efficient.
1412 * We have 3 drop cases:
1413 * o No MD5 hash and one expected.
1414 * o MD5 hash and we're not expecting one.
 1415 * o MD5 hash and it's wrong.
1416 */
cf533ea5 1417 const __u8 *hash_location = NULL;
cfb6eeb4 1418 struct tcp_md5sig_key *hash_expected;
eddc9ec5 1419 const struct iphdr *iph = ip_hdr(skb);
cf533ea5 1420 const struct tcphdr *th = tcp_hdr(skb);
cea97609 1421 const union tcp_md5_addr *addr;
cfb6eeb4 1422 unsigned char newhash[16];
1423 int genhash, l3index;
1424
1425 /* sdif set, means packet ingressed via a device
1426 * in an L3 domain and dif is set to the l3mdev
1427 */
1428 l3index = sdif ? dif : 0;
cfb6eeb4 1429
cea97609 1430 addr = (union tcp_md5_addr *)&iph->saddr;
dea53bb8 1431 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
7d5d5525 1432 hash_location = tcp_parse_md5sig_option(th);
cfb6eeb4 1433
1434 /* We've parsed the options - do we have a hash? */
1435 if (!hash_expected && !hash_location)
a2a385d6 1436 return false;
1437
1438 if (hash_expected && !hash_location) {
c10d9310 1439 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
a2a385d6 1440 return true;
1441 }
1442
1443 if (!hash_expected && hash_location) {
c10d9310 1444 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
a2a385d6 1445 return true;
1446 }
1447
1448 /* Okay, so this is hash_expected and hash_location -
1449 * so we need to calculate the checksum.
1450 */
1451 genhash = tcp_v4_md5_hash_skb(newhash,
1452 hash_expected,
39f8e58e 1453 NULL, skb);
1454
1455 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
72145a68 1456 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
dea53bb8 1457 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1458 &iph->saddr, ntohs(th->source),
1459 &iph->daddr, ntohs(th->dest),
1460 genhash ? " tcp_v4_calc_md5_hash failed"
dea53bb8 1461 : "", l3index);
a2a385d6 1462 return true;
cfb6eeb4 1463 }
a2a385d6 1464 return false;
cfb6eeb4 1465#endif
1466 return false;
1467}
cfb6eeb4 1468
1469static void tcp_v4_init_req(struct request_sock *req,
1470 const struct sock *sk_listener,
1471 struct sk_buff *skb)
1472{
1473 struct inet_request_sock *ireq = inet_rsk(req);
c92e8c02 1474 struct net *net = sock_net(sk_listener);
16bea70a 1475
1476 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1477 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
c92e8c02 1478 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1479}
1480
f964629e 1481static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
7ea851d1 1482 struct sk_buff *skb,
f964629e 1483 struct flowi *fl,
7ea851d1 1484 struct request_sock *req)
d94e0417 1485{
1486 tcp_v4_init_req(req, sk, skb);
1487
1488 if (security_inet_conn_request(sk, skb, req))
1489 return NULL;
1490
4396e461 1491 return inet_csk_route_req(sk, &fl->u.ip4, req);
1492}
1493
72a3effa 1494struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1da177e4 1495 .family = PF_INET,
2e6599cb 1496 .obj_size = sizeof(struct tcp_request_sock),
5db92c99 1497 .rtx_syn_ack = tcp_rtx_synack,
1498 .send_ack = tcp_v4_reqsk_send_ack,
1499 .destructor = tcp_v4_reqsk_destructor,
1da177e4 1500 .send_reset = tcp_v4_send_reset,
688d1945 1501 .syn_ack_timeout = tcp_syn_ack_timeout,
1502};
1503
35b2c321 1504const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
2aec4a29 1505 .mss_clamp = TCP_MSS_DEFAULT,
16bea70a 1506#ifdef CONFIG_TCP_MD5SIG
fd3a154a 1507 .req_md5_lookup = tcp_v4_md5_lookup,
e3afe7b7 1508 .calc_md5_hash = tcp_v4_md5_hash_skb,
b6332e6c 1509#endif
fb7b37a7
OP
1510#ifdef CONFIG_SYN_COOKIES
1511 .cookie_init_seq = cookie_v4_init_sequence,
1512#endif
d94e0417 1513 .route_req = tcp_v4_route_req,
1514 .init_seq = tcp_v4_init_seq,
1515 .init_ts_off = tcp_v4_init_ts_off,
d6274bd8 1516 .send_synack = tcp_v4_send_synack,
16bea70a 1517};
cfb6eeb4 1518
1519int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1520{
1da177e4 1521 /* Never answer to SYNs send to broadcast or multicast */
511c3f92 1522 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1523 goto drop;
1524
1525 return tcp_conn_request(&tcp_request_sock_ops,
1526 &tcp_request_sock_ipv4_ops, sk, skb);
1da177e4 1527
1da177e4 1528drop:
9caad864 1529 tcp_listendrop(sk);
1530 return 0;
1531}
4bc2f18b 1532EXPORT_SYMBOL(tcp_v4_conn_request);
1533
1534
1535/*
1536 * The three way handshake has completed - we got a valid synack -
1537 * now create the new socket.
1538 */
0c27171e 1539struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
60236fdd 1540 struct request_sock *req,
1541 struct dst_entry *dst,
1542 struct request_sock *req_unhash,
1543 bool *own_req)
1da177e4 1544{
2e6599cb 1545 struct inet_request_sock *ireq;
01770a16 1546 bool found_dup_sk = false;
1547 struct inet_sock *newinet;
1548 struct tcp_sock *newtp;
1549 struct sock *newsk;
cfb6eeb4 1550#ifdef CONFIG_TCP_MD5SIG
cea97609 1551 const union tcp_md5_addr *addr;
cfb6eeb4 1552 struct tcp_md5sig_key *key;
dea53bb8 1553 int l3index;
cfb6eeb4 1554#endif
f6d8bd05 1555 struct ip_options_rcu *inet_opt;
1da177e4
LT
1556
1557 if (sk_acceptq_is_full(sk))
1558 goto exit_overflow;
1559
1560 newsk = tcp_create_openreq_child(sk, req, skb);
1561 if (!newsk)
093d2823 1562 goto exit_nonewsk;
1da177e4 1563
bcd76111 1564 newsk->sk_gso_type = SKB_GSO_TCPV4;
fae6ef87 1565 inet_sk_rx_dst_set(newsk, skb);
1566
1567 newtp = tcp_sk(newsk);
1568 newinet = inet_sk(newsk);
2e6599cb 1569 ireq = inet_rsk(req);
1570 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1571 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
6dd9a14e 1572 newsk->sk_bound_dev_if = ireq->ir_iif;
1573 newinet->inet_saddr = ireq->ir_loc_addr;
1574 inet_opt = rcu_dereference(ireq->ireq_opt);
1575 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
463c84b9 1576 newinet->mc_index = inet_iif(skb);
eddc9ec5 1577 newinet->mc_ttl = ip_hdr(skb)->ttl;
4c507d28 1578 newinet->rcv_tos = ip_hdr(skb)->tos;
d83d8461 1579 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1580 if (inet_opt)
1581 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
a904a069 1582 newinet->inet_id = prandom_u32();
1da177e4 1583
1584 /* Set ToS of the new socket based upon the value of incoming SYN.
1585 * ECT bits are set later in tcp_init_transfer().
1586 */
1587 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1588 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1589
1590 if (!dst) {
1591 dst = inet_csk_route_child_sock(sk, newsk, req);
1592 if (!dst)
1593 goto put_and_exit;
1594 } else {
1595 /* syncookie case : see end of cookie_v4_check() */
1596 }
1597 sk_setup_caps(newsk, dst);
1598
1599 tcp_ca_openreq_child(newsk, dst);
1600
1da177e4 1601 tcp_sync_mss(newsk, dst_mtu(dst));
3541f9e8 1602 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
f5fff5dc 1603
1604 tcp_initialize_rcv_mss(newsk);
1605
cfb6eeb4 1606#ifdef CONFIG_TCP_MD5SIG
dea53bb8 1607 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
cfb6eeb4 1608 /* Copy over the MD5 key from the original socket */
cea97609 1609 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
dea53bb8 1610 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
00db4124 1611 if (key) {
1612 /*
1613 * We're using one, so create a matching key
1614 * on the newsk structure. If we fail to get
1615 * memory, then we end up not copying the key
1616 * across. Shucks.
1617 */
a76c2315 1618 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
cea97609 1619 key->key, key->keylen, GFP_ATOMIC);
a465419b 1620 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
cfb6eeb4
YH
1621 }
1622#endif
1623
0e734419
DM
1624 if (__inet_inherit_port(sk, newsk) < 0)
1625 goto put_and_exit;
01770a16
RD
1626 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1627 &found_dup_sk);
c92e8c02 1628 if (likely(*own_req)) {
49a496c9 1629 tcp_move_syn(newtp, req);
c92e8c02
ED
1630 ireq->ireq_opt = NULL;
1631 } else {
c89dffc7
KI
1632 newinet->inet_opt = NULL;
1633
01770a16
RD
1634 if (!req_unhash && found_dup_sk) {
1635 /* This code path should only be executed in the
 1636 * syncookie case
1637 */
1638 bh_unlock_sock(newsk);
1639 sock_put(newsk);
1640 newsk = NULL;
01770a16 1641 }
c92e8c02 1642 }
1da177e4
LT
1643 return newsk;
1644
1645exit_overflow:
c10d9310 1646 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
093d2823
BS
1647exit_nonewsk:
1648 dst_release(dst);
1da177e4 1649exit:
9caad864 1650 tcp_listendrop(sk);
1da177e4 1651 return NULL;
0e734419 1652put_and_exit:
c92e8c02 1653 newinet->inet_opt = NULL;
e337e24d
CP
1654 inet_csk_prepare_forced_close(newsk);
1655 tcp_done(newsk);
0e734419 1656 goto exit;
1da177e4 1657}
4bc2f18b 1658EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1da177e4 1659
079096f1 1660static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1da177e4 1661{
079096f1 1662#ifdef CONFIG_SYN_COOKIES
52452c54 1663 const struct tcphdr *th = tcp_hdr(skb);
1da177e4 1664
af9b4738 1665 if (!th->syn)
461b74c3 1666 sk = cookie_v4_check(sk, skb);
1da177e4
LT
1667#endif
1668 return sk;
1669}
1670
9349d600
PP
1671u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1672 struct tcphdr *th, u32 *cookie)
1673{
1674 u16 mss = 0;
1675#ifdef CONFIG_SYN_COOKIES
1676 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1677 &tcp_request_sock_ipv4_ops, sk, th);
1678 if (mss) {
1679 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1680 tcp_synq_overflow(sk);
1681 }
1682#endif
1683 return mss;
1684}
1685
bbd807df
BV
1686INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1687 u32));
1da177e4 1688/* The socket must have it's spinlock held when we get
e994b2f0 1689 * here, unless it is a TCP_LISTEN socket.
1da177e4
LT
1690 *
1691 * We have a potential double-lock case here, so even when
1692 * doing backlog processing we use the BH locking scheme.
1693 * This is because we cannot sleep with the original spinlock
1694 * held.
1695 */
1696int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1697{
cfb6eeb4 1698 struct sock *rsk;
cfb6eeb4 1699
1da177e4 1700 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
404e0a8b
ED
1701 struct dst_entry *dst = sk->sk_rx_dst;
1702
bdeab991 1703 sock_rps_save_rxhash(sk, skb);
3d97379a 1704 sk_mark_napi_id(sk, skb);
404e0a8b 1705 if (dst) {
505fbcf0 1706 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
bbd807df
BV
1707 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1708 dst, 0)) {
92101b3b
DM
1709 dst_release(dst);
1710 sk->sk_rx_dst = NULL;
1711 }
1712 }
3d97d88e 1713 tcp_rcv_established(sk, skb);
1da177e4
LT
1714 return 0;
1715 }
1716
12e25e10 1717 if (tcp_checksum_complete(skb))
1da177e4
LT
1718 goto csum_err;
1719
1720 if (sk->sk_state == TCP_LISTEN) {
079096f1
ED
1721 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1722
1da177e4
LT
1723 if (!nsk)
1724 goto discard;
1da177e4 1725 if (nsk != sk) {
cfb6eeb4
YH
1726 if (tcp_child_process(sk, nsk, skb)) {
1727 rsk = nsk;
1da177e4 1728 goto reset;
cfb6eeb4 1729 }
1da177e4
LT
1730 return 0;
1731 }
ca55158c 1732 } else
bdeab991 1733 sock_rps_save_rxhash(sk, skb);
ca55158c 1734
72ab4a86 1735 if (tcp_rcv_state_process(sk, skb)) {
cfb6eeb4 1736 rsk = sk;
1da177e4 1737 goto reset;
cfb6eeb4 1738 }
1da177e4
LT
1739 return 0;
1740
1741reset:
cfb6eeb4 1742 tcp_v4_send_reset(rsk, skb);
1da177e4
LT
1743discard:
1744 kfree_skb(skb);
1745 /* Be careful here. If this function gets more complicated and
1746 * gcc suffers from register pressure on the x86, sk (in %ebx)
1747 * might be destroyed here. This current version compiles correctly,
1748 * but you have been warned.
1749 */
1750 return 0;
1751
1752csum_err:
709c0314 1753 trace_tcp_bad_csum(skb);
c10d9310
ED
1754 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1755 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1da177e4
LT
1756 goto discard;
1757}
4bc2f18b 1758EXPORT_SYMBOL(tcp_v4_do_rcv);
1da177e4 1759
7487449c 1760int tcp_v4_early_demux(struct sk_buff *skb)
41063e9d 1761{
41063e9d
DM
1762 const struct iphdr *iph;
1763 const struct tcphdr *th;
1764 struct sock *sk;
41063e9d 1765
41063e9d 1766 if (skb->pkt_type != PACKET_HOST)
7487449c 1767 return 0;
41063e9d 1768
45f00f99 1769 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
7487449c 1770 return 0;
41063e9d
DM
1771
1772 iph = ip_hdr(skb);
45f00f99 1773 th = tcp_hdr(skb);
41063e9d
DM
1774
1775 if (th->doff < sizeof(struct tcphdr) / 4)
7487449c 1776 return 0;
41063e9d 1777
45f00f99 1778 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
41063e9d 1779 iph->saddr, th->source,
7011d085 1780 iph->daddr, ntohs(th->dest),
3fa6f616 1781 skb->skb_iif, inet_sdif(skb));
41063e9d
DM
1782 if (sk) {
1783 skb->sk = sk;
1784 skb->destructor = sock_edemux;
f7e4eb03 1785 if (sk_fullsock(sk)) {
d0c294c5 1786 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
505fbcf0 1787
41063e9d
DM
1788 if (dst)
1789 dst = dst_check(dst, 0);
92101b3b 1790 if (dst &&
505fbcf0 1791 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
92101b3b 1792 skb_dst_set_noref(skb, dst);
41063e9d
DM
1793 }
1794 }
7487449c 1795 return 0;
41063e9d
DM
1796}
1797
c9c33212
ED
1798bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1799{
8265792b 1800 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
b160c285 1801 u32 tail_gso_size, tail_gso_segs;
4f693b55
ED
1802 struct skb_shared_info *shinfo;
1803 const struct tcphdr *th;
1804 struct tcphdr *thtail;
1805 struct sk_buff *tail;
1806 unsigned int hdrlen;
1807 bool fragstolen;
1808 u32 gso_segs;
b160c285 1809 u32 gso_size;
4f693b55 1810 int delta;
c9c33212
ED
1811
1812 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1813 * we can fix skb->truesize to its real value to avoid future drops.
1814 * This is valid because skb is not yet charged to the socket.
 1815 * It has been noticed that pure SACK packets were sometimes dropped
 1816 * (when built by drivers without the copybreak feature).
1817 */
60b1af33 1818 skb_condense(skb);
c9c33212 1819
ade9628e
ED
1820 skb_dst_drop(skb);
1821
4f693b55
ED
1822 if (unlikely(tcp_checksum_complete(skb))) {
1823 bh_unlock_sock(sk);
709c0314 1824 trace_tcp_bad_csum(skb);
4f693b55
ED
1825 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1826 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1827 return true;
1828 }
1829
 1830 /* Attempt to coalesce into the last skb in the backlog, even if we are
1831 * above the limits.
1832 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1833 */
1834 th = (const struct tcphdr *)skb->data;
1835 hdrlen = th->doff * 4;
4f693b55
ED
1836
1837 tail = sk->sk_backlog.tail;
1838 if (!tail)
1839 goto no_coalesce;
1840 thtail = (struct tcphdr *)tail->data;
1841
1842 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1843 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1844 ((TCP_SKB_CB(tail)->tcp_flags |
ca2fe295
ED
1845 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1846 !((TCP_SKB_CB(tail)->tcp_flags &
1847 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
4f693b55
ED
1848 ((TCP_SKB_CB(tail)->tcp_flags ^
1849 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1850#ifdef CONFIG_TLS_DEVICE
1851 tail->decrypted != skb->decrypted ||
1852#endif
1853 thtail->doff != th->doff ||
1854 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1855 goto no_coalesce;
1856
1857 __skb_pull(skb, hdrlen);
b160c285
ED
1858
1859 shinfo = skb_shinfo(skb);
1860 gso_size = shinfo->gso_size ?: skb->len;
1861 gso_segs = shinfo->gso_segs ?: 1;
1862
1863 shinfo = skb_shinfo(tail);
1864 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1865 tail_gso_segs = shinfo->gso_segs ?: 1;
1866
4f693b55 1867 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
4f693b55
ED
1868 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1869
86bccd03 1870 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
4f693b55 1871 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
86bccd03
ED
1872 thtail->window = th->window;
1873 }
4f693b55 1874
ca2fe295
ED
1875 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1876 * thtail->fin, so that the fast path in tcp_rcv_established()
1877 * is not entered if we append a packet with a FIN.
1878 * SYN, RST, URG are not present.
1879 * ACK is set on both packets.
 1880 * PSH: the TCP stack does not really care,
 1881 * at least for 'GRO' packets.
1882 */
1883 thtail->fin |= th->fin;
4f693b55
ED
1884 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1885
1886 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1887 TCP_SKB_CB(tail)->has_rxtstamp = true;
1888 tail->tstamp = skb->tstamp;
1889 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1890 }
1891
 1892 /* Not as strict as GRO. We only need to carry the max mss value */
b160c285
ED
1893 shinfo->gso_size = max(gso_size, tail_gso_size);
1894 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
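	/* e.g. appending a 2-segment skb to a tail already holding 3
	 * segments of the same gso_size leaves gso_size unchanged and
	 * makes the tail advertise 5 segments (capped at 0xFFFF).
	 */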
4f693b55
ED
1895
1896 sk->sk_backlog.len += delta;
1897 __NET_INC_STATS(sock_net(sk),
1898 LINUX_MIB_TCPBACKLOGCOALESCE);
1899 kfree_skb_partial(skb, fragstolen);
1900 return false;
1901 }
1902 __skb_push(skb, hdrlen);
1903
1904no_coalesce:
 1905 /* Only the socket owner can try to collapse/prune rx queues
 1906 * to reduce memory overhead, so add a little headroom here.
 1907 * Only a few socket backlogs are likely to be non-empty concurrently.
1908 */
1909 limit += 64*1024;
1910
c9c33212
ED
1911 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1912 bh_unlock_sock(sk);
1913 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1914 return true;
1915 }
1916 return false;
1917}
1918EXPORT_SYMBOL(tcp_add_backlog);
1919
ac6e7800
ED
1920int tcp_filter(struct sock *sk, struct sk_buff *skb)
1921{
1922 struct tcphdr *th = (struct tcphdr *)skb->data;
ac6e7800 1923
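	/* Run the attached socket filter, but never let it trim the skb
	 * below the TCP header (th->doff * 4 bytes).
	 */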
f2feaefd 1924 return sk_filter_trim_cap(sk, skb, th->doff * 4);
ac6e7800
ED
1925}
1926EXPORT_SYMBOL(tcp_filter);
1927
eeea10b8
ED
1928static void tcp_v4_restore_cb(struct sk_buff *skb)
1929{
1930 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1931 sizeof(struct inet_skb_parm));
1932}
1933
1934static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1935 const struct tcphdr *th)
1936{
 1937 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
 1938 * barrier() makes sure the compiler won't play fool^Waliasing games.
1939 */
1940 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1941 sizeof(struct inet_skb_parm));
1942 barrier();
1943
1944 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
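	/* SYN and FIN each consume one unit of sequence space, hence the
	 * th->syn and th->fin terms in end_seq below, on top of the payload.
	 */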
1945 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1946 skb->len - th->doff * 4);
1947 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1948 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1949 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1950 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1951 TCP_SKB_CB(skb)->sacked = 0;
1952 TCP_SKB_CB(skb)->has_rxtstamp =
1953 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1954}
1955
1da177e4
LT
1956/*
1957 * From tcp_input.c
1958 */
1959
1960int tcp_v4_rcv(struct sk_buff *skb)
1961{
3b24d854 1962 struct net *net = dev_net(skb->dev);
8b27dae5 1963 struct sk_buff *skb_to_free;
3fa6f616 1964 int sdif = inet_sdif(skb);
534322ca 1965 int dif = inet_iif(skb);
eddc9ec5 1966 const struct iphdr *iph;
cf533ea5 1967 const struct tcphdr *th;
3b24d854 1968 bool refcounted;
1da177e4
LT
1969 struct sock *sk;
1970 int ret;
1971
1972 if (skb->pkt_type != PACKET_HOST)
1973 goto discard_it;
1974
1975 /* Count it even if it's bad */
90bbcc60 1976 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1da177e4
LT
1977
1978 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1979 goto discard_it;
1980
ea1627c2 1981 th = (const struct tcphdr *)skb->data;
1da177e4 1982
ea1627c2 1983 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1da177e4
LT
1984 goto bad_packet;
1985 if (!pskb_may_pull(skb, th->doff * 4))
1986 goto discard_it;
1987
1988 /* An explanation is required here, I think.
1989 * Packet length and doff are validated by header prediction,
caa20d9a 1990 * provided the case of th->doff==0 is eliminated.
1da177e4 1991 * So, we defer the checks. */
ed70fcfc
TH
1992
1993 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1994 goto csum_error;
1da177e4 1995
ea1627c2 1996 th = (const struct tcphdr *)skb->data;
eddc9ec5 1997 iph = ip_hdr(skb);
4bdc3d66 1998lookup:
a583636a 1999 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
3fa6f616 2000 th->dest, sdif, &refcounted);
1da177e4
LT
2001 if (!sk)
2002 goto no_tcp_socket;
2003
bb134d5d
ED
2004process:
2005 if (sk->sk_state == TCP_TIME_WAIT)
2006 goto do_time_wait;
2007
079096f1
ED
2008 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2009 struct request_sock *req = inet_reqsk(sk);
e0f9759f 2010 bool req_stolen = false;
7716682c 2011 struct sock *nsk;
079096f1
ED
2012
2013 sk = req->rsk_listener;
534322ca 2014 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
e65c332d 2015 sk_drops_add(sk, skb);
72923555
ED
2016 reqsk_put(req);
2017 goto discard_it;
2018 }
4fd44a98
FL
2019 if (tcp_checksum_complete(skb)) {
2020 reqsk_put(req);
2021 goto csum_error;
2022 }
7716682c 2023 if (unlikely(sk->sk_state != TCP_LISTEN)) {
d4f2c86b
KI
2024 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2025 if (!nsk) {
2026 inet_csk_reqsk_queue_drop_and_put(sk, req);
2027 goto lookup;
2028 }
2029 sk = nsk;
 2030 /* reuseport_migrate_sock() has already taken one sk_refcnt
 2031 * before returning.
2032 */
2033 } else {
2034 /* We own a reference on the listener, increase it again
2035 * as we might lose it too soon.
2036 */
2037 sock_hold(sk);
4bdc3d66 2038 }
3b24d854 2039 refcounted = true;
1f3b359f 2040 nsk = NULL;
eeea10b8
ED
2041 if (!tcp_filter(sk, skb)) {
2042 th = (const struct tcphdr *)skb->data;
2043 iph = ip_hdr(skb);
2044 tcp_v4_fill_cb(skb, iph, th);
e0f9759f 2045 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
eeea10b8 2046 }
079096f1
ED
2047 if (!nsk) {
2048 reqsk_put(req);
e0f9759f
ED
2049 if (req_stolen) {
2050 /* Another cpu got exclusive access to req
2051 * and created a full blown socket.
2052 * Try to feed this packet to this socket
2053 * instead of discarding it.
2054 */
2055 tcp_v4_restore_cb(skb);
2056 sock_put(sk);
2057 goto lookup;
2058 }
7716682c 2059 goto discard_and_relse;
079096f1
ED
2060 }
2061 if (nsk == sk) {
079096f1 2062 reqsk_put(req);
eeea10b8 2063 tcp_v4_restore_cb(skb);
079096f1
ED
2064 } else if (tcp_child_process(sk, nsk, skb)) {
2065 tcp_v4_send_reset(nsk, skb);
7716682c 2066 goto discard_and_relse;
079096f1 2067 } else {
7716682c 2068 sock_put(sk);
079096f1
ED
2069 return 0;
2070 }
2071 }
6cce09f8 2072 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
02a1d6e7 2073 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 2074 goto discard_and_relse;
6cce09f8 2075 }
d218d111 2076
1da177e4
LT
2077 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2078 goto discard_and_relse;
9ea88a15 2079
534322ca 2080 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
9ea88a15 2081 goto discard_and_relse;
9ea88a15 2082
895b5c9f 2083 nf_reset_ct(skb);
1da177e4 2084
ac6e7800 2085 if (tcp_filter(sk, skb))
1da177e4 2086 goto discard_and_relse;
ac6e7800
ED
2087 th = (const struct tcphdr *)skb->data;
2088 iph = ip_hdr(skb);
eeea10b8 2089 tcp_v4_fill_cb(skb, iph, th);
1da177e4
LT
2090
2091 skb->dev = NULL;
2092
e994b2f0
ED
2093 if (sk->sk_state == TCP_LISTEN) {
2094 ret = tcp_v4_do_rcv(sk, skb);
2095 goto put_and_return;
2096 }
2097
2098 sk_incoming_cpu_update(sk);
2099
c6366184 2100 bh_lock_sock_nested(sk);
a44d6eac 2101 tcp_segs_in(tcp_sk(sk), skb);
1da177e4
LT
2102 ret = 0;
2103 if (!sock_owned_by_user(sk)) {
8b27dae5
ED
2104 skb_to_free = sk->sk_rx_skb_cache;
2105 sk->sk_rx_skb_cache = NULL;
e7942d06 2106 ret = tcp_v4_do_rcv(sk, skb);
8b27dae5
ED
2107 } else {
2108 if (tcp_add_backlog(sk, skb))
2109 goto discard_and_relse;
2110 skb_to_free = NULL;
6b03a53a 2111 }
1da177e4 2112 bh_unlock_sock(sk);
8b27dae5
ED
2113 if (skb_to_free)
2114 __kfree_skb(skb_to_free);
1da177e4 2115
e994b2f0 2116put_and_return:
3b24d854
ED
2117 if (refcounted)
2118 sock_put(sk);
1da177e4
LT
2119
2120 return ret;
2121
2122no_tcp_socket:
2123 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2124 goto discard_it;
2125
eeea10b8
ED
2126 tcp_v4_fill_cb(skb, iph, th);
2127
12e25e10 2128 if (tcp_checksum_complete(skb)) {
6a5dc9e5 2129csum_error:
709c0314 2130 trace_tcp_bad_csum(skb);
90bbcc60 2131 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1da177e4 2132bad_packet:
90bbcc60 2133 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1da177e4 2134 } else {
cfb6eeb4 2135 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
2136 }
2137
2138discard_it:
2139 /* Discard frame. */
2140 kfree_skb(skb);
e905a9ed 2141 return 0;
1da177e4
LT
2142
2143discard_and_relse:
532182cd 2144 sk_drops_add(sk, skb);
3b24d854
ED
2145 if (refcounted)
2146 sock_put(sk);
1da177e4
LT
2147 goto discard_it;
2148
2149do_time_wait:
2150 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 2151 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
2152 goto discard_it;
2153 }
2154
eeea10b8
ED
2155 tcp_v4_fill_cb(skb, iph, th);
2156
6a5dc9e5
ED
2157 if (tcp_checksum_complete(skb)) {
2158 inet_twsk_put(inet_twsk(sk));
2159 goto csum_error;
1da177e4 2160 }
9469c7b4 2161 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 2162 case TCP_TW_SYN: {
c346dca1 2163 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
a583636a
CG
2164 &tcp_hashinfo, skb,
2165 __tcp_hdrlen(th),
da5e3630 2166 iph->saddr, th->source,
eddc9ec5 2167 iph->daddr, th->dest,
3fa6f616
DA
2168 inet_iif(skb),
2169 sdif);
1da177e4 2170 if (sk2) {
dbe7faa4 2171 inet_twsk_deschedule_put(inet_twsk(sk));
1da177e4 2172 sk = sk2;
eeea10b8 2173 tcp_v4_restore_cb(skb);
3b24d854 2174 refcounted = false;
1da177e4
LT
2175 goto process;
2176 }
1da177e4 2177 }
fcfd6dfa 2178 /* to ACK */
a8eceea8 2179 fallthrough;
1da177e4
LT
2180 case TCP_TW_ACK:
2181 tcp_v4_timewait_ack(sk, skb);
2182 break;
2183 case TCP_TW_RST:
271c3b9b
FW
2184 tcp_v4_send_reset(sk, skb);
2185 inet_twsk_deschedule_put(inet_twsk(sk));
2186 goto discard_it;
1da177e4
LT
2187 case TCP_TW_SUCCESS:;
2188 }
2189 goto discard_it;
2190}
2191
ccb7c410
DM
2192static struct timewait_sock_ops tcp_timewait_sock_ops = {
2193 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2194 .twsk_unique = tcp_twsk_unique,
2195 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 2196};
1da177e4 2197
63d02d15 2198void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5d299f3d
ED
2199{
2200 struct dst_entry *dst = skb_dst(skb);
2201
5037e9ef 2202 if (dst && dst_hold_safe(dst)) {
ca777eff
ED
2203 sk->sk_rx_dst = dst;
2204 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2205 }
5d299f3d 2206}
63d02d15 2207EXPORT_SYMBOL(inet_sk_rx_dst_set);
5d299f3d 2208
3b401a81 2209const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
2210 .queue_xmit = ip_queue_xmit,
2211 .send_check = tcp_v4_send_check,
2212 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 2213 .sk_rx_dst_set = inet_sk_rx_dst_set,
543d9cfe
ACM
2214 .conn_request = tcp_v4_conn_request,
2215 .syn_recv_sock = tcp_v4_syn_recv_sock,
543d9cfe
ACM
2216 .net_header_len = sizeof(struct iphdr),
2217 .setsockopt = ip_setsockopt,
2218 .getsockopt = ip_getsockopt,
2219 .addr2sockaddr = inet_csk_addr2sockaddr,
2220 .sockaddr_len = sizeof(struct sockaddr_in),
4fab9071 2221 .mtu_reduced = tcp_v4_mtu_reduced,
1da177e4 2222};
4bc2f18b 2223EXPORT_SYMBOL(ipv4_specific);
1da177e4 2224
cfb6eeb4 2225#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 2226static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 2227 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 2228 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 2229 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 2230};
b6332e6c 2231#endif
cfb6eeb4 2232
1da177e4
LT
2233/* NOTE: A lot of things are set to zero explicitly by the call to
 2234 * sk_alloc(), so they need not be done here.
2235 */
2236static int tcp_v4_init_sock(struct sock *sk)
2237{
6687e988 2238 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 2239
900f65d3 2240 tcp_init_sock(sk);
1da177e4 2241
8292a17a 2242 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 2243
cfb6eeb4 2244#ifdef CONFIG_TCP_MD5SIG
ac807fa8 2245 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 2246#endif
1da177e4 2247
1da177e4
LT
2248 return 0;
2249}
2250
7d06b2e0 2251void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
2252{
2253 struct tcp_sock *tp = tcp_sk(sk);
2254
e1a4aa50
SL
2255 trace_tcp_destroy_sock(sk);
2256
1da177e4
LT
2257 tcp_clear_xmit_timers(sk);
2258
6687e988 2259 tcp_cleanup_congestion_control(sk);
317a76f9 2260
734942cc
DW
2261 tcp_cleanup_ulp(sk);
2262
1da177e4 2263 /* Clean up the write buffer. */
fe067e8a 2264 tcp_write_queue_purge(sk);
1da177e4 2265
cf1ef3f0
WW
2266 /* Check if we want to disable active TFO */
2267 tcp_fastopen_active_disable_ofo_check(sk);
2268
1da177e4 2269 /* Cleans up our, hopefully empty, out_of_order_queue. */
9f5afeae 2270 skb_rbtree_purge(&tp->out_of_order_queue);
1da177e4 2271
cfb6eeb4
YH
2272#ifdef CONFIG_TCP_MD5SIG
2273 /* Clean up the MD5 key list, if any */
2274 if (tp->md5sig_info) {
a915da9b 2275 tcp_clear_md5_list(sk);
fb7df5e4 2276 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
cfb6eeb4
YH
2277 tp->md5sig_info = NULL;
2278 }
2279#endif
1a2449a8 2280
1da177e4 2281 /* Clean up a referenced TCP bind bucket. */
463c84b9 2282 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 2283 inet_put_port(sk);
1da177e4 2284
d983ea6f 2285 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
435cf559 2286
cf60af03
YC
2287 /* If socket is aborted during connect operation */
2288 tcp_free_fastopen_req(tp);
1fba70e5 2289 tcp_fastopen_destroy_cipher(sk);
cd8ae852 2290 tcp_saved_syn_free(tp);
cf60af03 2291
180d8cd9 2292 sk_sockets_allocated_dec(sk);
1da177e4 2293}
1da177e4
LT
2294EXPORT_SYMBOL(tcp_v4_destroy_sock);
2295
2296#ifdef CONFIG_PROC_FS
2297/* Proc filesystem TCP sock list dumping. */
2298
ad2d6137
MKL
2299static unsigned short seq_file_family(const struct seq_file *seq);
2300
2301static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2302{
2303 unsigned short family = seq_file_family(seq);
2304
2305 /* AF_UNSPEC is used as a match all */
2306 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2307 net_eq(sock_net(sk), seq_file_net(seq)));
2308}
2309
b72acf45
MKL
 2310 /* Find a non-empty bucket (starting from st->bucket)
2311 * and return the first sk from it.
a8b690f9 2312 */
b72acf45 2313static void *listening_get_first(struct seq_file *seq)
1da177e4 2314{
5799de0b 2315 struct tcp_iter_state *st = seq->private;
1da177e4 2316
b72acf45 2317 st->offset = 0;
05c0b357
MKL
2318 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2319 struct inet_listen_hashbucket *ilb2;
2320 struct inet_connection_sock *icsk;
b72acf45 2321 struct sock *sk;
b08d4d3b 2322
05c0b357
MKL
2323 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2324 if (hlist_empty(&ilb2->head))
b72acf45
MKL
2325 continue;
2326
05c0b357
MKL
2327 spin_lock(&ilb2->lock);
2328 inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2329 sk = (struct sock *)icsk;
b72acf45
MKL
2330 if (seq_sk_match(seq, sk))
2331 return sk;
2332 }
05c0b357 2333 spin_unlock(&ilb2->lock);
1da177e4 2334 }
b72acf45
MKL
2335
2336 return NULL;
2337}
2338
2339/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
 2340 * If "cur" is the last one in st->bucket,
 2341 * call listening_get_first() to return the first sk of the next
 2342 * non-empty bucket.
a8b690f9 2343 */
1da177e4
LT
2344static void *listening_get_next(struct seq_file *seq, void *cur)
2345{
5799de0b 2346 struct tcp_iter_state *st = seq->private;
05c0b357
MKL
2347 struct inet_listen_hashbucket *ilb2;
2348 struct inet_connection_sock *icsk;
3b24d854 2349 struct sock *sk = cur;
1da177e4 2350
1da177e4 2351 ++st->num;
a8b690f9 2352 ++st->offset;
1da177e4 2353
05c0b357
MKL
2354 icsk = inet_csk(sk);
2355 inet_lhash2_for_each_icsk_continue(icsk) {
2356 sk = (struct sock *)icsk;
ad2d6137 2357 if (seq_sk_match(seq, sk))
3b24d854 2358 return sk;
1da177e4 2359 }
b72acf45 2360
05c0b357
MKL
2361 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2362 spin_unlock(&ilb2->lock);
b72acf45
MKL
2363 ++st->bucket;
2364 return listening_get_first(seq);
1da177e4
LT
2365}
2366
2367static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2368{
a8b690f9
TH
2369 struct tcp_iter_state *st = seq->private;
2370 void *rc;
2371
2372 st->bucket = 0;
2373 st->offset = 0;
b72acf45 2374 rc = listening_get_first(seq);
1da177e4
LT
2375
2376 while (rc && *pos) {
2377 rc = listening_get_next(seq, rc);
2378 --*pos;
2379 }
2380 return rc;
2381}
2382
05dbc7b5 2383static inline bool empty_bucket(const struct tcp_iter_state *st)
6eac5604 2384{
05dbc7b5 2385 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
6eac5604
AK
2386}
2387
a8b690f9
TH
2388/*
2389 * Get first established socket starting from bucket given in st->bucket.
2390 * If st->bucket is zero, the very first socket in the hash is returned.
2391 */
1da177e4
LT
2392static void *established_get_first(struct seq_file *seq)
2393{
5799de0b 2394 struct tcp_iter_state *st = seq->private;
b08d4d3b 2395
a8b690f9
TH
2396 st->offset = 0;
2397 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 2398 struct sock *sk;
3ab5aee7 2399 struct hlist_nulls_node *node;
9db66bdc 2400 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 2401
6eac5604
AK
2402 /* Lockless fast path for the common case of empty buckets */
2403 if (empty_bucket(st))
2404 continue;
2405
9db66bdc 2406 spin_lock_bh(lock);
3ab5aee7 2407 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
ad2d6137
MKL
2408 if (seq_sk_match(seq, sk))
2409 return sk;
1da177e4 2410 }
9db66bdc 2411 spin_unlock_bh(lock);
1da177e4 2412 }
ad2d6137
MKL
2413
2414 return NULL;
1da177e4
LT
2415}
2416
2417static void *established_get_next(struct seq_file *seq, void *cur)
2418{
2419 struct sock *sk = cur;
3ab5aee7 2420 struct hlist_nulls_node *node;
5799de0b 2421 struct tcp_iter_state *st = seq->private;
b08d4d3b 2422
1da177e4 2423 ++st->num;
a8b690f9 2424 ++st->offset;
1da177e4 2425
05dbc7b5 2426 sk = sk_nulls_next(sk);
1da177e4 2427
3ab5aee7 2428 sk_nulls_for_each_from(sk, node) {
ad2d6137 2429 if (seq_sk_match(seq, sk))
05dbc7b5 2430 return sk;
1da177e4
LT
2431 }
2432
05dbc7b5
ED
2433 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2434 ++st->bucket;
2435 return established_get_first(seq);
1da177e4
LT
2436}
2437
2438static void *established_get_idx(struct seq_file *seq, loff_t pos)
2439{
a8b690f9
TH
2440 struct tcp_iter_state *st = seq->private;
2441 void *rc;
2442
2443 st->bucket = 0;
2444 rc = established_get_first(seq);
1da177e4
LT
2445
2446 while (rc && pos) {
2447 rc = established_get_next(seq, rc);
2448 --pos;
7174259e 2449 }
1da177e4
LT
2450 return rc;
2451}
2452
2453static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2454{
2455 void *rc;
5799de0b 2456 struct tcp_iter_state *st = seq->private;
1da177e4 2457
1da177e4
LT
2458 st->state = TCP_SEQ_STATE_LISTENING;
2459 rc = listening_get_idx(seq, &pos);
2460
2461 if (!rc) {
1da177e4
LT
2462 st->state = TCP_SEQ_STATE_ESTABLISHED;
2463 rc = established_get_idx(seq, pos);
2464 }
2465
2466 return rc;
2467}
2468
a8b690f9
TH
2469static void *tcp_seek_last_pos(struct seq_file *seq)
2470{
2471 struct tcp_iter_state *st = seq->private;
525e2f9f 2472 int bucket = st->bucket;
a8b690f9
TH
2473 int offset = st->offset;
2474 int orig_num = st->num;
2475 void *rc = NULL;
2476
2477 switch (st->state) {
a8b690f9 2478 case TCP_SEQ_STATE_LISTENING:
05c0b357 2479 if (st->bucket > tcp_hashinfo.lhash2_mask)
a8b690f9
TH
2480 break;
2481 st->state = TCP_SEQ_STATE_LISTENING;
b72acf45 2482 rc = listening_get_first(seq);
525e2f9f 2483 while (offset-- && rc && bucket == st->bucket)
a8b690f9
TH
2484 rc = listening_get_next(seq, rc);
2485 if (rc)
2486 break;
2487 st->bucket = 0;
05dbc7b5 2488 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8eceea8 2489 fallthrough;
a8b690f9 2490 case TCP_SEQ_STATE_ESTABLISHED:
a8b690f9
TH
2491 if (st->bucket > tcp_hashinfo.ehash_mask)
2492 break;
2493 rc = established_get_first(seq);
525e2f9f 2494 while (offset-- && rc && bucket == st->bucket)
a8b690f9
TH
2495 rc = established_get_next(seq, rc);
2496 }
2497
2498 st->num = orig_num;
2499
2500 return rc;
2501}
2502
37d849bb 2503void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1da177e4 2504{
5799de0b 2505 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2506 void *rc;
2507
2508 if (*pos && *pos == st->last_pos) {
2509 rc = tcp_seek_last_pos(seq);
2510 if (rc)
2511 goto out;
2512 }
2513
1da177e4
LT
2514 st->state = TCP_SEQ_STATE_LISTENING;
2515 st->num = 0;
a8b690f9
TH
2516 st->bucket = 0;
2517 st->offset = 0;
2518 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2519
2520out:
2521 st->last_pos = *pos;
2522 return rc;
1da177e4 2523}
37d849bb 2524EXPORT_SYMBOL(tcp_seq_start);
1da177e4 2525
37d849bb 2526void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1da177e4 2527{
a8b690f9 2528 struct tcp_iter_state *st = seq->private;
1da177e4 2529 void *rc = NULL;
1da177e4
LT
2530
2531 if (v == SEQ_START_TOKEN) {
2532 rc = tcp_get_idx(seq, 0);
2533 goto out;
2534 }
1da177e4
LT
2535
2536 switch (st->state) {
1da177e4
LT
2537 case TCP_SEQ_STATE_LISTENING:
2538 rc = listening_get_next(seq, v);
2539 if (!rc) {
1da177e4 2540 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2541 st->bucket = 0;
2542 st->offset = 0;
1da177e4
LT
2543 rc = established_get_first(seq);
2544 }
2545 break;
2546 case TCP_SEQ_STATE_ESTABLISHED:
1da177e4
LT
2547 rc = established_get_next(seq, v);
2548 break;
2549 }
2550out:
2551 ++*pos;
a8b690f9 2552 st->last_pos = *pos;
1da177e4
LT
2553 return rc;
2554}
37d849bb 2555EXPORT_SYMBOL(tcp_seq_next);
1da177e4 2556
37d849bb 2557void tcp_seq_stop(struct seq_file *seq, void *v)
1da177e4 2558{
5799de0b 2559 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2560
2561 switch (st->state) {
1da177e4
LT
2562 case TCP_SEQ_STATE_LISTENING:
2563 if (v != SEQ_START_TOKEN)
05c0b357 2564 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
1da177e4 2565 break;
1da177e4
LT
2566 case TCP_SEQ_STATE_ESTABLISHED:
2567 if (v)
9db66bdc 2568 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2569 break;
2570 }
2571}
37d849bb 2572EXPORT_SYMBOL(tcp_seq_stop);
1da177e4 2573
d4f06873 2574static void get_openreq4(const struct request_sock *req,
aa3a0c8c 2575 struct seq_file *f, int i)
1da177e4 2576{
2e6599cb 2577 const struct inet_request_sock *ireq = inet_rsk(req);
fa76ce73 2578 long delta = req->rsk_timer.expires - jiffies;
1da177e4 2579
5e659e4c 2580 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2581 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
1da177e4 2582 i,
634fb979 2583 ireq->ir_loc_addr,
d4f06873 2584 ireq->ir_num,
634fb979
ED
2585 ireq->ir_rmt_addr,
2586 ntohs(ireq->ir_rmt_port),
1da177e4
LT
2587 TCP_SYN_RECV,
2588 0, 0, /* could print option size, but that is af dependent. */
2589 1, /* timers active (only the expire timer) */
a399a805 2590 jiffies_delta_to_clock_t(delta),
e6c022a4 2591 req->num_timeout,
aa3a0c8c
ED
2592 from_kuid_munged(seq_user_ns(f),
2593 sock_i_uid(req->rsk_listener)),
1da177e4
LT
2594 0, /* non standard timer */
2595 0, /* open_requests have no inode */
d4f06873 2596 0,
652586df 2597 req);
1da177e4
LT
2598}
2599
652586df 2600static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
1da177e4
LT
2601{
2602 int timer_active;
2603 unsigned long timer_expires;
cf533ea5 2604 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2605 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2606 const struct inet_sock *inet = inet_sk(sk);
0536fcc0 2607 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
c720c7e8
ED
2608 __be32 dest = inet->inet_daddr;
2609 __be32 src = inet->inet_rcv_saddr;
2610 __u16 destp = ntohs(inet->inet_dport);
2611 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2612 int rx_queue;
00fd38d9 2613 int state;
1da177e4 2614
6ba8a3b1 2615 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
57dde7f7 2616 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
6ba8a3b1 2617 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2618 timer_active = 1;
463c84b9
ACM
2619 timer_expires = icsk->icsk_timeout;
2620 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2621 timer_active = 4;
463c84b9 2622 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2623 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2624 timer_active = 2;
cf4c6bf8 2625 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2626 } else {
2627 timer_active = 0;
2628 timer_expires = jiffies;
2629 }
2630
986ffdfd 2631 state = inet_sk_state_load(sk);
00fd38d9 2632 if (state == TCP_LISTEN)
288efe86 2633 rx_queue = READ_ONCE(sk->sk_ack_backlog);
49d09007 2634 else
00fd38d9
ED
2635 /* Because we don't lock the socket,
2636 * we might find a transient negative value.
49d09007 2637 */
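		/* rcv_nxt - copied_seq is the amount of received data not yet
		 * read by the application; clamp it at zero.
		 */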
dba7d9b8 2638 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
7db48e98 2639 READ_ONCE(tp->copied_seq), 0);
49d09007 2640
5e659e4c 2641 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
652586df 2642 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
00fd38d9 2643 i, src, srcp, dest, destp, state,
0f317464 2644 READ_ONCE(tp->write_seq) - tp->snd_una,
49d09007 2645 rx_queue,
1da177e4 2646 timer_active,
a399a805 2647 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2648 icsk->icsk_retransmits,
a7cb5a49 2649 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2650 icsk->icsk_probes_out,
cf4c6bf8 2651 sock_i_ino(sk),
41c6d650 2652 refcount_read(&sk->sk_refcnt), sk,
7be87351
SH
2653 jiffies_to_clock_t(icsk->icsk_rto),
2654 jiffies_to_clock_t(icsk->icsk_ack.ato),
31954cd8 2655 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
1da177e4 2656 tp->snd_cwnd,
00fd38d9
ED
2657 state == TCP_LISTEN ?
2658 fastopenq->max_qlen :
652586df 2659 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
1da177e4
LT
2660}
2661
cf533ea5 2662static void get_timewait4_sock(const struct inet_timewait_sock *tw,
652586df 2663 struct seq_file *f, int i)
1da177e4 2664{
789f558c 2665 long delta = tw->tw_timer.expires - jiffies;
23f33c2d 2666 __be32 dest, src;
1da177e4 2667 __u16 destp, srcp;
1da177e4
LT
2668
2669 dest = tw->tw_daddr;
2670 src = tw->tw_rcv_saddr;
2671 destp = ntohs(tw->tw_dport);
2672 srcp = ntohs(tw->tw_sport);
2673
5e659e4c 2674 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2675 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
1da177e4 2676 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2677 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
41c6d650 2678 refcount_read(&tw->tw_refcnt), tw);
1da177e4
LT
2679}
2680
2681#define TMPSZ 150
2682
2683static int tcp4_seq_show(struct seq_file *seq, void *v)
2684{
5799de0b 2685 struct tcp_iter_state *st;
05dbc7b5 2686 struct sock *sk = v;
1da177e4 2687
652586df 2688 seq_setwidth(seq, TMPSZ - 1);
1da177e4 2689 if (v == SEQ_START_TOKEN) {
652586df 2690 seq_puts(seq, " sl local_address rem_address st tx_queue "
1da177e4
LT
2691 "rx_queue tr tm->when retrnsmt uid timeout "
2692 "inode");
2693 goto out;
2694 }
2695 st = seq->private;
2696
079096f1
ED
2697 if (sk->sk_state == TCP_TIME_WAIT)
2698 get_timewait4_sock(v, seq, st->num);
2699 else if (sk->sk_state == TCP_NEW_SYN_RECV)
aa3a0c8c 2700 get_openreq4(v, seq, st->num);
079096f1
ED
2701 else
2702 get_tcp4_sock(v, seq, st->num);
1da177e4 2703out:
652586df 2704 seq_pad(seq, '\n');
1da177e4
LT
2705 return 0;
2706}
2707
52d87d5f 2708#ifdef CONFIG_BPF_SYSCALL
04c7820b
MKL
2709struct bpf_tcp_iter_state {
2710 struct tcp_iter_state state;
2711 unsigned int cur_sk;
2712 unsigned int end_sk;
2713 unsigned int max_sk;
2714 struct sock **batch;
2715 bool st_bucket_done;
2716};
2717
52d87d5f
YS
2718struct bpf_iter__tcp {
2719 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2720 __bpf_md_ptr(struct sock_common *, sk_common);
2721 uid_t uid __aligned(8);
2722};
2723
2724static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2725 struct sock_common *sk_common, uid_t uid)
2726{
2727 struct bpf_iter__tcp ctx;
2728
2729 meta->seq_num--; /* skip SEQ_START_TOKEN */
2730 ctx.meta = meta;
2731 ctx.sk_common = sk_common;
2732 ctx.uid = uid;
2733 return bpf_iter_run_prog(prog, &ctx);
2734}
2735
04c7820b
MKL
2736static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2737{
2738 while (iter->cur_sk < iter->end_sk)
2739 sock_put(iter->batch[iter->cur_sk++]);
2740}
2741
2742static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2743 unsigned int new_batch_sz)
2744{
2745 struct sock **new_batch;
2746
2747 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2748 GFP_USER | __GFP_NOWARN);
2749 if (!new_batch)
2750 return -ENOMEM;
2751
2752 bpf_iter_tcp_put_batch(iter);
2753 kvfree(iter->batch);
2754 iter->batch = new_batch;
2755 iter->max_sk = new_batch_sz;
2756
2757 return 0;
2758}
2759
2760static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2761 struct sock *start_sk)
2762{
2763 struct bpf_tcp_iter_state *iter = seq->private;
2764 struct tcp_iter_state *st = &iter->state;
2765 struct inet_connection_sock *icsk;
2766 unsigned int expected = 1;
2767 struct sock *sk;
2768
2769 sock_hold(start_sk);
2770 iter->batch[iter->end_sk++] = start_sk;
2771
2772 icsk = inet_csk(start_sk);
2773 inet_lhash2_for_each_icsk_continue(icsk) {
2774 sk = (struct sock *)icsk;
2775 if (seq_sk_match(seq, sk)) {
2776 if (iter->end_sk < iter->max_sk) {
2777 sock_hold(sk);
2778 iter->batch[iter->end_sk++] = sk;
2779 }
2780 expected++;
2781 }
2782 }
2783 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2784
2785 return expected;
2786}
2787
2788static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2789 struct sock *start_sk)
2790{
2791 struct bpf_tcp_iter_state *iter = seq->private;
2792 struct tcp_iter_state *st = &iter->state;
2793 struct hlist_nulls_node *node;
2794 unsigned int expected = 1;
2795 struct sock *sk;
2796
2797 sock_hold(start_sk);
2798 iter->batch[iter->end_sk++] = start_sk;
2799
2800 sk = sk_nulls_next(start_sk);
2801 sk_nulls_for_each_from(sk, node) {
2802 if (seq_sk_match(seq, sk)) {
2803 if (iter->end_sk < iter->max_sk) {
2804 sock_hold(sk);
2805 iter->batch[iter->end_sk++] = sk;
2806 }
2807 expected++;
2808 }
2809 }
2810 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2811
2812 return expected;
2813}
2814
2815static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2816{
2817 struct bpf_tcp_iter_state *iter = seq->private;
2818 struct tcp_iter_state *st = &iter->state;
2819 unsigned int expected;
2820 bool resized = false;
2821 struct sock *sk;
2822
 2823 /* The st->bucket is done. Directly advance to the next
 2824 * bucket instead of letting tcp_seek_last_pos() skip entries
 2825 * one by one in the current bucket, only to find out
 2826 * it has to advance to the next bucket.
2827 */
2828 if (iter->st_bucket_done) {
2829 st->offset = 0;
2830 st->bucket++;
2831 if (st->state == TCP_SEQ_STATE_LISTENING &&
2832 st->bucket > tcp_hashinfo.lhash2_mask) {
2833 st->state = TCP_SEQ_STATE_ESTABLISHED;
2834 st->bucket = 0;
2835 }
2836 }
2837
2838again:
2839 /* Get a new batch */
2840 iter->cur_sk = 0;
2841 iter->end_sk = 0;
2842 iter->st_bucket_done = false;
2843
2844 sk = tcp_seek_last_pos(seq);
2845 if (!sk)
2846 return NULL; /* Done */
2847
2848 if (st->state == TCP_SEQ_STATE_LISTENING)
2849 expected = bpf_iter_tcp_listening_batch(seq, sk);
2850 else
2851 expected = bpf_iter_tcp_established_batch(seq, sk);
2852
2853 if (iter->end_sk == expected) {
2854 iter->st_bucket_done = true;
2855 return sk;
2856 }
2857
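	/* The batch was too small for this bucket: grow it to ~1.5x the
	 * number of matching sockets seen and rescan the bucket once.
	 * If it still does not fit, a partial batch is returned below.
	 */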
2858 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2859 resized = true;
2860 goto again;
2861 }
2862
2863 return sk;
2864}
2865
2866static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2867{
2868 /* bpf iter does not support lseek, so it always
2869 * continue from where it was stop()-ped.
2870 */
2871 if (*pos)
2872 return bpf_iter_tcp_batch(seq);
2873
2874 return SEQ_START_TOKEN;
2875}
2876
2877static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2878{
2879 struct bpf_tcp_iter_state *iter = seq->private;
2880 struct tcp_iter_state *st = &iter->state;
2881 struct sock *sk;
2882
2883 /* Whenever seq_next() is called, the iter->cur_sk is
2884 * done with seq_show(), so advance to the next sk in
2885 * the batch.
2886 */
2887 if (iter->cur_sk < iter->end_sk) {
2888 /* Keeping st->num consistent in tcp_iter_state.
2889 * bpf_iter_tcp does not use st->num.
2890 * meta.seq_num is used instead.
2891 */
2892 st->num++;
2893 /* Move st->offset to the next sk in the bucket such that
2894 * the future start() will resume at st->offset in
2895 * st->bucket. See tcp_seek_last_pos().
2896 */
2897 st->offset++;
2898 sock_put(iter->batch[iter->cur_sk++]);
2899 }
2900
2901 if (iter->cur_sk < iter->end_sk)
2902 sk = iter->batch[iter->cur_sk];
2903 else
2904 sk = bpf_iter_tcp_batch(seq);
2905
2906 ++*pos;
2907 /* Keeping st->last_pos consistent in tcp_iter_state.
 2908 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2909 */
2910 st->last_pos = *pos;
2911 return sk;
2912}
2913
52d87d5f
YS
2914static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2915{
2916 struct bpf_iter_meta meta;
2917 struct bpf_prog *prog;
2918 struct sock *sk = v;
04c7820b 2919 bool slow;
52d87d5f 2920 uid_t uid;
04c7820b 2921 int ret;
52d87d5f
YS
2922
2923 if (v == SEQ_START_TOKEN)
2924 return 0;
2925
04c7820b
MKL
2926 if (sk_fullsock(sk))
2927 slow = lock_sock_fast(sk);
2928
2929 if (unlikely(sk_unhashed(sk))) {
2930 ret = SEQ_SKIP;
2931 goto unlock;
2932 }
2933
52d87d5f
YS
2934 if (sk->sk_state == TCP_TIME_WAIT) {
2935 uid = 0;
2936 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2937 const struct request_sock *req = v;
2938
2939 uid = from_kuid_munged(seq_user_ns(seq),
2940 sock_i_uid(req->rsk_listener));
2941 } else {
2942 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2943 }
2944
2945 meta.seq = seq;
2946 prog = bpf_iter_get_info(&meta, false);
04c7820b
MKL
2947 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2948
2949unlock:
2950 if (sk_fullsock(sk))
2951 unlock_sock_fast(sk, slow);
2952 return ret;
2953
52d87d5f
YS
2954}
2955
2956static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2957{
04c7820b 2958 struct bpf_tcp_iter_state *iter = seq->private;
52d87d5f
YS
2959 struct bpf_iter_meta meta;
2960 struct bpf_prog *prog;
2961
2962 if (!v) {
2963 meta.seq = seq;
2964 prog = bpf_iter_get_info(&meta, true);
2965 if (prog)
2966 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2967 }
2968
04c7820b
MKL
2969 if (iter->cur_sk < iter->end_sk) {
2970 bpf_iter_tcp_put_batch(iter);
2971 iter->st_bucket_done = false;
2972 }
52d87d5f
YS
2973}
2974
2975static const struct seq_operations bpf_iter_tcp_seq_ops = {
2976 .show = bpf_iter_tcp_seq_show,
04c7820b
MKL
2977 .start = bpf_iter_tcp_seq_start,
2978 .next = bpf_iter_tcp_seq_next,
52d87d5f
YS
2979 .stop = bpf_iter_tcp_seq_stop,
2980};
2981#endif
ad2d6137
MKL
2982static unsigned short seq_file_family(const struct seq_file *seq)
2983{
62001372 2984 const struct tcp_seq_afinfo *afinfo;
ad2d6137 2985
62001372 2986#ifdef CONFIG_BPF_SYSCALL
ad2d6137 2987 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
62001372 2988 if (seq->op == &bpf_iter_tcp_seq_ops)
ad2d6137 2989 return AF_UNSPEC;
52d87d5f
YS
2990#endif
2991
ad2d6137
MKL
2992 /* Iterated from proc fs */
2993 afinfo = PDE_DATA(file_inode(seq->file));
2994 return afinfo->family;
2995}
52d87d5f 2996
37d849bb
CH
2997static const struct seq_operations tcp4_seq_ops = {
2998 .show = tcp4_seq_show,
2999 .start = tcp_seq_start,
3000 .next = tcp_seq_next,
3001 .stop = tcp_seq_stop,
3002};
3003
1da177e4 3004static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4 3005 .family = AF_INET,
1da177e4
LT
3006};
3007
2c8c1e72 3008static int __net_init tcp4_proc_init_net(struct net *net)
757764f6 3009{
c3506372
CH
3010 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3011 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
37d849bb
CH
3012 return -ENOMEM;
3013 return 0;
757764f6
PE
3014}
3015
2c8c1e72 3016static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6 3017{
37d849bb 3018 remove_proc_entry("tcp", net->proc_net);
757764f6
PE
3019}
3020
3021static struct pernet_operations tcp4_net_ops = {
3022 .init = tcp4_proc_init_net,
3023 .exit = tcp4_proc_exit_net,
3024};
3025
1da177e4
LT
3026int __init tcp4_proc_init(void)
3027{
757764f6 3028 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
3029}
3030
3031void tcp4_proc_exit(void)
3032{
757764f6 3033 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
3034}
3035#endif /* CONFIG_PROC_FS */
3036
d3cd4924
ED
3037/* @wake is one when sk_stream_write_space() calls us.
 3038 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3039 * This mimics the strategy used in sock_def_write_space().
3040 */
3041bool tcp_stream_memory_free(const struct sock *sk, int wake)
3042{
3043 const struct tcp_sock *tp = tcp_sk(sk);
3044 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3045 READ_ONCE(tp->snd_nxt);
3046
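	/* With wake == 1 the shift doubles notsent_bytes, so writability is
	 * only signalled once less than half of the notsent_lowat budget is
	 * in use.
	 */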
3047 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3048}
3049EXPORT_SYMBOL(tcp_stream_memory_free);
3050
1da177e4
LT
3051struct proto tcp_prot = {
3052 .name = "TCP",
3053 .owner = THIS_MODULE,
3054 .close = tcp_close,
d74bad4e 3055 .pre_connect = tcp_v4_pre_connect,
1da177e4
LT
3056 .connect = tcp_v4_connect,
3057 .disconnect = tcp_disconnect,
463c84b9 3058 .accept = inet_csk_accept,
1da177e4
LT
3059 .ioctl = tcp_ioctl,
3060 .init = tcp_v4_init_sock,
3061 .destroy = tcp_v4_destroy_sock,
3062 .shutdown = tcp_shutdown,
3063 .setsockopt = tcp_setsockopt,
3064 .getsockopt = tcp_getsockopt,
9cacf81f 3065 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
4b9d07a4 3066 .keepalive = tcp_set_keepalive,
1da177e4 3067 .recvmsg = tcp_recvmsg,
7ba42910
CG
3068 .sendmsg = tcp_sendmsg,
3069 .sendpage = tcp_sendpage,
1da177e4 3070 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 3071 .release_cb = tcp_release_cb,
ab1e0a13
ACM
3072 .hash = inet_hash,
3073 .unhash = inet_unhash,
3074 .get_port = inet_csk_get_port,
8a59f9d1
CW
3075#ifdef CONFIG_BPF_SYSCALL
3076 .psock_update_sk_prot = tcp_bpf_update_proto,
3077#endif
1da177e4 3078 .enter_memory_pressure = tcp_enter_memory_pressure,
06044751 3079 .leave_memory_pressure = tcp_leave_memory_pressure,
c9bee3b7 3080 .stream_memory_free = tcp_stream_memory_free,
1da177e4 3081 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 3082 .orphan_count = &tcp_orphan_count,
1da177e4
LT
3083 .memory_allocated = &tcp_memory_allocated,
3084 .memory_pressure = &tcp_memory_pressure,
a4fe34bf 3085 .sysctl_mem = sysctl_tcp_mem,
356d1833
ED
3086 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3087 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
1da177e4
LT
3088 .max_header = MAX_TCP_HEADER,
3089 .obj_size = sizeof(struct tcp_sock),
5f0d5a3a 3090 .slab_flags = SLAB_TYPESAFE_BY_RCU,
6d6ee43e 3091 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 3092 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 3093 .h.hashinfo = &tcp_hashinfo,
7ba42910 3094 .no_autobind = true,
c1e64e29 3095 .diag_destroy = tcp_abort,
1da177e4 3096};
4bc2f18b 3097EXPORT_SYMBOL(tcp_prot);
1da177e4 3098
bdbbb852
ED
3099static void __net_exit tcp_sk_exit(struct net *net)
3100{
3101 int cpu;
3102
b506bc97 3103 if (net->ipv4.tcp_congestion_control)
0baf26b0
MKL
3104 bpf_module_put(net->ipv4.tcp_congestion_control,
3105 net->ipv4.tcp_congestion_control->owner);
6670e152 3106
bdbbb852
ED
3107 for_each_possible_cpu(cpu)
3108 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
3109 free_percpu(net->ipv4.tcp_sk);
3110}
3111
046ee902
DL
3112static int __net_init tcp_sk_init(struct net *net)
3113{
fee83d09 3114 int res, cpu, cnt;
bdbbb852
ED
3115
3116 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
3117 if (!net->ipv4.tcp_sk)
3118 return -ENOMEM;
3119
3120 for_each_possible_cpu(cpu) {
3121 struct sock *sk;
3122
3123 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3124 IPPROTO_TCP, net);
3125 if (res)
3126 goto fail;
a9d6532b 3127 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
431280ee
ED
3128
3129 /* Please enforce IP_DF and IPID==0 for RST and
3130 * ACK sent in SYN-RECV and TIME-WAIT state.
3131 */
3132 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3133
bdbbb852
ED
3134 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
3135 }
49213555 3136
5d134f1c 3137 net->ipv4.sysctl_tcp_ecn = 2;
49213555
DB
3138 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3139
b0f9ca53 3140 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
5f3e2bf0 3141 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
6b58e0a5 3142 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
05cbc0db 3143 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
c04b79b6 3144 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
046ee902 3145
13b287e8 3146 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
9bd6861b 3147 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
b840d15d 3148 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
13b287e8 3149
6fa25166 3150 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
7c083ecb 3151 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
0aca737d 3152 net->ipv4.sysctl_tcp_syncookies = 1;
1043e25f 3153 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
ae5c3f40 3154 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
c6214a97 3155 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
c402d9be 3156 net->ipv4.sysctl_tcp_orphan_retries = 0;
1e579caa 3157 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
4979f2d9 3158 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
79e9fed4 3159 net->ipv4.sysctl_tcp_tw_reuse = 2;
65e6d901 3160 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
12ed8244 3161
fee83d09 3162 cnt = tcp_hashinfo.ehash_mask + 1;
743e4815 3163 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
1946e672
HY
3164 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3165
623d0c2d 3166 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
f9301034 3167 net->ipv4.sysctl_tcp_sack = 1;
9bb37ef0 3168 net->ipv4.sysctl_tcp_window_scaling = 1;
5d2ed052 3169 net->ipv4.sysctl_tcp_timestamps = 1;
2ae21cf5 3170 net->ipv4.sysctl_tcp_early_retrans = 3;
e20223f1 3171 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
b510f0d2 3172 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
e0a1e5b5 3173 net->ipv4.sysctl_tcp_retrans_collapse = 1;
c6e21803 3174 net->ipv4.sysctl_tcp_max_reordering = 300;
6496f6bd 3175 net->ipv4.sysctl_tcp_dsack = 1;
0c12654a 3176 net->ipv4.sysctl_tcp_app_win = 31;
94f0893e 3177 net->ipv4.sysctl_tcp_adv_win_scale = 1;
af9b69a7 3178 net->ipv4.sysctl_tcp_frto = 2;
4540c0cf 3179 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
d06a9904
ED
3180 /* This limits the percentage of the congestion window which we
3181 * will allow a single TSO frame to consume. Building TSO frames
3182 * which are too large can cause TCP streams to be bursty.
3183 */
3184 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
c73e5807
ED
3185 /* Default TSQ limit of 16 TSO segments */
3186 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
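	/* 16 * 64KB == 1MB may sit in qdisc/device queues per socket. */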
b530b681
ED
3187 /* rfc5961 challenge ack rate limiting */
3188 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
26e9596e 3189 net->ipv4.sysctl_tcp_min_tso_segs = 2;
bd239704 3190 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
790f00e1 3191 net->ipv4.sysctl_tcp_autocorking = 1;
4170ba6b 3192 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
23a7102a 3193 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
c26e91f8 3194 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
356d1833
ED
3195 if (net != &init_net) {
3196 memcpy(net->ipv4.sysctl_tcp_rmem,
3197 init_net.ipv4.sysctl_tcp_rmem,
3198 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3199 memcpy(net->ipv4.sysctl_tcp_wmem,
3200 init_net.ipv4.sysctl_tcp_wmem,
3201 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3202 }
6d82aa24 3203 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
a70437cc 3204 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
9c21d2fc 3205 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
e1cfcbe8 3206 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
213ad73d 3207 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3733be14 3208 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
e1cfcbe8 3209
6670e152
SH
3210 /* Reno is always built in */
3211 if (!net_eq(net, &init_net) &&
0baf26b0
MKL
3212 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3213 init_net.ipv4.tcp_congestion_control->owner))
6670e152
SH
3214 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3215 else
3216 net->ipv4.tcp_congestion_control = &tcp_reno;
3217
49213555 3218 return 0;
bdbbb852
ED
3219fail:
3220 tcp_sk_exit(net);
3221
3222 return res;
b099ce26
EB
3223}
3224
3225static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3226{
43713848
HY
3227 struct net *net;
3228
1946e672 3229 inet_twsk_purge(&tcp_hashinfo, AF_INET);
43713848
HY
3230
3231 list_for_each_entry(net, net_exit_list, exit_list)
3232 tcp_fastopen_ctx_destroy(net);
046ee902
DL
3233}
3234
3235static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
3236 .init = tcp_sk_init,
3237 .exit = tcp_sk_exit,
3238 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
3239};
3240
52d87d5f
YS
3241#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3242DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3243 struct sock_common *sk_common, uid_t uid)
3244
04c7820b
MKL
3245#define INIT_BATCH_SZ 16
3246
f9c79272 3247static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
52d87d5f 3248{
04c7820b
MKL
3249 struct bpf_tcp_iter_state *iter = priv_data;
3250 int err;
52d87d5f 3251
04c7820b
MKL
3252 err = bpf_iter_init_seq_net(priv_data, aux);
3253 if (err)
3254 return err;
52d87d5f 3255
04c7820b
MKL
3256 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3257 if (err) {
3258 bpf_iter_fini_seq_net(priv_data);
3259 return err;
3260 }
3261
3262 return 0;
52d87d5f
YS
3263}
3264
3265static void bpf_iter_fini_tcp(void *priv_data)
3266{
04c7820b 3267 struct bpf_tcp_iter_state *iter = priv_data;
52d87d5f 3268
52d87d5f 3269 bpf_iter_fini_seq_net(priv_data);
04c7820b 3270 kvfree(iter->batch);
52d87d5f
YS
3271}
3272
14fc6bd6 3273static const struct bpf_iter_seq_info tcp_seq_info = {
52d87d5f
YS
3274 .seq_ops = &bpf_iter_tcp_seq_ops,
3275 .init_seq_private = bpf_iter_init_tcp,
3276 .fini_seq_private = bpf_iter_fini_tcp,
04c7820b 3277 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
14fc6bd6
YS
3278};
3279
3cee6fb8
MKL
3280static const struct bpf_func_proto *
3281bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3282 const struct bpf_prog *prog)
3283{
3284 switch (func_id) {
3285 case BPF_FUNC_setsockopt:
3286 return &bpf_sk_setsockopt_proto;
3287 case BPF_FUNC_getsockopt:
3288 return &bpf_sk_getsockopt_proto;
3289 default:
3290 return NULL;
3291 }
3292}
3293
14fc6bd6
YS
3294static struct bpf_iter_reg tcp_reg_info = {
3295 .target = "tcp",
52d87d5f
YS
3296 .ctx_arg_info_size = 1,
3297 .ctx_arg_info = {
3298 { offsetof(struct bpf_iter__tcp, sk_common),
3299 PTR_TO_BTF_ID_OR_NULL },
3300 },
3cee6fb8 3301 .get_func_proto = bpf_iter_tcp_get_func_proto,
14fc6bd6 3302 .seq_info = &tcp_seq_info,
52d87d5f
YS
3303};
3304
3305static void __init bpf_iter_register(void)
3306{
951cf368 3307 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
52d87d5f
YS
3308 if (bpf_iter_reg_target(&tcp_reg_info))
3309 pr_warn("Warning: could not register bpf iterator tcp\n");
3310}
3311
3312#endif
3313
9b0f976f 3314void __init tcp_v4_init(void)
1da177e4 3315{
6a1b3054 3316 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 3317 panic("Failed to create the TCP control socket.\n");
52d87d5f
YS
3318
3319#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3320 bpf_iter_register();
3321#endif
1da177e4 3322}