git.proxmox.com Git mirror (mirror_ubuntu-jammy-kernel.git) - blame: net/ipv4/tcp_ipv4.c
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
1da177e4
LT
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
1da177e4
LT
9 * IPv4 specific functions
10 *
1da177e4
LT
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
1da177e4
LT
17 */
18
19/*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
60236fdd 32 * request_sock handling and moved
1da177e4
LT
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
caa20d9a 35 * Added new listen semantics.
1da177e4
LT
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
afd46503 48#define pr_fmt(fmt) "TCP: " fmt
1da177e4 49
eb4dea58 50#include <linux/bottom_half.h>
1da177e4
LT
51#include <linux/types.h>
52#include <linux/fcntl.h>
53#include <linux/module.h>
54#include <linux/random.h>
55#include <linux/cache.h>
56#include <linux/jhash.h>
57#include <linux/init.h>
58#include <linux/times.h>
5a0e3ad6 59#include <linux/slab.h>
1da177e4 60
457c4cbc 61#include <net/net_namespace.h>
1da177e4 62#include <net/icmp.h>
304a1618 63#include <net/inet_hashtables.h>
1da177e4 64#include <net/tcp.h>
20380731 65#include <net/transp_v6.h>
1da177e4
LT
66#include <net/ipv6.h>
67#include <net/inet_common.h>
6d6ee43e 68#include <net/timewait_sock.h>
1da177e4 69#include <net/xfrm.h>
6e5714ea 70#include <net/secure_seq.h>
076bb0c8 71#include <net/busy_poll.h>
1da177e4
LT
72
73#include <linux/inet.h>
74#include <linux/ipv6.h>
75#include <linux/stddef.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
6797318e 78#include <linux/inetdevice.h>
951cf368 79#include <linux/btf_ids.h>
1da177e4 80
cf80e0e4 81#include <crypto/hash.h>
cfb6eeb4
YH
82#include <linux/scatterlist.h>
83
c24b14c4
SL
84#include <trace/events/tcp.h>
85
cfb6eeb4 86#ifdef CONFIG_TCP_MD5SIG
a915da9b 87static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
318cf7aa 88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
cfb6eeb4
YH
89#endif
90
5caea4ea 91struct inet_hashinfo tcp_hashinfo;
4bc2f18b 92EXPORT_SYMBOL(tcp_hashinfo);
1da177e4 93
84b114b9 94static u32 tcp_v4_init_seq(const struct sk_buff *skb)
1da177e4 95{
84b114b9
ED
96 return secure_tcp_seq(ip_hdr(skb)->daddr,
97 ip_hdr(skb)->saddr,
98 tcp_hdr(skb)->dest,
99 tcp_hdr(skb)->source);
100}
101
5d2ed052 102static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
84b114b9 103{
5d2ed052 104 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
1da177e4
LT
105}
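/* Editor's note (illustrative, not part of the kernel source): the two
 * helpers above follow the RFC 6528 scheme, roughly
 *
 *	ISN = M + F(saddr, daddr, sport, dport, secret)
 *
 * where M is a fine-grained clock and F is a keyed hash over the connection
 * 4-tuple, making initial sequence numbers unpredictable to off-path
 * attackers while remaining monotonic for a given 4-tuple. The timestamp
 * offset is derived the same way, from the address pair only.
 */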
106
6d6ee43e
ACM
107int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108{
79e9fed4 109 const struct inet_timewait_sock *tw = inet_twsk(sktw);
6d6ee43e
ACM
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
79e9fed4
112 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113
114 if (reuse == 2) {
115 /* Still does not detect *everything* that goes through
116 * lo, since we require a loopback src or dst address
117 * or direct binding to 'lo' interface.
118 */
119 bool loopback = false;
120 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 loopback = true;
122#if IS_ENABLED(CONFIG_IPV6)
123 if (tw->tw_family == AF_INET6) {
124 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
be2644aa 125 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
79e9fed4 126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
be2644aa 127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
79e9fed4
128 loopback = true;
129 } else
130#endif
131 {
132 if (ipv4_is_loopback(tw->tw_daddr) ||
133 ipv4_is_loopback(tw->tw_rcv_saddr))
134 loopback = true;
135 }
136 if (!loopback)
137 reuse = 0;
138 }
6d6ee43e
ACM
139
140 /* With PAWS, it is safe from the viewpoint
141 of data integrity. Even without PAWS it is safe provided sequence
142 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143
144 Actually, the idea is close to VJ's one, only timestamp cache is
145 held not per host, but per port pair and TW bucket is used as state
146 holder.
147
148 If TW bucket has been already destroyed we fall back to VJ's scheme
149 and use initial timestamp retrieved from peer table.
150 */
151 if (tcptw->tw_ts_recent_stamp &&
cca9bab1
AB
152 (!twp || (reuse && time_after32(ktime_get_seconds(),
153 tcptw->tw_ts_recent_stamp)))) {
21684dc4
SB
154 /* In case of repair and re-using TIME-WAIT sockets we still
155 * want to be sure that it is safe as above but honor the
156 * sequence numbers and time stamps set as part of the repair
157 * process.
158 *
159 * Without this check re-using a TIME-WAIT socket with TCP
160 * repair would accumulate a -1 on the repair assigned
161 * sequence number. The first time it is reused the sequence
162 * is -1, the second time -2, etc. This fixes that issue
163 * without appearing to create any others.
164 */
165 if (likely(!tp->repair)) {
0f317464
ED
166 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168 if (!seq)
169 seq = 1;
170 WRITE_ONCE(tp->write_seq, seq);
21684dc4
SB
171 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
172 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 }
6d6ee43e
ACM
174 sock_hold(sktw);
175 return 1;
176 }
177
178 return 0;
179}
6d6ee43e
ACM
180EXPORT_SYMBOL_GPL(tcp_twsk_unique);
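/* Editor's note (worked example, assumption about the intent of the
 * +65535+2 above): when a TIME-WAIT port pair is reused, write_seq is
 * advanced past anything the old connection could still have in flight
 * within a maximum unscaled 64KB window, e.g.
 *
 *	tw_snd_nxt == 1000  ->  new write_seq == 1000 + 65535 + 2 == 66537
 *
 * and it is bumped to 1 if the sum wraps to 0, because write_seq == 0 means
 * "pick a fresh secure ISN" in tcp_v4_connect().
 */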
181
d74bad4e
AI
182static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 int addr_len)
184{
185 /* This check is replicated from tcp_v4_connect() and intended to
186 * prevent BPF program called below from accessing bytes that are out
187 * of the bound specified by user in addr_len.
188 */
189 if (addr_len < sizeof(struct sockaddr_in))
190 return -EINVAL;
191
192 sock_owned_by_me(sk);
193
194 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195}
196
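/* Editor's note (illustrative sketch, assumes libbpf; not from this file):
 * BPF_CGROUP_RUN_PROG_INET4_CONNECT() above runs any program attached to the
 * socket's cgroup at the connect4 hook, attached from userspace roughly as:
 *
 *	int prog_fd = ...;   // loaded BPF_PROG_TYPE_CGROUP_SOCK_ADDR program
 *	int cg_fd = open("/sys/fs/cgroup/mygroup", O_RDONLY);
 *	bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET4_CONNECT, 0);
 *
 * Such a program may inspect or rewrite the destination address before the
 * connect proceeds, which is why addr_len is validated first.
 */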
1da177e4
LT
197/* This will initiate an outgoing connection. */
198int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199{
2d7192d6 200 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
1da177e4
LT
201 struct inet_sock *inet = inet_sk(sk);
202 struct tcp_sock *tp = tcp_sk(sk);
dca8b089 203 __be16 orig_sport, orig_dport;
bada8adc 204 __be32 daddr, nexthop;
da905bd1 205 struct flowi4 *fl4;
2d7192d6 206 struct rtable *rt;
1da177e4 207 int err;
f6d8bd05 208 struct ip_options_rcu *inet_opt;
1946e672 209 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
1da177e4
LT
210
211 if (addr_len < sizeof(struct sockaddr_in))
212 return -EINVAL;
213
214 if (usin->sin_family != AF_INET)
215 return -EAFNOSUPPORT;
216
217 nexthop = daddr = usin->sin_addr.s_addr;
f6d8bd05 218 inet_opt = rcu_dereference_protected(inet->inet_opt,
1e1d04e6 219 lockdep_sock_is_held(sk));
f6d8bd05 220 if (inet_opt && inet_opt->opt.srr) {
1da177e4
LT
221 if (!daddr)
222 return -EINVAL;
f6d8bd05 223 nexthop = inet_opt->opt.faddr;
1da177e4
LT
224 }
225
dca8b089
DM
226 orig_sport = inet->inet_sport;
227 orig_dport = usin->sin_port;
da905bd1
DM
228 fl4 = &inet->cork.fl.u.ip4;
229 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
b23dd4fe
DM
230 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 IPPROTO_TCP,
0e0d44ab 232 orig_sport, orig_dport, sk);
b23dd4fe
DM
233 if (IS_ERR(rt)) {
234 err = PTR_ERR(rt);
235 if (err == -ENETUNREACH)
f1d8cba6 236 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
b23dd4fe 237 return err;
584bdf8c 238 }
1da177e4
LT
239
240 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 ip_rt_put(rt);
242 return -ENETUNREACH;
243 }
244
f6d8bd05 245 if (!inet_opt || !inet_opt->opt.srr)
da905bd1 246 daddr = fl4->daddr;
1da177e4 247
c720c7e8 248 if (!inet->inet_saddr)
da905bd1 249 inet->inet_saddr = fl4->saddr;
d1e559d0 250 sk_rcv_saddr_set(sk, inet->inet_saddr);
1da177e4 251
c720c7e8 252 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
1da177e4
LT
253 /* Reset inherited state */
254 tp->rx_opt.ts_recent = 0;
255 tp->rx_opt.ts_recent_stamp = 0;
ee995283 256 if (likely(!tp->repair))
0f317464 257 WRITE_ONCE(tp->write_seq, 0);
1da177e4
LT
258 }
259
c720c7e8 260 inet->inet_dport = usin->sin_port;
d1e559d0 261 sk_daddr_set(sk, daddr);
1da177e4 262
d83d8461 263 inet_csk(sk)->icsk_ext_hdr_len = 0;
f6d8bd05
ED
264 if (inet_opt)
265 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1da177e4 266
bee7ca9e 267 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
1da177e4
LT
268
269 /* Socket identity is still unknown (sport may be zero).
270 * However we set state to SYN-SENT and, without releasing the socket
271 * lock, select a source port, enter ourselves into the hash tables and
272 * complete initialization after this.
273 */
274 tcp_set_state(sk, TCP_SYN_SENT);
1946e672 275 err = inet_hash_connect(tcp_death_row, sk);
1da177e4
LT
276 if (err)
277 goto failure;
278
877d1f62 279 sk_set_txhash(sk);
9e7ceb06 280
da905bd1 281 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
b23dd4fe
DM
282 inet->inet_sport, inet->inet_dport, sk);
283 if (IS_ERR(rt)) {
284 err = PTR_ERR(rt);
285 rt = NULL;
1da177e4 286 goto failure;
b23dd4fe 287 }
1da177e4 288 /* OK, now commit destination to socket. */
bcd76111 289 sk->sk_gso_type = SKB_GSO_TCPV4;
d8d1f30b 290 sk_setup_caps(sk, &rt->dst);
19f6d3f3 291 rt = NULL;
1da177e4 292
00355fa5 293 if (likely(!tp->repair)) {
00355fa5 294 if (!tp->write_seq)
0f317464
ED
295 WRITE_ONCE(tp->write_seq,
296 secure_tcp_seq(inet->inet_saddr,
297 inet->inet_daddr,
298 inet->inet_sport,
299 usin->sin_port));
5d2ed052
ED
300 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 inet->inet_saddr,
84b114b9 302 inet->inet_daddr);
00355fa5 303 }
1da177e4 304
a904a069 305 inet->inet_id = prandom_u32();
1da177e4 306
19f6d3f3
WW
307 if (tcp_fastopen_defer_connect(sk, &err))
308 return err;
309 if (err)
310 goto failure;
311
2b916477 312 err = tcp_connect(sk);
ee995283 313
1da177e4
LT
314 if (err)
315 goto failure;
316
317 return 0;
318
319failure:
7174259e
ACM
320 /*
321 * This unhashes the socket and releases the local port,
322 * if necessary.
323 */
1da177e4
LT
324 tcp_set_state(sk, TCP_CLOSE);
325 ip_rt_put(rt);
326 sk->sk_route_caps = 0;
c720c7e8 327 inet->inet_dport = 0;
1da177e4
LT
328 return err;
329}
4bc2f18b 330EXPORT_SYMBOL(tcp_v4_connect);
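/* Editor's note (illustrative userspace sketch, standard sockets API): the
 * function above is reached via a plain connect(2) on a TCP socket:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sin = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * Errors from the route lookup above, such as -ENETUNREACH, surface to the
 * caller as the connect(2) errno.
 */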
1da177e4 331
1da177e4 332/*
563d34d0
ED
333 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334 * It can be called through tcp_release_cb() if socket was owned by user
335 * at the time tcp_v4_err() was called to handle ICMP message.
1da177e4 336 */
4fab9071 337void tcp_v4_mtu_reduced(struct sock *sk)
1da177e4 338{
1da177e4 339 struct inet_sock *inet = inet_sk(sk);
02b2faaf
ED
340 struct dst_entry *dst;
341 u32 mtu;
1da177e4 342
02b2faaf
ED
343 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 return;
561022ac 345 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
80d0a69f
DM
346 dst = inet_csk_update_pmtu(sk, mtu);
347 if (!dst)
1da177e4
LT
348 return;
349
1da177e4
LT
350 /* Something is about to go wrong... Remember the soft error
351 * in case this connection is not able to recover.
352 */
353 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 sk->sk_err_soft = EMSGSIZE;
355
356 mtu = dst_mtu(dst);
357
358 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
482fc609 359 ip_sk_accept_pmtu(sk) &&
d83d8461 360 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
1da177e4
LT
361 tcp_sync_mss(sk, mtu);
362
363 /* Resend the TCP packet because it's
364 * clear that the old packet has been
365 * dropped. This is the new "fast" path mtu
366 * discovery.
367 */
368 tcp_simple_retransmit(sk);
369 } /* else let the usual retransmit timer handle it */
370}
4fab9071 371EXPORT_SYMBOL(tcp_v4_mtu_reduced);
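/* Editor's note (worked example, illustrative): suppose the path MTU drops
 * to 1400 and a router returns ICMP_FRAG_NEEDED with mtu=1400. tcp_v4_err()
 * stores 1400 in tp->mtu_info and we end up in the function above, where
 * tcp_sync_mss(sk, 1400) shrinks the MSS to roughly 1400 - 40 = 1360 bytes
 * (20-byte IPv4 + 20-byte TCP headers, less any options) and
 * tcp_simple_retransmit() resends the dropped segment at the new size
 * instead of waiting for the retransmit timer.
 */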
1da177e4 372
55be7a9c
DM
373static void do_redirect(struct sk_buff *skb, struct sock *sk)
374{
375 struct dst_entry *dst = __sk_dst_check(sk, 0);
376
1ed5c48f 377 if (dst)
6700c270 378 dst->ops->redirect(dst, sk, skb);
55be7a9c
DM
379}
380
26e37360
ED
381
382/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
9cf74903 383void tcp_req_err(struct sock *sk, u32 seq, bool abort)
26e37360
ED
384{
385 struct request_sock *req = inet_reqsk(sk);
386 struct net *net = sock_net(sk);
387
388 /* ICMPs are not backlogged, hence we cannot get
389 * an established socket here.
390 */
26e37360 391 if (seq != tcp_rsk(req)->snt_isn) {
02a1d6e7 392 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
9cf74903 393 } else if (abort) {
26e37360
ED
394 /*
395 * Still in SYN_RECV, just remove it silently.
396 * There is no good way to pass the error to the newly
397 * created socket, and POSIX does not want network
398 * errors returned from accept().
399 */
c6973669 400 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
9caad864 401 tcp_listendrop(req->rsk_listener);
26e37360 402 }
ef84d8ce 403 reqsk_put(req);
26e37360
ED
404}
405EXPORT_SYMBOL(tcp_req_err);
406
f7456642 407/* TCP-LD (RFC 6069) logic */
d2924569 408void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
f7456642
ED
409{
410 struct inet_connection_sock *icsk = inet_csk(sk);
411 struct tcp_sock *tp = tcp_sk(sk);
412 struct sk_buff *skb;
413 s32 remaining;
414 u32 delta_us;
415
416 if (sock_owned_by_user(sk))
417 return;
418
419 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
420 !icsk->icsk_backoff)
421 return;
422
423 skb = tcp_rtx_queue_head(sk);
424 if (WARN_ON_ONCE(!skb))
425 return;
426
427 icsk->icsk_backoff--;
428 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430
431 tcp_mstamp_refresh(tp);
432 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434
435 if (remaining > 0) {
436 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 remaining, TCP_RTO_MAX);
438 } else {
439 /* RTO revert clocked out retransmission.
440 * Will retransmit now.
441 */
442 tcp_retransmit_timer(sk);
443 }
444}
d2924569 445EXPORT_SYMBOL(tcp_ld_RTO_revert);
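/* Editor's note (worked example, illustrative): per RFC 6069, an ICMP
 * net/host unreachable received while retransmissions are backing off
 * suggests a temporary routing failure, so one backoff step is undone.
 * With a base RTO of 200ms and icsk_backoff == 3 (timer pending at 1600ms),
 * the revert above yields backoff == 2 and rto == 800ms; if 300ms have
 * already elapsed since the head-of-queue skb was (re)sent, the timer is
 * re-armed for the remaining 500ms, and if the reverted RTO has already
 * expired we retransmit immediately.
 */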
f7456642 446
1da177e4
LT
447/*
448 * This routine is called by the ICMP module when it gets some
449 * sort of error condition. If err < 0 then the socket should
450 * be closed and the error returned to the user. If err > 0
451 * it's just the icmp type << 8 | icmp code. After adjustment
452 * header points to the first 8 bytes of the tcp header. We need
453 * to find the appropriate port.
454 *
455 * The locking strategy used here is very "optimistic". When
456 * someone else accesses the socket the ICMP is just dropped
457 * and for some paths there is no check at all.
458 * A more general error queue to queue errors for later handling
459 * is probably better.
460 *
461 */
462
a12daf13 463int tcp_v4_err(struct sk_buff *skb, u32 info)
1da177e4 464{
a12daf13
ED
465 const struct iphdr *iph = (const struct iphdr *)skb->data;
466 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
1da177e4
LT
467 struct tcp_sock *tp;
468 struct inet_sock *inet;
a12daf13
ED
469 const int type = icmp_hdr(skb)->type;
470 const int code = icmp_hdr(skb)->code;
1da177e4 471 struct sock *sk;
0a672f74 472 struct request_sock *fastopen;
9a568de4 473 u32 seq, snd_una;
1da177e4 474 int err;
a12daf13 475 struct net *net = dev_net(skb->dev);
1da177e4 476
26e37360
ED
477 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 th->dest, iph->saddr, ntohs(th->source),
a12daf13 479 inet_iif(skb), 0);
1da177e4 480 if (!sk) {
5d3848bc 481 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
32bbd879 482 return -ENOENT;
1da177e4
LT
483 }
484 if (sk->sk_state == TCP_TIME_WAIT) {
9469c7b4 485 inet_twsk_put(inet_twsk(sk));
32bbd879 486 return 0;
1da177e4 487 }
26e37360 488 seq = ntohl(th->seq);
32bbd879
SB
489 if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 type == ICMP_TIME_EXCEEDED ||
492 (type == ICMP_DEST_UNREACH &&
493 (code == ICMP_NET_UNREACH ||
494 code == ICMP_HOST_UNREACH)));
495 return 0;
496 }
1da177e4
LT
497
498 bh_lock_sock(sk);
499 /* If too many ICMPs get dropped on busy
500 * servers this needs to be solved differently.
563d34d0
ED
501 * We do take care of PMTU discovery (RFC1191) special case :
502 * we can receive locally generated ICMP messages while socket is held.
1da177e4 503 */
b74aa930
ED
504 if (sock_owned_by_user(sk)) {
505 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
02a1d6e7 506 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
b74aa930 507 }
1da177e4
LT
508 if (sk->sk_state == TCP_CLOSE)
509 goto out;
510
97e3ecd1 511 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
02a1d6e7 512 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
97e3ecd1 513 goto out;
514 }
515
1da177e4 516 tp = tcp_sk(sk);
0a672f74 517 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
d983ea6f 518 fastopen = rcu_dereference(tp->fastopen_rsk);
0a672f74 519 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
1da177e4 520 if (sk->sk_state != TCP_LISTEN &&
0a672f74 521 !between(seq, snd_una, tp->snd_nxt)) {
02a1d6e7 522 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
1da177e4
LT
523 goto out;
524 }
525
526 switch (type) {
55be7a9c 527 case ICMP_REDIRECT:
45caeaa5 528 if (!sock_owned_by_user(sk))
a12daf13 529 do_redirect(skb, sk);
55be7a9c 530 goto out;
1da177e4
LT
531 case ICMP_SOURCE_QUENCH:
532 /* Just silently ignore these. */
533 goto out;
534 case ICMP_PARAMETERPROB:
535 err = EPROTO;
536 break;
537 case ICMP_DEST_UNREACH:
538 if (code > NR_ICMP_UNREACH)
539 goto out;
540
541 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
0d4f0608
ED
542 /* We are not interested in TCP_LISTEN and open_requests
543 * (SYN-ACKs sent out by Linux are always <576 bytes, so
544 * they should go through unfragmented).
545 */
546 if (sk->sk_state == TCP_LISTEN)
547 goto out;
548
561022ac 549 WRITE_ONCE(tp->mtu_info, info);
144d56e9 550 if (!sock_owned_by_user(sk)) {
563d34d0 551 tcp_v4_mtu_reduced(sk);
144d56e9 552 } else {
7aa5470c 553 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
144d56e9
ED
554 sock_hold(sk);
555 }
1da177e4
LT
556 goto out;
557 }
558
559 err = icmp_err_convert[code].errno;
f7456642
ED
560 /* check if this ICMP message allows revert of backoff.
561 * (see RFC 6069)
562 */
563 if (!fastopen &&
564 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 tcp_ld_RTO_revert(sk, seq);
1da177e4
LT
566 break;
567 case ICMP_TIME_EXCEEDED:
568 err = EHOSTUNREACH;
569 break;
570 default:
571 goto out;
572 }
573
574 switch (sk->sk_state) {
1da177e4 575 case TCP_SYN_SENT:
0a672f74
YC
576 case TCP_SYN_RECV:
577 /* Only in fast or simultaneous open. If a fast open socket is
2bdcc73c 578 * already accepted it is treated as a connected one below.
0a672f74 579 */
51456b29 580 if (fastopen && !fastopen->sk)
0a672f74
YC
581 break;
582
a12daf13 583 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
45af29ca 584
1da177e4 585 if (!sock_owned_by_user(sk)) {
1da177e4
LT
586 sk->sk_err = err;
587
e3ae2365 588 sk_error_report(sk);
1da177e4
LT
589
590 tcp_done(sk);
591 } else {
592 sk->sk_err_soft = err;
593 }
594 goto out;
595 }
596
597 /* If we've already connected we will keep trying
598 * until we time out, or the user gives up.
599 *
600 * RFC 1122 4.2.3.9 allows us to consider as hard errors
601 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602 * but it is obsoleted by pmtu discovery).
603 *
604 * Note that in the modern internet, where routing is unreliable
605 * and broken firewalls sit in every dark corner, sending random
606 * errors ordered by their masters, even these two messages finally lose
607 * their original sense (even Linux sends invalid PORT_UNREACHs)
608 *
609 * Now we are in compliance with RFCs.
610 * --ANK (980905)
611 */
612
613 inet = inet_sk(sk);
614 if (!sock_owned_by_user(sk) && inet->recverr) {
615 sk->sk_err = err;
e3ae2365 616 sk_error_report(sk);
1da177e4
LT
617 } else { /* Only an error on timeout */
618 sk->sk_err_soft = err;
619 }
620
621out:
622 bh_unlock_sock(sk);
623 sock_put(sk);
32bbd879 624 return 0;
1da177e4
LT
625}
626
28850dc7 627void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1da177e4 628{
aa8223c7 629 struct tcphdr *th = tcp_hdr(skb);
1da177e4 630
98be9b12
ED
631 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 skb->csum_start = skb_transport_header(skb) - skb->head;
633 skb->csum_offset = offsetof(struct tcphdr, check);
1da177e4
LT
634}
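/* Editor's note: the helper above sets up CHECKSUM_PARTIAL offload. Only the
 * pseudo-header sum (saddr, daddr, protocol, length) is pre-seeded into
 * th->check; csum_start/csum_offset tell the NIC (or the software fallback)
 * where to accumulate the one's-complement sum over the TCP header and
 * payload and where to store the final checksum.
 */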
635
419f9f89 636/* This routine computes an IPv4 TCP checksum. */
bb296246 637void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
419f9f89 638{
cf533ea5 639 const struct inet_sock *inet = inet_sk(sk);
419f9f89
HX
640
641 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642}
4bc2f18b 643EXPORT_SYMBOL(tcp_v4_send_check);
419f9f89 644
1da177e4
LT
645/*
646 * This routine will send an RST to the other tcp.
647 *
648 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
649 * for reset.
650 * Answer: if a packet caused an RST, it is not for a socket
651 * existing in our system; if it is matched to a socket,
652 * it is just a duplicate segment or a bug in the other side's TCP.
653 * So we build the reply based only on the parameters
654 * that arrived with the segment.
655 * Exception: precedence violation. We do not implement it in any case.
656 */
657
dc87efdb
FW
658#ifdef CONFIG_TCP_MD5SIG
659#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
660#else
661#define OPTION_BYTES sizeof(__be32)
662#endif
663
a00e7444 664static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
1da177e4 665{
cf533ea5 666 const struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4
YH
667 struct {
668 struct tcphdr th;
dc87efdb 669 __be32 opt[OPTION_BYTES / sizeof(__be32)];
cfb6eeb4 670 } rep;
1da177e4 671 struct ip_reply_arg arg;
cfb6eeb4 672#ifdef CONFIG_TCP_MD5SIG
e46787f0 673 struct tcp_md5sig_key *key = NULL;
658ddaaf
SL
674 const __u8 *hash_location = NULL;
675 unsigned char newhash[16];
676 int genhash;
677 struct sock *sk1 = NULL;
cfb6eeb4 678#endif
d6fb396c 679 u64 transmit_time = 0;
00483690 680 struct sock *ctl_sk;
d6fb396c 681 struct net *net;
1da177e4
LT
682
683 /* Never send a reset in response to a reset. */
684 if (th->rst)
685 return;
686
c3658e8d
ED
687 /* If sk not NULL, it means we did a successful lookup and incoming
688 * route had to be correct. prequeue might have dropped our dst.
689 */
690 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
1da177e4
LT
691 return;
692
693 /* Swap the send and the receive. */
cfb6eeb4
YH
694 memset(&rep, 0, sizeof(rep));
695 rep.th.dest = th->source;
696 rep.th.source = th->dest;
697 rep.th.doff = sizeof(struct tcphdr) / 4;
698 rep.th.rst = 1;
1da177e4
LT
699
700 if (th->ack) {
cfb6eeb4 701 rep.th.seq = th->ack_seq;
1da177e4 702 } else {
cfb6eeb4
YH
703 rep.th.ack = 1;
704 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
705 skb->len - (th->doff << 2));
1da177e4
LT
706 }
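	/* Editor's note: the branch above is the RFC 793 reset generation
	 * rule. If the offending segment carried an ACK, the RST claims
	 * SEQ = SEG.ACK with no ACK bit; otherwise the RST has SEQ = 0 (from
	 * the memset above) with ACK set and ACK = SEG.SEQ + SEG.LEN,
	 * counting SYN and FIN as one octet each, so the peer accepts it as
	 * in-window.
	 */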
707
7174259e 708 memset(&arg, 0, sizeof(arg));
cfb6eeb4
YH
709 arg.iov[0].iov_base = (unsigned char *)&rep;
710 arg.iov[0].iov_len = sizeof(rep.th);
711
0f85feae 712 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
cfb6eeb4 713#ifdef CONFIG_TCP_MD5SIG
3b24d854 714 rcu_read_lock();
658ddaaf 715 hash_location = tcp_parse_md5sig_option(th);
271c3b9b 716 if (sk && sk_fullsock(sk)) {
cea97609 717 const union tcp_md5_addr *addr;
dea53bb8 718 int l3index;
cea97609 719
dea53bb8
DA
720 /* sdif set, means packet ingressed via a device
721 * in an L3 domain and inet_iif is set to it.
722 */
723 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
cea97609 724 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
dea53bb8 725 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
e46787f0 726 } else if (hash_location) {
cea97609 727 const union tcp_md5_addr *addr;
534322ca
DA
728 int sdif = tcp_v4_sdif(skb);
729 int dif = inet_iif(skb);
dea53bb8 730 int l3index;
cea97609 731
658ddaaf
SL
732 /*
733 * active side is lost. Try to find the listening socket through
734 * the source port, and then find the md5 key through the listening socket.
735 * We do not lose security here:
736 * the incoming packet is checked with the md5 hash of the found key;
737 * no RST is generated if the md5 hash doesn't match.
738 */
a583636a
CG
739 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
740 ip_hdr(skb)->saddr,
da5e3630 741 th->source, ip_hdr(skb)->daddr,
534322ca 742 ntohs(th->source), dif, sdif);
658ddaaf
SL
743 /* don't send rst if it can't find key */
744 if (!sk1)
3b24d854
ED
745 goto out;
746
dea53bb8
DA
747 /* sdif set, means packet ingressed via a device
748 * in an L3 domain and dif is set to it.
749 */
750 l3index = sdif ? dif : 0;
cea97609 751 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
dea53bb8 752 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
658ddaaf 753 if (!key)
3b24d854
ED
754 goto out;
755
658ddaaf 756
39f8e58e 757 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
658ddaaf 758 if (genhash || memcmp(hash_location, newhash, 16) != 0)
3b24d854
ED
759 goto out;
760
658ddaaf
SL
761 }
762
cfb6eeb4
YH
763 if (key) {
764 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
765 (TCPOPT_NOP << 16) |
766 (TCPOPT_MD5SIG << 8) |
767 TCPOLEN_MD5SIG);
768 /* Update length and the length the header thinks exists */
769 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
770 rep.th.doff = arg.iov[0].iov_len / 4;
771
49a72dfb 772 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
78e645cb
IJ
773 key, ip_hdr(skb)->saddr,
774 ip_hdr(skb)->daddr, &rep.th);
cfb6eeb4
YH
775 }
776#endif
dc87efdb
FW
777 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
778 if (rep.opt[0] == 0) {
779 __be32 mrst = mptcp_reset_option(skb);
780
781 if (mrst) {
782 rep.opt[0] = mrst;
783 arg.iov[0].iov_len += sizeof(mrst);
784 rep.th.doff = arg.iov[0].iov_len / 4;
785 }
786 }
787
eddc9ec5
ACM
788 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
789 ip_hdr(skb)->saddr, /* XXX */
52cd5750 790 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1da177e4 791 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
271c3b9b
FW
792 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
793
e2446eaa 794 /* When socket is gone, all binding information is lost.
4c675258
AK
795 * routing might fail in this case. No choice here, if we choose to force
796 * input interface, we will misroute in case of asymmetric route.
e2446eaa 797 */
c24b14c4 798 if (sk) {
4c675258 799 arg.bound_dev_if = sk->sk_bound_dev_if;
5c487bb9
SL
800 if (sk_fullsock(sk))
801 trace_tcp_send_reset(sk, skb);
c24b14c4 802 }
1da177e4 803
271c3b9b
FW
804 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
805 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
806
66b13d99 807 arg.tos = ip_hdr(skb)->tos;
e2d118a1 808 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
47dcc20a 809 local_bh_disable();
5472c3c6 810 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
a842fe14 811 if (sk) {
00483690
JM
812 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
813 inet_twsk(sk)->tw_mark : sk->sk_mark;
f6c0f5d2
ED
814 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
815 inet_twsk(sk)->tw_priority : sk->sk_priority;
d6fb396c 816 transmit_time = tcp_transmit_time(sk);
a842fe14 817 }
00483690 818 ip_send_unicast_reply(ctl_sk,
bdbbb852 819 skb, &TCP_SKB_CB(skb)->header.h4.opt,
24a2d43d 820 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
d6fb396c
ED
821 &arg, arg.iov[0].iov_len,
822 transmit_time);
1da177e4 823
00483690 824 ctl_sk->sk_mark = 0;
90bbcc60
ED
825 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
826 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
47dcc20a 827 local_bh_enable();
658ddaaf
SL
828
829#ifdef CONFIG_TCP_MD5SIG
3b24d854
ED
830out:
831 rcu_read_unlock();
658ddaaf 832#endif
1da177e4
LT
833}
834
835/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
836 outside of socket context, is certainly ugly. What can I do?
837 */
838
e2d118a1 839static void tcp_v4_send_ack(const struct sock *sk,
e62a123b 840 struct sk_buff *skb, u32 seq, u32 ack,
ee684b6f 841 u32 win, u32 tsval, u32 tsecr, int oif,
88ef4a5a 842 struct tcp_md5sig_key *key,
66b13d99 843 int reply_flags, u8 tos)
1da177e4 844{
cf533ea5 845 const struct tcphdr *th = tcp_hdr(skb);
1da177e4
LT
846 struct {
847 struct tcphdr th;
714e85be 848 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
cfb6eeb4 849#ifdef CONFIG_TCP_MD5SIG
714e85be 850 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
cfb6eeb4
YH
851#endif
852 ];
1da177e4 853 } rep;
e2d118a1 854 struct net *net = sock_net(sk);
1da177e4 855 struct ip_reply_arg arg;
00483690 856 struct sock *ctl_sk;
d6fb396c 857 u64 transmit_time;
1da177e4
LT
858
859 memset(&rep.th, 0, sizeof(struct tcphdr));
7174259e 860 memset(&arg, 0, sizeof(arg));
1da177e4
LT
861
862 arg.iov[0].iov_base = (unsigned char *)&rep;
863 arg.iov[0].iov_len = sizeof(rep.th);
ee684b6f 864 if (tsecr) {
cfb6eeb4
YH
865 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
866 (TCPOPT_TIMESTAMP << 8) |
867 TCPOLEN_TIMESTAMP);
ee684b6f
AV
868 rep.opt[1] = htonl(tsval);
869 rep.opt[2] = htonl(tsecr);
cb48cfe8 870 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
1da177e4
LT
871 }
872
873 /* Swap the send and the receive. */
874 rep.th.dest = th->source;
875 rep.th.source = th->dest;
876 rep.th.doff = arg.iov[0].iov_len / 4;
877 rep.th.seq = htonl(seq);
878 rep.th.ack_seq = htonl(ack);
879 rep.th.ack = 1;
880 rep.th.window = htons(win);
881
cfb6eeb4 882#ifdef CONFIG_TCP_MD5SIG
cfb6eeb4 883 if (key) {
ee684b6f 884 int offset = (tsecr) ? 3 : 0;
cfb6eeb4
YH
885
886 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
887 (TCPOPT_NOP << 16) |
888 (TCPOPT_MD5SIG << 8) |
889 TCPOLEN_MD5SIG);
890 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
891 rep.th.doff = arg.iov[0].iov_len/4;
892
49a72dfb 893 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
90b7e112
AL
894 key, ip_hdr(skb)->saddr,
895 ip_hdr(skb)->daddr, &rep.th);
cfb6eeb4
YH
896 }
897#endif
88ef4a5a 898 arg.flags = reply_flags;
eddc9ec5
ACM
899 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
900 ip_hdr(skb)->saddr, /* XXX */
1da177e4
LT
901 arg.iov[0].iov_len, IPPROTO_TCP, 0);
902 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
9501f972
YH
903 if (oif)
904 arg.bound_dev_if = oif;
66b13d99 905 arg.tos = tos;
e2d118a1 906 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
47dcc20a 907 local_bh_disable();
5472c3c6 908 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
a842fe14
ED
909 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
910 inet_twsk(sk)->tw_mark : sk->sk_mark;
f6c0f5d2
ED
911 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
912 inet_twsk(sk)->tw_priority : sk->sk_priority;
d6fb396c 913 transmit_time = tcp_transmit_time(sk);
00483690 914 ip_send_unicast_reply(ctl_sk,
bdbbb852 915 skb, &TCP_SKB_CB(skb)->header.h4.opt,
24a2d43d 916 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
d6fb396c
ED
917 &arg, arg.iov[0].iov_len,
918 transmit_time);
1da177e4 919
00483690 920 ctl_sk->sk_mark = 0;
90bbcc60 921 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
47dcc20a 922 local_bh_enable();
1da177e4
LT
923}
924
925static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
926{
8feaf0c0 927 struct inet_timewait_sock *tw = inet_twsk(sk);
cfb6eeb4 928 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1da177e4 929
e2d118a1 930 tcp_v4_send_ack(sk, skb,
e62a123b 931 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
7174259e 932 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
9a568de4 933 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
9501f972
YH
934 tcptw->tw_ts_recent,
935 tw->tw_bound_dev_if,
88ef4a5a 936 tcp_twsk_md5_key(tcptw),
66b13d99
ED
937 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
938 tw->tw_tos
9501f972 939 );
1da177e4 940
8feaf0c0 941 inet_twsk_put(tw);
1da177e4
LT
942}
943
a00e7444 944static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
7174259e 945 struct request_sock *req)
1da177e4 946{
cea97609 947 const union tcp_md5_addr *addr;
dea53bb8 948 int l3index;
cea97609 949
168a8f58
JC
950 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
951 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
952 */
e62a123b
ED
953 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
954 tcp_sk(sk)->snd_nxt;
955
20a2b49f
ED
956 /* RFC 7323 2.3
957 * The window field (SEG.WND) of every outgoing segment, with the
958 * exception of <SYN> segments, MUST be right-shifted by
959 * Rcv.Wind.Shift bits:
960 */
cea97609 961 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
dea53bb8 962 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
e2d118a1 963 tcp_v4_send_ack(sk, skb, seq,
20a2b49f
ED
964 tcp_rsk(req)->rcv_nxt,
965 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
9a568de4 966 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
9501f972
YH
967 req->ts_recent,
968 0,
dea53bb8 969 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
66b13d99
ED
970 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
971 ip_hdr(skb)->tos);
1da177e4
LT
972}
973
1da177e4 974/*
9bf1d83e 975 * Send a SYN-ACK after having received a SYN.
60236fdd 976 * This still operates on a request_sock only, not on a big
1da177e4
LT
977 * socket.
978 */
0f935dbe 979static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
d6274bd8 980 struct flowi *fl,
72659ecc 981 struct request_sock *req,
ca6fb065 982 struct tcp_fastopen_cookie *foc,
331fca43
MKL
983 enum tcp_synack_type synack_type,
984 struct sk_buff *syn_skb)
1da177e4 985{
2e6599cb 986 const struct inet_request_sock *ireq = inet_rsk(req);
6bd023f3 987 struct flowi4 fl4;
1da177e4 988 int err = -1;
d41db5af 989 struct sk_buff *skb;
ac8f1710 990 u8 tos;
1da177e4
LT
991
992 /* First, grab a route. */
ba3f7f04 993 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
fd80eb94 994 return -1;
1da177e4 995
331fca43 996 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1da177e4
LT
997
998 if (skb) {
634fb979 999 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1da177e4 1000
407c85c7 1001 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
8ef44b6f
WW
1002 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1003 (inet_sk(sk)->tos & INET_ECN_MASK) :
407c85c7
AD
1004 inet_sk(sk)->tos;
1005
1006 if (!INET_ECN_is_capable(tos) &&
1007 tcp_bpf_ca_needs_ecn((struct sock *)req))
1008 tos |= INET_ECN_ECT_0;
1009
2ab2ddd3 1010 rcu_read_lock();
634fb979
ED
1011 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1012 ireq->ir_rmt_addr,
de033b7d 1013 rcu_dereference(ireq->ireq_opt),
861602b5 1014 tos);
2ab2ddd3 1015 rcu_read_unlock();
b9df3cb8 1016 err = net_xmit_eval(err);
1da177e4
LT
1017 }
1018
1da177e4
LT
1019 return err;
1020}
1021
1022/*
60236fdd 1023 * IPv4 request_sock destructor.
1da177e4 1024 */
60236fdd 1025static void tcp_v4_reqsk_destructor(struct request_sock *req)
1da177e4 1026{
c92e8c02 1027 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1da177e4
LT
1028}
1029
cfb6eeb4
YH
1030#ifdef CONFIG_TCP_MD5SIG
1031/*
1032 * RFC2385 MD5 checksumming requires a mapping of
1033 * IP address->MD5 Key.
1034 * We need to maintain these in the sk structure.
1035 */
1036
921f9a0f 1037DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
6015c71e
ED
1038EXPORT_SYMBOL(tcp_md5_needed);
1039
cfb6eeb4 1040/* Find the Key structure for an address. */
dea53bb8 1041struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
6015c71e
ED
1042 const union tcp_md5_addr *addr,
1043 int family)
cfb6eeb4 1044{
fd3a154a 1045 const struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1046 struct tcp_md5sig_key *key;
fd3a154a 1047 const struct tcp_md5sig_info *md5sig;
6797318e
ID
1048 __be32 mask;
1049 struct tcp_md5sig_key *best_match = NULL;
1050 bool match;
cfb6eeb4 1051
a8afca03
ED
1052 /* caller either holds rcu_read_lock() or socket lock */
1053 md5sig = rcu_dereference_check(tp->md5sig_info,
1e1d04e6 1054 lockdep_sock_is_held(sk));
a8afca03 1055 if (!md5sig)
cfb6eeb4 1056 return NULL;
083a0326 1057
c8b91770
AG
1058 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1059 lockdep_sock_is_held(sk)) {
a915da9b
ED
1060 if (key->family != family)
1061 continue;
dea53bb8
DA
1062 if (key->l3index && key->l3index != l3index)
1063 continue;
6797318e
ID
1064 if (family == AF_INET) {
1065 mask = inet_make_mask(key->prefixlen);
1066 match = (key->addr.a4.s_addr & mask) ==
1067 (addr->a4.s_addr & mask);
1068#if IS_ENABLED(CONFIG_IPV6)
1069 } else if (family == AF_INET6) {
1070 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1071 key->prefixlen);
1072#endif
1073 } else {
1074 match = false;
1075 }
1076
1077 if (match && (!best_match ||
1078 key->prefixlen > best_match->prefixlen))
1079 best_match = key;
1080 }
1081 return best_match;
1082}
6015c71e 1083EXPORT_SYMBOL(__tcp_md5_do_lookup);
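/* Editor's note (worked example, illustrative): the lookup above returns the
 * most specific configured key. With two keys installed on a socket,
 *
 *	10.0.0.0/8  -> key A
 *	10.1.0.0/16 -> key B
 *
 * a peer at 10.1.2.3 matches both and key B wins on the larger prefixlen,
 * while 10.2.0.1 falls back to key A. Keys bound to an L3 domain
 * (key->l3index != 0) only match traffic ingressing that domain.
 */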
6797318e 1084
e8f37d57
WF
1085static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1086 const union tcp_md5_addr *addr,
dea53bb8
DA
1087 int family, u8 prefixlen,
1088 int l3index)
6797318e
ID
1089{
1090 const struct tcp_sock *tp = tcp_sk(sk);
1091 struct tcp_md5sig_key *key;
1092 unsigned int size = sizeof(struct in_addr);
1093 const struct tcp_md5sig_info *md5sig;
1094
1095 /* caller either holds rcu_read_lock() or socket lock */
1096 md5sig = rcu_dereference_check(tp->md5sig_info,
1097 lockdep_sock_is_held(sk));
1098 if (!md5sig)
1099 return NULL;
1100#if IS_ENABLED(CONFIG_IPV6)
1101 if (family == AF_INET6)
1102 size = sizeof(struct in6_addr);
1103#endif
c8b91770
AG
1104 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1105 lockdep_sock_is_held(sk)) {
6797318e
ID
1106 if (key->family != family)
1107 continue;
dea53bb8
DA
1108 if (key->l3index && key->l3index != l3index)
1109 continue;
6797318e
ID
1110 if (!memcmp(&key->addr, addr, size) &&
1111 key->prefixlen == prefixlen)
a915da9b 1112 return key;
cfb6eeb4
YH
1113 }
1114 return NULL;
1115}
1116
b83e3deb 1117struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
fd3a154a 1118 const struct sock *addr_sk)
cfb6eeb4 1119{
b52e6921 1120 const union tcp_md5_addr *addr;
dea53bb8 1121 int l3index;
a915da9b 1122
dea53bb8
DA
1123 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1124 addr_sk->sk_bound_dev_if);
b52e6921 1125 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
dea53bb8 1126 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
cfb6eeb4 1127}
cfb6eeb4
YH
1128EXPORT_SYMBOL(tcp_v4_md5_lookup);
1129
cfb6eeb4 1130/* This can be called on a newly created socket, from other files */
a915da9b 1131int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
dea53bb8
DA
1132 int family, u8 prefixlen, int l3index,
1133 const u8 *newkey, u8 newkeylen, gfp_t gfp)
cfb6eeb4
YH
1134{
1135 /* Add Key to the list */
b0a713e9 1136 struct tcp_md5sig_key *key;
cfb6eeb4 1137 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1138 struct tcp_md5sig_info *md5sig;
cfb6eeb4 1139
dea53bb8 1140 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
cfb6eeb4 1141 if (key) {
e6ced831
ED
1142 /* Pre-existing entry - just update that one.
1143 * Note that the key might be used concurrently.
1144 * data_race() tells KCSAN that we do not care about
1145 * key mismatches, since changing the MD5 key on live flows
1146 * can lead to packet drops.
1147 */
1148 data_race(memcpy(key->key, newkey, newkeylen));
6a2febec 1149
e6ced831
ED
1150 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1151 * Also note that a reader could catch new key->keylen value
1152 * but old key->key[], this is the reason we use __GFP_ZERO
1153 * at sock_kmalloc() time below these lines.
1154 */
1155 WRITE_ONCE(key->keylen, newkeylen);
6a2febec 1156
a915da9b
ED
1157 return 0;
1158 }
260fcbeb 1159
a8afca03 1160 md5sig = rcu_dereference_protected(tp->md5sig_info,
1e1d04e6 1161 lockdep_sock_is_held(sk));
a915da9b
ED
1162 if (!md5sig) {
1163 md5sig = kmalloc(sizeof(*md5sig), gfp);
1164 if (!md5sig)
cfb6eeb4 1165 return -ENOMEM;
cfb6eeb4 1166
a915da9b
ED
1167 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1168 INIT_HLIST_HEAD(&md5sig->head);
a8afca03 1169 rcu_assign_pointer(tp->md5sig_info, md5sig);
a915da9b 1170 }
cfb6eeb4 1171
e6ced831 1172 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
a915da9b
ED
1173 if (!key)
1174 return -ENOMEM;
71cea17e 1175 if (!tcp_alloc_md5sig_pool()) {
5f3d9cb2 1176 sock_kfree_s(sk, key, sizeof(*key));
a915da9b 1177 return -ENOMEM;
cfb6eeb4 1178 }
a915da9b
ED
1179
1180 memcpy(key->key, newkey, newkeylen);
1181 key->keylen = newkeylen;
1182 key->family = family;
6797318e 1183 key->prefixlen = prefixlen;
dea53bb8 1184 key->l3index = l3index;
a915da9b
ED
1185 memcpy(&key->addr, addr,
1186 (family == AF_INET6) ? sizeof(struct in6_addr) :
1187 sizeof(struct in_addr));
1188 hlist_add_head_rcu(&key->node, &md5sig->head);
cfb6eeb4
YH
1189 return 0;
1190}
a915da9b 1191EXPORT_SYMBOL(tcp_md5_do_add);
cfb6eeb4 1192
6797318e 1193int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
dea53bb8 1194 u8 prefixlen, int l3index)
cfb6eeb4 1195{
a915da9b
ED
1196 struct tcp_md5sig_key *key;
1197
dea53bb8 1198 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
a915da9b
ED
1199 if (!key)
1200 return -ENOENT;
1201 hlist_del_rcu(&key->node);
5f3d9cb2 1202 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
a915da9b 1203 kfree_rcu(key, rcu);
a915da9b 1204 return 0;
cfb6eeb4 1205}
a915da9b 1206EXPORT_SYMBOL(tcp_md5_do_del);
cfb6eeb4 1207
e0683e70 1208static void tcp_clear_md5_list(struct sock *sk)
cfb6eeb4
YH
1209{
1210 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1211 struct tcp_md5sig_key *key;
b67bfe0d 1212 struct hlist_node *n;
a8afca03 1213 struct tcp_md5sig_info *md5sig;
cfb6eeb4 1214
a8afca03
ED
1215 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1216
b67bfe0d 1217 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
a915da9b 1218 hlist_del_rcu(&key->node);
5f3d9cb2 1219 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
a915da9b 1220 kfree_rcu(key, rcu);
cfb6eeb4
YH
1221 }
1222}
1223
8917a777 1224static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
d4c19c49 1225 sockptr_t optval, int optlen)
cfb6eeb4
YH
1226{
1227 struct tcp_md5sig cmd;
1228 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
cea97609 1229 const union tcp_md5_addr *addr;
8917a777 1230 u8 prefixlen = 32;
dea53bb8 1231 int l3index = 0;
cfb6eeb4
YH
1232
1233 if (optlen < sizeof(cmd))
1234 return -EINVAL;
1235
d4c19c49 1236 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
cfb6eeb4
YH
1237 return -EFAULT;
1238
1239 if (sin->sin_family != AF_INET)
1240 return -EINVAL;
1241
8917a777
ID
1242 if (optname == TCP_MD5SIG_EXT &&
1243 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1244 prefixlen = cmd.tcpm_prefixlen;
1245 if (prefixlen > 32)
1246 return -EINVAL;
1247 }
1248
6b102db5
DA
1249 if (optname == TCP_MD5SIG_EXT &&
1250 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1251 struct net_device *dev;
1252
1253 rcu_read_lock();
1254 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1255 if (dev && netif_is_l3_master(dev))
1256 l3index = dev->ifindex;
1257
1258 rcu_read_unlock();
1259
1260 /* ok to reference set/not set outside of rcu;
1261 * right now device MUST be an L3 master
1262 */
1263 if (!dev || !l3index)
1264 return -EINVAL;
1265 }
1266
cea97609
DA
1267 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1268
64a124ed 1269 if (!cmd.tcpm_keylen)
dea53bb8 1270 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
cfb6eeb4
YH
1271
1272 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1273 return -EINVAL;
1274
dea53bb8 1275 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
cea97609 1276 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
cfb6eeb4
YH
1277}
1278
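/* Editor's note (illustrative userspace sketch, error handling omitted):
 * the parser above is reached through the TCP_MD5SIG (or TCP_MD5SIG_EXT)
 * socket option:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 instead deletes the key via tcp_md5_do_del().
 */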
19689e38
ED
1279static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1280 __be32 daddr, __be32 saddr,
1281 const struct tcphdr *th, int nbytes)
cfb6eeb4 1282{
cfb6eeb4 1283 struct tcp4_pseudohdr *bp;
49a72dfb 1284 struct scatterlist sg;
19689e38 1285 struct tcphdr *_th;
cfb6eeb4 1286
19689e38 1287 bp = hp->scratch;
cfb6eeb4
YH
1288 bp->saddr = saddr;
1289 bp->daddr = daddr;
1290 bp->pad = 0;
076fb722 1291 bp->protocol = IPPROTO_TCP;
49a72dfb 1292 bp->len = cpu_to_be16(nbytes);
c7da57a1 1293
19689e38
ED
1294 _th = (struct tcphdr *)(bp + 1);
1295 memcpy(_th, th, sizeof(*th));
1296 _th->check = 0;
1297
1298 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1299 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1300 sizeof(*bp) + sizeof(*th));
cf80e0e4 1301 return crypto_ahash_update(hp->md5_req);
49a72dfb
AL
1302}
1303
a915da9b 1304static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
318cf7aa 1305 __be32 daddr, __be32 saddr, const struct tcphdr *th)
49a72dfb
AL
1306{
1307 struct tcp_md5sig_pool *hp;
cf80e0e4 1308 struct ahash_request *req;
49a72dfb
AL
1309
1310 hp = tcp_get_md5sig_pool();
1311 if (!hp)
1312 goto clear_hash_noput;
cf80e0e4 1313 req = hp->md5_req;
49a72dfb 1314
cf80e0e4 1315 if (crypto_ahash_init(req))
49a72dfb 1316 goto clear_hash;
19689e38 1317 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
49a72dfb
AL
1318 goto clear_hash;
1319 if (tcp_md5_hash_key(hp, key))
1320 goto clear_hash;
cf80e0e4
HX
1321 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1322 if (crypto_ahash_final(req))
cfb6eeb4
YH
1323 goto clear_hash;
1324
cfb6eeb4 1325 tcp_put_md5sig_pool();
cfb6eeb4 1326 return 0;
49a72dfb 1327
cfb6eeb4
YH
1328clear_hash:
1329 tcp_put_md5sig_pool();
1330clear_hash_noput:
1331 memset(md5_hash, 0, 16);
49a72dfb 1332 return 1;
cfb6eeb4
YH
1333}
1334
39f8e58e
ED
1335int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1336 const struct sock *sk,
318cf7aa 1337 const struct sk_buff *skb)
cfb6eeb4 1338{
49a72dfb 1339 struct tcp_md5sig_pool *hp;
cf80e0e4 1340 struct ahash_request *req;
318cf7aa 1341 const struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4
YH
1342 __be32 saddr, daddr;
1343
39f8e58e
ED
1344 if (sk) { /* valid for establish/request sockets */
1345 saddr = sk->sk_rcv_saddr;
1346 daddr = sk->sk_daddr;
cfb6eeb4 1347 } else {
49a72dfb
AL
1348 const struct iphdr *iph = ip_hdr(skb);
1349 saddr = iph->saddr;
1350 daddr = iph->daddr;
cfb6eeb4 1351 }
49a72dfb
AL
1352
1353 hp = tcp_get_md5sig_pool();
1354 if (!hp)
1355 goto clear_hash_noput;
cf80e0e4 1356 req = hp->md5_req;
49a72dfb 1357
cf80e0e4 1358 if (crypto_ahash_init(req))
49a72dfb
AL
1359 goto clear_hash;
1360
19689e38 1361 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
49a72dfb
AL
1362 goto clear_hash;
1363 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1364 goto clear_hash;
1365 if (tcp_md5_hash_key(hp, key))
1366 goto clear_hash;
cf80e0e4
HX
1367 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1368 if (crypto_ahash_final(req))
49a72dfb
AL
1369 goto clear_hash;
1370
1371 tcp_put_md5sig_pool();
1372 return 0;
1373
1374clear_hash:
1375 tcp_put_md5sig_pool();
1376clear_hash_noput:
1377 memset(md5_hash, 0, 16);
1378 return 1;
cfb6eeb4 1379}
49a72dfb 1380EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
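/* Editor's note: per RFC 2385, the digest computed above covers, in order,
 * the IPv4 pseudo-header, the fixed 20-byte TCP header with its checksum
 * field zeroed (options are excluded from the hash input), the segment
 * payload, and finally the connection key. The receive side recomputes the
 * same digest and drops the segment on mismatch, see
 * tcp_v4_inbound_md5_hash() below.
 */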
cfb6eeb4 1381
ba8e275a
ED
1382#endif
1383
ff74e23f 1384/* Called with rcu_read_lock() */
ba8e275a 1385static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
534322ca
DA
1386 const struct sk_buff *skb,
1387 int dif, int sdif)
cfb6eeb4 1388{
ba8e275a 1389#ifdef CONFIG_TCP_MD5SIG
cfb6eeb4
YH
1390 /*
1391 * This gets called for each TCP segment that arrives
1392 * so we want to be efficient.
1393 * We have 3 drop cases:
1394 * o No MD5 hash and one expected.
1395 * o MD5 hash and we're not expecting one.
1396 * o MD5 hash and it's wrong.
1397 */
cf533ea5 1398 const __u8 *hash_location = NULL;
cfb6eeb4 1399 struct tcp_md5sig_key *hash_expected;
eddc9ec5 1400 const struct iphdr *iph = ip_hdr(skb);
cf533ea5 1401 const struct tcphdr *th = tcp_hdr(skb);
cea97609 1402 const union tcp_md5_addr *addr;
cfb6eeb4 1403 unsigned char newhash[16];
dea53bb8
DA
1404 int genhash, l3index;
1405
1406 /* sdif set, means packet ingressed via a device
1407 * in an L3 domain and dif is set to the l3mdev
1408 */
1409 l3index = sdif ? dif : 0;
cfb6eeb4 1410
cea97609 1411 addr = (union tcp_md5_addr *)&iph->saddr;
dea53bb8 1412 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
7d5d5525 1413 hash_location = tcp_parse_md5sig_option(th);
cfb6eeb4 1414
cfb6eeb4
YH
1415 /* We've parsed the options - do we have a hash? */
1416 if (!hash_expected && !hash_location)
a2a385d6 1417 return false;
cfb6eeb4
YH
1418
1419 if (hash_expected && !hash_location) {
c10d9310 1420 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
a2a385d6 1421 return true;
cfb6eeb4
YH
1422 }
1423
1424 if (!hash_expected && hash_location) {
c10d9310 1425 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
a2a385d6 1426 return true;
cfb6eeb4
YH
1427 }
1428
1429 /* Okay, so this is hash_expected and hash_location -
1430 * so we need to calculate the checksum.
1431 */
49a72dfb
AL
1432 genhash = tcp_v4_md5_hash_skb(newhash,
1433 hash_expected,
39f8e58e 1434 NULL, skb);
cfb6eeb4
YH
1435
1436 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
72145a68 1437 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
dea53bb8 1438 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
e87cc472
JP
1439 &iph->saddr, ntohs(th->source),
1440 &iph->daddr, ntohs(th->dest),
1441 genhash ? " tcp_v4_calc_md5_hash failed"
dea53bb8 1442 : "", l3index);
a2a385d6 1443 return true;
cfb6eeb4 1444 }
a2a385d6 1445 return false;
cfb6eeb4 1446#endif
ba8e275a
ED
1447 return false;
1448}
cfb6eeb4 1449
b40cf18e
ED
1450static void tcp_v4_init_req(struct request_sock *req,
1451 const struct sock *sk_listener,
16bea70a
OP
1452 struct sk_buff *skb)
1453{
1454 struct inet_request_sock *ireq = inet_rsk(req);
c92e8c02 1455 struct net *net = sock_net(sk_listener);
16bea70a 1456
08d2cc3b
ED
1457 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1458 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
c92e8c02 1459 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
16bea70a
OP
1460}
1461
f964629e 1462static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
7ea851d1 1463 struct sk_buff *skb,
f964629e 1464 struct flowi *fl,
7ea851d1 1465 struct request_sock *req)
d94e0417 1466{
7ea851d1
FW
1467 tcp_v4_init_req(req, sk, skb);
1468
1469 if (security_inet_conn_request(sk, skb, req))
1470 return NULL;
1471
4396e461 1472 return inet_csk_route_req(sk, &fl->u.ip4, req);
d94e0417
OP
1473}
1474
72a3effa 1475struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1da177e4 1476 .family = PF_INET,
2e6599cb 1477 .obj_size = sizeof(struct tcp_request_sock),
5db92c99 1478 .rtx_syn_ack = tcp_rtx_synack,
60236fdd
ACM
1479 .send_ack = tcp_v4_reqsk_send_ack,
1480 .destructor = tcp_v4_reqsk_destructor,
1da177e4 1481 .send_reset = tcp_v4_send_reset,
688d1945 1482 .syn_ack_timeout = tcp_syn_ack_timeout,
1da177e4
LT
1483};
1484
35b2c321 1485const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
2aec4a29 1486 .mss_clamp = TCP_MSS_DEFAULT,
16bea70a 1487#ifdef CONFIG_TCP_MD5SIG
fd3a154a 1488 .req_md5_lookup = tcp_v4_md5_lookup,
e3afe7b7 1489 .calc_md5_hash = tcp_v4_md5_hash_skb,
b6332e6c 1490#endif
fb7b37a7
OP
1491#ifdef CONFIG_SYN_COOKIES
1492 .cookie_init_seq = cookie_v4_init_sequence,
1493#endif
d94e0417 1494 .route_req = tcp_v4_route_req,
84b114b9
ED
1495 .init_seq = tcp_v4_init_seq,
1496 .init_ts_off = tcp_v4_init_ts_off,
d6274bd8 1497 .send_synack = tcp_v4_send_synack,
16bea70a 1498};
cfb6eeb4 1499
1da177e4
LT
1500int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1501{
1da177e4 1502 /* Never answer to SYNs send to broadcast or multicast */
511c3f92 1503 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1da177e4
LT
1504 goto drop;
1505
1fb6f159
OP
1506 return tcp_conn_request(&tcp_request_sock_ops,
1507 &tcp_request_sock_ipv4_ops, sk, skb);
1da177e4 1508
1da177e4 1509drop:
9caad864 1510 tcp_listendrop(sk);
1da177e4
LT
1511 return 0;
1512}
4bc2f18b 1513EXPORT_SYMBOL(tcp_v4_conn_request);
1da177e4
LT
1514
1515
1516/*
1517 * The three way handshake has completed - we got a valid synack -
1518 * now create the new socket.
1519 */
0c27171e 1520struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
60236fdd 1521 struct request_sock *req,
5e0724d0
ED
1522 struct dst_entry *dst,
1523 struct request_sock *req_unhash,
1524 bool *own_req)
1da177e4 1525{
2e6599cb 1526 struct inet_request_sock *ireq;
01770a16 1527 bool found_dup_sk = false;
1da177e4
LT
1528 struct inet_sock *newinet;
1529 struct tcp_sock *newtp;
1530 struct sock *newsk;
cfb6eeb4 1531#ifdef CONFIG_TCP_MD5SIG
cea97609 1532 const union tcp_md5_addr *addr;
cfb6eeb4 1533 struct tcp_md5sig_key *key;
dea53bb8 1534 int l3index;
cfb6eeb4 1535#endif
f6d8bd05 1536 struct ip_options_rcu *inet_opt;
1da177e4
LT
1537
1538 if (sk_acceptq_is_full(sk))
1539 goto exit_overflow;
1540
1da177e4
LT
1541 newsk = tcp_create_openreq_child(sk, req, skb);
1542 if (!newsk)
093d2823 1543 goto exit_nonewsk;
1da177e4 1544
bcd76111 1545 newsk->sk_gso_type = SKB_GSO_TCPV4;
fae6ef87 1546 inet_sk_rx_dst_set(newsk, skb);
1da177e4
LT
1547
1548 newtp = tcp_sk(newsk);
1549 newinet = inet_sk(newsk);
2e6599cb 1550 ireq = inet_rsk(req);
d1e559d0
ED
1551 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1552 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
6dd9a14e 1553 newsk->sk_bound_dev_if = ireq->ir_iif;
c92e8c02
ED
1554 newinet->inet_saddr = ireq->ir_loc_addr;
1555 inet_opt = rcu_dereference(ireq->ireq_opt);
1556 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
463c84b9 1557 newinet->mc_index = inet_iif(skb);
eddc9ec5 1558 newinet->mc_ttl = ip_hdr(skb)->ttl;
4c507d28 1559 newinet->rcv_tos = ip_hdr(skb)->tos;
d83d8461 1560 inet_csk(newsk)->icsk_ext_hdr_len = 0;
f6d8bd05
ED
1561 if (inet_opt)
1562 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
a904a069 1563 newinet->inet_id = prandom_u32();
1da177e4 1564
8ef44b6f
WW
1565 /* Set ToS of the new socket based upon the value of incoming SYN.
1566 * ECT bits are set later in tcp_init_transfer().
1567 */
ac8f1710
WW
1568 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1569 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1570
dfd25fff
ED
1571 if (!dst) {
1572 dst = inet_csk_route_child_sock(sk, newsk, req);
1573 if (!dst)
1574 goto put_and_exit;
1575 } else {
1576 /* syncookie case : see end of cookie_v4_check() */
1577 }
0e734419
DM
1578 sk_setup_caps(newsk, dst);
1579
81164413
DB
1580 tcp_ca_openreq_child(newsk, dst);
1581
1da177e4 1582 tcp_sync_mss(newsk, dst_mtu(dst));
3541f9e8 1583 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
f5fff5dc 1584
1da177e4
LT
1585 tcp_initialize_rcv_mss(newsk);
1586
cfb6eeb4 1587#ifdef CONFIG_TCP_MD5SIG
dea53bb8 1588 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
cfb6eeb4 1589 /* Copy over the MD5 key from the original socket */
cea97609 1590 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
dea53bb8 1591 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
00db4124 1592 if (key) {
cfb6eeb4
YH
1593 /*
1594 * We're using one, so create a matching key
1595 * on the newsk structure. If we fail to get
1596 * memory, then we end up not copying the key
1597 * across. Shucks.
1598 */
dea53bb8 1599 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
cea97609 1600 key->key, key->keylen, GFP_ATOMIC);
a465419b 1601 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
cfb6eeb4
YH
1602 }
1603#endif
1604
0e734419
DM
1605 if (__inet_inherit_port(sk, newsk) < 0)
1606 goto put_and_exit;
01770a16
RD
1607 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1608 &found_dup_sk);
c92e8c02 1609 if (likely(*own_req)) {
49a496c9 1610 tcp_move_syn(newtp, req);
c92e8c02
ED
1611 ireq->ireq_opt = NULL;
1612 } else {
c89dffc7
KI
1613 newinet->inet_opt = NULL;
1614
01770a16
RD
1615 if (!req_unhash && found_dup_sk) {
1616 /* This code path should only be executed in the
1617 * syncookie case
1618 */
1619 bh_unlock_sock(newsk);
1620 sock_put(newsk);
1621 newsk = NULL;
01770a16 1622 }
c92e8c02 1623 }
1da177e4
LT
1624 return newsk;
1625
1626exit_overflow:
c10d9310 1627 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
093d2823
BS
1628exit_nonewsk:
1629 dst_release(dst);
1da177e4 1630exit:
9caad864 1631 tcp_listendrop(sk);
1da177e4 1632 return NULL;
0e734419 1633put_and_exit:
c92e8c02 1634 newinet->inet_opt = NULL;
e337e24d
CP
1635 inet_csk_prepare_forced_close(newsk);
1636 tcp_done(newsk);
0e734419 1637 goto exit;
1da177e4 1638}
4bc2f18b 1639EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
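/* A minimal userspace sketch (not part of this file) of installing a
 * TCP-MD5 key with the TCP_MD5SIG socket option, which is parsed by
 * tcp_v4_parse_md5_keys(); a key configured on the listener is what
 * tcp_v4_syn_recv_sock() above copies onto each accepted child. The
 * header choice and the peer argument are assumptions of the sketch,
 * and error handling is elided.
 */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>		/* struct tcp_md5sig, TCP_MD5SIG */

static int install_md5_key(int fd, const struct sockaddr_in *peer,
			   const void *key, int keylen)
{
	struct tcp_md5sig sig;

	memset(&sig, 0, sizeof(sig));
	memcpy(&sig.tcpm_addr, peer, sizeof(*peer));
	sig.tcpm_keylen = keylen;	/* must be <= TCP_MD5SIG_MAXKEYLEN */
	memcpy(sig.tcpm_key, key, keylen);
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &sig, sizeof(sig));
}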
1da177e4 1640
079096f1 1641static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1da177e4 1642{
079096f1 1643#ifdef CONFIG_SYN_COOKIES
52452c54 1644 const struct tcphdr *th = tcp_hdr(skb);
1da177e4 1645
af9b4738 1646 if (!th->syn)
461b74c3 1647 sk = cookie_v4_check(sk, skb);
1da177e4
LT
1648#endif
1649 return sk;
1650}
1651
9349d600
PP
1652u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1653 struct tcphdr *th, u32 *cookie)
1654{
1655 u16 mss = 0;
1656#ifdef CONFIG_SYN_COOKIES
1657 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1658 &tcp_request_sock_ipv4_ops, sk, th);
1659 if (mss) {
1660 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1661 tcp_synq_overflow(sk);
1662 }
1663#endif
1664 return mss;
1665}
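/* tcp_v4_get_syncookie() above is the skb-less variant used by the BPF
 * syncookie helper path: it derives the MSS and the cookie straight
 * from the supplied headers and records the listener overflow, without
 * allocating a request sock.
 */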
1666
bbd807df
BV
1667INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1668 u32));
1da177e4 1669/* The socket must have its spinlock held when we get
e994b2f0 1670 * here, unless it is a TCP_LISTEN socket.
1da177e4
LT
1671 *
1672 * We have a potential double-lock case here, so even when
1673 * doing backlog processing we use the BH locking scheme.
1674 * This is because we cannot sleep with the original spinlock
1675 * held.
1676 */
1677int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1678{
cfb6eeb4 1679 struct sock *rsk;
cfb6eeb4 1680
1da177e4 1681 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
404e0a8b
ED
1682 struct dst_entry *dst = sk->sk_rx_dst;
1683
bdeab991 1684 sock_rps_save_rxhash(sk, skb);
3d97379a 1685 sk_mark_napi_id(sk, skb);
404e0a8b 1686 if (dst) {
505fbcf0 1687 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
bbd807df
BV
1688 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1689 dst, 0)) {
92101b3b
DM
1690 dst_release(dst);
1691 sk->sk_rx_dst = NULL;
1692 }
1693 }
3d97d88e 1694 tcp_rcv_established(sk, skb);
1da177e4
LT
1695 return 0;
1696 }
1697
12e25e10 1698 if (tcp_checksum_complete(skb))
1da177e4
LT
1699 goto csum_err;
1700
1701 if (sk->sk_state == TCP_LISTEN) {
079096f1
ED
1702 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1703
1da177e4
LT
1704 if (!nsk)
1705 goto discard;
1da177e4 1706 if (nsk != sk) {
cfb6eeb4
YH
1707 if (tcp_child_process(sk, nsk, skb)) {
1708 rsk = nsk;
1da177e4 1709 goto reset;
cfb6eeb4 1710 }
1da177e4
LT
1711 return 0;
1712 }
ca55158c 1713 } else
bdeab991 1714 sock_rps_save_rxhash(sk, skb);
ca55158c 1715
72ab4a86 1716 if (tcp_rcv_state_process(sk, skb)) {
cfb6eeb4 1717 rsk = sk;
1da177e4 1718 goto reset;
cfb6eeb4 1719 }
1da177e4
LT
1720 return 0;
1721
1722reset:
cfb6eeb4 1723 tcp_v4_send_reset(rsk, skb);
1da177e4
LT
1724discard:
1725 kfree_skb(skb);
1726 /* Be careful here. If this function gets more complicated and
1727 * gcc suffers from register pressure on the x86, sk (in %ebx)
1728 * might be destroyed here. This current version compiles correctly,
1729 * but you have been warned.
1730 */
1731 return 0;
1732
1733csum_err:
709c0314 1734 trace_tcp_bad_csum(skb);
c10d9310
ED
1735 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1736 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1da177e4
LT
1737 goto discard;
1738}
4bc2f18b 1739EXPORT_SYMBOL(tcp_v4_do_rcv);
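/* To summarize tcp_v4_do_rcv() above: ESTABLISHED sockets take the
 * tcp_rcv_established() fast path with the cached rx_dst revalidated,
 * LISTEN sockets may hand the skb to a child via tcp_v4_cookie_check()
 * and tcp_child_process(), and all other states fall through to
 * tcp_rcv_state_process(); a failure in the last two paths sends a
 * RST to the originator.
 */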
1da177e4 1740
7487449c 1741int tcp_v4_early_demux(struct sk_buff *skb)
41063e9d 1742{
41063e9d
DM
1743 const struct iphdr *iph;
1744 const struct tcphdr *th;
1745 struct sock *sk;
41063e9d 1746
41063e9d 1747 if (skb->pkt_type != PACKET_HOST)
7487449c 1748 return 0;
41063e9d 1749
45f00f99 1750 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
7487449c 1751 return 0;
41063e9d
DM
1752
1753 iph = ip_hdr(skb);
45f00f99 1754 th = tcp_hdr(skb);
41063e9d
DM
1755
1756 if (th->doff < sizeof(struct tcphdr) / 4)
7487449c 1757 return 0;
41063e9d 1758
45f00f99 1759 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
41063e9d 1760 iph->saddr, th->source,
7011d085 1761 iph->daddr, ntohs(th->dest),
3fa6f616 1762 skb->skb_iif, inet_sdif(skb));
41063e9d
DM
1763 if (sk) {
1764 skb->sk = sk;
1765 skb->destructor = sock_edemux;
f7e4eb03 1766 if (sk_fullsock(sk)) {
d0c294c5 1767 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
505fbcf0 1768
41063e9d
DM
1769 if (dst)
1770 dst = dst_check(dst, 0);
92101b3b 1771 if (dst &&
505fbcf0 1772 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
92101b3b 1773 skb_dst_set_noref(skb, dst);
41063e9d
DM
1774 }
1775 }
7487449c 1776 return 0;
41063e9d
DM
1777}
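/* tcp_v4_early_demux() above runs from the IPv4 receive path (via the
 * protocol early_demux hook, before the routing decision) so that an
 * established socket's cached rx_dst can be reused instead of doing a
 * full route lookup; the net.ipv4.tcp_early_demux sysctl can disable
 * this shortcut.
 */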
1778
c9c33212
ED
1779bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1780{
8265792b 1781 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
b160c285 1782 u32 tail_gso_size, tail_gso_segs;
4f693b55
ED
1783 struct skb_shared_info *shinfo;
1784 const struct tcphdr *th;
1785 struct tcphdr *thtail;
1786 struct sk_buff *tail;
1787 unsigned int hdrlen;
1788 bool fragstolen;
1789 u32 gso_segs;
b160c285 1790 u32 gso_size;
4f693b55 1791 int delta;
c9c33212
ED
1792
1793 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1794 * we can fix skb->truesize to its real value to avoid future drops.
1795 * This is valid because skb is not yet charged to the socket.
1796 * It has been noticed that pure SACK packets were sometimes dropped
1797 * (if cooked by drivers without the copybreak feature).
1798 */
60b1af33 1799 skb_condense(skb);
c9c33212 1800
ade9628e
ED
1801 skb_dst_drop(skb);
1802
4f693b55
ED
1803 if (unlikely(tcp_checksum_complete(skb))) {
1804 bh_unlock_sock(sk);
709c0314 1805 trace_tcp_bad_csum(skb);
4f693b55
ED
1806 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1807 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1808 return true;
1809 }
1810
1811 /* Attempt coalescing to last skb in backlog, even if we are
1812 * above the limits.
1813 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1814 */
1815 th = (const struct tcphdr *)skb->data;
1816 hdrlen = th->doff * 4;
4f693b55
ED
1817
1818 tail = sk->sk_backlog.tail;
1819 if (!tail)
1820 goto no_coalesce;
1821 thtail = (struct tcphdr *)tail->data;
1822
1823 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1824 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1825 ((TCP_SKB_CB(tail)->tcp_flags |
ca2fe295
ED
1826 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1827 !((TCP_SKB_CB(tail)->tcp_flags &
1828 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
4f693b55
ED
1829 ((TCP_SKB_CB(tail)->tcp_flags ^
1830 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1831#ifdef CONFIG_TLS_DEVICE
1832 tail->decrypted != skb->decrypted ||
1833#endif
1834 thtail->doff != th->doff ||
1835 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1836 goto no_coalesce;
1837
1838 __skb_pull(skb, hdrlen);
b160c285
ED
1839
1840 shinfo = skb_shinfo(skb);
1841 gso_size = shinfo->gso_size ?: skb->len;
1842 gso_segs = shinfo->gso_segs ?: 1;
1843
1844 shinfo = skb_shinfo(tail);
1845 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1846 tail_gso_segs = shinfo->gso_segs ?: 1;
1847
4f693b55 1848 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
4f693b55
ED
1849 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1850
86bccd03 1851 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
4f693b55 1852 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
86bccd03
ED
1853 thtail->window = th->window;
1854 }
4f693b55 1855
ca2fe295
ED
1856 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1857 * thtail->fin, so that the fast path in tcp_rcv_established()
1858 * is not entered if we append a packet with a FIN.
1859 * SYN, RST, URG are not present.
1860 * ACK is set on both packets.
1861 * PSH : we do not really care in TCP stack,
1862 * at least for 'GRO' packets.
1863 */
1864 thtail->fin |= th->fin;
4f693b55
ED
1865 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1866
1867 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1868 TCP_SKB_CB(tail)->has_rxtstamp = true;
1869 tail->tstamp = skb->tstamp;
1870 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1871 }
1872
1873 /* Not as strict as GRO. We only need to carry the max mss value */
b160c285
ED
1874 shinfo->gso_size = max(gso_size, tail_gso_size);
1875 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
4f693b55
ED
1876
1877 sk->sk_backlog.len += delta;
1878 __NET_INC_STATS(sock_net(sk),
1879 LINUX_MIB_TCPBACKLOGCOALESCE);
1880 kfree_skb_partial(skb, fragstolen);
1881 return false;
1882 }
1883 __skb_push(skb, hdrlen);
1884
1885no_coalesce:
1886 /* Only the socket owner can try to collapse/prune rx queues
1887 * to reduce memory overhead, so add a little headroom here.
1888 * Only a few socket backlogs are likely to be non-empty at once.
1889 */
1890 limit += 64*1024;
1891
c9c33212
ED
1892 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1893 bh_unlock_sock(sk);
1894 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1895 return true;
1896 }
1897 return false;
1898}
1899EXPORT_SYMBOL(tcp_add_backlog);
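/* A minimal sketch of the backlog budget enforced above when no
 * coalescing happens: the socket owner gets rcvbuf plus sndbuf plus a
 * fixed 64 KB of headroom before sk_add_backlog() starts dropping.
 * The helper name is illustrative, not part of this file.
 */
static inline u32 tcp_backlog_budget(u32 rcvbuf, u32 sndbuf)
{
	return rcvbuf + sndbuf + 64 * 1024;
}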
1900
ac6e7800
ED
1901int tcp_filter(struct sock *sk, struct sk_buff *skb)
1902{
1903 struct tcphdr *th = (struct tcphdr *)skb->data;
ac6e7800 1904
f2feaefd 1905 return sk_filter_trim_cap(sk, skb, th->doff * 4);
ac6e7800
ED
1906}
1907EXPORT_SYMBOL(tcp_filter);
1908
eeea10b8
ED
1909static void tcp_v4_restore_cb(struct sk_buff *skb)
1910{
1911 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1912 sizeof(struct inet_skb_parm));
1913}
1914
1915static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1916 const struct tcphdr *th)
1917{
1918 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1919 * barrier() makes sure the compiler won't play fool^Waliasing games.
1920 */
1921 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1922 sizeof(struct inet_skb_parm));
1923 barrier();
1924
1925 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1926 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1927 skb->len - th->doff * 4);
1928 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1929 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1930 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1931 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1932 TCP_SKB_CB(skb)->sacked = 0;
1933 TCP_SKB_CB(skb)->has_rxtstamp =
1934 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1935}
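/* Worked example of the end_seq arithmetic above: SYN and FIN each
 * occupy one unit of sequence space, so a segment with seq = 1000,
 * FIN set, skb->len = 120 and a 20-byte TCP header (th->doff == 5)
 * ends at end_seq = 1000 + 0 (syn) + 1 (fin) + (120 - 20) = 1101.
 */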
1936
1da177e4
LT
1937/*
1938 * From tcp_input.c
1939 */
1940
1941int tcp_v4_rcv(struct sk_buff *skb)
1942{
3b24d854 1943 struct net *net = dev_net(skb->dev);
8b27dae5 1944 struct sk_buff *skb_to_free;
3fa6f616 1945 int sdif = inet_sdif(skb);
534322ca 1946 int dif = inet_iif(skb);
eddc9ec5 1947 const struct iphdr *iph;
cf533ea5 1948 const struct tcphdr *th;
3b24d854 1949 bool refcounted;
1da177e4
LT
1950 struct sock *sk;
1951 int ret;
1952
1953 if (skb->pkt_type != PACKET_HOST)
1954 goto discard_it;
1955
1956 /* Count it even if it's bad */
90bbcc60 1957 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1da177e4
LT
1958
1959 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1960 goto discard_it;
1961
ea1627c2 1962 th = (const struct tcphdr *)skb->data;
1da177e4 1963
ea1627c2 1964 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1da177e4
LT
1965 goto bad_packet;
1966 if (!pskb_may_pull(skb, th->doff * 4))
1967 goto discard_it;
1968
1969 /* An explanation is required here, I think.
1970 * Packet length and doff are validated by header prediction,
caa20d9a 1971 * provided the case of th->doff==0 is eliminated.
1da177e4 1972 * So, we defer the checks. */
ed70fcfc
TH
1973
1974 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1975 goto csum_error;
1da177e4 1976
ea1627c2 1977 th = (const struct tcphdr *)skb->data;
eddc9ec5 1978 iph = ip_hdr(skb);
4bdc3d66 1979lookup:
a583636a 1980 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
3fa6f616 1981 th->dest, sdif, &refcounted);
1da177e4
LT
1982 if (!sk)
1983 goto no_tcp_socket;
1984
bb134d5d
ED
1985process:
1986 if (sk->sk_state == TCP_TIME_WAIT)
1987 goto do_time_wait;
1988
079096f1
ED
1989 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1990 struct request_sock *req = inet_reqsk(sk);
e0f9759f 1991 bool req_stolen = false;
7716682c 1992 struct sock *nsk;
079096f1
ED
1993
1994 sk = req->rsk_listener;
534322ca 1995 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
e65c332d 1996 sk_drops_add(sk, skb);
72923555
ED
1997 reqsk_put(req);
1998 goto discard_it;
1999 }
4fd44a98
FL
2000 if (tcp_checksum_complete(skb)) {
2001 reqsk_put(req);
2002 goto csum_error;
2003 }
7716682c 2004 if (unlikely(sk->sk_state != TCP_LISTEN)) {
d4f2c86b
KI
2005 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2006 if (!nsk) {
2007 inet_csk_reqsk_queue_drop_and_put(sk, req);
2008 goto lookup;
2009 }
2010 sk = nsk;
2011 /* reuseport_migrate_sock() has already held one sk_refcnt
2012 * before returning.
2013 */
2014 } else {
2015 /* We own a reference on the listener, increase it again
2016 * as we might lose it too soon.
2017 */
2018 sock_hold(sk);
4bdc3d66 2019 }
3b24d854 2020 refcounted = true;
1f3b359f 2021 nsk = NULL;
eeea10b8
ED
2022 if (!tcp_filter(sk, skb)) {
2023 th = (const struct tcphdr *)skb->data;
2024 iph = ip_hdr(skb);
2025 tcp_v4_fill_cb(skb, iph, th);
e0f9759f 2026 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
eeea10b8 2027 }
079096f1
ED
2028 if (!nsk) {
2029 reqsk_put(req);
e0f9759f
ED
2030 if (req_stolen) {
2031 /* Another cpu got exclusive access to req
2032 * and created a full blown socket.
2033 * Try to feed this packet to this socket
2034 * instead of discarding it.
2035 */
2036 tcp_v4_restore_cb(skb);
2037 sock_put(sk);
2038 goto lookup;
2039 }
7716682c 2040 goto discard_and_relse;
079096f1
ED
2041 }
2042 if (nsk == sk) {
079096f1 2043 reqsk_put(req);
eeea10b8 2044 tcp_v4_restore_cb(skb);
079096f1
ED
2045 } else if (tcp_child_process(sk, nsk, skb)) {
2046 tcp_v4_send_reset(nsk, skb);
7716682c 2047 goto discard_and_relse;
079096f1 2048 } else {
7716682c 2049 sock_put(sk);
079096f1
ED
2050 return 0;
2051 }
2052 }
6cce09f8 2053 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
02a1d6e7 2054 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 2055 goto discard_and_relse;
6cce09f8 2056 }
d218d111 2057
1da177e4
LT
2058 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2059 goto discard_and_relse;
9ea88a15 2060
534322ca 2061 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
9ea88a15 2062 goto discard_and_relse;
9ea88a15 2063
895b5c9f 2064 nf_reset_ct(skb);
1da177e4 2065
ac6e7800 2066 if (tcp_filter(sk, skb))
1da177e4 2067 goto discard_and_relse;
ac6e7800
ED
2068 th = (const struct tcphdr *)skb->data;
2069 iph = ip_hdr(skb);
eeea10b8 2070 tcp_v4_fill_cb(skb, iph, th);
1da177e4
LT
2071
2072 skb->dev = NULL;
2073
e994b2f0
ED
2074 if (sk->sk_state == TCP_LISTEN) {
2075 ret = tcp_v4_do_rcv(sk, skb);
2076 goto put_and_return;
2077 }
2078
2079 sk_incoming_cpu_update(sk);
2080
c6366184 2081 bh_lock_sock_nested(sk);
a44d6eac 2082 tcp_segs_in(tcp_sk(sk), skb);
1da177e4
LT
2083 ret = 0;
2084 if (!sock_owned_by_user(sk)) {
8b27dae5
ED
2085 skb_to_free = sk->sk_rx_skb_cache;
2086 sk->sk_rx_skb_cache = NULL;
e7942d06 2087 ret = tcp_v4_do_rcv(sk, skb);
8b27dae5
ED
2088 } else {
2089 if (tcp_add_backlog(sk, skb))
2090 goto discard_and_relse;
2091 skb_to_free = NULL;
6b03a53a 2092 }
1da177e4 2093 bh_unlock_sock(sk);
8b27dae5
ED
2094 if (skb_to_free)
2095 __kfree_skb(skb_to_free);
1da177e4 2096
e994b2f0 2097put_and_return:
3b24d854
ED
2098 if (refcounted)
2099 sock_put(sk);
1da177e4
LT
2100
2101 return ret;
2102
2103no_tcp_socket:
2104 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2105 goto discard_it;
2106
eeea10b8
ED
2107 tcp_v4_fill_cb(skb, iph, th);
2108
12e25e10 2109 if (tcp_checksum_complete(skb)) {
6a5dc9e5 2110csum_error:
709c0314 2111 trace_tcp_bad_csum(skb);
90bbcc60 2112 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1da177e4 2113bad_packet:
90bbcc60 2114 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1da177e4 2115 } else {
cfb6eeb4 2116 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
2117 }
2118
2119discard_it:
2120 /* Discard frame. */
2121 kfree_skb(skb);
e905a9ed 2122 return 0;
1da177e4
LT
2123
2124discard_and_relse:
532182cd 2125 sk_drops_add(sk, skb);
3b24d854
ED
2126 if (refcounted)
2127 sock_put(sk);
1da177e4
LT
2128 goto discard_it;
2129
2130do_time_wait:
2131 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 2132 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
2133 goto discard_it;
2134 }
2135
eeea10b8
ED
2136 tcp_v4_fill_cb(skb, iph, th);
2137
6a5dc9e5
ED
2138 if (tcp_checksum_complete(skb)) {
2139 inet_twsk_put(inet_twsk(sk));
2140 goto csum_error;
1da177e4 2141 }
9469c7b4 2142 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 2143 case TCP_TW_SYN: {
c346dca1 2144 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
a583636a
CG
2145 &tcp_hashinfo, skb,
2146 __tcp_hdrlen(th),
da5e3630 2147 iph->saddr, th->source,
eddc9ec5 2148 iph->daddr, th->dest,
3fa6f616
DA
2149 inet_iif(skb),
2150 sdif);
1da177e4 2151 if (sk2) {
dbe7faa4 2152 inet_twsk_deschedule_put(inet_twsk(sk));
1da177e4 2153 sk = sk2;
eeea10b8 2154 tcp_v4_restore_cb(skb);
3b24d854 2155 refcounted = false;
1da177e4
LT
2156 goto process;
2157 }
1da177e4 2158 }
fcfd6dfa 2159 /* to ACK */
a8eceea8 2160 fallthrough;
1da177e4
LT
2161 case TCP_TW_ACK:
2162 tcp_v4_timewait_ack(sk, skb);
2163 break;
2164 case TCP_TW_RST:
271c3b9b
FW
2165 tcp_v4_send_reset(sk, skb);
2166 inet_twsk_deschedule_put(inet_twsk(sk));
2167 goto discard_it;
1da177e4
LT
2168 case TCP_TW_SUCCESS:;
2169 }
2170 goto discard_it;
2171}
2172
ccb7c410
DM
2173static struct timewait_sock_ops tcp_timewait_sock_ops = {
2174 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2175 .twsk_unique = tcp_twsk_unique,
2176 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 2177};
1da177e4 2178
63d02d15 2179void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5d299f3d
ED
2180{
2181 struct dst_entry *dst = skb_dst(skb);
2182
5037e9ef 2183 if (dst && dst_hold_safe(dst)) {
ca777eff
ED
2184 sk->sk_rx_dst = dst;
2185 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2186 }
5d299f3d 2187}
63d02d15 2188EXPORT_SYMBOL(inet_sk_rx_dst_set);
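/* inet_sk_rx_dst_set() above caches the (refcounted) input route in
 * sk->sk_rx_dst; both tcp_v4_do_rcv() and tcp_v4_early_demux()
 * revalidate it via dst->ops->check before reusing it, so a stale
 * route only costs one extra lookup.
 */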
5d299f3d 2189
3b401a81 2190const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
2191 .queue_xmit = ip_queue_xmit,
2192 .send_check = tcp_v4_send_check,
2193 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 2194 .sk_rx_dst_set = inet_sk_rx_dst_set,
543d9cfe
ACM
2195 .conn_request = tcp_v4_conn_request,
2196 .syn_recv_sock = tcp_v4_syn_recv_sock,
543d9cfe
ACM
2197 .net_header_len = sizeof(struct iphdr),
2198 .setsockopt = ip_setsockopt,
2199 .getsockopt = ip_getsockopt,
2200 .addr2sockaddr = inet_csk_addr2sockaddr,
2201 .sockaddr_len = sizeof(struct sockaddr_in),
4fab9071 2202 .mtu_reduced = tcp_v4_mtu_reduced,
1da177e4 2203};
4bc2f18b 2204EXPORT_SYMBOL(ipv4_specific);
1da177e4 2205
cfb6eeb4 2206#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 2207static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 2208 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 2209 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 2210 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 2211};
b6332e6c 2212#endif
cfb6eeb4 2213
1da177e4
LT
2214/* NOTE: A lot of things are set to zero explicitly by the call to
2215 * sk_alloc(), so they need not be done here.
2216 */
2217static int tcp_v4_init_sock(struct sock *sk)
2218{
6687e988 2219 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 2220
900f65d3 2221 tcp_init_sock(sk);
1da177e4 2222
8292a17a 2223 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 2224
cfb6eeb4 2225#ifdef CONFIG_TCP_MD5SIG
ac807fa8 2226 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 2227#endif
1da177e4 2228
1da177e4
LT
2229 return 0;
2230}
2231
7d06b2e0 2232void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
2233{
2234 struct tcp_sock *tp = tcp_sk(sk);
2235
e1a4aa50
SL
2236 trace_tcp_destroy_sock(sk);
2237
1da177e4
LT
2238 tcp_clear_xmit_timers(sk);
2239
6687e988 2240 tcp_cleanup_congestion_control(sk);
317a76f9 2241
734942cc
DW
2242 tcp_cleanup_ulp(sk);
2243
1da177e4 2244 /* Clean up the write buffer. */
fe067e8a 2245 tcp_write_queue_purge(sk);
1da177e4 2246
cf1ef3f0
WW
2247 /* Check if we want to disable active TFO */
2248 tcp_fastopen_active_disable_ofo_check(sk);
2249
1da177e4 2250 /* Cleans up our, hopefully empty, out_of_order_queue. */
9f5afeae 2251 skb_rbtree_purge(&tp->out_of_order_queue);
1da177e4 2252
cfb6eeb4
YH
2253#ifdef CONFIG_TCP_MD5SIG
2254 /* Clean up the MD5 key list, if any */
2255 if (tp->md5sig_info) {
a915da9b 2256 tcp_clear_md5_list(sk);
fb7df5e4 2257 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
cfb6eeb4
YH
2258 tp->md5sig_info = NULL;
2259 }
2260#endif
1a2449a8 2261
1da177e4 2262 /* Clean up a referenced TCP bind bucket. */
463c84b9 2263 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 2264 inet_put_port(sk);
1da177e4 2265
d983ea6f 2266 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
435cf559 2267
cf60af03
YC
2268 /* If socket is aborted during connect operation */
2269 tcp_free_fastopen_req(tp);
1fba70e5 2270 tcp_fastopen_destroy_cipher(sk);
cd8ae852 2271 tcp_saved_syn_free(tp);
cf60af03 2272
180d8cd9 2273 sk_sockets_allocated_dec(sk);
1da177e4 2274}
1da177e4
LT
2275EXPORT_SYMBOL(tcp_v4_destroy_sock);
2276
2277#ifdef CONFIG_PROC_FS
2278/* Proc filesystem TCP sock list dumping. */
2279
ad2d6137
MKL
2280static unsigned short seq_file_family(const struct seq_file *seq);
2281
2282static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2283{
2284 unsigned short family = seq_file_family(seq);
2285
2286 /* AF_UNSPEC is used as a match all */
2287 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2288 net_eq(sock_net(sk), seq_file_net(seq)));
2289}
2290
b72acf45
MKL
2291/* Find a non-empty bucket (starting from st->bucket)
2292 * and return the first sk from it.
a8b690f9 2293 */
b72acf45 2294static void *listening_get_first(struct seq_file *seq)
1da177e4 2295{
5799de0b 2296 struct tcp_iter_state *st = seq->private;
1da177e4 2297
b72acf45 2298 st->offset = 0;
05c0b357
MKL
2299 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2300 struct inet_listen_hashbucket *ilb2;
2301 struct inet_connection_sock *icsk;
b72acf45 2302 struct sock *sk;
b08d4d3b 2303
05c0b357
MKL
2304 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2305 if (hlist_empty(&ilb2->head))
b72acf45
MKL
2306 continue;
2307
05c0b357
MKL
2308 spin_lock(&ilb2->lock);
2309 inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2310 sk = (struct sock *)icsk;
b72acf45
MKL
2311 if (seq_sk_match(seq, sk))
2312 return sk;
2313 }
05c0b357 2314 spin_unlock(&ilb2->lock);
1da177e4 2315 }
b72acf45
MKL
2316
2317 return NULL;
2318}
2319
2320/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2321 * If "cur" is the last one in the st->bucket,
2322 * call listening_get_first() to return the first sk of the next
2323 * non empty bucket.
a8b690f9 2324 */
1da177e4
LT
2325static void *listening_get_next(struct seq_file *seq, void *cur)
2326{
5799de0b 2327 struct tcp_iter_state *st = seq->private;
05c0b357
MKL
2328 struct inet_listen_hashbucket *ilb2;
2329 struct inet_connection_sock *icsk;
3b24d854 2330 struct sock *sk = cur;
1da177e4 2331
1da177e4 2332 ++st->num;
a8b690f9 2333 ++st->offset;
1da177e4 2334
05c0b357
MKL
2335 icsk = inet_csk(sk);
2336 inet_lhash2_for_each_icsk_continue(icsk) {
2337 sk = (struct sock *)icsk;
ad2d6137 2338 if (seq_sk_match(seq, sk))
3b24d854 2339 return sk;
1da177e4 2340 }
b72acf45 2341
05c0b357
MKL
2342 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2343 spin_unlock(&ilb2->lock);
b72acf45
MKL
2344 ++st->bucket;
2345 return listening_get_first(seq);
1da177e4
LT
2346}
2347
2348static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2349{
a8b690f9
TH
2350 struct tcp_iter_state *st = seq->private;
2351 void *rc;
2352
2353 st->bucket = 0;
2354 st->offset = 0;
b72acf45 2355 rc = listening_get_first(seq);
1da177e4
LT
2356
2357 while (rc && *pos) {
2358 rc = listening_get_next(seq, rc);
2359 --*pos;
2360 }
2361 return rc;
2362}
2363
05dbc7b5 2364static inline bool empty_bucket(const struct tcp_iter_state *st)
6eac5604 2365{
05dbc7b5 2366 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
6eac5604
AK
2367}
2368
a8b690f9
TH
2369/*
2370 * Get first established socket starting from bucket given in st->bucket.
2371 * If st->bucket is zero, the very first socket in the hash is returned.
2372 */
1da177e4
LT
2373static void *established_get_first(struct seq_file *seq)
2374{
5799de0b 2375 struct tcp_iter_state *st = seq->private;
b08d4d3b 2376
a8b690f9
TH
2377 st->offset = 0;
2378 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 2379 struct sock *sk;
3ab5aee7 2380 struct hlist_nulls_node *node;
9db66bdc 2381 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 2382
6eac5604
AK
2383 /* Lockless fast path for the common case of empty buckets */
2384 if (empty_bucket(st))
2385 continue;
2386
9db66bdc 2387 spin_lock_bh(lock);
3ab5aee7 2388 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
ad2d6137
MKL
2389 if (seq_sk_match(seq, sk))
2390 return sk;
1da177e4 2391 }
9db66bdc 2392 spin_unlock_bh(lock);
1da177e4 2393 }
ad2d6137
MKL
2394
2395 return NULL;
1da177e4
LT
2396}
2397
2398static void *established_get_next(struct seq_file *seq, void *cur)
2399{
2400 struct sock *sk = cur;
3ab5aee7 2401 struct hlist_nulls_node *node;
5799de0b 2402 struct tcp_iter_state *st = seq->private;
b08d4d3b 2403
1da177e4 2404 ++st->num;
a8b690f9 2405 ++st->offset;
1da177e4 2406
05dbc7b5 2407 sk = sk_nulls_next(sk);
1da177e4 2408
3ab5aee7 2409 sk_nulls_for_each_from(sk, node) {
ad2d6137 2410 if (seq_sk_match(seq, sk))
05dbc7b5 2411 return sk;
1da177e4
LT
2412 }
2413
05dbc7b5
ED
2414 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2415 ++st->bucket;
2416 return established_get_first(seq);
1da177e4
LT
2417}
2418
2419static void *established_get_idx(struct seq_file *seq, loff_t pos)
2420{
a8b690f9
TH
2421 struct tcp_iter_state *st = seq->private;
2422 void *rc;
2423
2424 st->bucket = 0;
2425 rc = established_get_first(seq);
1da177e4
LT
2426
2427 while (rc && pos) {
2428 rc = established_get_next(seq, rc);
2429 --pos;
7174259e 2430 }
1da177e4
LT
2431 return rc;
2432}
2433
2434static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2435{
2436 void *rc;
5799de0b 2437 struct tcp_iter_state *st = seq->private;
1da177e4 2438
1da177e4
LT
2439 st->state = TCP_SEQ_STATE_LISTENING;
2440 rc = listening_get_idx(seq, &pos);
2441
2442 if (!rc) {
1da177e4
LT
2443 st->state = TCP_SEQ_STATE_ESTABLISHED;
2444 rc = established_get_idx(seq, pos);
2445 }
2446
2447 return rc;
2448}
2449
a8b690f9
TH
2450static void *tcp_seek_last_pos(struct seq_file *seq)
2451{
2452 struct tcp_iter_state *st = seq->private;
525e2f9f 2453 int bucket = st->bucket;
a8b690f9
TH
2454 int offset = st->offset;
2455 int orig_num = st->num;
2456 void *rc = NULL;
2457
2458 switch (st->state) {
a8b690f9 2459 case TCP_SEQ_STATE_LISTENING:
05c0b357 2460 if (st->bucket > tcp_hashinfo.lhash2_mask)
a8b690f9
TH
2461 break;
2462 st->state = TCP_SEQ_STATE_LISTENING;
b72acf45 2463 rc = listening_get_first(seq);
525e2f9f 2464 while (offset-- && rc && bucket == st->bucket)
a8b690f9
TH
2465 rc = listening_get_next(seq, rc);
2466 if (rc)
2467 break;
2468 st->bucket = 0;
05dbc7b5 2469 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8eceea8 2470 fallthrough;
a8b690f9 2471 case TCP_SEQ_STATE_ESTABLISHED:
a8b690f9
TH
2472 if (st->bucket > tcp_hashinfo.ehash_mask)
2473 break;
2474 rc = established_get_first(seq);
525e2f9f 2475 while (offset-- && rc && bucket == st->bucket)
a8b690f9
TH
2476 rc = established_get_next(seq, rc);
2477 }
2478
2479 st->num = orig_num;
2480
2481 return rc;
2482}
2483
37d849bb 2484void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1da177e4 2485{
5799de0b 2486 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2487 void *rc;
2488
2489 if (*pos && *pos == st->last_pos) {
2490 rc = tcp_seek_last_pos(seq);
2491 if (rc)
2492 goto out;
2493 }
2494
1da177e4
LT
2495 st->state = TCP_SEQ_STATE_LISTENING;
2496 st->num = 0;
a8b690f9
TH
2497 st->bucket = 0;
2498 st->offset = 0;
2499 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2500
2501out:
2502 st->last_pos = *pos;
2503 return rc;
1da177e4 2504}
37d849bb 2505EXPORT_SYMBOL(tcp_seq_start);
1da177e4 2506
37d849bb 2507void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1da177e4 2508{
a8b690f9 2509 struct tcp_iter_state *st = seq->private;
1da177e4 2510 void *rc = NULL;
1da177e4
LT
2511
2512 if (v == SEQ_START_TOKEN) {
2513 rc = tcp_get_idx(seq, 0);
2514 goto out;
2515 }
1da177e4
LT
2516
2517 switch (st->state) {
1da177e4
LT
2518 case TCP_SEQ_STATE_LISTENING:
2519 rc = listening_get_next(seq, v);
2520 if (!rc) {
1da177e4 2521 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2522 st->bucket = 0;
2523 st->offset = 0;
1da177e4
LT
2524 rc = established_get_first(seq);
2525 }
2526 break;
2527 case TCP_SEQ_STATE_ESTABLISHED:
1da177e4
LT
2528 rc = established_get_next(seq, v);
2529 break;
2530 }
2531out:
2532 ++*pos;
a8b690f9 2533 st->last_pos = *pos;
1da177e4
LT
2534 return rc;
2535}
37d849bb 2536EXPORT_SYMBOL(tcp_seq_next);
1da177e4 2537
37d849bb 2538void tcp_seq_stop(struct seq_file *seq, void *v)
1da177e4 2539{
5799de0b 2540 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2541
2542 switch (st->state) {
1da177e4
LT
2543 case TCP_SEQ_STATE_LISTENING:
2544 if (v != SEQ_START_TOKEN)
05c0b357 2545 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
1da177e4 2546 break;
1da177e4
LT
2547 case TCP_SEQ_STATE_ESTABLISHED:
2548 if (v)
9db66bdc 2549 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2550 break;
2551 }
2552}
37d849bb 2553EXPORT_SYMBOL(tcp_seq_stop);
1da177e4 2554
d4f06873 2555static void get_openreq4(const struct request_sock *req,
aa3a0c8c 2556 struct seq_file *f, int i)
1da177e4 2557{
2e6599cb 2558 const struct inet_request_sock *ireq = inet_rsk(req);
fa76ce73 2559 long delta = req->rsk_timer.expires - jiffies;
1da177e4 2560
5e659e4c 2561 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2562 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
1da177e4 2563 i,
634fb979 2564 ireq->ir_loc_addr,
d4f06873 2565 ireq->ir_num,
634fb979
ED
2566 ireq->ir_rmt_addr,
2567 ntohs(ireq->ir_rmt_port),
1da177e4
LT
2568 TCP_SYN_RECV,
2569 0, 0, /* could print option size, but that is af dependent. */
2570 1, /* timers active (only the expire timer) */
a399a805 2571 jiffies_delta_to_clock_t(delta),
e6c022a4 2572 req->num_timeout,
aa3a0c8c
ED
2573 from_kuid_munged(seq_user_ns(f),
2574 sock_i_uid(req->rsk_listener)),
1da177e4
LT
2575 0, /* non standard timer */
2576 0, /* open_requests have no inode */
d4f06873 2577 0,
652586df 2578 req);
1da177e4
LT
2579}
2580
652586df 2581static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
1da177e4
LT
2582{
2583 int timer_active;
2584 unsigned long timer_expires;
cf533ea5 2585 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2586 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2587 const struct inet_sock *inet = inet_sk(sk);
0536fcc0 2588 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
c720c7e8
ED
2589 __be32 dest = inet->inet_daddr;
2590 __be32 src = inet->inet_rcv_saddr;
2591 __u16 destp = ntohs(inet->inet_dport);
2592 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2593 int rx_queue;
00fd38d9 2594 int state;
1da177e4 2595
6ba8a3b1 2596 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
57dde7f7 2597 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
6ba8a3b1 2598 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2599 timer_active = 1;
463c84b9
ACM
2600 timer_expires = icsk->icsk_timeout;
2601 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2602 timer_active = 4;
463c84b9 2603 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2604 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2605 timer_active = 2;
cf4c6bf8 2606 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2607 } else {
2608 timer_active = 0;
2609 timer_expires = jiffies;
2610 }
2611
986ffdfd 2612 state = inet_sk_state_load(sk);
00fd38d9 2613 if (state == TCP_LISTEN)
288efe86 2614 rx_queue = READ_ONCE(sk->sk_ack_backlog);
49d09007 2615 else
00fd38d9
ED
2616 /* Because we don't lock the socket,
2617 * we might find a transient negative value.
49d09007 2618 */
dba7d9b8 2619 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
7db48e98 2620 READ_ONCE(tp->copied_seq), 0);
49d09007 2621
5e659e4c 2622 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
652586df 2623 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
00fd38d9 2624 i, src, srcp, dest, destp, state,
0f317464 2625 READ_ONCE(tp->write_seq) - tp->snd_una,
49d09007 2626 rx_queue,
1da177e4 2627 timer_active,
a399a805 2628 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2629 icsk->icsk_retransmits,
a7cb5a49 2630 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2631 icsk->icsk_probes_out,
cf4c6bf8 2632 sock_i_ino(sk),
41c6d650 2633 refcount_read(&sk->sk_refcnt), sk,
7be87351
SH
2634 jiffies_to_clock_t(icsk->icsk_rto),
2635 jiffies_to_clock_t(icsk->icsk_ack.ato),
31954cd8 2636 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
1da177e4 2637 tp->snd_cwnd,
00fd38d9
ED
2638 state == TCP_LISTEN ?
2639 fastopenq->max_qlen :
652586df 2640 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
1da177e4
LT
2641}
2642
cf533ea5 2643static void get_timewait4_sock(const struct inet_timewait_sock *tw,
652586df 2644 struct seq_file *f, int i)
1da177e4 2645{
789f558c 2646 long delta = tw->tw_timer.expires - jiffies;
23f33c2d 2647 __be32 dest, src;
1da177e4 2648 __u16 destp, srcp;
1da177e4
LT
2649
2650 dest = tw->tw_daddr;
2651 src = tw->tw_rcv_saddr;
2652 destp = ntohs(tw->tw_dport);
2653 srcp = ntohs(tw->tw_sport);
2654
5e659e4c 2655 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2656 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
1da177e4 2657 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2658 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
41c6d650 2659 refcount_read(&tw->tw_refcnt), tw);
1da177e4
LT
2660}
2661
2662#define TMPSZ 150
2663
2664static int tcp4_seq_show(struct seq_file *seq, void *v)
2665{
5799de0b 2666 struct tcp_iter_state *st;
05dbc7b5 2667 struct sock *sk = v;
1da177e4 2668
652586df 2669 seq_setwidth(seq, TMPSZ - 1);
1da177e4 2670 if (v == SEQ_START_TOKEN) {
652586df 2671 seq_puts(seq, " sl local_address rem_address st tx_queue "
1da177e4
LT
2672 "rx_queue tr tm->when retrnsmt uid timeout "
2673 "inode");
2674 goto out;
2675 }
2676 st = seq->private;
2677
079096f1
ED
2678 if (sk->sk_state == TCP_TIME_WAIT)
2679 get_timewait4_sock(v, seq, st->num);
2680 else if (sk->sk_state == TCP_NEW_SYN_RECV)
aa3a0c8c 2681 get_openreq4(v, seq, st->num);
079096f1
ED
2682 else
2683 get_tcp4_sock(v, seq, st->num);
1da177e4 2684out:
652586df 2685 seq_pad(seq, '\n');
1da177e4
LT
2686 return 0;
2687}
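/* A minimal userspace sketch of consuming the seq_file output above:
 * every /proc/net/tcp line is padded to TMPSZ - 1 columns, addresses
 * and ports are hexadecimal, and the header matches the literal
 * printed by tcp4_seq_show(). The parsing here is deliberately
 * simplistic.
 */
#include <stdio.h>

static void dump_tcp4_states(void)
{
	unsigned int st;
	char line[256];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return;
	if (fgets(line, sizeof(line), f))	/* skip the header line */
		while (fgets(line, sizeof(line), f))
			if (sscanf(line, "%*d: %*x:%*x %*x:%*x %x", &st) == 1)
				printf("state %#x\n", st);	/* e.g. 0x0A == TCP_LISTEN */
	fclose(f);
}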
2688
52d87d5f 2689#ifdef CONFIG_BPF_SYSCALL
04c7820b
MKL
2690struct bpf_tcp_iter_state {
2691 struct tcp_iter_state state;
2692 unsigned int cur_sk;
2693 unsigned int end_sk;
2694 unsigned int max_sk;
2695 struct sock **batch;
2696 bool st_bucket_done;
2697};
2698
52d87d5f
YS
2699struct bpf_iter__tcp {
2700 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2701 __bpf_md_ptr(struct sock_common *, sk_common);
2702 uid_t uid __aligned(8);
2703};
2704
2705static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2706 struct sock_common *sk_common, uid_t uid)
2707{
2708 struct bpf_iter__tcp ctx;
2709
2710 meta->seq_num--; /* skip SEQ_START_TOKEN */
2711 ctx.meta = meta;
2712 ctx.sk_common = sk_common;
2713 ctx.uid = uid;
2714 return bpf_iter_run_prog(prog, &ctx);
2715}
2716
04c7820b
MKL
2717static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2718{
2719 while (iter->cur_sk < iter->end_sk)
2720 sock_put(iter->batch[iter->cur_sk++]);
2721}
2722
2723static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2724 unsigned int new_batch_sz)
2725{
2726 struct sock **new_batch;
2727
2728 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2729 GFP_USER | __GFP_NOWARN);
2730 if (!new_batch)
2731 return -ENOMEM;
2732
2733 bpf_iter_tcp_put_batch(iter);
2734 kvfree(iter->batch);
2735 iter->batch = new_batch;
2736 iter->max_sk = new_batch_sz;
2737
2738 return 0;
2739}
2740
2741static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2742 struct sock *start_sk)
2743{
2744 struct bpf_tcp_iter_state *iter = seq->private;
2745 struct tcp_iter_state *st = &iter->state;
2746 struct inet_connection_sock *icsk;
2747 unsigned int expected = 1;
2748 struct sock *sk;
2749
2750 sock_hold(start_sk);
2751 iter->batch[iter->end_sk++] = start_sk;
2752
2753 icsk = inet_csk(start_sk);
2754 inet_lhash2_for_each_icsk_continue(icsk) {
2755 sk = (struct sock *)icsk;
2756 if (seq_sk_match(seq, sk)) {
2757 if (iter->end_sk < iter->max_sk) {
2758 sock_hold(sk);
2759 iter->batch[iter->end_sk++] = sk;
2760 }
2761 expected++;
2762 }
2763 }
2764 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2765
2766 return expected;
2767}
2768
2769static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2770 struct sock *start_sk)
2771{
2772 struct bpf_tcp_iter_state *iter = seq->private;
2773 struct tcp_iter_state *st = &iter->state;
2774 struct hlist_nulls_node *node;
2775 unsigned int expected = 1;
2776 struct sock *sk;
2777
2778 sock_hold(start_sk);
2779 iter->batch[iter->end_sk++] = start_sk;
2780
2781 sk = sk_nulls_next(start_sk);
2782 sk_nulls_for_each_from(sk, node) {
2783 if (seq_sk_match(seq, sk)) {
2784 if (iter->end_sk < iter->max_sk) {
2785 sock_hold(sk);
2786 iter->batch[iter->end_sk++] = sk;
2787 }
2788 expected++;
2789 }
2790 }
2791 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2792
2793 return expected;
2794}
2795
2796static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2797{
2798 struct bpf_tcp_iter_state *iter = seq->private;
2799 struct tcp_iter_state *st = &iter->state;
2800 unsigned int expected;
2801 bool resized = false;
2802 struct sock *sk;
2803
2804 /* The st->bucket is done. Directly advance to the next
2805 * bucket instead of having tcp_seek_last_pos() skip sockets
2806 * one by one in the current bucket, only to find out
2807 * it has to advance to the next bucket.
2808 */
2809 if (iter->st_bucket_done) {
2810 st->offset = 0;
2811 st->bucket++;
2812 if (st->state == TCP_SEQ_STATE_LISTENING &&
2813 st->bucket > tcp_hashinfo.lhash2_mask) {
2814 st->state = TCP_SEQ_STATE_ESTABLISHED;
2815 st->bucket = 0;
2816 }
2817 }
2818
2819again:
2820 /* Get a new batch */
2821 iter->cur_sk = 0;
2822 iter->end_sk = 0;
2823 iter->st_bucket_done = false;
2824
2825 sk = tcp_seek_last_pos(seq);
2826 if (!sk)
2827 return NULL; /* Done */
2828
2829 if (st->state == TCP_SEQ_STATE_LISTENING)
2830 expected = bpf_iter_tcp_listening_batch(seq, sk);
2831 else
2832 expected = bpf_iter_tcp_established_batch(seq, sk);
2833
2834 if (iter->end_sk == expected) {
2835 iter->st_bucket_done = true;
2836 return sk;
2837 }
2838
2839 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2840 resized = true;
2841 goto again;
2842 }
2843
2844 return sk;
2845}
2846
2847static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2848{
2849 /* bpf iter does not support lseek, so it always
2850 * continues from where it was stop()-ped.
2851 */
2852 if (*pos)
2853 return bpf_iter_tcp_batch(seq);
2854
2855 return SEQ_START_TOKEN;
2856}
2857
2858static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2859{
2860 struct bpf_tcp_iter_state *iter = seq->private;
2861 struct tcp_iter_state *st = &iter->state;
2862 struct sock *sk;
2863
2864 /* Whenever seq_next() is called, the iter->cur_sk is
2865 * done with seq_show(), so advance to the next sk in
2866 * the batch.
2867 */
2868 if (iter->cur_sk < iter->end_sk) {
2869 /* Keeping st->num consistent in tcp_iter_state.
2870 * bpf_iter_tcp does not use st->num.
2871 * meta.seq_num is used instead.
2872 */
2873 st->num++;
2874 /* Move st->offset to the next sk in the bucket such that
2875 * the future start() will resume at st->offset in
2876 * st->bucket. See tcp_seek_last_pos().
2877 */
2878 st->offset++;
2879 sock_put(iter->batch[iter->cur_sk++]);
2880 }
2881
2882 if (iter->cur_sk < iter->end_sk)
2883 sk = iter->batch[iter->cur_sk];
2884 else
2885 sk = bpf_iter_tcp_batch(seq);
2886
2887 ++*pos;
2888 /* Keeping st->last_pos consistent in tcp_iter_state.
2889 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2890 */
2891 st->last_pos = *pos;
2892 return sk;
2893}
2894
52d87d5f
YS
2895static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2896{
2897 struct bpf_iter_meta meta;
2898 struct bpf_prog *prog;
2899 struct sock *sk = v;
04c7820b 2900 bool slow;
52d87d5f 2901 uid_t uid;
04c7820b 2902 int ret;
52d87d5f
YS
2903
2904 if (v == SEQ_START_TOKEN)
2905 return 0;
2906
04c7820b
MKL
2907 if (sk_fullsock(sk))
2908 slow = lock_sock_fast(sk);
2909
2910 if (unlikely(sk_unhashed(sk))) {
2911 ret = SEQ_SKIP;
2912 goto unlock;
2913 }
2914
52d87d5f
YS
2915 if (sk->sk_state == TCP_TIME_WAIT) {
2916 uid = 0;
2917 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2918 const struct request_sock *req = v;
2919
2920 uid = from_kuid_munged(seq_user_ns(seq),
2921 sock_i_uid(req->rsk_listener));
2922 } else {
2923 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2924 }
2925
2926 meta.seq = seq;
2927 prog = bpf_iter_get_info(&meta, false);
04c7820b
MKL
2928 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2929
2930unlock:
2931 if (sk_fullsock(sk))
2932 unlock_sock_fast(sk, slow);
2933 return ret;
2934
52d87d5f
YS
2935}
2936
2937static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2938{
04c7820b 2939 struct bpf_tcp_iter_state *iter = seq->private;
52d87d5f
YS
2940 struct bpf_iter_meta meta;
2941 struct bpf_prog *prog;
2942
2943 if (!v) {
2944 meta.seq = seq;
2945 prog = bpf_iter_get_info(&meta, true);
2946 if (prog)
2947 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2948 }
2949
04c7820b
MKL
2950 if (iter->cur_sk < iter->end_sk) {
2951 bpf_iter_tcp_put_batch(iter);
2952 iter->st_bucket_done = false;
2953 }
52d87d5f
YS
2954}
2955
2956static const struct seq_operations bpf_iter_tcp_seq_ops = {
2957 .show = bpf_iter_tcp_seq_show,
04c7820b
MKL
2958 .start = bpf_iter_tcp_seq_start,
2959 .next = bpf_iter_tcp_seq_next,
52d87d5f
YS
2960 .stop = bpf_iter_tcp_seq_stop,
2961};
2962#endif
ad2d6137
MKL
2963static unsigned short seq_file_family(const struct seq_file *seq)
2964{
62001372 2965 const struct tcp_seq_afinfo *afinfo;
ad2d6137 2966
62001372 2967#ifdef CONFIG_BPF_SYSCALL
ad2d6137 2968 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */
62001372 2969 if (seq->op == &bpf_iter_tcp_seq_ops)
ad2d6137 2970 return AF_UNSPEC;
52d87d5f
YS
2971#endif
2972
ad2d6137
MKL
2973 /* Iterated from proc fs */
2974 afinfo = PDE_DATA(file_inode(seq->file));
2975 return afinfo->family;
2976}
52d87d5f 2977
37d849bb
CH
2978static const struct seq_operations tcp4_seq_ops = {
2979 .show = tcp4_seq_show,
2980 .start = tcp_seq_start,
2981 .next = tcp_seq_next,
2982 .stop = tcp_seq_stop,
2983};
2984
1da177e4 2985static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4 2986 .family = AF_INET,
1da177e4
LT
2987};
2988
2c8c1e72 2989static int __net_init tcp4_proc_init_net(struct net *net)
757764f6 2990{
c3506372
CH
2991 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2992 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
37d849bb
CH
2993 return -ENOMEM;
2994 return 0;
757764f6
PE
2995}
2996
2c8c1e72 2997static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6 2998{
37d849bb 2999 remove_proc_entry("tcp", net->proc_net);
757764f6
PE
3000}
3001
3002static struct pernet_operations tcp4_net_ops = {
3003 .init = tcp4_proc_init_net,
3004 .exit = tcp4_proc_exit_net,
3005};
3006
1da177e4
LT
3007int __init tcp4_proc_init(void)
3008{
757764f6 3009 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
3010}
3011
3012void tcp4_proc_exit(void)
3013{
757764f6 3014 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
3015}
3016#endif /* CONFIG_PROC_FS */
3017
d3cd4924
ED
3018/* @wake is one when sk_stream_write_space() calls us.
3019 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3020 * This mimics the strategy used in sock_def_write_space().
3021 */
3022bool tcp_stream_memory_free(const struct sock *sk, int wake)
3023{
3024 const struct tcp_sock *tp = tcp_sk(sk);
3025 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3026 READ_ONCE(tp->snd_nxt);
3027
3028 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3029}
3030EXPORT_SYMBOL(tcp_stream_memory_free);
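/* A minimal userspace sketch of the knob behind the check above:
 * TCP_NOTSENT_LOWAT bounds how much not-yet-sent data a socket may
 * buffer before poll()/epoll stop reporting writability. The 128 KB
 * value is purely illustrative, and the option constant is assumed to
 * come from <netinet/tcp.h> (otherwise <linux/tcp.h>).
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int set_notsent_lowat(int fd)
{
	unsigned int lowat = 128 * 1024;

	return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
			  &lowat, sizeof(lowat));
}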
3031
1da177e4
LT
3032struct proto tcp_prot = {
3033 .name = "TCP",
3034 .owner = THIS_MODULE,
3035 .close = tcp_close,
d74bad4e 3036 .pre_connect = tcp_v4_pre_connect,
1da177e4
LT
3037 .connect = tcp_v4_connect,
3038 .disconnect = tcp_disconnect,
463c84b9 3039 .accept = inet_csk_accept,
1da177e4
LT
3040 .ioctl = tcp_ioctl,
3041 .init = tcp_v4_init_sock,
3042 .destroy = tcp_v4_destroy_sock,
3043 .shutdown = tcp_shutdown,
3044 .setsockopt = tcp_setsockopt,
3045 .getsockopt = tcp_getsockopt,
9cacf81f 3046 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
4b9d07a4 3047 .keepalive = tcp_set_keepalive,
1da177e4 3048 .recvmsg = tcp_recvmsg,
7ba42910
CG
3049 .sendmsg = tcp_sendmsg,
3050 .sendpage = tcp_sendpage,
1da177e4 3051 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 3052 .release_cb = tcp_release_cb,
ab1e0a13
ACM
3053 .hash = inet_hash,
3054 .unhash = inet_unhash,
3055 .get_port = inet_csk_get_port,
8a59f9d1
CW
3056#ifdef CONFIG_BPF_SYSCALL
3057 .psock_update_sk_prot = tcp_bpf_update_proto,
3058#endif
1da177e4 3059 .enter_memory_pressure = tcp_enter_memory_pressure,
06044751 3060 .leave_memory_pressure = tcp_leave_memory_pressure,
c9bee3b7 3061 .stream_memory_free = tcp_stream_memory_free,
1da177e4 3062 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 3063 .orphan_count = &tcp_orphan_count,
1da177e4
LT
3064 .memory_allocated = &tcp_memory_allocated,
3065 .memory_pressure = &tcp_memory_pressure,
a4fe34bf 3066 .sysctl_mem = sysctl_tcp_mem,
356d1833
ED
3067 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3068 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
1da177e4
LT
3069 .max_header = MAX_TCP_HEADER,
3070 .obj_size = sizeof(struct tcp_sock),
5f0d5a3a 3071 .slab_flags = SLAB_TYPESAFE_BY_RCU,
6d6ee43e 3072 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 3073 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 3074 .h.hashinfo = &tcp_hashinfo,
7ba42910 3075 .no_autobind = true,
c1e64e29 3076 .diag_destroy = tcp_abort,
1da177e4 3077};
4bc2f18b 3078EXPORT_SYMBOL(tcp_prot);
1da177e4 3079
bdbbb852
ED
3080static void __net_exit tcp_sk_exit(struct net *net)
3081{
3082 int cpu;
3083
b506bc97 3084 if (net->ipv4.tcp_congestion_control)
0baf26b0
MKL
3085 bpf_module_put(net->ipv4.tcp_congestion_control,
3086 net->ipv4.tcp_congestion_control->owner);
6670e152 3087
bdbbb852
ED
3088 for_each_possible_cpu(cpu)
3089 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
3090 free_percpu(net->ipv4.tcp_sk);
3091}
3092
046ee902
DL
3093static int __net_init tcp_sk_init(struct net *net)
3094{
fee83d09 3095 int res, cpu, cnt;
bdbbb852
ED
3096
3097 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
3098 if (!net->ipv4.tcp_sk)
3099 return -ENOMEM;
3100
3101 for_each_possible_cpu(cpu) {
3102 struct sock *sk;
3103
3104 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3105 IPPROTO_TCP, net);
3106 if (res)
3107 goto fail;
a9d6532b 3108 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
431280ee
ED
3109
3110 /* Please enforce IP_DF and IPID==0 for RST and
3111 * ACK sent in SYN-RECV and TIME-WAIT state.
3112 */
3113 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3114
bdbbb852
ED
3115 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
3116 }
49213555 3117
5d134f1c 3118 net->ipv4.sysctl_tcp_ecn = 2;
49213555
DB
3119 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3120
b0f9ca53 3121 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
5f3e2bf0 3122 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
6b58e0a5 3123 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
05cbc0db 3124 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
c04b79b6 3125 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
046ee902 3126
13b287e8 3127 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
9bd6861b 3128 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
b840d15d 3129 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
13b287e8 3130
6fa25166 3131 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
7c083ecb 3132 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
0aca737d 3133 net->ipv4.sysctl_tcp_syncookies = 1;
1043e25f 3134 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
ae5c3f40 3135 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
c6214a97 3136 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
c402d9be 3137 net->ipv4.sysctl_tcp_orphan_retries = 0;
1e579caa 3138 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
4979f2d9 3139 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
79e9fed4 3140 net->ipv4.sysctl_tcp_tw_reuse = 2;
65e6d901 3141 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
12ed8244 3142
fee83d09 3143 cnt = tcp_hashinfo.ehash_mask + 1;
743e4815 3144 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
1946e672
HY
3145 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3146
623d0c2d 3147 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
f9301034 3148 net->ipv4.sysctl_tcp_sack = 1;
9bb37ef0 3149 net->ipv4.sysctl_tcp_window_scaling = 1;
5d2ed052 3150 net->ipv4.sysctl_tcp_timestamps = 1;
2ae21cf5 3151 net->ipv4.sysctl_tcp_early_retrans = 3;
e20223f1 3152 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
b510f0d2 3153 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
e0a1e5b5 3154 net->ipv4.sysctl_tcp_retrans_collapse = 1;
c6e21803 3155 net->ipv4.sysctl_tcp_max_reordering = 300;
6496f6bd 3156 net->ipv4.sysctl_tcp_dsack = 1;
0c12654a 3157 net->ipv4.sysctl_tcp_app_win = 31;
94f0893e 3158 net->ipv4.sysctl_tcp_adv_win_scale = 1;
af9b69a7 3159 net->ipv4.sysctl_tcp_frto = 2;
4540c0cf 3160 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
d06a9904
ED
3161 /* This limits the percentage of the congestion window which we
3162 * will allow a single TSO frame to consume. Building TSO frames
3163 * which are too large can cause TCP streams to be bursty.
3164 */
3165 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
c73e5807
ED
3166 /* Default TSQ limit of 16 TSO segments */
3167 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
b530b681
ED
3168 /* rfc5961 challenge ack rate limiting */
3169 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
26e9596e 3170 net->ipv4.sysctl_tcp_min_tso_segs = 2;
bd239704 3171 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
790f00e1 3172 net->ipv4.sysctl_tcp_autocorking = 1;
4170ba6b 3173 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
23a7102a 3174 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
c26e91f8 3175 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
356d1833
ED
3176 if (net != &init_net) {
3177 memcpy(net->ipv4.sysctl_tcp_rmem,
3178 init_net.ipv4.sysctl_tcp_rmem,
3179 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3180 memcpy(net->ipv4.sysctl_tcp_wmem,
3181 init_net.ipv4.sysctl_tcp_wmem,
3182 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3183 }
6d82aa24 3184 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
a70437cc 3185 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
9c21d2fc 3186 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
e1cfcbe8 3187 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
213ad73d 3188 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3733be14 3189 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
e1cfcbe8 3190
6670e152
SH
3191 /* Reno is always built in */
3192 if (!net_eq(net, &init_net) &&
0baf26b0
MKL
3193 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3194 init_net.ipv4.tcp_congestion_control->owner))
6670e152
SH
3195 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3196 else
3197 net->ipv4.tcp_congestion_control = &tcp_reno;
3198
49213555 3199 return 0;
bdbbb852
ED
3200fail:
3201 tcp_sk_exit(net);
3202
3203 return res;
b099ce26
EB
3204}
3205
3206static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3207{
43713848
HY
3208 struct net *net;
3209
1946e672 3210 inet_twsk_purge(&tcp_hashinfo, AF_INET);
43713848
HY
3211
3212 list_for_each_entry(net, net_exit_list, exit_list)
3213 tcp_fastopen_ctx_destroy(net);
046ee902
DL
3214}
3215
3216static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
3217 .init = tcp_sk_init,
3218 .exit = tcp_sk_exit,
3219 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
3220};
3221
52d87d5f
YS
3222#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3223DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3224 struct sock_common *sk_common, uid_t uid)
3225
04c7820b
MKL
3226#define INIT_BATCH_SZ 16
3227
f9c79272 3228static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
52d87d5f 3229{
04c7820b
MKL
3230 struct bpf_tcp_iter_state *iter = priv_data;
3231 int err;
52d87d5f 3232
04c7820b
MKL
3233 err = bpf_iter_init_seq_net(priv_data, aux);
3234 if (err)
3235 return err;
52d87d5f 3236
04c7820b
MKL
3237 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3238 if (err) {
3239 bpf_iter_fini_seq_net(priv_data);
3240 return err;
3241 }
3242
3243 return 0;
52d87d5f
YS
3244}
3245
3246static void bpf_iter_fini_tcp(void *priv_data)
3247{
04c7820b 3248 struct bpf_tcp_iter_state *iter = priv_data;
52d87d5f 3249
52d87d5f 3250 bpf_iter_fini_seq_net(priv_data);
04c7820b 3251 kvfree(iter->batch);
52d87d5f
YS
3252}
3253
14fc6bd6 3254static const struct bpf_iter_seq_info tcp_seq_info = {
52d87d5f
YS
3255 .seq_ops = &bpf_iter_tcp_seq_ops,
3256 .init_seq_private = bpf_iter_init_tcp,
3257 .fini_seq_private = bpf_iter_fini_tcp,
04c7820b 3258 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
14fc6bd6
YS
3259};
3260
3cee6fb8
MKL
3261static const struct bpf_func_proto *
3262bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3263 const struct bpf_prog *prog)
3264{
3265 switch (func_id) {
3266 case BPF_FUNC_setsockopt:
3267 return &bpf_sk_setsockopt_proto;
3268 case BPF_FUNC_getsockopt:
3269 return &bpf_sk_getsockopt_proto;
3270 default:
3271 return NULL;
3272 }
3273}
3274
14fc6bd6
YS
3275static struct bpf_iter_reg tcp_reg_info = {
3276 .target = "tcp",
52d87d5f
YS
3277 .ctx_arg_info_size = 1,
3278 .ctx_arg_info = {
3279 { offsetof(struct bpf_iter__tcp, sk_common),
3280 PTR_TO_BTF_ID_OR_NULL },
3281 },
3cee6fb8 3282 .get_func_proto = bpf_iter_tcp_get_func_proto,
14fc6bd6 3283 .seq_info = &tcp_seq_info,
52d87d5f
YS
3284};
3285
3286static void __init bpf_iter_register(void)
3287{
951cf368 3288 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
52d87d5f
YS
3289 if (bpf_iter_reg_target(&tcp_reg_info))
3290 pr_warn("Warning: could not register bpf iterator tcp\n");
3291}
3292
3293#endif
3294
9b0f976f 3295void __init tcp_v4_init(void)
1da177e4 3296{
6a1b3054 3297 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 3298 panic("Failed to create the TCP control socket.\n");
52d87d5f
YS
3299
3300#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3301 bpf_iter_register();
3302#endif
1da177e4 3303}