net/ipv4/tcp_ipv4.c (git blame view, mirror_ubuntu-jammy-kernel.git, at commit "inet: Use fallthrough;")
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
1da177e4
LT
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
1da177e4
LT
9 * IPv4 specific functions
10 *
1da177e4
LT
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
1da177e4
LT
17 */
18
19/*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
60236fdd 32 * request_sock handling and moved
1da177e4
LT
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
caa20d9a 35 * Added new listen semantics.
1da177e4
LT
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
 44 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
afd46503 48#define pr_fmt(fmt) "TCP: " fmt
1da177e4 49
eb4dea58 50#include <linux/bottom_half.h>
1da177e4
LT
51#include <linux/types.h>
52#include <linux/fcntl.h>
53#include <linux/module.h>
54#include <linux/random.h>
55#include <linux/cache.h>
56#include <linux/jhash.h>
57#include <linux/init.h>
58#include <linux/times.h>
5a0e3ad6 59#include <linux/slab.h>
1da177e4 60
457c4cbc 61#include <net/net_namespace.h>
1da177e4 62#include <net/icmp.h>
304a1618 63#include <net/inet_hashtables.h>
1da177e4 64#include <net/tcp.h>
20380731 65#include <net/transp_v6.h>
1da177e4
LT
66#include <net/ipv6.h>
67#include <net/inet_common.h>
6d6ee43e 68#include <net/timewait_sock.h>
1da177e4 69#include <net/xfrm.h>
6e5714ea 70#include <net/secure_seq.h>
076bb0c8 71#include <net/busy_poll.h>
1da177e4
LT
72
73#include <linux/inet.h>
74#include <linux/ipv6.h>
75#include <linux/stddef.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
6797318e 78#include <linux/inetdevice.h>
1da177e4 79
cf80e0e4 80#include <crypto/hash.h>
cfb6eeb4
YH
81#include <linux/scatterlist.h>
82
c24b14c4
SL
83#include <trace/events/tcp.h>
84
cfb6eeb4 85#ifdef CONFIG_TCP_MD5SIG
a915da9b 86static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
318cf7aa 87 __be32 daddr, __be32 saddr, const struct tcphdr *th);
cfb6eeb4
YH
88#endif
89
5caea4ea 90struct inet_hashinfo tcp_hashinfo;
4bc2f18b 91EXPORT_SYMBOL(tcp_hashinfo);
1da177e4 92
84b114b9 93static u32 tcp_v4_init_seq(const struct sk_buff *skb)
1da177e4 94{
84b114b9
ED
95 return secure_tcp_seq(ip_hdr(skb)->daddr,
96 ip_hdr(skb)->saddr,
97 tcp_hdr(skb)->dest,
98 tcp_hdr(skb)->source);
99}
100
5d2ed052 101static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
84b114b9 102{
5d2ed052 103 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
1da177e4
LT
104}
105
6d6ee43e
ACM
106int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107{
79e9fed4 108 const struct inet_timewait_sock *tw = inet_twsk(sktw);
6d6ee43e
ACM
109 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 struct tcp_sock *tp = tcp_sk(sk);
79e9fed4
111 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
112
113 if (reuse == 2) {
114 /* Still does not detect *everything* that goes through
115 * lo, since we require a loopback src or dst address
116 * or direct binding to 'lo' interface.
117 */
118 bool loopback = false;
119 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
120 loopback = true;
121#if IS_ENABLED(CONFIG_IPV6)
122 if (tw->tw_family == AF_INET6) {
123 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
be2644aa 124 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
79e9fed4 125 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
be2644aa 126 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
79e9fed4
127 loopback = true;
128 } else
129#endif
130 {
131 if (ipv4_is_loopback(tw->tw_daddr) ||
132 ipv4_is_loopback(tw->tw_rcv_saddr))
133 loopback = true;
134 }
135 if (!loopback)
136 reuse = 0;
137 }
6d6ee43e
ACM
138
139 /* With PAWS, it is safe from the viewpoint
140 of data integrity. Even without PAWS it is safe provided sequence
141 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
142
 143 Actually, the idea is close to VJ's, only the timestamp cache is
 144 held not per host but per port pair, and the TW bucket is used as the
 145 state holder.
146
147 If TW bucket has been already destroyed we fall back to VJ's scheme
148 and use initial timestamp retrieved from peer table.
149 */
150 if (tcptw->tw_ts_recent_stamp &&
cca9bab1
AB
151 (!twp || (reuse && time_after32(ktime_get_seconds(),
152 tcptw->tw_ts_recent_stamp)))) {
21684dc4
SB
153 /* In case of repair and re-using TIME-WAIT sockets we still
154 * want to be sure that it is safe as above but honor the
155 * sequence numbers and time stamps set as part of the repair
156 * process.
157 *
158 * Without this check re-using a TIME-WAIT socket with TCP
159 * repair would accumulate a -1 on the repair assigned
160 * sequence number. The first time it is reused the sequence
161 * is -1, the second time -2, etc. This fixes that issue
162 * without appearing to create any others.
163 */
164 if (likely(!tp->repair)) {
0f317464
ED
165 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
166
167 if (!seq)
168 seq = 1;
169 WRITE_ONCE(tp->write_seq, seq);
21684dc4
SB
170 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
171 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
172 }
6d6ee43e
ACM
173 sock_hold(sktw);
174 return 1;
175 }
176
177 return 0;
178}
6d6ee43e
ACM
179EXPORT_SYMBOL_GPL(tcp_twsk_unique);
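/* Editor's sketch (illustrative only, not part of this file): the TIME-WAIT
 * reuse test above, in isolation.  Reuse of a TIME-WAIT port toward the same
 * peer is allowed once at least one second has passed since the last
 * timestamp seen from that peer, and the new ISN is placed beyond the old
 * send space.  All names below are made up for the example.
 */
#include <stdbool.h>
#include <stdint.h>

static bool tw_reuse_ok(uint32_t now_sec, uint32_t tw_ts_recent_stamp,
			bool sysctl_tw_reuse)
{
	/* mirrors: tw_ts_recent_stamp && reuse && time_after32(now, stamp) */
	return tw_ts_recent_stamp && sysctl_tw_reuse &&
	       (int32_t)(now_sec - tw_ts_recent_stamp) > 0;
}

static uint32_t tw_reuse_isn(uint32_t tw_snd_nxt)
{
	uint32_t seq = tw_snd_nxt + 65535 + 2;	/* step past the old window */

	return seq ? seq : 1;			/* 0 would mean "pick the ISN later" */
}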
180
d74bad4e
AI
181static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
182 int addr_len)
183{
184 /* This check is replicated from tcp_v4_connect() and intended to
185 * prevent BPF program called below from accessing bytes that are out
186 * of the bound specified by user in addr_len.
187 */
188 if (addr_len < sizeof(struct sockaddr_in))
189 return -EINVAL;
190
191 sock_owned_by_me(sk);
192
193 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
194}
195
1da177e4
LT
196/* This will initiate an outgoing connection. */
197int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
198{
2d7192d6 199 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
1da177e4
LT
200 struct inet_sock *inet = inet_sk(sk);
201 struct tcp_sock *tp = tcp_sk(sk);
dca8b089 202 __be16 orig_sport, orig_dport;
bada8adc 203 __be32 daddr, nexthop;
da905bd1 204 struct flowi4 *fl4;
2d7192d6 205 struct rtable *rt;
1da177e4 206 int err;
f6d8bd05 207 struct ip_options_rcu *inet_opt;
1946e672 208 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
1da177e4
LT
209
210 if (addr_len < sizeof(struct sockaddr_in))
211 return -EINVAL;
212
213 if (usin->sin_family != AF_INET)
214 return -EAFNOSUPPORT;
215
216 nexthop = daddr = usin->sin_addr.s_addr;
f6d8bd05 217 inet_opt = rcu_dereference_protected(inet->inet_opt,
1e1d04e6 218 lockdep_sock_is_held(sk));
f6d8bd05 219 if (inet_opt && inet_opt->opt.srr) {
1da177e4
LT
220 if (!daddr)
221 return -EINVAL;
f6d8bd05 222 nexthop = inet_opt->opt.faddr;
1da177e4
LT
223 }
224
dca8b089
DM
225 orig_sport = inet->inet_sport;
226 orig_dport = usin->sin_port;
da905bd1
DM
227 fl4 = &inet->cork.fl.u.ip4;
228 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
b23dd4fe
DM
229 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
230 IPPROTO_TCP,
0e0d44ab 231 orig_sport, orig_dport, sk);
b23dd4fe
DM
232 if (IS_ERR(rt)) {
233 err = PTR_ERR(rt);
234 if (err == -ENETUNREACH)
f1d8cba6 235 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
b23dd4fe 236 return err;
584bdf8c 237 }
1da177e4
LT
238
239 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
240 ip_rt_put(rt);
241 return -ENETUNREACH;
242 }
243
f6d8bd05 244 if (!inet_opt || !inet_opt->opt.srr)
da905bd1 245 daddr = fl4->daddr;
1da177e4 246
c720c7e8 247 if (!inet->inet_saddr)
da905bd1 248 inet->inet_saddr = fl4->saddr;
d1e559d0 249 sk_rcv_saddr_set(sk, inet->inet_saddr);
1da177e4 250
c720c7e8 251 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
1da177e4
LT
252 /* Reset inherited state */
253 tp->rx_opt.ts_recent = 0;
254 tp->rx_opt.ts_recent_stamp = 0;
ee995283 255 if (likely(!tp->repair))
0f317464 256 WRITE_ONCE(tp->write_seq, 0);
1da177e4
LT
257 }
258
c720c7e8 259 inet->inet_dport = usin->sin_port;
d1e559d0 260 sk_daddr_set(sk, daddr);
1da177e4 261
d83d8461 262 inet_csk(sk)->icsk_ext_hdr_len = 0;
f6d8bd05
ED
263 if (inet_opt)
264 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1da177e4 265
bee7ca9e 266 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
1da177e4
LT
267
 268 /* Socket identity is still unknown (sport may be zero).
 269 * However we set the state to SYN-SENT and, without releasing the
 270 * socket lock, select a source port, enter ourselves into the hash
 271 * tables and complete initialization after this.
272 */
273 tcp_set_state(sk, TCP_SYN_SENT);
1946e672 274 err = inet_hash_connect(tcp_death_row, sk);
1da177e4
LT
275 if (err)
276 goto failure;
277
877d1f62 278 sk_set_txhash(sk);
9e7ceb06 279
da905bd1 280 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
b23dd4fe
DM
281 inet->inet_sport, inet->inet_dport, sk);
282 if (IS_ERR(rt)) {
283 err = PTR_ERR(rt);
284 rt = NULL;
1da177e4 285 goto failure;
b23dd4fe 286 }
1da177e4 287 /* OK, now commit destination to socket. */
bcd76111 288 sk->sk_gso_type = SKB_GSO_TCPV4;
d8d1f30b 289 sk_setup_caps(sk, &rt->dst);
19f6d3f3 290 rt = NULL;
1da177e4 291
00355fa5 292 if (likely(!tp->repair)) {
00355fa5 293 if (!tp->write_seq)
0f317464
ED
294 WRITE_ONCE(tp->write_seq,
295 secure_tcp_seq(inet->inet_saddr,
296 inet->inet_daddr,
297 inet->inet_sport,
298 usin->sin_port));
5d2ed052
ED
299 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
300 inet->inet_saddr,
84b114b9 301 inet->inet_daddr);
00355fa5 302 }
1da177e4 303
a904a069 304 inet->inet_id = prandom_u32();
1da177e4 305
19f6d3f3
WW
306 if (tcp_fastopen_defer_connect(sk, &err))
307 return err;
308 if (err)
309 goto failure;
310
2b916477 311 err = tcp_connect(sk);
ee995283 312
1da177e4
LT
313 if (err)
314 goto failure;
315
316 return 0;
317
318failure:
7174259e
ACM
319 /*
320 * This unhashes the socket and releases the local port,
321 * if necessary.
322 */
1da177e4
LT
323 tcp_set_state(sk, TCP_CLOSE);
324 ip_rt_put(rt);
325 sk->sk_route_caps = 0;
c720c7e8 326 inet->inet_dport = 0;
1da177e4
LT
327 return err;
328}
4bc2f18b 329EXPORT_SYMBOL(tcp_v4_connect);
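/* Editor's sketch: the userspace counterpart of the checks at the top of
 * tcp_v4_connect().  The kernel rejects addr_len smaller than struct
 * sockaddr_in (-EINVAL) and any family other than AF_INET (-EAFNOSUPPORT).
 * Minimal client; the address and port are placeholders and error handling
 * is trimmed.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in dst = {
		.sin_family = AF_INET,		/* anything else => -EAFNOSUPPORT */
		.sin_port   = htons(80),
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");	/* e.g. ENETUNREACH from the route lookup */
	close(fd);
	return 0;
}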
1da177e4 330
1da177e4 331/*
563d34d0
ED
332 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
333 * It can be called through tcp_release_cb() if socket was owned by user
334 * at the time tcp_v4_err() was called to handle ICMP message.
1da177e4 335 */
4fab9071 336void tcp_v4_mtu_reduced(struct sock *sk)
1da177e4 337{
1da177e4 338 struct inet_sock *inet = inet_sk(sk);
02b2faaf
ED
339 struct dst_entry *dst;
340 u32 mtu;
1da177e4 341
02b2faaf
ED
342 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
343 return;
344 mtu = tcp_sk(sk)->mtu_info;
80d0a69f
DM
345 dst = inet_csk_update_pmtu(sk, mtu);
346 if (!dst)
1da177e4
LT
347 return;
348
1da177e4
LT
 349 /* Something is about to go wrong... Remember the soft error
 350 * in case this connection is not able to recover.
351 */
352 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
353 sk->sk_err_soft = EMSGSIZE;
354
355 mtu = dst_mtu(dst);
356
357 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
482fc609 358 ip_sk_accept_pmtu(sk) &&
d83d8461 359 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
1da177e4
LT
360 tcp_sync_mss(sk, mtu);
361
362 /* Resend the TCP packet because it's
363 * clear that the old packet has been
364 * dropped. This is the new "fast" path mtu
365 * discovery.
366 */
367 tcp_simple_retransmit(sk);
368 } /* else let the usual retransmit timer handle it */
369}
4fab9071 370EXPORT_SYMBOL(tcp_v4_mtu_reduced);
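/* Editor's sketch: observing from userspace the path MTU that
 * tcp_v4_mtu_reduced() reacts to.  IP_MTU_DISCOVER/IP_PMTUDISC_DO and
 * IP_MTU are standard Linux socket options; IP_MTU is only meaningful on a
 * connected socket.  Error handling omitted.
 */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

static void show_path_mtu(int connected_tcp_fd)
{
	int pmtudisc = IP_PMTUDISC_DO;	/* set DF; ICMP_FRAG_NEEDED drives PMTU */
	int mtu = 0;
	socklen_t len = sizeof(mtu);

	setsockopt(connected_tcp_fd, IPPROTO_IP, IP_MTU_DISCOVER,
		   &pmtudisc, sizeof(pmtudisc));
	if (getsockopt(connected_tcp_fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
		printf("current path MTU: %d\n", mtu);
}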
1da177e4 371
55be7a9c
DM
372static void do_redirect(struct sk_buff *skb, struct sock *sk)
373{
374 struct dst_entry *dst = __sk_dst_check(sk, 0);
375
1ed5c48f 376 if (dst)
6700c270 377 dst->ops->redirect(dst, sk, skb);
55be7a9c
DM
378}
379
26e37360
ED
380
381/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
9cf74903 382void tcp_req_err(struct sock *sk, u32 seq, bool abort)
26e37360
ED
383{
384 struct request_sock *req = inet_reqsk(sk);
385 struct net *net = sock_net(sk);
386
387 /* ICMPs are not backlogged, hence we cannot get
388 * an established socket here.
389 */
26e37360 390 if (seq != tcp_rsk(req)->snt_isn) {
02a1d6e7 391 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
9cf74903 392 } else if (abort) {
26e37360
ED
393 /*
394 * Still in SYN_RECV, just remove it silently.
395 * There is no good way to pass the error to the newly
396 * created socket, and POSIX does not want network
397 * errors returned from accept().
398 */
c6973669 399 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
9caad864 400 tcp_listendrop(req->rsk_listener);
26e37360 401 }
ef84d8ce 402 reqsk_put(req);
26e37360
ED
403}
404EXPORT_SYMBOL(tcp_req_err);
405
1da177e4
LT
406/*
407 * This routine is called by the ICMP module when it gets some
408 * sort of error condition. If err < 0 then the socket should
409 * be closed and the error returned to the user. If err > 0
410 * it's just the icmp type << 8 | icmp code. After adjustment
411 * header points to the first 8 bytes of the tcp header. We need
412 * to find the appropriate port.
413 *
414 * The locking strategy used here is very "optimistic". When
415 * someone else accesses the socket the ICMP is just dropped
416 * and for some paths there is no check at all.
417 * A more general error queue to queue errors for later handling
418 * is probably better.
419 *
420 */
421
32bbd879 422int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
1da177e4 423{
b71d1d42 424 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
4d1a2d9e 425 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
f1ecd5d9 426 struct inet_connection_sock *icsk;
1da177e4
LT
427 struct tcp_sock *tp;
428 struct inet_sock *inet;
4d1a2d9e
DL
429 const int type = icmp_hdr(icmp_skb)->type;
430 const int code = icmp_hdr(icmp_skb)->code;
1da177e4 431 struct sock *sk;
f1ecd5d9 432 struct sk_buff *skb;
0a672f74 433 struct request_sock *fastopen;
9a568de4
ED
434 u32 seq, snd_una;
435 s32 remaining;
436 u32 delta_us;
1da177e4 437 int err;
4d1a2d9e 438 struct net *net = dev_net(icmp_skb->dev);
1da177e4 439
26e37360
ED
440 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
441 th->dest, iph->saddr, ntohs(th->source),
3fa6f616 442 inet_iif(icmp_skb), 0);
1da177e4 443 if (!sk) {
5d3848bc 444 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
32bbd879 445 return -ENOENT;
1da177e4
LT
446 }
447 if (sk->sk_state == TCP_TIME_WAIT) {
9469c7b4 448 inet_twsk_put(inet_twsk(sk));
32bbd879 449 return 0;
1da177e4 450 }
26e37360 451 seq = ntohl(th->seq);
32bbd879
SB
452 if (sk->sk_state == TCP_NEW_SYN_RECV) {
453 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
454 type == ICMP_TIME_EXCEEDED ||
455 (type == ICMP_DEST_UNREACH &&
456 (code == ICMP_NET_UNREACH ||
457 code == ICMP_HOST_UNREACH)));
458 return 0;
459 }
1da177e4
LT
460
461 bh_lock_sock(sk);
462 /* If too many ICMPs get dropped on busy
463 * servers this needs to be solved differently.
563d34d0
ED
464 * We do take care of PMTU discovery (RFC1191) special case :
465 * we can receive locally generated ICMP messages while socket is held.
1da177e4 466 */
b74aa930
ED
467 if (sock_owned_by_user(sk)) {
468 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
02a1d6e7 469 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
b74aa930 470 }
1da177e4
LT
471 if (sk->sk_state == TCP_CLOSE)
472 goto out;
473
97e3ecd1 474 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
02a1d6e7 475 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
97e3ecd1 476 goto out;
477 }
478
f1ecd5d9 479 icsk = inet_csk(sk);
1da177e4 480 tp = tcp_sk(sk);
0a672f74 481 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
d983ea6f 482 fastopen = rcu_dereference(tp->fastopen_rsk);
0a672f74 483 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
1da177e4 484 if (sk->sk_state != TCP_LISTEN &&
0a672f74 485 !between(seq, snd_una, tp->snd_nxt)) {
02a1d6e7 486 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
1da177e4
LT
487 goto out;
488 }
489
490 switch (type) {
55be7a9c 491 case ICMP_REDIRECT:
45caeaa5
JM
492 if (!sock_owned_by_user(sk))
493 do_redirect(icmp_skb, sk);
55be7a9c 494 goto out;
1da177e4
LT
495 case ICMP_SOURCE_QUENCH:
496 /* Just silently ignore these. */
497 goto out;
498 case ICMP_PARAMETERPROB:
499 err = EPROTO;
500 break;
501 case ICMP_DEST_UNREACH:
502 if (code > NR_ICMP_UNREACH)
503 goto out;
504
505 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
0d4f0608
ED
506 /* We are not interested in TCP_LISTEN and open_requests
 507 * (SYN-ACKs sent out by Linux are always <576 bytes so
508 * they should go through unfragmented).
509 */
510 if (sk->sk_state == TCP_LISTEN)
511 goto out;
512
563d34d0 513 tp->mtu_info = info;
144d56e9 514 if (!sock_owned_by_user(sk)) {
563d34d0 515 tcp_v4_mtu_reduced(sk);
144d56e9 516 } else {
7aa5470c 517 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
144d56e9
ED
518 sock_hold(sk);
519 }
1da177e4
LT
520 goto out;
521 }
522
523 err = icmp_err_convert[code].errno;
f1ecd5d9
DL
524 /* check if icmp_skb allows revert of backoff
525 * (see draft-zimmermann-tcp-lcd) */
526 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
527 break;
528 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
0a672f74 529 !icsk->icsk_backoff || fastopen)
f1ecd5d9
DL
530 break;
531
8f49c270
DM
532 if (sock_owned_by_user(sk))
533 break;
534
2c4cc971
ED
535 skb = tcp_rtx_queue_head(sk);
536 if (WARN_ON_ONCE(!skb))
537 break;
538
f1ecd5d9 539 icsk->icsk_backoff--;
fcdd1cf4
ED
540 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
541 TCP_TIMEOUT_INIT;
542 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
f1ecd5d9 543
f1ecd5d9 544
9a568de4 545 tcp_mstamp_refresh(tp);
2fd66ffb 546 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
7faee5c0 547 remaining = icsk->icsk_rto -
9a568de4 548 usecs_to_jiffies(delta_us);
f1ecd5d9 549
9a568de4 550 if (remaining > 0) {
f1ecd5d9
DL
551 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
552 remaining, TCP_RTO_MAX);
f1ecd5d9
DL
553 } else {
554 /* RTO revert clocked out retransmission.
555 * Will retransmit now */
556 tcp_retransmit_timer(sk);
557 }
558
1da177e4
LT
559 break;
560 case ICMP_TIME_EXCEEDED:
561 err = EHOSTUNREACH;
562 break;
563 default:
564 goto out;
565 }
566
567 switch (sk->sk_state) {
1da177e4 568 case TCP_SYN_SENT:
0a672f74
YC
569 case TCP_SYN_RECV:
570 /* Only in fast or simultaneous open. If a fast open socket is
571 * is already accepted it is treated as a connected one below.
572 */
51456b29 573 if (fastopen && !fastopen->sk)
0a672f74
YC
574 break;
575
1da177e4 576 if (!sock_owned_by_user(sk)) {
1da177e4
LT
577 sk->sk_err = err;
578
579 sk->sk_error_report(sk);
580
581 tcp_done(sk);
582 } else {
583 sk->sk_err_soft = err;
584 }
585 goto out;
586 }
587
588 /* If we've already connected we will keep trying
589 * until we time out, or the user gives up.
590 *
591 * rfc1122 4.2.3.9 allows to consider as hard errors
592 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
593 * but it is obsoleted by pmtu discovery).
594 *
595 * Note, that in modern internet, where routing is unreliable
596 * and in each dark corner broken firewalls sit, sending random
 597 * errors ordered by their masters, even these two messages finally lose
598 * their original sense (even Linux sends invalid PORT_UNREACHs)
599 *
600 * Now we are in compliance with RFCs.
601 * --ANK (980905)
602 */
603
604 inet = inet_sk(sk);
605 if (!sock_owned_by_user(sk) && inet->recverr) {
606 sk->sk_err = err;
607 sk->sk_error_report(sk);
608 } else { /* Only an error on timeout */
609 sk->sk_err_soft = err;
610 }
611
612out:
613 bh_unlock_sock(sk);
614 sock_put(sk);
32bbd879 615 return 0;
1da177e4
LT
616}
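/* Editor's sketch: the ICMP-driven RTO revert a few lines above, reduced to
 * plain arithmetic.  On a net/host-unreachable ICMP for the earliest
 * unacked segment, one backoff step is undone and the retransmit timer is
 * re-armed with whatever is left of the smaller RTO; if that has already
 * elapsed, the segment is retransmitted immediately.  Milliseconds are used
 * here purely for illustration.
 */
#include <stdint.h>

static uint32_t rto_after_icmp_revert(uint32_t base_rto_ms, uint8_t backoff,
				      uint32_t max_rto_ms)
{
	/* one backoff step undone: shift by (backoff - 1) instead of backoff */
	uint32_t rto = base_rto_ms << (backoff ? backoff - 1 : 0);

	return rto < max_rto_ms ? rto : max_rto_ms;
}

static uint32_t time_to_next_retransmit(uint32_t reverted_rto_ms,
					uint32_t elapsed_since_xmit_ms)
{
	/* mirrors: remaining = icsk_rto - elapsed; <= 0 means retransmit now */
	return reverted_rto_ms > elapsed_since_xmit_ms ?
	       reverted_rto_ms - elapsed_since_xmit_ms : 0;
}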
617
28850dc7 618void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1da177e4 619{
aa8223c7 620 struct tcphdr *th = tcp_hdr(skb);
1da177e4 621
98be9b12
ED
622 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
623 skb->csum_start = skb_transport_header(skb) - skb->head;
624 skb->csum_offset = offsetof(struct tcphdr, check);
1da177e4
LT
625}
626
419f9f89 627/* This routine computes an IPv4 TCP checksum. */
bb296246 628void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
419f9f89 629{
cf533ea5 630 const struct inet_sock *inet = inet_sk(sk);
419f9f89
HX
631
632 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
633}
4bc2f18b 634EXPORT_SYMBOL(tcp_v4_send_check);
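/* Editor's sketch: the all-software version of the checksum that
 * __tcp_v4_send_check() only primes for hardware offload: a 16-bit one's
 * complement sum over the IPv4 pseudo-header followed by the whole TCP
 * segment.  The segment's checksum field must be zero while summing;
 * addresses and segment bytes are taken in network byte order.
 */
#include <stddef.h>
#include <stdint.h>

static uint32_t sum16_be(const uint8_t *p, size_t len, uint32_t sum)
{
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)p[i] << 8) | p[i + 1];
	if (len & 1)
		sum += (uint32_t)p[len - 1] << 8;
	return sum;
}

static uint16_t tcp_v4_checksum(const uint8_t saddr[4], const uint8_t daddr[4],
				const uint8_t *segment, size_t len)
{
	/* pseudo-header: saddr, daddr, zero byte, IPPROTO_TCP (6), TCP length */
	uint32_t sum = sum16_be(saddr, 4, 0);

	sum = sum16_be(daddr, 4, sum);
	sum += 6;
	sum += (uint32_t)len;			/* TCP header + payload */
	sum = sum16_be(segment, len, sum);

	while (sum >> 16)			/* fold carries */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;			/* value for th->check */
}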
419f9f89 635
1da177e4
LT
636/*
637 * This routine will send an RST to the other tcp.
638 *
639 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
640 * for reset.
641 * Answer: if a packet caused RST, it is not for a socket
642 * existing in our system, if it is matched to a socket,
643 * it is just duplicate segment or bug in other side's TCP.
644 * So that we build reply only basing on parameters
645 * arrived with segment.
646 * Exception: precedence violation. We do not implement it in any case.
647 */
648
a00e7444 649static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
1da177e4 650{
cf533ea5 651 const struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4
YH
652 struct {
653 struct tcphdr th;
654#ifdef CONFIG_TCP_MD5SIG
714e85be 655 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
cfb6eeb4
YH
656#endif
657 } rep;
1da177e4 658 struct ip_reply_arg arg;
cfb6eeb4 659#ifdef CONFIG_TCP_MD5SIG
e46787f0 660 struct tcp_md5sig_key *key = NULL;
658ddaaf
SL
661 const __u8 *hash_location = NULL;
662 unsigned char newhash[16];
663 int genhash;
664 struct sock *sk1 = NULL;
cfb6eeb4 665#endif
d6fb396c 666 u64 transmit_time = 0;
00483690 667 struct sock *ctl_sk;
d6fb396c 668 struct net *net;
1da177e4
LT
669
670 /* Never send a reset in response to a reset. */
671 if (th->rst)
672 return;
673
c3658e8d
ED
674 /* If sk not NULL, it means we did a successful lookup and incoming
675 * route had to be correct. prequeue might have dropped our dst.
676 */
677 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
1da177e4
LT
678 return;
679
680 /* Swap the send and the receive. */
cfb6eeb4
YH
681 memset(&rep, 0, sizeof(rep));
682 rep.th.dest = th->source;
683 rep.th.source = th->dest;
684 rep.th.doff = sizeof(struct tcphdr) / 4;
685 rep.th.rst = 1;
1da177e4
LT
686
687 if (th->ack) {
cfb6eeb4 688 rep.th.seq = th->ack_seq;
1da177e4 689 } else {
cfb6eeb4
YH
690 rep.th.ack = 1;
691 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
692 skb->len - (th->doff << 2));
1da177e4
LT
693 }
694
7174259e 695 memset(&arg, 0, sizeof(arg));
cfb6eeb4
YH
696 arg.iov[0].iov_base = (unsigned char *)&rep;
697 arg.iov[0].iov_len = sizeof(rep.th);
698
0f85feae 699 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
cfb6eeb4 700#ifdef CONFIG_TCP_MD5SIG
3b24d854 701 rcu_read_lock();
658ddaaf 702 hash_location = tcp_parse_md5sig_option(th);
271c3b9b 703 if (sk && sk_fullsock(sk)) {
cea97609 704 const union tcp_md5_addr *addr;
dea53bb8 705 int l3index;
cea97609 706
dea53bb8
DA
707 /* sdif set, means packet ingressed via a device
708 * in an L3 domain and inet_iif is set to it.
709 */
710 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
cea97609 711 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
dea53bb8 712 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
e46787f0 713 } else if (hash_location) {
cea97609 714 const union tcp_md5_addr *addr;
534322ca
DA
715 int sdif = tcp_v4_sdif(skb);
716 int dif = inet_iif(skb);
dea53bb8 717 int l3index;
cea97609 718
658ddaaf
SL
719 /*
720 * active side is lost. Try to find listening socket through
721 * source port, and then find md5 key through listening socket.
 722 * We do not lose security here:
 723 * the incoming packet is checked with the md5 hash of the found key,
 724 * and no RST is generated if the md5 hash doesn't match.
725 */
a583636a
CG
726 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
727 ip_hdr(skb)->saddr,
da5e3630 728 th->source, ip_hdr(skb)->daddr,
534322ca 729 ntohs(th->source), dif, sdif);
658ddaaf
SL
730 /* don't send rst if it can't find key */
731 if (!sk1)
3b24d854
ED
732 goto out;
733
dea53bb8
DA
734 /* sdif set, means packet ingressed via a device
735 * in an L3 domain and dif is set to it.
736 */
737 l3index = sdif ? dif : 0;
cea97609 738 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
dea53bb8 739 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
658ddaaf 740 if (!key)
3b24d854
ED
741 goto out;
742
658ddaaf 743
39f8e58e 744 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
658ddaaf 745 if (genhash || memcmp(hash_location, newhash, 16) != 0)
3b24d854
ED
746 goto out;
747
658ddaaf
SL
748 }
749
cfb6eeb4
YH
750 if (key) {
751 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
752 (TCPOPT_NOP << 16) |
753 (TCPOPT_MD5SIG << 8) |
754 TCPOLEN_MD5SIG);
755 /* Update length and the length the header thinks exists */
756 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
757 rep.th.doff = arg.iov[0].iov_len / 4;
758
49a72dfb 759 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
78e645cb
IJ
760 key, ip_hdr(skb)->saddr,
761 ip_hdr(skb)->daddr, &rep.th);
cfb6eeb4
YH
762 }
763#endif
eddc9ec5
ACM
764 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
765 ip_hdr(skb)->saddr, /* XXX */
52cd5750 766 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1da177e4 767 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
271c3b9b
FW
768 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
769
e2446eaa 770 /* When socket is gone, all binding information is lost.
4c675258
AK
771 * routing might fail in this case. No choice here, if we choose to force
772 * input interface, we will misroute in case of asymmetric route.
e2446eaa 773 */
c24b14c4 774 if (sk) {
4c675258 775 arg.bound_dev_if = sk->sk_bound_dev_if;
5c487bb9
SL
776 if (sk_fullsock(sk))
777 trace_tcp_send_reset(sk, skb);
c24b14c4 778 }
1da177e4 779
271c3b9b
FW
780 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
781 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
782
66b13d99 783 arg.tos = ip_hdr(skb)->tos;
e2d118a1 784 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
47dcc20a 785 local_bh_disable();
5472c3c6 786 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
a842fe14 787 if (sk) {
00483690
JM
788 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
789 inet_twsk(sk)->tw_mark : sk->sk_mark;
f6c0f5d2
ED
790 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
791 inet_twsk(sk)->tw_priority : sk->sk_priority;
d6fb396c 792 transmit_time = tcp_transmit_time(sk);
a842fe14 793 }
00483690 794 ip_send_unicast_reply(ctl_sk,
bdbbb852 795 skb, &TCP_SKB_CB(skb)->header.h4.opt,
24a2d43d 796 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
d6fb396c
ED
797 &arg, arg.iov[0].iov_len,
798 transmit_time);
1da177e4 799
00483690 800 ctl_sk->sk_mark = 0;
90bbcc60
ED
801 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
802 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
47dcc20a 803 local_bh_enable();
658ddaaf
SL
804
805#ifdef CONFIG_TCP_MD5SIG
3b24d854
ED
806out:
807 rcu_read_unlock();
658ddaaf 808#endif
1da177e4
LT
809}
810
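/* Editor's sketch: how tcp_v4_send_reset() above picks the reset's sequence
 * numbers, per RFC 793.  If the offending segment carried an ACK, the RST
 * reuses that acknowledgment number as its own SEQ and carries no ACK;
 * otherwise the RST keeps SEQ = 0 and acknowledges everything the segment
 * occupied (payload plus the SYN and FIN flags).  Host-order arithmetic,
 * names invented for the example.
 */
#include <stdbool.h>
#include <stdint.h>

struct rst_numbers {
	uint32_t seq;
	bool ack;
	uint32_t ack_seq;
};

static struct rst_numbers pick_rst_numbers(bool in_ack, uint32_t in_ack_seq,
					   uint32_t in_seq, uint32_t payload_len,
					   bool in_syn, bool in_fin)
{
	struct rst_numbers r = { 0, false, 0 };

	if (in_ack) {
		r.seq = in_ack_seq;		/* rep.th.seq = th->ack_seq */
	} else {
		r.ack = true;			/* rep.th.ack = 1 */
		r.ack_seq = in_seq + payload_len + in_syn + in_fin;
	}
	return r;
}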
 811/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 812 outside socket context, is certainly ugly. What can I do?
813 */
814
e2d118a1 815static void tcp_v4_send_ack(const struct sock *sk,
e62a123b 816 struct sk_buff *skb, u32 seq, u32 ack,
ee684b6f 817 u32 win, u32 tsval, u32 tsecr, int oif,
88ef4a5a 818 struct tcp_md5sig_key *key,
66b13d99 819 int reply_flags, u8 tos)
1da177e4 820{
cf533ea5 821 const struct tcphdr *th = tcp_hdr(skb);
1da177e4
LT
822 struct {
823 struct tcphdr th;
714e85be 824 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
cfb6eeb4 825#ifdef CONFIG_TCP_MD5SIG
714e85be 826 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
cfb6eeb4
YH
827#endif
828 ];
1da177e4 829 } rep;
e2d118a1 830 struct net *net = sock_net(sk);
1da177e4 831 struct ip_reply_arg arg;
00483690 832 struct sock *ctl_sk;
d6fb396c 833 u64 transmit_time;
1da177e4
LT
834
835 memset(&rep.th, 0, sizeof(struct tcphdr));
7174259e 836 memset(&arg, 0, sizeof(arg));
1da177e4
LT
837
838 arg.iov[0].iov_base = (unsigned char *)&rep;
839 arg.iov[0].iov_len = sizeof(rep.th);
ee684b6f 840 if (tsecr) {
cfb6eeb4
YH
841 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
842 (TCPOPT_TIMESTAMP << 8) |
843 TCPOLEN_TIMESTAMP);
ee684b6f
AV
844 rep.opt[1] = htonl(tsval);
845 rep.opt[2] = htonl(tsecr);
cb48cfe8 846 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
1da177e4
LT
847 }
848
849 /* Swap the send and the receive. */
850 rep.th.dest = th->source;
851 rep.th.source = th->dest;
852 rep.th.doff = arg.iov[0].iov_len / 4;
853 rep.th.seq = htonl(seq);
854 rep.th.ack_seq = htonl(ack);
855 rep.th.ack = 1;
856 rep.th.window = htons(win);
857
cfb6eeb4 858#ifdef CONFIG_TCP_MD5SIG
cfb6eeb4 859 if (key) {
ee684b6f 860 int offset = (tsecr) ? 3 : 0;
cfb6eeb4
YH
861
862 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
863 (TCPOPT_NOP << 16) |
864 (TCPOPT_MD5SIG << 8) |
865 TCPOLEN_MD5SIG);
866 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
867 rep.th.doff = arg.iov[0].iov_len/4;
868
49a72dfb 869 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
90b7e112
AL
870 key, ip_hdr(skb)->saddr,
871 ip_hdr(skb)->daddr, &rep.th);
cfb6eeb4
YH
872 }
873#endif
88ef4a5a 874 arg.flags = reply_flags;
eddc9ec5
ACM
875 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
876 ip_hdr(skb)->saddr, /* XXX */
1da177e4
LT
877 arg.iov[0].iov_len, IPPROTO_TCP, 0);
878 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
9501f972
YH
879 if (oif)
880 arg.bound_dev_if = oif;
66b13d99 881 arg.tos = tos;
e2d118a1 882 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
47dcc20a 883 local_bh_disable();
5472c3c6 884 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
a842fe14
ED
885 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
886 inet_twsk(sk)->tw_mark : sk->sk_mark;
f6c0f5d2
ED
887 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
888 inet_twsk(sk)->tw_priority : sk->sk_priority;
d6fb396c 889 transmit_time = tcp_transmit_time(sk);
00483690 890 ip_send_unicast_reply(ctl_sk,
bdbbb852 891 skb, &TCP_SKB_CB(skb)->header.h4.opt,
24a2d43d 892 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
d6fb396c
ED
893 &arg, arg.iov[0].iov_len,
894 transmit_time);
1da177e4 895
00483690 896 ctl_sk->sk_mark = 0;
90bbcc60 897 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
47dcc20a 898 local_bh_enable();
1da177e4
LT
899}
900
901static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
902{
8feaf0c0 903 struct inet_timewait_sock *tw = inet_twsk(sk);
cfb6eeb4 904 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1da177e4 905
e2d118a1 906 tcp_v4_send_ack(sk, skb,
e62a123b 907 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
7174259e 908 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
9a568de4 909 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
9501f972
YH
910 tcptw->tw_ts_recent,
911 tw->tw_bound_dev_if,
88ef4a5a 912 tcp_twsk_md5_key(tcptw),
66b13d99
ED
913 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
914 tw->tw_tos
9501f972 915 );
1da177e4 916
8feaf0c0 917 inet_twsk_put(tw);
1da177e4
LT
918}
919
a00e7444 920static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
7174259e 921 struct request_sock *req)
1da177e4 922{
cea97609 923 const union tcp_md5_addr *addr;
dea53bb8 924 int l3index;
cea97609 925
168a8f58
JC
926 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
927 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
928 */
e62a123b
ED
929 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
930 tcp_sk(sk)->snd_nxt;
931
20a2b49f
ED
932 /* RFC 7323 2.3
933 * The window field (SEG.WND) of every outgoing segment, with the
934 * exception of <SYN> segments, MUST be right-shifted by
935 * Rcv.Wind.Shift bits:
936 */
cea97609 937 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
dea53bb8 938 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
e2d118a1 939 tcp_v4_send_ack(sk, skb, seq,
20a2b49f
ED
940 tcp_rsk(req)->rcv_nxt,
941 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
9a568de4 942 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
9501f972
YH
943 req->ts_recent,
944 0,
dea53bb8 945 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
66b13d99
ED
946 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
947 ip_hdr(skb)->tos);
1da177e4
LT
948}
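/* Editor's sketch: the RFC 7323 rule quoted in the function above.  Except
 * on SYNs, the 16-bit window field carries the receive window right-shifted
 * by the negotiated scale; the peer reconstructs it as
 * SEG.WND << Rcv.Wind.Shift.  The clamp to 0xffff is only a defensive note,
 * the caller normally guarantees the shifted value fits.
 */
#include <stdint.h>

static uint16_t seg_wnd_field(uint32_t rcv_wnd, uint8_t rcv_wscale)
{
	uint32_t w = rcv_wnd >> rcv_wscale;

	return w > 0xffff ? 0xffff : (uint16_t)w;
}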
949
1da177e4 950/*
9bf1d83e 951 * Send a SYN-ACK after having received a SYN.
60236fdd 952 * This still operates on a request_sock only, not on a big
1da177e4
LT
953 * socket.
954 */
0f935dbe 955static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
d6274bd8 956 struct flowi *fl,
72659ecc 957 struct request_sock *req,
ca6fb065 958 struct tcp_fastopen_cookie *foc,
b3d05147 959 enum tcp_synack_type synack_type)
1da177e4 960{
2e6599cb 961 const struct inet_request_sock *ireq = inet_rsk(req);
6bd023f3 962 struct flowi4 fl4;
1da177e4 963 int err = -1;
d41db5af 964 struct sk_buff *skb;
1da177e4
LT
965
966 /* First, grab a route. */
ba3f7f04 967 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
fd80eb94 968 return -1;
1da177e4 969
b3d05147 970 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
1da177e4
LT
971
972 if (skb) {
634fb979 973 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1da177e4 974
2ab2ddd3 975 rcu_read_lock();
634fb979
ED
976 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
977 ireq->ir_rmt_addr,
2ab2ddd3
ED
978 rcu_dereference(ireq->ireq_opt));
979 rcu_read_unlock();
b9df3cb8 980 err = net_xmit_eval(err);
1da177e4
LT
981 }
982
1da177e4
LT
983 return err;
984}
985
986/*
60236fdd 987 * IPv4 request_sock destructor.
1da177e4 988 */
60236fdd 989static void tcp_v4_reqsk_destructor(struct request_sock *req)
1da177e4 990{
c92e8c02 991 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1da177e4
LT
992}
993
cfb6eeb4
YH
994#ifdef CONFIG_TCP_MD5SIG
995/*
996 * RFC2385 MD5 checksumming requires a mapping of
997 * IP address->MD5 Key.
998 * We need to maintain these in the sk structure.
999 */
1000
921f9a0f 1001DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
6015c71e
ED
1002EXPORT_SYMBOL(tcp_md5_needed);
1003
cfb6eeb4 1004/* Find the Key structure for an address. */
dea53bb8 1005struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
6015c71e
ED
1006 const union tcp_md5_addr *addr,
1007 int family)
cfb6eeb4 1008{
fd3a154a 1009 const struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1010 struct tcp_md5sig_key *key;
fd3a154a 1011 const struct tcp_md5sig_info *md5sig;
6797318e
ID
1012 __be32 mask;
1013 struct tcp_md5sig_key *best_match = NULL;
1014 bool match;
cfb6eeb4 1015
a8afca03
ED
1016 /* caller either holds rcu_read_lock() or socket lock */
1017 md5sig = rcu_dereference_check(tp->md5sig_info,
1e1d04e6 1018 lockdep_sock_is_held(sk));
a8afca03 1019 if (!md5sig)
cfb6eeb4 1020 return NULL;
083a0326 1021
c8b91770
AG
1022 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1023 lockdep_sock_is_held(sk)) {
a915da9b
ED
1024 if (key->family != family)
1025 continue;
dea53bb8
DA
1026 if (key->l3index && key->l3index != l3index)
1027 continue;
6797318e
ID
1028 if (family == AF_INET) {
1029 mask = inet_make_mask(key->prefixlen);
1030 match = (key->addr.a4.s_addr & mask) ==
1031 (addr->a4.s_addr & mask);
1032#if IS_ENABLED(CONFIG_IPV6)
1033 } else if (family == AF_INET6) {
1034 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1035 key->prefixlen);
1036#endif
1037 } else {
1038 match = false;
1039 }
1040
1041 if (match && (!best_match ||
1042 key->prefixlen > best_match->prefixlen))
1043 best_match = key;
1044 }
1045 return best_match;
1046}
6015c71e 1047EXPORT_SYMBOL(__tcp_md5_do_lookup);
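/* Editor's sketch: the IPv4 prefix match performed in the lookup loop
 * above.  inet_make_mask(prefixlen) yields a network-order mask of
 * prefixlen leading one bits; a key matches when the peer address agrees
 * with the key's address under that mask, and among all matches the
 * longest prefix wins.
 */
#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>

static bool md5_key_matches(uint32_t key_addr_be, uint8_t prefixlen,
			    uint32_t peer_addr_be)
{
	uint32_t mask_be = prefixlen ? htonl(~0u << (32 - prefixlen)) : 0;

	return (key_addr_be & mask_be) == (peer_addr_be & mask_be);
}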
6797318e 1048
e8f37d57
WF
1049static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1050 const union tcp_md5_addr *addr,
dea53bb8
DA
1051 int family, u8 prefixlen,
1052 int l3index)
6797318e
ID
1053{
1054 const struct tcp_sock *tp = tcp_sk(sk);
1055 struct tcp_md5sig_key *key;
1056 unsigned int size = sizeof(struct in_addr);
1057 const struct tcp_md5sig_info *md5sig;
1058
1059 /* caller either holds rcu_read_lock() or socket lock */
1060 md5sig = rcu_dereference_check(tp->md5sig_info,
1061 lockdep_sock_is_held(sk));
1062 if (!md5sig)
1063 return NULL;
1064#if IS_ENABLED(CONFIG_IPV6)
1065 if (family == AF_INET6)
1066 size = sizeof(struct in6_addr);
1067#endif
c8b91770
AG
1068 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1069 lockdep_sock_is_held(sk)) {
6797318e
ID
1070 if (key->family != family)
1071 continue;
dea53bb8
DA
1072 if (key->l3index && key->l3index != l3index)
1073 continue;
6797318e
ID
1074 if (!memcmp(&key->addr, addr, size) &&
1075 key->prefixlen == prefixlen)
a915da9b 1076 return key;
cfb6eeb4
YH
1077 }
1078 return NULL;
1079}
1080
b83e3deb 1081struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
fd3a154a 1082 const struct sock *addr_sk)
cfb6eeb4 1083{
b52e6921 1084 const union tcp_md5_addr *addr;
dea53bb8 1085 int l3index;
a915da9b 1086
dea53bb8
DA
1087 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1088 addr_sk->sk_bound_dev_if);
b52e6921 1089 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
dea53bb8 1090 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
cfb6eeb4 1091}
cfb6eeb4
YH
1092EXPORT_SYMBOL(tcp_v4_md5_lookup);
1093
cfb6eeb4 1094/* This can be called on a newly created socket, from other files */
a915da9b 1095int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
dea53bb8
DA
1096 int family, u8 prefixlen, int l3index,
1097 const u8 *newkey, u8 newkeylen, gfp_t gfp)
cfb6eeb4
YH
1098{
1099 /* Add Key to the list */
b0a713e9 1100 struct tcp_md5sig_key *key;
cfb6eeb4 1101 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1102 struct tcp_md5sig_info *md5sig;
cfb6eeb4 1103
dea53bb8 1104 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
cfb6eeb4
YH
1105 if (key) {
1106 /* Pre-existing entry - just update that one. */
a915da9b 1107 memcpy(key->key, newkey, newkeylen);
b0a713e9 1108 key->keylen = newkeylen;
a915da9b
ED
1109 return 0;
1110 }
260fcbeb 1111
a8afca03 1112 md5sig = rcu_dereference_protected(tp->md5sig_info,
1e1d04e6 1113 lockdep_sock_is_held(sk));
a915da9b
ED
1114 if (!md5sig) {
1115 md5sig = kmalloc(sizeof(*md5sig), gfp);
1116 if (!md5sig)
cfb6eeb4 1117 return -ENOMEM;
cfb6eeb4 1118
a915da9b
ED
1119 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1120 INIT_HLIST_HEAD(&md5sig->head);
a8afca03 1121 rcu_assign_pointer(tp->md5sig_info, md5sig);
a915da9b 1122 }
cfb6eeb4 1123
5f3d9cb2 1124 key = sock_kmalloc(sk, sizeof(*key), gfp);
a915da9b
ED
1125 if (!key)
1126 return -ENOMEM;
71cea17e 1127 if (!tcp_alloc_md5sig_pool()) {
5f3d9cb2 1128 sock_kfree_s(sk, key, sizeof(*key));
a915da9b 1129 return -ENOMEM;
cfb6eeb4 1130 }
a915da9b
ED
1131
1132 memcpy(key->key, newkey, newkeylen);
1133 key->keylen = newkeylen;
1134 key->family = family;
6797318e 1135 key->prefixlen = prefixlen;
dea53bb8 1136 key->l3index = l3index;
a915da9b
ED
1137 memcpy(&key->addr, addr,
1138 (family == AF_INET6) ? sizeof(struct in6_addr) :
1139 sizeof(struct in_addr));
1140 hlist_add_head_rcu(&key->node, &md5sig->head);
cfb6eeb4
YH
1141 return 0;
1142}
a915da9b 1143EXPORT_SYMBOL(tcp_md5_do_add);
cfb6eeb4 1144
6797318e 1145int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
dea53bb8 1146 u8 prefixlen, int l3index)
cfb6eeb4 1147{
a915da9b
ED
1148 struct tcp_md5sig_key *key;
1149
dea53bb8 1150 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
a915da9b
ED
1151 if (!key)
1152 return -ENOENT;
1153 hlist_del_rcu(&key->node);
5f3d9cb2 1154 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
a915da9b 1155 kfree_rcu(key, rcu);
a915da9b 1156 return 0;
cfb6eeb4 1157}
a915da9b 1158EXPORT_SYMBOL(tcp_md5_do_del);
cfb6eeb4 1159
e0683e70 1160static void tcp_clear_md5_list(struct sock *sk)
cfb6eeb4
YH
1161{
1162 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1163 struct tcp_md5sig_key *key;
b67bfe0d 1164 struct hlist_node *n;
a8afca03 1165 struct tcp_md5sig_info *md5sig;
cfb6eeb4 1166
a8afca03
ED
1167 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1168
b67bfe0d 1169 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
a915da9b 1170 hlist_del_rcu(&key->node);
5f3d9cb2 1171 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
a915da9b 1172 kfree_rcu(key, rcu);
cfb6eeb4
YH
1173 }
1174}
1175
8917a777
ID
1176static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1177 char __user *optval, int optlen)
cfb6eeb4
YH
1178{
1179 struct tcp_md5sig cmd;
1180 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
cea97609 1181 const union tcp_md5_addr *addr;
8917a777 1182 u8 prefixlen = 32;
dea53bb8 1183 int l3index = 0;
cfb6eeb4
YH
1184
1185 if (optlen < sizeof(cmd))
1186 return -EINVAL;
1187
7174259e 1188 if (copy_from_user(&cmd, optval, sizeof(cmd)))
cfb6eeb4
YH
1189 return -EFAULT;
1190
1191 if (sin->sin_family != AF_INET)
1192 return -EINVAL;
1193
8917a777
ID
1194 if (optname == TCP_MD5SIG_EXT &&
1195 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1196 prefixlen = cmd.tcpm_prefixlen;
1197 if (prefixlen > 32)
1198 return -EINVAL;
1199 }
1200
6b102db5
DA
1201 if (optname == TCP_MD5SIG_EXT &&
1202 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1203 struct net_device *dev;
1204
1205 rcu_read_lock();
1206 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1207 if (dev && netif_is_l3_master(dev))
1208 l3index = dev->ifindex;
1209
1210 rcu_read_unlock();
1211
1212 /* ok to reference set/not set outside of rcu;
1213 * right now device MUST be an L3 master
1214 */
1215 if (!dev || !l3index)
1216 return -EINVAL;
1217 }
1218
cea97609
DA
1219 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1220
64a124ed 1221 if (!cmd.tcpm_keylen)
dea53bb8 1222 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
cfb6eeb4
YH
1223
1224 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1225 return -EINVAL;
1226
dea53bb8 1227 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
cea97609 1228 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
cfb6eeb4
YH
1229}
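/* Editor's sketch: the userspace side of the option parsed above.  A key is
 * installed with setsockopt(TCP_MD5SIG) and struct tcp_md5sig from
 * <linux/tcp.h>; TCP_MD5SIG_EXT additionally honours tcpm_prefixlen and
 * tcpm_ifindex.  Error handling omitted; the key must not exceed
 * TCP_MD5SIG_MAXKEYLEN bytes.
 */
#include <linux/tcp.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int install_md5_key(int fd, const struct sockaddr_in *peer,
			   const void *key, int keylen)
{
	struct tcp_md5sig md5;

	memset(&md5, 0, sizeof(md5));
	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);

	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}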
1230
19689e38
ED
1231static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1232 __be32 daddr, __be32 saddr,
1233 const struct tcphdr *th, int nbytes)
cfb6eeb4 1234{
cfb6eeb4 1235 struct tcp4_pseudohdr *bp;
49a72dfb 1236 struct scatterlist sg;
19689e38 1237 struct tcphdr *_th;
cfb6eeb4 1238
19689e38 1239 bp = hp->scratch;
cfb6eeb4
YH
1240 bp->saddr = saddr;
1241 bp->daddr = daddr;
1242 bp->pad = 0;
076fb722 1243 bp->protocol = IPPROTO_TCP;
49a72dfb 1244 bp->len = cpu_to_be16(nbytes);
c7da57a1 1245
19689e38
ED
1246 _th = (struct tcphdr *)(bp + 1);
1247 memcpy(_th, th, sizeof(*th));
1248 _th->check = 0;
1249
1250 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1251 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1252 sizeof(*bp) + sizeof(*th));
cf80e0e4 1253 return crypto_ahash_update(hp->md5_req);
49a72dfb
AL
1254}
1255
a915da9b 1256static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
318cf7aa 1257 __be32 daddr, __be32 saddr, const struct tcphdr *th)
49a72dfb
AL
1258{
1259 struct tcp_md5sig_pool *hp;
cf80e0e4 1260 struct ahash_request *req;
49a72dfb
AL
1261
1262 hp = tcp_get_md5sig_pool();
1263 if (!hp)
1264 goto clear_hash_noput;
cf80e0e4 1265 req = hp->md5_req;
49a72dfb 1266
cf80e0e4 1267 if (crypto_ahash_init(req))
49a72dfb 1268 goto clear_hash;
19689e38 1269 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
49a72dfb
AL
1270 goto clear_hash;
1271 if (tcp_md5_hash_key(hp, key))
1272 goto clear_hash;
cf80e0e4
HX
1273 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1274 if (crypto_ahash_final(req))
cfb6eeb4
YH
1275 goto clear_hash;
1276
cfb6eeb4 1277 tcp_put_md5sig_pool();
cfb6eeb4 1278 return 0;
49a72dfb 1279
cfb6eeb4
YH
1280clear_hash:
1281 tcp_put_md5sig_pool();
1282clear_hash_noput:
1283 memset(md5_hash, 0, 16);
49a72dfb 1284 return 1;
cfb6eeb4
YH
1285}
1286
39f8e58e
ED
1287int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1288 const struct sock *sk,
318cf7aa 1289 const struct sk_buff *skb)
cfb6eeb4 1290{
49a72dfb 1291 struct tcp_md5sig_pool *hp;
cf80e0e4 1292 struct ahash_request *req;
318cf7aa 1293 const struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4
YH
1294 __be32 saddr, daddr;
1295
39f8e58e
ED
1296 if (sk) { /* valid for establish/request sockets */
1297 saddr = sk->sk_rcv_saddr;
1298 daddr = sk->sk_daddr;
cfb6eeb4 1299 } else {
49a72dfb
AL
1300 const struct iphdr *iph = ip_hdr(skb);
1301 saddr = iph->saddr;
1302 daddr = iph->daddr;
cfb6eeb4 1303 }
49a72dfb
AL
1304
1305 hp = tcp_get_md5sig_pool();
1306 if (!hp)
1307 goto clear_hash_noput;
cf80e0e4 1308 req = hp->md5_req;
49a72dfb 1309
cf80e0e4 1310 if (crypto_ahash_init(req))
49a72dfb
AL
1311 goto clear_hash;
1312
19689e38 1313 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
49a72dfb
AL
1314 goto clear_hash;
1315 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1316 goto clear_hash;
1317 if (tcp_md5_hash_key(hp, key))
1318 goto clear_hash;
cf80e0e4
HX
1319 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1320 if (crypto_ahash_final(req))
49a72dfb
AL
1321 goto clear_hash;
1322
1323 tcp_put_md5sig_pool();
1324 return 0;
1325
1326clear_hash:
1327 tcp_put_md5sig_pool();
1328clear_hash_noput:
1329 memset(md5_hash, 0, 16);
1330 return 1;
cfb6eeb4 1331}
49a72dfb 1332EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
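/* Editor's sketch: the byte order the two hashing helpers above feed into
 * MD5, per RFC 2385: pseudo-header, TCP header with its checksum zeroed,
 * the payload, then the key.  This only stages the input; the digest itself
 * is left to whichever MD5 implementation is at hand.  The caller must
 * provide an output buffer large enough for all four parts.
 */
#include <stdint.h>
#include <string.h>

struct tcp4_pseudo {		/* mirrors struct tcp4_pseudohdr */
	uint32_t saddr, daddr;	/* network order */
	uint8_t  pad;		/* always 0 */
	uint8_t  protocol;	/* IPPROTO_TCP = 6 */
	uint16_t len;		/* TCP header + payload, network order */
};

static size_t md5_sig_input(uint8_t *out, const struct tcp4_pseudo *ph,
			    const uint8_t *tcph, size_t thlen,
			    const uint8_t *payload, size_t plen,
			    const uint8_t *key, size_t keylen)
{
	size_t off = 0;

	memcpy(out + off, ph, sizeof(*ph));
	off += sizeof(*ph);
	memcpy(out + off, tcph, thlen);		/* TCP header ... */
	out[off + 16] = 0;			/* ... with th->check (offset 16) */
	out[off + 17] = 0;			/* zeroed before hashing */
	off += thlen;
	memcpy(out + off, payload, plen);
	off += plen;
	memcpy(out + off, key, keylen);		/* the key is hashed last */
	off += keylen;
	return off;
}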
cfb6eeb4 1333
ba8e275a
ED
1334#endif
1335
ff74e23f 1336/* Called with rcu_read_lock() */
ba8e275a 1337static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
534322ca
DA
1338 const struct sk_buff *skb,
1339 int dif, int sdif)
cfb6eeb4 1340{
ba8e275a 1341#ifdef CONFIG_TCP_MD5SIG
cfb6eeb4
YH
1342 /*
1343 * This gets called for each TCP segment that arrives
1344 * so we want to be efficient.
1345 * We have 3 drop cases:
1346 * o No MD5 hash and one expected.
1347 * o MD5 hash and we're not expecting one.
 1348 * o MD5 hash and it's wrong.
1349 */
cf533ea5 1350 const __u8 *hash_location = NULL;
cfb6eeb4 1351 struct tcp_md5sig_key *hash_expected;
eddc9ec5 1352 const struct iphdr *iph = ip_hdr(skb);
cf533ea5 1353 const struct tcphdr *th = tcp_hdr(skb);
cea97609 1354 const union tcp_md5_addr *addr;
cfb6eeb4 1355 unsigned char newhash[16];
dea53bb8
DA
1356 int genhash, l3index;
1357
1358 /* sdif set, means packet ingressed via a device
1359 * in an L3 domain and dif is set to the l3mdev
1360 */
1361 l3index = sdif ? dif : 0;
cfb6eeb4 1362
cea97609 1363 addr = (union tcp_md5_addr *)&iph->saddr;
dea53bb8 1364 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
7d5d5525 1365 hash_location = tcp_parse_md5sig_option(th);
cfb6eeb4 1366
cfb6eeb4
YH
1367 /* We've parsed the options - do we have a hash? */
1368 if (!hash_expected && !hash_location)
a2a385d6 1369 return false;
cfb6eeb4
YH
1370
1371 if (hash_expected && !hash_location) {
c10d9310 1372 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
a2a385d6 1373 return true;
cfb6eeb4
YH
1374 }
1375
1376 if (!hash_expected && hash_location) {
c10d9310 1377 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
a2a385d6 1378 return true;
cfb6eeb4
YH
1379 }
1380
1381 /* Okay, so this is hash_expected and hash_location -
1382 * so we need to calculate the checksum.
1383 */
49a72dfb
AL
1384 genhash = tcp_v4_md5_hash_skb(newhash,
1385 hash_expected,
39f8e58e 1386 NULL, skb);
cfb6eeb4
YH
1387
1388 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
72145a68 1389 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
dea53bb8 1390 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
e87cc472
JP
1391 &iph->saddr, ntohs(th->source),
1392 &iph->daddr, ntohs(th->dest),
1393 genhash ? " tcp_v4_calc_md5_hash failed"
dea53bb8 1394 : "", l3index);
a2a385d6 1395 return true;
cfb6eeb4 1396 }
a2a385d6 1397 return false;
cfb6eeb4 1398#endif
ba8e275a
ED
1399 return false;
1400}
cfb6eeb4 1401
b40cf18e
ED
1402static void tcp_v4_init_req(struct request_sock *req,
1403 const struct sock *sk_listener,
16bea70a
OP
1404 struct sk_buff *skb)
1405{
1406 struct inet_request_sock *ireq = inet_rsk(req);
c92e8c02 1407 struct net *net = sock_net(sk_listener);
16bea70a 1408
08d2cc3b
ED
1409 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1410 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
c92e8c02 1411 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
16bea70a
OP
1412}
1413
f964629e
ED
1414static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1415 struct flowi *fl,
4396e461 1416 const struct request_sock *req)
d94e0417 1417{
4396e461 1418 return inet_csk_route_req(sk, &fl->u.ip4, req);
d94e0417
OP
1419}
1420
72a3effa 1421struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1da177e4 1422 .family = PF_INET,
2e6599cb 1423 .obj_size = sizeof(struct tcp_request_sock),
5db92c99 1424 .rtx_syn_ack = tcp_rtx_synack,
60236fdd
ACM
1425 .send_ack = tcp_v4_reqsk_send_ack,
1426 .destructor = tcp_v4_reqsk_destructor,
1da177e4 1427 .send_reset = tcp_v4_send_reset,
688d1945 1428 .syn_ack_timeout = tcp_syn_ack_timeout,
1da177e4
LT
1429};
1430
35b2c321 1431const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
2aec4a29 1432 .mss_clamp = TCP_MSS_DEFAULT,
16bea70a 1433#ifdef CONFIG_TCP_MD5SIG
fd3a154a 1434 .req_md5_lookup = tcp_v4_md5_lookup,
e3afe7b7 1435 .calc_md5_hash = tcp_v4_md5_hash_skb,
b6332e6c 1436#endif
16bea70a 1437 .init_req = tcp_v4_init_req,
fb7b37a7
OP
1438#ifdef CONFIG_SYN_COOKIES
1439 .cookie_init_seq = cookie_v4_init_sequence,
1440#endif
d94e0417 1441 .route_req = tcp_v4_route_req,
84b114b9
ED
1442 .init_seq = tcp_v4_init_seq,
1443 .init_ts_off = tcp_v4_init_ts_off,
d6274bd8 1444 .send_synack = tcp_v4_send_synack,
16bea70a 1445};
cfb6eeb4 1446
1da177e4
LT
1447int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1448{
1da177e4 1449 /* Never answer SYNs sent to broadcast or multicast */
511c3f92 1450 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1da177e4
LT
1451 goto drop;
1452
1fb6f159
OP
1453 return tcp_conn_request(&tcp_request_sock_ops,
1454 &tcp_request_sock_ipv4_ops, sk, skb);
1da177e4 1455
1da177e4 1456drop:
9caad864 1457 tcp_listendrop(sk);
1da177e4
LT
1458 return 0;
1459}
4bc2f18b 1460EXPORT_SYMBOL(tcp_v4_conn_request);
1da177e4
LT
1461
1462
1463/*
1464 * The three way handshake has completed - we got a valid synack -
1465 * now create the new socket.
1466 */
0c27171e 1467struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
60236fdd 1468 struct request_sock *req,
5e0724d0
ED
1469 struct dst_entry *dst,
1470 struct request_sock *req_unhash,
1471 bool *own_req)
1da177e4 1472{
2e6599cb 1473 struct inet_request_sock *ireq;
1da177e4
LT
1474 struct inet_sock *newinet;
1475 struct tcp_sock *newtp;
1476 struct sock *newsk;
cfb6eeb4 1477#ifdef CONFIG_TCP_MD5SIG
cea97609 1478 const union tcp_md5_addr *addr;
cfb6eeb4 1479 struct tcp_md5sig_key *key;
dea53bb8 1480 int l3index;
cfb6eeb4 1481#endif
f6d8bd05 1482 struct ip_options_rcu *inet_opt;
1da177e4
LT
1483
1484 if (sk_acceptq_is_full(sk))
1485 goto exit_overflow;
1486
1da177e4
LT
1487 newsk = tcp_create_openreq_child(sk, req, skb);
1488 if (!newsk)
093d2823 1489 goto exit_nonewsk;
1da177e4 1490
bcd76111 1491 newsk->sk_gso_type = SKB_GSO_TCPV4;
fae6ef87 1492 inet_sk_rx_dst_set(newsk, skb);
1da177e4
LT
1493
1494 newtp = tcp_sk(newsk);
1495 newinet = inet_sk(newsk);
2e6599cb 1496 ireq = inet_rsk(req);
d1e559d0
ED
1497 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1498 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
6dd9a14e 1499 newsk->sk_bound_dev_if = ireq->ir_iif;
c92e8c02
ED
1500 newinet->inet_saddr = ireq->ir_loc_addr;
1501 inet_opt = rcu_dereference(ireq->ireq_opt);
1502 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
463c84b9 1503 newinet->mc_index = inet_iif(skb);
eddc9ec5 1504 newinet->mc_ttl = ip_hdr(skb)->ttl;
4c507d28 1505 newinet->rcv_tos = ip_hdr(skb)->tos;
d83d8461 1506 inet_csk(newsk)->icsk_ext_hdr_len = 0;
f6d8bd05
ED
1507 if (inet_opt)
1508 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
a904a069 1509 newinet->inet_id = prandom_u32();
1da177e4 1510
dfd25fff
ED
1511 if (!dst) {
1512 dst = inet_csk_route_child_sock(sk, newsk, req);
1513 if (!dst)
1514 goto put_and_exit;
1515 } else {
1516 /* syncookie case : see end of cookie_v4_check() */
1517 }
0e734419
DM
1518 sk_setup_caps(newsk, dst);
1519
81164413
DB
1520 tcp_ca_openreq_child(newsk, dst);
1521
1da177e4 1522 tcp_sync_mss(newsk, dst_mtu(dst));
3541f9e8 1523 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
f5fff5dc 1524
1da177e4
LT
1525 tcp_initialize_rcv_mss(newsk);
1526
cfb6eeb4 1527#ifdef CONFIG_TCP_MD5SIG
dea53bb8 1528 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
cfb6eeb4 1529 /* Copy over the MD5 key from the original socket */
cea97609 1530 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
dea53bb8 1531 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
00db4124 1532 if (key) {
cfb6eeb4
YH
1533 /*
1534 * We're using one, so create a matching key
1535 * on the newsk structure. If we fail to get
1536 * memory, then we end up not copying the key
1537 * across. Shucks.
1538 */
dea53bb8 1539 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
cea97609 1540 key->key, key->keylen, GFP_ATOMIC);
a465419b 1541 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
cfb6eeb4
YH
1542 }
1543#endif
1544
0e734419
DM
1545 if (__inet_inherit_port(sk, newsk) < 0)
1546 goto put_and_exit;
5e0724d0 1547 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
c92e8c02 1548 if (likely(*own_req)) {
49a496c9 1549 tcp_move_syn(newtp, req);
c92e8c02
ED
1550 ireq->ireq_opt = NULL;
1551 } else {
1552 newinet->inet_opt = NULL;
1553 }
1da177e4
LT
1554 return newsk;
1555
1556exit_overflow:
c10d9310 1557 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
093d2823
BS
1558exit_nonewsk:
1559 dst_release(dst);
1da177e4 1560exit:
9caad864 1561 tcp_listendrop(sk);
1da177e4 1562 return NULL;
0e734419 1563put_and_exit:
c92e8c02 1564 newinet->inet_opt = NULL;
e337e24d
CP
1565 inet_csk_prepare_forced_close(newsk);
1566 tcp_done(newsk);
0e734419 1567 goto exit;
1da177e4 1568}
4bc2f18b 1569EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1da177e4 1570
079096f1 1571static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1da177e4 1572{
079096f1 1573#ifdef CONFIG_SYN_COOKIES
52452c54 1574 const struct tcphdr *th = tcp_hdr(skb);
1da177e4 1575
af9b4738 1576 if (!th->syn)
461b74c3 1577 sk = cookie_v4_check(sk, skb);
1da177e4
LT
1578#endif
1579 return sk;
1580}
1581
9349d600
PP
1582u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1583 struct tcphdr *th, u32 *cookie)
1584{
1585 u16 mss = 0;
1586#ifdef CONFIG_SYN_COOKIES
1587 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1588 &tcp_request_sock_ipv4_ops, sk, th);
1589 if (mss) {
1590 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1591 tcp_synq_overflow(sk);
1592 }
1593#endif
1594 return mss;
1595}
1596
1da177e4 1597/* The socket must have its spinlock held when we get
e994b2f0 1598 * here, unless it is a TCP_LISTEN socket.
1da177e4
LT
1599 *
1600 * We have a potential double-lock case here, so even when
1601 * doing backlog processing we use the BH locking scheme.
1602 * This is because we cannot sleep with the original spinlock
1603 * held.
1604 */
1605int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1606{
cfb6eeb4 1607 struct sock *rsk;
cfb6eeb4 1608
1da177e4 1609 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
404e0a8b
ED
1610 struct dst_entry *dst = sk->sk_rx_dst;
1611
bdeab991 1612 sock_rps_save_rxhash(sk, skb);
3d97379a 1613 sk_mark_napi_id(sk, skb);
404e0a8b 1614 if (dst) {
505fbcf0 1615 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
51456b29 1616 !dst->ops->check(dst, 0)) {
92101b3b
DM
1617 dst_release(dst);
1618 sk->sk_rx_dst = NULL;
1619 }
1620 }
3d97d88e 1621 tcp_rcv_established(sk, skb);
1da177e4
LT
1622 return 0;
1623 }
1624
12e25e10 1625 if (tcp_checksum_complete(skb))
1da177e4
LT
1626 goto csum_err;
1627
1628 if (sk->sk_state == TCP_LISTEN) {
079096f1
ED
1629 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1630
1da177e4
LT
1631 if (!nsk)
1632 goto discard;
1da177e4 1633 if (nsk != sk) {
cfb6eeb4
YH
1634 if (tcp_child_process(sk, nsk, skb)) {
1635 rsk = nsk;
1da177e4 1636 goto reset;
cfb6eeb4 1637 }
1da177e4
LT
1638 return 0;
1639 }
ca55158c 1640 } else
bdeab991 1641 sock_rps_save_rxhash(sk, skb);
ca55158c 1642
72ab4a86 1643 if (tcp_rcv_state_process(sk, skb)) {
cfb6eeb4 1644 rsk = sk;
1da177e4 1645 goto reset;
cfb6eeb4 1646 }
1da177e4
LT
1647 return 0;
1648
1649reset:
cfb6eeb4 1650 tcp_v4_send_reset(rsk, skb);
1da177e4
LT
1651discard:
1652 kfree_skb(skb);
1653 /* Be careful here. If this function gets more complicated and
1654 * gcc suffers from register pressure on the x86, sk (in %ebx)
1655 * might be destroyed here. This current version compiles correctly,
1656 * but you have been warned.
1657 */
1658 return 0;
1659
1660csum_err:
c10d9310
ED
1661 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1662 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1da177e4
LT
1663 goto discard;
1664}
4bc2f18b 1665EXPORT_SYMBOL(tcp_v4_do_rcv);
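/* Hedged sketch (illustration only, not compiled into the kernel): the
 * locking scheme described in the comment above is the "process now or
 * backlog" pattern used by tcp_v4_rcv() further down in this file. A
 * simplified rendering of that caller-side pattern, omitting the error
 * paths and the rx_skb_cache handling of the real code:
 */
static inline void example_deliver_or_backlog(struct sock *sk,
					      struct sk_buff *skb)
{
	bh_lock_sock_nested(sk);
	if (!sock_owned_by_user(sk)) {
		/* Nobody holds the socket lock: process in softirq context. */
		tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		/* Backlog limit hit: tcp_add_backlog() already unlocked the
		 * socket, and the real caller drops the skb at this point.
		 */
		return;
	}
	bh_unlock_sock(sk);
}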
1da177e4 1666
7487449c 1667int tcp_v4_early_demux(struct sk_buff *skb)
41063e9d 1668{
41063e9d
DM
1669 const struct iphdr *iph;
1670 const struct tcphdr *th;
1671 struct sock *sk;
41063e9d 1672
41063e9d 1673 if (skb->pkt_type != PACKET_HOST)
7487449c 1674 return 0;
41063e9d 1675
45f00f99 1676 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
7487449c 1677 return 0;
41063e9d
DM
1678
1679 iph = ip_hdr(skb);
45f00f99 1680 th = tcp_hdr(skb);
41063e9d
DM
1681
1682 if (th->doff < sizeof(struct tcphdr) / 4)
7487449c 1683 return 0;
41063e9d 1684
45f00f99 1685 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
41063e9d 1686 iph->saddr, th->source,
7011d085 1687 iph->daddr, ntohs(th->dest),
3fa6f616 1688 skb->skb_iif, inet_sdif(skb));
41063e9d
DM
1689 if (sk) {
1690 skb->sk = sk;
1691 skb->destructor = sock_edemux;
f7e4eb03 1692 if (sk_fullsock(sk)) {
d0c294c5 1693 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
505fbcf0 1694
41063e9d
DM
1695 if (dst)
1696 dst = dst_check(dst, 0);
92101b3b 1697 if (dst &&
505fbcf0 1698 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
92101b3b 1699 skb_dst_set_noref(skb, dst);
41063e9d
DM
1700 }
1701 }
7487449c 1702 return 0;
41063e9d
DM
1703}
1704
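/* Hedged note (assumption, code lives outside this file):
 * tcp_v4_early_demux() only takes effect because it is registered as the
 * early-demux hook of the IPv4 TCP protocol entry in net/ipv4/af_inet.c,
 * roughly as sketched below, so ip_rcv_finish() can find an established
 * socket and reuse its cached rx dst before doing a route lookup. Field
 * names are recalled from memory; treat the sketch as illustrative only.
 */
#if 0	/* sketch of the registration in net/ipv4/af_inet.c */
static struct net_protocol tcp_protocol = {
	.early_demux		= tcp_v4_early_demux,
	.early_demux_handler	= tcp_v4_early_demux,
	.handler		= tcp_v4_rcv,
	.err_handler		= tcp_v4_err,
	/* other fields omitted */
};
#endif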
c9c33212
ED
1705bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1706{
8265792b 1707 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
4f693b55
ED
1708 struct skb_shared_info *shinfo;
1709 const struct tcphdr *th;
1710 struct tcphdr *thtail;
1711 struct sk_buff *tail;
1712 unsigned int hdrlen;
1713 bool fragstolen;
1714 u32 gso_segs;
1715 int delta;
c9c33212
ED
1716
1717 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1718 * we can fix skb->truesize to its real value to avoid future drops.
1719 * This is valid because skb is not yet charged to the socket.
 1720 * It has been observed that pure SACK packets were sometimes dropped
 1721 * (when built by drivers without the copybreak feature).
1722 */
60b1af33 1723 skb_condense(skb);
c9c33212 1724
ade9628e
ED
1725 skb_dst_drop(skb);
1726
4f693b55
ED
1727 if (unlikely(tcp_checksum_complete(skb))) {
1728 bh_unlock_sock(sk);
1729 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1730 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1731 return true;
1732 }
1733
1734 /* Attempt coalescing to last skb in backlog, even if we are
1735 * above the limits.
1736 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1737 */
1738 th = (const struct tcphdr *)skb->data;
1739 hdrlen = th->doff * 4;
1740 shinfo = skb_shinfo(skb);
1741
1742 if (!shinfo->gso_size)
1743 shinfo->gso_size = skb->len - hdrlen;
1744
1745 if (!shinfo->gso_segs)
1746 shinfo->gso_segs = 1;
1747
1748 tail = sk->sk_backlog.tail;
1749 if (!tail)
1750 goto no_coalesce;
1751 thtail = (struct tcphdr *)tail->data;
1752
1753 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1754 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1755 ((TCP_SKB_CB(tail)->tcp_flags |
ca2fe295
ED
1756 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1757 !((TCP_SKB_CB(tail)->tcp_flags &
1758 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
4f693b55
ED
1759 ((TCP_SKB_CB(tail)->tcp_flags ^
1760 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1761#ifdef CONFIG_TLS_DEVICE
1762 tail->decrypted != skb->decrypted ||
1763#endif
1764 thtail->doff != th->doff ||
1765 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1766 goto no_coalesce;
1767
1768 __skb_pull(skb, hdrlen);
1769 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1770 thtail->window = th->window;
1771
1772 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1773
1774 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1775 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1776
ca2fe295
ED
1777 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1778 * thtail->fin, so that the fast path in tcp_rcv_established()
1779 * is not entered if we append a packet with a FIN.
1780 * SYN, RST, URG are not present.
1781 * ACK is set on both packets.
 1782 * PSH: the TCP stack does not really care about it,
 1783 * at least for 'GRO' packets.
1784 */
1785 thtail->fin |= th->fin;
4f693b55
ED
1786 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1787
1788 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1789 TCP_SKB_CB(tail)->has_rxtstamp = true;
1790 tail->tstamp = skb->tstamp;
1791 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1792 }
1793
 1794 /* Not as strict as GRO. We only need to carry the max mss value */
1795 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1796 skb_shinfo(tail)->gso_size);
1797
1798 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1799 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1800
1801 sk->sk_backlog.len += delta;
1802 __NET_INC_STATS(sock_net(sk),
1803 LINUX_MIB_TCPBACKLOGCOALESCE);
1804 kfree_skb_partial(skb, fragstolen);
1805 return false;
1806 }
1807 __skb_push(skb, hdrlen);
1808
1809no_coalesce:
 1810 /* Only the socket owner can try to collapse/prune rx queues
 1811 * to reduce memory overhead, so add a little headroom here.
 1812 * Only a few socket backlogs are likely to be non-empty concurrently.
1813 */
1814 limit += 64*1024;
1815
c9c33212
ED
1816 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1817 bh_unlock_sock(sk);
1818 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1819 return true;
1820 }
1821 return false;
1822}
1823EXPORT_SYMBOL(tcp_add_backlog);
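/* Hedged worked example (standalone userspace arithmetic, not kernel
 * code): the drop threshold used above is the sum of the receive and
 * send buffer sizes, plus 64KB of headroom once coalescing has failed.
 * The buffer and queue sizes below are assumptions picked only to show
 * the arithmetic.
 */
#include <stdio.h>

int main(void)
{
	unsigned int sk_rcvbuf = 131072;	/* sample sk->sk_rcvbuf */
	unsigned int sk_sndbuf = 16384;		/* sample sk->sk_sndbuf */
	unsigned int backlog_bytes = 200000;	/* already queued + new skb */
	unsigned int limit;

	limit = sk_rcvbuf + sk_sndbuf;		/* as computed on entry */
	limit += 64 * 1024;			/* headroom after no_coalesce */

	/* Roughly models the size check sk_add_backlog() performs above. */
	printf("limit=%u, drop=%s\n", limit,
	       backlog_bytes > limit ? "yes" : "no");
	return 0;
}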
1824
ac6e7800
ED
1825int tcp_filter(struct sock *sk, struct sk_buff *skb)
1826{
1827 struct tcphdr *th = (struct tcphdr *)skb->data;
ac6e7800 1828
f2feaefd 1829 return sk_filter_trim_cap(sk, skb, th->doff * 4);
ac6e7800
ED
1830}
1831EXPORT_SYMBOL(tcp_filter);
1832
eeea10b8
ED
1833static void tcp_v4_restore_cb(struct sk_buff *skb)
1834{
1835 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1836 sizeof(struct inet_skb_parm));
1837}
1838
1839static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1840 const struct tcphdr *th)
1841{
 1842 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
 1843 * barrier() makes sure the compiler won't play fool^Waliasing games.
1844 */
1845 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1846 sizeof(struct inet_skb_parm));
1847 barrier();
1848
1849 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1850 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1851 skb->len - th->doff * 4);
1852 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1853 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1854 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1855 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1856 TCP_SKB_CB(skb)->sacked = 0;
1857 TCP_SKB_CB(skb)->has_rxtstamp =
1858 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1859}
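/* Hedged worked example (standalone userspace arithmetic, not kernel
 * code): the end_seq computed above counts one sequence number for SYN
 * and one for FIN in addition to the payload bytes. The sample values
 * are made up for illustration.
 */
#include <stdio.h>

int main(void)
{
	unsigned int seq = 1000;	/* ntohl(th->seq) */
	unsigned int syn = 0, fin = 1;	/* flag bits of this sample segment */
	unsigned int skb_len = 120;	/* TCP header + payload bytes */
	unsigned int doff = 5;		/* 20-byte header, no options */
	unsigned int end_seq;

	/* Same formula as tcp_v4_fill_cb() above. */
	end_seq = seq + syn + fin + skb_len - doff * 4;
	printf("end_seq = %u\n", end_seq);	/* 1000 + 100 data + 1 FIN = 1101 */
	return 0;
}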
1860
1da177e4
LT
1861/*
1862 * From tcp_input.c
1863 */
1864
1865int tcp_v4_rcv(struct sk_buff *skb)
1866{
3b24d854 1867 struct net *net = dev_net(skb->dev);
8b27dae5 1868 struct sk_buff *skb_to_free;
3fa6f616 1869 int sdif = inet_sdif(skb);
534322ca 1870 int dif = inet_iif(skb);
eddc9ec5 1871 const struct iphdr *iph;
cf533ea5 1872 const struct tcphdr *th;
3b24d854 1873 bool refcounted;
1da177e4
LT
1874 struct sock *sk;
1875 int ret;
1876
1877 if (skb->pkt_type != PACKET_HOST)
1878 goto discard_it;
1879
1880 /* Count it even if it's bad */
90bbcc60 1881 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1da177e4
LT
1882
1883 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1884 goto discard_it;
1885
ea1627c2 1886 th = (const struct tcphdr *)skb->data;
1da177e4 1887
ea1627c2 1888 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1da177e4
LT
1889 goto bad_packet;
1890 if (!pskb_may_pull(skb, th->doff * 4))
1891 goto discard_it;
1892
1893 /* An explanation is required here, I think.
1894 * Packet length and doff are validated by header prediction,
caa20d9a 1895 * provided the case of th->doff==0 is eliminated.
1da177e4 1896 * So, we defer the checks. */
ed70fcfc
TH
1897
1898 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1899 goto csum_error;
1da177e4 1900
ea1627c2 1901 th = (const struct tcphdr *)skb->data;
eddc9ec5 1902 iph = ip_hdr(skb);
4bdc3d66 1903lookup:
a583636a 1904 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
3fa6f616 1905 th->dest, sdif, &refcounted);
1da177e4
LT
1906 if (!sk)
1907 goto no_tcp_socket;
1908
bb134d5d
ED
1909process:
1910 if (sk->sk_state == TCP_TIME_WAIT)
1911 goto do_time_wait;
1912
079096f1
ED
1913 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1914 struct request_sock *req = inet_reqsk(sk);
e0f9759f 1915 bool req_stolen = false;
7716682c 1916 struct sock *nsk;
079096f1
ED
1917
1918 sk = req->rsk_listener;
534322ca 1919 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
e65c332d 1920 sk_drops_add(sk, skb);
72923555
ED
1921 reqsk_put(req);
1922 goto discard_it;
1923 }
4fd44a98
FL
1924 if (tcp_checksum_complete(skb)) {
1925 reqsk_put(req);
1926 goto csum_error;
1927 }
7716682c 1928 if (unlikely(sk->sk_state != TCP_LISTEN)) {
f03f2e15 1929 inet_csk_reqsk_queue_drop_and_put(sk, req);
4bdc3d66
ED
1930 goto lookup;
1931 }
3b24d854
ED
1932 /* We own a reference on the listener, increase it again
1933 * as we might lose it too soon.
1934 */
7716682c 1935 sock_hold(sk);
3b24d854 1936 refcounted = true;
1f3b359f 1937 nsk = NULL;
eeea10b8
ED
1938 if (!tcp_filter(sk, skb)) {
1939 th = (const struct tcphdr *)skb->data;
1940 iph = ip_hdr(skb);
1941 tcp_v4_fill_cb(skb, iph, th);
e0f9759f 1942 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
eeea10b8 1943 }
079096f1
ED
1944 if (!nsk) {
1945 reqsk_put(req);
e0f9759f
ED
1946 if (req_stolen) {
 1947 /* Another CPU got exclusive access to req
 1948 * and created a full-blown socket.
1949 * Try to feed this packet to this socket
1950 * instead of discarding it.
1951 */
1952 tcp_v4_restore_cb(skb);
1953 sock_put(sk);
1954 goto lookup;
1955 }
7716682c 1956 goto discard_and_relse;
079096f1
ED
1957 }
1958 if (nsk == sk) {
079096f1 1959 reqsk_put(req);
eeea10b8 1960 tcp_v4_restore_cb(skb);
079096f1
ED
1961 } else if (tcp_child_process(sk, nsk, skb)) {
1962 tcp_v4_send_reset(nsk, skb);
7716682c 1963 goto discard_and_relse;
079096f1 1964 } else {
7716682c 1965 sock_put(sk);
079096f1
ED
1966 return 0;
1967 }
1968 }
6cce09f8 1969 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
02a1d6e7 1970 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1971 goto discard_and_relse;
6cce09f8 1972 }
d218d111 1973
1da177e4
LT
1974 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1975 goto discard_and_relse;
9ea88a15 1976
534322ca 1977 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
9ea88a15 1978 goto discard_and_relse;
9ea88a15 1979
895b5c9f 1980 nf_reset_ct(skb);
1da177e4 1981
ac6e7800 1982 if (tcp_filter(sk, skb))
1da177e4 1983 goto discard_and_relse;
ac6e7800
ED
1984 th = (const struct tcphdr *)skb->data;
1985 iph = ip_hdr(skb);
eeea10b8 1986 tcp_v4_fill_cb(skb, iph, th);
1da177e4
LT
1987
1988 skb->dev = NULL;
1989
e994b2f0
ED
1990 if (sk->sk_state == TCP_LISTEN) {
1991 ret = tcp_v4_do_rcv(sk, skb);
1992 goto put_and_return;
1993 }
1994
1995 sk_incoming_cpu_update(sk);
1996
c6366184 1997 bh_lock_sock_nested(sk);
a44d6eac 1998 tcp_segs_in(tcp_sk(sk), skb);
1da177e4
LT
1999 ret = 0;
2000 if (!sock_owned_by_user(sk)) {
8b27dae5
ED
2001 skb_to_free = sk->sk_rx_skb_cache;
2002 sk->sk_rx_skb_cache = NULL;
e7942d06 2003 ret = tcp_v4_do_rcv(sk, skb);
8b27dae5
ED
2004 } else {
2005 if (tcp_add_backlog(sk, skb))
2006 goto discard_and_relse;
2007 skb_to_free = NULL;
6b03a53a 2008 }
1da177e4 2009 bh_unlock_sock(sk);
8b27dae5
ED
2010 if (skb_to_free)
2011 __kfree_skb(skb_to_free);
1da177e4 2012
e994b2f0 2013put_and_return:
3b24d854
ED
2014 if (refcounted)
2015 sock_put(sk);
1da177e4
LT
2016
2017 return ret;
2018
2019no_tcp_socket:
2020 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2021 goto discard_it;
2022
eeea10b8
ED
2023 tcp_v4_fill_cb(skb, iph, th);
2024
12e25e10 2025 if (tcp_checksum_complete(skb)) {
6a5dc9e5 2026csum_error:
90bbcc60 2027 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1da177e4 2028bad_packet:
90bbcc60 2029 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1da177e4 2030 } else {
cfb6eeb4 2031 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
2032 }
2033
2034discard_it:
2035 /* Discard frame. */
2036 kfree_skb(skb);
e905a9ed 2037 return 0;
1da177e4
LT
2038
2039discard_and_relse:
532182cd 2040 sk_drops_add(sk, skb);
3b24d854
ED
2041 if (refcounted)
2042 sock_put(sk);
1da177e4
LT
2043 goto discard_it;
2044
2045do_time_wait:
2046 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 2047 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
2048 goto discard_it;
2049 }
2050
eeea10b8
ED
2051 tcp_v4_fill_cb(skb, iph, th);
2052
6a5dc9e5
ED
2053 if (tcp_checksum_complete(skb)) {
2054 inet_twsk_put(inet_twsk(sk));
2055 goto csum_error;
1da177e4 2056 }
9469c7b4 2057 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 2058 case TCP_TW_SYN: {
c346dca1 2059 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
a583636a
CG
2060 &tcp_hashinfo, skb,
2061 __tcp_hdrlen(th),
da5e3630 2062 iph->saddr, th->source,
eddc9ec5 2063 iph->daddr, th->dest,
3fa6f616
DA
2064 inet_iif(skb),
2065 sdif);
1da177e4 2066 if (sk2) {
dbe7faa4 2067 inet_twsk_deschedule_put(inet_twsk(sk));
1da177e4 2068 sk = sk2;
eeea10b8 2069 tcp_v4_restore_cb(skb);
3b24d854 2070 refcounted = false;
1da177e4
LT
2071 goto process;
2072 }
1da177e4 2073 }
fcfd6dfa 2074 /* to ACK */
a8eceea8 2075 fallthrough;
1da177e4
LT
2076 case TCP_TW_ACK:
2077 tcp_v4_timewait_ack(sk, skb);
2078 break;
2079 case TCP_TW_RST:
271c3b9b
FW
2080 tcp_v4_send_reset(sk, skb);
2081 inet_twsk_deschedule_put(inet_twsk(sk));
2082 goto discard_it;
1da177e4
LT
2083 case TCP_TW_SUCCESS:;
2084 }
2085 goto discard_it;
2086}
2087
ccb7c410
DM
2088static struct timewait_sock_ops tcp_timewait_sock_ops = {
2089 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2090 .twsk_unique = tcp_twsk_unique,
2091 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 2092};
1da177e4 2093
63d02d15 2094void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5d299f3d
ED
2095{
2096 struct dst_entry *dst = skb_dst(skb);
2097
5037e9ef 2098 if (dst && dst_hold_safe(dst)) {
ca777eff
ED
2099 sk->sk_rx_dst = dst;
2100 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2101 }
5d299f3d 2102}
63d02d15 2103EXPORT_SYMBOL(inet_sk_rx_dst_set);
5d299f3d 2104
3b401a81 2105const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
2106 .queue_xmit = ip_queue_xmit,
2107 .send_check = tcp_v4_send_check,
2108 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 2109 .sk_rx_dst_set = inet_sk_rx_dst_set,
543d9cfe
ACM
2110 .conn_request = tcp_v4_conn_request,
2111 .syn_recv_sock = tcp_v4_syn_recv_sock,
543d9cfe
ACM
2112 .net_header_len = sizeof(struct iphdr),
2113 .setsockopt = ip_setsockopt,
2114 .getsockopt = ip_getsockopt,
2115 .addr2sockaddr = inet_csk_addr2sockaddr,
2116 .sockaddr_len = sizeof(struct sockaddr_in),
3fdadf7d 2117#ifdef CONFIG_COMPAT
543d9cfe
ACM
2118 .compat_setsockopt = compat_ip_setsockopt,
2119 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 2120#endif
4fab9071 2121 .mtu_reduced = tcp_v4_mtu_reduced,
1da177e4 2122};
4bc2f18b 2123EXPORT_SYMBOL(ipv4_specific);
1da177e4 2124
cfb6eeb4 2125#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 2126static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 2127 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 2128 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 2129 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 2130};
b6332e6c 2131#endif
cfb6eeb4 2132
1da177e4
LT
 2133/* NOTE: A lot of things are set to zero explicitly by the call to
 2134 * sk_alloc(), so they need not be done here.
2135 */
2136static int tcp_v4_init_sock(struct sock *sk)
2137{
6687e988 2138 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 2139
900f65d3 2140 tcp_init_sock(sk);
1da177e4 2141
8292a17a 2142 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 2143
cfb6eeb4 2144#ifdef CONFIG_TCP_MD5SIG
ac807fa8 2145 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 2146#endif
1da177e4 2147
1da177e4
LT
2148 return 0;
2149}
2150
7d06b2e0 2151void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
2152{
2153 struct tcp_sock *tp = tcp_sk(sk);
2154
e1a4aa50
SL
2155 trace_tcp_destroy_sock(sk);
2156
1da177e4
LT
2157 tcp_clear_xmit_timers(sk);
2158
6687e988 2159 tcp_cleanup_congestion_control(sk);
317a76f9 2160
734942cc
DW
2161 tcp_cleanup_ulp(sk);
2162
1da177e4 2163 /* Cleanup up the write buffer. */
fe067e8a 2164 tcp_write_queue_purge(sk);
1da177e4 2165
cf1ef3f0
WW
2166 /* Check if we want to disable active TFO */
2167 tcp_fastopen_active_disable_ofo_check(sk);
2168
1da177e4 2169 /* Cleans up our, hopefully empty, out_of_order_queue. */
9f5afeae 2170 skb_rbtree_purge(&tp->out_of_order_queue);
1da177e4 2171
cfb6eeb4
YH
2172#ifdef CONFIG_TCP_MD5SIG
2173 /* Clean up the MD5 key list, if any */
2174 if (tp->md5sig_info) {
a915da9b 2175 tcp_clear_md5_list(sk);
fb7df5e4 2176 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
cfb6eeb4
YH
2177 tp->md5sig_info = NULL;
2178 }
2179#endif
1a2449a8 2180
1da177e4 2181 /* Clean up a referenced TCP bind bucket. */
463c84b9 2182 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 2183 inet_put_port(sk);
1da177e4 2184
d983ea6f 2185 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
435cf559 2186
cf60af03
YC
2187 /* If socket is aborted during connect operation */
2188 tcp_free_fastopen_req(tp);
1fba70e5 2189 tcp_fastopen_destroy_cipher(sk);
cd8ae852 2190 tcp_saved_syn_free(tp);
cf60af03 2191
180d8cd9 2192 sk_sockets_allocated_dec(sk);
1da177e4 2193}
1da177e4
LT
2194EXPORT_SYMBOL(tcp_v4_destroy_sock);
2195
2196#ifdef CONFIG_PROC_FS
2197/* Proc filesystem TCP sock list dumping. */
2198
a8b690f9
TH
2199/*
 2200 * Get the next listener socket following cur. If cur is NULL, get the first socket
2201 * starting from bucket given in st->bucket; when st->bucket is zero the
2202 * very first socket in the hash table is returned.
2203 */
1da177e4
LT
2204static void *listening_get_next(struct seq_file *seq, void *cur)
2205{
37d849bb 2206 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
5799de0b 2207 struct tcp_iter_state *st = seq->private;
a4146b1b 2208 struct net *net = seq_file_net(seq);
3b24d854 2209 struct inet_listen_hashbucket *ilb;
8dbd76e7 2210 struct hlist_nulls_node *node;
3b24d854 2211 struct sock *sk = cur;
1da177e4
LT
2212
2213 if (!sk) {
3b24d854 2214get_head:
a8b690f9 2215 ilb = &tcp_hashinfo.listening_hash[st->bucket];
9652dc2e 2216 spin_lock(&ilb->lock);
8dbd76e7 2217 sk = sk_nulls_head(&ilb->nulls_head);
a8b690f9 2218 st->offset = 0;
1da177e4
LT
2219 goto get_sk;
2220 }
5caea4ea 2221 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1da177e4 2222 ++st->num;
a8b690f9 2223 ++st->offset;
1da177e4 2224
8dbd76e7 2225 sk = sk_nulls_next(sk);
1da177e4 2226get_sk:
8dbd76e7 2227 sk_nulls_for_each_from(sk, node) {
8475ef9f
PE
2228 if (!net_eq(sock_net(sk), net))
2229 continue;
37d849bb 2230 if (sk->sk_family == afinfo->family)
3b24d854 2231 return sk;
1da177e4 2232 }
9652dc2e 2233 spin_unlock(&ilb->lock);
a8b690f9 2234 st->offset = 0;
3b24d854
ED
2235 if (++st->bucket < INET_LHTABLE_SIZE)
2236 goto get_head;
2237 return NULL;
1da177e4
LT
2238}
2239
2240static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2241{
a8b690f9
TH
2242 struct tcp_iter_state *st = seq->private;
2243 void *rc;
2244
2245 st->bucket = 0;
2246 st->offset = 0;
2247 rc = listening_get_next(seq, NULL);
1da177e4
LT
2248
2249 while (rc && *pos) {
2250 rc = listening_get_next(seq, rc);
2251 --*pos;
2252 }
2253 return rc;
2254}
2255
05dbc7b5 2256static inline bool empty_bucket(const struct tcp_iter_state *st)
6eac5604 2257{
05dbc7b5 2258 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
6eac5604
AK
2259}
2260
a8b690f9
TH
2261/*
2262 * Get first established socket starting from bucket given in st->bucket.
2263 * If st->bucket is zero, the very first socket in the hash is returned.
2264 */
1da177e4
LT
2265static void *established_get_first(struct seq_file *seq)
2266{
37d849bb 2267 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
5799de0b 2268 struct tcp_iter_state *st = seq->private;
a4146b1b 2269 struct net *net = seq_file_net(seq);
1da177e4
LT
2270 void *rc = NULL;
2271
a8b690f9
TH
2272 st->offset = 0;
2273 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 2274 struct sock *sk;
3ab5aee7 2275 struct hlist_nulls_node *node;
9db66bdc 2276 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 2277
6eac5604
AK
2278 /* Lockless fast path for the common case of empty buckets */
2279 if (empty_bucket(st))
2280 continue;
2281
9db66bdc 2282 spin_lock_bh(lock);
3ab5aee7 2283 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
37d849bb 2284 if (sk->sk_family != afinfo->family ||
878628fb 2285 !net_eq(sock_net(sk), net)) {
1da177e4
LT
2286 continue;
2287 }
2288 rc = sk;
2289 goto out;
2290 }
9db66bdc 2291 spin_unlock_bh(lock);
1da177e4
LT
2292 }
2293out:
2294 return rc;
2295}
2296
2297static void *established_get_next(struct seq_file *seq, void *cur)
2298{
37d849bb 2299 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
1da177e4 2300 struct sock *sk = cur;
3ab5aee7 2301 struct hlist_nulls_node *node;
5799de0b 2302 struct tcp_iter_state *st = seq->private;
a4146b1b 2303 struct net *net = seq_file_net(seq);
1da177e4
LT
2304
2305 ++st->num;
a8b690f9 2306 ++st->offset;
1da177e4 2307
05dbc7b5 2308 sk = sk_nulls_next(sk);
1da177e4 2309
3ab5aee7 2310 sk_nulls_for_each_from(sk, node) {
37d849bb
CH
2311 if (sk->sk_family == afinfo->family &&
2312 net_eq(sock_net(sk), net))
05dbc7b5 2313 return sk;
1da177e4
LT
2314 }
2315
05dbc7b5
ED
2316 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2317 ++st->bucket;
2318 return established_get_first(seq);
1da177e4
LT
2319}
2320
2321static void *established_get_idx(struct seq_file *seq, loff_t pos)
2322{
a8b690f9
TH
2323 struct tcp_iter_state *st = seq->private;
2324 void *rc;
2325
2326 st->bucket = 0;
2327 rc = established_get_first(seq);
1da177e4
LT
2328
2329 while (rc && pos) {
2330 rc = established_get_next(seq, rc);
2331 --pos;
7174259e 2332 }
1da177e4
LT
2333 return rc;
2334}
2335
2336static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2337{
2338 void *rc;
5799de0b 2339 struct tcp_iter_state *st = seq->private;
1da177e4 2340
1da177e4
LT
2341 st->state = TCP_SEQ_STATE_LISTENING;
2342 rc = listening_get_idx(seq, &pos);
2343
2344 if (!rc) {
1da177e4
LT
2345 st->state = TCP_SEQ_STATE_ESTABLISHED;
2346 rc = established_get_idx(seq, pos);
2347 }
2348
2349 return rc;
2350}
2351
a8b690f9
TH
2352static void *tcp_seek_last_pos(struct seq_file *seq)
2353{
2354 struct tcp_iter_state *st = seq->private;
2355 int offset = st->offset;
2356 int orig_num = st->num;
2357 void *rc = NULL;
2358
2359 switch (st->state) {
a8b690f9
TH
2360 case TCP_SEQ_STATE_LISTENING:
2361 if (st->bucket >= INET_LHTABLE_SIZE)
2362 break;
2363 st->state = TCP_SEQ_STATE_LISTENING;
2364 rc = listening_get_next(seq, NULL);
2365 while (offset-- && rc)
2366 rc = listening_get_next(seq, rc);
2367 if (rc)
2368 break;
2369 st->bucket = 0;
05dbc7b5 2370 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8eceea8 2371 fallthrough;
a8b690f9 2372 case TCP_SEQ_STATE_ESTABLISHED:
a8b690f9
TH
2373 if (st->bucket > tcp_hashinfo.ehash_mask)
2374 break;
2375 rc = established_get_first(seq);
2376 while (offset-- && rc)
2377 rc = established_get_next(seq, rc);
2378 }
2379
2380 st->num = orig_num;
2381
2382 return rc;
2383}
2384
37d849bb 2385void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1da177e4 2386{
5799de0b 2387 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2388 void *rc;
2389
2390 if (*pos && *pos == st->last_pos) {
2391 rc = tcp_seek_last_pos(seq);
2392 if (rc)
2393 goto out;
2394 }
2395
1da177e4
LT
2396 st->state = TCP_SEQ_STATE_LISTENING;
2397 st->num = 0;
a8b690f9
TH
2398 st->bucket = 0;
2399 st->offset = 0;
2400 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2401
2402out:
2403 st->last_pos = *pos;
2404 return rc;
1da177e4 2405}
37d849bb 2406EXPORT_SYMBOL(tcp_seq_start);
1da177e4 2407
37d849bb 2408void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1da177e4 2409{
a8b690f9 2410 struct tcp_iter_state *st = seq->private;
1da177e4 2411 void *rc = NULL;
1da177e4
LT
2412
2413 if (v == SEQ_START_TOKEN) {
2414 rc = tcp_get_idx(seq, 0);
2415 goto out;
2416 }
1da177e4
LT
2417
2418 switch (st->state) {
1da177e4
LT
2419 case TCP_SEQ_STATE_LISTENING:
2420 rc = listening_get_next(seq, v);
2421 if (!rc) {
1da177e4 2422 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2423 st->bucket = 0;
2424 st->offset = 0;
1da177e4
LT
2425 rc = established_get_first(seq);
2426 }
2427 break;
2428 case TCP_SEQ_STATE_ESTABLISHED:
1da177e4
LT
2429 rc = established_get_next(seq, v);
2430 break;
2431 }
2432out:
2433 ++*pos;
a8b690f9 2434 st->last_pos = *pos;
1da177e4
LT
2435 return rc;
2436}
37d849bb 2437EXPORT_SYMBOL(tcp_seq_next);
1da177e4 2438
37d849bb 2439void tcp_seq_stop(struct seq_file *seq, void *v)
1da177e4 2440{
5799de0b 2441 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2442
2443 switch (st->state) {
1da177e4
LT
2444 case TCP_SEQ_STATE_LISTENING:
2445 if (v != SEQ_START_TOKEN)
9652dc2e 2446 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4 2447 break;
1da177e4
LT
2448 case TCP_SEQ_STATE_ESTABLISHED:
2449 if (v)
9db66bdc 2450 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2451 break;
2452 }
2453}
37d849bb 2454EXPORT_SYMBOL(tcp_seq_stop);
1da177e4 2455
d4f06873 2456static void get_openreq4(const struct request_sock *req,
aa3a0c8c 2457 struct seq_file *f, int i)
1da177e4 2458{
2e6599cb 2459 const struct inet_request_sock *ireq = inet_rsk(req);
fa76ce73 2460 long delta = req->rsk_timer.expires - jiffies;
1da177e4 2461
5e659e4c 2462 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2463 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
1da177e4 2464 i,
634fb979 2465 ireq->ir_loc_addr,
d4f06873 2466 ireq->ir_num,
634fb979
ED
2467 ireq->ir_rmt_addr,
2468 ntohs(ireq->ir_rmt_port),
1da177e4
LT
2469 TCP_SYN_RECV,
2470 0, 0, /* could print option size, but that is af dependent. */
2471 1, /* timers active (only the expire timer) */
a399a805 2472 jiffies_delta_to_clock_t(delta),
e6c022a4 2473 req->num_timeout,
aa3a0c8c
ED
2474 from_kuid_munged(seq_user_ns(f),
2475 sock_i_uid(req->rsk_listener)),
1da177e4
LT
2476 0, /* non standard timer */
2477 0, /* open_requests have no inode */
d4f06873 2478 0,
652586df 2479 req);
1da177e4
LT
2480}
2481
652586df 2482static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
1da177e4
LT
2483{
2484 int timer_active;
2485 unsigned long timer_expires;
cf533ea5 2486 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2487 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2488 const struct inet_sock *inet = inet_sk(sk);
0536fcc0 2489 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
c720c7e8
ED
2490 __be32 dest = inet->inet_daddr;
2491 __be32 src = inet->inet_rcv_saddr;
2492 __u16 destp = ntohs(inet->inet_dport);
2493 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2494 int rx_queue;
00fd38d9 2495 int state;
1da177e4 2496
6ba8a3b1 2497 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
57dde7f7 2498 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
6ba8a3b1 2499 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2500 timer_active = 1;
463c84b9
ACM
2501 timer_expires = icsk->icsk_timeout;
2502 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2503 timer_active = 4;
463c84b9 2504 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2505 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2506 timer_active = 2;
cf4c6bf8 2507 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2508 } else {
2509 timer_active = 0;
2510 timer_expires = jiffies;
2511 }
2512
986ffdfd 2513 state = inet_sk_state_load(sk);
00fd38d9 2514 if (state == TCP_LISTEN)
288efe86 2515 rx_queue = READ_ONCE(sk->sk_ack_backlog);
49d09007 2516 else
00fd38d9
ED
2517 /* Because we don't lock the socket,
2518 * we might find a transient negative value.
49d09007 2519 */
dba7d9b8 2520 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
7db48e98 2521 READ_ONCE(tp->copied_seq), 0);
49d09007 2522
5e659e4c 2523 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
652586df 2524 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
00fd38d9 2525 i, src, srcp, dest, destp, state,
0f317464 2526 READ_ONCE(tp->write_seq) - tp->snd_una,
49d09007 2527 rx_queue,
1da177e4 2528 timer_active,
a399a805 2529 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2530 icsk->icsk_retransmits,
a7cb5a49 2531 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2532 icsk->icsk_probes_out,
cf4c6bf8 2533 sock_i_ino(sk),
41c6d650 2534 refcount_read(&sk->sk_refcnt), sk,
7be87351
SH
2535 jiffies_to_clock_t(icsk->icsk_rto),
2536 jiffies_to_clock_t(icsk->icsk_ack.ato),
31954cd8 2537 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
1da177e4 2538 tp->snd_cwnd,
00fd38d9
ED
2539 state == TCP_LISTEN ?
2540 fastopenq->max_qlen :
652586df 2541 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
1da177e4
LT
2542}
2543
cf533ea5 2544static void get_timewait4_sock(const struct inet_timewait_sock *tw,
652586df 2545 struct seq_file *f, int i)
1da177e4 2546{
789f558c 2547 long delta = tw->tw_timer.expires - jiffies;
23f33c2d 2548 __be32 dest, src;
1da177e4 2549 __u16 destp, srcp;
1da177e4
LT
2550
2551 dest = tw->tw_daddr;
2552 src = tw->tw_rcv_saddr;
2553 destp = ntohs(tw->tw_dport);
2554 srcp = ntohs(tw->tw_sport);
2555
5e659e4c 2556 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2557 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
1da177e4 2558 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2559 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
41c6d650 2560 refcount_read(&tw->tw_refcnt), tw);
1da177e4
LT
2561}
2562
2563#define TMPSZ 150
2564
2565static int tcp4_seq_show(struct seq_file *seq, void *v)
2566{
5799de0b 2567 struct tcp_iter_state *st;
05dbc7b5 2568 struct sock *sk = v;
1da177e4 2569
652586df 2570 seq_setwidth(seq, TMPSZ - 1);
1da177e4 2571 if (v == SEQ_START_TOKEN) {
652586df 2572 seq_puts(seq, " sl local_address rem_address st tx_queue "
1da177e4
LT
2573 "rx_queue tr tm->when retrnsmt uid timeout "
2574 "inode");
2575 goto out;
2576 }
2577 st = seq->private;
2578
079096f1
ED
2579 if (sk->sk_state == TCP_TIME_WAIT)
2580 get_timewait4_sock(v, seq, st->num);
2581 else if (sk->sk_state == TCP_NEW_SYN_RECV)
aa3a0c8c 2582 get_openreq4(v, seq, st->num);
079096f1
ED
2583 else
2584 get_tcp4_sock(v, seq, st->num);
1da177e4 2585out:
652586df 2586 seq_pad(seq, '\n');
1da177e4
LT
2587 return 0;
2588}
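/* Hedged usage sketch (standalone userspace program, not kernel code):
 * tcp4_seq_show() above produces the text of /proc/net/tcp. The address
 * columns are the raw __be32 and port printed as %08X:%04X, so on a
 * little-endian machine "0100007F:1F90" means 127.0.0.1:8080. A minimal
 * reader of that format might look like this:
 */
#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	fgets(line, sizeof(line), f);		/* skip the header row */
	while (fgets(line, sizeof(line), f)) {
		unsigned int laddr, lport, raddr, rport, state;
		struct in_addr a;

		if (sscanf(line, "%*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) != 5)
			continue;
		a.s_addr = laddr;	/* matches network order on little-endian */
		printf("%s:%u  state %02X\n", inet_ntoa(a), lport, state);
	}
	fclose(f);
	return 0;
}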
2589
37d849bb
CH
2590static const struct seq_operations tcp4_seq_ops = {
2591 .show = tcp4_seq_show,
2592 .start = tcp_seq_start,
2593 .next = tcp_seq_next,
2594 .stop = tcp_seq_stop,
2595};
2596
1da177e4 2597static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4 2598 .family = AF_INET,
1da177e4
LT
2599};
2600
2c8c1e72 2601static int __net_init tcp4_proc_init_net(struct net *net)
757764f6 2602{
c3506372
CH
2603 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2604 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
37d849bb
CH
2605 return -ENOMEM;
2606 return 0;
757764f6
PE
2607}
2608
2c8c1e72 2609static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6 2610{
37d849bb 2611 remove_proc_entry("tcp", net->proc_net);
757764f6
PE
2612}
2613
2614static struct pernet_operations tcp4_net_ops = {
2615 .init = tcp4_proc_init_net,
2616 .exit = tcp4_proc_exit_net,
2617};
2618
1da177e4
LT
2619int __init tcp4_proc_init(void)
2620{
757764f6 2621 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2622}
2623
2624void tcp4_proc_exit(void)
2625{
757764f6 2626 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2627}
2628#endif /* CONFIG_PROC_FS */
2629
2630struct proto tcp_prot = {
2631 .name = "TCP",
2632 .owner = THIS_MODULE,
2633 .close = tcp_close,
d74bad4e 2634 .pre_connect = tcp_v4_pre_connect,
1da177e4
LT
2635 .connect = tcp_v4_connect,
2636 .disconnect = tcp_disconnect,
463c84b9 2637 .accept = inet_csk_accept,
1da177e4
LT
2638 .ioctl = tcp_ioctl,
2639 .init = tcp_v4_init_sock,
2640 .destroy = tcp_v4_destroy_sock,
2641 .shutdown = tcp_shutdown,
2642 .setsockopt = tcp_setsockopt,
2643 .getsockopt = tcp_getsockopt,
4b9d07a4 2644 .keepalive = tcp_set_keepalive,
1da177e4 2645 .recvmsg = tcp_recvmsg,
7ba42910
CG
2646 .sendmsg = tcp_sendmsg,
2647 .sendpage = tcp_sendpage,
1da177e4 2648 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 2649 .release_cb = tcp_release_cb,
ab1e0a13
ACM
2650 .hash = inet_hash,
2651 .unhash = inet_unhash,
2652 .get_port = inet_csk_get_port,
1da177e4 2653 .enter_memory_pressure = tcp_enter_memory_pressure,
06044751 2654 .leave_memory_pressure = tcp_leave_memory_pressure,
c9bee3b7 2655 .stream_memory_free = tcp_stream_memory_free,
1da177e4 2656 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 2657 .orphan_count = &tcp_orphan_count,
1da177e4
LT
2658 .memory_allocated = &tcp_memory_allocated,
2659 .memory_pressure = &tcp_memory_pressure,
a4fe34bf 2660 .sysctl_mem = sysctl_tcp_mem,
356d1833
ED
2661 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2662 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
1da177e4
LT
2663 .max_header = MAX_TCP_HEADER,
2664 .obj_size = sizeof(struct tcp_sock),
5f0d5a3a 2665 .slab_flags = SLAB_TYPESAFE_BY_RCU,
6d6ee43e 2666 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 2667 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 2668 .h.hashinfo = &tcp_hashinfo,
7ba42910 2669 .no_autobind = true,
543d9cfe
ACM
2670#ifdef CONFIG_COMPAT
2671 .compat_setsockopt = compat_tcp_setsockopt,
2672 .compat_getsockopt = compat_tcp_getsockopt,
d1a4c0b3 2673#endif
c1e64e29 2674 .diag_destroy = tcp_abort,
1da177e4 2675};
4bc2f18b 2676EXPORT_SYMBOL(tcp_prot);
1da177e4 2677
bdbbb852
ED
2678static void __net_exit tcp_sk_exit(struct net *net)
2679{
2680 int cpu;
2681
b506bc97 2682 if (net->ipv4.tcp_congestion_control)
0baf26b0
MKL
2683 bpf_module_put(net->ipv4.tcp_congestion_control,
2684 net->ipv4.tcp_congestion_control->owner);
6670e152 2685
bdbbb852
ED
2686 for_each_possible_cpu(cpu)
2687 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2688 free_percpu(net->ipv4.tcp_sk);
2689}
2690
046ee902
DL
2691static int __net_init tcp_sk_init(struct net *net)
2692{
fee83d09 2693 int res, cpu, cnt;
bdbbb852
ED
2694
2695 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2696 if (!net->ipv4.tcp_sk)
2697 return -ENOMEM;
2698
2699 for_each_possible_cpu(cpu) {
2700 struct sock *sk;
2701
2702 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2703 IPPROTO_TCP, net);
2704 if (res)
2705 goto fail;
a9d6532b 2706 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
431280ee
ED
2707
2708 /* Please enforce IP_DF and IPID==0 for RST and
2709 * ACK sent in SYN-RECV and TIME-WAIT state.
2710 */
2711 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2712
bdbbb852
ED
2713 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2714 }
49213555 2715
5d134f1c 2716 net->ipv4.sysctl_tcp_ecn = 2;
49213555
DB
2717 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2718
b0f9ca53 2719 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
5f3e2bf0 2720 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
6b58e0a5 2721 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
05cbc0db 2722 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
c04b79b6 2723 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
046ee902 2724
13b287e8 2725 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
9bd6861b 2726 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
b840d15d 2727 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
13b287e8 2728
6fa25166 2729 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
7c083ecb 2730 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
0aca737d 2731 net->ipv4.sysctl_tcp_syncookies = 1;
1043e25f 2732 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
ae5c3f40 2733 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
c6214a97 2734 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
c402d9be 2735 net->ipv4.sysctl_tcp_orphan_retries = 0;
1e579caa 2736 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
4979f2d9 2737 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
79e9fed4 2738 net->ipv4.sysctl_tcp_tw_reuse = 2;
65e6d901 2739 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
12ed8244 2740
fee83d09 2741 cnt = tcp_hashinfo.ehash_mask + 1;
743e4815 2742 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
1946e672
HY
2743 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2744
623d0c2d 2745 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
f9301034 2746 net->ipv4.sysctl_tcp_sack = 1;
9bb37ef0 2747 net->ipv4.sysctl_tcp_window_scaling = 1;
5d2ed052 2748 net->ipv4.sysctl_tcp_timestamps = 1;
2ae21cf5 2749 net->ipv4.sysctl_tcp_early_retrans = 3;
e20223f1 2750 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
b510f0d2 2751 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
e0a1e5b5 2752 net->ipv4.sysctl_tcp_retrans_collapse = 1;
c6e21803 2753 net->ipv4.sysctl_tcp_max_reordering = 300;
6496f6bd 2754 net->ipv4.sysctl_tcp_dsack = 1;
0c12654a 2755 net->ipv4.sysctl_tcp_app_win = 31;
94f0893e 2756 net->ipv4.sysctl_tcp_adv_win_scale = 1;
af9b69a7 2757 net->ipv4.sysctl_tcp_frto = 2;
4540c0cf 2758 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
d06a9904
ED
2759 /* This limits the percentage of the congestion window which we
2760 * will allow a single TSO frame to consume. Building TSO frames
2761 * which are too large can cause TCP streams to be bursty.
2762 */
2763 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
c73e5807
ED
2764 /* Default TSQ limit of 16 TSO segments */
2765 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
b530b681
ED
2766 /* rfc5961 challenge ack rate limiting */
2767 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
26e9596e 2768 net->ipv4.sysctl_tcp_min_tso_segs = 2;
bd239704 2769 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
790f00e1 2770 net->ipv4.sysctl_tcp_autocorking = 1;
4170ba6b 2771 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
23a7102a 2772 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
c26e91f8 2773 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
356d1833
ED
2774 if (net != &init_net) {
2775 memcpy(net->ipv4.sysctl_tcp_rmem,
2776 init_net.ipv4.sysctl_tcp_rmem,
2777 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2778 memcpy(net->ipv4.sysctl_tcp_wmem,
2779 init_net.ipv4.sysctl_tcp_wmem,
2780 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2781 }
6d82aa24 2782 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
9c21d2fc 2783 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
e1cfcbe8 2784 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
43713848 2785 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
3733be14
HY
2786 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2787 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
e1cfcbe8 2788
6670e152
SH
2789 /* Reno is always built in */
2790 if (!net_eq(net, &init_net) &&
0baf26b0
MKL
2791 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2792 init_net.ipv4.tcp_congestion_control->owner))
6670e152
SH
2793 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2794 else
2795 net->ipv4.tcp_congestion_control = &tcp_reno;
2796
49213555 2797 return 0;
bdbbb852
ED
2798fail:
2799 tcp_sk_exit(net);
2800
2801 return res;
b099ce26
EB
2802}
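/* Hedged usage sketch (standalone userspace program, not kernel code):
 * most of the per-netns defaults assigned in tcp_sk_init() surface as the
 * familiar /proc/sys/net/ipv4/ sysctls; for example, sysctl_tcp_syncookies
 * set to 1 above is what "sysctl net.ipv4.tcp_syncookies" reports in a
 * freshly created network namespace. A trivial reader of one such knob:
 */
#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("net.ipv4.tcp_syncookies = %s", buf);	/* "1\n" by default */
	if (f)
		fclose(f);
	return f ? 0 : 1;
}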
2803
2804static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2805{
43713848
HY
2806 struct net *net;
2807
1946e672 2808 inet_twsk_purge(&tcp_hashinfo, AF_INET);
43713848
HY
2809
2810 list_for_each_entry(net, net_exit_list, exit_list)
2811 tcp_fastopen_ctx_destroy(net);
046ee902
DL
2812}
2813
2814static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
2815 .init = tcp_sk_init,
2816 .exit = tcp_sk_exit,
2817 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
2818};
2819
9b0f976f 2820void __init tcp_v4_init(void)
1da177e4 2821{
6a1b3054 2822 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 2823 panic("Failed to create the TCP control socket.\n");
1da177e4 2824}