/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller	:	New socket lookup architecture.
 *				This code is dedicated to John Dyson.
 *	David S. Miller :	Change semantics of established hash,
 *				half is devoted to TIME_WAIT sockets
 *				and the rest go in the other half.
 *	Andi Kleen :		Add support for syncookies and fixed
 *				some bugs: ip options weren't passed to
 *				the TCP layer, missed a check for an
 *				ACK bit.
 *	Andi Kleen :		Implemented fast path mtu discovery.
 *				Fixed many serious bugs in the
 *				request_sock handling and moved
 *				most of it into the af independent code.
 *				Added tail drop and some other bugfixes.
 *				Added new listen semantics.
 *	Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:	ip_dynaddr bits
 *	Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (reuse && time_after32(ktime_get_seconds(),
					    tcptw->tw_ts_recent_stamp)))) {
		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
			if (tp->write_seq == 0)
				tp->write_seq = 1;
			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		}
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

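/* Illustrative note (not part of the original file): with
 * net.ipv4.tcp_tw_reuse=1, an outgoing connect() may reuse a TIME-WAIT
 * port pair once the timestamp seen from the peer has advanced; the
 * write_seq bump of 65535 + 2 above keeps the new connection's sequence
 * space clear of the old one's.  A minimal sketch, assuming a shell:
 *
 *	sysctl -w net.ipv4.tcp_tw_reuse=1
 */
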
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent the BPF program called below from accessing bytes that
	 * are out of the bounds specified by the user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
						       inet->inet_daddr,
						       inet->inet_sport,
						       usin->sin_port);
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = tp->write_seq ^ jiffies;

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

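/* Illustrative sketch (not part of the original file): from userspace the
 * path above is driven by an ordinary connect() on a TCP socket, e.g.:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = { .sin_family = AF_INET };
 *
 *	dst.sin_port = htons(80);
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * With the TCP_FASTOPEN_CONNECT socket option set,
 * tcp_fastopen_defer_connect() above lets connect() return immediately
 * and defers the SYN until the first send.
 */
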
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

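/* Worked example (illustrative, not in the original source): if an
 * ICMP_FRAG_NEEDED quotes an MTU of 1400 on a path previously assumed
 * to be 1500, tcp_sync_mss() above shrinks the MSS to roughly
 * 1400 - 40 = 1360 bytes (IPv4 + TCP base headers, before options), and
 * tcp_simple_retransmit() resends the too-big segments without waiting
 * for the retransmit timer.
 */
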
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_rtx_queue_head(sk);
		BUG_ON(!skb);

		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

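/* Worked example (illustrative, not in the original source): suppose
 * icsk_backoff was 3 with a 200ms base RTO, so the timer had backed off
 * to 200ms << 3 = 1.6s.  A host-unreachable ICMP matching the oldest
 * unacked segment drops the backoff to 2 above, re-arming the timer at
 * 200ms << 2 = 800ms minus the time already elapsed, or retransmitting
 * immediately if that budget is already spent.
 */
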
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

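/* Illustrative note (not in the original source): only the pseudo-header
 * checksum is stored above (note the ~).  csum_start/csum_offset tell the
 * NIC - or skb_checksum_help() as a software fallback - where to fold in
 * the one's complement sum over the TCP header and payload, i.e. this is
 * the CHECKSUM_PARTIAL transmit offload contract.
 */
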
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on the parameters that
 *		arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;
	struct sock *ctl_sk;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We are not losing security here:
		 * the incoming packet is checked against the md5 hash of the
		 * key we find, and no RST is generated if the hash doesn't
		 * match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb),
					     tcp_v4_sdif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
	if (sk)
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

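/* Worked example (illustrative, not in the original source): for a stray
 * 100-byte data segment with SEQ = 1000 and no ACK bit, the reset built
 * above carries ACK_SEQ = 1000 + 100 = 1100 (SYN and FIN each also count
 * as one sequence number), which tells the sender exactly how much of
 * its segment was seen.
 */
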
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
	if (sk)
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq_opt_deref(ireq));
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;

		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

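/* Illustrative note (not in the original source): the loop above is a
 * longest-prefix match.  With keys configured for 10.0.0.0/8 and
 * 10.1.2.0/24, a lookup for 10.1.2.3 matches both but returns the /24
 * key, because its prefixlen is larger.
 */
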
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
		   gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 prefixlen = 32;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET, prefixlen);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

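/* Illustrative sketch (not part of the original file): userspace reaches
 * the handler above through setsockopt().  A minimal example, assuming
 * peer 192.0.2.1 and key "secret":
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &a->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key, taking the tcp_md5_do_del() branch
 * above.
 */
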
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

1191
a915da9b 1192static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
318cf7aa 1193 __be32 daddr, __be32 saddr, const struct tcphdr *th)
49a72dfb
AL
1194{
1195 struct tcp_md5sig_pool *hp;
cf80e0e4 1196 struct ahash_request *req;
49a72dfb
AL
1197
1198 hp = tcp_get_md5sig_pool();
1199 if (!hp)
1200 goto clear_hash_noput;
cf80e0e4 1201 req = hp->md5_req;
49a72dfb 1202
cf80e0e4 1203 if (crypto_ahash_init(req))
49a72dfb 1204 goto clear_hash;
19689e38 1205 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
49a72dfb
AL
1206 goto clear_hash;
1207 if (tcp_md5_hash_key(hp, key))
1208 goto clear_hash;
cf80e0e4
HX
1209 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1210 if (crypto_ahash_final(req))
cfb6eeb4
YH
1211 goto clear_hash;
1212
cfb6eeb4 1213 tcp_put_md5sig_pool();
cfb6eeb4 1214 return 0;
49a72dfb 1215
cfb6eeb4
YH
1216clear_hash:
1217 tcp_put_md5sig_pool();
1218clear_hash_noput:
1219 memset(md5_hash, 0, 16);
49a72dfb 1220 return 1;
cfb6eeb4
YH
1221}
1222
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

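/* Illustrative note (not in the original source): per RFC 2385 the digest
 * covers, in order, the pseudo-header (saddr, daddr, zero pad, protocol,
 * segment length), the TCP header with its checksum field zeroed, the
 * payload, and finally the key - exactly the sequence of the
 * tcp_v4_md5_hash_headers(), tcp_md5_hash_skb_data() and
 * tcp_md5_hash_key() calls above.
 */
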
#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
#endif
	return false;
}

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

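/* Illustrative note (not in the original source): *own_req comes back
 * false when another CPU raced us and hashed its own child socket for
 * the same request first; the caller then discards this newsk and
 * processes the packet against the winning socket instead, which is why
 * the else branch above must not free ireq_opt.
 */
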
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}

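/* Illustrative note (not in the original source): early demux runs from
 * the IP receive path before routing.  By finding the established socket
 * here and reusing its cached rx dst when the incoming interface still
 * matches, the per-packet FIB lookup is skipped for the common in-order
 * case.
 */
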
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets' backlogs are likely to be concurrently non-empty.
	 */
	limit += 64*1024;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);

int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;
	unsigned int eaten = skb->len;
	int err;

	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
	if (!err) {
		eaten -= skb->len;
		TCP_SKB_CB(skb)->end_seq -= eaten;
	}
	return err;
}
EXPORT_SYMBOL(tcp_filter);

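/* TCP reuses skb->cb, which still holds the IP control block (IPCB) when a
 * segment reaches TCP.  tcp_v4_fill_cb() moves the IPCB into the header
 * union inside TCP_SKB_CB() before the TCP-private fields are written, and
 * tcp_v4_restore_cb() undoes the move for packets handed back for another
 * socket lookup.
 */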
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}

static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky: we move IPCB at its correct location into
	 * TCP_SKB_CB(); barrier() makes sure the compiler won't play
	 * fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/*
 *	From tcp_input.c
 */

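/* Protocol handler for inbound IPv4 TCP segments.  In outline: validate
 * header and checksum, look the segment up in the socket hash tables, then
 * dispatch on socket state.  TIME_WAIT and NEW_SYN_RECV pseudo-sockets are
 * handled here; everything else goes through tcp_v4_do_rcv(), either
 * directly or via the socket backlog.
 */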
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	int sdif = inet_sdif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks.
	 */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

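	/* TCP_NEW_SYN_RECV means the lookup hit a request socket: the SYN was
	 * answered but no child socket exists yet.  Validate the segment
	 * against the listener, then let tcp_check_req() either create the
	 * child or report that another CPU already did (req_stolen), in
	 * which case the lookup is retried.
	 */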
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

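	/* Listeners are processed without the socket lock.  For every other
	 * state, take the lock: if a user context currently owns the socket,
	 * queue the segment on the backlog to be replayed at release_sock()
	 * time; otherwise process it here in softirq context.
	 */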
	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		/* fall through */
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	 = sizeof(struct tcp_timewait_sock),
	.twsk_unique	 = tcp_twsk_unique,
	.twsk_destructor = tcp_twsk_destructor,
};

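/* Cache the input route of a received skb on the socket so that
 * tcp_v4_early_demux() can reuse it for subsequent packets of the flow.
 * dst_hold_safe() protects against racing with a dst being released.
 */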
63d02d15 1911void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5d299f3d
ED
1912{
1913 struct dst_entry *dst = skb_dst(skb);
1914
5037e9ef 1915 if (dst && dst_hold_safe(dst)) {
ca777eff
ED
1916 sk->sk_rx_dst = dst;
1917 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1918 }
5d299f3d 1919}
63d02d15 1920EXPORT_SYMBOL(inet_sk_rx_dst_set);
5d299f3d 1921
3b401a81 1922const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
1923 .queue_xmit = ip_queue_xmit,
1924 .send_check = tcp_v4_send_check,
1925 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 1926 .sk_rx_dst_set = inet_sk_rx_dst_set,
543d9cfe
ACM
1927 .conn_request = tcp_v4_conn_request,
1928 .syn_recv_sock = tcp_v4_syn_recv_sock,
543d9cfe
ACM
1929 .net_header_len = sizeof(struct iphdr),
1930 .setsockopt = ip_setsockopt,
1931 .getsockopt = ip_getsockopt,
1932 .addr2sockaddr = inet_csk_addr2sockaddr,
1933 .sockaddr_len = sizeof(struct sockaddr_in),
3fdadf7d 1934#ifdef CONFIG_COMPAT
543d9cfe
ACM
1935 .compat_setsockopt = compat_ip_setsockopt,
1936 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 1937#endif
4fab9071 1938 .mtu_reduced = tcp_v4_mtu_reduced,
1da177e4 1939};
4bc2f18b 1940EXPORT_SYMBOL(ipv4_specific);
1da177e4 1941
cfb6eeb4 1942#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 1943static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 1944 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 1945 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 1946 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 1947};
b6332e6c 1948#endif
cfb6eeb4 1949
/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

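/* Final teardown of a TCP socket: stop timers, release congestion control
 * and ULP state, purge the write and out-of-order queues, drop any MD5 keys
 * and Fast Open state, and release the bound port.
 */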
7d06b2e0 1968void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
1969{
1970 struct tcp_sock *tp = tcp_sk(sk);
1971
e1a4aa50
SL
1972 trace_tcp_destroy_sock(sk);
1973
1da177e4
LT
1974 tcp_clear_xmit_timers(sk);
1975
6687e988 1976 tcp_cleanup_congestion_control(sk);
317a76f9 1977
734942cc
DW
1978 tcp_cleanup_ulp(sk);
1979
1da177e4 1980 /* Cleanup up the write buffer. */
fe067e8a 1981 tcp_write_queue_purge(sk);
1da177e4 1982
cf1ef3f0
WW
1983 /* Check if we want to disable active TFO */
1984 tcp_fastopen_active_disable_ofo_check(sk);
1985
1da177e4 1986 /* Cleans up our, hopefully empty, out_of_order_queue. */
9f5afeae 1987 skb_rbtree_purge(&tp->out_of_order_queue);
1da177e4 1988
cfb6eeb4
YH
1989#ifdef CONFIG_TCP_MD5SIG
1990 /* Clean up the MD5 key list, if any */
1991 if (tp->md5sig_info) {
a915da9b 1992 tcp_clear_md5_list(sk);
fb7df5e4 1993 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
cfb6eeb4
YH
1994 tp->md5sig_info = NULL;
1995 }
1996#endif
1a2449a8 1997
1da177e4 1998 /* Clean up a referenced TCP bind bucket. */
463c84b9 1999 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 2000 inet_put_port(sk);
1da177e4 2001
00db4124 2002 BUG_ON(tp->fastopen_rsk);
435cf559 2003
cf60af03
YC
2004 /* If socket is aborted during connect operation */
2005 tcp_free_fastopen_req(tp);
1fba70e5 2006 tcp_fastopen_destroy_cipher(sk);
cd8ae852 2007 tcp_saved_syn_free(tp);
cf60af03 2008
180d8cd9 2009 sk_sockets_allocated_dec(sk);
1da177e4 2010}
1da177e4
LT
2011EXPORT_SYMBOL(tcp_v4_destroy_sock);
2012
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

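/* The /proc/net/tcp iterator walks two hash tables in turn: every listening
 * socket first (TCP_SEQ_STATE_LISTENING), then every established and
 * timewait socket (TCP_SEQ_STATE_ESTABLISHED).  st->bucket, st->offset and
 * st->num record the position so that a subsequent seq_file read can resume
 * where the previous chunk stopped instead of rescanning from the start.
 */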
/*
 * Get the next listener socket following cur.  If cur is NULL, get the
 * first socket starting from the bucket given in st->bucket; when
 * st->bucket is zero, the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == afinfo->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

05dbc7b5 2072static inline bool empty_bucket(const struct tcp_iter_state *st)
6eac5604 2073{
05dbc7b5 2074 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
6eac5604
AK
2075}
2076
/*
 * Get the first established socket starting from the bucket given in
 * st->bucket.  If st->bucket is zero, the very first socket in the hash
 * is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != afinfo->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == afinfo->family &&
		    net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

37d849bb 2201void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1da177e4 2202{
5799de0b 2203 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2204 void *rc;
2205
2206 if (*pos && *pos == st->last_pos) {
2207 rc = tcp_seek_last_pos(seq);
2208 if (rc)
2209 goto out;
2210 }
2211
1da177e4
LT
2212 st->state = TCP_SEQ_STATE_LISTENING;
2213 st->num = 0;
a8b690f9
TH
2214 st->bucket = 0;
2215 st->offset = 0;
2216 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2217
2218out:
2219 st->last_pos = *pos;
2220 return rc;
1da177e4 2221}
37d849bb 2222EXPORT_SYMBOL(tcp_seq_start);
1da177e4 2223
37d849bb 2224void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1da177e4 2225{
a8b690f9 2226 struct tcp_iter_state *st = seq->private;
1da177e4 2227 void *rc = NULL;
1da177e4
LT
2228
2229 if (v == SEQ_START_TOKEN) {
2230 rc = tcp_get_idx(seq, 0);
2231 goto out;
2232 }
1da177e4
LT
2233
2234 switch (st->state) {
1da177e4
LT
2235 case TCP_SEQ_STATE_LISTENING:
2236 rc = listening_get_next(seq, v);
2237 if (!rc) {
1da177e4 2238 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2239 st->bucket = 0;
2240 st->offset = 0;
1da177e4
LT
2241 rc = established_get_first(seq);
2242 }
2243 break;
2244 case TCP_SEQ_STATE_ESTABLISHED:
1da177e4
LT
2245 rc = established_get_next(seq, v);
2246 break;
2247 }
2248out:
2249 ++*pos;
a8b690f9 2250 st->last_pos = *pos;
1da177e4
LT
2251 return rc;
2252}
37d849bb 2253EXPORT_SYMBOL(tcp_seq_next);
1da177e4 2254
37d849bb 2255void tcp_seq_stop(struct seq_file *seq, void *v)
1da177e4 2256{
5799de0b 2257 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2258
2259 switch (st->state) {
1da177e4
LT
2260 case TCP_SEQ_STATE_LISTENING:
2261 if (v != SEQ_START_TOKEN)
9652dc2e 2262 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4 2263 break;
1da177e4
LT
2264 case TCP_SEQ_STATE_ESTABLISHED:
2265 if (v)
9db66bdc 2266 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2267 break;
2268 }
2269}
37d849bb 2270EXPORT_SYMBOL(tcp_seq_stop);
1da177e4 2271
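/* The three helpers below each emit one row of /proc/net/tcp.  Addresses
 * and most counters are printed in hex.  An illustrative (made-up) row for
 * an established socket:
 *
 *   0: 0100007F:0016 0100007F:A3F2 01 00000000:00000000 02:000732D8 ...
 *
 * i.e. 127.0.0.1:22 connected to 127.0.0.1:41970, state 01 (ESTABLISHED).
 */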
d4f06873 2272static void get_openreq4(const struct request_sock *req,
aa3a0c8c 2273 struct seq_file *f, int i)
1da177e4 2274{
2e6599cb 2275 const struct inet_request_sock *ireq = inet_rsk(req);
fa76ce73 2276 long delta = req->rsk_timer.expires - jiffies;
1da177e4 2277
5e659e4c 2278 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2279 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
1da177e4 2280 i,
634fb979 2281 ireq->ir_loc_addr,
d4f06873 2282 ireq->ir_num,
634fb979
ED
2283 ireq->ir_rmt_addr,
2284 ntohs(ireq->ir_rmt_port),
1da177e4
LT
2285 TCP_SYN_RECV,
2286 0, 0, /* could print option size, but that is af dependent. */
2287 1, /* timers active (only the expire timer) */
a399a805 2288 jiffies_delta_to_clock_t(delta),
e6c022a4 2289 req->num_timeout,
aa3a0c8c
ED
2290 from_kuid_munged(seq_user_ns(f),
2291 sock_i_uid(req->rsk_listener)),
1da177e4
LT
2292 0, /* non standard timer */
2293 0, /* open_requests have no inode */
d4f06873 2294 0,
652586df 2295 req);
1da177e4
LT
2296}
2297
652586df 2298static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
1da177e4
LT
2299{
2300 int timer_active;
2301 unsigned long timer_expires;
cf533ea5 2302 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2303 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2304 const struct inet_sock *inet = inet_sk(sk);
0536fcc0 2305 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
c720c7e8
ED
2306 __be32 dest = inet->inet_daddr;
2307 __be32 src = inet->inet_rcv_saddr;
2308 __u16 destp = ntohs(inet->inet_dport);
2309 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2310 int rx_queue;
00fd38d9 2311 int state;
1da177e4 2312
6ba8a3b1 2313 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
57dde7f7 2314 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
6ba8a3b1 2315 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2316 timer_active = 1;
463c84b9
ACM
2317 timer_expires = icsk->icsk_timeout;
2318 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2319 timer_active = 4;
463c84b9 2320 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2321 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2322 timer_active = 2;
cf4c6bf8 2323 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2324 } else {
2325 timer_active = 0;
2326 timer_expires = jiffies;
2327 }
2328
986ffdfd 2329 state = inet_sk_state_load(sk);
00fd38d9 2330 if (state == TCP_LISTEN)
49d09007
ED
2331 rx_queue = sk->sk_ack_backlog;
2332 else
00fd38d9
ED
2333 /* Because we don't lock the socket,
2334 * we might find a transient negative value.
49d09007
ED
2335 */
2336 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2337
5e659e4c 2338 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
652586df 2339 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
00fd38d9 2340 i, src, srcp, dest, destp, state,
47da8ee6 2341 tp->write_seq - tp->snd_una,
49d09007 2342 rx_queue,
1da177e4 2343 timer_active,
a399a805 2344 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2345 icsk->icsk_retransmits,
a7cb5a49 2346 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2347 icsk->icsk_probes_out,
cf4c6bf8 2348 sock_i_ino(sk),
41c6d650 2349 refcount_read(&sk->sk_refcnt), sk,
7be87351
SH
2350 jiffies_to_clock_t(icsk->icsk_rto),
2351 jiffies_to_clock_t(icsk->icsk_ack.ato),
463c84b9 2352 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1da177e4 2353 tp->snd_cwnd,
00fd38d9
ED
2354 state == TCP_LISTEN ?
2355 fastopenq->max_qlen :
652586df 2356 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
1da177e4
LT
2357}
2358
cf533ea5 2359static void get_timewait4_sock(const struct inet_timewait_sock *tw,
652586df 2360 struct seq_file *f, int i)
1da177e4 2361{
789f558c 2362 long delta = tw->tw_timer.expires - jiffies;
23f33c2d 2363 __be32 dest, src;
1da177e4 2364 __u16 destp, srcp;
1da177e4
LT
2365
2366 dest = tw->tw_daddr;
2367 src = tw->tw_rcv_saddr;
2368 destp = ntohs(tw->tw_dport);
2369 srcp = ntohs(tw->tw_sport);
2370
5e659e4c 2371 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2372 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
1da177e4 2373 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2374 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
41c6d650 2375 refcount_read(&tw->tw_refcnt), tw);
1da177e4
LT
2376}
2377
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};

2c8c1e72 2416static int __net_init tcp4_proc_init_net(struct net *net)
757764f6 2417{
c3506372
CH
2418 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2419 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
37d849bb
CH
2420 return -ENOMEM;
2421 return 0;
757764f6
PE
2422}
2423
2c8c1e72 2424static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6 2425{
37d849bb 2426 remove_proc_entry("tcp", net->proc_net);
757764f6
PE
2427}
2428
2429static struct pernet_operations tcp4_net_ops = {
2430 .init = tcp4_proc_init_net,
2431 .exit = tcp4_proc_exit_net,
2432};
2433
1da177e4
LT
2434int __init tcp4_proc_init(void)
2435{
757764f6 2436 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2437}
2438
2439void tcp4_proc_exit(void)
2440{
757764f6 2441 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2442}
2443#endif /* CONFIG_PROC_FS */
2444
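/* tcp_prot wires the generic socket layer to the IPv4 TCP implementation:
 * each entry is the handler invoked by the core for the corresponding
 * socket operation (connect(), accept(), sendmsg(), ...).
 */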
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	module_put(net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

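/* Per-netns initialisation: create one control socket per possible CPU
 * (used to send RSTs and ACKs on behalf of no socket) and seed every TCP
 * sysctl with its default.  Non-init namespaces inherit tcp_rmem/tcp_wmem
 * and the congestion control module from init_net.
 */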
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;

	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of four TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list)
		tcp_fastopen_ctx_destroy(net);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}