]> git.proxmox.com Git - mirror_ubuntu-eoan-kernel.git/blame - net/ipv4/tcp_ipv4.c
net: ipv4: ip_input: fix blank line coding style issues
[mirror_ubuntu-eoan-kernel.git] / net / ipv4 / tcp_ipv4.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
1da177e4
LT
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
60236fdd 37 * request_sock handling and moved
1da177e4
LT
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
caa20d9a 40 * Added new listen semantics.
1da177e4
LT
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
afd46503 53#define pr_fmt(fmt) "TCP: " fmt
1da177e4 54
eb4dea58 55#include <linux/bottom_half.h>
1da177e4
LT
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
5a0e3ad6 64#include <linux/slab.h>
1da177e4 65
457c4cbc 66#include <net/net_namespace.h>
1da177e4 67#include <net/icmp.h>
304a1618 68#include <net/inet_hashtables.h>
1da177e4 69#include <net/tcp.h>
20380731 70#include <net/transp_v6.h>
1da177e4
LT
71#include <net/ipv6.h>
72#include <net/inet_common.h>
6d6ee43e 73#include <net/timewait_sock.h>
1da177e4 74#include <net/xfrm.h>
6e5714ea 75#include <net/secure_seq.h>
076bb0c8 76#include <net/busy_poll.h>
1da177e4
LT
77
78#include <linux/inet.h>
79#include <linux/ipv6.h>
80#include <linux/stddef.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
6797318e 83#include <linux/inetdevice.h>
1da177e4 84
cf80e0e4 85#include <crypto/hash.h>
cfb6eeb4
YH
86#include <linux/scatterlist.h>
87
c24b14c4
SL
88#include <trace/events/tcp.h>
89
cfb6eeb4 90#ifdef CONFIG_TCP_MD5SIG
a915da9b 91static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
318cf7aa 92 __be32 daddr, __be32 saddr, const struct tcphdr *th);
cfb6eeb4
YH
93#endif
94
5caea4ea 95struct inet_hashinfo tcp_hashinfo;
4bc2f18b 96EXPORT_SYMBOL(tcp_hashinfo);
1da177e4 97
84b114b9 98static u32 tcp_v4_init_seq(const struct sk_buff *skb)
1da177e4 99{
84b114b9
ED
100 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 ip_hdr(skb)->saddr,
102 tcp_hdr(skb)->dest,
103 tcp_hdr(skb)->source);
104}
105
5d2ed052 106static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
84b114b9 107{
5d2ed052 108 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
1da177e4
LT
109}
110
6d6ee43e
ACM
111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112{
79e9fed4 113 const struct inet_timewait_sock *tw = inet_twsk(sktw);
6d6ee43e
ACM
114 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 struct tcp_sock *tp = tcp_sk(sk);
79e9fed4
116 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117
118 if (reuse == 2) {
119 /* Still does not detect *everything* that goes through
120 * lo, since we require a loopback src or dst address
121 * or direct binding to 'lo' interface.
122 */
123 bool loopback = false;
124 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125 loopback = true;
126#if IS_ENABLED(CONFIG_IPV6)
127 if (tw->tw_family == AF_INET6) {
128 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130 (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
134 loopback = true;
135 } else
136#endif
137 {
138 if (ipv4_is_loopback(tw->tw_daddr) ||
139 ipv4_is_loopback(tw->tw_rcv_saddr))
140 loopback = true;
141 }
142 if (!loopback)
143 reuse = 0;
144 }
6d6ee43e
ACM
145
146 /* With PAWS, it is safe from the viewpoint
147 of data integrity. Even without PAWS it is safe provided sequence
148 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
149
150 Actually, the idea is close to VJ's one, only timestamp cache is
151 held not per host, but per port pair and TW bucket is used as state
152 holder.
153
154 If TW bucket has been already destroyed we fall back to VJ's scheme
155 and use initial timestamp retrieved from peer table.
156 */
157 if (tcptw->tw_ts_recent_stamp &&
cca9bab1
AB
158 (!twp || (reuse && time_after32(ktime_get_seconds(),
159 tcptw->tw_ts_recent_stamp)))) {
21684dc4
SB
160 /* In case of repair and re-using TIME-WAIT sockets we still
161 * want to be sure that it is safe as above but honor the
162 * sequence numbers and time stamps set as part of the repair
163 * process.
164 *
165 * Without this check re-using a TIME-WAIT socket with TCP
166 * repair would accumulate a -1 on the repair assigned
167 * sequence number. The first time it is reused the sequence
168 * is -1, the second time -2, etc. This fixes that issue
169 * without appearing to create any others.
170 */
171 if (likely(!tp->repair)) {
172 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
173 if (tp->write_seq == 0)
174 tp->write_seq = 1;
175 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
176 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
177 }
6d6ee43e
ACM
178 sock_hold(sktw);
179 return 1;
180 }
181
182 return 0;
183}
6d6ee43e
ACM
184EXPORT_SYMBOL_GPL(tcp_twsk_unique);
185
d74bad4e
AI
186static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
187 int addr_len)
188{
189 /* This check is replicated from tcp_v4_connect() and intended to
190 * prevent BPF program called below from accessing bytes that are out
191 * of the bound specified by user in addr_len.
192 */
193 if (addr_len < sizeof(struct sockaddr_in))
194 return -EINVAL;
195
196 sock_owned_by_me(sk);
197
198 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
199}
200
1da177e4
LT
201/* This will initiate an outgoing connection. */
202int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
203{
2d7192d6 204 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
1da177e4
LT
205 struct inet_sock *inet = inet_sk(sk);
206 struct tcp_sock *tp = tcp_sk(sk);
dca8b089 207 __be16 orig_sport, orig_dport;
bada8adc 208 __be32 daddr, nexthop;
da905bd1 209 struct flowi4 *fl4;
2d7192d6 210 struct rtable *rt;
1da177e4 211 int err;
f6d8bd05 212 struct ip_options_rcu *inet_opt;
1946e672 213 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
1da177e4
LT
214
215 if (addr_len < sizeof(struct sockaddr_in))
216 return -EINVAL;
217
218 if (usin->sin_family != AF_INET)
219 return -EAFNOSUPPORT;
220
221 nexthop = daddr = usin->sin_addr.s_addr;
f6d8bd05 222 inet_opt = rcu_dereference_protected(inet->inet_opt,
1e1d04e6 223 lockdep_sock_is_held(sk));
f6d8bd05 224 if (inet_opt && inet_opt->opt.srr) {
1da177e4
LT
225 if (!daddr)
226 return -EINVAL;
f6d8bd05 227 nexthop = inet_opt->opt.faddr;
1da177e4
LT
228 }
229
dca8b089
DM
230 orig_sport = inet->inet_sport;
231 orig_dport = usin->sin_port;
da905bd1
DM
232 fl4 = &inet->cork.fl.u.ip4;
233 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
b23dd4fe
DM
234 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
235 IPPROTO_TCP,
0e0d44ab 236 orig_sport, orig_dport, sk);
b23dd4fe
DM
237 if (IS_ERR(rt)) {
238 err = PTR_ERR(rt);
239 if (err == -ENETUNREACH)
f1d8cba6 240 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
b23dd4fe 241 return err;
584bdf8c 242 }
1da177e4
LT
243
244 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
245 ip_rt_put(rt);
246 return -ENETUNREACH;
247 }
248
f6d8bd05 249 if (!inet_opt || !inet_opt->opt.srr)
da905bd1 250 daddr = fl4->daddr;
1da177e4 251
c720c7e8 252 if (!inet->inet_saddr)
da905bd1 253 inet->inet_saddr = fl4->saddr;
d1e559d0 254 sk_rcv_saddr_set(sk, inet->inet_saddr);
1da177e4 255
c720c7e8 256 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
1da177e4
LT
257 /* Reset inherited state */
258 tp->rx_opt.ts_recent = 0;
259 tp->rx_opt.ts_recent_stamp = 0;
ee995283
PE
260 if (likely(!tp->repair))
261 tp->write_seq = 0;
1da177e4
LT
262 }
263
c720c7e8 264 inet->inet_dport = usin->sin_port;
d1e559d0 265 sk_daddr_set(sk, daddr);
1da177e4 266
d83d8461 267 inet_csk(sk)->icsk_ext_hdr_len = 0;
f6d8bd05
ED
268 if (inet_opt)
269 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1da177e4 270
bee7ca9e 271 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
1da177e4
LT
272
273 /* Socket identity is still unknown (sport may be zero).
274 * However we set state to SYN-SENT and not releasing socket
275 * lock select source port, enter ourselves into the hash tables and
276 * complete initialization after this.
277 */
278 tcp_set_state(sk, TCP_SYN_SENT);
1946e672 279 err = inet_hash_connect(tcp_death_row, sk);
1da177e4
LT
280 if (err)
281 goto failure;
282
877d1f62 283 sk_set_txhash(sk);
9e7ceb06 284
da905bd1 285 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
b23dd4fe
DM
286 inet->inet_sport, inet->inet_dport, sk);
287 if (IS_ERR(rt)) {
288 err = PTR_ERR(rt);
289 rt = NULL;
1da177e4 290 goto failure;
b23dd4fe 291 }
1da177e4 292 /* OK, now commit destination to socket. */
bcd76111 293 sk->sk_gso_type = SKB_GSO_TCPV4;
d8d1f30b 294 sk_setup_caps(sk, &rt->dst);
19f6d3f3 295 rt = NULL;
1da177e4 296
00355fa5 297 if (likely(!tp->repair)) {
00355fa5 298 if (!tp->write_seq)
84b114b9
ED
299 tp->write_seq = secure_tcp_seq(inet->inet_saddr,
300 inet->inet_daddr,
301 inet->inet_sport,
302 usin->sin_port);
5d2ed052
ED
303 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
304 inet->inet_saddr,
84b114b9 305 inet->inet_daddr);
00355fa5 306 }
1da177e4 307
c720c7e8 308 inet->inet_id = tp->write_seq ^ jiffies;
1da177e4 309
19f6d3f3
WW
310 if (tcp_fastopen_defer_connect(sk, &err))
311 return err;
312 if (err)
313 goto failure;
314
2b916477 315 err = tcp_connect(sk);
ee995283 316
1da177e4
LT
317 if (err)
318 goto failure;
319
320 return 0;
321
322failure:
7174259e
ACM
323 /*
324 * This unhashes the socket and releases the local port,
325 * if necessary.
326 */
1da177e4
LT
327 tcp_set_state(sk, TCP_CLOSE);
328 ip_rt_put(rt);
329 sk->sk_route_caps = 0;
c720c7e8 330 inet->inet_dport = 0;
1da177e4
LT
331 return err;
332}
4bc2f18b 333EXPORT_SYMBOL(tcp_v4_connect);
1da177e4 334
1da177e4 335/*
563d34d0
ED
336 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
337 * It can be called through tcp_release_cb() if socket was owned by user
338 * at the time tcp_v4_err() was called to handle ICMP message.
1da177e4 339 */
4fab9071 340void tcp_v4_mtu_reduced(struct sock *sk)
1da177e4 341{
1da177e4 342 struct inet_sock *inet = inet_sk(sk);
02b2faaf
ED
343 struct dst_entry *dst;
344 u32 mtu;
1da177e4 345
02b2faaf
ED
346 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
347 return;
348 mtu = tcp_sk(sk)->mtu_info;
80d0a69f
DM
349 dst = inet_csk_update_pmtu(sk, mtu);
350 if (!dst)
1da177e4
LT
351 return;
352
1da177e4
LT
353 /* Something is about to be wrong... Remember soft error
354 * for the case, if this connection will not able to recover.
355 */
356 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
357 sk->sk_err_soft = EMSGSIZE;
358
359 mtu = dst_mtu(dst);
360
361 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
482fc609 362 ip_sk_accept_pmtu(sk) &&
d83d8461 363 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
1da177e4
LT
364 tcp_sync_mss(sk, mtu);
365
366 /* Resend the TCP packet because it's
367 * clear that the old packet has been
368 * dropped. This is the new "fast" path mtu
369 * discovery.
370 */
371 tcp_simple_retransmit(sk);
372 } /* else let the usual retransmit timer handle it */
373}
4fab9071 374EXPORT_SYMBOL(tcp_v4_mtu_reduced);
1da177e4 375
55be7a9c
DM
376static void do_redirect(struct sk_buff *skb, struct sock *sk)
377{
378 struct dst_entry *dst = __sk_dst_check(sk, 0);
379
1ed5c48f 380 if (dst)
6700c270 381 dst->ops->redirect(dst, sk, skb);
55be7a9c
DM
382}
383
26e37360
ED
384
385/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
9cf74903 386void tcp_req_err(struct sock *sk, u32 seq, bool abort)
26e37360
ED
387{
388 struct request_sock *req = inet_reqsk(sk);
389 struct net *net = sock_net(sk);
390
391 /* ICMPs are not backlogged, hence we cannot get
392 * an established socket here.
393 */
26e37360 394 if (seq != tcp_rsk(req)->snt_isn) {
02a1d6e7 395 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
9cf74903 396 } else if (abort) {
26e37360
ED
397 /*
398 * Still in SYN_RECV, just remove it silently.
399 * There is no good way to pass the error to the newly
400 * created socket, and POSIX does not want network
401 * errors returned from accept().
402 */
c6973669 403 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
9caad864 404 tcp_listendrop(req->rsk_listener);
26e37360 405 }
ef84d8ce 406 reqsk_put(req);
26e37360
ED
407}
408EXPORT_SYMBOL(tcp_req_err);
409
1da177e4
LT
410/*
411 * This routine is called by the ICMP module when it gets some
412 * sort of error condition. If err < 0 then the socket should
413 * be closed and the error returned to the user. If err > 0
414 * it's just the icmp type << 8 | icmp code. After adjustment
415 * header points to the first 8 bytes of the tcp header. We need
416 * to find the appropriate port.
417 *
418 * The locking strategy used here is very "optimistic". When
419 * someone else accesses the socket the ICMP is just dropped
420 * and for some paths there is no check at all.
421 * A more general error queue to queue errors for later handling
422 * is probably better.
423 *
424 */
425
32bbd879 426int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
1da177e4 427{
b71d1d42 428 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
4d1a2d9e 429 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
f1ecd5d9 430 struct inet_connection_sock *icsk;
1da177e4
LT
431 struct tcp_sock *tp;
432 struct inet_sock *inet;
4d1a2d9e
DL
433 const int type = icmp_hdr(icmp_skb)->type;
434 const int code = icmp_hdr(icmp_skb)->code;
1da177e4 435 struct sock *sk;
f1ecd5d9 436 struct sk_buff *skb;
0a672f74 437 struct request_sock *fastopen;
9a568de4
ED
438 u32 seq, snd_una;
439 s32 remaining;
440 u32 delta_us;
1da177e4 441 int err;
4d1a2d9e 442 struct net *net = dev_net(icmp_skb->dev);
1da177e4 443
26e37360
ED
444 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
445 th->dest, iph->saddr, ntohs(th->source),
3fa6f616 446 inet_iif(icmp_skb), 0);
1da177e4 447 if (!sk) {
5d3848bc 448 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
32bbd879 449 return -ENOENT;
1da177e4
LT
450 }
451 if (sk->sk_state == TCP_TIME_WAIT) {
9469c7b4 452 inet_twsk_put(inet_twsk(sk));
32bbd879 453 return 0;
1da177e4 454 }
26e37360 455 seq = ntohl(th->seq);
32bbd879
SB
456 if (sk->sk_state == TCP_NEW_SYN_RECV) {
457 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
458 type == ICMP_TIME_EXCEEDED ||
459 (type == ICMP_DEST_UNREACH &&
460 (code == ICMP_NET_UNREACH ||
461 code == ICMP_HOST_UNREACH)));
462 return 0;
463 }
1da177e4
LT
464
465 bh_lock_sock(sk);
466 /* If too many ICMPs get dropped on busy
467 * servers this needs to be solved differently.
563d34d0
ED
468 * We do take care of PMTU discovery (RFC1191) special case :
469 * we can receive locally generated ICMP messages while socket is held.
1da177e4 470 */
b74aa930
ED
471 if (sock_owned_by_user(sk)) {
472 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
02a1d6e7 473 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
b74aa930 474 }
1da177e4
LT
475 if (sk->sk_state == TCP_CLOSE)
476 goto out;
477
97e3ecd1 478 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
02a1d6e7 479 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
97e3ecd1 480 goto out;
481 }
482
f1ecd5d9 483 icsk = inet_csk(sk);
1da177e4 484 tp = tcp_sk(sk);
0a672f74
YC
485 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
486 fastopen = tp->fastopen_rsk;
487 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
1da177e4 488 if (sk->sk_state != TCP_LISTEN &&
0a672f74 489 !between(seq, snd_una, tp->snd_nxt)) {
02a1d6e7 490 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
1da177e4
LT
491 goto out;
492 }
493
494 switch (type) {
55be7a9c 495 case ICMP_REDIRECT:
45caeaa5
JM
496 if (!sock_owned_by_user(sk))
497 do_redirect(icmp_skb, sk);
55be7a9c 498 goto out;
1da177e4
LT
499 case ICMP_SOURCE_QUENCH:
500 /* Just silently ignore these. */
501 goto out;
502 case ICMP_PARAMETERPROB:
503 err = EPROTO;
504 break;
505 case ICMP_DEST_UNREACH:
506 if (code > NR_ICMP_UNREACH)
507 goto out;
508
509 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
0d4f0608
ED
510 /* We are not interested in TCP_LISTEN and open_requests
511 * (SYN-ACKs send out by Linux are always <576bytes so
512 * they should go through unfragmented).
513 */
514 if (sk->sk_state == TCP_LISTEN)
515 goto out;
516
563d34d0 517 tp->mtu_info = info;
144d56e9 518 if (!sock_owned_by_user(sk)) {
563d34d0 519 tcp_v4_mtu_reduced(sk);
144d56e9 520 } else {
7aa5470c 521 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
144d56e9
ED
522 sock_hold(sk);
523 }
1da177e4
LT
524 goto out;
525 }
526
527 err = icmp_err_convert[code].errno;
f1ecd5d9
DL
528 /* check if icmp_skb allows revert of backoff
529 * (see draft-zimmermann-tcp-lcd) */
530 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
531 break;
532 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
0a672f74 533 !icsk->icsk_backoff || fastopen)
f1ecd5d9
DL
534 break;
535
8f49c270
DM
536 if (sock_owned_by_user(sk))
537 break;
538
f1ecd5d9 539 icsk->icsk_backoff--;
fcdd1cf4
ED
540 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
541 TCP_TIMEOUT_INIT;
542 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
f1ecd5d9 543
75c119af 544 skb = tcp_rtx_queue_head(sk);
f1ecd5d9 545
9a568de4 546 tcp_mstamp_refresh(tp);
2fd66ffb 547 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
7faee5c0 548 remaining = icsk->icsk_rto -
9a568de4 549 usecs_to_jiffies(delta_us);
f1ecd5d9 550
9a568de4 551 if (remaining > 0) {
f1ecd5d9
DL
552 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
553 remaining, TCP_RTO_MAX);
f1ecd5d9
DL
554 } else {
555 /* RTO revert clocked out retransmission.
556 * Will retransmit now */
557 tcp_retransmit_timer(sk);
558 }
559
1da177e4
LT
560 break;
561 case ICMP_TIME_EXCEEDED:
562 err = EHOSTUNREACH;
563 break;
564 default:
565 goto out;
566 }
567
568 switch (sk->sk_state) {
1da177e4 569 case TCP_SYN_SENT:
0a672f74
YC
570 case TCP_SYN_RECV:
571 /* Only in fast or simultaneous open. If a fast open socket is
572 * is already accepted it is treated as a connected one below.
573 */
51456b29 574 if (fastopen && !fastopen->sk)
0a672f74
YC
575 break;
576
1da177e4 577 if (!sock_owned_by_user(sk)) {
1da177e4
LT
578 sk->sk_err = err;
579
580 sk->sk_error_report(sk);
581
582 tcp_done(sk);
583 } else {
584 sk->sk_err_soft = err;
585 }
586 goto out;
587 }
588
589 /* If we've already connected we will keep trying
590 * until we time out, or the user gives up.
591 *
592 * rfc1122 4.2.3.9 allows to consider as hard errors
593 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
594 * but it is obsoleted by pmtu discovery).
595 *
596 * Note, that in modern internet, where routing is unreliable
597 * and in each dark corner broken firewalls sit, sending random
598 * errors ordered by their masters even this two messages finally lose
599 * their original sense (even Linux sends invalid PORT_UNREACHs)
600 *
601 * Now we are in compliance with RFCs.
602 * --ANK (980905)
603 */
604
605 inet = inet_sk(sk);
606 if (!sock_owned_by_user(sk) && inet->recverr) {
607 sk->sk_err = err;
608 sk->sk_error_report(sk);
609 } else { /* Only an error on timeout */
610 sk->sk_err_soft = err;
611 }
612
613out:
614 bh_unlock_sock(sk);
615 sock_put(sk);
32bbd879 616 return 0;
1da177e4
LT
617}
618
28850dc7 619void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1da177e4 620{
aa8223c7 621 struct tcphdr *th = tcp_hdr(skb);
1da177e4 622
98be9b12
ED
623 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
624 skb->csum_start = skb_transport_header(skb) - skb->head;
625 skb->csum_offset = offsetof(struct tcphdr, check);
1da177e4
LT
626}
627
419f9f89 628/* This routine computes an IPv4 TCP checksum. */
bb296246 629void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
419f9f89 630{
cf533ea5 631 const struct inet_sock *inet = inet_sk(sk);
419f9f89
HX
632
633 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
634}
4bc2f18b 635EXPORT_SYMBOL(tcp_v4_send_check);
419f9f89 636
1da177e4
LT
637/*
638 * This routine will send an RST to the other tcp.
639 *
640 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
641 * for reset.
642 * Answer: if a packet caused RST, it is not for a socket
643 * existing in our system, if it is matched to a socket,
644 * it is just duplicate segment or bug in other side's TCP.
645 * So that we build reply only basing on parameters
646 * arrived with segment.
647 * Exception: precedence violation. We do not implement it in any case.
648 */
649
a00e7444 650static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
1da177e4 651{
cf533ea5 652 const struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4
YH
653 struct {
654 struct tcphdr th;
655#ifdef CONFIG_TCP_MD5SIG
714e85be 656 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
cfb6eeb4
YH
657#endif
658 } rep;
1da177e4 659 struct ip_reply_arg arg;
cfb6eeb4 660#ifdef CONFIG_TCP_MD5SIG
e46787f0 661 struct tcp_md5sig_key *key = NULL;
658ddaaf
SL
662 const __u8 *hash_location = NULL;
663 unsigned char newhash[16];
664 int genhash;
665 struct sock *sk1 = NULL;
cfb6eeb4 666#endif
a86b1e30 667 struct net *net;
00483690 668 struct sock *ctl_sk;
1da177e4
LT
669
670 /* Never send a reset in response to a reset. */
671 if (th->rst)
672 return;
673
c3658e8d
ED
674 /* If sk not NULL, it means we did a successful lookup and incoming
675 * route had to be correct. prequeue might have dropped our dst.
676 */
677 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
1da177e4
LT
678 return;
679
680 /* Swap the send and the receive. */
cfb6eeb4
YH
681 memset(&rep, 0, sizeof(rep));
682 rep.th.dest = th->source;
683 rep.th.source = th->dest;
684 rep.th.doff = sizeof(struct tcphdr) / 4;
685 rep.th.rst = 1;
1da177e4
LT
686
687 if (th->ack) {
cfb6eeb4 688 rep.th.seq = th->ack_seq;
1da177e4 689 } else {
cfb6eeb4
YH
690 rep.th.ack = 1;
691 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
692 skb->len - (th->doff << 2));
1da177e4
LT
693 }
694
7174259e 695 memset(&arg, 0, sizeof(arg));
cfb6eeb4
YH
696 arg.iov[0].iov_base = (unsigned char *)&rep;
697 arg.iov[0].iov_len = sizeof(rep.th);
698
0f85feae 699 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
cfb6eeb4 700#ifdef CONFIG_TCP_MD5SIG
3b24d854 701 rcu_read_lock();
658ddaaf 702 hash_location = tcp_parse_md5sig_option(th);
271c3b9b 703 if (sk && sk_fullsock(sk)) {
e46787f0
FW
704 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
705 &ip_hdr(skb)->saddr, AF_INET);
706 } else if (hash_location) {
658ddaaf
SL
707 /*
708 * active side is lost. Try to find listening socket through
709 * source port, and then find md5 key through listening socket.
710 * we are not loose security here:
711 * Incoming packet is checked with md5 hash with finding key,
712 * no RST generated if md5 hash doesn't match.
713 */
a583636a
CG
714 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
715 ip_hdr(skb)->saddr,
da5e3630 716 th->source, ip_hdr(skb)->daddr,
3fa6f616
DA
717 ntohs(th->source), inet_iif(skb),
718 tcp_v4_sdif(skb));
658ddaaf
SL
719 /* don't send rst if it can't find key */
720 if (!sk1)
3b24d854
ED
721 goto out;
722
658ddaaf
SL
723 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
724 &ip_hdr(skb)->saddr, AF_INET);
725 if (!key)
3b24d854
ED
726 goto out;
727
658ddaaf 728
39f8e58e 729 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
658ddaaf 730 if (genhash || memcmp(hash_location, newhash, 16) != 0)
3b24d854
ED
731 goto out;
732
658ddaaf
SL
733 }
734
cfb6eeb4
YH
735 if (key) {
736 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
737 (TCPOPT_NOP << 16) |
738 (TCPOPT_MD5SIG << 8) |
739 TCPOLEN_MD5SIG);
740 /* Update length and the length the header thinks exists */
741 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
742 rep.th.doff = arg.iov[0].iov_len / 4;
743
49a72dfb 744 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
78e645cb
IJ
745 key, ip_hdr(skb)->saddr,
746 ip_hdr(skb)->daddr, &rep.th);
cfb6eeb4
YH
747 }
748#endif
eddc9ec5
ACM
749 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
750 ip_hdr(skb)->saddr, /* XXX */
52cd5750 751 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1da177e4 752 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
271c3b9b
FW
753 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
754
e2446eaa 755 /* When socket is gone, all binding information is lost.
4c675258
AK
756 * routing might fail in this case. No choice here, if we choose to force
757 * input interface, we will misroute in case of asymmetric route.
e2446eaa 758 */
c24b14c4 759 if (sk) {
4c675258 760 arg.bound_dev_if = sk->sk_bound_dev_if;
5c487bb9
SL
761 if (sk_fullsock(sk))
762 trace_tcp_send_reset(sk, skb);
c24b14c4 763 }
1da177e4 764
271c3b9b
FW
765 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
766 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
767
66b13d99 768 arg.tos = ip_hdr(skb)->tos;
e2d118a1 769 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
47dcc20a 770 local_bh_disable();
00483690
JM
771 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
772 if (sk)
773 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
774 inet_twsk(sk)->tw_mark : sk->sk_mark;
775 ip_send_unicast_reply(ctl_sk,
bdbbb852 776 skb, &TCP_SKB_CB(skb)->header.h4.opt,
24a2d43d
ED
777 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
778 &arg, arg.iov[0].iov_len);
1da177e4 779
00483690 780 ctl_sk->sk_mark = 0;
90bbcc60
ED
781 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
782 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
47dcc20a 783 local_bh_enable();
658ddaaf
SL
784
785#ifdef CONFIG_TCP_MD5SIG
3b24d854
ED
786out:
787 rcu_read_unlock();
658ddaaf 788#endif
1da177e4
LT
789}
790
791/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
792 outside socket context is ugly, certainly. What can I do?
793 */
794
e2d118a1 795static void tcp_v4_send_ack(const struct sock *sk,
e62a123b 796 struct sk_buff *skb, u32 seq, u32 ack,
ee684b6f 797 u32 win, u32 tsval, u32 tsecr, int oif,
88ef4a5a 798 struct tcp_md5sig_key *key,
66b13d99 799 int reply_flags, u8 tos)
1da177e4 800{
cf533ea5 801 const struct tcphdr *th = tcp_hdr(skb);
1da177e4
LT
802 struct {
803 struct tcphdr th;
714e85be 804 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
cfb6eeb4 805#ifdef CONFIG_TCP_MD5SIG
714e85be 806 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
cfb6eeb4
YH
807#endif
808 ];
1da177e4 809 } rep;
e2d118a1 810 struct net *net = sock_net(sk);
1da177e4 811 struct ip_reply_arg arg;
00483690 812 struct sock *ctl_sk;
1da177e4
LT
813
814 memset(&rep.th, 0, sizeof(struct tcphdr));
7174259e 815 memset(&arg, 0, sizeof(arg));
1da177e4
LT
816
817 arg.iov[0].iov_base = (unsigned char *)&rep;
818 arg.iov[0].iov_len = sizeof(rep.th);
ee684b6f 819 if (tsecr) {
cfb6eeb4
YH
820 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
821 (TCPOPT_TIMESTAMP << 8) |
822 TCPOLEN_TIMESTAMP);
ee684b6f
AV
823 rep.opt[1] = htonl(tsval);
824 rep.opt[2] = htonl(tsecr);
cb48cfe8 825 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
1da177e4
LT
826 }
827
828 /* Swap the send and the receive. */
829 rep.th.dest = th->source;
830 rep.th.source = th->dest;
831 rep.th.doff = arg.iov[0].iov_len / 4;
832 rep.th.seq = htonl(seq);
833 rep.th.ack_seq = htonl(ack);
834 rep.th.ack = 1;
835 rep.th.window = htons(win);
836
cfb6eeb4 837#ifdef CONFIG_TCP_MD5SIG
cfb6eeb4 838 if (key) {
ee684b6f 839 int offset = (tsecr) ? 3 : 0;
cfb6eeb4
YH
840
841 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
842 (TCPOPT_NOP << 16) |
843 (TCPOPT_MD5SIG << 8) |
844 TCPOLEN_MD5SIG);
845 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
846 rep.th.doff = arg.iov[0].iov_len/4;
847
49a72dfb 848 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
90b7e112
AL
849 key, ip_hdr(skb)->saddr,
850 ip_hdr(skb)->daddr, &rep.th);
cfb6eeb4
YH
851 }
852#endif
88ef4a5a 853 arg.flags = reply_flags;
eddc9ec5
ACM
854 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
855 ip_hdr(skb)->saddr, /* XXX */
1da177e4
LT
856 arg.iov[0].iov_len, IPPROTO_TCP, 0);
857 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
9501f972
YH
858 if (oif)
859 arg.bound_dev_if = oif;
66b13d99 860 arg.tos = tos;
e2d118a1 861 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
47dcc20a 862 local_bh_disable();
00483690
JM
863 ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
864 if (sk)
865 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
866 inet_twsk(sk)->tw_mark : sk->sk_mark;
867 ip_send_unicast_reply(ctl_sk,
bdbbb852 868 skb, &TCP_SKB_CB(skb)->header.h4.opt,
24a2d43d
ED
869 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
870 &arg, arg.iov[0].iov_len);
1da177e4 871
00483690 872 ctl_sk->sk_mark = 0;
90bbcc60 873 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
47dcc20a 874 local_bh_enable();
1da177e4
LT
875}
876
877static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
878{
8feaf0c0 879 struct inet_timewait_sock *tw = inet_twsk(sk);
cfb6eeb4 880 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1da177e4 881
e2d118a1 882 tcp_v4_send_ack(sk, skb,
e62a123b 883 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
7174259e 884 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
9a568de4 885 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
9501f972
YH
886 tcptw->tw_ts_recent,
887 tw->tw_bound_dev_if,
88ef4a5a 888 tcp_twsk_md5_key(tcptw),
66b13d99
ED
889 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
890 tw->tw_tos
9501f972 891 );
1da177e4 892
8feaf0c0 893 inet_twsk_put(tw);
1da177e4
LT
894}
895
a00e7444 896static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
7174259e 897 struct request_sock *req)
1da177e4 898{
168a8f58
JC
899 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
900 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
901 */
e62a123b
ED
902 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
903 tcp_sk(sk)->snd_nxt;
904
20a2b49f
ED
905 /* RFC 7323 2.3
906 * The window field (SEG.WND) of every outgoing segment, with the
907 * exception of <SYN> segments, MUST be right-shifted by
908 * Rcv.Wind.Shift bits:
909 */
e2d118a1 910 tcp_v4_send_ack(sk, skb, seq,
20a2b49f
ED
911 tcp_rsk(req)->rcv_nxt,
912 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
9a568de4 913 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
9501f972
YH
914 req->ts_recent,
915 0,
30791ac4 916 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
a915da9b 917 AF_INET),
66b13d99
ED
918 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
919 ip_hdr(skb)->tos);
1da177e4
LT
920}
921
1da177e4 922/*
9bf1d83e 923 * Send a SYN-ACK after having received a SYN.
60236fdd 924 * This still operates on a request_sock only, not on a big
1da177e4
LT
925 * socket.
926 */
0f935dbe 927static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
d6274bd8 928 struct flowi *fl,
72659ecc 929 struct request_sock *req,
ca6fb065 930 struct tcp_fastopen_cookie *foc,
b3d05147 931 enum tcp_synack_type synack_type)
1da177e4 932{
2e6599cb 933 const struct inet_request_sock *ireq = inet_rsk(req);
6bd023f3 934 struct flowi4 fl4;
1da177e4 935 int err = -1;
d41db5af 936 struct sk_buff *skb;
1da177e4
LT
937
938 /* First, grab a route. */
ba3f7f04 939 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
fd80eb94 940 return -1;
1da177e4 941
b3d05147 942 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
1da177e4
LT
943
944 if (skb) {
634fb979 945 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1da177e4 946
2ab2ddd3 947 rcu_read_lock();
634fb979
ED
948 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
949 ireq->ir_rmt_addr,
2ab2ddd3
ED
950 rcu_dereference(ireq->ireq_opt));
951 rcu_read_unlock();
b9df3cb8 952 err = net_xmit_eval(err);
1da177e4
LT
953 }
954
1da177e4
LT
955 return err;
956}
957
958/*
60236fdd 959 * IPv4 request_sock destructor.
1da177e4 960 */
60236fdd 961static void tcp_v4_reqsk_destructor(struct request_sock *req)
1da177e4 962{
c92e8c02 963 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1da177e4
LT
964}
965
cfb6eeb4
YH
966#ifdef CONFIG_TCP_MD5SIG
967/*
968 * RFC2385 MD5 checksumming requires a mapping of
969 * IP address->MD5 Key.
970 * We need to maintain these in the sk structure.
971 */
972
6015c71e
ED
973struct static_key tcp_md5_needed __read_mostly;
974EXPORT_SYMBOL(tcp_md5_needed);
975
cfb6eeb4 976/* Find the Key structure for an address. */
6015c71e
ED
977struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
978 const union tcp_md5_addr *addr,
979 int family)
cfb6eeb4 980{
fd3a154a 981 const struct tcp_sock *tp = tcp_sk(sk);
a915da9b 982 struct tcp_md5sig_key *key;
fd3a154a 983 const struct tcp_md5sig_info *md5sig;
6797318e
ID
984 __be32 mask;
985 struct tcp_md5sig_key *best_match = NULL;
986 bool match;
cfb6eeb4 987
a8afca03
ED
988 /* caller either holds rcu_read_lock() or socket lock */
989 md5sig = rcu_dereference_check(tp->md5sig_info,
1e1d04e6 990 lockdep_sock_is_held(sk));
a8afca03 991 if (!md5sig)
cfb6eeb4 992 return NULL;
083a0326 993
b67bfe0d 994 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
a915da9b
ED
995 if (key->family != family)
996 continue;
6797318e
ID
997
998 if (family == AF_INET) {
999 mask = inet_make_mask(key->prefixlen);
1000 match = (key->addr.a4.s_addr & mask) ==
1001 (addr->a4.s_addr & mask);
1002#if IS_ENABLED(CONFIG_IPV6)
1003 } else if (family == AF_INET6) {
1004 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1005 key->prefixlen);
1006#endif
1007 } else {
1008 match = false;
1009 }
1010
1011 if (match && (!best_match ||
1012 key->prefixlen > best_match->prefixlen))
1013 best_match = key;
1014 }
1015 return best_match;
1016}
6015c71e 1017EXPORT_SYMBOL(__tcp_md5_do_lookup);
6797318e 1018
e8f37d57
WF
1019static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1020 const union tcp_md5_addr *addr,
1021 int family, u8 prefixlen)
6797318e
ID
1022{
1023 const struct tcp_sock *tp = tcp_sk(sk);
1024 struct tcp_md5sig_key *key;
1025 unsigned int size = sizeof(struct in_addr);
1026 const struct tcp_md5sig_info *md5sig;
1027
1028 /* caller either holds rcu_read_lock() or socket lock */
1029 md5sig = rcu_dereference_check(tp->md5sig_info,
1030 lockdep_sock_is_held(sk));
1031 if (!md5sig)
1032 return NULL;
1033#if IS_ENABLED(CONFIG_IPV6)
1034 if (family == AF_INET6)
1035 size = sizeof(struct in6_addr);
1036#endif
1037 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1038 if (key->family != family)
1039 continue;
1040 if (!memcmp(&key->addr, addr, size) &&
1041 key->prefixlen == prefixlen)
a915da9b 1042 return key;
cfb6eeb4
YH
1043 }
1044 return NULL;
1045}
1046
b83e3deb 1047struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
fd3a154a 1048 const struct sock *addr_sk)
cfb6eeb4 1049{
b52e6921 1050 const union tcp_md5_addr *addr;
a915da9b 1051
b52e6921 1052 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
a915da9b 1053 return tcp_md5_do_lookup(sk, addr, AF_INET);
cfb6eeb4 1054}
cfb6eeb4
YH
1055EXPORT_SYMBOL(tcp_v4_md5_lookup);
1056
cfb6eeb4 1057/* This can be called on a newly created socket, from other files */
a915da9b 1058int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
6797318e
ID
1059 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1060 gfp_t gfp)
cfb6eeb4
YH
1061{
1062 /* Add Key to the list */
b0a713e9 1063 struct tcp_md5sig_key *key;
cfb6eeb4 1064 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1065 struct tcp_md5sig_info *md5sig;
cfb6eeb4 1066
6797318e 1067 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
cfb6eeb4
YH
1068 if (key) {
1069 /* Pre-existing entry - just update that one. */
a915da9b 1070 memcpy(key->key, newkey, newkeylen);
b0a713e9 1071 key->keylen = newkeylen;
a915da9b
ED
1072 return 0;
1073 }
260fcbeb 1074
a8afca03 1075 md5sig = rcu_dereference_protected(tp->md5sig_info,
1e1d04e6 1076 lockdep_sock_is_held(sk));
a915da9b
ED
1077 if (!md5sig) {
1078 md5sig = kmalloc(sizeof(*md5sig), gfp);
1079 if (!md5sig)
cfb6eeb4 1080 return -ENOMEM;
cfb6eeb4 1081
a915da9b
ED
1082 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1083 INIT_HLIST_HEAD(&md5sig->head);
a8afca03 1084 rcu_assign_pointer(tp->md5sig_info, md5sig);
a915da9b 1085 }
cfb6eeb4 1086
5f3d9cb2 1087 key = sock_kmalloc(sk, sizeof(*key), gfp);
a915da9b
ED
1088 if (!key)
1089 return -ENOMEM;
71cea17e 1090 if (!tcp_alloc_md5sig_pool()) {
5f3d9cb2 1091 sock_kfree_s(sk, key, sizeof(*key));
a915da9b 1092 return -ENOMEM;
cfb6eeb4 1093 }
a915da9b
ED
1094
1095 memcpy(key->key, newkey, newkeylen);
1096 key->keylen = newkeylen;
1097 key->family = family;
6797318e 1098 key->prefixlen = prefixlen;
a915da9b
ED
1099 memcpy(&key->addr, addr,
1100 (family == AF_INET6) ? sizeof(struct in6_addr) :
1101 sizeof(struct in_addr));
1102 hlist_add_head_rcu(&key->node, &md5sig->head);
cfb6eeb4
YH
1103 return 0;
1104}
a915da9b 1105EXPORT_SYMBOL(tcp_md5_do_add);
cfb6eeb4 1106
6797318e
ID
1107int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1108 u8 prefixlen)
cfb6eeb4 1109{
a915da9b
ED
1110 struct tcp_md5sig_key *key;
1111
6797318e 1112 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
a915da9b
ED
1113 if (!key)
1114 return -ENOENT;
1115 hlist_del_rcu(&key->node);
5f3d9cb2 1116 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
a915da9b 1117 kfree_rcu(key, rcu);
a915da9b 1118 return 0;
cfb6eeb4 1119}
a915da9b 1120EXPORT_SYMBOL(tcp_md5_do_del);
cfb6eeb4 1121
e0683e70 1122static void tcp_clear_md5_list(struct sock *sk)
cfb6eeb4
YH
1123{
1124 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1125 struct tcp_md5sig_key *key;
b67bfe0d 1126 struct hlist_node *n;
a8afca03 1127 struct tcp_md5sig_info *md5sig;
cfb6eeb4 1128
a8afca03
ED
1129 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1130
b67bfe0d 1131 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
a915da9b 1132 hlist_del_rcu(&key->node);
5f3d9cb2 1133 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
a915da9b 1134 kfree_rcu(key, rcu);
cfb6eeb4
YH
1135 }
1136}
1137
8917a777
ID
1138static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1139 char __user *optval, int optlen)
cfb6eeb4
YH
1140{
1141 struct tcp_md5sig cmd;
1142 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
8917a777 1143 u8 prefixlen = 32;
cfb6eeb4
YH
1144
1145 if (optlen < sizeof(cmd))
1146 return -EINVAL;
1147
7174259e 1148 if (copy_from_user(&cmd, optval, sizeof(cmd)))
cfb6eeb4
YH
1149 return -EFAULT;
1150
1151 if (sin->sin_family != AF_INET)
1152 return -EINVAL;
1153
8917a777
ID
1154 if (optname == TCP_MD5SIG_EXT &&
1155 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1156 prefixlen = cmd.tcpm_prefixlen;
1157 if (prefixlen > 32)
1158 return -EINVAL;
1159 }
1160
64a124ed 1161 if (!cmd.tcpm_keylen)
a915da9b 1162 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
8917a777 1163 AF_INET, prefixlen);
cfb6eeb4
YH
1164
1165 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1166 return -EINVAL;
1167
a915da9b 1168 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
8917a777 1169 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
a915da9b 1170 GFP_KERNEL);
cfb6eeb4
YH
1171}
1172
19689e38
ED
1173static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1174 __be32 daddr, __be32 saddr,
1175 const struct tcphdr *th, int nbytes)
cfb6eeb4 1176{
cfb6eeb4 1177 struct tcp4_pseudohdr *bp;
49a72dfb 1178 struct scatterlist sg;
19689e38 1179 struct tcphdr *_th;
cfb6eeb4 1180
19689e38 1181 bp = hp->scratch;
cfb6eeb4
YH
1182 bp->saddr = saddr;
1183 bp->daddr = daddr;
1184 bp->pad = 0;
076fb722 1185 bp->protocol = IPPROTO_TCP;
49a72dfb 1186 bp->len = cpu_to_be16(nbytes);
c7da57a1 1187
19689e38
ED
1188 _th = (struct tcphdr *)(bp + 1);
1189 memcpy(_th, th, sizeof(*th));
1190 _th->check = 0;
1191
1192 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1193 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1194 sizeof(*bp) + sizeof(*th));
cf80e0e4 1195 return crypto_ahash_update(hp->md5_req);
49a72dfb
AL
1196}
1197
a915da9b 1198static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
318cf7aa 1199 __be32 daddr, __be32 saddr, const struct tcphdr *th)
49a72dfb
AL
1200{
1201 struct tcp_md5sig_pool *hp;
cf80e0e4 1202 struct ahash_request *req;
49a72dfb
AL
1203
1204 hp = tcp_get_md5sig_pool();
1205 if (!hp)
1206 goto clear_hash_noput;
cf80e0e4 1207 req = hp->md5_req;
49a72dfb 1208
cf80e0e4 1209 if (crypto_ahash_init(req))
49a72dfb 1210 goto clear_hash;
19689e38 1211 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
49a72dfb
AL
1212 goto clear_hash;
1213 if (tcp_md5_hash_key(hp, key))
1214 goto clear_hash;
cf80e0e4
HX
1215 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1216 if (crypto_ahash_final(req))
cfb6eeb4
YH
1217 goto clear_hash;
1218
cfb6eeb4 1219 tcp_put_md5sig_pool();
cfb6eeb4 1220 return 0;
49a72dfb 1221
cfb6eeb4
YH
1222clear_hash:
1223 tcp_put_md5sig_pool();
1224clear_hash_noput:
1225 memset(md5_hash, 0, 16);
49a72dfb 1226 return 1;
cfb6eeb4
YH
1227}
1228
39f8e58e
ED
1229int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1230 const struct sock *sk,
318cf7aa 1231 const struct sk_buff *skb)
cfb6eeb4 1232{
49a72dfb 1233 struct tcp_md5sig_pool *hp;
cf80e0e4 1234 struct ahash_request *req;
318cf7aa 1235 const struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4
YH
1236 __be32 saddr, daddr;
1237
39f8e58e
ED
1238 if (sk) { /* valid for establish/request sockets */
1239 saddr = sk->sk_rcv_saddr;
1240 daddr = sk->sk_daddr;
cfb6eeb4 1241 } else {
49a72dfb
AL
1242 const struct iphdr *iph = ip_hdr(skb);
1243 saddr = iph->saddr;
1244 daddr = iph->daddr;
cfb6eeb4 1245 }
49a72dfb
AL
1246
1247 hp = tcp_get_md5sig_pool();
1248 if (!hp)
1249 goto clear_hash_noput;
cf80e0e4 1250 req = hp->md5_req;
49a72dfb 1251
cf80e0e4 1252 if (crypto_ahash_init(req))
49a72dfb
AL
1253 goto clear_hash;
1254
19689e38 1255 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
49a72dfb
AL
1256 goto clear_hash;
1257 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1258 goto clear_hash;
1259 if (tcp_md5_hash_key(hp, key))
1260 goto clear_hash;
cf80e0e4
HX
1261 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1262 if (crypto_ahash_final(req))
49a72dfb
AL
1263 goto clear_hash;
1264
1265 tcp_put_md5sig_pool();
1266 return 0;
1267
1268clear_hash:
1269 tcp_put_md5sig_pool();
1270clear_hash_noput:
1271 memset(md5_hash, 0, 16);
1272 return 1;
cfb6eeb4 1273}
49a72dfb 1274EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
cfb6eeb4 1275
ba8e275a
ED
1276#endif
1277
ff74e23f 1278/* Called with rcu_read_lock() */
ba8e275a 1279static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
ff74e23f 1280 const struct sk_buff *skb)
cfb6eeb4 1281{
ba8e275a 1282#ifdef CONFIG_TCP_MD5SIG
cfb6eeb4
YH
1283 /*
1284 * This gets called for each TCP segment that arrives
1285 * so we want to be efficient.
1286 * We have 3 drop cases:
1287 * o No MD5 hash and one expected.
1288 * o MD5 hash and we're not expecting one.
1289 * o MD5 hash and its wrong.
1290 */
cf533ea5 1291 const __u8 *hash_location = NULL;
cfb6eeb4 1292 struct tcp_md5sig_key *hash_expected;
eddc9ec5 1293 const struct iphdr *iph = ip_hdr(skb);
cf533ea5 1294 const struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4 1295 int genhash;
cfb6eeb4
YH
1296 unsigned char newhash[16];
1297
a915da9b
ED
1298 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1299 AF_INET);
7d5d5525 1300 hash_location = tcp_parse_md5sig_option(th);
cfb6eeb4 1301
cfb6eeb4
YH
1302 /* We've parsed the options - do we have a hash? */
1303 if (!hash_expected && !hash_location)
a2a385d6 1304 return false;
cfb6eeb4
YH
1305
1306 if (hash_expected && !hash_location) {
c10d9310 1307 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
a2a385d6 1308 return true;
cfb6eeb4
YH
1309 }
1310
1311 if (!hash_expected && hash_location) {
c10d9310 1312 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
a2a385d6 1313 return true;
cfb6eeb4
YH
1314 }
1315
1316 /* Okay, so this is hash_expected and hash_location -
1317 * so we need to calculate the checksum.
1318 */
49a72dfb
AL
1319 genhash = tcp_v4_md5_hash_skb(newhash,
1320 hash_expected,
39f8e58e 1321 NULL, skb);
cfb6eeb4
YH
1322
1323 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
72145a68 1324 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
e87cc472
JP
1325 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1326 &iph->saddr, ntohs(th->source),
1327 &iph->daddr, ntohs(th->dest),
1328 genhash ? " tcp_v4_calc_md5_hash failed"
1329 : "");
a2a385d6 1330 return true;
cfb6eeb4 1331 }
a2a385d6 1332 return false;
cfb6eeb4 1333#endif
ba8e275a
ED
1334 return false;
1335}
cfb6eeb4 1336
b40cf18e
ED
1337static void tcp_v4_init_req(struct request_sock *req,
1338 const struct sock *sk_listener,
16bea70a
OP
1339 struct sk_buff *skb)
1340{
1341 struct inet_request_sock *ireq = inet_rsk(req);
c92e8c02 1342 struct net *net = sock_net(sk_listener);
16bea70a 1343
08d2cc3b
ED
1344 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1345 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
c92e8c02 1346 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
16bea70a
OP
1347}
1348
f964629e
ED
1349static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1350 struct flowi *fl,
4396e461 1351 const struct request_sock *req)
d94e0417 1352{
4396e461 1353 return inet_csk_route_req(sk, &fl->u.ip4, req);
d94e0417
OP
1354}
1355
72a3effa 1356struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1da177e4 1357 .family = PF_INET,
2e6599cb 1358 .obj_size = sizeof(struct tcp_request_sock),
5db92c99 1359 .rtx_syn_ack = tcp_rtx_synack,
60236fdd
ACM
1360 .send_ack = tcp_v4_reqsk_send_ack,
1361 .destructor = tcp_v4_reqsk_destructor,
1da177e4 1362 .send_reset = tcp_v4_send_reset,
688d1945 1363 .syn_ack_timeout = tcp_syn_ack_timeout,
1da177e4
LT
1364};
1365
b2e4b3de 1366static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
2aec4a29 1367 .mss_clamp = TCP_MSS_DEFAULT,
16bea70a 1368#ifdef CONFIG_TCP_MD5SIG
fd3a154a 1369 .req_md5_lookup = tcp_v4_md5_lookup,
e3afe7b7 1370 .calc_md5_hash = tcp_v4_md5_hash_skb,
b6332e6c 1371#endif
16bea70a 1372 .init_req = tcp_v4_init_req,
fb7b37a7
OP
1373#ifdef CONFIG_SYN_COOKIES
1374 .cookie_init_seq = cookie_v4_init_sequence,
1375#endif
d94e0417 1376 .route_req = tcp_v4_route_req,
84b114b9
ED
1377 .init_seq = tcp_v4_init_seq,
1378 .init_ts_off = tcp_v4_init_ts_off,
d6274bd8 1379 .send_synack = tcp_v4_send_synack,
16bea70a 1380};
cfb6eeb4 1381
1da177e4
LT
1382int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1383{
1da177e4 1384 /* Never answer to SYNs send to broadcast or multicast */
511c3f92 1385 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1da177e4
LT
1386 goto drop;
1387
1fb6f159
OP
1388 return tcp_conn_request(&tcp_request_sock_ops,
1389 &tcp_request_sock_ipv4_ops, sk, skb);
1da177e4 1390
1da177e4 1391drop:
9caad864 1392 tcp_listendrop(sk);
1da177e4
LT
1393 return 0;
1394}
4bc2f18b 1395EXPORT_SYMBOL(tcp_v4_conn_request);
1da177e4
LT
1396
1397
1398/*
1399 * The three way handshake has completed - we got a valid synack -
1400 * now create the new socket.
1401 */
0c27171e 1402struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
60236fdd 1403 struct request_sock *req,
5e0724d0
ED
1404 struct dst_entry *dst,
1405 struct request_sock *req_unhash,
1406 bool *own_req)
1da177e4 1407{
2e6599cb 1408 struct inet_request_sock *ireq;
1da177e4
LT
1409 struct inet_sock *newinet;
1410 struct tcp_sock *newtp;
1411 struct sock *newsk;
cfb6eeb4
YH
1412#ifdef CONFIG_TCP_MD5SIG
1413 struct tcp_md5sig_key *key;
1414#endif
f6d8bd05 1415 struct ip_options_rcu *inet_opt;
1da177e4
LT
1416
1417 if (sk_acceptq_is_full(sk))
1418 goto exit_overflow;
1419
1da177e4
LT
1420 newsk = tcp_create_openreq_child(sk, req, skb);
1421 if (!newsk)
093d2823 1422 goto exit_nonewsk;
1da177e4 1423
bcd76111 1424 newsk->sk_gso_type = SKB_GSO_TCPV4;
fae6ef87 1425 inet_sk_rx_dst_set(newsk, skb);
1da177e4
LT
1426
1427 newtp = tcp_sk(newsk);
1428 newinet = inet_sk(newsk);
2e6599cb 1429 ireq = inet_rsk(req);
d1e559d0
ED
1430 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1431 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
6dd9a14e 1432 newsk->sk_bound_dev_if = ireq->ir_iif;
c92e8c02
ED
1433 newinet->inet_saddr = ireq->ir_loc_addr;
1434 inet_opt = rcu_dereference(ireq->ireq_opt);
1435 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
463c84b9 1436 newinet->mc_index = inet_iif(skb);
eddc9ec5 1437 newinet->mc_ttl = ip_hdr(skb)->ttl;
4c507d28 1438 newinet->rcv_tos = ip_hdr(skb)->tos;
d83d8461 1439 inet_csk(newsk)->icsk_ext_hdr_len = 0;
f6d8bd05
ED
1440 if (inet_opt)
1441 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
c720c7e8 1442 newinet->inet_id = newtp->write_seq ^ jiffies;
1da177e4 1443
dfd25fff
ED
1444 if (!dst) {
1445 dst = inet_csk_route_child_sock(sk, newsk, req);
1446 if (!dst)
1447 goto put_and_exit;
1448 } else {
1449 /* syncookie case : see end of cookie_v4_check() */
1450 }
0e734419
DM
1451 sk_setup_caps(newsk, dst);
1452
81164413
DB
1453 tcp_ca_openreq_child(newsk, dst);
1454
1da177e4 1455 tcp_sync_mss(newsk, dst_mtu(dst));
3541f9e8 1456 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
f5fff5dc 1457
1da177e4
LT
1458 tcp_initialize_rcv_mss(newsk);
1459
cfb6eeb4
YH
1460#ifdef CONFIG_TCP_MD5SIG
1461 /* Copy over the MD5 key from the original socket */
a915da9b
ED
1462 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1463 AF_INET);
00db4124 1464 if (key) {
cfb6eeb4
YH
1465 /*
1466 * We're using one, so create a matching key
1467 * on the newsk structure. If we fail to get
1468 * memory, then we end up not copying the key
1469 * across. Shucks.
1470 */
a915da9b 1471 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
6797318e 1472 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
a465419b 1473 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
cfb6eeb4
YH
1474 }
1475#endif
1476
0e734419
DM
1477 if (__inet_inherit_port(sk, newsk) < 0)
1478 goto put_and_exit;
5e0724d0 1479 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
c92e8c02 1480 if (likely(*own_req)) {
49a496c9 1481 tcp_move_syn(newtp, req);
c92e8c02
ED
1482 ireq->ireq_opt = NULL;
1483 } else {
1484 newinet->inet_opt = NULL;
1485 }
1da177e4
LT
1486 return newsk;
1487
1488exit_overflow:
c10d9310 1489 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
093d2823
BS
1490exit_nonewsk:
1491 dst_release(dst);
1da177e4 1492exit:
9caad864 1493 tcp_listendrop(sk);
1da177e4 1494 return NULL;
0e734419 1495put_and_exit:
c92e8c02 1496 newinet->inet_opt = NULL;
e337e24d
CP
1497 inet_csk_prepare_forced_close(newsk);
1498 tcp_done(newsk);
0e734419 1499 goto exit;
1da177e4 1500}
4bc2f18b 1501EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1da177e4 1502
079096f1 1503static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1da177e4 1504{
079096f1 1505#ifdef CONFIG_SYN_COOKIES
52452c54 1506 const struct tcphdr *th = tcp_hdr(skb);
1da177e4 1507
af9b4738 1508 if (!th->syn)
461b74c3 1509 sk = cookie_v4_check(sk, skb);
1da177e4
LT
1510#endif
1511 return sk;
1512}
1513
1da177e4 1514/* The socket must have it's spinlock held when we get
e994b2f0 1515 * here, unless it is a TCP_LISTEN socket.
1da177e4
LT
1516 *
1517 * We have a potential double-lock case here, so even when
1518 * doing backlog processing we use the BH locking scheme.
1519 * This is because we cannot sleep with the original spinlock
1520 * held.
1521 */
1522int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1523{
cfb6eeb4 1524 struct sock *rsk;
cfb6eeb4 1525
1da177e4 1526 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
404e0a8b
ED
1527 struct dst_entry *dst = sk->sk_rx_dst;
1528
bdeab991 1529 sock_rps_save_rxhash(sk, skb);
3d97379a 1530 sk_mark_napi_id(sk, skb);
404e0a8b 1531 if (dst) {
505fbcf0 1532 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
51456b29 1533 !dst->ops->check(dst, 0)) {
92101b3b
DM
1534 dst_release(dst);
1535 sk->sk_rx_dst = NULL;
1536 }
1537 }
3d97d88e 1538 tcp_rcv_established(sk, skb);
1da177e4
LT
1539 return 0;
1540 }
1541
12e25e10 1542 if (tcp_checksum_complete(skb))
1da177e4
LT
1543 goto csum_err;
1544
1545 if (sk->sk_state == TCP_LISTEN) {
079096f1
ED
1546 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1547
1da177e4
LT
1548 if (!nsk)
1549 goto discard;
1da177e4 1550 if (nsk != sk) {
cfb6eeb4
YH
1551 if (tcp_child_process(sk, nsk, skb)) {
1552 rsk = nsk;
1da177e4 1553 goto reset;
cfb6eeb4 1554 }
1da177e4
LT
1555 return 0;
1556 }
ca55158c 1557 } else
bdeab991 1558 sock_rps_save_rxhash(sk, skb);
ca55158c 1559
72ab4a86 1560 if (tcp_rcv_state_process(sk, skb)) {
cfb6eeb4 1561 rsk = sk;
1da177e4 1562 goto reset;
cfb6eeb4 1563 }
1da177e4
LT
1564 return 0;
1565
1566reset:
cfb6eeb4 1567 tcp_v4_send_reset(rsk, skb);
1da177e4
LT
1568discard:
1569 kfree_skb(skb);
1570 /* Be careful here. If this function gets more complicated and
1571 * gcc suffers from register pressure on the x86, sk (in %ebx)
1572 * might be destroyed here. This current version compiles correctly,
1573 * but you have been warned.
1574 */
1575 return 0;
1576
1577csum_err:
c10d9310
ED
1578 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1579 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1da177e4
LT
1580 goto discard;
1581}
4bc2f18b 1582EXPORT_SYMBOL(tcp_v4_do_rcv);
1da177e4 1583
7487449c 1584int tcp_v4_early_demux(struct sk_buff *skb)
41063e9d 1585{
41063e9d
DM
1586 const struct iphdr *iph;
1587 const struct tcphdr *th;
1588 struct sock *sk;
41063e9d 1589
41063e9d 1590 if (skb->pkt_type != PACKET_HOST)
7487449c 1591 return 0;
41063e9d 1592
45f00f99 1593 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
7487449c 1594 return 0;
41063e9d
DM
1595
1596 iph = ip_hdr(skb);
45f00f99 1597 th = tcp_hdr(skb);
41063e9d
DM
1598
1599 if (th->doff < sizeof(struct tcphdr) / 4)
7487449c 1600 return 0;
41063e9d 1601
45f00f99 1602 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
41063e9d 1603 iph->saddr, th->source,
7011d085 1604 iph->daddr, ntohs(th->dest),
3fa6f616 1605 skb->skb_iif, inet_sdif(skb));
41063e9d
DM
1606 if (sk) {
1607 skb->sk = sk;
1608 skb->destructor = sock_edemux;
f7e4eb03 1609 if (sk_fullsock(sk)) {
d0c294c5 1610 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
505fbcf0 1611
41063e9d
DM
1612 if (dst)
1613 dst = dst_check(dst, 0);
92101b3b 1614 if (dst &&
505fbcf0 1615 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
92101b3b 1616 skb_dst_set_noref(skb, dst);
41063e9d
DM
1617 }
1618 }
7487449c 1619 return 0;
41063e9d
DM
1620}
1621
c9c33212
ED
1622bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1623{
1624 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
4f693b55
ED
1625 struct skb_shared_info *shinfo;
1626 const struct tcphdr *th;
1627 struct tcphdr *thtail;
1628 struct sk_buff *tail;
1629 unsigned int hdrlen;
1630 bool fragstolen;
1631 u32 gso_segs;
1632 int delta;
c9c33212
ED
1633
1634 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1635 * we can fix skb->truesize to its real value to avoid future drops.
1636 * This is valid because skb is not yet charged to the socket.
1637 * It has been noticed pure SACK packets were sometimes dropped
1638 * (if cooked by drivers without copybreak feature).
1639 */
60b1af33 1640 skb_condense(skb);
c9c33212 1641
ade9628e
ED
1642 skb_dst_drop(skb);
1643
4f693b55
ED
1644 if (unlikely(tcp_checksum_complete(skb))) {
1645 bh_unlock_sock(sk);
1646 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1647 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1648 return true;
1649 }
1650
1651 /* Attempt coalescing to last skb in backlog, even if we are
1652 * above the limits.
1653 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1654 */
1655 th = (const struct tcphdr *)skb->data;
1656 hdrlen = th->doff * 4;
1657 shinfo = skb_shinfo(skb);
1658
1659 if (!shinfo->gso_size)
1660 shinfo->gso_size = skb->len - hdrlen;
1661
1662 if (!shinfo->gso_segs)
1663 shinfo->gso_segs = 1;
1664
1665 tail = sk->sk_backlog.tail;
1666 if (!tail)
1667 goto no_coalesce;
1668 thtail = (struct tcphdr *)tail->data;
1669
1670 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1671 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1672 ((TCP_SKB_CB(tail)->tcp_flags |
1673 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
1674 ((TCP_SKB_CB(tail)->tcp_flags ^
1675 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1676#ifdef CONFIG_TLS_DEVICE
1677 tail->decrypted != skb->decrypted ||
1678#endif
1679 thtail->doff != th->doff ||
1680 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1681 goto no_coalesce;
1682
1683 __skb_pull(skb, hdrlen);
1684 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1685 thtail->window = th->window;
1686
1687 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1688
1689 if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1690 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1691
1692 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1693
1694 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1695 TCP_SKB_CB(tail)->has_rxtstamp = true;
1696 tail->tstamp = skb->tstamp;
1697 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1698 }
1699
 1700 /* Not as strict as GRO. We only need to carry the max mss value */
1701 skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1702 skb_shinfo(tail)->gso_size);
1703
1704 gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1705 skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1706
1707 sk->sk_backlog.len += delta;
1708 __NET_INC_STATS(sock_net(sk),
1709 LINUX_MIB_TCPBACKLOGCOALESCE);
1710 kfree_skb_partial(skb, fragstolen);
1711 return false;
1712 }
1713 __skb_push(skb, hdrlen);
1714
1715no_coalesce:
 1716 /* Only the socket owner can try to collapse/prune rx queues
 1717 * to reduce memory overhead, so add a little headroom here.
 1718 * Only a few socket backlogs are likely to be non-empty at the same time.
1719 */
1720 limit += 64*1024;
1721
c9c33212
ED
1722 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1723 bh_unlock_sock(sk);
1724 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1725 return true;
1726 }
1727 return false;
1728}
1729EXPORT_SYMBOL(tcp_add_backlog);
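As a rough standalone sketch of the admission test above: the budget is approximately the receive buffer plus the send buffer plus the 64 KB of headroom added just before queueing. backlog_would_drop is an invented name, and the real sk_add_backlog() accounting also includes memory already charged to the receive queue.

#include <stdbool.h>
#include <stdio.h>

static bool backlog_would_drop(unsigned int rcvbuf, unsigned int sndbuf,
                               unsigned int backlog_len,
                               unsigned int skb_truesize)
{
        unsigned int limit = rcvbuf + sndbuf + 64 * 1024;  /* headroom */

        return backlog_len + skb_truesize > limit;
}

int main(void)
{
        printf("idle socket, small skb dropped?  %d\n",
               backlog_would_drop(87380, 16384, 0, 2048));
        printf("saturated backlog dropped?       %d\n",
               backlog_would_drop(87380, 16384, 200000, 2048));
        return 0;
}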
1730
ac6e7800
ED
1731int tcp_filter(struct sock *sk, struct sk_buff *skb)
1732{
1733 struct tcphdr *th = (struct tcphdr *)skb->data;
1734 unsigned int eaten = skb->len;
1735 int err;
1736
1737 err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1738 if (!err) {
1739 eaten -= skb->len;
1740 TCP_SKB_CB(skb)->end_seq -= eaten;
1741 }
1742 return err;
1743}
1744EXPORT_SYMBOL(tcp_filter);
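The end_seq adjustment matters because a socket filter that trims payload shrinks the sequence space the segment occupies by the same number of bytes. A tiny standalone illustration, with arbitrary numbers and SYN/FIN accounting omitted:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t seq = 1000, len = 1460;        /* original payload */
        uint32_t end_seq = seq + len;           /* 2460 */
        uint32_t trimmed_len = 536;             /* payload left after the filter */
        uint32_t eaten = len - trimmed_len;     /* 924 bytes removed */

        end_seq -= eaten;                       /* mirrors the adjustment above */
        printf("end_seq %u now covers %u payload bytes\n",
               (unsigned)end_seq, (unsigned)(end_seq - seq));
        return 0;
}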
1745
eeea10b8
ED
1746static void tcp_v4_restore_cb(struct sk_buff *skb)
1747{
1748 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1749 sizeof(struct inet_skb_parm));
1750}
1751
1752static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1753 const struct tcphdr *th)
1754{
 1755 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
 1756 * barrier() makes sure the compiler won't play fool^Waliasing games.
1757 */
1758 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1759 sizeof(struct inet_skb_parm));
1760 barrier();
1761
1762 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1763 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1764 skb->len - th->doff * 4);
1765 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1766 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1767 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1768 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1769 TCP_SKB_CB(skb)->sacked = 0;
1770 TCP_SKB_CB(skb)->has_rxtstamp =
1771 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1772}
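The end_seq computed above follows the usual TCP rule that SYN and FIN each consume one sequence number in addition to the payload. A small standalone example; tcp_end_seq is an invented helper name:

#include <stdint.h>
#include <stdio.h>

static uint32_t tcp_end_seq(uint32_t seq, unsigned int syn, unsigned int fin,
                            uint32_t payload_len)
{
        return seq + syn + fin + payload_len;
}

int main(void)
{
        /* A bare SYN still consumes one sequence number. */
        printf("SYN only:   %u\n", (unsigned)tcp_end_seq(100, 1, 0, 0));
        /* 100 bytes of data plus FIN cover 101 sequence numbers. */
        printf("data + FIN: %u\n", (unsigned)tcp_end_seq(100, 0, 1, 100));
        return 0;
}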
1773
1da177e4
LT
1774/*
1775 * From tcp_input.c
1776 */
1777
1778int tcp_v4_rcv(struct sk_buff *skb)
1779{
3b24d854 1780 struct net *net = dev_net(skb->dev);
3fa6f616 1781 int sdif = inet_sdif(skb);
eddc9ec5 1782 const struct iphdr *iph;
cf533ea5 1783 const struct tcphdr *th;
3b24d854 1784 bool refcounted;
1da177e4
LT
1785 struct sock *sk;
1786 int ret;
1787
1788 if (skb->pkt_type != PACKET_HOST)
1789 goto discard_it;
1790
1791 /* Count it even if it's bad */
90bbcc60 1792 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1da177e4
LT
1793
1794 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1795 goto discard_it;
1796
ea1627c2 1797 th = (const struct tcphdr *)skb->data;
1da177e4 1798
ea1627c2 1799 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1da177e4
LT
1800 goto bad_packet;
1801 if (!pskb_may_pull(skb, th->doff * 4))
1802 goto discard_it;
1803
1804 /* An explanation is required here, I think.
1805 * Packet length and doff are validated by header prediction,
caa20d9a 1806 * provided the case of th->doff==0 is eliminated.
1da177e4 1807 * So, we defer the checks. */
ed70fcfc
TH
1808
1809 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1810 goto csum_error;
1da177e4 1811
ea1627c2 1812 th = (const struct tcphdr *)skb->data;
eddc9ec5 1813 iph = ip_hdr(skb);
4bdc3d66 1814lookup:
a583636a 1815 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
3fa6f616 1816 th->dest, sdif, &refcounted);
1da177e4
LT
1817 if (!sk)
1818 goto no_tcp_socket;
1819
bb134d5d
ED
1820process:
1821 if (sk->sk_state == TCP_TIME_WAIT)
1822 goto do_time_wait;
1823
079096f1
ED
1824 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1825 struct request_sock *req = inet_reqsk(sk);
e0f9759f 1826 bool req_stolen = false;
7716682c 1827 struct sock *nsk;
079096f1
ED
1828
1829 sk = req->rsk_listener;
72923555 1830 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
e65c332d 1831 sk_drops_add(sk, skb);
72923555
ED
1832 reqsk_put(req);
1833 goto discard_it;
1834 }
4fd44a98
FL
1835 if (tcp_checksum_complete(skb)) {
1836 reqsk_put(req);
1837 goto csum_error;
1838 }
7716682c 1839 if (unlikely(sk->sk_state != TCP_LISTEN)) {
f03f2e15 1840 inet_csk_reqsk_queue_drop_and_put(sk, req);
4bdc3d66
ED
1841 goto lookup;
1842 }
3b24d854
ED
 1843 /* We own a reference on the listener; increase it again
1844 * as we might lose it too soon.
1845 */
7716682c 1846 sock_hold(sk);
3b24d854 1847 refcounted = true;
1f3b359f 1848 nsk = NULL;
eeea10b8
ED
1849 if (!tcp_filter(sk, skb)) {
1850 th = (const struct tcphdr *)skb->data;
1851 iph = ip_hdr(skb);
1852 tcp_v4_fill_cb(skb, iph, th);
e0f9759f 1853 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
eeea10b8 1854 }
079096f1
ED
1855 if (!nsk) {
1856 reqsk_put(req);
e0f9759f
ED
1857 if (req_stolen) {
 1858 /* Another CPU got exclusive access to req
 1859 * and created a full-blown socket.
1860 * Try to feed this packet to this socket
1861 * instead of discarding it.
1862 */
1863 tcp_v4_restore_cb(skb);
1864 sock_put(sk);
1865 goto lookup;
1866 }
7716682c 1867 goto discard_and_relse;
079096f1
ED
1868 }
1869 if (nsk == sk) {
079096f1 1870 reqsk_put(req);
eeea10b8 1871 tcp_v4_restore_cb(skb);
079096f1
ED
1872 } else if (tcp_child_process(sk, nsk, skb)) {
1873 tcp_v4_send_reset(nsk, skb);
7716682c 1874 goto discard_and_relse;
079096f1 1875 } else {
7716682c 1876 sock_put(sk);
079096f1
ED
1877 return 0;
1878 }
1879 }
6cce09f8 1880 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
02a1d6e7 1881 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1882 goto discard_and_relse;
6cce09f8 1883 }
d218d111 1884
1da177e4
LT
1885 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1886 goto discard_and_relse;
9ea88a15 1887
9ea88a15
DP
1888 if (tcp_v4_inbound_md5_hash(sk, skb))
1889 goto discard_and_relse;
9ea88a15 1890
b59c2701 1891 nf_reset(skb);
1da177e4 1892
ac6e7800 1893 if (tcp_filter(sk, skb))
1da177e4 1894 goto discard_and_relse;
ac6e7800
ED
1895 th = (const struct tcphdr *)skb->data;
1896 iph = ip_hdr(skb);
eeea10b8 1897 tcp_v4_fill_cb(skb, iph, th);
1da177e4
LT
1898
1899 skb->dev = NULL;
1900
e994b2f0
ED
1901 if (sk->sk_state == TCP_LISTEN) {
1902 ret = tcp_v4_do_rcv(sk, skb);
1903 goto put_and_return;
1904 }
1905
1906 sk_incoming_cpu_update(sk);
1907
c6366184 1908 bh_lock_sock_nested(sk);
a44d6eac 1909 tcp_segs_in(tcp_sk(sk), skb);
1da177e4
LT
1910 ret = 0;
1911 if (!sock_owned_by_user(sk)) {
e7942d06 1912 ret = tcp_v4_do_rcv(sk, skb);
c9c33212 1913 } else if (tcp_add_backlog(sk, skb)) {
6b03a53a
ZY
1914 goto discard_and_relse;
1915 }
1da177e4
LT
1916 bh_unlock_sock(sk);
1917
e994b2f0 1918put_and_return:
3b24d854
ED
1919 if (refcounted)
1920 sock_put(sk);
1da177e4
LT
1921
1922 return ret;
1923
1924no_tcp_socket:
1925 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1926 goto discard_it;
1927
eeea10b8
ED
1928 tcp_v4_fill_cb(skb, iph, th);
1929
12e25e10 1930 if (tcp_checksum_complete(skb)) {
6a5dc9e5 1931csum_error:
90bbcc60 1932 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1da177e4 1933bad_packet:
90bbcc60 1934 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1da177e4 1935 } else {
cfb6eeb4 1936 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
1937 }
1938
1939discard_it:
1940 /* Discard frame. */
1941 kfree_skb(skb);
e905a9ed 1942 return 0;
1da177e4
LT
1943
1944discard_and_relse:
532182cd 1945 sk_drops_add(sk, skb);
3b24d854
ED
1946 if (refcounted)
1947 sock_put(sk);
1da177e4
LT
1948 goto discard_it;
1949
1950do_time_wait:
1951 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 1952 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1953 goto discard_it;
1954 }
1955
eeea10b8
ED
1956 tcp_v4_fill_cb(skb, iph, th);
1957
6a5dc9e5
ED
1958 if (tcp_checksum_complete(skb)) {
1959 inet_twsk_put(inet_twsk(sk));
1960 goto csum_error;
1da177e4 1961 }
9469c7b4 1962 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 1963 case TCP_TW_SYN: {
c346dca1 1964 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
a583636a
CG
1965 &tcp_hashinfo, skb,
1966 __tcp_hdrlen(th),
da5e3630 1967 iph->saddr, th->source,
eddc9ec5 1968 iph->daddr, th->dest,
3fa6f616
DA
1969 inet_iif(skb),
1970 sdif);
1da177e4 1971 if (sk2) {
dbe7faa4 1972 inet_twsk_deschedule_put(inet_twsk(sk));
1da177e4 1973 sk = sk2;
eeea10b8 1974 tcp_v4_restore_cb(skb);
3b24d854 1975 refcounted = false;
1da177e4
LT
1976 goto process;
1977 }
1da177e4 1978 }
fcfd6dfa
GS
1979 /* to ACK */
1980 /* fall through */
1da177e4
LT
1981 case TCP_TW_ACK:
1982 tcp_v4_timewait_ack(sk, skb);
1983 break;
1984 case TCP_TW_RST:
271c3b9b
FW
1985 tcp_v4_send_reset(sk, skb);
1986 inet_twsk_deschedule_put(inet_twsk(sk));
1987 goto discard_it;
1da177e4
LT
1988 case TCP_TW_SUCCESS:;
1989 }
1990 goto discard_it;
1991}
1992
ccb7c410
DM
1993static struct timewait_sock_ops tcp_timewait_sock_ops = {
1994 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1995 .twsk_unique = tcp_twsk_unique,
1996 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 1997};
1da177e4 1998
63d02d15 1999void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5d299f3d
ED
2000{
2001 struct dst_entry *dst = skb_dst(skb);
2002
5037e9ef 2003 if (dst && dst_hold_safe(dst)) {
ca777eff
ED
2004 sk->sk_rx_dst = dst;
2005 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2006 }
5d299f3d 2007}
63d02d15 2008EXPORT_SYMBOL(inet_sk_rx_dst_set);
5d299f3d 2009
3b401a81 2010const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
2011 .queue_xmit = ip_queue_xmit,
2012 .send_check = tcp_v4_send_check,
2013 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 2014 .sk_rx_dst_set = inet_sk_rx_dst_set,
543d9cfe
ACM
2015 .conn_request = tcp_v4_conn_request,
2016 .syn_recv_sock = tcp_v4_syn_recv_sock,
543d9cfe
ACM
2017 .net_header_len = sizeof(struct iphdr),
2018 .setsockopt = ip_setsockopt,
2019 .getsockopt = ip_getsockopt,
2020 .addr2sockaddr = inet_csk_addr2sockaddr,
2021 .sockaddr_len = sizeof(struct sockaddr_in),
3fdadf7d 2022#ifdef CONFIG_COMPAT
543d9cfe
ACM
2023 .compat_setsockopt = compat_ip_setsockopt,
2024 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 2025#endif
4fab9071 2026 .mtu_reduced = tcp_v4_mtu_reduced,
1da177e4 2027};
4bc2f18b 2028EXPORT_SYMBOL(ipv4_specific);
1da177e4 2029
cfb6eeb4 2030#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 2031static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 2032 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 2033 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 2034 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 2035};
b6332e6c 2036#endif
cfb6eeb4 2037
1da177e4
LT
 2038/* NOTE: A lot of things are set to zero explicitly by the call to
 2039 * sk_alloc(), so they need not be done here.
2040 */
2041static int tcp_v4_init_sock(struct sock *sk)
2042{
6687e988 2043 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 2044
900f65d3 2045 tcp_init_sock(sk);
1da177e4 2046
8292a17a 2047 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 2048
cfb6eeb4 2049#ifdef CONFIG_TCP_MD5SIG
ac807fa8 2050 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 2051#endif
1da177e4 2052
1da177e4
LT
2053 return 0;
2054}
2055
7d06b2e0 2056void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
2057{
2058 struct tcp_sock *tp = tcp_sk(sk);
2059
e1a4aa50
SL
2060 trace_tcp_destroy_sock(sk);
2061
1da177e4
LT
2062 tcp_clear_xmit_timers(sk);
2063
6687e988 2064 tcp_cleanup_congestion_control(sk);
317a76f9 2065
734942cc
DW
2066 tcp_cleanup_ulp(sk);
2067
1da177e4 2068 /* Clean up the write buffer. */
fe067e8a 2069 tcp_write_queue_purge(sk);
1da177e4 2070
cf1ef3f0
WW
2071 /* Check if we want to disable active TFO */
2072 tcp_fastopen_active_disable_ofo_check(sk);
2073
1da177e4 2074 /* Cleans up our, hopefully empty, out_of_order_queue. */
9f5afeae 2075 skb_rbtree_purge(&tp->out_of_order_queue);
1da177e4 2076
cfb6eeb4
YH
2077#ifdef CONFIG_TCP_MD5SIG
2078 /* Clean up the MD5 key list, if any */
2079 if (tp->md5sig_info) {
a915da9b 2080 tcp_clear_md5_list(sk);
fb7df5e4 2081 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
cfb6eeb4
YH
2082 tp->md5sig_info = NULL;
2083 }
2084#endif
1a2449a8 2085
1da177e4 2086 /* Clean up a referenced TCP bind bucket. */
463c84b9 2087 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 2088 inet_put_port(sk);
1da177e4 2089
00db4124 2090 BUG_ON(tp->fastopen_rsk);
435cf559 2091
cf60af03
YC
2092 /* If socket is aborted during connect operation */
2093 tcp_free_fastopen_req(tp);
1fba70e5 2094 tcp_fastopen_destroy_cipher(sk);
cd8ae852 2095 tcp_saved_syn_free(tp);
cf60af03 2096
180d8cd9 2097 sk_sockets_allocated_dec(sk);
1da177e4 2098}
1da177e4
LT
2099EXPORT_SYMBOL(tcp_v4_destroy_sock);
2100
2101#ifdef CONFIG_PROC_FS
2102/* Proc filesystem TCP sock list dumping. */
2103
a8b690f9
TH
2104/*
 2105 * Get the next listener socket after cur. If cur is NULL, get the first socket
 2106 * starting from the bucket given in st->bucket; when st->bucket is zero, the
2107 * very first socket in the hash table is returned.
2108 */
1da177e4
LT
2109static void *listening_get_next(struct seq_file *seq, void *cur)
2110{
37d849bb 2111 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
5799de0b 2112 struct tcp_iter_state *st = seq->private;
a4146b1b 2113 struct net *net = seq_file_net(seq);
3b24d854 2114 struct inet_listen_hashbucket *ilb;
3b24d854 2115 struct sock *sk = cur;
1da177e4
LT
2116
2117 if (!sk) {
3b24d854 2118get_head:
a8b690f9 2119 ilb = &tcp_hashinfo.listening_hash[st->bucket];
9652dc2e 2120 spin_lock(&ilb->lock);
3b24d854 2121 sk = sk_head(&ilb->head);
a8b690f9 2122 st->offset = 0;
1da177e4
LT
2123 goto get_sk;
2124 }
5caea4ea 2125 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1da177e4 2126 ++st->num;
a8b690f9 2127 ++st->offset;
1da177e4 2128
3b24d854 2129 sk = sk_next(sk);
1da177e4 2130get_sk:
3b24d854 2131 sk_for_each_from(sk) {
8475ef9f
PE
2132 if (!net_eq(sock_net(sk), net))
2133 continue;
37d849bb 2134 if (sk->sk_family == afinfo->family)
3b24d854 2135 return sk;
1da177e4 2136 }
9652dc2e 2137 spin_unlock(&ilb->lock);
a8b690f9 2138 st->offset = 0;
3b24d854
ED
2139 if (++st->bucket < INET_LHTABLE_SIZE)
2140 goto get_head;
2141 return NULL;
1da177e4
LT
2142}
2143
2144static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2145{
a8b690f9
TH
2146 struct tcp_iter_state *st = seq->private;
2147 void *rc;
2148
2149 st->bucket = 0;
2150 st->offset = 0;
2151 rc = listening_get_next(seq, NULL);
1da177e4
LT
2152
2153 while (rc && *pos) {
2154 rc = listening_get_next(seq, rc);
2155 --*pos;
2156 }
2157 return rc;
2158}
2159
05dbc7b5 2160static inline bool empty_bucket(const struct tcp_iter_state *st)
6eac5604 2161{
05dbc7b5 2162 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
6eac5604
AK
2163}
2164
a8b690f9
TH
2165/*
 2166 * Get the first established socket, starting from the bucket given in st->bucket.
2167 * If st->bucket is zero, the very first socket in the hash is returned.
2168 */
1da177e4
LT
2169static void *established_get_first(struct seq_file *seq)
2170{
37d849bb 2171 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
5799de0b 2172 struct tcp_iter_state *st = seq->private;
a4146b1b 2173 struct net *net = seq_file_net(seq);
1da177e4
LT
2174 void *rc = NULL;
2175
a8b690f9
TH
2176 st->offset = 0;
2177 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 2178 struct sock *sk;
3ab5aee7 2179 struct hlist_nulls_node *node;
9db66bdc 2180 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 2181
6eac5604
AK
2182 /* Lockless fast path for the common case of empty buckets */
2183 if (empty_bucket(st))
2184 continue;
2185
9db66bdc 2186 spin_lock_bh(lock);
3ab5aee7 2187 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
37d849bb 2188 if (sk->sk_family != afinfo->family ||
878628fb 2189 !net_eq(sock_net(sk), net)) {
1da177e4
LT
2190 continue;
2191 }
2192 rc = sk;
2193 goto out;
2194 }
9db66bdc 2195 spin_unlock_bh(lock);
1da177e4
LT
2196 }
2197out:
2198 return rc;
2199}
2200
2201static void *established_get_next(struct seq_file *seq, void *cur)
2202{
37d849bb 2203 struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
1da177e4 2204 struct sock *sk = cur;
3ab5aee7 2205 struct hlist_nulls_node *node;
5799de0b 2206 struct tcp_iter_state *st = seq->private;
a4146b1b 2207 struct net *net = seq_file_net(seq);
1da177e4
LT
2208
2209 ++st->num;
a8b690f9 2210 ++st->offset;
1da177e4 2211
05dbc7b5 2212 sk = sk_nulls_next(sk);
1da177e4 2213
3ab5aee7 2214 sk_nulls_for_each_from(sk, node) {
37d849bb
CH
2215 if (sk->sk_family == afinfo->family &&
2216 net_eq(sock_net(sk), net))
05dbc7b5 2217 return sk;
1da177e4
LT
2218 }
2219
05dbc7b5
ED
2220 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2221 ++st->bucket;
2222 return established_get_first(seq);
1da177e4
LT
2223}
2224
2225static void *established_get_idx(struct seq_file *seq, loff_t pos)
2226{
a8b690f9
TH
2227 struct tcp_iter_state *st = seq->private;
2228 void *rc;
2229
2230 st->bucket = 0;
2231 rc = established_get_first(seq);
1da177e4
LT
2232
2233 while (rc && pos) {
2234 rc = established_get_next(seq, rc);
2235 --pos;
7174259e 2236 }
1da177e4
LT
2237 return rc;
2238}
2239
2240static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2241{
2242 void *rc;
5799de0b 2243 struct tcp_iter_state *st = seq->private;
1da177e4 2244
1da177e4
LT
2245 st->state = TCP_SEQ_STATE_LISTENING;
2246 rc = listening_get_idx(seq, &pos);
2247
2248 if (!rc) {
1da177e4
LT
2249 st->state = TCP_SEQ_STATE_ESTABLISHED;
2250 rc = established_get_idx(seq, pos);
2251 }
2252
2253 return rc;
2254}
2255
a8b690f9
TH
2256static void *tcp_seek_last_pos(struct seq_file *seq)
2257{
2258 struct tcp_iter_state *st = seq->private;
2259 int offset = st->offset;
2260 int orig_num = st->num;
2261 void *rc = NULL;
2262
2263 switch (st->state) {
a8b690f9
TH
2264 case TCP_SEQ_STATE_LISTENING:
2265 if (st->bucket >= INET_LHTABLE_SIZE)
2266 break;
2267 st->state = TCP_SEQ_STATE_LISTENING;
2268 rc = listening_get_next(seq, NULL);
2269 while (offset-- && rc)
2270 rc = listening_get_next(seq, rc);
2271 if (rc)
2272 break;
2273 st->bucket = 0;
05dbc7b5 2274 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2275 /* Fallthrough */
2276 case TCP_SEQ_STATE_ESTABLISHED:
a8b690f9
TH
2277 if (st->bucket > tcp_hashinfo.ehash_mask)
2278 break;
2279 rc = established_get_first(seq);
2280 while (offset-- && rc)
2281 rc = established_get_next(seq, rc);
2282 }
2283
2284 st->num = orig_num;
2285
2286 return rc;
2287}
2288
37d849bb 2289void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1da177e4 2290{
5799de0b 2291 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2292 void *rc;
2293
2294 if (*pos && *pos == st->last_pos) {
2295 rc = tcp_seek_last_pos(seq);
2296 if (rc)
2297 goto out;
2298 }
2299
1da177e4
LT
2300 st->state = TCP_SEQ_STATE_LISTENING;
2301 st->num = 0;
a8b690f9
TH
2302 st->bucket = 0;
2303 st->offset = 0;
2304 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2305
2306out:
2307 st->last_pos = *pos;
2308 return rc;
1da177e4 2309}
37d849bb 2310EXPORT_SYMBOL(tcp_seq_start);
1da177e4 2311
37d849bb 2312void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1da177e4 2313{
a8b690f9 2314 struct tcp_iter_state *st = seq->private;
1da177e4 2315 void *rc = NULL;
1da177e4
LT
2316
2317 if (v == SEQ_START_TOKEN) {
2318 rc = tcp_get_idx(seq, 0);
2319 goto out;
2320 }
1da177e4
LT
2321
2322 switch (st->state) {
1da177e4
LT
2323 case TCP_SEQ_STATE_LISTENING:
2324 rc = listening_get_next(seq, v);
2325 if (!rc) {
1da177e4 2326 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2327 st->bucket = 0;
2328 st->offset = 0;
1da177e4
LT
2329 rc = established_get_first(seq);
2330 }
2331 break;
2332 case TCP_SEQ_STATE_ESTABLISHED:
1da177e4
LT
2333 rc = established_get_next(seq, v);
2334 break;
2335 }
2336out:
2337 ++*pos;
a8b690f9 2338 st->last_pos = *pos;
1da177e4
LT
2339 return rc;
2340}
37d849bb 2341EXPORT_SYMBOL(tcp_seq_next);
1da177e4 2342
37d849bb 2343void tcp_seq_stop(struct seq_file *seq, void *v)
1da177e4 2344{
5799de0b 2345 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2346
2347 switch (st->state) {
1da177e4
LT
2348 case TCP_SEQ_STATE_LISTENING:
2349 if (v != SEQ_START_TOKEN)
9652dc2e 2350 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4 2351 break;
1da177e4
LT
2352 case TCP_SEQ_STATE_ESTABLISHED:
2353 if (v)
9db66bdc 2354 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2355 break;
2356 }
2357}
37d849bb 2358EXPORT_SYMBOL(tcp_seq_stop);
1da177e4 2359
d4f06873 2360static void get_openreq4(const struct request_sock *req,
aa3a0c8c 2361 struct seq_file *f, int i)
1da177e4 2362{
2e6599cb 2363 const struct inet_request_sock *ireq = inet_rsk(req);
fa76ce73 2364 long delta = req->rsk_timer.expires - jiffies;
1da177e4 2365
5e659e4c 2366 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2367 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
1da177e4 2368 i,
634fb979 2369 ireq->ir_loc_addr,
d4f06873 2370 ireq->ir_num,
634fb979
ED
2371 ireq->ir_rmt_addr,
2372 ntohs(ireq->ir_rmt_port),
1da177e4
LT
2373 TCP_SYN_RECV,
2374 0, 0, /* could print option size, but that is af dependent. */
2375 1, /* timers active (only the expire timer) */
a399a805 2376 jiffies_delta_to_clock_t(delta),
e6c022a4 2377 req->num_timeout,
aa3a0c8c
ED
2378 from_kuid_munged(seq_user_ns(f),
2379 sock_i_uid(req->rsk_listener)),
1da177e4
LT
2380 0, /* non standard timer */
2381 0, /* open_requests have no inode */
d4f06873 2382 0,
652586df 2383 req);
1da177e4
LT
2384}
2385
652586df 2386static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
1da177e4
LT
2387{
2388 int timer_active;
2389 unsigned long timer_expires;
cf533ea5 2390 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2391 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2392 const struct inet_sock *inet = inet_sk(sk);
0536fcc0 2393 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
c720c7e8
ED
2394 __be32 dest = inet->inet_daddr;
2395 __be32 src = inet->inet_rcv_saddr;
2396 __u16 destp = ntohs(inet->inet_dport);
2397 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2398 int rx_queue;
00fd38d9 2399 int state;
1da177e4 2400
6ba8a3b1 2401 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
57dde7f7 2402 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
6ba8a3b1 2403 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2404 timer_active = 1;
463c84b9
ACM
2405 timer_expires = icsk->icsk_timeout;
2406 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2407 timer_active = 4;
463c84b9 2408 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2409 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2410 timer_active = 2;
cf4c6bf8 2411 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2412 } else {
2413 timer_active = 0;
2414 timer_expires = jiffies;
2415 }
2416
986ffdfd 2417 state = inet_sk_state_load(sk);
00fd38d9 2418 if (state == TCP_LISTEN)
49d09007
ED
2419 rx_queue = sk->sk_ack_backlog;
2420 else
00fd38d9
ED
2421 /* Because we don't lock the socket,
2422 * we might find a transient negative value.
49d09007
ED
2423 */
2424 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2425
5e659e4c 2426 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
652586df 2427 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
00fd38d9 2428 i, src, srcp, dest, destp, state,
47da8ee6 2429 tp->write_seq - tp->snd_una,
49d09007 2430 rx_queue,
1da177e4 2431 timer_active,
a399a805 2432 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2433 icsk->icsk_retransmits,
a7cb5a49 2434 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2435 icsk->icsk_probes_out,
cf4c6bf8 2436 sock_i_ino(sk),
41c6d650 2437 refcount_read(&sk->sk_refcnt), sk,
7be87351
SH
2438 jiffies_to_clock_t(icsk->icsk_rto),
2439 jiffies_to_clock_t(icsk->icsk_ack.ato),
463c84b9 2440 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1da177e4 2441 tp->snd_cwnd,
00fd38d9
ED
2442 state == TCP_LISTEN ?
2443 fastopenq->max_qlen :
652586df 2444 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
1da177e4
LT
2445}
2446
cf533ea5 2447static void get_timewait4_sock(const struct inet_timewait_sock *tw,
652586df 2448 struct seq_file *f, int i)
1da177e4 2449{
789f558c 2450 long delta = tw->tw_timer.expires - jiffies;
23f33c2d 2451 __be32 dest, src;
1da177e4 2452 __u16 destp, srcp;
1da177e4
LT
2453
2454 dest = tw->tw_daddr;
2455 src = tw->tw_rcv_saddr;
2456 destp = ntohs(tw->tw_dport);
2457 srcp = ntohs(tw->tw_sport);
2458
5e659e4c 2459 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2460 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
1da177e4 2461 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2462 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
41c6d650 2463 refcount_read(&tw->tw_refcnt), tw);
1da177e4
LT
2464}
2465
2466#define TMPSZ 150
2467
2468static int tcp4_seq_show(struct seq_file *seq, void *v)
2469{
5799de0b 2470 struct tcp_iter_state *st;
05dbc7b5 2471 struct sock *sk = v;
1da177e4 2472
652586df 2473 seq_setwidth(seq, TMPSZ - 1);
1da177e4 2474 if (v == SEQ_START_TOKEN) {
652586df 2475 seq_puts(seq, " sl local_address rem_address st tx_queue "
1da177e4
LT
2476 "rx_queue tr tm->when retrnsmt uid timeout "
2477 "inode");
2478 goto out;
2479 }
2480 st = seq->private;
2481
079096f1
ED
2482 if (sk->sk_state == TCP_TIME_WAIT)
2483 get_timewait4_sock(v, seq, st->num);
2484 else if (sk->sk_state == TCP_NEW_SYN_RECV)
aa3a0c8c 2485 get_openreq4(v, seq, st->num);
079096f1
ED
2486 else
2487 get_tcp4_sock(v, seq, st->num);
1da177e4 2488out:
652586df 2489 seq_pad(seq, '\n');
1da177e4
LT
2490 return 0;
2491}
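The seq_printf() calls above produce the /proc/net/tcp lines. A userspace sketch of reading one such line back, assuming the field layout shown here: addresses are the raw __be32 printed with %08X, while ports have already been converted with ntohs().

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
        /* Example line in the format printed above (TCP_LISTEN on port 22). */
        const char *line = "   0: 0100007F:0016 00000000:0000 0A";
        unsigned int laddr, lport, raddr, rport, state;
        struct in_addr a;
        char buf[INET_ADDRSTRLEN];

        if (sscanf(line, " %*d: %X:%X %X:%X %X",
                   &laddr, &lport, &raddr, &rport, &state) == 5) {
                a.s_addr = laddr;   /* %X round-trips the in-memory byte order */
                inet_ntop(AF_INET, &a, buf, sizeof(buf));
                printf("local %s:%u  state 0x%02X\n", buf, lport, state);
        }
        return 0;
}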
2492
37d849bb
CH
2493static const struct seq_operations tcp4_seq_ops = {
2494 .show = tcp4_seq_show,
2495 .start = tcp_seq_start,
2496 .next = tcp_seq_next,
2497 .stop = tcp_seq_stop,
2498};
2499
1da177e4 2500static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4 2501 .family = AF_INET,
1da177e4
LT
2502};
2503
2c8c1e72 2504static int __net_init tcp4_proc_init_net(struct net *net)
757764f6 2505{
c3506372
CH
2506 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2507 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
37d849bb
CH
2508 return -ENOMEM;
2509 return 0;
757764f6
PE
2510}
2511
2c8c1e72 2512static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6 2513{
37d849bb 2514 remove_proc_entry("tcp", net->proc_net);
757764f6
PE
2515}
2516
2517static struct pernet_operations tcp4_net_ops = {
2518 .init = tcp4_proc_init_net,
2519 .exit = tcp4_proc_exit_net,
2520};
2521
1da177e4
LT
2522int __init tcp4_proc_init(void)
2523{
757764f6 2524 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2525}
2526
2527void tcp4_proc_exit(void)
2528{
757764f6 2529 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2530}
2531#endif /* CONFIG_PROC_FS */
2532
2533struct proto tcp_prot = {
2534 .name = "TCP",
2535 .owner = THIS_MODULE,
2536 .close = tcp_close,
d74bad4e 2537 .pre_connect = tcp_v4_pre_connect,
1da177e4
LT
2538 .connect = tcp_v4_connect,
2539 .disconnect = tcp_disconnect,
463c84b9 2540 .accept = inet_csk_accept,
1da177e4
LT
2541 .ioctl = tcp_ioctl,
2542 .init = tcp_v4_init_sock,
2543 .destroy = tcp_v4_destroy_sock,
2544 .shutdown = tcp_shutdown,
2545 .setsockopt = tcp_setsockopt,
2546 .getsockopt = tcp_getsockopt,
4b9d07a4 2547 .keepalive = tcp_set_keepalive,
1da177e4 2548 .recvmsg = tcp_recvmsg,
7ba42910
CG
2549 .sendmsg = tcp_sendmsg,
2550 .sendpage = tcp_sendpage,
1da177e4 2551 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 2552 .release_cb = tcp_release_cb,
ab1e0a13
ACM
2553 .hash = inet_hash,
2554 .unhash = inet_unhash,
2555 .get_port = inet_csk_get_port,
1da177e4 2556 .enter_memory_pressure = tcp_enter_memory_pressure,
06044751 2557 .leave_memory_pressure = tcp_leave_memory_pressure,
c9bee3b7 2558 .stream_memory_free = tcp_stream_memory_free,
1da177e4 2559 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 2560 .orphan_count = &tcp_orphan_count,
1da177e4
LT
2561 .memory_allocated = &tcp_memory_allocated,
2562 .memory_pressure = &tcp_memory_pressure,
a4fe34bf 2563 .sysctl_mem = sysctl_tcp_mem,
356d1833
ED
2564 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2565 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
1da177e4
LT
2566 .max_header = MAX_TCP_HEADER,
2567 .obj_size = sizeof(struct tcp_sock),
5f0d5a3a 2568 .slab_flags = SLAB_TYPESAFE_BY_RCU,
6d6ee43e 2569 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 2570 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 2571 .h.hashinfo = &tcp_hashinfo,
7ba42910 2572 .no_autobind = true,
543d9cfe
ACM
2573#ifdef CONFIG_COMPAT
2574 .compat_setsockopt = compat_tcp_setsockopt,
2575 .compat_getsockopt = compat_tcp_getsockopt,
d1a4c0b3 2576#endif
c1e64e29 2577 .diag_destroy = tcp_abort,
1da177e4 2578};
4bc2f18b 2579EXPORT_SYMBOL(tcp_prot);
1da177e4 2580
bdbbb852
ED
2581static void __net_exit tcp_sk_exit(struct net *net)
2582{
2583 int cpu;
2584
6670e152
SH
2585 module_put(net->ipv4.tcp_congestion_control->owner);
2586
bdbbb852
ED
2587 for_each_possible_cpu(cpu)
2588 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2589 free_percpu(net->ipv4.tcp_sk);
2590}
2591
046ee902
DL
2592static int __net_init tcp_sk_init(struct net *net)
2593{
fee83d09 2594 int res, cpu, cnt;
bdbbb852
ED
2595
2596 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2597 if (!net->ipv4.tcp_sk)
2598 return -ENOMEM;
2599
2600 for_each_possible_cpu(cpu) {
2601 struct sock *sk;
2602
2603 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2604 IPPROTO_TCP, net);
2605 if (res)
2606 goto fail;
a9d6532b 2607 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
431280ee
ED
2608
2609 /* Please enforce IP_DF and IPID==0 for RST and
2610 * ACK sent in SYN-RECV and TIME-WAIT state.
2611 */
2612 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2613
bdbbb852
ED
2614 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2615 }
49213555 2616
5d134f1c 2617 net->ipv4.sysctl_tcp_ecn = 2;
49213555
DB
2618 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2619
b0f9ca53 2620 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
6b58e0a5 2621 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
05cbc0db 2622 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
046ee902 2623
13b287e8 2624 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
9bd6861b 2625 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
b840d15d 2626 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
13b287e8 2627
6fa25166 2628 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
7c083ecb 2629 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
0aca737d 2630 net->ipv4.sysctl_tcp_syncookies = 1;
1043e25f 2631 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
ae5c3f40 2632 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
c6214a97 2633 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
c402d9be 2634 net->ipv4.sysctl_tcp_orphan_retries = 0;
1e579caa 2635 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
4979f2d9 2636 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
79e9fed4 2637 net->ipv4.sysctl_tcp_tw_reuse = 2;
12ed8244 2638
fee83d09 2639 cnt = tcp_hashinfo.ehash_mask + 1;
743e4815 2640 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
1946e672
HY
2641 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2642
fee83d09 2643 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
f9301034 2644 net->ipv4.sysctl_tcp_sack = 1;
9bb37ef0 2645 net->ipv4.sysctl_tcp_window_scaling = 1;
5d2ed052 2646 net->ipv4.sysctl_tcp_timestamps = 1;
2ae21cf5 2647 net->ipv4.sysctl_tcp_early_retrans = 3;
e20223f1 2648 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
b510f0d2 2649 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
e0a1e5b5 2650 net->ipv4.sysctl_tcp_retrans_collapse = 1;
c6e21803 2651 net->ipv4.sysctl_tcp_max_reordering = 300;
6496f6bd 2652 net->ipv4.sysctl_tcp_dsack = 1;
0c12654a 2653 net->ipv4.sysctl_tcp_app_win = 31;
94f0893e 2654 net->ipv4.sysctl_tcp_adv_win_scale = 1;
af9b69a7 2655 net->ipv4.sysctl_tcp_frto = 2;
4540c0cf 2656 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
d06a9904
ED
2657 /* This limits the percentage of the congestion window which we
2658 * will allow a single TSO frame to consume. Building TSO frames
2659 * which are too large can cause TCP streams to be bursty.
2660 */
2661 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
c73e5807
ED
2662 /* Default TSQ limit of 16 TSO segments */
2663 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
b530b681
ED
2664 /* rfc5961 challenge ack rate limiting */
2665 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
26e9596e 2666 net->ipv4.sysctl_tcp_min_tso_segs = 2;
bd239704 2667 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
790f00e1 2668 net->ipv4.sysctl_tcp_autocorking = 1;
4170ba6b 2669 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
23a7102a 2670 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
c26e91f8 2671 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
356d1833
ED
2672 if (net != &init_net) {
2673 memcpy(net->ipv4.sysctl_tcp_rmem,
2674 init_net.ipv4.sysctl_tcp_rmem,
2675 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2676 memcpy(net->ipv4.sysctl_tcp_wmem,
2677 init_net.ipv4.sysctl_tcp_wmem,
2678 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2679 }
6d82aa24 2680 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
9c21d2fc 2681 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
e1cfcbe8 2682 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
43713848 2683 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
3733be14
HY
2684 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2685 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
e1cfcbe8 2686
6670e152
SH
2687 /* Reno is always built in */
2688 if (!net_eq(net, &init_net) &&
2689 try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2690 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2691 else
2692 net->ipv4.tcp_congestion_control = &tcp_reno;
2693
49213555 2694 return 0;
bdbbb852
ED
2695fail:
2696 tcp_sk_exit(net);
2697
2698 return res;
b099ce26
EB
2699}
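Most of the values initialized above are the per-namespace defaults exposed under /proc/sys/net/ipv4/. A small sanity-check sketch for one of them, tcp_syncookies, which defaults to 1 here; the exact value depends on the running kernel and namespace configuration:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "r");
        int val;

        if (f && fscanf(f, "%d", &val) == 1)
                printf("tcp_syncookies = %d\n", val);
        if (f)
                fclose(f);
        return 0;
}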
2700
2701static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2702{
43713848
HY
2703 struct net *net;
2704
1946e672 2705 inet_twsk_purge(&tcp_hashinfo, AF_INET);
43713848
HY
2706
2707 list_for_each_entry(net, net_exit_list, exit_list)
2708 tcp_fastopen_ctx_destroy(net);
046ee902
DL
2709}
2710
2711static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
2712 .init = tcp_sk_init,
2713 .exit = tcp_sk_exit,
2714 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
2715};
2716
9b0f976f 2717void __init tcp_v4_init(void)
1da177e4 2718{
6a1b3054 2719 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 2720 panic("Failed to create the TCP control socket.\n");
1da177e4 2721}