net/ipv4/tcp_ipv4.c (mirror_ubuntu-zesty-kernel.git, Ubuntu-4.10.0-37.41)
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

int sysctl_tcp_low_latency __read_mostly;

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source, tsoff);
}

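/* Background note (a sketch, not from this file): secure_tcp_sequence_number()
 * is expected to derive the ISN from a keyed hash of the connection 4-tuple
 * plus a clock component, in the spirit of RFC 6528; the actual construction
 * lives in net/core/secure_seq.c.
 */
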
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
		      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

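/* Behavioral sketch (illustrative, summarizing the check above): with
 * net.ipv4.tcp_tw_reuse enabled, an outgoing connect() that collides with a
 * local TIME-WAIT socket may reuse that port pair once the last recorded
 * timestamp is more than one second old; write_seq is then bumped past
 * tw_snd_nxt so the new connection's sequence space cannot be confused with
 * the old one.
 */
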
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port,
							   &tp->tsoffset);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
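
/* Usage sketch (hypothetical userspace program, not part of this file):
 * tcp_v4_connect() runs under lock_sock() when an application connects an
 * AF_INET stream socket; the address and port below are illustrative only.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in sin = { .sin_family = AF_INET,
 *				   .sin_port   = htons(80) };
 *	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * connect(2) reaches this function via inet_stream_connect() and the
 * socket's sk_prot->connect hook.
 */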

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

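/* Worked example (illustrative numbers): if the path MTU drops from 1500 to
 * 1400, the ICMP_FRAG_NEEDED handler stores 1400 in tp->mtu_info and this
 * function calls tcp_sync_mss(sk, 1400), shrinking the MSS (roughly 1400
 * minus IP and TCP header overhead, so about 1360 with no options) before
 * the immediate retransmit.
 */
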
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
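
/* Note (summarizing the caller below): tcp_v4_err() asks for an abort here
 * only for ICMP_PARAMETERPROB, ICMP_TIME_EXCEEDED, and the net/host
 * unreachable codes of ICMP_DEST_UNREACH; other ICMP errors leave the
 * request socket in place.
 */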

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb));
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
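
/* Background sketch (assumed layout, not defined in this file):
 * tcp_v4_check() folds the standard IPv4 pseudo-header into the sum,
 * i.e. roughly
 *
 *	struct tcp4_pseudohdr {
 *		__be32	saddr;
 *		__be32	daddr;
 *		__u8	pad;		// always 0
 *		__u8	protocol;	// IPPROTO_TCP
 *		__be16	len;		// TCP header + payload length
 *	};
 *
 * the same layout this file builds explicitly in tcp_v4_md5_hash_headers()
 * further below. In the CHECKSUM_PARTIAL branch the NIC finishes the sum
 * from csum_start/csum_offset.
 */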

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not losing security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
						 tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
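
/* Usage sketch (hypothetical userspace program, not part of this file):
 * this setsockopt() path is how an application installs an RFC 2385 key,
 * e.g. for a BGP session; the peer address and key below are illustrative.
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.2", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 deletes the key for that peer, matching the
 * tcp_md5_do_del() branch above.
 */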

static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
#endif
	return false;
}

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict) {
		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
			*strict = true;
		else
			*strict = false;
	}

	return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid ACK -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (*own_req)
		tcp_move_syn(newtp, req);

	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

1381
1da177e4 1382/* The socket must have it's spinlock held when we get
e994b2f0 1383 * here, unless it is a TCP_LISTEN socket.
1da177e4
LT
1384 *
1385 * We have a potential double-lock case here, so even when
1386 * doing backlog processing we use the BH locking scheme.
1387 * This is because we cannot sleep with the original spinlock
1388 * held.
1389 */
1390int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1391{
cfb6eeb4 1392 struct sock *rsk;
cfb6eeb4 1393
1da177e4 1394 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
404e0a8b
ED
1395 struct dst_entry *dst = sk->sk_rx_dst;
1396
bdeab991 1397 sock_rps_save_rxhash(sk, skb);
3d97379a 1398 sk_mark_napi_id(sk, skb);
404e0a8b 1399 if (dst) {
505fbcf0 1400 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
51456b29 1401 !dst->ops->check(dst, 0)) {
92101b3b
DM
1402 dst_release(dst);
1403 sk->sk_rx_dst = NULL;
1404 }
1405 }
c995ae22 1406 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1da177e4
LT
1407 return 0;
1408 }
1409
12e25e10 1410 if (tcp_checksum_complete(skb))
1da177e4
LT
1411 goto csum_err;
1412
1413 if (sk->sk_state == TCP_LISTEN) {
079096f1
ED
1414 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1415
1da177e4
LT
1416 if (!nsk)
1417 goto discard;
1da177e4 1418 if (nsk != sk) {
bdeab991 1419 sock_rps_save_rxhash(nsk, skb);
38cb5245 1420 sk_mark_napi_id(nsk, skb);
cfb6eeb4
YH
1421 if (tcp_child_process(sk, nsk, skb)) {
1422 rsk = nsk;
1da177e4 1423 goto reset;
cfb6eeb4 1424 }
1da177e4
LT
1425 return 0;
1426 }
ca55158c 1427 } else
bdeab991 1428 sock_rps_save_rxhash(sk, skb);
ca55158c 1429
72ab4a86 1430 if (tcp_rcv_state_process(sk, skb)) {
cfb6eeb4 1431 rsk = sk;
1da177e4 1432 goto reset;
cfb6eeb4 1433 }
1da177e4
LT
1434 return 0;
1435
1436reset:
cfb6eeb4 1437 tcp_v4_send_reset(rsk, skb);
1da177e4
LT
1438discard:
1439 kfree_skb(skb);
1440 /* Be careful here. If this function gets more complicated and
1441 * gcc suffers from register pressure on the x86, sk (in %ebx)
1442 * might be destroyed here. This current version compiles correctly,
1443 * but you have been warned.
1444 */
1445 return 0;
1446
1447csum_err:
c10d9310
ED
1448 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1449 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1da177e4
LT
1450 goto discard;
1451}
4bc2f18b 1452EXPORT_SYMBOL(tcp_v4_do_rcv);

void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}

/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 *
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping RCU protected region, we need to take care of skb
	 * dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
	 * Instead of doing full sk_rx_dst validity here, let's perform
	 * an optimistic check.
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force_safe(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));
		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
				skb_queue_len(&tp->ucopy.prequeue));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk_backlog_rcv(sk, skb1);

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);
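
/* Behavioral note (summarizing the checks above): prequeueing is skipped
 * entirely when the sysctl_tcp_low_latency knob declared near the top of
 * this file (net.ipv4.tcp_low_latency) is set, or when no reader is blocked
 * in recvmsg(); segments then take the regular tcp_v4_do_rcv() path instead.
 */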

bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64*1024;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	if (!skb->data_len)
		skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
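
/* Worked example (illustrative numbers only): with sk_rcvbuf and sk_sndbuf
 * at, say, 87380 and 16384 bytes, the backlog limit above comes to
 * 87380 + 16384 + 65536 = 169300 bytes of skb truesize; anything beyond
 * that while the socket is owned by the user is counted as TCPBacklogDrop.
 */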

int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;
	unsigned int eaten = skb->len;
	int err;

	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
	if (!err) {
		eaten -= skb->len;
		TCP_SKB_CB(skb)->end_seq -= eaten;
	}
	return err;
}
EXPORT_SYMBOL(tcp_filter);
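
/* Worked example (illustrative): a socket filter returning a verdict of 100
 * bytes on a segment with a 20-byte header and 500 bytes of payload makes
 * sk_filter_trim_cap() trim the skb, but never below th->doff * 4, so the
 * TCP header always survives; end_seq is then pulled back by the number of
 * payload bytes removed so sequence accounting stays consistent.
 */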
1588
1da177e4
LT
1589/*
1590 * From tcp_input.c
1591 */
1592
1593int tcp_v4_rcv(struct sk_buff *skb)
1594{
3b24d854 1595 struct net *net = dev_net(skb->dev);
eddc9ec5 1596 const struct iphdr *iph;
cf533ea5 1597 const struct tcphdr *th;
3b24d854 1598 bool refcounted;
1da177e4
LT
1599 struct sock *sk;
1600 int ret;
1601
1602 if (skb->pkt_type != PACKET_HOST)
1603 goto discard_it;
1604
1605 /* Count it even if it's bad */
90bbcc60 1606 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1da177e4
LT
1607
1608 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1609 goto discard_it;
1610
ea1627c2 1611 th = (const struct tcphdr *)skb->data;
1da177e4 1612
ea1627c2 1613 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1da177e4
LT
1614 goto bad_packet;
1615 if (!pskb_may_pull(skb, th->doff * 4))
1616 goto discard_it;
1617
1618 /* An explanation is required here, I think.
1619 * Packet length and doff are validated by header prediction,
caa20d9a 1620 * provided case of th->doff==0 is eliminated.
1da177e4 1621 * So, we defer the checks. */
ed70fcfc
TH
1622
1623 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1624 goto csum_error;
1da177e4 1625
ea1627c2 1626 th = (const struct tcphdr *)skb->data;
eddc9ec5 1627 iph = ip_hdr(skb);
971f10ec
ED
1628 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1629 * barrier() makes sure compiler wont play fool^Waliasing games.
1630 */
1631 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1632 sizeof(struct inet_skb_parm));
1633 barrier();
1634
1da177e4
LT
1635 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1636 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1637 skb->len - th->doff * 4);
1638 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
e11ecddf 1639 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
04317daf 1640 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
b82d1bb4 1641 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1da177e4
LT
1642 TCP_SKB_CB(skb)->sacked = 0;

lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = tcp_check_req(sk, skb, req, false);
		if (!nsk) {
			reqsk_put(req);
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			refcounted = false;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
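
/* Receive path summary (informal recap of the labels above, not new
 * behaviour): a segment is demuxed via __inet_lookup_skb();
 * TCP_NEW_SYN_RECV request sockets are promoted through tcp_check_req(),
 * TCP_TIME_WAIT sockets take the do_time_wait leg (where a valid new SYN
 * can re-target a listener and jump back to "process"), and everything
 * else is handled directly, prequeued, or pushed onto the backlog when
 * the socket is owned by user context.
 */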

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
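
/* Usage note (a sketch of the intent; the consuming side lives in the
 * early demux path, abbreviated here): caching the validated dst on the
 * socket lets subsequent packets skip a route lookup, roughly
 *
 *	dst = READ_ONCE(sk->sk_rx_dst);
 *	if (dst)
 *		dst = dst_check(dst, 0);
 *	if (dst)
 *		skb_dst_set_noref(skb, dst);
 *
 * dst_hold_safe() above avoids caching a dst whose refcount has already
 * dropped to zero.
 */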

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Clean up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up the prequeue; it really must be empty by now. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If the socket is aborted during a connect operation */
	tcp_free_fastopen_req(tp);
	tcp_saved_syn_free(tp);

	local_bh_disable();
	sk_sockets_allocated_dec(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket following cur. If cur is NULL, get the
 * first socket starting from the bucket given in st->bucket; when
 * st->bucket is zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family)
			return sk;
	}
	spin_unlock(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get the first established socket starting from the bucket given in
 * st->bucket. If st->bucket is zero, the very first socket in the hash
 * is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
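
/* Iteration sketch (informal; this is the generic seq_file contract, not
 * code from this file): a read of /proc/net/tcp drives the callbacks
 * above roughly as
 *
 *	p = tcp_seq_start(seq, &pos);
 *	while (p) {
 *		tcp4_seq_show(seq, p);
 *		p = tcp_seq_next(seq, p, &pos);
 *	}
 *	tcp_seq_stop(seq, p);
 *
 * st->last_pos lets the next read() chunk resume near the previous
 * position via tcp_seek_last_pos() instead of rescanning every bucket.
 */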

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);

static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
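
/* Example output (illustrative values only; addresses are hexadecimal in
 * host byte order, so 0100007F is 127.0.0.1 and port 0016 is 22):
 *
 *   sl  local_address rem_address   st tx_queue rx_queue ...
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 ...
 *
 * st 0A is TCP_LISTEN. get_tcp4_sock(), get_openreq4() and
 * get_timewait4_sock() emit the same column layout, so the single header
 * above fits all three socket flavours.
 */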

static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
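
/* Binding note (a sketch of how this table is reached via the generic
 * inet glue, which lives outside this file): a userspace
 * socket(AF_INET, SOCK_STREAM, IPPROTO_TCP) call resolves through the
 * inetsw registry to tcp_prot, after which the function pointers above
 * back the usual connect()/sendmsg()/recvmsg()/close() socket calls.
 */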

static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 0;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
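
/* Per-netns defaults note (the sysctl plumbing is assumed from the usual
 * net/ipv4 glue, not defined here): the fields initialized above surface
 * as per-namespace net.ipv4.* knobs, e.g.
 *
 *	sysctl_tcp_syncookies = 1  ->  net.ipv4.tcp_syncookies = 1
 *	sysctl_tcp_ecn = 2         ->  net.ipv4.tcp_ecn = 2 (reply to ECN,
 *	                               but do not request it)
 *	sysctl_tcp_fin_timeout     ->  net.ipv4.tcp_fin_timeout (60s)
 *
 * so a freshly created network namespace starts from these defaults
 * rather than inheriting runtime changes made in another namespace.
 */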

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}