/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

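/*
 * Note: secure_tcp_sequence_number() follows the RFC 6528 style scheme:
 * the ISN is a fine-grained clock plus a keyed hash of the connection
 * 4-tuple, so initial sequence numbers are unpredictable to off-path
 * attackers while remaining monotonic across successive incarnations
 * of the same 4-tuple.
 */
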
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sysctl_tcp_tw_reuse &&
		      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

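/*
 * The "+ 65535 + 2" above starts the new incarnation's write_seq more
 * than one maximum offered window beyond the old connection's snd_nxt,
 * so a delayed segment from the previous incarnation falls outside the
 * new receive window and cannot be mistaken for fresh data.
 */
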
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

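/*
 * Ordering note for tcp_v4_connect(): the socket is moved to SYN-SENT
 * and hashed first (inet_hash_connect() also picks the ephemeral port),
 * then the route is rebound with the final port pair via
 * ip_route_newports(); only after that is the ISN chosen and the SYN
 * actually emitted by tcp_connect().
 */
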
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	WARN_ON(req->sk);

	if (seq != tcp_rsk(req)->snt_isn) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq);

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

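/*
 * The backoff-revert branch above follows draft-zimmermann-tcp-lcd:
 * an ICMP net/host unreachable that matches the first unacknowledged
 * segment proves the path is at least partially alive, so one level of
 * exponential backoff is undone and the retransmit timer is re-armed
 * with the remaining, shorter timeout (or fires immediately if it has
 * already expired).
 */
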
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

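/*
 * With CHECKSUM_PARTIAL the hardware finishes the checksum: only the
 * folded pseudo-header sum is stored in th->check, and csum_start /
 * csum_offset tell the NIC where to fold in the result. Otherwise the
 * full checksum over header and payload is computed in software.
 */
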
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not losing security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net,
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}

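/*
 * The sequence numbers chosen above follow RFC 793 reset generation:
 * if the offending segment carried an ACK, the RST is sent with
 * seq = seg.ack and no ACK bit; otherwise the RST acks exactly the
 * sequence space consumed by the incoming segment (SYN and FIN each
 * count for one), so the peer will accept it as in-window.
 */
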
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct net *net,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sock_net(sk), skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	tcp_v4_send_ack(sock_net(sk), skb, seq,
			tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      bool attach_req)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, attach_req);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk) ||
					   lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}

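/*
 * Minimal userspace sketch (not part of this file) of how a key lands
 * in tcp_v4_parse_md5_keys() above, via the TCP_MD5SIG socket option;
 * field layout follows struct tcp_md5sig in <linux/tcp.h>:
 *
 *	struct tcp_md5sig md5 = { 0 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key for that peer address instead.
 */
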
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

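/*
 * The hash coverage above matches RFC 2385: the TCP pseudo-header,
 * then the base TCP header with its checksum zeroed (options excluded,
 * see tcp_md5_hash_header()), then the segment payload, and finally
 * the key itself.
 */
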
#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
#endif
	return false;
}

static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
	ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict) {
		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
			*strict = true;
		else
			*strict = false;
	}

	return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
};

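/*
 * These two ops tables are what keep request handling address-family
 * independent: tcp_conn_request() in tcp_input.c drives the SYN/SYN-ACK
 * state machine and calls back through .init_req, .route_req, .init_seq
 * and .send_synack, so the same core code serves both this file and its
 * IPv6 counterpart.
 */
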
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (*own_req)
		tcp_move_syn(newtp, req);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

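/*
 * Syncookie path: when the SYN queue overflowed, the listener encoded
 * the connection parameters into the SYN-ACK's initial sequence number
 * instead of keeping a request sock. A later ACK that matches no stored
 * request therefore lands here (note the !th->syn test), and
 * cookie_v4_check() validates the echoed cookie and reconstructs the
 * request on the fly.
 */
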
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			sk_mark_napi_id(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}

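/*
 * Early demux runs from the IP receive path before routing: if an
 * established socket is found, the skb is bound to it and the socket's
 * cached input route (sk_rx_dst) can be reused via skb_dst_set_noref(),
 * saving both a route lookup and a second socket lookup in tcp_v4_rcv().
 */
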
/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see why it failed. 8)8)				  --ANK
 *
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping RCU protected region, we need to take care of skb
	 * dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
	 * Instead of doing full sk_rx_dst validity here, let's perform
	 * an optimistic check.
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force_safe(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (tp->ucopy.memory > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
			sk_backlog_rcv(sk, skb1);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPPREQUEUEDROPPED);
		}

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);

1da177e4
LT
1530/*
1531 * From tcp_input.c
1532 */
1533
1534int tcp_v4_rcv(struct sk_buff *skb)
1535{
eddc9ec5 1536 const struct iphdr *iph;
cf533ea5 1537 const struct tcphdr *th;
1da177e4
LT
1538 struct sock *sk;
1539 int ret;
a86b1e30 1540 struct net *net = dev_net(skb->dev);
1da177e4
LT
1541
1542 if (skb->pkt_type != PACKET_HOST)
1543 goto discard_it;
1544
1545 /* Count it even if it's bad */
63231bdd 1546 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1da177e4
LT
1547
1548 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1549 goto discard_it;
1550
aa8223c7 1551 th = tcp_hdr(skb);
1da177e4
LT
1552
1553 if (th->doff < sizeof(struct tcphdr) / 4)
1554 goto bad_packet;
1555 if (!pskb_may_pull(skb, th->doff * 4))
1556 goto discard_it;
1557
1558 /* An explanation is required here, I think.
1559 * Packet length and doff are validated by header prediction,
caa20d9a 1560 * provided case of th->doff==0 is eliminated.
1da177e4 1561 * So, we defer the checks. */
ed70fcfc
TH
1562
1563 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1564 goto csum_error;
1da177e4 1565
aa8223c7 1566 th = tcp_hdr(skb);
eddc9ec5 1567 iph = ip_hdr(skb);
971f10ec
ED
1568 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1569 * barrier() makes sure compiler wont play fool^Waliasing games.
1570 */
1571 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1572 sizeof(struct inet_skb_parm));
1573 barrier();
1574
1da177e4
LT
1575 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1576 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1577 skb->len - th->doff * 4);
1578 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
e11ecddf 1579 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
04317daf 1580 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
b82d1bb4 1581 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1da177e4
LT
1582 TCP_SKB_CB(skb)->sacked = 0;
1583
4bdc3d66 1584lookup:
9a1f27c4 1585 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1da177e4
LT
1586 if (!sk)
1587 goto no_tcp_socket;
1588
bb134d5d
ED
1589process:
1590 if (sk->sk_state == TCP_TIME_WAIT)
1591 goto do_time_wait;
1592
079096f1
ED
1593 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1594 struct request_sock *req = inet_reqsk(sk);
1595 struct sock *nsk = NULL;
1596
1597 sk = req->rsk_listener;
1598 if (tcp_v4_inbound_md5_hash(sk, skb))
1599 goto discard_and_relse;
4bdc3d66 1600 if (likely(sk->sk_state == TCP_LISTEN)) {
079096f1 1601 nsk = tcp_check_req(sk, skb, req, false);
4bdc3d66 1602 } else {
f03f2e15 1603 inet_csk_reqsk_queue_drop_and_put(sk, req);
4bdc3d66
ED
1604 goto lookup;
1605 }
079096f1
ED
1606 if (!nsk) {
1607 reqsk_put(req);
1608 goto discard_it;
1609 }
1610 if (nsk == sk) {
1611 sock_hold(sk);
1612 reqsk_put(req);
1613 } else if (tcp_child_process(sk, nsk, skb)) {
1614 tcp_v4_send_reset(nsk, skb);
1615 goto discard_it;
1616 } else {
1617 return 0;
1618 }
1619 }
6cce09f8
ED
1620 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1621 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1622 goto discard_and_relse;
6cce09f8 1623 }
d218d111 1624
1da177e4
LT
1625 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1626 goto discard_and_relse;
9ea88a15 1627
9ea88a15
DP
1628 if (tcp_v4_inbound_md5_hash(sk, skb))
1629 goto discard_and_relse;
9ea88a15 1630
b59c2701 1631 nf_reset(skb);
1da177e4 1632
fda9ef5d 1633 if (sk_filter(sk, skb))
1da177e4
LT
1634 goto discard_and_relse;
1635
1636 skb->dev = NULL;
1637
e994b2f0
ED
1638 if (sk->sk_state == TCP_LISTEN) {
1639 ret = tcp_v4_do_rcv(sk, skb);
1640 goto put_and_return;
1641 }
1642
1643 sk_incoming_cpu_update(sk);
1644
c6366184 1645 bh_lock_sock_nested(sk);
2efd055c 1646 tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1da177e4
LT
1647 ret = 0;
1648 if (!sock_owned_by_user(sk)) {
7bced397 1649 if (!tcp_prequeue(sk, skb))
1da177e4 1650 ret = tcp_v4_do_rcv(sk, skb);
da882c1f
ED
1651 } else if (unlikely(sk_add_backlog(sk, skb,
1652 sk->sk_rcvbuf + sk->sk_sndbuf))) {
6b03a53a 1653 bh_unlock_sock(sk);
6cce09f8 1654 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
6b03a53a
ZY
1655 goto discard_and_relse;
1656 }
1da177e4
LT
1657 bh_unlock_sock(sk);
1658
e994b2f0 1659put_and_return:
1da177e4
LT
1660 sock_put(sk);
1661
1662 return ret;
1663
1664no_tcp_socket:
1665 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1666 goto discard_it;
1667
12e25e10 1668 if (tcp_checksum_complete(skb)) {
6a5dc9e5
ED
1669csum_error:
1670 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1da177e4 1671bad_packet:
63231bdd 1672 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1da177e4 1673 } else {
cfb6eeb4 1674 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
1675 }
1676
1677discard_it:
1678 /* Discard frame. */
1679 kfree_skb(skb);
e905a9ed 1680 return 0;
1da177e4
LT
1681
1682discard_and_relse:
1683 sock_put(sk);
1684 goto discard_it;
1685
1686do_time_wait:
1687 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 1688 inet_twsk_put(inet_twsk(sk));
1689 goto discard_it;
1690 }
1691
1692 if (tcp_checksum_complete(skb)) {
1693 inet_twsk_put(inet_twsk(sk));
1694 goto csum_error;
1da177e4 1695 }
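/* tcp_timewait_state_process() yields one of four verdicts: TCP_TW_SYN
 * may revive the connection on a current listener, TCP_TW_ACK re-sends
 * the final ACK, TCP_TW_RST answers with a reset, and TCP_TW_SUCCESS
 * requires nothing further.
 */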
9469c7b4 1696 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 1697 case TCP_TW_SYN: {
c346dca1 1698 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
c67499c0 1699 &tcp_hashinfo,
da5e3630 1700 iph->saddr, th->source,
eddc9ec5 1701 iph->daddr, th->dest,
463c84b9 1702 inet_iif(skb));
1da177e4 1703 if (sk2) {
dbe7faa4 1704 inet_twsk_deschedule_put(inet_twsk(sk));
1705 sk = sk2;
1706 goto process;
1707 }
1708 /* Fall through to ACK */
1709 }
1710 case TCP_TW_ACK:
1711 tcp_v4_timewait_ack(sk, skb);
1712 break;
1713 case TCP_TW_RST:
1714 tcp_v4_send_reset(sk, skb);
1715 inet_twsk_deschedule_put(inet_twsk(sk));
1716 goto discard_it;
1717 case TCP_TW_SUCCESS:;
1718 }
1719 goto discard_it;
1720}
1721
1722static struct timewait_sock_ops tcp_timewait_sock_ops = {
1723 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1724 .twsk_unique = tcp_twsk_unique,
1725 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 1726};
1da177e4 1727
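/* Cache the input route on the socket so that early demux can skip a
 * per-packet route lookup; dst_hold_safe() guards against taking a
 * reference on a dst whose last reference is already being dropped.
 */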
63d02d15 1728void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1729{
1730 struct dst_entry *dst = skb_dst(skb);
1731
5037e9ef 1732 if (dst && dst_hold_safe(dst)) {
1733 sk->sk_rx_dst = dst;
1734 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1735 }
5d299f3d 1736}
63d02d15 1737EXPORT_SYMBOL(inet_sk_rx_dst_set);
5d299f3d 1738
3b401a81 1739const struct inet_connection_sock_af_ops ipv4_specific = {
1740 .queue_xmit = ip_queue_xmit,
1741 .send_check = tcp_v4_send_check,
1742 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 1743 .sk_rx_dst_set = inet_sk_rx_dst_set,
1744 .conn_request = tcp_v4_conn_request,
1745 .syn_recv_sock = tcp_v4_syn_recv_sock,
1746 .net_header_len = sizeof(struct iphdr),
1747 .setsockopt = ip_setsockopt,
1748 .getsockopt = ip_getsockopt,
1749 .addr2sockaddr = inet_csk_addr2sockaddr,
1750 .sockaddr_len = sizeof(struct sockaddr_in),
ab1e0a13 1751 .bind_conflict = inet_csk_bind_conflict,
3fdadf7d 1752#ifdef CONFIG_COMPAT
1753 .compat_setsockopt = compat_ip_setsockopt,
1754 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 1755#endif
4fab9071 1756 .mtu_reduced = tcp_v4_mtu_reduced,
1da177e4 1757};
4bc2f18b 1758EXPORT_SYMBOL(ipv4_specific);
1da177e4 1759
cfb6eeb4 1760#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 1761static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 1762 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 1763 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 1764 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 1765};
b6332e6c 1766#endif
cfb6eeb4 1767
1768/* NOTE: A lot of things are set to zero explicitly by the call to
1769 * sk_alloc(), so they need not be done here.
1770 */
1771static int tcp_v4_init_sock(struct sock *sk)
1772{
6687e988 1773 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 1774
900f65d3 1775 tcp_init_sock(sk);
1da177e4 1776
8292a17a 1777 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 1778
cfb6eeb4 1779#ifdef CONFIG_TCP_MD5SIG
ac807fa8 1780 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 1781#endif
1da177e4 1782
1783 return 0;
1784}
1785
7d06b2e0 1786void tcp_v4_destroy_sock(struct sock *sk)
1787{
1788 struct tcp_sock *tp = tcp_sk(sk);
1789
1790 tcp_clear_xmit_timers(sk);
1791
6687e988 1792 tcp_cleanup_congestion_control(sk);
317a76f9 1793
1da177e4 1794 /* Cleanup up the write buffer. */
fe067e8a 1795 tcp_write_queue_purge(sk);
1796
1797 /* Cleans up our, hopefully empty, out_of_order_queue. */
e905a9ed 1798 __skb_queue_purge(&tp->out_of_order_queue);
1da177e4 1799
1800#ifdef CONFIG_TCP_MD5SIG
1801 /* Clean up the MD5 key list, if any */
1802 if (tp->md5sig_info) {
a915da9b 1803 tcp_clear_md5_list(sk);
a8afca03 1804 kfree_rcu(tp->md5sig_info, rcu);
1805 tp->md5sig_info = NULL;
1806 }
1807#endif
1a2449a8 1808
1809	/* Clean up the prequeue; it really must be empty. */
1810 __skb_queue_purge(&tp->ucopy.prequeue);
1811
1812 /* Clean up a referenced TCP bind bucket. */
463c84b9 1813 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 1814 inet_put_port(sk);
1da177e4 1815
00db4124 1816 BUG_ON(tp->fastopen_rsk);
435cf559 1817
1818	/* If the socket was aborted during the connect() operation */
1819 tcp_free_fastopen_req(tp);
cd8ae852 1820 tcp_saved_syn_free(tp);
cf60af03 1821
180d8cd9 1822 sk_sockets_allocated_dec(sk);
3d596f7b 1823
baac50bb 1824 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3d596f7b 1825 sock_release_memcg(sk);
1da177e4 1826}
1827EXPORT_SYMBOL(tcp_v4_destroy_sock);
1828
1829#ifdef CONFIG_PROC_FS
1830/* Proc filesystem TCP sock list dumping. */
1831
1832/*
1833 * Get the next listener socket following cur.  If cur is NULL, get the
1834 * first socket, starting from the bucket given in st->bucket; when
1835 * st->bucket is zero, the very first socket in the hash table is returned.
1836 */
1837static void *listening_get_next(struct seq_file *seq, void *cur)
1838{
463c84b9 1839 struct inet_connection_sock *icsk;
c25eb3bf 1840 struct hlist_nulls_node *node;
1da177e4 1841 struct sock *sk = cur;
5caea4ea 1842 struct inet_listen_hashbucket *ilb;
5799de0b 1843 struct tcp_iter_state *st = seq->private;
a4146b1b 1844 struct net *net = seq_file_net(seq);
1845
1846 if (!sk) {
a8b690f9 1847 ilb = &tcp_hashinfo.listening_hash[st->bucket];
5caea4ea 1848 spin_lock_bh(&ilb->lock);
c25eb3bf 1849 sk = sk_nulls_head(&ilb->head);
a8b690f9 1850 st->offset = 0;
1851 goto get_sk;
1852 }
5caea4ea 1853 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1da177e4 1854 ++st->num;
a8b690f9 1855 ++st->offset;
1da177e4 1856
079096f1 1857 sk = sk_nulls_next(sk);
1da177e4 1858get_sk:
c25eb3bf 1859 sk_nulls_for_each_from(sk, node) {
1860 if (!net_eq(sock_net(sk), net))
1861 continue;
1862 if (sk->sk_family == st->family) {
1863 cur = sk;
1864 goto out;
1865 }
e905a9ed 1866 icsk = inet_csk(sk);
1da177e4 1867 }
5caea4ea 1868 spin_unlock_bh(&ilb->lock);
a8b690f9 1869 st->offset = 0;
0f7ff927 1870 if (++st->bucket < INET_LHTABLE_SIZE) {
1871 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1872 spin_lock_bh(&ilb->lock);
c25eb3bf 1873 sk = sk_nulls_head(&ilb->head);
1874 goto get_sk;
1875 }
1876 cur = NULL;
1877out:
1878 return cur;
1879}
1880
1881static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1882{
1883 struct tcp_iter_state *st = seq->private;
1884 void *rc;
1885
1886 st->bucket = 0;
1887 st->offset = 0;
1888 rc = listening_get_next(seq, NULL);
1889
1890 while (rc && *pos) {
1891 rc = listening_get_next(seq, rc);
1892 --*pos;
1893 }
1894 return rc;
1895}
1896
05dbc7b5 1897static inline bool empty_bucket(const struct tcp_iter_state *st)
6eac5604 1898{
05dbc7b5 1899 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1900}
1901
1902/*
1903 * Get the first established socket, starting from the bucket given in st->bucket.
1904 * If st->bucket is zero, the very first socket in the hash is returned.
1905 */
1906static void *established_get_first(struct seq_file *seq)
1907{
5799de0b 1908 struct tcp_iter_state *st = seq->private;
a4146b1b 1909 struct net *net = seq_file_net(seq);
1910 void *rc = NULL;
1911
1912 st->offset = 0;
1913 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 1914 struct sock *sk;
3ab5aee7 1915 struct hlist_nulls_node *node;
9db66bdc 1916 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 1917
1918 /* Lockless fast path for the common case of empty buckets */
1919 if (empty_bucket(st))
1920 continue;
1921
9db66bdc 1922 spin_lock_bh(lock);
3ab5aee7 1923 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
f40c8174 1924 if (sk->sk_family != st->family ||
878628fb 1925 !net_eq(sock_net(sk), net)) {
1926 continue;
1927 }
1928 rc = sk;
1929 goto out;
1930 }
9db66bdc 1931 spin_unlock_bh(lock);
1932 }
1933out:
1934 return rc;
1935}
1936
1937static void *established_get_next(struct seq_file *seq, void *cur)
1938{
1939 struct sock *sk = cur;
3ab5aee7 1940 struct hlist_nulls_node *node;
5799de0b 1941 struct tcp_iter_state *st = seq->private;
a4146b1b 1942 struct net *net = seq_file_net(seq);
1943
1944 ++st->num;
a8b690f9 1945 ++st->offset;
1da177e4 1946
05dbc7b5 1947 sk = sk_nulls_next(sk);
1da177e4 1948
3ab5aee7 1949 sk_nulls_for_each_from(sk, node) {
878628fb 1950 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
05dbc7b5 1951 return sk;
1952 }
1953
1954 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1955 ++st->bucket;
1956 return established_get_first(seq);
1957}
1958
1959static void *established_get_idx(struct seq_file *seq, loff_t pos)
1960{
1961 struct tcp_iter_state *st = seq->private;
1962 void *rc;
1963
1964 st->bucket = 0;
1965 rc = established_get_first(seq);
1966
1967 while (rc && pos) {
1968 rc = established_get_next(seq, rc);
1969 --pos;
7174259e 1970 }
1971 return rc;
1972}
1973
1974static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1975{
1976 void *rc;
5799de0b 1977 struct tcp_iter_state *st = seq->private;
1da177e4 1978
1979 st->state = TCP_SEQ_STATE_LISTENING;
1980 rc = listening_get_idx(seq, &pos);
1981
1982 if (!rc) {
1983 st->state = TCP_SEQ_STATE_ESTABLISHED;
1984 rc = established_get_idx(seq, pos);
1985 }
1986
1987 return rc;
1988}
1989
1990static void *tcp_seek_last_pos(struct seq_file *seq)
1991{
1992 struct tcp_iter_state *st = seq->private;
1993 int offset = st->offset;
1994 int orig_num = st->num;
1995 void *rc = NULL;
1996
1997 switch (st->state) {
1998 case TCP_SEQ_STATE_LISTENING:
1999 if (st->bucket >= INET_LHTABLE_SIZE)
2000 break;
2001 st->state = TCP_SEQ_STATE_LISTENING;
2002 rc = listening_get_next(seq, NULL);
2003 while (offset-- && rc)
2004 rc = listening_get_next(seq, rc);
2005 if (rc)
2006 break;
2007 st->bucket = 0;
05dbc7b5 2008 st->state = TCP_SEQ_STATE_ESTABLISHED;
2009 /* Fallthrough */
2010 case TCP_SEQ_STATE_ESTABLISHED:
2011 if (st->bucket > tcp_hashinfo.ehash_mask)
2012 break;
2013 rc = established_get_first(seq);
2014 while (offset-- && rc)
2015 rc = established_get_next(seq, rc);
2016 }
2017
2018 st->num = orig_num;
2019
2020 return rc;
2021}
2022
2023static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2024{
5799de0b 2025 struct tcp_iter_state *st = seq->private;
2026 void *rc;
2027
2028 if (*pos && *pos == st->last_pos) {
2029 rc = tcp_seek_last_pos(seq);
2030 if (rc)
2031 goto out;
2032 }
2033
2034 st->state = TCP_SEQ_STATE_LISTENING;
2035 st->num = 0;
2036 st->bucket = 0;
2037 st->offset = 0;
2038 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2039
2040out:
2041 st->last_pos = *pos;
2042 return rc;
2043}
2044
2045static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2046{
a8b690f9 2047 struct tcp_iter_state *st = seq->private;
1da177e4 2048 void *rc = NULL;
2049
2050 if (v == SEQ_START_TOKEN) {
2051 rc = tcp_get_idx(seq, 0);
2052 goto out;
2053 }
2054
2055 switch (st->state) {
2056 case TCP_SEQ_STATE_LISTENING:
2057 rc = listening_get_next(seq, v);
2058 if (!rc) {
1da177e4 2059 st->state = TCP_SEQ_STATE_ESTABLISHED;
2060 st->bucket = 0;
2061 st->offset = 0;
2062 rc = established_get_first(seq);
2063 }
2064 break;
2065 case TCP_SEQ_STATE_ESTABLISHED:
2066 rc = established_get_next(seq, v);
2067 break;
2068 }
2069out:
2070 ++*pos;
a8b690f9 2071 st->last_pos = *pos;
2072 return rc;
2073}
2074
2075static void tcp_seq_stop(struct seq_file *seq, void *v)
2076{
5799de0b 2077 struct tcp_iter_state *st = seq->private;
2078
2079 switch (st->state) {
2080 case TCP_SEQ_STATE_LISTENING:
2081 if (v != SEQ_START_TOKEN)
5caea4ea 2082 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4 2083 break;
2084 case TCP_SEQ_STATE_ESTABLISHED:
2085 if (v)
9db66bdc 2086 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2087 break;
2088 }
2089}
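/* seq_file contract for the walkers above: ->start() (re)positions the
 * iterator and takes the relevant bucket lock, ->next() advances and may
 * hop buckets (dropping one lock, taking the next), and ->stop() releases
 * whatever bucket lock is still held.  st->last_pos lets a subsequent
 * read() resume without rescanning from bucket zero.
 */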
2090
73cb88ec 2091int tcp_seq_open(struct inode *inode, struct file *file)
1da177e4 2092{
d9dda78b 2093 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
1da177e4 2094 struct tcp_iter_state *s;
52d6f3f1 2095 int err;
1da177e4 2096
2097 err = seq_open_net(inode, file, &afinfo->seq_ops,
2098 sizeof(struct tcp_iter_state));
2099 if (err < 0)
2100 return err;
f40c8174 2101
52d6f3f1 2102 s = ((struct seq_file *)file->private_data)->private;
1da177e4 2103 s->family = afinfo->family;
688d1945 2104 s->last_pos = 0;
2105 return 0;
2106}
73cb88ec 2107EXPORT_SYMBOL(tcp_seq_open);
f40c8174 2108
6f8b13bc 2109int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2110{
2111 int rc = 0;
2112 struct proc_dir_entry *p;
2113
2114 afinfo->seq_ops.start = tcp_seq_start;
2115 afinfo->seq_ops.next = tcp_seq_next;
2116 afinfo->seq_ops.stop = tcp_seq_stop;
2117
84841c3c 2118 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
73cb88ec 2119 afinfo->seq_fops, afinfo);
84841c3c 2120 if (!p)
2121 rc = -ENOMEM;
2122 return rc;
2123}
4bc2f18b 2124EXPORT_SYMBOL(tcp_proc_register);
1da177e4 2125
6f8b13bc 2126void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4 2127{
ece31ffd 2128 remove_proc_entry(afinfo->name, net->proc_net);
1da177e4 2129}
4bc2f18b 2130EXPORT_SYMBOL(tcp_proc_unregister);
1da177e4 2131
d4f06873 2132static void get_openreq4(const struct request_sock *req,
aa3a0c8c 2133 struct seq_file *f, int i)
1da177e4 2134{
2e6599cb 2135 const struct inet_request_sock *ireq = inet_rsk(req);
fa76ce73 2136 long delta = req->rsk_timer.expires - jiffies;
1da177e4 2137
5e659e4c 2138 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2139 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
1da177e4 2140 i,
634fb979 2141 ireq->ir_loc_addr,
d4f06873 2142 ireq->ir_num,
2143 ireq->ir_rmt_addr,
2144 ntohs(ireq->ir_rmt_port),
2145 TCP_SYN_RECV,
2146 0, 0, /* could print option size, but that is af dependent. */
2147 1, /* timers active (only the expire timer) */
a399a805 2148 jiffies_delta_to_clock_t(delta),
e6c022a4 2149 req->num_timeout,
2150 from_kuid_munged(seq_user_ns(f),
2151 sock_i_uid(req->rsk_listener)),
2152 0, /* non standard timer */
2153 0, /* open_requests have no inode */
d4f06873 2154 0,
652586df 2155 req);
2156}
2157
652586df 2158static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2159{
2160 int timer_active;
2161 unsigned long timer_expires;
cf533ea5 2162 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2163 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2164 const struct inet_sock *inet = inet_sk(sk);
0536fcc0 2165 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2166 __be32 dest = inet->inet_daddr;
2167 __be32 src = inet->inet_rcv_saddr;
2168 __u16 destp = ntohs(inet->inet_dport);
2169 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2170 int rx_queue;
00fd38d9 2171 int state;
1da177e4 2172
2173 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2174 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2175 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2176 timer_active = 1;
2177 timer_expires = icsk->icsk_timeout;
2178 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2179 timer_active = 4;
463c84b9 2180 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2181 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2182 timer_active = 2;
cf4c6bf8 2183 timer_expires = sk->sk_timer.expires;
2184 } else {
2185 timer_active = 0;
2186 timer_expires = jiffies;
2187 }
2188
2189 state = sk_state_load(sk);
2190 if (state == TCP_LISTEN)
2191 rx_queue = sk->sk_ack_backlog;
2192 else
2193 /* Because we don't lock the socket,
2194 * we might find a transient negative value.
2195 */
2196 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2197
5e659e4c 2198 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
652586df 2199 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
00fd38d9 2200 i, src, srcp, dest, destp, state,
47da8ee6 2201 tp->write_seq - tp->snd_una,
49d09007 2202 rx_queue,
1da177e4 2203 timer_active,
a399a805 2204 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2205 icsk->icsk_retransmits,
a7cb5a49 2206 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2207 icsk->icsk_probes_out,
2208 sock_i_ino(sk),
2209 atomic_read(&sk->sk_refcnt), sk,
2210 jiffies_to_clock_t(icsk->icsk_rto),
2211 jiffies_to_clock_t(icsk->icsk_ack.ato),
463c84b9 2212 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1da177e4 2213 tp->snd_cwnd,
2214 state == TCP_LISTEN ?
2215 fastopenq->max_qlen :
652586df 2216 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2217}
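/* An illustrative (hypothetical) row as emitted above, for a socket
 * listening on 127.0.0.1:22, as seen on a little-endian host:
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 ffff88003d3af3c0 100 0 0 10 0
 *
 * Addresses and ports are hexadecimal; a __be32 printed with %08X comes
 * out byte-swapped on little-endian machines, hence 0100007F for
 * 127.0.0.1.  State 0A is TCP_LISTEN.
 */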
2218
cf533ea5 2219static void get_timewait4_sock(const struct inet_timewait_sock *tw,
652586df 2220 struct seq_file *f, int i)
1da177e4 2221{
789f558c 2222 long delta = tw->tw_timer.expires - jiffies;
23f33c2d 2223 __be32 dest, src;
1da177e4 2224 __u16 destp, srcp;
2225
2226 dest = tw->tw_daddr;
2227 src = tw->tw_rcv_saddr;
2228 destp = ntohs(tw->tw_dport);
2229 srcp = ntohs(tw->tw_sport);
2230
5e659e4c 2231 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2232 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
1da177e4 2233 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2234 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
652586df 2235 atomic_read(&tw->tw_refcnt), tw);
2236}
2237
2238#define TMPSZ 150
2239
2240static int tcp4_seq_show(struct seq_file *seq, void *v)
2241{
5799de0b 2242 struct tcp_iter_state *st;
05dbc7b5 2243 struct sock *sk = v;
1da177e4 2244
652586df 2245 seq_setwidth(seq, TMPSZ - 1);
1da177e4 2246 if (v == SEQ_START_TOKEN) {
652586df 2247 seq_puts(seq, " sl local_address rem_address st tx_queue "
2248 "rx_queue tr tm->when retrnsmt uid timeout "
2249 "inode");
2250 goto out;
2251 }
2252 st = seq->private;
2253
2254 if (sk->sk_state == TCP_TIME_WAIT)
2255 get_timewait4_sock(v, seq, st->num);
2256 else if (sk->sk_state == TCP_NEW_SYN_RECV)
aa3a0c8c 2257 get_openreq4(v, seq, st->num);
2258 else
2259 get_tcp4_sock(v, seq, st->num);
1da177e4 2260out:
652586df 2261 seq_pad(seq, '\n');
2262 return 0;
2263}
2264
2265static const struct file_operations tcp_afinfo_seq_fops = {
2266 .owner = THIS_MODULE,
2267 .open = tcp_seq_open,
2268 .read = seq_read,
2269 .llseek = seq_lseek,
2270 .release = seq_release_net
2271};
2272
1da177e4 2273static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2274 .name = "tcp",
2275 .family = AF_INET,
73cb88ec 2276 .seq_fops = &tcp_afinfo_seq_fops,
2277 .seq_ops = {
2278 .show = tcp4_seq_show,
2279 },
2280};
2281
2c8c1e72 2282static int __net_init tcp4_proc_init_net(struct net *net)
2283{
2284 return tcp_proc_register(net, &tcp4_seq_afinfo);
2285}
2286
2c8c1e72 2287static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6
PE
2288{
2289 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2290}
2291
2292static struct pernet_operations tcp4_net_ops = {
2293 .init = tcp4_proc_init_net,
2294 .exit = tcp4_proc_exit_net,
2295};
2296
2297int __init tcp4_proc_init(void)
2298{
757764f6 2299 return register_pernet_subsys(&tcp4_net_ops);
2300}
2301
2302void tcp4_proc_exit(void)
2303{
757764f6 2304 unregister_pernet_subsys(&tcp4_net_ops);
2305}
2306#endif /* CONFIG_PROC_FS */
2307
2308struct proto tcp_prot = {
2309 .name = "TCP",
2310 .owner = THIS_MODULE,
2311 .close = tcp_close,
2312 .connect = tcp_v4_connect,
2313 .disconnect = tcp_disconnect,
463c84b9 2314 .accept = inet_csk_accept,
2315 .ioctl = tcp_ioctl,
2316 .init = tcp_v4_init_sock,
2317 .destroy = tcp_v4_destroy_sock,
2318 .shutdown = tcp_shutdown,
2319 .setsockopt = tcp_setsockopt,
2320 .getsockopt = tcp_getsockopt,
1da177e4 2321 .recvmsg = tcp_recvmsg,
2322 .sendmsg = tcp_sendmsg,
2323 .sendpage = tcp_sendpage,
1da177e4 2324 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 2325 .release_cb = tcp_release_cb,
2326 .hash = inet_hash,
2327 .unhash = inet_unhash,
2328 .get_port = inet_csk_get_port,
1da177e4 2329 .enter_memory_pressure = tcp_enter_memory_pressure,
c9bee3b7 2330 .stream_memory_free = tcp_stream_memory_free,
1da177e4 2331 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 2332 .orphan_count = &tcp_orphan_count,
2333 .memory_allocated = &tcp_memory_allocated,
2334 .memory_pressure = &tcp_memory_pressure,
a4fe34bf 2335 .sysctl_mem = sysctl_tcp_mem,
2336 .sysctl_wmem = sysctl_tcp_wmem,
2337 .sysctl_rmem = sysctl_tcp_rmem,
2338 .max_header = MAX_TCP_HEADER,
2339 .obj_size = sizeof(struct tcp_sock),
3ab5aee7 2340 .slab_flags = SLAB_DESTROY_BY_RCU,
6d6ee43e 2341 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 2342 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 2343 .h.hashinfo = &tcp_hashinfo,
7ba42910 2344 .no_autobind = true,
2345#ifdef CONFIG_COMPAT
2346 .compat_setsockopt = compat_tcp_setsockopt,
2347 .compat_getsockopt = compat_tcp_getsockopt,
d1a4c0b3 2348#endif
c1e64e29 2349 .diag_destroy = tcp_abort,
1da177e4 2350};
4bc2f18b 2351EXPORT_SYMBOL(tcp_prot);
1da177e4 2352
2353static void __net_exit tcp_sk_exit(struct net *net)
2354{
2355 int cpu;
2356
2357 for_each_possible_cpu(cpu)
2358 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2359 free_percpu(net->ipv4.tcp_sk);
2360}
2361
2362static int __net_init tcp_sk_init(struct net *net)
2363{
2364 int res, cpu;
2365
2366 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2367 if (!net->ipv4.tcp_sk)
2368 return -ENOMEM;
2369
2370 for_each_possible_cpu(cpu) {
2371 struct sock *sk;
2372
2373 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2374 IPPROTO_TCP, net);
2375 if (res)
2376 goto fail;
2377 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2378 }
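/* The per-cpu control sockets created above are what tcp_v4_send_reset()
 * and tcp_v4_send_ack() transmit through when replying on behalf of no
 * particular socket (resets, time-wait ACKs).
 */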
49213555 2379
5d134f1c 2380 net->ipv4.sysctl_tcp_ecn = 2;
2381 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2382
b0f9ca53 2383 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
6b58e0a5 2384 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
05cbc0db 2385 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
046ee902 2386
13b287e8 2387 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
9bd6861b 2388 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
b840d15d 2389 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
13b287e8 2390
6fa25166 2391 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
7c083ecb 2392 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
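/* These per-namespace defaults replace the former global sysctls:
 * TCP_SYN_RETRIES is 6 (roughly 127 seconds of active-open
 * retransmission) and TCP_SYNACK_RETRIES is 5 (roughly 63 seconds
 * before a passive open is abandoned).
 */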
6fa25166 2393
49213555 2394 return 0;
2395fail:
2396 tcp_sk_exit(net);
2397
2398 return res;
2399}
2400
2401static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2402{
2403 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2404}
2405
2406static struct pernet_operations __net_initdata tcp_sk_ops = {
2407 .init = tcp_sk_init,
2408 .exit = tcp_sk_exit,
2409 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
2410};
2411
9b0f976f 2412void __init tcp_v4_init(void)
1da177e4 2413{
5caea4ea 2414 inet_hashinfo_init(&tcp_hashinfo);
6a1b3054 2415 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 2416 panic("Failed to create the TCP control socket.\n");
1da177e4 2417}