net/ipv4/tcp_ipv4.c
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
1da177e4
LT
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
60236fdd 37 * request_sock handling and moved
1da177e4
LT
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
caa20d9a 40 * Added new listen semantics.
1da177e4
LT
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
afd46503 53#define pr_fmt(fmt) "TCP: " fmt
1da177e4 54
eb4dea58 55#include <linux/bottom_half.h>
1da177e4
LT
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
5a0e3ad6 64#include <linux/slab.h>
1da177e4 65
457c4cbc 66#include <net/net_namespace.h>
1da177e4 67#include <net/icmp.h>
304a1618 68#include <net/inet_hashtables.h>
1da177e4 69#include <net/tcp.h>
20380731 70#include <net/transp_v6.h>
1da177e4
LT
71#include <net/ipv6.h>
72#include <net/inet_common.h>
6d6ee43e 73#include <net/timewait_sock.h>
1da177e4 74#include <net/xfrm.h>
6e5714ea 75#include <net/secure_seq.h>
076bb0c8 76#include <net/busy_poll.h>
1da177e4
LT
77
78#include <linux/inet.h>
79#include <linux/ipv6.h>
80#include <linux/stddef.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83
cf80e0e4 84#include <crypto/hash.h>
cfb6eeb4
YH
85#include <linux/scatterlist.h>
86
ab32ea5d 87int sysctl_tcp_low_latency __read_mostly;
1da177e4 88
cfb6eeb4 89#ifdef CONFIG_TCP_MD5SIG
a915da9b 90static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
318cf7aa 91 __be32 daddr, __be32 saddr, const struct tcphdr *th);
cfb6eeb4
YH
92#endif
93
5caea4ea 94struct inet_hashinfo tcp_hashinfo;
4bc2f18b 95EXPORT_SYMBOL(tcp_hashinfo);
1da177e4 96
95a22cae 97static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
1da177e4 98{
eddc9ec5
ACM
99 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
100 ip_hdr(skb)->saddr,
aa8223c7 101 tcp_hdr(skb)->dest,
95a22cae 102 tcp_hdr(skb)->source, tsoff);
1da177e4
LT
103}
104
6d6ee43e
ACM
105int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
106{
107 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
108 struct tcp_sock *tp = tcp_sk(sk);
109
110 /* With PAWS, it is safe from the viewpoint
111 of data integrity. Even without PAWS it is safe provided sequence
112 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
113
114 Actually, the idea is close to VJ's one, only timestamp cache is
115 held not per host, but per port pair and TW bucket is used as state
116 holder.
117
118 If TW bucket has been already destroyed we fall back to VJ's scheme
119 and use initial timestamp retrieved from peer table.
120 */
121 if (tcptw->tw_ts_recent_stamp &&
56ab6b93 122 (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
9d729f72 123 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
6d6ee43e
ACM
124 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
125 if (tp->write_seq == 0)
126 tp->write_seq = 1;
127 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
128 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
129 sock_hold(sktw);
130 return 1;
131 }
132
133 return 0;
134}
6d6ee43e
ACM
135EXPORT_SYMBOL_GPL(tcp_twsk_unique);
136
1da177e4
LT
137/* This will initiate an outgoing connection. */
138int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
139{
2d7192d6 140 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
1da177e4
LT
141 struct inet_sock *inet = inet_sk(sk);
142 struct tcp_sock *tp = tcp_sk(sk);
dca8b089 143 __be16 orig_sport, orig_dport;
bada8adc 144 __be32 daddr, nexthop;
da905bd1 145 struct flowi4 *fl4;
2d7192d6 146 struct rtable *rt;
1da177e4 147 int err;
f6d8bd05 148 struct ip_options_rcu *inet_opt;
1946e672 149 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
1da177e4
LT
150
151 if (addr_len < sizeof(struct sockaddr_in))
152 return -EINVAL;
153
154 if (usin->sin_family != AF_INET)
155 return -EAFNOSUPPORT;
156
157 nexthop = daddr = usin->sin_addr.s_addr;
f6d8bd05 158 inet_opt = rcu_dereference_protected(inet->inet_opt,
1e1d04e6 159 lockdep_sock_is_held(sk));
f6d8bd05 160 if (inet_opt && inet_opt->opt.srr) {
1da177e4
LT
161 if (!daddr)
162 return -EINVAL;
f6d8bd05 163 nexthop = inet_opt->opt.faddr;
1da177e4
LT
164 }
165
dca8b089
DM
166 orig_sport = inet->inet_sport;
167 orig_dport = usin->sin_port;
da905bd1
DM
168 fl4 = &inet->cork.fl.u.ip4;
169 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
b23dd4fe
DM
170 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
171 IPPROTO_TCP,
0e0d44ab 172 orig_sport, orig_dport, sk);
b23dd4fe
DM
173 if (IS_ERR(rt)) {
174 err = PTR_ERR(rt);
175 if (err == -ENETUNREACH)
f1d8cba6 176 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
b23dd4fe 177 return err;
584bdf8c 178 }
1da177e4
LT
179
180 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
181 ip_rt_put(rt);
182 return -ENETUNREACH;
183 }
184
f6d8bd05 185 if (!inet_opt || !inet_opt->opt.srr)
da905bd1 186 daddr = fl4->daddr;
1da177e4 187
c720c7e8 188 if (!inet->inet_saddr)
da905bd1 189 inet->inet_saddr = fl4->saddr;
d1e559d0 190 sk_rcv_saddr_set(sk, inet->inet_saddr);
1da177e4 191
c720c7e8 192 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
1da177e4
LT
193 /* Reset inherited state */
194 tp->rx_opt.ts_recent = 0;
195 tp->rx_opt.ts_recent_stamp = 0;
ee995283
PE
196 if (likely(!tp->repair))
197 tp->write_seq = 0;
1da177e4
LT
198 }
199
1946e672 200 if (tcp_death_row->sysctl_tw_recycle &&
81166dd6
DM
201 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
202 tcp_fetch_timewait_stamp(sk, &rt->dst);
1da177e4 203
c720c7e8 204 inet->inet_dport = usin->sin_port;
d1e559d0 205 sk_daddr_set(sk, daddr);
1da177e4 206
d83d8461 207 inet_csk(sk)->icsk_ext_hdr_len = 0;
f6d8bd05
ED
208 if (inet_opt)
209 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1da177e4 210
bee7ca9e 211 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
1da177e4
LT
212
213 /* Socket identity is still unknown (sport may be zero).
 214 * However we set state to SYN-SENT and, without releasing the socket
 215 * lock, select a source port, enter ourselves into the hash tables and
216 * complete initialization after this.
217 */
218 tcp_set_state(sk, TCP_SYN_SENT);
1946e672 219 err = inet_hash_connect(tcp_death_row, sk);
1da177e4
LT
220 if (err)
221 goto failure;
222
877d1f62 223 sk_set_txhash(sk);
9e7ceb06 224
da905bd1 225 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
b23dd4fe
DM
226 inet->inet_sport, inet->inet_dport, sk);
227 if (IS_ERR(rt)) {
228 err = PTR_ERR(rt);
229 rt = NULL;
1da177e4 230 goto failure;
b23dd4fe 231 }
1da177e4 232 /* OK, now commit destination to socket. */
bcd76111 233 sk->sk_gso_type = SKB_GSO_TCPV4;
d8d1f30b 234 sk_setup_caps(sk, &rt->dst);
19f6d3f3 235 rt = NULL;
1da177e4 236
ee995283 237 if (!tp->write_seq && likely(!tp->repair))
c720c7e8
ED
238 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 inet->inet_daddr,
240 inet->inet_sport,
95a22cae
FW
241 usin->sin_port,
242 &tp->tsoffset);
1da177e4 243
c720c7e8 244 inet->inet_id = tp->write_seq ^ jiffies;
1da177e4 245
19f6d3f3
WW
246 if (tcp_fastopen_defer_connect(sk, &err))
247 return err;
248 if (err)
249 goto failure;
250
2b916477 251 err = tcp_connect(sk);
ee995283 252
1da177e4
LT
253 if (err)
254 goto failure;
255
256 return 0;
257
258failure:
7174259e
ACM
259 /*
260 * This unhashes the socket and releases the local port,
261 * if necessary.
262 */
1da177e4
LT
263 tcp_set_state(sk, TCP_CLOSE);
264 ip_rt_put(rt);
265 sk->sk_route_caps = 0;
c720c7e8 266 inet->inet_dport = 0;
1da177e4
LT
267 return err;
268}
4bc2f18b 269EXPORT_SYMBOL(tcp_v4_connect);
1da177e4 270
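/*
 * Illustrative sketch (not part of this file): the user-space path that ends
 * up in tcp_v4_connect() above is an ordinary AF_INET SOCK_STREAM connect().
 * A minimal example, assuming a reachable example peer at 192.0.2.1:80;
 * standard POSIX socket calls only.
 */
#if 0	/* example only, user space */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int example_connect(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(80);			/* destination port, network order */
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* example address */

	/* This connect() reaches tcp_v4_connect() in the kernel: route lookup,
	 * source port selection, ISN generation and transmission of the SYN.
	 */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		perror("connect");
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}
#endif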
1da177e4 271/*
563d34d0
ED
272 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
273 * It can be called through tcp_release_cb() if socket was owned by user
274 * at the time tcp_v4_err() was called to handle ICMP message.
1da177e4 275 */
4fab9071 276void tcp_v4_mtu_reduced(struct sock *sk)
1da177e4
LT
277{
278 struct dst_entry *dst;
279 struct inet_sock *inet = inet_sk(sk);
563d34d0 280 u32 mtu = tcp_sk(sk)->mtu_info;
1da177e4 281
80d0a69f
DM
282 dst = inet_csk_update_pmtu(sk, mtu);
283 if (!dst)
1da177e4
LT
284 return;
285
1da177e4
LT
 286 /* Something is about to go wrong... Remember the soft error
 287 * for the case this connection will not be able to recover.
288 */
289 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
290 sk->sk_err_soft = EMSGSIZE;
291
292 mtu = dst_mtu(dst);
293
294 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
482fc609 295 ip_sk_accept_pmtu(sk) &&
d83d8461 296 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
1da177e4
LT
297 tcp_sync_mss(sk, mtu);
298
299 /* Resend the TCP packet because it's
300 * clear that the old packet has been
301 * dropped. This is the new "fast" path mtu
302 * discovery.
303 */
304 tcp_simple_retransmit(sk);
305 } /* else let the usual retransmit timer handle it */
306}
4fab9071 307EXPORT_SYMBOL(tcp_v4_mtu_reduced);
1da177e4 308
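/*
 * Illustrative sketch (not part of this file): once tcp_v4_mtu_reduced()
 * above has absorbed an ICMP_FRAG_NEEDED indication, the updated path MTU
 * is visible to user space on a connected socket through the IP_MTU socket
 * option. Assumes the libc headers expose IP_MTU (as glibc does).
 */
#if 0	/* example only, user space */
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

static void print_path_mtu(int fd)
{
	int mtu = 0;
	socklen_t len = sizeof(mtu);

	/* IP_MTU is only valid on connected sockets; it reports the path MTU
	 * currently cached for the destination.
	 */
	if (getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
		printf("path mtu: %d\n", mtu);
	else
		perror("getsockopt(IP_MTU)");
}
#endif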
55be7a9c
DM
309static void do_redirect(struct sk_buff *skb, struct sock *sk)
310{
311 struct dst_entry *dst = __sk_dst_check(sk, 0);
312
1ed5c48f 313 if (dst)
6700c270 314 dst->ops->redirect(dst, sk, skb);
55be7a9c
DM
315}
316
26e37360
ED
317
318/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
9cf74903 319void tcp_req_err(struct sock *sk, u32 seq, bool abort)
26e37360
ED
320{
321 struct request_sock *req = inet_reqsk(sk);
322 struct net *net = sock_net(sk);
323
324 /* ICMPs are not backlogged, hence we cannot get
325 * an established socket here.
326 */
26e37360 327 if (seq != tcp_rsk(req)->snt_isn) {
02a1d6e7 328 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
9cf74903 329 } else if (abort) {
26e37360
ED
330 /*
331 * Still in SYN_RECV, just remove it silently.
332 * There is no good way to pass the error to the newly
333 * created socket, and POSIX does not want network
334 * errors returned from accept().
335 */
c6973669 336 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
9caad864 337 tcp_listendrop(req->rsk_listener);
26e37360 338 }
ef84d8ce 339 reqsk_put(req);
26e37360
ED
340}
341EXPORT_SYMBOL(tcp_req_err);
342
1da177e4
LT
343/*
344 * This routine is called by the ICMP module when it gets some
345 * sort of error condition. If err < 0 then the socket should
346 * be closed and the error returned to the user. If err > 0
347 * it's just the icmp type << 8 | icmp code. After adjustment
348 * header points to the first 8 bytes of the tcp header. We need
349 * to find the appropriate port.
350 *
351 * The locking strategy used here is very "optimistic". When
352 * someone else accesses the socket the ICMP is just dropped
353 * and for some paths there is no check at all.
354 * A more general error queue to queue errors for later handling
355 * is probably better.
356 *
357 */
358
4d1a2d9e 359void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
1da177e4 360{
b71d1d42 361 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
4d1a2d9e 362 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
f1ecd5d9 363 struct inet_connection_sock *icsk;
1da177e4
LT
364 struct tcp_sock *tp;
365 struct inet_sock *inet;
4d1a2d9e
DL
366 const int type = icmp_hdr(icmp_skb)->type;
367 const int code = icmp_hdr(icmp_skb)->code;
1da177e4 368 struct sock *sk;
f1ecd5d9 369 struct sk_buff *skb;
0a672f74
YC
370 struct request_sock *fastopen;
371 __u32 seq, snd_una;
f1ecd5d9 372 __u32 remaining;
1da177e4 373 int err;
4d1a2d9e 374 struct net *net = dev_net(icmp_skb->dev);
1da177e4 375
26e37360
ED
376 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
377 th->dest, iph->saddr, ntohs(th->source),
378 inet_iif(icmp_skb));
1da177e4 379 if (!sk) {
5d3848bc 380 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
1da177e4
LT
381 return;
382 }
383 if (sk->sk_state == TCP_TIME_WAIT) {
9469c7b4 384 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
385 return;
386 }
26e37360
ED
387 seq = ntohl(th->seq);
388 if (sk->sk_state == TCP_NEW_SYN_RECV)
9cf74903
ED
389 return tcp_req_err(sk, seq,
390 type == ICMP_PARAMETERPROB ||
391 type == ICMP_TIME_EXCEEDED ||
392 (type == ICMP_DEST_UNREACH &&
393 (code == ICMP_NET_UNREACH ||
394 code == ICMP_HOST_UNREACH)));
1da177e4
LT
395
396 bh_lock_sock(sk);
397 /* If too many ICMPs get dropped on busy
398 * servers this needs to be solved differently.
563d34d0
ED
399 * We do take care of PMTU discovery (RFC1191) special case :
400 * we can receive locally generated ICMP messages while socket is held.
1da177e4 401 */
b74aa930
ED
402 if (sock_owned_by_user(sk)) {
403 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
02a1d6e7 404 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
b74aa930 405 }
1da177e4
LT
406 if (sk->sk_state == TCP_CLOSE)
407 goto out;
408
97e3ecd1 409 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
02a1d6e7 410 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
97e3ecd1 411 goto out;
412 }
413
f1ecd5d9 414 icsk = inet_csk(sk);
1da177e4 415 tp = tcp_sk(sk);
0a672f74
YC
 416 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
417 fastopen = tp->fastopen_rsk;
418 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
1da177e4 419 if (sk->sk_state != TCP_LISTEN &&
0a672f74 420 !between(seq, snd_una, tp->snd_nxt)) {
02a1d6e7 421 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
1da177e4
LT
422 goto out;
423 }
424
425 switch (type) {
55be7a9c
DM
426 case ICMP_REDIRECT:
427 do_redirect(icmp_skb, sk);
428 goto out;
1da177e4
LT
429 case ICMP_SOURCE_QUENCH:
430 /* Just silently ignore these. */
431 goto out;
432 case ICMP_PARAMETERPROB:
433 err = EPROTO;
434 break;
435 case ICMP_DEST_UNREACH:
436 if (code > NR_ICMP_UNREACH)
437 goto out;
438
439 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
0d4f0608
ED
440 /* We are not interested in TCP_LISTEN and open_requests
 441 * (SYN-ACKs sent out by Linux are always <576 bytes so
442 * they should go through unfragmented).
443 */
444 if (sk->sk_state == TCP_LISTEN)
445 goto out;
446
563d34d0 447 tp->mtu_info = info;
144d56e9 448 if (!sock_owned_by_user(sk)) {
563d34d0 449 tcp_v4_mtu_reduced(sk);
144d56e9 450 } else {
7aa5470c 451 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
144d56e9
ED
452 sock_hold(sk);
453 }
1da177e4
LT
454 goto out;
455 }
456
457 err = icmp_err_convert[code].errno;
f1ecd5d9
DL
458 /* check if icmp_skb allows revert of backoff
459 * (see draft-zimmermann-tcp-lcd) */
460 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
461 break;
462 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
0a672f74 463 !icsk->icsk_backoff || fastopen)
f1ecd5d9
DL
464 break;
465
8f49c270
DM
466 if (sock_owned_by_user(sk))
467 break;
468
f1ecd5d9 469 icsk->icsk_backoff--;
fcdd1cf4
ED
470 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
471 TCP_TIMEOUT_INIT;
472 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
f1ecd5d9
DL
473
474 skb = tcp_write_queue_head(sk);
475 BUG_ON(!skb);
476
7faee5c0
ED
477 remaining = icsk->icsk_rto -
478 min(icsk->icsk_rto,
479 tcp_time_stamp - tcp_skb_timestamp(skb));
f1ecd5d9
DL
480
481 if (remaining) {
482 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
483 remaining, TCP_RTO_MAX);
f1ecd5d9
DL
484 } else {
485 /* RTO revert clocked out retransmission.
486 * Will retransmit now */
487 tcp_retransmit_timer(sk);
488 }
489
1da177e4
LT
490 break;
491 case ICMP_TIME_EXCEEDED:
492 err = EHOSTUNREACH;
493 break;
494 default:
495 goto out;
496 }
497
498 switch (sk->sk_state) {
1da177e4 499 case TCP_SYN_SENT:
0a672f74
YC
500 case TCP_SYN_RECV:
 501 /* Only in fast or simultaneous open. If a fast open socket is
 502 * already accepted it is treated as a connected one below.
503 */
51456b29 504 if (fastopen && !fastopen->sk)
0a672f74
YC
505 break;
506
1da177e4 507 if (!sock_owned_by_user(sk)) {
1da177e4
LT
508 sk->sk_err = err;
509
510 sk->sk_error_report(sk);
511
512 tcp_done(sk);
513 } else {
514 sk->sk_err_soft = err;
515 }
516 goto out;
517 }
518
519 /* If we've already connected we will keep trying
520 * until we time out, or the user gives up.
521 *
 522 * rfc1122 4.2.3.9 allows us to consider as hard errors
523 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
524 * but it is obsoleted by pmtu discovery).
525 *
526 * Note, that in modern internet, where routing is unreliable
527 * and in each dark corner broken firewalls sit, sending random
 528 * errors ordered by their masters, even these two messages finally lose
529 * their original sense (even Linux sends invalid PORT_UNREACHs)
530 *
531 * Now we are in compliance with RFCs.
532 * --ANK (980905)
533 */
534
535 inet = inet_sk(sk);
536 if (!sock_owned_by_user(sk) && inet->recverr) {
537 sk->sk_err = err;
538 sk->sk_error_report(sk);
539 } else { /* Only an error on timeout */
540 sk->sk_err_soft = err;
541 }
542
543out:
544 bh_unlock_sock(sk);
545 sock_put(sk);
546}
547
28850dc7 548void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1da177e4 549{
aa8223c7 550 struct tcphdr *th = tcp_hdr(skb);
1da177e4 551
84fa7933 552 if (skb->ip_summed == CHECKSUM_PARTIAL) {
419f9f89 553 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
663ead3b 554 skb->csum_start = skb_transport_header(skb) - skb->head;
ff1dcadb 555 skb->csum_offset = offsetof(struct tcphdr, check);
1da177e4 556 } else {
419f9f89 557 th->check = tcp_v4_check(skb->len, saddr, daddr,
07f0757a 558 csum_partial(th,
1da177e4
LT
559 th->doff << 2,
560 skb->csum));
561 }
562}
563
419f9f89 564/* This routine computes an IPv4 TCP checksum. */
bb296246 565void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
419f9f89 566{
cf533ea5 567 const struct inet_sock *inet = inet_sk(sk);
419f9f89
HX
568
569 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
570}
4bc2f18b 571EXPORT_SYMBOL(tcp_v4_send_check);
419f9f89 572
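/*
 * Illustrative sketch (not part of this file): the software fallback in
 * __tcp_v4_send_check() above folds a 16-bit one's-complement sum over the
 * IPv4 pseudo-header (saddr, daddr, zero, protocol, TCP length) followed by
 * the TCP header and payload. A stand-alone user-space version of that
 * computation, assuming the addresses and the segment bytes are already in
 * network byte order and the segment is contiguous in memory.
 */
#if 0	/* example only, user space */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t csum_add_bytes(uint32_t sum, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];	/* 16-bit big-endian words */
		p += 2;
		len -= 2;
	}
	if (len)					/* odd trailing byte, zero padded */
		sum += (uint32_t)p[0] << 8;
	return sum;
}

static uint16_t tcp4_checksum(uint32_t saddr, uint32_t daddr,
			      const void *segment, size_t seg_len)
{
	uint8_t pseudo[12];
	uint32_t sum;

	memcpy(&pseudo[0], &saddr, 4);			/* source address */
	memcpy(&pseudo[4], &daddr, 4);			/* destination address */
	pseudo[8] = 0;					/* zero pad */
	pseudo[9] = 6;					/* IPPROTO_TCP */
	pseudo[10] = (seg_len >> 8) & 0xff;		/* TCP length (header + data) */
	pseudo[11] = seg_len & 0xff;

	sum = csum_add_bytes(0, pseudo, sizeof(pseudo));
	sum = csum_add_bytes(sum, segment, seg_len);	/* with the check field set to 0 */

	while (sum >> 16)				/* fold the carries */
		sum = (sum & 0xffff) + (sum >> 16);

	/* Store the result in the packet high byte first (htons() on LE hosts). */
	return (uint16_t)~sum;
}
#endif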
1da177e4
LT
573/*
574 * This routine will send an RST to the other tcp.
575 *
576 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
577 * for reset.
578 * Answer: if a packet caused RST, it is not for a socket
579 * existing in our system, if it is matched to a socket,
580 * it is just duplicate segment or bug in other side's TCP.
 581 * So we build the reply based only on parameters
 582 * that arrived with the segment.
583 * Exception: precedence violation. We do not implement it in any case.
584 */
585
a00e7444 586static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
1da177e4 587{
cf533ea5 588 const struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4
YH
589 struct {
590 struct tcphdr th;
591#ifdef CONFIG_TCP_MD5SIG
714e85be 592 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
cfb6eeb4
YH
593#endif
594 } rep;
1da177e4 595 struct ip_reply_arg arg;
cfb6eeb4 596#ifdef CONFIG_TCP_MD5SIG
e46787f0 597 struct tcp_md5sig_key *key = NULL;
658ddaaf
SL
598 const __u8 *hash_location = NULL;
599 unsigned char newhash[16];
600 int genhash;
601 struct sock *sk1 = NULL;
cfb6eeb4 602#endif
a86b1e30 603 struct net *net;
1da177e4
LT
604
605 /* Never send a reset in response to a reset. */
606 if (th->rst)
607 return;
608
c3658e8d
ED
609 /* If sk not NULL, it means we did a successful lookup and incoming
610 * route had to be correct. prequeue might have dropped our dst.
611 */
612 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
1da177e4
LT
613 return;
614
615 /* Swap the send and the receive. */
cfb6eeb4
YH
616 memset(&rep, 0, sizeof(rep));
617 rep.th.dest = th->source;
618 rep.th.source = th->dest;
619 rep.th.doff = sizeof(struct tcphdr) / 4;
620 rep.th.rst = 1;
1da177e4
LT
621
622 if (th->ack) {
cfb6eeb4 623 rep.th.seq = th->ack_seq;
1da177e4 624 } else {
cfb6eeb4
YH
625 rep.th.ack = 1;
626 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
627 skb->len - (th->doff << 2));
1da177e4
LT
628 }
629
7174259e 630 memset(&arg, 0, sizeof(arg));
cfb6eeb4
YH
631 arg.iov[0].iov_base = (unsigned char *)&rep;
632 arg.iov[0].iov_len = sizeof(rep.th);
633
0f85feae 634 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
cfb6eeb4 635#ifdef CONFIG_TCP_MD5SIG
3b24d854 636 rcu_read_lock();
658ddaaf 637 hash_location = tcp_parse_md5sig_option(th);
271c3b9b 638 if (sk && sk_fullsock(sk)) {
e46787f0
FW
639 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
640 &ip_hdr(skb)->saddr, AF_INET);
641 } else if (hash_location) {
658ddaaf
SL
642 /*
 643 * The active side is lost. Try to find the listening socket through
 644 * the source port, and then find the md5 key through that listening socket.
 645 * We do not lose any security here:
 646 * the incoming packet is checked against the md5 hash of the key we find,
 647 * and no RST is generated if the md5 hash doesn't match.
648 */
a583636a
CG
649 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
650 ip_hdr(skb)->saddr,
da5e3630 651 th->source, ip_hdr(skb)->daddr,
658ddaaf
SL
652 ntohs(th->source), inet_iif(skb));
653 /* don't send rst if it can't find key */
654 if (!sk1)
3b24d854
ED
655 goto out;
656
658ddaaf
SL
657 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
658 &ip_hdr(skb)->saddr, AF_INET);
659 if (!key)
3b24d854
ED
660 goto out;
661
658ddaaf 662
39f8e58e 663 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
658ddaaf 664 if (genhash || memcmp(hash_location, newhash, 16) != 0)
3b24d854
ED
665 goto out;
666
658ddaaf
SL
667 }
668
cfb6eeb4
YH
669 if (key) {
670 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
671 (TCPOPT_NOP << 16) |
672 (TCPOPT_MD5SIG << 8) |
673 TCPOLEN_MD5SIG);
674 /* Update length and the length the header thinks exists */
675 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
676 rep.th.doff = arg.iov[0].iov_len / 4;
677
49a72dfb 678 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
78e645cb
IJ
679 key, ip_hdr(skb)->saddr,
680 ip_hdr(skb)->daddr, &rep.th);
cfb6eeb4
YH
681 }
682#endif
eddc9ec5
ACM
683 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
684 ip_hdr(skb)->saddr, /* XXX */
52cd5750 685 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1da177e4 686 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
271c3b9b
FW
687 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
688
e2446eaa 689 /* When socket is gone, all binding information is lost.
4c675258
AK
 690 * Routing might fail in this case. No choice here: if we choose to force
691 * input interface, we will misroute in case of asymmetric route.
e2446eaa 692 */
4c675258
AK
693 if (sk)
694 arg.bound_dev_if = sk->sk_bound_dev_if;
1da177e4 695
271c3b9b
FW
696 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
697 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
698
66b13d99 699 arg.tos = ip_hdr(skb)->tos;
e2d118a1 700 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
47dcc20a 701 local_bh_disable();
bdbbb852
ED
702 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
703 skb, &TCP_SKB_CB(skb)->header.h4.opt,
24a2d43d
ED
704 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
705 &arg, arg.iov[0].iov_len);
1da177e4 706
90bbcc60
ED
707 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
708 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
47dcc20a 709 local_bh_enable();
658ddaaf
SL
710
711#ifdef CONFIG_TCP_MD5SIG
3b24d854
ED
712out:
713 rcu_read_unlock();
658ddaaf 714#endif
1da177e4
LT
715}
716
717/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
718 outside socket context is ugly, certainly. What can I do?
719 */
720
e2d118a1 721static void tcp_v4_send_ack(const struct sock *sk,
e62a123b 722 struct sk_buff *skb, u32 seq, u32 ack,
ee684b6f 723 u32 win, u32 tsval, u32 tsecr, int oif,
88ef4a5a 724 struct tcp_md5sig_key *key,
66b13d99 725 int reply_flags, u8 tos)
1da177e4 726{
cf533ea5 727 const struct tcphdr *th = tcp_hdr(skb);
1da177e4
LT
728 struct {
729 struct tcphdr th;
714e85be 730 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
cfb6eeb4 731#ifdef CONFIG_TCP_MD5SIG
714e85be 732 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
cfb6eeb4
YH
733#endif
734 ];
1da177e4 735 } rep;
e2d118a1 736 struct net *net = sock_net(sk);
1da177e4
LT
737 struct ip_reply_arg arg;
738
739 memset(&rep.th, 0, sizeof(struct tcphdr));
7174259e 740 memset(&arg, 0, sizeof(arg));
1da177e4
LT
741
742 arg.iov[0].iov_base = (unsigned char *)&rep;
743 arg.iov[0].iov_len = sizeof(rep.th);
ee684b6f 744 if (tsecr) {
cfb6eeb4
YH
745 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
746 (TCPOPT_TIMESTAMP << 8) |
747 TCPOLEN_TIMESTAMP);
ee684b6f
AV
748 rep.opt[1] = htonl(tsval);
749 rep.opt[2] = htonl(tsecr);
cb48cfe8 750 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
1da177e4
LT
751 }
752
753 /* Swap the send and the receive. */
754 rep.th.dest = th->source;
755 rep.th.source = th->dest;
756 rep.th.doff = arg.iov[0].iov_len / 4;
757 rep.th.seq = htonl(seq);
758 rep.th.ack_seq = htonl(ack);
759 rep.th.ack = 1;
760 rep.th.window = htons(win);
761
cfb6eeb4 762#ifdef CONFIG_TCP_MD5SIG
cfb6eeb4 763 if (key) {
ee684b6f 764 int offset = (tsecr) ? 3 : 0;
cfb6eeb4
YH
765
766 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
767 (TCPOPT_NOP << 16) |
768 (TCPOPT_MD5SIG << 8) |
769 TCPOLEN_MD5SIG);
770 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
771 rep.th.doff = arg.iov[0].iov_len/4;
772
49a72dfb 773 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
90b7e112
AL
774 key, ip_hdr(skb)->saddr,
775 ip_hdr(skb)->daddr, &rep.th);
cfb6eeb4
YH
776 }
777#endif
88ef4a5a 778 arg.flags = reply_flags;
eddc9ec5
ACM
779 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
780 ip_hdr(skb)->saddr, /* XXX */
1da177e4
LT
781 arg.iov[0].iov_len, IPPROTO_TCP, 0);
782 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
9501f972
YH
783 if (oif)
784 arg.bound_dev_if = oif;
66b13d99 785 arg.tos = tos;
e2d118a1 786 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
47dcc20a 787 local_bh_disable();
bdbbb852
ED
788 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
789 skb, &TCP_SKB_CB(skb)->header.h4.opt,
24a2d43d
ED
790 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
791 &arg, arg.iov[0].iov_len);
1da177e4 792
90bbcc60 793 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
47dcc20a 794 local_bh_enable();
1da177e4
LT
795}
796
797static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
798{
8feaf0c0 799 struct inet_timewait_sock *tw = inet_twsk(sk);
cfb6eeb4 800 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1da177e4 801
e2d118a1 802 tcp_v4_send_ack(sk, skb,
e62a123b 803 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
7174259e 804 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
ee684b6f 805 tcp_time_stamp + tcptw->tw_ts_offset,
9501f972
YH
806 tcptw->tw_ts_recent,
807 tw->tw_bound_dev_if,
88ef4a5a 808 tcp_twsk_md5_key(tcptw),
66b13d99
ED
809 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
810 tw->tw_tos
9501f972 811 );
1da177e4 812
8feaf0c0 813 inet_twsk_put(tw);
1da177e4
LT
814}
815
a00e7444 816static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
7174259e 817 struct request_sock *req)
1da177e4 818{
168a8f58
JC
819 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
820 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
821 */
e62a123b
ED
822 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
823 tcp_sk(sk)->snd_nxt;
824
20a2b49f
ED
825 /* RFC 7323 2.3
826 * The window field (SEG.WND) of every outgoing segment, with the
827 * exception of <SYN> segments, MUST be right-shifted by
828 * Rcv.Wind.Shift bits:
829 */
e2d118a1 830 tcp_v4_send_ack(sk, skb, seq,
20a2b49f
ED
831 tcp_rsk(req)->rcv_nxt,
832 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
95a22cae 833 tcp_time_stamp + tcp_rsk(req)->ts_off,
9501f972
YH
834 req->ts_recent,
835 0,
a915da9b
ED
836 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
837 AF_INET),
66b13d99
ED
838 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
839 ip_hdr(skb)->tos);
1da177e4
LT
840}
841
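/*
 * Illustrative sketch (not part of this file): the RFC 7323 2.3 right shift
 * applied in tcp_v4_reqsk_send_ack() above. With an effective receive window
 * of 262144 bytes and Rcv.Wind.Shift == 7, the 16-bit window field carries
 * 262144 >> 7 == 2048, and the peer scales it back up by the same factor.
 */
#if 0	/* example only */
#include <stdint.h>

static uint16_t scaled_window(uint32_t rcv_wnd, uint8_t rcv_wscale)
{
	uint32_t w = rcv_wnd >> rcv_wscale;	/* SEG.WND, per RFC 7323 2.3 */

	return (w > 0xffff) ? 0xffff : (uint16_t)w;	/* clamp to the 16-bit field */
}
#endif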
1da177e4 842/*
9bf1d83e 843 * Send a SYN-ACK after having received a SYN.
60236fdd 844 * This still operates on a request_sock only, not on a big
1da177e4
LT
845 * socket.
846 */
0f935dbe 847static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
d6274bd8 848 struct flowi *fl,
72659ecc 849 struct request_sock *req,
ca6fb065 850 struct tcp_fastopen_cookie *foc,
b3d05147 851 enum tcp_synack_type synack_type)
1da177e4 852{
2e6599cb 853 const struct inet_request_sock *ireq = inet_rsk(req);
6bd023f3 854 struct flowi4 fl4;
1da177e4 855 int err = -1;
d41db5af 856 struct sk_buff *skb;
1da177e4
LT
857
858 /* First, grab a route. */
ba3f7f04 859 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
fd80eb94 860 return -1;
1da177e4 861
b3d05147 862 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
1da177e4
LT
863
864 if (skb) {
634fb979 865 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1da177e4 866
634fb979
ED
867 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
868 ireq->ir_rmt_addr,
2e6599cb 869 ireq->opt);
b9df3cb8 870 err = net_xmit_eval(err);
1da177e4
LT
871 }
872
1da177e4
LT
873 return err;
874}
875
876/*
60236fdd 877 * IPv4 request_sock destructor.
1da177e4 878 */
60236fdd 879static void tcp_v4_reqsk_destructor(struct request_sock *req)
1da177e4 880{
a51482bd 881 kfree(inet_rsk(req)->opt);
1da177e4
LT
882}
883
cfb6eeb4
YH
884#ifdef CONFIG_TCP_MD5SIG
885/*
886 * RFC2385 MD5 checksumming requires a mapping of
887 * IP address->MD5 Key.
888 * We need to maintain these in the sk structure.
889 */
890
891/* Find the Key structure for an address. */
b83e3deb 892struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
a915da9b
ED
893 const union tcp_md5_addr *addr,
894 int family)
cfb6eeb4 895{
fd3a154a 896 const struct tcp_sock *tp = tcp_sk(sk);
a915da9b 897 struct tcp_md5sig_key *key;
a915da9b 898 unsigned int size = sizeof(struct in_addr);
fd3a154a 899 const struct tcp_md5sig_info *md5sig;
cfb6eeb4 900
a8afca03
ED
901 /* caller either holds rcu_read_lock() or socket lock */
902 md5sig = rcu_dereference_check(tp->md5sig_info,
1e1d04e6 903 lockdep_sock_is_held(sk));
a8afca03 904 if (!md5sig)
cfb6eeb4 905 return NULL;
a915da9b
ED
906#if IS_ENABLED(CONFIG_IPV6)
907 if (family == AF_INET6)
908 size = sizeof(struct in6_addr);
909#endif
b67bfe0d 910 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
a915da9b
ED
911 if (key->family != family)
912 continue;
913 if (!memcmp(&key->addr, addr, size))
914 return key;
cfb6eeb4
YH
915 }
916 return NULL;
917}
a915da9b 918EXPORT_SYMBOL(tcp_md5_do_lookup);
cfb6eeb4 919
b83e3deb 920struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
fd3a154a 921 const struct sock *addr_sk)
cfb6eeb4 922{
b52e6921 923 const union tcp_md5_addr *addr;
a915da9b 924
b52e6921 925 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
a915da9b 926 return tcp_md5_do_lookup(sk, addr, AF_INET);
cfb6eeb4 927}
cfb6eeb4
YH
928EXPORT_SYMBOL(tcp_v4_md5_lookup);
929
cfb6eeb4 930/* This can be called on a newly created socket, from other files */
a915da9b
ED
931int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
932 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
cfb6eeb4
YH
933{
934 /* Add Key to the list */
b0a713e9 935 struct tcp_md5sig_key *key;
cfb6eeb4 936 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 937 struct tcp_md5sig_info *md5sig;
cfb6eeb4 938
c0353c7b 939 key = tcp_md5_do_lookup(sk, addr, family);
cfb6eeb4
YH
940 if (key) {
941 /* Pre-existing entry - just update that one. */
a915da9b 942 memcpy(key->key, newkey, newkeylen);
b0a713e9 943 key->keylen = newkeylen;
a915da9b
ED
944 return 0;
945 }
260fcbeb 946
a8afca03 947 md5sig = rcu_dereference_protected(tp->md5sig_info,
1e1d04e6 948 lockdep_sock_is_held(sk));
a915da9b
ED
949 if (!md5sig) {
950 md5sig = kmalloc(sizeof(*md5sig), gfp);
951 if (!md5sig)
cfb6eeb4 952 return -ENOMEM;
cfb6eeb4 953
a915da9b
ED
954 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
955 INIT_HLIST_HEAD(&md5sig->head);
a8afca03 956 rcu_assign_pointer(tp->md5sig_info, md5sig);
a915da9b 957 }
cfb6eeb4 958
5f3d9cb2 959 key = sock_kmalloc(sk, sizeof(*key), gfp);
a915da9b
ED
960 if (!key)
961 return -ENOMEM;
71cea17e 962 if (!tcp_alloc_md5sig_pool()) {
5f3d9cb2 963 sock_kfree_s(sk, key, sizeof(*key));
a915da9b 964 return -ENOMEM;
cfb6eeb4 965 }
a915da9b
ED
966
967 memcpy(key->key, newkey, newkeylen);
968 key->keylen = newkeylen;
969 key->family = family;
970 memcpy(&key->addr, addr,
971 (family == AF_INET6) ? sizeof(struct in6_addr) :
972 sizeof(struct in_addr));
973 hlist_add_head_rcu(&key->node, &md5sig->head);
cfb6eeb4
YH
974 return 0;
975}
a915da9b 976EXPORT_SYMBOL(tcp_md5_do_add);
cfb6eeb4 977
a915da9b 978int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
cfb6eeb4 979{
a915da9b
ED
980 struct tcp_md5sig_key *key;
981
c0353c7b 982 key = tcp_md5_do_lookup(sk, addr, family);
a915da9b
ED
983 if (!key)
984 return -ENOENT;
985 hlist_del_rcu(&key->node);
5f3d9cb2 986 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
a915da9b 987 kfree_rcu(key, rcu);
a915da9b 988 return 0;
cfb6eeb4 989}
a915da9b 990EXPORT_SYMBOL(tcp_md5_do_del);
cfb6eeb4 991
e0683e70 992static void tcp_clear_md5_list(struct sock *sk)
cfb6eeb4
YH
993{
994 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 995 struct tcp_md5sig_key *key;
b67bfe0d 996 struct hlist_node *n;
a8afca03 997 struct tcp_md5sig_info *md5sig;
cfb6eeb4 998
a8afca03
ED
999 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1000
b67bfe0d 1001 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
a915da9b 1002 hlist_del_rcu(&key->node);
5f3d9cb2 1003 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
a915da9b 1004 kfree_rcu(key, rcu);
cfb6eeb4
YH
1005 }
1006}
1007
7174259e
ACM
1008static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1009 int optlen)
cfb6eeb4
YH
1010{
1011 struct tcp_md5sig cmd;
1012 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
cfb6eeb4
YH
1013
1014 if (optlen < sizeof(cmd))
1015 return -EINVAL;
1016
7174259e 1017 if (copy_from_user(&cmd, optval, sizeof(cmd)))
cfb6eeb4
YH
1018 return -EFAULT;
1019
1020 if (sin->sin_family != AF_INET)
1021 return -EINVAL;
1022
64a124ed 1023 if (!cmd.tcpm_keylen)
a915da9b
ED
1024 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1025 AF_INET);
cfb6eeb4
YH
1026
1027 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1028 return -EINVAL;
1029
a915da9b
ED
1030 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1031 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1032 GFP_KERNEL);
cfb6eeb4
YH
1033}
1034
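/*
 * Illustrative sketch (not part of this file): the TCP_MD5SIG setsockopt
 * handled by tcp_v4_parse_md5_keys() above, as seen from user space.
 * Assumes <linux/tcp.h> provides struct tcp_md5sig and TCP_MD5SIG (as on
 * Linux); the peer address and key below are examples.
 */
#if 0	/* example only, user space */
#include <arpa/inet.h>
#include <linux/tcp.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static int set_tcp_md5_key(int fd, const char *peer_ip, const char *key)
{
	struct tcp_md5sig md5;
	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;

	memset(&md5, 0, sizeof(md5));
	sin->sin_family = AF_INET;
	inet_pton(AF_INET, peer_ip, &sin->sin_addr);	/* key is bound to this peer */

	md5.tcpm_keylen = strlen(key);			/* <= TCP_MD5SIG_MAXKEYLEN (80) */
	memcpy(md5.tcpm_key, key, md5.tcpm_keylen);

	/* Lands in tcp_v4_parse_md5_keys() via tcp_setsockopt(). Set it on the
	 * listening socket, or on an active socket before connect(), so the
	 * SYN already carries the RFC 2385 option.
	 */
	if (setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5)) < 0) {
		perror("setsockopt(TCP_MD5SIG)");
		return -1;
	}
	return 0;
}
#endif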
19689e38
ED
1035static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1036 __be32 daddr, __be32 saddr,
1037 const struct tcphdr *th, int nbytes)
cfb6eeb4 1038{
cfb6eeb4 1039 struct tcp4_pseudohdr *bp;
49a72dfb 1040 struct scatterlist sg;
19689e38 1041 struct tcphdr *_th;
cfb6eeb4 1042
19689e38 1043 bp = hp->scratch;
cfb6eeb4
YH
1044 bp->saddr = saddr;
1045 bp->daddr = daddr;
1046 bp->pad = 0;
076fb722 1047 bp->protocol = IPPROTO_TCP;
49a72dfb 1048 bp->len = cpu_to_be16(nbytes);
c7da57a1 1049
19689e38
ED
1050 _th = (struct tcphdr *)(bp + 1);
1051 memcpy(_th, th, sizeof(*th));
1052 _th->check = 0;
1053
1054 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1055 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1056 sizeof(*bp) + sizeof(*th));
cf80e0e4 1057 return crypto_ahash_update(hp->md5_req);
49a72dfb
AL
1058}
1059
a915da9b 1060static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
318cf7aa 1061 __be32 daddr, __be32 saddr, const struct tcphdr *th)
49a72dfb
AL
1062{
1063 struct tcp_md5sig_pool *hp;
cf80e0e4 1064 struct ahash_request *req;
49a72dfb
AL
1065
1066 hp = tcp_get_md5sig_pool();
1067 if (!hp)
1068 goto clear_hash_noput;
cf80e0e4 1069 req = hp->md5_req;
49a72dfb 1070
cf80e0e4 1071 if (crypto_ahash_init(req))
49a72dfb 1072 goto clear_hash;
19689e38 1073 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
49a72dfb
AL
1074 goto clear_hash;
1075 if (tcp_md5_hash_key(hp, key))
1076 goto clear_hash;
cf80e0e4
HX
1077 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1078 if (crypto_ahash_final(req))
cfb6eeb4
YH
1079 goto clear_hash;
1080
cfb6eeb4 1081 tcp_put_md5sig_pool();
cfb6eeb4 1082 return 0;
49a72dfb 1083
cfb6eeb4
YH
1084clear_hash:
1085 tcp_put_md5sig_pool();
1086clear_hash_noput:
1087 memset(md5_hash, 0, 16);
49a72dfb 1088 return 1;
cfb6eeb4
YH
1089}
1090
39f8e58e
ED
1091int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1092 const struct sock *sk,
318cf7aa 1093 const struct sk_buff *skb)
cfb6eeb4 1094{
49a72dfb 1095 struct tcp_md5sig_pool *hp;
cf80e0e4 1096 struct ahash_request *req;
318cf7aa 1097 const struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4
YH
1098 __be32 saddr, daddr;
1099
39f8e58e
ED
1100 if (sk) { /* valid for establish/request sockets */
1101 saddr = sk->sk_rcv_saddr;
1102 daddr = sk->sk_daddr;
cfb6eeb4 1103 } else {
49a72dfb
AL
1104 const struct iphdr *iph = ip_hdr(skb);
1105 saddr = iph->saddr;
1106 daddr = iph->daddr;
cfb6eeb4 1107 }
49a72dfb
AL
1108
1109 hp = tcp_get_md5sig_pool();
1110 if (!hp)
1111 goto clear_hash_noput;
cf80e0e4 1112 req = hp->md5_req;
49a72dfb 1113
cf80e0e4 1114 if (crypto_ahash_init(req))
49a72dfb
AL
1115 goto clear_hash;
1116
19689e38 1117 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
49a72dfb
AL
1118 goto clear_hash;
1119 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1120 goto clear_hash;
1121 if (tcp_md5_hash_key(hp, key))
1122 goto clear_hash;
cf80e0e4
HX
1123 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1124 if (crypto_ahash_final(req))
49a72dfb
AL
1125 goto clear_hash;
1126
1127 tcp_put_md5sig_pool();
1128 return 0;
1129
1130clear_hash:
1131 tcp_put_md5sig_pool();
1132clear_hash_noput:
1133 memset(md5_hash, 0, 16);
1134 return 1;
cfb6eeb4 1135}
49a72dfb 1136EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
cfb6eeb4 1137
ba8e275a
ED
1138#endif
1139
ff74e23f 1140/* Called with rcu_read_lock() */
ba8e275a 1141static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
ff74e23f 1142 const struct sk_buff *skb)
cfb6eeb4 1143{
ba8e275a 1144#ifdef CONFIG_TCP_MD5SIG
cfb6eeb4
YH
1145 /*
1146 * This gets called for each TCP segment that arrives
1147 * so we want to be efficient.
1148 * We have 3 drop cases:
1149 * o No MD5 hash and one expected.
1150 * o MD5 hash and we're not expecting one.
 1151 * o MD5 hash and it's wrong.
1152 */
cf533ea5 1153 const __u8 *hash_location = NULL;
cfb6eeb4 1154 struct tcp_md5sig_key *hash_expected;
eddc9ec5 1155 const struct iphdr *iph = ip_hdr(skb);
cf533ea5 1156 const struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4 1157 int genhash;
cfb6eeb4
YH
1158 unsigned char newhash[16];
1159
a915da9b
ED
1160 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1161 AF_INET);
7d5d5525 1162 hash_location = tcp_parse_md5sig_option(th);
cfb6eeb4 1163
cfb6eeb4
YH
1164 /* We've parsed the options - do we have a hash? */
1165 if (!hash_expected && !hash_location)
a2a385d6 1166 return false;
cfb6eeb4
YH
1167
1168 if (hash_expected && !hash_location) {
c10d9310 1169 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
a2a385d6 1170 return true;
cfb6eeb4
YH
1171 }
1172
1173 if (!hash_expected && hash_location) {
c10d9310 1174 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
a2a385d6 1175 return true;
cfb6eeb4
YH
1176 }
1177
1178 /* Okay, so this is hash_expected and hash_location -
1179 * so we need to calculate the checksum.
1180 */
49a72dfb
AL
1181 genhash = tcp_v4_md5_hash_skb(newhash,
1182 hash_expected,
39f8e58e 1183 NULL, skb);
cfb6eeb4
YH
1184
1185 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
72145a68 1186 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
e87cc472
JP
1187 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1188 &iph->saddr, ntohs(th->source),
1189 &iph->daddr, ntohs(th->dest),
1190 genhash ? " tcp_v4_calc_md5_hash failed"
1191 : "");
a2a385d6 1192 return true;
cfb6eeb4 1193 }
a2a385d6 1194 return false;
cfb6eeb4 1195#endif
ba8e275a
ED
1196 return false;
1197}
cfb6eeb4 1198
b40cf18e
ED
1199static void tcp_v4_init_req(struct request_sock *req,
1200 const struct sock *sk_listener,
16bea70a
OP
1201 struct sk_buff *skb)
1202{
1203 struct inet_request_sock *ireq = inet_rsk(req);
1204
08d2cc3b
ED
1205 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1206 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
16bea70a
OP
1207 ireq->opt = tcp_v4_save_options(skb);
1208}
1209
f964629e
ED
1210static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1211 struct flowi *fl,
d94e0417
OP
1212 const struct request_sock *req,
1213 bool *strict)
1214{
1215 struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1216
1217 if (strict) {
1218 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1219 *strict = true;
1220 else
1221 *strict = false;
1222 }
1223
1224 return dst;
1225}
1226
72a3effa 1227struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1da177e4 1228 .family = PF_INET,
2e6599cb 1229 .obj_size = sizeof(struct tcp_request_sock),
5db92c99 1230 .rtx_syn_ack = tcp_rtx_synack,
60236fdd
ACM
1231 .send_ack = tcp_v4_reqsk_send_ack,
1232 .destructor = tcp_v4_reqsk_destructor,
1da177e4 1233 .send_reset = tcp_v4_send_reset,
688d1945 1234 .syn_ack_timeout = tcp_syn_ack_timeout,
1da177e4
LT
1235};
1236
b2e4b3de 1237static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
2aec4a29 1238 .mss_clamp = TCP_MSS_DEFAULT,
16bea70a 1239#ifdef CONFIG_TCP_MD5SIG
fd3a154a 1240 .req_md5_lookup = tcp_v4_md5_lookup,
e3afe7b7 1241 .calc_md5_hash = tcp_v4_md5_hash_skb,
b6332e6c 1242#endif
16bea70a 1243 .init_req = tcp_v4_init_req,
fb7b37a7
OP
1244#ifdef CONFIG_SYN_COOKIES
1245 .cookie_init_seq = cookie_v4_init_sequence,
1246#endif
d94e0417 1247 .route_req = tcp_v4_route_req,
936b8bdb 1248 .init_seq = tcp_v4_init_sequence,
d6274bd8 1249 .send_synack = tcp_v4_send_synack,
16bea70a 1250};
cfb6eeb4 1251
1da177e4
LT
1252int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1253{
1da177e4 1254 /* Never answer SYNs sent to broadcast or multicast */
511c3f92 1255 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1da177e4
LT
1256 goto drop;
1257
1fb6f159
OP
1258 return tcp_conn_request(&tcp_request_sock_ops,
1259 &tcp_request_sock_ipv4_ops, sk, skb);
1da177e4 1260
1da177e4 1261drop:
9caad864 1262 tcp_listendrop(sk);
1da177e4
LT
1263 return 0;
1264}
4bc2f18b 1265EXPORT_SYMBOL(tcp_v4_conn_request);
1da177e4
LT
1266
1267
1268/*
1269 * The three way handshake has completed - we got a valid synack -
1270 * now create the new socket.
1271 */
0c27171e 1272struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
60236fdd 1273 struct request_sock *req,
5e0724d0
ED
1274 struct dst_entry *dst,
1275 struct request_sock *req_unhash,
1276 bool *own_req)
1da177e4 1277{
2e6599cb 1278 struct inet_request_sock *ireq;
1da177e4
LT
1279 struct inet_sock *newinet;
1280 struct tcp_sock *newtp;
1281 struct sock *newsk;
cfb6eeb4
YH
1282#ifdef CONFIG_TCP_MD5SIG
1283 struct tcp_md5sig_key *key;
1284#endif
f6d8bd05 1285 struct ip_options_rcu *inet_opt;
1da177e4
LT
1286
1287 if (sk_acceptq_is_full(sk))
1288 goto exit_overflow;
1289
1da177e4
LT
1290 newsk = tcp_create_openreq_child(sk, req, skb);
1291 if (!newsk)
093d2823 1292 goto exit_nonewsk;
1da177e4 1293
bcd76111 1294 newsk->sk_gso_type = SKB_GSO_TCPV4;
fae6ef87 1295 inet_sk_rx_dst_set(newsk, skb);
1da177e4
LT
1296
1297 newtp = tcp_sk(newsk);
1298 newinet = inet_sk(newsk);
2e6599cb 1299 ireq = inet_rsk(req);
d1e559d0
ED
1300 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1301 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
6dd9a14e 1302 newsk->sk_bound_dev_if = ireq->ir_iif;
634fb979 1303 newinet->inet_saddr = ireq->ir_loc_addr;
f6d8bd05
ED
1304 inet_opt = ireq->opt;
1305 rcu_assign_pointer(newinet->inet_opt, inet_opt);
2e6599cb 1306 ireq->opt = NULL;
463c84b9 1307 newinet->mc_index = inet_iif(skb);
eddc9ec5 1308 newinet->mc_ttl = ip_hdr(skb)->ttl;
4c507d28 1309 newinet->rcv_tos = ip_hdr(skb)->tos;
d83d8461 1310 inet_csk(newsk)->icsk_ext_hdr_len = 0;
f6d8bd05
ED
1311 if (inet_opt)
1312 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
c720c7e8 1313 newinet->inet_id = newtp->write_seq ^ jiffies;
1da177e4 1314
dfd25fff
ED
1315 if (!dst) {
1316 dst = inet_csk_route_child_sock(sk, newsk, req);
1317 if (!dst)
1318 goto put_and_exit;
1319 } else {
1320 /* syncookie case : see end of cookie_v4_check() */
1321 }
0e734419
DM
1322 sk_setup_caps(newsk, dst);
1323
81164413
DB
1324 tcp_ca_openreq_child(newsk, dst);
1325
1da177e4 1326 tcp_sync_mss(newsk, dst_mtu(dst));
0dbaee3b 1327 newtp->advmss = dst_metric_advmss(dst);
f5fff5dc
TQ
1328 if (tcp_sk(sk)->rx_opt.user_mss &&
1329 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1330 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1331
1da177e4
LT
1332 tcp_initialize_rcv_mss(newsk);
1333
cfb6eeb4
YH
1334#ifdef CONFIG_TCP_MD5SIG
1335 /* Copy over the MD5 key from the original socket */
a915da9b
ED
1336 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1337 AF_INET);
00db4124 1338 if (key) {
cfb6eeb4
YH
1339 /*
1340 * We're using one, so create a matching key
1341 * on the newsk structure. If we fail to get
1342 * memory, then we end up not copying the key
1343 * across. Shucks.
1344 */
a915da9b
ED
1345 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1346 AF_INET, key->key, key->keylen, GFP_ATOMIC);
a465419b 1347 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
cfb6eeb4
YH
1348 }
1349#endif
1350
0e734419
DM
1351 if (__inet_inherit_port(sk, newsk) < 0)
1352 goto put_and_exit;
5e0724d0 1353 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
805c4bc0 1354 if (*own_req)
49a496c9 1355 tcp_move_syn(newtp, req);
1da177e4
LT
1356
1357 return newsk;
1358
1359exit_overflow:
c10d9310 1360 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
093d2823
BS
1361exit_nonewsk:
1362 dst_release(dst);
1da177e4 1363exit:
9caad864 1364 tcp_listendrop(sk);
1da177e4 1365 return NULL;
0e734419 1366put_and_exit:
e337e24d
CP
1367 inet_csk_prepare_forced_close(newsk);
1368 tcp_done(newsk);
0e734419 1369 goto exit;
1da177e4 1370}
4bc2f18b 1371EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1da177e4 1372
079096f1 1373static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1da177e4 1374{
079096f1 1375#ifdef CONFIG_SYN_COOKIES
52452c54 1376 const struct tcphdr *th = tcp_hdr(skb);
1da177e4 1377
af9b4738 1378 if (!th->syn)
461b74c3 1379 sk = cookie_v4_check(sk, skb);
1da177e4
LT
1380#endif
1381 return sk;
1382}
1383
1da177e4 1384/* The socket must have its spinlock held when we get
e994b2f0 1385 * here, unless it is a TCP_LISTEN socket.
1da177e4
LT
1386 *
1387 * We have a potential double-lock case here, so even when
1388 * doing backlog processing we use the BH locking scheme.
1389 * This is because we cannot sleep with the original spinlock
1390 * held.
1391 */
1392int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1393{
cfb6eeb4 1394 struct sock *rsk;
cfb6eeb4 1395
1da177e4 1396 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
404e0a8b
ED
1397 struct dst_entry *dst = sk->sk_rx_dst;
1398
bdeab991 1399 sock_rps_save_rxhash(sk, skb);
3d97379a 1400 sk_mark_napi_id(sk, skb);
404e0a8b 1401 if (dst) {
505fbcf0 1402 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
51456b29 1403 !dst->ops->check(dst, 0)) {
92101b3b
DM
1404 dst_release(dst);
1405 sk->sk_rx_dst = NULL;
1406 }
1407 }
c995ae22 1408 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1da177e4
LT
1409 return 0;
1410 }
1411
12e25e10 1412 if (tcp_checksum_complete(skb))
1da177e4
LT
1413 goto csum_err;
1414
1415 if (sk->sk_state == TCP_LISTEN) {
079096f1
ED
1416 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1417
1da177e4
LT
1418 if (!nsk)
1419 goto discard;
1da177e4 1420 if (nsk != sk) {
bdeab991 1421 sock_rps_save_rxhash(nsk, skb);
38cb5245 1422 sk_mark_napi_id(nsk, skb);
cfb6eeb4
YH
1423 if (tcp_child_process(sk, nsk, skb)) {
1424 rsk = nsk;
1da177e4 1425 goto reset;
cfb6eeb4 1426 }
1da177e4
LT
1427 return 0;
1428 }
ca55158c 1429 } else
bdeab991 1430 sock_rps_save_rxhash(sk, skb);
ca55158c 1431
72ab4a86 1432 if (tcp_rcv_state_process(sk, skb)) {
cfb6eeb4 1433 rsk = sk;
1da177e4 1434 goto reset;
cfb6eeb4 1435 }
1da177e4
LT
1436 return 0;
1437
1438reset:
cfb6eeb4 1439 tcp_v4_send_reset(rsk, skb);
1da177e4
LT
1440discard:
1441 kfree_skb(skb);
1442 /* Be careful here. If this function gets more complicated and
1443 * gcc suffers from register pressure on the x86, sk (in %ebx)
1444 * might be destroyed here. This current version compiles correctly,
1445 * but you have been warned.
1446 */
1447 return 0;
1448
1449csum_err:
c10d9310
ED
1450 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1451 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1da177e4
LT
1452 goto discard;
1453}
4bc2f18b 1454EXPORT_SYMBOL(tcp_v4_do_rcv);
1da177e4 1455
160eb5a6 1456void tcp_v4_early_demux(struct sk_buff *skb)
41063e9d 1457{
41063e9d
DM
1458 const struct iphdr *iph;
1459 const struct tcphdr *th;
1460 struct sock *sk;
41063e9d 1461
41063e9d 1462 if (skb->pkt_type != PACKET_HOST)
160eb5a6 1463 return;
41063e9d 1464
45f00f99 1465 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
160eb5a6 1466 return;
41063e9d
DM
1467
1468 iph = ip_hdr(skb);
45f00f99 1469 th = tcp_hdr(skb);
41063e9d
DM
1470
1471 if (th->doff < sizeof(struct tcphdr) / 4)
160eb5a6 1472 return;
41063e9d 1473
45f00f99 1474 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
41063e9d 1475 iph->saddr, th->source,
7011d085 1476 iph->daddr, ntohs(th->dest),
9cb429d6 1477 skb->skb_iif);
41063e9d
DM
1478 if (sk) {
1479 skb->sk = sk;
1480 skb->destructor = sock_edemux;
f7e4eb03 1481 if (sk_fullsock(sk)) {
d0c294c5 1482 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
505fbcf0 1483
41063e9d
DM
1484 if (dst)
1485 dst = dst_check(dst, 0);
92101b3b 1486 if (dst &&
505fbcf0 1487 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
92101b3b 1488 skb_dst_set_noref(skb, dst);
41063e9d
DM
1489 }
1490 }
41063e9d
DM
1491}
1492
b2fb4f54
ED
1493/* Packet is added to VJ-style prequeue for processing in process
1494 * context, if a reader task is waiting. Apparently, this exciting
1495 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1496 * failed somewhere. Latency? Burstiness? Well, at least now we will
1497 * see, why it failed. 8)8) --ANK
1498 *
1499 */
1500bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1501{
1502 struct tcp_sock *tp = tcp_sk(sk);
1503
1504 if (sysctl_tcp_low_latency || !tp->ucopy.task)
1505 return false;
1506
1507 if (skb->len <= tcp_hdrlen(skb) &&
1508 skb_queue_len(&tp->ucopy.prequeue) == 0)
1509 return false;
1510
ca777eff
ED
1511 /* Before escaping RCU protected region, we need to take care of skb
1512 * dst. Prequeue is only enabled for established sockets.
1513 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1514 * Instead of doing full sk_rx_dst validity here, let's perform
1515 * an optimistic check.
1516 */
1517 if (likely(sk->sk_rx_dst))
1518 skb_dst_drop(skb);
1519 else
5037e9ef 1520 skb_dst_force_safe(skb);
ca777eff 1521
b2fb4f54
ED
1522 __skb_queue_tail(&tp->ucopy.prequeue, skb);
1523 tp->ucopy.memory += skb->truesize;
0cef6a4c
ED
1524 if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
1525 tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
b2fb4f54
ED
1526 struct sk_buff *skb1;
1527
1528 BUG_ON(sock_owned_by_user(sk));
0cef6a4c
ED
1529 __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
1530 skb_queue_len(&tp->ucopy.prequeue));
b2fb4f54 1531
0cef6a4c 1532 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
b2fb4f54 1533 sk_backlog_rcv(sk, skb1);
b2fb4f54
ED
1534
1535 tp->ucopy.memory = 0;
1536 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1537 wake_up_interruptible_sync_poll(sk_sleep(sk),
1538 POLLIN | POLLRDNORM | POLLRDBAND);
1539 if (!inet_csk_ack_scheduled(sk))
1540 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1541 (3 * tcp_rto_min(sk)) / 4,
1542 TCP_RTO_MAX);
1543 }
1544 return true;
1545}
1546EXPORT_SYMBOL(tcp_prequeue);
1547
c9c33212
ED
1548bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1549{
1550 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1551
1552 /* Only socket owner can try to collapse/prune rx queues
1553 * to reduce memory overhead, so add a little headroom here.
 1554 * Few socket backlogs are likely to be concurrently non-empty.
1555 */
1556 limit += 64*1024;
1557
1558 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1559 * we can fix skb->truesize to its real value to avoid future drops.
1560 * This is valid because skb is not yet charged to the socket.
 1561 * It has been noticed that pure SACK packets were sometimes dropped
1562 * (if cooked by drivers without copybreak feature).
1563 */
60b1af33 1564 skb_condense(skb);
c9c33212
ED
1565
1566 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1567 bh_unlock_sock(sk);
1568 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1569 return true;
1570 }
1571 return false;
1572}
1573EXPORT_SYMBOL(tcp_add_backlog);
1574
ac6e7800
ED
1575int tcp_filter(struct sock *sk, struct sk_buff *skb)
1576{
1577 struct tcphdr *th = (struct tcphdr *)skb->data;
1578 unsigned int eaten = skb->len;
1579 int err;
1580
1581 err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1582 if (!err) {
1583 eaten -= skb->len;
1584 TCP_SKB_CB(skb)->end_seq -= eaten;
1585 }
1586 return err;
1587}
1588EXPORT_SYMBOL(tcp_filter);
1589
1da177e4
LT
1590/*
1591 * From tcp_input.c
1592 */
1593
1594int tcp_v4_rcv(struct sk_buff *skb)
1595{
3b24d854 1596 struct net *net = dev_net(skb->dev);
eddc9ec5 1597 const struct iphdr *iph;
cf533ea5 1598 const struct tcphdr *th;
3b24d854 1599 bool refcounted;
1da177e4
LT
1600 struct sock *sk;
1601 int ret;
1602
1603 if (skb->pkt_type != PACKET_HOST)
1604 goto discard_it;
1605
1606 /* Count it even if it's bad */
90bbcc60 1607 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1da177e4
LT
1608
1609 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1610 goto discard_it;
1611
ea1627c2 1612 th = (const struct tcphdr *)skb->data;
1da177e4 1613
ea1627c2 1614 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1da177e4
LT
1615 goto bad_packet;
1616 if (!pskb_may_pull(skb, th->doff * 4))
1617 goto discard_it;
1618
1619 /* An explanation is required here, I think.
1620 * Packet length and doff are validated by header prediction,
caa20d9a 1621 * provided the case of th->doff==0 is eliminated.
1da177e4 1622 * So, we defer the checks. */
ed70fcfc
TH
1623
1624 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1625 goto csum_error;
1da177e4 1626
ea1627c2 1627 th = (const struct tcphdr *)skb->data;
eddc9ec5 1628 iph = ip_hdr(skb);
971f10ec
ED
1629 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
 1630 * barrier() makes sure the compiler won't play fool^Waliasing games.
1631 */
1632 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1633 sizeof(struct inet_skb_parm));
1634 barrier();
1635
1da177e4
LT
1636 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1637 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1638 skb->len - th->doff * 4);
1639 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
e11ecddf 1640 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
04317daf 1641 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
b82d1bb4 1642 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1da177e4
LT
1643 TCP_SKB_CB(skb)->sacked = 0;
1644
4bdc3d66 1645lookup:
a583636a 1646 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
3b24d854 1647 th->dest, &refcounted);
1da177e4
LT
1648 if (!sk)
1649 goto no_tcp_socket;
1650
bb134d5d
ED
1651process:
1652 if (sk->sk_state == TCP_TIME_WAIT)
1653 goto do_time_wait;
1654
079096f1
ED
1655 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1656 struct request_sock *req = inet_reqsk(sk);
7716682c 1657 struct sock *nsk;
079096f1
ED
1658
1659 sk = req->rsk_listener;
72923555 1660 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
e65c332d 1661 sk_drops_add(sk, skb);
72923555
ED
1662 reqsk_put(req);
1663 goto discard_it;
1664 }
7716682c 1665 if (unlikely(sk->sk_state != TCP_LISTEN)) {
f03f2e15 1666 inet_csk_reqsk_queue_drop_and_put(sk, req);
4bdc3d66
ED
1667 goto lookup;
1668 }
3b24d854
ED
1669 /* We own a reference on the listener, increase it again
1670 * as we might lose it too soon.
1671 */
7716682c 1672 sock_hold(sk);
3b24d854 1673 refcounted = true;
7716682c 1674 nsk = tcp_check_req(sk, skb, req, false);
079096f1
ED
1675 if (!nsk) {
1676 reqsk_put(req);
7716682c 1677 goto discard_and_relse;
079096f1
ED
1678 }
1679 if (nsk == sk) {
079096f1
ED
1680 reqsk_put(req);
1681 } else if (tcp_child_process(sk, nsk, skb)) {
1682 tcp_v4_send_reset(nsk, skb);
7716682c 1683 goto discard_and_relse;
079096f1 1684 } else {
7716682c 1685 sock_put(sk);
079096f1
ED
1686 return 0;
1687 }
1688 }
6cce09f8 1689 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
02a1d6e7 1690 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1691 goto discard_and_relse;
6cce09f8 1692 }
d218d111 1693
1da177e4
LT
1694 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1695 goto discard_and_relse;
9ea88a15 1696
9ea88a15
DP
1697 if (tcp_v4_inbound_md5_hash(sk, skb))
1698 goto discard_and_relse;
9ea88a15 1699
b59c2701 1700 nf_reset(skb);
1da177e4 1701
ac6e7800 1702 if (tcp_filter(sk, skb))
1da177e4 1703 goto discard_and_relse;
ac6e7800
ED
1704 th = (const struct tcphdr *)skb->data;
1705 iph = ip_hdr(skb);
1da177e4
LT
1706
1707 skb->dev = NULL;
1708
e994b2f0
ED
1709 if (sk->sk_state == TCP_LISTEN) {
1710 ret = tcp_v4_do_rcv(sk, skb);
1711 goto put_and_return;
1712 }
1713
1714 sk_incoming_cpu_update(sk);
1715
c6366184 1716 bh_lock_sock_nested(sk);
a44d6eac 1717 tcp_segs_in(tcp_sk(sk), skb);
1da177e4
LT
1718 ret = 0;
1719 if (!sock_owned_by_user(sk)) {
7bced397 1720 if (!tcp_prequeue(sk, skb))
1da177e4 1721 ret = tcp_v4_do_rcv(sk, skb);
c9c33212 1722 } else if (tcp_add_backlog(sk, skb)) {
6b03a53a
ZY
1723 goto discard_and_relse;
1724 }
1da177e4
LT
1725 bh_unlock_sock(sk);
1726
e994b2f0 1727put_and_return:
3b24d854
ED
1728 if (refcounted)
1729 sock_put(sk);
1da177e4
LT
1730
1731 return ret;
1732
1733no_tcp_socket:
1734 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1735 goto discard_it;
1736
12e25e10 1737 if (tcp_checksum_complete(skb)) {
6a5dc9e5 1738csum_error:
90bbcc60 1739 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1da177e4 1740bad_packet:
90bbcc60 1741 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1da177e4 1742 } else {
cfb6eeb4 1743 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
1744 }
1745
1746discard_it:
1747 /* Discard frame. */
1748 kfree_skb(skb);
e905a9ed 1749 return 0;
1da177e4
LT
1750
1751discard_and_relse:
532182cd 1752 sk_drops_add(sk, skb);
3b24d854
ED
1753 if (refcounted)
1754 sock_put(sk);
1da177e4
LT
1755 goto discard_it;
1756
1757do_time_wait:
1758 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 1759 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1760 goto discard_it;
1761 }
1762
6a5dc9e5
ED
1763 if (tcp_checksum_complete(skb)) {
1764 inet_twsk_put(inet_twsk(sk));
1765 goto csum_error;
1da177e4 1766 }
9469c7b4 1767 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 1768 case TCP_TW_SYN: {
c346dca1 1769 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
a583636a
CG
1770 &tcp_hashinfo, skb,
1771 __tcp_hdrlen(th),
da5e3630 1772 iph->saddr, th->source,
eddc9ec5 1773 iph->daddr, th->dest,
463c84b9 1774 inet_iif(skb));
1da177e4 1775 if (sk2) {
dbe7faa4 1776 inet_twsk_deschedule_put(inet_twsk(sk));
1da177e4 1777 sk = sk2;
3b24d854 1778 refcounted = false;
1da177e4
LT
1779 goto process;
1780 }
1781 /* Fall through to ACK */
1782 }
1783 case TCP_TW_ACK:
1784 tcp_v4_timewait_ack(sk, skb);
1785 break;
1786 case TCP_TW_RST:
271c3b9b
FW
1787 tcp_v4_send_reset(sk, skb);
1788 inet_twsk_deschedule_put(inet_twsk(sk));
1789 goto discard_it;
1da177e4
LT
1790 case TCP_TW_SUCCESS:;
1791 }
1792 goto discard_it;
1793}
1794
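
A detail worth spelling out from the control-block setup near the top of tcp_v4_rcv(): end_seq is the sequence number plus one unit each for SYN and FIN (both occupy sequence space) plus the payload length, where the payload is skb->len minus the header length th->doff * 4. A standalone sketch of that arithmetic, with invented names (fake_tcphdr and tcp_end_seq are illustrative only):

#include <stdint.h>
#include <stdio.h>

struct fake_tcphdr {
	uint32_t seq;		/* already host byte order in this sketch */
	unsigned int doff;	/* header length in 32-bit words */
	unsigned int syn:1;
	unsigned int fin:1;
};

static uint32_t tcp_end_seq(const struct fake_tcphdr *th, unsigned int skb_len)
{
	unsigned int payload = skb_len - th->doff * 4;

	/* SYN and FIN each consume one unit of sequence space. */
	return th->seq + th->syn + th->fin + payload;
}

int main(void)
{
	/* A 40-byte SYN (header only, with options): end_seq = seq + 1. */
	struct fake_tcphdr syn = { .seq = 100, .doff = 10, .syn = 1, .fin = 0 };

	printf("end_seq = %u\n", (unsigned int)tcp_end_seq(&syn, 40));
	return 0;
}
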
ccb7c410
DM
1795static struct timewait_sock_ops tcp_timewait_sock_ops = {
1796 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1797 .twsk_unique = tcp_twsk_unique,
1798 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 1799};
1da177e4 1800
63d02d15 1801void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5d299f3d
ED
1802{
1803 struct dst_entry *dst = skb_dst(skb);
1804
5037e9ef 1805 if (dst && dst_hold_safe(dst)) {
ca777eff
ED
1806 sk->sk_rx_dst = dst;
1807 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1808 }
5d299f3d 1809}
63d02d15 1810EXPORT_SYMBOL(inet_sk_rx_dst_set);
5d299f3d 1811
3b401a81 1812const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
1813 .queue_xmit = ip_queue_xmit,
1814 .send_check = tcp_v4_send_check,
1815 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 1816 .sk_rx_dst_set = inet_sk_rx_dst_set,
543d9cfe
ACM
1817 .conn_request = tcp_v4_conn_request,
1818 .syn_recv_sock = tcp_v4_syn_recv_sock,
543d9cfe
ACM
1819 .net_header_len = sizeof(struct iphdr),
1820 .setsockopt = ip_setsockopt,
1821 .getsockopt = ip_getsockopt,
1822 .addr2sockaddr = inet_csk_addr2sockaddr,
1823 .sockaddr_len = sizeof(struct sockaddr_in),
3fdadf7d 1824#ifdef CONFIG_COMPAT
543d9cfe
ACM
1825 .compat_setsockopt = compat_ip_setsockopt,
1826 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 1827#endif
4fab9071 1828 .mtu_reduced = tcp_v4_mtu_reduced,
1da177e4 1829};
4bc2f18b 1830EXPORT_SYMBOL(ipv4_specific);
1da177e4 1831
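
ipv4_specific is the address-family ops table: the protocol-independent TCP code calls through these function pointers, and the IPv6 side installs its own table with the same shape. A minimal sketch of that pattern in plain C (not the kernel structures; conn_ops, v4_send_check and the rest are invented for illustration):

#include <stdio.h>

/* A tiny stand-in for an af-specific ops table. */
struct conn_ops {
	void (*send_check)(const char *who);
	int   net_header_len;
};

static void v4_send_check(const char *who)
{
	printf("%s: would compute an IPv4 pseudo-header checksum here\n", who);
}

static const struct conn_ops fake_ipv4_ops = {
	.send_check	= v4_send_check,
	.net_header_len	= 20,		/* e.g. sizeof(struct iphdr) */
};

int main(void)
{
	const struct conn_ops *ops = &fake_ipv4_ops;	/* picked at socket init */

	ops->send_check("sketch");
	printf("network header length: %d\n", ops->net_header_len);
	return 0;
}
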
cfb6eeb4 1832#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 1833static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 1834 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 1835 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 1836 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 1837};
b6332e6c 1838#endif
cfb6eeb4 1839
1da177e4
LT
1840/* NOTE: A lot of fields are set to zero explicitly by the call to
1841 * sk_alloc(), so they need not be initialized here.
1842 */
1843static int tcp_v4_init_sock(struct sock *sk)
1844{
6687e988 1845 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 1846
900f65d3 1847 tcp_init_sock(sk);
1da177e4 1848
8292a17a 1849 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 1850
cfb6eeb4 1851#ifdef CONFIG_TCP_MD5SIG
ac807fa8 1852 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 1853#endif
1da177e4 1854
1da177e4
LT
1855 return 0;
1856}
1857
7d06b2e0 1858void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
1859{
1860 struct tcp_sock *tp = tcp_sk(sk);
1861
1862 tcp_clear_xmit_timers(sk);
1863
6687e988 1864 tcp_cleanup_congestion_control(sk);
317a76f9 1865
1da177e4 1866	/* Clean up the write buffer. */
fe067e8a 1867 tcp_write_queue_purge(sk);
1da177e4
LT
1868
1869 /* Cleans up our, hopefully empty, out_of_order_queue. */
9f5afeae 1870 skb_rbtree_purge(&tp->out_of_order_queue);
1da177e4 1871
cfb6eeb4
YH
1872#ifdef CONFIG_TCP_MD5SIG
1873 /* Clean up the MD5 key list, if any */
1874 if (tp->md5sig_info) {
a915da9b 1875 tcp_clear_md5_list(sk);
a8afca03 1876 kfree_rcu(tp->md5sig_info, rcu);
cfb6eeb4
YH
1877 tp->md5sig_info = NULL;
1878 }
1879#endif
1a2449a8 1880
1da177e4
LT
1881	/* Clean the prequeue; it really must be empty */
1882 __skb_queue_purge(&tp->ucopy.prequeue);
1883
1884 /* Clean up a referenced TCP bind bucket. */
463c84b9 1885 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 1886 inet_put_port(sk);
1da177e4 1887
00db4124 1888 BUG_ON(tp->fastopen_rsk);
435cf559 1889
cf60af03
YC
1890 /* If socket is aborted during connect operation */
1891 tcp_free_fastopen_req(tp);
cd8ae852 1892 tcp_saved_syn_free(tp);
cf60af03 1893
180d8cd9 1894 sk_sockets_allocated_dec(sk);
1da177e4 1895}
1da177e4
LT
1896EXPORT_SYMBOL(tcp_v4_destroy_sock);
1897
1898#ifdef CONFIG_PROC_FS
1899/* Proc filesystem TCP sock list dumping. */
1900
a8b690f9
TH
1901/*
1902 * Get the next listener socket after cur. If cur is NULL, get the first socket
1903 * starting from bucket given in st->bucket; when st->bucket is zero the
1904 * very first socket in the hash table is returned.
1905 */
1da177e4
LT
1906static void *listening_get_next(struct seq_file *seq, void *cur)
1907{
5799de0b 1908 struct tcp_iter_state *st = seq->private;
a4146b1b 1909 struct net *net = seq_file_net(seq);
3b24d854 1910 struct inet_listen_hashbucket *ilb;
3b24d854 1911 struct sock *sk = cur;
1da177e4
LT
1912
1913 if (!sk) {
3b24d854 1914get_head:
a8b690f9 1915 ilb = &tcp_hashinfo.listening_hash[st->bucket];
9652dc2e 1916 spin_lock(&ilb->lock);
3b24d854 1917 sk = sk_head(&ilb->head);
a8b690f9 1918 st->offset = 0;
1da177e4
LT
1919 goto get_sk;
1920 }
5caea4ea 1921 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1da177e4 1922 ++st->num;
a8b690f9 1923 ++st->offset;
1da177e4 1924
3b24d854 1925 sk = sk_next(sk);
1da177e4 1926get_sk:
3b24d854 1927 sk_for_each_from(sk) {
8475ef9f
PE
1928 if (!net_eq(sock_net(sk), net))
1929 continue;
3b24d854
ED
1930 if (sk->sk_family == st->family)
1931 return sk;
1da177e4 1932 }
9652dc2e 1933 spin_unlock(&ilb->lock);
a8b690f9 1934 st->offset = 0;
3b24d854
ED
1935 if (++st->bucket < INET_LHTABLE_SIZE)
1936 goto get_head;
1937 return NULL;
1da177e4
LT
1938}
1939
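
listening_get_next() above walks the listener hash one bucket at a time and, together with st->bucket, st->offset and st->num, lets a later read of the seq_file resume where the previous one stopped. A self-contained userspace sketch of that resumable bucket iteration, using invented data and names (buckets, iter_state and get_next are not kernel symbols):

#include <stdio.h>

#define NBUCKETS 4

/* NULL-terminated chains, one per hash bucket. */
static const char *buckets[NBUCKETS][4] = {
	{ "sk1" },
	{ NULL },		/* empty bucket, skipped like empty_bucket() */
	{ "sk2", "sk3" },
	{ "sk4" },
};

struct iter_state {
	int bucket;		/* like st->bucket */
	int offset;		/* like st->offset within the bucket */
};

/* Return the entry at the saved position and advance; NULL when done. */
static const char *get_next(struct iter_state *st)
{
	for (; st->bucket < NBUCKETS; st->bucket++, st->offset = 0) {
		const char *sk = buckets[st->bucket][st->offset];

		if (sk) {
			st->offset++;
			return sk;
		}
	}
	return NULL;
}

int main(void)
{
	struct iter_state st = { 0, 0 };
	const char *sk;

	while ((sk = get_next(&st)))
		printf("%s (bucket %d)\n", sk, st.bucket);
	return 0;
}
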
1940static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1941{
a8b690f9
TH
1942 struct tcp_iter_state *st = seq->private;
1943 void *rc;
1944
1945 st->bucket = 0;
1946 st->offset = 0;
1947 rc = listening_get_next(seq, NULL);
1da177e4
LT
1948
1949 while (rc && *pos) {
1950 rc = listening_get_next(seq, rc);
1951 --*pos;
1952 }
1953 return rc;
1954}
1955
05dbc7b5 1956static inline bool empty_bucket(const struct tcp_iter_state *st)
6eac5604 1957{
05dbc7b5 1958 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
6eac5604
AK
1959}
1960
a8b690f9
TH
1961/*
1962 * Get first established socket starting from bucket given in st->bucket.
1963 * If st->bucket is zero, the very first socket in the hash is returned.
1964 */
1da177e4
LT
1965static void *established_get_first(struct seq_file *seq)
1966{
5799de0b 1967 struct tcp_iter_state *st = seq->private;
a4146b1b 1968 struct net *net = seq_file_net(seq);
1da177e4
LT
1969 void *rc = NULL;
1970
a8b690f9
TH
1971 st->offset = 0;
1972 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 1973 struct sock *sk;
3ab5aee7 1974 struct hlist_nulls_node *node;
9db66bdc 1975 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 1976
6eac5604
AK
1977 /* Lockless fast path for the common case of empty buckets */
1978 if (empty_bucket(st))
1979 continue;
1980
9db66bdc 1981 spin_lock_bh(lock);
3ab5aee7 1982 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
f40c8174 1983 if (sk->sk_family != st->family ||
878628fb 1984 !net_eq(sock_net(sk), net)) {
1da177e4
LT
1985 continue;
1986 }
1987 rc = sk;
1988 goto out;
1989 }
9db66bdc 1990 spin_unlock_bh(lock);
1da177e4
LT
1991 }
1992out:
1993 return rc;
1994}
1995
1996static void *established_get_next(struct seq_file *seq, void *cur)
1997{
1998 struct sock *sk = cur;
3ab5aee7 1999 struct hlist_nulls_node *node;
5799de0b 2000 struct tcp_iter_state *st = seq->private;
a4146b1b 2001 struct net *net = seq_file_net(seq);
1da177e4
LT
2002
2003 ++st->num;
a8b690f9 2004 ++st->offset;
1da177e4 2005
05dbc7b5 2006 sk = sk_nulls_next(sk);
1da177e4 2007
3ab5aee7 2008 sk_nulls_for_each_from(sk, node) {
878628fb 2009 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
05dbc7b5 2010 return sk;
1da177e4
LT
2011 }
2012
05dbc7b5
ED
2013 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2014 ++st->bucket;
2015 return established_get_first(seq);
1da177e4
LT
2016}
2017
2018static void *established_get_idx(struct seq_file *seq, loff_t pos)
2019{
a8b690f9
TH
2020 struct tcp_iter_state *st = seq->private;
2021 void *rc;
2022
2023 st->bucket = 0;
2024 rc = established_get_first(seq);
1da177e4
LT
2025
2026 while (rc && pos) {
2027 rc = established_get_next(seq, rc);
2028 --pos;
7174259e 2029 }
1da177e4
LT
2030 return rc;
2031}
2032
2033static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2034{
2035 void *rc;
5799de0b 2036 struct tcp_iter_state *st = seq->private;
1da177e4 2037
1da177e4
LT
2038 st->state = TCP_SEQ_STATE_LISTENING;
2039 rc = listening_get_idx(seq, &pos);
2040
2041 if (!rc) {
1da177e4
LT
2042 st->state = TCP_SEQ_STATE_ESTABLISHED;
2043 rc = established_get_idx(seq, pos);
2044 }
2045
2046 return rc;
2047}
2048
a8b690f9
TH
2049static void *tcp_seek_last_pos(struct seq_file *seq)
2050{
2051 struct tcp_iter_state *st = seq->private;
2052 int offset = st->offset;
2053 int orig_num = st->num;
2054 void *rc = NULL;
2055
2056 switch (st->state) {
a8b690f9
TH
2057 case TCP_SEQ_STATE_LISTENING:
2058 if (st->bucket >= INET_LHTABLE_SIZE)
2059 break;
2060 st->state = TCP_SEQ_STATE_LISTENING;
2061 rc = listening_get_next(seq, NULL);
2062 while (offset-- && rc)
2063 rc = listening_get_next(seq, rc);
2064 if (rc)
2065 break;
2066 st->bucket = 0;
05dbc7b5 2067 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2068 /* Fallthrough */
2069 case TCP_SEQ_STATE_ESTABLISHED:
a8b690f9
TH
2070 if (st->bucket > tcp_hashinfo.ehash_mask)
2071 break;
2072 rc = established_get_first(seq);
2073 while (offset-- && rc)
2074 rc = established_get_next(seq, rc);
2075 }
2076
2077 st->num = orig_num;
2078
2079 return rc;
2080}
2081
1da177e4
LT
2082static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2083{
5799de0b 2084 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2085 void *rc;
2086
2087 if (*pos && *pos == st->last_pos) {
2088 rc = tcp_seek_last_pos(seq);
2089 if (rc)
2090 goto out;
2091 }
2092
1da177e4
LT
2093 st->state = TCP_SEQ_STATE_LISTENING;
2094 st->num = 0;
a8b690f9
TH
2095 st->bucket = 0;
2096 st->offset = 0;
2097 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2098
2099out:
2100 st->last_pos = *pos;
2101 return rc;
1da177e4
LT
2102}
2103
2104static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2105{
a8b690f9 2106 struct tcp_iter_state *st = seq->private;
1da177e4 2107 void *rc = NULL;
1da177e4
LT
2108
2109 if (v == SEQ_START_TOKEN) {
2110 rc = tcp_get_idx(seq, 0);
2111 goto out;
2112 }
1da177e4
LT
2113
2114 switch (st->state) {
1da177e4
LT
2115 case TCP_SEQ_STATE_LISTENING:
2116 rc = listening_get_next(seq, v);
2117 if (!rc) {
1da177e4 2118 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2119 st->bucket = 0;
2120 st->offset = 0;
1da177e4
LT
2121 rc = established_get_first(seq);
2122 }
2123 break;
2124 case TCP_SEQ_STATE_ESTABLISHED:
1da177e4
LT
2125 rc = established_get_next(seq, v);
2126 break;
2127 }
2128out:
2129 ++*pos;
a8b690f9 2130 st->last_pos = *pos;
1da177e4
LT
2131 return rc;
2132}
2133
2134static void tcp_seq_stop(struct seq_file *seq, void *v)
2135{
5799de0b 2136 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2137
2138 switch (st->state) {
1da177e4
LT
2139 case TCP_SEQ_STATE_LISTENING:
2140 if (v != SEQ_START_TOKEN)
9652dc2e 2141 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4 2142 break;
1da177e4
LT
2143 case TCP_SEQ_STATE_ESTABLISHED:
2144 if (v)
9db66bdc 2145 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2146 break;
2147 }
2148}
2149
73cb88ec 2150int tcp_seq_open(struct inode *inode, struct file *file)
1da177e4 2151{
d9dda78b 2152 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
1da177e4 2153 struct tcp_iter_state *s;
52d6f3f1 2154 int err;
1da177e4 2155
52d6f3f1
DL
2156 err = seq_open_net(inode, file, &afinfo->seq_ops,
2157 sizeof(struct tcp_iter_state));
2158 if (err < 0)
2159 return err;
f40c8174 2160
52d6f3f1 2161 s = ((struct seq_file *)file->private_data)->private;
1da177e4 2162 s->family = afinfo->family;
688d1945 2163 s->last_pos = 0;
f40c8174
DL
2164 return 0;
2165}
73cb88ec 2166EXPORT_SYMBOL(tcp_seq_open);
f40c8174 2167
6f8b13bc 2168int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4
LT
2169{
2170 int rc = 0;
2171 struct proc_dir_entry *p;
2172
9427c4b3
DL
2173 afinfo->seq_ops.start = tcp_seq_start;
2174 afinfo->seq_ops.next = tcp_seq_next;
2175 afinfo->seq_ops.stop = tcp_seq_stop;
2176
84841c3c 2177 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
73cb88ec 2178 afinfo->seq_fops, afinfo);
84841c3c 2179 if (!p)
1da177e4
LT
2180 rc = -ENOMEM;
2181 return rc;
2182}
4bc2f18b 2183EXPORT_SYMBOL(tcp_proc_register);
1da177e4 2184
6f8b13bc 2185void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4 2186{
ece31ffd 2187 remove_proc_entry(afinfo->name, net->proc_net);
1da177e4 2188}
4bc2f18b 2189EXPORT_SYMBOL(tcp_proc_unregister);
1da177e4 2190
d4f06873 2191static void get_openreq4(const struct request_sock *req,
aa3a0c8c 2192 struct seq_file *f, int i)
1da177e4 2193{
2e6599cb 2194 const struct inet_request_sock *ireq = inet_rsk(req);
fa76ce73 2195 long delta = req->rsk_timer.expires - jiffies;
1da177e4 2196
5e659e4c 2197 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2198 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
1da177e4 2199 i,
634fb979 2200 ireq->ir_loc_addr,
d4f06873 2201 ireq->ir_num,
634fb979
ED
2202 ireq->ir_rmt_addr,
2203 ntohs(ireq->ir_rmt_port),
1da177e4
LT
2204 TCP_SYN_RECV,
2205 0, 0, /* could print option size, but that is af dependent. */
2206 1, /* timers active (only the expire timer) */
a399a805 2207 jiffies_delta_to_clock_t(delta),
e6c022a4 2208 req->num_timeout,
aa3a0c8c
ED
2209 from_kuid_munged(seq_user_ns(f),
2210 sock_i_uid(req->rsk_listener)),
1da177e4
LT
2211 0, /* non standard timer */
2212 0, /* open_requests have no inode */
d4f06873 2213 0,
652586df 2214 req);
1da177e4
LT
2215}
2216
652586df 2217static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
1da177e4
LT
2218{
2219 int timer_active;
2220 unsigned long timer_expires;
cf533ea5 2221 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2222 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2223 const struct inet_sock *inet = inet_sk(sk);
0536fcc0 2224 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
c720c7e8
ED
2225 __be32 dest = inet->inet_daddr;
2226 __be32 src = inet->inet_rcv_saddr;
2227 __u16 destp = ntohs(inet->inet_dport);
2228 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2229 int rx_queue;
00fd38d9 2230 int state;
1da177e4 2231
6ba8a3b1 2232 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
57dde7f7 2233 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
6ba8a3b1 2234 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2235 timer_active = 1;
463c84b9
ACM
2236 timer_expires = icsk->icsk_timeout;
2237 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2238 timer_active = 4;
463c84b9 2239 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2240 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2241 timer_active = 2;
cf4c6bf8 2242 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2243 } else {
2244 timer_active = 0;
2245 timer_expires = jiffies;
2246 }
2247
00fd38d9
ED
2248 state = sk_state_load(sk);
2249 if (state == TCP_LISTEN)
49d09007
ED
2250 rx_queue = sk->sk_ack_backlog;
2251 else
00fd38d9
ED
2252 /* Because we don't lock the socket,
2253 * we might find a transient negative value.
49d09007
ED
2254 */
2255 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2256
5e659e4c 2257 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
652586df 2258 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
00fd38d9 2259 i, src, srcp, dest, destp, state,
47da8ee6 2260 tp->write_seq - tp->snd_una,
49d09007 2261 rx_queue,
1da177e4 2262 timer_active,
a399a805 2263 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2264 icsk->icsk_retransmits,
a7cb5a49 2265 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2266 icsk->icsk_probes_out,
cf4c6bf8
IJ
2267 sock_i_ino(sk),
2268 atomic_read(&sk->sk_refcnt), sk,
7be87351
SH
2269 jiffies_to_clock_t(icsk->icsk_rto),
2270 jiffies_to_clock_t(icsk->icsk_ack.ato),
463c84b9 2271 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1da177e4 2272 tp->snd_cwnd,
00fd38d9
ED
2273 state == TCP_LISTEN ?
2274 fastopenq->max_qlen :
652586df 2275 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
1da177e4
LT
2276}
2277
cf533ea5 2278static void get_timewait4_sock(const struct inet_timewait_sock *tw,
652586df 2279 struct seq_file *f, int i)
1da177e4 2280{
789f558c 2281 long delta = tw->tw_timer.expires - jiffies;
23f33c2d 2282 __be32 dest, src;
1da177e4 2283 __u16 destp, srcp;
1da177e4
LT
2284
2285 dest = tw->tw_daddr;
2286 src = tw->tw_rcv_saddr;
2287 destp = ntohs(tw->tw_dport);
2288 srcp = ntohs(tw->tw_sport);
2289
5e659e4c 2290 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2291 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
1da177e4 2292 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2293 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
652586df 2294 atomic_read(&tw->tw_refcnt), tw);
1da177e4
LT
2295}
2296
2297#define TMPSZ 150
2298
2299static int tcp4_seq_show(struct seq_file *seq, void *v)
2300{
5799de0b 2301 struct tcp_iter_state *st;
05dbc7b5 2302 struct sock *sk = v;
1da177e4 2303
652586df 2304 seq_setwidth(seq, TMPSZ - 1);
1da177e4 2305 if (v == SEQ_START_TOKEN) {
652586df 2306 seq_puts(seq, " sl local_address rem_address st tx_queue "
1da177e4
LT
2307 "rx_queue tr tm->when retrnsmt uid timeout "
2308 "inode");
2309 goto out;
2310 }
2311 st = seq->private;
2312
079096f1
ED
2313 if (sk->sk_state == TCP_TIME_WAIT)
2314 get_timewait4_sock(v, seq, st->num);
2315 else if (sk->sk_state == TCP_NEW_SYN_RECV)
aa3a0c8c 2316 get_openreq4(v, seq, st->num);
079096f1
ED
2317 else
2318 get_tcp4_sock(v, seq, st->num);
1da177e4 2319out:
652586df 2320 seq_pad(seq, '\n');
1da177e4
LT
2321 return 0;
2322}
2323
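
The seq_printf() calls in get_openreq4(), get_tcp4_sock() and get_timewait4_sock() produce the familiar /proc/net/tcp layout: hex local_address and rem_address columns, the state byte, queue sizes and so on. A hedged userspace example that reads those columns back; it assumes a Linux host exposing /proc/net/tcp, and nothing in it is kernel API:

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	FILE *f = fopen("/proc/net/tcp", "r");
	char line[512];

	if (!f) {
		perror("/proc/net/tcp");
		return 1;
	}
	fgets(line, sizeof(line), f);		/* skip the header line */

	while (fgets(line, sizeof(line), f)) {
		unsigned int laddr, lport, raddr, rport, state;
		struct in_addr in;

		/* Matches the "%4d: %08X:%04X %08X:%04X %02X" prefix above. */
		if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) != 5)
			continue;
		/* The file prints the raw __be32, so storing the parsed value
		 * back reproduces the original byte order on the same host.
		 */
		in.s_addr = laddr;
		printf("%s:%u -> %08X:%u  state %02X\n",
		       inet_ntoa(in), lport, raddr, rport, state);
	}
	fclose(f);
	return 0;
}
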
73cb88ec
AV
2324static const struct file_operations tcp_afinfo_seq_fops = {
2325 .owner = THIS_MODULE,
2326 .open = tcp_seq_open,
2327 .read = seq_read,
2328 .llseek = seq_lseek,
2329 .release = seq_release_net
2330};
2331
1da177e4 2332static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4
LT
2333 .name = "tcp",
2334 .family = AF_INET,
73cb88ec 2335 .seq_fops = &tcp_afinfo_seq_fops,
9427c4b3
DL
2336 .seq_ops = {
2337 .show = tcp4_seq_show,
2338 },
1da177e4
LT
2339};
2340
2c8c1e72 2341static int __net_init tcp4_proc_init_net(struct net *net)
757764f6
PE
2342{
2343 return tcp_proc_register(net, &tcp4_seq_afinfo);
2344}
2345
2c8c1e72 2346static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6
PE
2347{
2348 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2349}
2350
2351static struct pernet_operations tcp4_net_ops = {
2352 .init = tcp4_proc_init_net,
2353 .exit = tcp4_proc_exit_net,
2354};
2355
1da177e4
LT
2356int __init tcp4_proc_init(void)
2357{
757764f6 2358 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2359}
2360
2361void tcp4_proc_exit(void)
2362{
757764f6 2363 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2364}
2365#endif /* CONFIG_PROC_FS */
2366
2367struct proto tcp_prot = {
2368 .name = "TCP",
2369 .owner = THIS_MODULE,
2370 .close = tcp_close,
2371 .connect = tcp_v4_connect,
2372 .disconnect = tcp_disconnect,
463c84b9 2373 .accept = inet_csk_accept,
1da177e4
LT
2374 .ioctl = tcp_ioctl,
2375 .init = tcp_v4_init_sock,
2376 .destroy = tcp_v4_destroy_sock,
2377 .shutdown = tcp_shutdown,
2378 .setsockopt = tcp_setsockopt,
2379 .getsockopt = tcp_getsockopt,
4b9d07a4 2380 .keepalive = tcp_set_keepalive,
1da177e4 2381 .recvmsg = tcp_recvmsg,
7ba42910
CG
2382 .sendmsg = tcp_sendmsg,
2383 .sendpage = tcp_sendpage,
1da177e4 2384 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 2385 .release_cb = tcp_release_cb,
ab1e0a13
ACM
2386 .hash = inet_hash,
2387 .unhash = inet_unhash,
2388 .get_port = inet_csk_get_port,
1da177e4 2389 .enter_memory_pressure = tcp_enter_memory_pressure,
c9bee3b7 2390 .stream_memory_free = tcp_stream_memory_free,
1da177e4 2391 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 2392 .orphan_count = &tcp_orphan_count,
1da177e4
LT
2393 .memory_allocated = &tcp_memory_allocated,
2394 .memory_pressure = &tcp_memory_pressure,
a4fe34bf 2395 .sysctl_mem = sysctl_tcp_mem,
1da177e4
LT
2396 .sysctl_wmem = sysctl_tcp_wmem,
2397 .sysctl_rmem = sysctl_tcp_rmem,
2398 .max_header = MAX_TCP_HEADER,
2399 .obj_size = sizeof(struct tcp_sock),
3ab5aee7 2400 .slab_flags = SLAB_DESTROY_BY_RCU,
6d6ee43e 2401 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 2402 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 2403 .h.hashinfo = &tcp_hashinfo,
7ba42910 2404 .no_autobind = true,
543d9cfe
ACM
2405#ifdef CONFIG_COMPAT
2406 .compat_setsockopt = compat_tcp_setsockopt,
2407 .compat_getsockopt = compat_tcp_getsockopt,
d1a4c0b3 2408#endif
c1e64e29 2409 .diag_destroy = tcp_abort,
1da177e4 2410};
4bc2f18b 2411EXPORT_SYMBOL(tcp_prot);
1da177e4 2412
bdbbb852
ED
2413static void __net_exit tcp_sk_exit(struct net *net)
2414{
2415 int cpu;
2416
2417 for_each_possible_cpu(cpu)
2418 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2419 free_percpu(net->ipv4.tcp_sk);
2420}
2421
046ee902
DL
2422static int __net_init tcp_sk_init(struct net *net)
2423{
fee83d09 2424 int res, cpu, cnt;
bdbbb852
ED
2425
2426 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2427 if (!net->ipv4.tcp_sk)
2428 return -ENOMEM;
2429
2430 for_each_possible_cpu(cpu) {
2431 struct sock *sk;
2432
2433 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2434 IPPROTO_TCP, net);
2435 if (res)
2436 goto fail;
a9d6532b 2437 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
bdbbb852
ED
2438 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2439 }
49213555 2440
5d134f1c 2441 net->ipv4.sysctl_tcp_ecn = 2;
49213555
DB
2442 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2443
b0f9ca53 2444 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
6b58e0a5 2445 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
05cbc0db 2446 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
046ee902 2447
13b287e8 2448 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
9bd6861b 2449 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
b840d15d 2450 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
13b287e8 2451
6fa25166 2452 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
7c083ecb 2453 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
0aca737d 2454 net->ipv4.sysctl_tcp_syncookies = 1;
1043e25f 2455 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
ae5c3f40 2456 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
c6214a97 2457 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
c402d9be 2458 net->ipv4.sysctl_tcp_orphan_retries = 0;
1e579caa 2459 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
4979f2d9 2460 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
56ab6b93 2461 net->ipv4.sysctl_tcp_tw_reuse = 0;
12ed8244 2462
fee83d09 2463 cnt = tcp_hashinfo.ehash_mask + 1;
1946e672 2464 net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
fee83d09 2465 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
1946e672
HY
2466 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2467
fee83d09
HY
2468 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2469
49213555 2470 return 0;
bdbbb852
ED
2471fail:
2472 tcp_sk_exit(net);
2473
2474 return res;
b099ce26
EB
2475}
2476
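
tcp_sk_init() above seeds the per-namespace defaults (keepalive timers, SYN/SYNACK retries, fin timeout, syncookies and the rest); at runtime the same knobs are visible under /proc/sys/net/ipv4/. A small userspace sketch that reads a few of them back, assuming a Linux host with procfs mounted (the helper name show_sysctl is invented):

#include <stdio.h>

static void show_sysctl(const char *name)
{
	char path[128], buf[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/net/ipv4/%s", name);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s = %s", name, buf);	/* buf keeps its newline */
	fclose(f);
}

int main(void)
{
	show_sysctl("tcp_syn_retries");		/* seeded from TCP_SYN_RETRIES */
	show_sysctl("tcp_fin_timeout");		/* seeded from TCP_FIN_TIMEOUT */
	show_sysctl("tcp_keepalive_time");	/* seeded from TCP_KEEPALIVE_TIME */
	return 0;
}
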
2477static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2478{
1946e672 2479 inet_twsk_purge(&tcp_hashinfo, AF_INET);
046ee902
DL
2480}
2481
2482static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
2483 .init = tcp_sk_init,
2484 .exit = tcp_sk_exit,
2485 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
2486};
2487
9b0f976f 2488void __init tcp_v4_init(void)
1da177e4 2489{
6a1b3054 2490 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 2491 panic("Failed to create the TCP control socket.\n");
1da177e4 2492}