/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, struct tcphdr *th);
#else
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	return NULL;
}
#endif

struct inet_hashinfo tcp_hashinfo;

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);

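/* A rough sanity check on the 80 Mbit/sec figure above: at 80 Mbit/s
 * (~10 Mbyte/s) the 2^32-byte sequence space takes about 430 seconds to
 * wrap, comfortably longer than the 60-second TIME-WAIT period, so stray
 * segments from the old incarnation cannot alias into the new one's
 * window. The "+ 65535 + 2" offset on write_seq likewise keeps the new
 * ISN above anything the old connection could still have in flight.
 */
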
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	__be32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->inet_sport, usin->sin_port, sk, 1);
	if (tmp < 0) {
		if (tmp == -ENETUNREACH)
			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return tmp;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->inet_saddr)
		inet->inet_saddr = rt->rt_src;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);
		/*
		 * VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state
		 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
		 * when trying new connection.
		 */
		if (peer) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
				tp->rx_opt.ts_recent = peer->tcp_ts;
			}
		}
	}

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet->opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, IPPROTO_TCP,
				inet->inet_sport, inet->inet_dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}

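/* The connect path above, in brief (a descriptive note, not upstream text):
 * route the destination (honouring any SRR first hop), pick or confirm the
 * source address, optionally inherit TIME-WAIT timestamps from the peer
 * cache when tcp_tw_recycle is on, move to SYN-SENT, let
 * inet_hash_connect() choose an ephemeral port and hash the socket,
 * re-route with the final port pair, choose the ISN, and finally
 * tcp_connect() builds and transmits the SYN.
 */
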
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

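/* For example (illustrative numbers, not from upstream): if the cached
 * path MTU is 1500 and a router returns ICMP_FRAG_NEEDED with mtu=1400,
 * update_pmtu() lowers the route's MTU, tcp_sync_mss() shrinks the MSS
 * accordingly, and tcp_simple_retransmit() resends the queued segments
 * at the new size immediately instead of waiting for an RTO.
 */
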
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
					 icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else if (sock_owned_by_user(sk)) {
			/* RTO revert clocked out retransmission,
			 * but socket is locked. Will defer. */
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  HZ/20, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

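		/* Illustration (numbers are hypothetical): with a base RTO of
		 * 200 ms and icsk_backoff previously 2 (timer armed for
		 * 800 ms), the revert above drops backoff to 1, i.e. a 400 ms
		 * RTO. If 300 ms have already elapsed since the head segment
		 * was sent, the timer is re-armed for the remaining 100 ms;
		 * if the reverted RTO has already expired, we retransmit at
		 * once (or shortly after, if the socket is user-locked).
		 */
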
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, f.e., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}

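/* A note on the two branches above: with CHECKSUM_PARTIAL the hardware
 * finishes the job, so only the (inverted) pseudo-header sum is seeded
 * into th->check, and csum_start/csum_offset tell the device where to
 * fold the final checksum. Otherwise the full checksum is computed in
 * software: csum_partial() over the TCP header combined with the payload
 * sum already accumulated in skb->csum.
 */
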
int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	skb->ip_summed = CHECKSUM_PARTIAL;
	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
	return 0;
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on parameters
 *		arriving with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	net = dev_net(skb_dst(skb)->dev);
	ip_send_reply(net->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
}

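/* The sequence numbers above follow RFC 793's reset generation rules:
 * if the offending segment carried an ACK, the RST reuses that ACK value
 * as its own sequence number and needs no ACK bit; otherwise the RST
 * acknowledges everything the segment consumed (SYN and FIN each count
 * for one) so the peer will accept it.
 */
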
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;

	ip_send_reply(net->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent,
			0,
			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      struct request_values *rvp)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, rvp);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	dst_release(dst);
	return err;
}

static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
			     struct request_values *rvp)
{
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
	return tcp_v4_send_synack(sk, NULL, req, rvp);
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

static void syn_flood_warning(const struct sk_buff *skb)
{
	const char *msg;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies)
		msg = "Sending cookies";
	else
#endif
		msg = "Dropping request";

	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
		ntohs(tcp_hdr(skb)->dest), msg);
}

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
					      struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

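/* Worth noting (descriptive, not upstream text): ip_options_echo() builds
 * the mirrored option set for the reply direction (e.g. a recorded source
 * route comes back reversed), so the SYN-ACK can retrace the path the SYN
 * took. GFP_ATOMIC is required here because this runs in softirq context.
 */
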
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
static struct tcp_md5sig_key *
			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
		return NULL;
	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr)
			return &tp->md5sig_info->keys4[i].base;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
}

EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
		      u8 *newkey, u8 newkeylen)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp4_md5sig_key *keys;

	key = tcp_v4_md5_do_lookup(sk, addr);
	if (key) {
		/* Pre-existing entry - just update that one. */
		kfree(key->key);
		key->key = newkey;
		key->keylen = newkeylen;
	} else {
		struct tcp_md5sig_info *md5sig;

		if (!tp->md5sig_info) {
			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
						  GFP_ATOMIC);
			if (!tp->md5sig_info) {
				kfree(newkey);
				return -ENOMEM;
			}
			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		}
		if (tcp_alloc_md5sig_pool(sk) == NULL) {
			kfree(newkey);
			return -ENOMEM;
		}
		md5sig = tp->md5sig_info;

		if (md5sig->alloced4 == md5sig->entries4) {
			keys = kmalloc((sizeof(*keys) *
					(md5sig->entries4 + 1)), GFP_ATOMIC);
			if (!keys) {
				kfree(newkey);
				tcp_free_md5sig_pool();
				return -ENOMEM;
			}

			if (md5sig->entries4)
				memcpy(keys, md5sig->keys4,
				       sizeof(*keys) * md5sig->entries4);

			/* Free old key list, and reference new one */
			kfree(md5sig->keys4);
			md5sig->keys4 = keys;
			md5sig->alloced4++;
		}
		md5sig->entries4++;
		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
	}
	return 0;
}

EXPORT_SYMBOL(tcp_v4_md5_do_add);

static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
			       u8 *newkey, u8 newkeylen)
{
	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
				 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr) {
			/* Free the key */
			kfree(tp->md5sig_info->keys4[i].base.key);
			tp->md5sig_info->entries4--;

			if (tp->md5sig_info->entries4 == 0) {
				kfree(tp->md5sig_info->keys4);
				tp->md5sig_info->keys4 = NULL;
				tp->md5sig_info->alloced4 = 0;
			} else if (tp->md5sig_info->entries4 != i) {
				/* Need to do some manipulation */
				memmove(&tp->md5sig_info->keys4[i],
					&tp->md5sig_info->keys4[i+1],
					(tp->md5sig_info->entries4 - i) *
					 sizeof(struct tcp4_md5sig_key));
			}
			tcp_free_md5sig_pool();
			return 0;
		}
	}
	return -ENOENT;
}

EXPORT_SYMBOL(tcp_v4_md5_do_del);

static void tcp_v4_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Free each key, then the set of key keys,
	 * the crypto element, and then decrement our
	 * hold on the last resort crypto.
	 */
	if (tp->md5sig_info->entries4) {
		int i;
		for (i = 0; i < tp->md5sig_info->entries4; i++)
			kfree(tp->md5sig_info->keys4[i].base.key);
		tp->md5sig_info->entries4 = 0;
		tcp_free_md5sig_pool();
	}
	if (tp->md5sig_info->keys4) {
		kfree(tp->md5sig_info->keys4);
		tp->md5sig_info->keys4 = NULL;
		tp->md5sig_info->alloced4 = 0;
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 *newkey;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
		if (!tcp_sk(sk)->md5sig_info)
			return -ENOENT;
		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
	}

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	if (!tcp_sk(sk)->md5sig_info) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct tcp_md5sig_info *p;

		p = kzalloc(sizeof(*p), sk->sk_allocation);
		if (!p)
			return -EINVAL;

		tp->md5sig_info = p;
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
	}

	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
	if (!newkey)
		return -ENOMEM;
	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
				 newkey, cmd.tcpm_keylen);
}

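/* From userspace this is reached via setsockopt(). A minimal sketch,
 * assuming a connected IPv4 socket fd and a struct sockaddr_in "peer"
 * (illustrative, not part of this file):
 *
 *	struct tcp_md5sig md5sig;
 *
 *	memset(&md5sig, 0, sizeof(md5sig));
 *	memcpy(&md5sig.tcpm_addr, &peer, sizeof(peer));
 *	md5sig.tcpm_keylen = strlen("secret");
 *	memcpy(md5sig.tcpm_key, "secret", md5sig.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5sig, sizeof(md5sig));
 *
 * A zero tcpm_keylen deletes the key for that peer, as handled above.
 */
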
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

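/* RFC 2385 defines the MD5 digest over, in order: this pseudo-header,
 * the TCP header with its checksum field zeroed, the segment data, and
 * finally the key itself - which is exactly the sequence of
 * tcp_v4_md5_hash_pseudoheader(), tcp_md5_hash_header(),
 * tcp_md5_hash_skb_data() and tcp_md5_hash_key() calls used below.
 */
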
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			struct sock *sk, struct request_sock *req,
			struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->loc_addr;
		daddr = inet_rsk(req)->rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	__u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return 0;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return 1;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return 1;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		if (net_ratelimit()) {
			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
			       &iph->saddr, ntohs(th->source),
			       &iph->daddr, ntohs(th->dest),
			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
		}
		return 1;
	}
	return 0;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	 = sizeof(struct tcp_timewait_sock),
	.twsk_unique	 = tcp_twsk_unique,
	.twsk_destructor = tcp_twsk_destructor,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_extend_values tmp_ext;
	struct tcp_options_received tmp_opt;
	u8 *hash_location;
	struct request_sock *req;
	struct inet_request_sock *ireq;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = NULL;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
		if (net_ratelimit())
			syn_flood_warning(skb);
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

	if (tmp_opt.cookie_plus > 0 &&
	    tmp_opt.saw_tstamp &&
	    !tp->rx_opt.cookie_out_never &&
	    (sysctl_tcp_cookie_size > 0 ||
	     (tp->cookie_values != NULL &&
	      tp->cookie_values->cookie_desired > 0))) {
		u8 *c;
		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
			goto drop_and_release;

		/* Secret recipe starts with IP addresses */
		*mess++ ^= (__force u32)daddr;
		*mess++ ^= (__force u32)saddr;

		/* plus variable length Initiator Cookie */
		c = (u8 *)mess;
		while (l-- > 0)
			*c++ ^= *hash_location++;

#ifdef CONFIG_SYN_COOKIES
		want_cookie = 0;	/* not our kind of cookie */
#endif
		tmp_ext.cookie_out_never = 0; /* false */
		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
	} else if (!tp->rx_opt.cookie_in_always) {
		/* redundant indications, but ensure initialization. */
		tmp_ext.cookie_out_never = 1; /* true */
		tmp_ext.cookie_plus = 0;
	} else {
		goto drop_and_release;
	}
	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		req->cookie_ts = tmp_opt.tstamp_ok;
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (tcp_v4_send_synack(sk, dst, req,
			       (struct request_values *)&tmp_ext) ||
	    want_cookie)
		goto drop_and_free;

	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}

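/* A summary of the SYN handling above (annotation, not upstream text):
 * broadcast/multicast SYNs are ignored; a full SYN queue triggers
 * syncookies (if enabled) instead of allocating state; otherwise a
 * request_sock is created, options parsed, and the ISN comes either
 * from cookie_v4_init_sequence() (stateless) or from the secure hash,
 * optionally PAWS-checked against the peer's cached TIME-WAIT timestamps
 * when tcp_tw_recycle is set. The SYN-ACK is then sent and the request
 * hashed into the SYN queue with the initial timeout.
 */
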
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(newsk, dst);

	newtp			= tcp_sk(newsk);
	newinet			= inet_sk(newsk);
	ireq			= inet_rsk(req);
	newinet->inet_daddr	= ireq->rmt_addr;
	newinet->inet_rcv_saddr = ireq->loc_addr;
	newinet->inet_saddr	= ireq->loc_addr;
	newinet->opt		= ireq->opt;
	ireq->opt		= NULL;
	newinet->mc_index	= inet_iif(skb);
	newinet->mc_ttl		= ip_hdr(skb)->ttl;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (newinet->opt)
		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
		if (newkey != NULL)
			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
					  newkey, key->keylen);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	__inet_hash_nolisten(newsk, NULL);
	__inet_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr,
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0);

	if (skb->len <= 76) {
		return __skb_checksum_complete(skb);
	}
	return 0;
}

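/* In other words (a descriptive note): if the device already summed the
 * packet (CHECKSUM_COMPLETE) we just verify that sum against the
 * pseudo-header. Failing that, we seed skb->csum with the pseudo-header
 * sum and fully verify only short packets (<= 76 bytes) right away; for
 * longer ones the check is deferred, since the data gets checksummed
 * anyway while being copied to userspace.
 */
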
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		sock_rps_save_rxhash(sk, skb->rxhash);
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb->rxhash);

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}

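/* Dispatch summary for tcp_v4_do_rcv() (annotation, not upstream text):
 * ESTABLISHED sockets take the fast path straight into
 * tcp_rcv_established(); LISTEN sockets go through tcp_v4_hnd_req(),
 * which may hand back a child socket to be processed with
 * tcp_child_process(); everything else falls through to the generic
 * tcp_rcv_state_process(). Any failure sends a RST to the offender.
 */
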
/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto bad_packet;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else if (unlikely(sk_add_backlog(sk, skb))) {
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

1da177e4
LT
1764/* VJ's idea. Save last timestamp seen from this destination
1765 * and hold it at least for normal timewait interval to use for duplicate
1766 * segment detection in subsequent connections, before they enter synchronized
1767 * state.
1768 */
1769
1770int tcp_v4_remember_stamp(struct sock *sk)
1771{
1772 struct inet_sock *inet = inet_sk(sk);
1773 struct tcp_sock *tp = tcp_sk(sk);
1774 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1775 struct inet_peer *peer = NULL;
1776 int release_it = 0;
1777
c720c7e8
ED
1778 if (!rt || rt->rt_dst != inet->inet_daddr) {
1779 peer = inet_getpeer(inet->inet_daddr, 1);
1da177e4
LT
1780 release_it = 1;
1781 } else {
1782 if (!rt->peer)
1783 rt_bind_peer(rt, 1);
1784 peer = rt->peer;
1785 }
1786
1787 if (peer) {
1788 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
2c1409a0
ED
1789 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1790 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1791 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1da177e4
LT
1792 peer->tcp_ts = tp->rx_opt.ts_recent;
1793 }
1794 if (release_it)
1795 inet_putpeer(peer);
1796 return 1;
1797 }
1798
1799 return 0;
1800}
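
/*
 * Worked example for the update test above (values illustrative): with
 * peer->tcp_ts == 1000 and tp->rx_opt.ts_recent == 990, the s32
 * difference is +10, so the first clause fails (the cached stamp is
 * newer); the cache is then overwritten only once it is more than
 * TCP_PAWS_MSL seconds old. The signed subtraction keeps the
 * comparison correct across timestamp wraparound.
 */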

int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
		     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
			peer->tcp_ts = tcptw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.remember_stamp	   = tcp_v4_remember_stamp,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
};
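
/*
 * ipv4_specific is installed as icsk->icsk_af_ops by tcp_v4_init_sock()
 * below; the address-family-independent TCP code reaches the IPv4 layer
 * only through this table, e.g. (sketch):
 *
 *	struct inet_connection_sock *icsk = inet_csk(sk);
 *
 *	if (icsk->icsk_af_ops->rebuild_header(sk))
 *		...
 *
 * which keeps tcp_output.c and tcp_input.c free of IPv4/IPv6 specifics.
 */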

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_add	= tcp_v4_md5_add_func,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: quite a few things are already set to zero explicitly by
 * sk_alloc(), so they need not be initialized again here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them. -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for a discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sysctl_tcp_reordering;
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
	tp->af_specific = &tcp_sock_ipv4_specific;
#endif

	/* TCP Cookie Transactions */
	if (sysctl_tcp_cookie_size > 0) {
		/* Default, cookies without s_data_payload. */
		tp->cookie_values =
			kzalloc(sizeof(*tp->cookie_values),
				sk->sk_allocation);
		if (tp->cookie_values != NULL)
			kref_init(&tp->cookie_values->kref);
	}
	/* Presumed zeroed, in order of appearance:
	 *	cookie_in_always, cookie_out_never,
	 *	s_data_constant, s_data_in, s_data_out
	 */
	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	local_bh_disable();
	percpu_counter_inc(&tcp_sockets_allocated);
	local_bh_enable();

	return 0;
}
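
/*
 * tcp_v4_init_sock() is reached through the struct proto hook: socket
 * creation (inet_create()) invokes sk->sk_prot->init(sk), and tcp_prot
 * below sets .init = tcp_v4_init_sock. A fresh socket therefore starts
 * in TCP_CLOSE with snd_cwnd = 2 and an effectively infinite ssthresh,
 * until the handshake and congestion control take over.
 */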

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Clean up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_v4_clear_md5_list(sk);
		kfree(tp->md5sig_info);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Clean up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean up the prequeue; it really must be empty by now. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/*
	 * If a sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	/* TCP Cookie Transactions */
	if (tp->cookie_values != NULL) {
		kref_put(&tp->cookie_values->kref,
			 tcp_cookie_values_release);
		tp->cookie_values = NULL;
	}

	percpu_counter_dec(&tcp_sockets_allocated);
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
{
	return hlist_nulls_empty(head) ? NULL :
		list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return !is_a_nulls(tw->tw_node.next) ?
		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}

/*
 * Get the next listener socket following cur. If cur is NULL, get the
 * first socket starting from the bucket given in st->bucket; when
 * st->bucket is zero, the very first socket in the hash table is
 * returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			st->offset = 0;
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid = sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state = TCP_SEQ_STATE_OPENREQ;
			st->sbucket = 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
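
/*
 * The listener walk above is a resumable three-level iteration
 * (sketch):
 *
 *	for each listening_hash bucket (< INET_LHTABLE_SIZE)
 *		for each listening socket in the bucket
 *			for each request_sock on its SYN queue
 *
 * st->state toggles between TCP_SEQ_STATE_LISTENING and
 * TCP_SEQ_STATE_OPENREQ so the next call can resume exactly where the
 * previous one stopped; syn_wait_lock is held only while a SYN queue
 * is being traversed.
 */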

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline int empty_bucket(struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
}

/*
 * Get the first established socket starting from the bucket given in
 * st->bucket. If st->bucket is zero, the very first socket in the hash
 * is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		struct inet_timewait_sock *tw;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket].twchain) {
			if (tw->tw_family != st->family ||
			    !net_eq(twsk_net(tw), net)) {
				continue;
			}
			rc = tw;
			goto out;
		}
		spin_unlock_bh(lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* Look for the next non-empty bucket */
		st->offset = 0;
		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
				empty_bucket(st))
			;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			return NULL;

		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
	} else
		sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
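
/*
 * tcp_seek_last_pos() exists to make sequential reads of /proc/net/tcp
 * cheap: instead of replaying the whole iteration from bucket 0 on
 * every read() (O(n) per call, O(n^2) for a full dump), it re-enters
 * the bucket recorded in st->bucket and skips st->offset entries
 * within it. tcp_seq_start() below falls back to the full
 * tcp_get_idx() walk only when the saved position does not match *pos.
 */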

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family = afinfo->family;
	s->last_pos = 0;
	return 0;
}

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_fops.open		= tcp_seq_open;
	afinfo->seq_fops.read		= seq_read;
	afinfo->seq_fops.llseek		= seq_lseek;
	afinfo->seq_fops.release	= seq_release_net;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     &afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
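
/*
 * A hypothetical user of this helper (names invented for illustration)
 * supplies little more than a ->show method and a name; the seq_file
 * plumbing is filled in by tcp_proc_register(). Compare tcp4_seq_afinfo
 * below for the real IPv4 instance:
 *
 *	static struct tcp_seq_afinfo my_seq_afinfo = {
 *		.name	  = "tcp_example",
 *		.family	  = AF_INET,
 *		.seq_fops = { .owner = THIS_MODULE, },
 *		.seq_ops  = { .show  = my_seq_show, },
 *	};
 *
 *	err = tcp_proc_register(net, &my_seq_afinfo);
 */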

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	proc_net_remove(net, afinfo->name);
}

static void get_openreq4(struct sock *sk, struct request_sock *req,
			 struct seq_file *f, int i, int uid, int *len)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		   " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
		   i,
		   ireq->loc_addr,
		   ntohs(inet_sk(sk)->inet_sport),
		   ireq->rmt_addr,
		   ntohs(ireq->rmt_port),
		   TCP_SYN_RECV,
		   0, 0, /* could print option size, but that is af dependent. */
		   1,    /* timers active (only the expire timer) */
		   jiffies_to_clock_t(ttd),
		   req->retrans,
		   uid,
		   0,  /* non-standard timer */
		   0,  /* open_requests have no inode */
		   atomic_read(&sk->sk_refcnt),
		   req,
		   len);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	if (sk->sk_state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/*
		 * Because we don't lock the socket, we might find a
		 * transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sk),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
		len);
}
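
/*
 * Each socket becomes one fixed-width row of /proc/net/tcp. A made-up
 * but representative line (little-endian host; exact column widths
 * follow the format string above):
 *
 *	0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 c0ffee00 100 0 0 2 -1
 *
 * Addresses are the raw __be32 printed in hex (0100007F is 127.0.0.1 on
 * little-endian), ports are host-order hex, and the trailing fields
 * mirror the seq_printf() arguments: retransmits, uid, probes, inode,
 * refcount, socket pointer, rto, ato, quick/pingpong, cwnd, ssthresh.
 */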

static void get_timewait4_sock(struct inet_timewait_sock *tw,
			       struct seq_file *f, int i, int *len)
{
	__be32 dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		   " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
		   i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		   3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		   atomic_read(&tw->tw_refcnt), tw, len);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	int len;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, seq, st->num, &len);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, seq, st->num, &len);
		break;
	}
	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
	return 0;
}

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= {
		.owner		= THIS_MODULE,
	},
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct iphdr *iph = skb_gro_network_header(skb);

	switch (skb->ip_summed) {
	case CHECKSUM_COMPLETE:
		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
				  skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			break;
		}

		/* fall through */
	case CHECKSUM_NONE:
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

	return tcp_gro_receive(head, skb);
}
EXPORT_SYMBOL(tcp4_gro_receive);

int tcp4_gro_complete(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
				  iph->saddr, iph->daddr, 0);
	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;

	return tcp_gro_complete(skb);
}
EXPORT_SYMBOL(tcp4_gro_complete);
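
/*
 * GRO checksum policy, in short: a CHECKSUM_COMPLETE packet whose
 * pseudo-header sum verifies is promoted to CHECKSUM_UNNECESSARY and
 * may be aggregated; anything unverified is flushed to the normal
 * receive path rather than risk merging a corrupt segment.
 * tcp4_gro_complete() then marks the merged skb as GSO (SKB_GSO_TCPV4)
 * and seeds th->check with the pseudo-header sum so that later
 * segmentation or transmit checksum offload can complete it.
 */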

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
};
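
/*
 * tcp_prot itself is registered elsewhere (af_inet.c calls
 * proto_register() and lists it in inetsw_array), so everything in
 * this file is ultimately reached through these callbacks: .init at
 * socket creation, .backlog_rcv from release_sock(), .destroy on
 * close.
 */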

static int __net_init tcp_sk_init(struct net *net)
{
	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
}

static void __net_exit tcp_sk_exit(struct net *net)
{
	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}

EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_tcp_low_latency);