1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
55#include <linux/config.h>
56
57#include <linux/types.h>
58#include <linux/fcntl.h>
59#include <linux/module.h>
60#include <linux/random.h>
61#include <linux/cache.h>
62#include <linux/jhash.h>
63#include <linux/init.h>
64#include <linux/times.h>
65
66#include <net/icmp.h>
67#include <net/inet_hashtables.h>
68#include <net/tcp.h>
69#include <net/ipv6.h>
70#include <net/inet_common.h>
71#include <net/xfrm.h>
72
73#include <linux/inet.h>
74#include <linux/ipv6.h>
75#include <linux/stddef.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78
79extern int sysctl_ip_dynaddr;
80int sysctl_tcp_tw_reuse;
81int sysctl_tcp_low_latency;
82
83/* Check TCP sequence numbers in ICMP packets. */
84#define ICMP_MIN_LENGTH 8
85
86/* Socket used for sending RSTs */
87static struct socket *tcp_socket;
88
89void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90 struct sk_buff *skb);
91
92struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
97 .port_rover = 1024 - 1,
98};
99
100/*
101 * This array holds the first and last local port number.
102 * For high-usage systems, use sysctl to change this to
103 * 32768-61000
104 */
105int sysctl_local_port_range[2] = { 1024, 4999 };
106
107static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
108{
109 const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
110 struct sock *sk2;
111 struct hlist_node *node;
112 int reuse = sk->sk_reuse;
113
114 sk_for_each_bound(sk2, node, &tb->owners) {
115 if (sk != sk2 &&
116 !tcp_v6_ipv6only(sk2) &&
117 (!sk->sk_bound_dev_if ||
118 !sk2->sk_bound_dev_if ||
119 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120 if (!reuse || !sk2->sk_reuse ||
121 sk2->sk_state == TCP_LISTEN) {
122 const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
123 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124 sk2_rcv_saddr == sk_rcv_saddr)
125 break;
126 }
127 }
128 }
129 return node != NULL;
130}
131
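/* Illustrative summary (not part of the original source): two sockets
 * conflict on a local port only if they are visible on the same device
 * (either may be unbound), SO_REUSEADDR does not cover both of them
 * (or one of them is listening), and their receive addresses overlap
 * (either is a wildcard, or they are equal). For example, listeners
 * bound to 192.168.0.1:80 and 192.168.0.2:80 coexist, while a bind to
 * 0.0.0.0:80 conflicts with both.
 */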
132/* Obtain a reference to a local port for the given sock,
133 * if snum is zero it means select any available local port.
134 */
135static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
136{
137 struct inet_bind_hashbucket *head;
138 struct hlist_node *node;
139 struct inet_bind_bucket *tb;
140 int ret;
141
142 local_bh_disable();
143 if (!snum) {
144 int low = sysctl_local_port_range[0];
145 int high = sysctl_local_port_range[1];
146 int remaining = (high - low) + 1;
147 int rover;
148
149 spin_lock(&tcp_hashinfo.portalloc_lock);
150 if (tcp_hashinfo.port_rover < low)
151 rover = low;
152 else
153 rover = tcp_hashinfo.port_rover;
154 do {
155 rover++;
156 if (rover > high)
157 rover = low;
158 head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
159 spin_lock(&head->lock);
160 inet_bind_bucket_for_each(tb, node, &head->chain)
161 if (tb->port == rover)
162 goto next;
163 break;
164 next:
165 spin_unlock(&head->lock);
166 } while (--remaining > 0);
167 tcp_hashinfo.port_rover = rover;
168 spin_unlock(&tcp_hashinfo.portalloc_lock);
169
170 /* Exhausted local port range during search? It is not
171 * possible for us to be holding one of the bind hash
172 * locks if this test triggers, because if 'remaining'
173 * drops to zero, we broke out of the do/while loop at
174 * the top level, not from the 'break;' statement.
175 */
176 ret = 1;
177 if (unlikely(remaining <= 0))
178 goto fail;
179
180 /* OK, here is the one we will use. HEAD is
181 * non-NULL and we hold its mutex.
182 */
183 snum = rover;
184 } else {
185 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
186 spin_lock(&head->lock);
187 inet_bind_bucket_for_each(tb, node, &head->chain)
188 if (tb->port == snum)
189 goto tb_found;
190 }
191 tb = NULL;
192 goto tb_not_found;
193tb_found:
194 if (!hlist_empty(&tb->owners)) {
195 if (sk->sk_reuse > 1)
196 goto success;
197 if (tb->fastreuse > 0 &&
198 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
199 goto success;
200 } else {
201 ret = 1;
202 if (tcp_bind_conflict(sk, tb))
203 goto fail_unlock;
204 }
205 }
206tb_not_found:
207 ret = 1;
208 if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
209 goto fail_unlock;
210 if (hlist_empty(&tb->owners)) {
211 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
212 tb->fastreuse = 1;
213 else
214 tb->fastreuse = 0;
215 } else if (tb->fastreuse &&
216 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
217 tb->fastreuse = 0;
218success:
219 if (!inet_sk(sk)->bind_hash)
220 inet_bind_hash(sk, tb, snum);
221 BUG_TRAP(inet_sk(sk)->bind_hash == tb);
222 ret = 0;
223
224fail_unlock:
225 spin_unlock(&head->lock);
226fail:
227 local_bh_enable();
228 return ret;
229}
230
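/* Illustrative sketch of the rover search above, assuming the default
 * sysctl_local_port_range of 1024-4999:
 *
 *	int remaining = (high - low) + 1;
 *	do {
 *		if (++rover > high)
 *			rover = low;
 *		// look up rover in tcp_hashinfo.bhash;
 *		// take it if no inet_bind_bucket holds that port
 *	} while (--remaining > 0);
 *
 * Each port is probed at most once per call, so exhaustion is detected
 * exactly when 'remaining' reaches zero.
 */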
231static void tcp_v4_hash(struct sock *sk)
232{
233 inet_hash(&tcp_hashinfo, sk);
234}
235
236void tcp_unhash(struct sock *sk)
237{
238 inet_unhash(&tcp_hashinfo, sk);
239}
240
241static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
242{
243 return secure_tcp_sequence_number(skb->nh.iph->daddr,
244 skb->nh.iph->saddr,
245 skb->h.th->dest,
246 skb->h.th->source);
247}
248
249/* called with local bh disabled */
250static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
251 struct inet_timewait_sock **twp)
252{
253 struct inet_sock *inet = inet_sk(sk);
254 u32 daddr = inet->rcv_saddr;
255 u32 saddr = inet->daddr;
256 int dif = sk->sk_bound_dev_if;
257 INET_ADDR_COOKIE(acookie, saddr, daddr)
258 const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
259 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
260 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
261 struct sock *sk2;
262 const struct hlist_node *node;
263 struct inet_timewait_sock *tw;
264
265 write_lock(&head->lock);
266
267 /* Check TIME-WAIT sockets first. */
268 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
269 tw = inet_twsk(sk2);
270
271 if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
272 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
273 struct tcp_sock *tp = tcp_sk(sk);
274
275 /* With PAWS, it is safe from the viewpoint
276 of data integrity. Even without PAWS it
277 is safe provided sequence spaces do not
278 overlap i.e. at data rates <= 80Mbit/sec.
279
280 Actually, the idea is close to VJ's one,
281 only timestamp cache is held not per host,
282 but per port pair and TW bucket is used
283 as state holder.
284
285 If TW bucket has been already destroyed we
286 fall back to VJ's scheme and use initial
287 timestamp retrieved from peer table.
288 */
289 if (tcptw->tw_ts_recent_stamp &&
290 (!twp || (sysctl_tcp_tw_reuse &&
291 xtime.tv_sec -
292 tcptw->tw_ts_recent_stamp > 1))) {
293 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
294 if (tp->write_seq == 0)
295 tp->write_seq = 1;
296 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
297 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
298 sock_hold(sk2);
299 goto unique;
300 } else
301 goto not_unique;
302 }
303 }
304 tw = NULL;
305
306 /* And established part... */
307 sk_for_each(sk2, node, &head->chain) {
308 if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
309 goto not_unique;
310 }
311
312unique:
313 /* Must record num and sport now. Otherwise we will see
314 * a socket with a funny identity in the hash table. */
315 inet->num = lport;
316 inet->sport = htons(lport);
317 sk->sk_hashent = hash;
318 BUG_TRAP(sk_unhashed(sk));
319 __sk_add_node(sk, &head->chain);
320 sock_prot_inc_use(sk->sk_prot);
321 write_unlock(&head->lock);
322
323 if (twp) {
324 *twp = tw;
325 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
326 } else if (tw) {
327 /* Silly. Should hash-dance instead... */
328 tcp_tw_deschedule(tw);
329 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
330
331 inet_twsk_put(tw);
332 }
333
334 return 0;
335
336not_unique:
337 write_unlock(&head->lock);
338 return -EADDRNOTAVAIL;
339}
340
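/* Note (illustrative, not from the original file): a TIME-WAIT bucket
 * is recycled when timestamps are available and either the caller
 * passed twp == NULL, or sysctl_tcp_tw_reuse is set and the last stamp
 * is more than a second old. Seeding
 *
 *	tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 *
 * places the new sequence space safely beyond anything the previous
 * incarnation of the connection could still have in flight.
 */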
341static inline u32 connect_port_offset(const struct sock *sk)
342{
343 const struct inet_sock *inet = inet_sk(sk);
344
345 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
346 inet->dport);
347}
348
349/*
350 * Bind a port for a connect operation and hash it.
351 */
352static inline int tcp_v4_hash_connect(struct sock *sk)
353{
354 const unsigned short snum = inet_sk(sk)->num;
355 struct inet_bind_hashbucket *head;
356 struct inet_bind_bucket *tb;
357 int ret;
358
359 if (!snum) {
360 int low = sysctl_local_port_range[0];
361 int high = sysctl_local_port_range[1];
362 int range = high - low;
363 int i;
364 int port;
365 static u32 hint;
366 u32 offset = hint + connect_port_offset(sk);
367 struct hlist_node *node;
368 struct inet_timewait_sock *tw = NULL;
369
370 local_bh_disable();
371 for (i = 1; i <= range; i++) {
372 port = low + (i + offset) % range;
373 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
374 spin_lock(&head->lock);
375
376 /* Does not bother with rcv_saddr checks,
377 * because the established check is already
378 * unique enough.
379 */
380 inet_bind_bucket_for_each(tb, node, &head->chain) {
381 if (tb->port == port) {
382 BUG_TRAP(!hlist_empty(&tb->owners));
383 if (tb->fastreuse >= 0)
384 goto next_port;
385 if (!__tcp_v4_check_established(sk,
386 port,
387 &tw))
388 goto ok;
389 goto next_port;
390 }
391 }
392
393 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
394 if (!tb) {
395 spin_unlock(&head->lock);
396 break;
397 }
398 tb->fastreuse = -1;
399 goto ok;
400
401 next_port:
402 spin_unlock(&head->lock);
403 }
404 local_bh_enable();
405
406 return -EADDRNOTAVAIL;
407
408ok:
409 hint += i;
410
411 /* Head lock still held and bh's disabled */
412 inet_bind_hash(sk, tb, port);
413 if (sk_unhashed(sk)) {
414 inet_sk(sk)->sport = htons(port);
415 __inet_hash(&tcp_hashinfo, sk, 0);
416 }
417 spin_unlock(&head->lock);
418
419 if (tw) {
420 tcp_tw_deschedule(tw);
421 inet_twsk_put(tw);
422 }
423
424 ret = 0;
425 goto out;
426 }
427
428 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
429 tb = inet_sk(sk)->bind_hash;
430 spin_lock_bh(&head->lock);
431 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
432 __inet_hash(&tcp_hashinfo, sk, 0);
433 spin_unlock_bh(&head->lock);
434 return 0;
435 } else {
436 spin_unlock(&head->lock);
437 /* No definite answer... Walk to established hash table */
438 ret = __tcp_v4_check_established(sk, snum, NULL);
439out:
440 local_bh_enable();
441 return ret;
442 }
443}
444
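/* Illustrative restatement of the ephemeral search above: the i-th
 * candidate is
 *
 *	port = low + (i + hint + secure_offset(rcv_saddr, daddr, dport)) % range;
 *
 * where secure_offset() stands for secure_tcp_port_ephemeral(), so each
 * destination walks the range in its own hard-to-predict order while
 * 'hint' spreads successive connects across the range.
 */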
445/* This will initiate an outgoing connection. */
446int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
447{
448 struct inet_sock *inet = inet_sk(sk);
449 struct tcp_sock *tp = tcp_sk(sk);
450 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
451 struct rtable *rt;
452 u32 daddr, nexthop;
453 int tmp;
454 int err;
455
456 if (addr_len < sizeof(struct sockaddr_in))
457 return -EINVAL;
458
459 if (usin->sin_family != AF_INET)
460 return -EAFNOSUPPORT;
461
462 nexthop = daddr = usin->sin_addr.s_addr;
463 if (inet->opt && inet->opt->srr) {
464 if (!daddr)
465 return -EINVAL;
466 nexthop = inet->opt->faddr;
467 }
468
469 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
470 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
471 IPPROTO_TCP,
472 inet->sport, usin->sin_port, sk);
473 if (tmp < 0)
474 return tmp;
475
476 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
477 ip_rt_put(rt);
478 return -ENETUNREACH;
479 }
480
481 if (!inet->opt || !inet->opt->srr)
482 daddr = rt->rt_dst;
483
484 if (!inet->saddr)
485 inet->saddr = rt->rt_src;
486 inet->rcv_saddr = inet->saddr;
487
488 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
489 /* Reset inherited state */
490 tp->rx_opt.ts_recent = 0;
491 tp->rx_opt.ts_recent_stamp = 0;
492 tp->write_seq = 0;
493 }
494
495 if (sysctl_tcp_tw_recycle &&
496 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
497 struct inet_peer *peer = rt_get_peer(rt);
498
499 /* VJ's idea. We save last timestamp seen from
500 * the destination in peer table, when entering state TIME-WAIT
501 * and initialize rx_opt.ts_recent from it, when trying new connection.
502 */
503
504 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
505 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
506 tp->rx_opt.ts_recent = peer->tcp_ts;
507 }
508 }
509
510 inet->dport = usin->sin_port;
511 inet->daddr = daddr;
512
513 tp->ext_header_len = 0;
514 if (inet->opt)
515 tp->ext_header_len = inet->opt->optlen;
516
517 tp->rx_opt.mss_clamp = 536;
518
519 /* Socket identity is still unknown (sport may be zero).
520 * However we set the state to SYN-SENT and, without releasing the socket
521 * lock, select a source port, enter ourselves into the hash tables and
522 * complete initialization after this.
523 */
524 tcp_set_state(sk, TCP_SYN_SENT);
525 err = tcp_v4_hash_connect(sk);
526 if (err)
527 goto failure;
528
529 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
530 if (err)
531 goto failure;
532
533 /* OK, now commit destination to socket. */
534 sk_setup_caps(sk, &rt->u.dst);
535
536 if (!tp->write_seq)
537 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
538 inet->daddr,
539 inet->sport,
540 usin->sin_port);
541
542 inet->id = tp->write_seq ^ jiffies;
543
544 err = tcp_connect(sk);
545 rt = NULL;
546 if (err)
547 goto failure;
548
549 return 0;
550
551failure:
552 /* This unhashes the socket and releases the local port, if necessary. */
553 tcp_set_state(sk, TCP_CLOSE);
554 ip_rt_put(rt);
555 sk->sk_route_caps = 0;
556 inet->dport = 0;
557 return err;
558}
559
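/* For reference (illustrative userspace counterpart, not kernel code):
 * the whole path above runs under a single connect(2) call, e.g.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = { .sin_family = AF_INET,
 *				   .sin_port = htons(80) };
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * A failure anywhere above unwinds to TCP_CLOSE and releases the port.
 */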
560static __inline__ int tcp_v4_iif(struct sk_buff *skb)
561{
562 return ((struct rtable *)skb->dst)->rt_iif;
563}
564
565static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
566{
567 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
568}
569
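/* Note (illustrative): TCP_SYNQ_HSIZE is a power of two, so the mask
 * '& (TCP_SYNQ_HSIZE - 1)' is a cheap modulo; jhash_2words() mixes the
 * peer's address and port with the per-listener random hash_rnd so a
 * remote host cannot deliberately pile requests onto one chain.
 */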
570static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
571 struct request_sock ***prevp,
572 __u16 rport,
573 __u32 raddr, __u32 laddr)
574{
575 struct listen_sock *lopt = tp->accept_queue.listen_opt;
576 struct request_sock *req, **prev;
577
578 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
579 (req = *prev) != NULL;
580 prev = &req->dl_next) {
581 const struct inet_request_sock *ireq = inet_rsk(req);
582
583 if (ireq->rmt_port == rport &&
584 ireq->rmt_addr == raddr &&
585 ireq->loc_addr == laddr &&
586 TCP_INET_FAMILY(req->rsk_ops->family)) {
587 BUG_TRAP(!req->sk);
588 *prevp = prev;
589 break;
590 }
591 }
592
593 return req;
594}
595
596static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
597{
598 struct tcp_sock *tp = tcp_sk(sk);
599 struct listen_sock *lopt = tp->accept_queue.listen_opt;
600 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
601
602 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
603 tcp_synq_added(sk);
604}
605
606
607/*
608 * This routine does path mtu discovery as defined in RFC1191.
609 */
610static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
611 u32 mtu)
612{
613 struct dst_entry *dst;
614 struct inet_sock *inet = inet_sk(sk);
615 struct tcp_sock *tp = tcp_sk(sk);
616
617 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
618 * sent out by Linux are always < 576 bytes, so they should go through
619 * unfragmented).
620 */
621 if (sk->sk_state == TCP_LISTEN)
622 return;
623
624 /* We don't check in the dst entry if pmtu discovery is forbidden
625 * on this route. We just assume that no packet-too-big packets
626 * are sent back when pmtu discovery is not active.
627 * There is a small race when the user changes this flag in the
628 * route, but I think that's acceptable.
629 */
630 if ((dst = __sk_dst_check(sk, 0)) == NULL)
631 return;
632
633 dst->ops->update_pmtu(dst, mtu);
634
635 /* Something is about to go wrong... Remember the soft error
636 * in case this connection is not able to recover.
637 */
638 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
639 sk->sk_err_soft = EMSGSIZE;
640
641 mtu = dst_mtu(dst);
642
643 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
644 tp->pmtu_cookie > mtu) {
645 tcp_sync_mss(sk, mtu);
646
647 /* Resend the TCP packet because it's
648 * clear that the old packet has been
649 * dropped. This is the new "fast" path mtu
650 * discovery.
651 */
652 tcp_simple_retransmit(sk);
653 } /* else let the usual retransmit timer handle it */
654}
655
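/* Worked example (assumed numbers, not from the original file): if the
 * current pmtu_cookie is 1500 and an ICMP_FRAG_NEEDED arrives carrying
 * mtu = 1400, tcp_sync_mss(sk, 1400) shrinks the MSS to roughly
 * 1400 - 40 = 1360 bytes and tcp_simple_retransmit() resends the
 * now-oversized segments immediately rather than waiting for the
 * retransmit timer.
 */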
656/*
657 * This routine is called by the ICMP module when it gets some
658 * sort of error condition. If err < 0 then the socket should
659 * be closed and the error returned to the user. If err > 0
660 * it's just the icmp type << 8 | icmp code. After adjustment
661 * header points to the first 8 bytes of the tcp header. We need
662 * to find the appropriate port.
663 *
664 * The locking strategy used here is very "optimistic". When
665 * someone else accesses the socket the ICMP is just dropped
666 * and for some paths there is no check at all.
667 * A more general error queue to queue errors for later handling
668 * is probably better.
669 *
670 */
671
672void tcp_v4_err(struct sk_buff *skb, u32 info)
673{
674 struct iphdr *iph = (struct iphdr *)skb->data;
675 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
676 struct tcp_sock *tp;
677 struct inet_sock *inet;
678 int type = skb->h.icmph->type;
679 int code = skb->h.icmph->code;
680 struct sock *sk;
681 __u32 seq;
682 int err;
683
684 if (skb->len < (iph->ihl << 2) + 8) {
685 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
686 return;
687 }
688
689 sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
690 th->source, tcp_v4_iif(skb));
691 if (!sk) {
692 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
693 return;
694 }
695 if (sk->sk_state == TCP_TIME_WAIT) {
696 inet_twsk_put((struct inet_timewait_sock *)sk);
697 return;
698 }
699
700 bh_lock_sock(sk);
701 /* If too many ICMPs get dropped on busy
702 * servers this needs to be solved differently.
703 */
704 if (sock_owned_by_user(sk))
705 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
706
707 if (sk->sk_state == TCP_CLOSE)
708 goto out;
709
710 tp = tcp_sk(sk);
711 seq = ntohl(th->seq);
712 if (sk->sk_state != TCP_LISTEN &&
713 !between(seq, tp->snd_una, tp->snd_nxt)) {
714 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
715 goto out;
716 }
717
718 switch (type) {
719 case ICMP_SOURCE_QUENCH:
720 /* Just silently ignore these. */
721 goto out;
722 case ICMP_PARAMETERPROB:
723 err = EPROTO;
724 break;
725 case ICMP_DEST_UNREACH:
726 if (code > NR_ICMP_UNREACH)
727 goto out;
728
729 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
730 if (!sock_owned_by_user(sk))
731 do_pmtu_discovery(sk, iph, info);
732 goto out;
733 }
734
735 err = icmp_err_convert[code].errno;
736 break;
737 case ICMP_TIME_EXCEEDED:
738 err = EHOSTUNREACH;
739 break;
740 default:
741 goto out;
742 }
743
744 switch (sk->sk_state) {
745 struct request_sock *req, **prev;
746 case TCP_LISTEN:
747 if (sock_owned_by_user(sk))
748 goto out;
749
750 req = tcp_v4_search_req(tp, &prev, th->dest,
751 iph->daddr, iph->saddr);
752 if (!req)
753 goto out;
754
755 /* ICMPs are not backlogged, hence we cannot get
756 an established socket here.
757 */
758 BUG_TRAP(!req->sk);
759
760 if (seq != tcp_rsk(req)->snt_isn) {
761 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
762 goto out;
763 }
764
765 /*
766 * Still in SYN_RECV, just remove it silently.
767 * There is no good way to pass the error to the newly
768 * created socket, and POSIX does not want network
769 * errors returned from accept().
770 */
771 tcp_synq_drop(sk, req, prev);
772 goto out;
773
774 case TCP_SYN_SENT:
775 case TCP_SYN_RECV: /* Cannot happen.
776 It can happen, e.g., if SYNs crossed.
777 */
778 if (!sock_owned_by_user(sk)) {
779 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
780 sk->sk_err = err;
781
782 sk->sk_error_report(sk);
783
784 tcp_done(sk);
785 } else {
786 sk->sk_err_soft = err;
787 }
788 goto out;
789 }
790
791 /* If we've already connected we will keep trying
792 * until we time out, or the user gives up.
793 *
794 * rfc1122 4.2.3.9 allows to consider as hard errors
795 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
796 * but it is obsoleted by pmtu discovery).
797 *
798 * Note that in the modern internet, where routing is unreliable
799 * and broken firewalls sit in every dark corner sending random
800 * errors ordered by their masters, even these two messages finally lose
801 * their original sense (even Linux sends invalid PORT_UNREACHs)
802 *
803 * Now we are in compliance with RFCs.
804 * --ANK (980905)
805 */
806
807 inet = inet_sk(sk);
808 if (!sock_owned_by_user(sk) && inet->recverr) {
809 sk->sk_err = err;
810 sk->sk_error_report(sk);
811 } else { /* Only an error on timeout */
812 sk->sk_err_soft = err;
813 }
814
815out:
816 bh_unlock_sock(sk);
817 sock_put(sk);
818}
819
820/* This routine computes an IPv4 TCP checksum. */
821void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
822 struct sk_buff *skb)
823{
824 struct inet_sock *inet = inet_sk(sk);
825
826 if (skb->ip_summed == CHECKSUM_HW) {
827 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
828 skb->csum = offsetof(struct tcphdr, check);
829 } else {
830 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
831 csum_partial((char *)th,
832 th->doff << 2,
833 skb->csum));
834 }
835}
836
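/* Note (illustrative): in the CHECKSUM_HW branch only the pseudo-header
 * sum is written (complemented) into th->check and the offset of the
 * check field is recorded in skb->csum, leaving the NIC to fold in the
 * payload; the software branch folds csum_partial() over the header
 * into the payload sum already accumulated in skb->csum.
 */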
837/*
838 * This routine will send an RST to the other tcp.
839 *
840 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
841 * for reset.
842 * Answer: if a packet caused RST, it is not for a socket
843 * existing in our system, if it is matched to a socket,
844 * it is just duplicate segment or bug in other side's TCP.
845 * So we build the reply based only on the parameters
846 * that arrived with the segment.
847 * Exception: precedence violation. We do not implement it in any case.
848 */
849
850static void tcp_v4_send_reset(struct sk_buff *skb)
851{
852 struct tcphdr *th = skb->h.th;
853 struct tcphdr rth;
854 struct ip_reply_arg arg;
855
856 /* Never send a reset in response to a reset. */
857 if (th->rst)
858 return;
859
860 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
861 return;
862
863 /* Swap the send and the receive. */
864 memset(&rth, 0, sizeof(struct tcphdr));
865 rth.dest = th->source;
866 rth.source = th->dest;
867 rth.doff = sizeof(struct tcphdr) / 4;
868 rth.rst = 1;
869
870 if (th->ack) {
871 rth.seq = th->ack_seq;
872 } else {
873 rth.ack = 1;
874 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
875 skb->len - (th->doff << 2));
876 }
877
878 memset(&arg, 0, sizeof arg);
879 arg.iov[0].iov_base = (unsigned char *)&rth;
880 arg.iov[0].iov_len = sizeof rth;
881 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
882 skb->nh.iph->saddr, /*XXX*/
883 sizeof(struct tcphdr), IPPROTO_TCP, 0);
884 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
885
886 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
887
888 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
889 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
890}
891
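/* Sketch of the RFC 793 reset rules applied above (illustrative):
 *
 *	if (their ACK bit is set)
 *		rst.seq = their ack_seq;		// plain RST
 *	else
 *		rst.ack_seq = their seq + SYN + FIN
 *			      + payload length;		// RST|ACK
 *
 * so the reset is acceptable to the peer whichever half of the
 * handshake it was expecting.
 */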
892/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
893 outside of socket context, is certainly ugly. What can I do?
894 */
895
896static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
897 u32 win, u32 ts)
898{
899 struct tcphdr *th = skb->h.th;
900 struct {
901 struct tcphdr th;
902 u32 tsopt[3];
903 } rep;
904 struct ip_reply_arg arg;
905
906 memset(&rep.th, 0, sizeof(struct tcphdr));
907 memset(&arg, 0, sizeof arg);
908
909 arg.iov[0].iov_base = (unsigned char *)&rep;
910 arg.iov[0].iov_len = sizeof(rep.th);
911 if (ts) {
912 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
913 (TCPOPT_TIMESTAMP << 8) |
914 TCPOLEN_TIMESTAMP);
915 rep.tsopt[1] = htonl(tcp_time_stamp);
916 rep.tsopt[2] = htonl(ts);
917 arg.iov[0].iov_len = sizeof(rep);
918 }
919
920 /* Swap the send and the receive. */
921 rep.th.dest = th->source;
922 rep.th.source = th->dest;
923 rep.th.doff = arg.iov[0].iov_len / 4;
924 rep.th.seq = htonl(seq);
925 rep.th.ack_seq = htonl(ack);
926 rep.th.ack = 1;
927 rep.th.window = htons(win);
928
929 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
930 skb->nh.iph->saddr, /*XXX*/
931 arg.iov[0].iov_len, IPPROTO_TCP, 0);
932 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
933
934 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
935
936 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
937}
938
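/* Note (illustrative): rep.tsopt[0] above packs the option bytes
 * NOP (1), NOP (1), TIMESTAMP (8), length 10 into one network-order
 * word, i.e. htonl(0x0101080a), followed by TSval (our tcp_time_stamp)
 * and TSecr (the peer's echoed 'ts').
 */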
939static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
940{
941 struct inet_timewait_sock *tw = inet_twsk(sk);
942 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
943
944 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
945 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
946
947 inet_twsk_put(tw);
948}
949
950static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
951{
952 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
953 req->ts_recent);
954}
955
956static struct dst_entry* tcp_v4_route_req(struct sock *sk,
957 struct request_sock *req)
958{
959 struct rtable *rt;
960 const struct inet_request_sock *ireq = inet_rsk(req);
961 struct ip_options *opt = inet_rsk(req)->opt;
962 struct flowi fl = { .oif = sk->sk_bound_dev_if,
963 .nl_u = { .ip4_u =
964 { .daddr = ((opt && opt->srr) ?
965 opt->faddr :
966 ireq->rmt_addr),
967 .saddr = ireq->loc_addr,
968 .tos = RT_CONN_FLAGS(sk) } },
969 .proto = IPPROTO_TCP,
970 .uli_u = { .ports =
971 { .sport = inet_sk(sk)->sport,
972 .dport = ireq->rmt_port } } };
973
974 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
975 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
976 return NULL;
977 }
978 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
979 ip_rt_put(rt);
980 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
981 return NULL;
982 }
983 return &rt->u.dst;
984}
985
986/*
987 * Send a SYN-ACK after having received an ACK.
988 * This still operates on a request_sock only, not on a big
989 * socket.
990 */
991static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
992 struct dst_entry *dst)
993{
994 const struct inet_request_sock *ireq = inet_rsk(req);
995 int err = -1;
996 struct sk_buff * skb;
997
998 /* First, grab a route. */
999 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1000 goto out;
1001
1002 skb = tcp_make_synack(sk, dst, req);
1003
1004 if (skb) {
1005 struct tcphdr *th = skb->h.th;
1006
1007 th->check = tcp_v4_check(th, skb->len,
1008 ireq->loc_addr,
1009 ireq->rmt_addr,
1010 csum_partial((char *)th, skb->len,
1011 skb->csum));
1012
1013 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1014 ireq->rmt_addr,
1015 ireq->opt);
1016 if (err == NET_XMIT_CN)
1017 err = 0;
1018 }
1019
1020out:
1021 dst_release(dst);
1022 return err;
1023}
1024
1025/*
1026 * IPv4 request_sock destructor.
1027 */
1028static void tcp_v4_reqsk_destructor(struct request_sock *req)
1029{
1030 if (inet_rsk(req)->opt)
1031 kfree(inet_rsk(req)->opt);
1032}
1033
1034static inline void syn_flood_warning(struct sk_buff *skb)
1035{
1036 static unsigned long warntime;
1037
1038 if (time_after(jiffies, (warntime + HZ * 60))) {
1039 warntime = jiffies;
1040 printk(KERN_INFO
1041 "possible SYN flooding on port %d. Sending cookies.\n",
1042 ntohs(skb->h.th->dest));
1043 }
1044}
1045
1046/*
1047 * Save and compile IPv4 options into the request_sock if needed.
1048 */
1049static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1050 struct sk_buff *skb)
1051{
1052 struct ip_options *opt = &(IPCB(skb)->opt);
1053 struct ip_options *dopt = NULL;
1054
1055 if (opt && opt->optlen) {
1056 int opt_size = optlength(opt);
1057 dopt = kmalloc(opt_size, GFP_ATOMIC);
1058 if (dopt) {
1059 if (ip_options_echo(dopt, skb)) {
1060 kfree(dopt);
1061 dopt = NULL;
1062 }
1063 }
1064 }
1065 return dopt;
1066}
1067
1068struct request_sock_ops tcp_request_sock_ops = {
1069 .family = PF_INET,
1070 .obj_size = sizeof(struct tcp_request_sock),
1071 .rtx_syn_ack = tcp_v4_send_synack,
1072 .send_ack = tcp_v4_reqsk_send_ack,
1073 .destructor = tcp_v4_reqsk_destructor,
1074 .send_reset = tcp_v4_send_reset,
1075};
1076
1077int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1078{
1079 struct inet_request_sock *ireq;
1080 struct tcp_options_received tmp_opt;
1081 struct request_sock *req;
1082 __u32 saddr = skb->nh.iph->saddr;
1083 __u32 daddr = skb->nh.iph->daddr;
1084 __u32 isn = TCP_SKB_CB(skb)->when;
1085 struct dst_entry *dst = NULL;
1086#ifdef CONFIG_SYN_COOKIES
1087 int want_cookie = 0;
1088#else
1089#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1090#endif
1091
1092 /* Never answer SYNs sent to broadcast or multicast addresses */
1093 if (((struct rtable *)skb->dst)->rt_flags &
1094 (RTCF_BROADCAST | RTCF_MULTICAST))
1095 goto drop;
1096
1097 /* TW buckets are converted to open requests without
1098 * limitations, they conserve resources and peer is
1099 * evidently real one.
1100 */
1101 if (tcp_synq_is_full(sk) && !isn) {
1102#ifdef CONFIG_SYN_COOKIES
1103 if (sysctl_tcp_syncookies) {
1104 want_cookie = 1;
1105 } else
1106#endif
1107 goto drop;
1108 }
1109
1110 /* Accept backlog is full. If we have already queued enough
1111 * of warm entries in syn queue, drop request. It is better than
1112 * clogging syn queue with openreqs with exponentially increasing
1113 * timeout.
1114 */
1115 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1116 goto drop;
1117
1118 req = reqsk_alloc(&tcp_request_sock_ops);
1119 if (!req)
1120 goto drop;
1121
1122 tcp_clear_options(&tmp_opt);
1123 tmp_opt.mss_clamp = 536;
1124 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1125
1126 tcp_parse_options(skb, &tmp_opt, 0);
1127
1128 if (want_cookie) {
1129 tcp_clear_options(&tmp_opt);
1130 tmp_opt.saw_tstamp = 0;
1131 }
1132
1133 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1134 /* Some OSes (unknown ones, but seen on web servers carrying
1135 * information interesting only for windows
1136 * users) do not send their stamp in the SYN. It is an easy case.
1137 * We simply do not advertise TS support.
1138 */
1139 tmp_opt.saw_tstamp = 0;
1140 tmp_opt.tstamp_ok = 0;
1141 }
1142 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1143
1144 tcp_openreq_init(req, &tmp_opt, skb);
1145
1146 ireq = inet_rsk(req);
1147 ireq->loc_addr = daddr;
1148 ireq->rmt_addr = saddr;
1149 ireq->opt = tcp_v4_save_options(sk, skb);
1150 if (!want_cookie)
1151 TCP_ECN_create_request(req, skb->h.th);
1152
1153 if (want_cookie) {
1154#ifdef CONFIG_SYN_COOKIES
1155 syn_flood_warning(skb);
1156#endif
1157 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1158 } else if (!isn) {
1159 struct inet_peer *peer = NULL;
1160
1161 /* VJ's idea. We save last timestamp seen
1162 * from the destination in peer table, when entering
1163 * state TIME-WAIT, and check against it before
1164 * accepting new connection request.
1165 *
1166 * If "isn" is not zero, this request hit alive
1167 * timewait bucket, so that all the necessary checks
1168 * are made in the function processing timewait state.
1169 */
1170 if (tmp_opt.saw_tstamp &&
1171 sysctl_tcp_tw_recycle &&
1172 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1173 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1174 peer->v4daddr == saddr) {
1175 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1176 (s32)(peer->tcp_ts - req->ts_recent) >
1177 TCP_PAWS_WINDOW) {
1178 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1179 dst_release(dst);
1180 goto drop_and_free;
1181 }
1182 }
1183 /* Kill the following clause, if you dislike this way. */
1184 else if (!sysctl_tcp_syncookies &&
1185 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1186 (sysctl_max_syn_backlog >> 2)) &&
1187 (!peer || !peer->tcp_ts_stamp) &&
1188 (!dst || !dst_metric(dst, RTAX_RTT))) {
1189 /* Without syncookies last quarter of
1190 * backlog is filled with destinations,
1191 * proven to be alive.
1192 * It means that we continue to communicate
1193 * to destinations, already remembered
1194 * to the moment of synflood.
1195 */
1196 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1197 "request from %u.%u."
1198 "%u.%u/%u\n",
1199 NIPQUAD(saddr),
1200 ntohs(skb->h.th->source)));
1201 dst_release(dst);
1202 goto drop_and_free;
1203 }
1204
1205 isn = tcp_v4_init_sequence(sk, skb);
1206 }
1207 tcp_rsk(req)->snt_isn = isn;
1208
1209 if (tcp_v4_send_synack(sk, req, dst))
1210 goto drop_and_free;
1211
1212 if (want_cookie) {
1213 reqsk_free(req);
1214 } else {
1215 tcp_v4_synq_add(sk, req);
1216 }
1217 return 0;
1218
1219drop_and_free:
1220 reqsk_free(req);
1221drop:
1222 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1223 return 0;
1224}
1225
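/* Summary of the drop policy above (illustrative, not from the file):
 *
 *	SYN queue full, no syncookies   -> silent drop
 *	SYN queue full, syncookies on   -> stateless answer, reqsk_free()
 *	accept queue full, >1 young req -> drop, let the peer retransmit
 *
 * plus, with tcp_tw_recycle, the PAWS check against the timestamp
 * remembered in the inet_peer cache.
 */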
1226
1227/*
1228 * The three way handshake has completed - we got a valid synack -
1229 * now create the new socket.
1230 */
1231struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1232 struct request_sock *req,
1233 struct dst_entry *dst)
1234{
1235 struct inet_request_sock *ireq;
1236 struct inet_sock *newinet;
1237 struct tcp_sock *newtp;
1238 struct sock *newsk;
1239
1240 if (sk_acceptq_is_full(sk))
1241 goto exit_overflow;
1242
1243 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1244 goto exit;
1245
1246 newsk = tcp_create_openreq_child(sk, req, skb);
1247 if (!newsk)
1248 goto exit;
1249
1250 sk_setup_caps(newsk, dst);
1251
1252 newtp = tcp_sk(newsk);
1253 newinet = inet_sk(newsk);
1254 ireq = inet_rsk(req);
1255 newinet->daddr = ireq->rmt_addr;
1256 newinet->rcv_saddr = ireq->loc_addr;
1257 newinet->saddr = ireq->loc_addr;
1258 newinet->opt = ireq->opt;
1259 ireq->opt = NULL;
1260 newinet->mc_index = tcp_v4_iif(skb);
1261 newinet->mc_ttl = skb->nh.iph->ttl;
1262 newtp->ext_header_len = 0;
1263 if (newinet->opt)
1264 newtp->ext_header_len = newinet->opt->optlen;
1265 newinet->id = newtp->write_seq ^ jiffies;
1266
1267 tcp_sync_mss(newsk, dst_mtu(dst));
1268 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1269 tcp_initialize_rcv_mss(newsk);
1270
1271 __inet_hash(&tcp_hashinfo, newsk, 0);
1272 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1273
1274 return newsk;
1275
1276exit_overflow:
1277 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1278exit:
1279 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1280 dst_release(dst);
1281 return NULL;
1282}
1283
1284static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1285{
1286 struct tcphdr *th = skb->h.th;
1287 struct iphdr *iph = skb->nh.iph;
1288 struct tcp_sock *tp = tcp_sk(sk);
1289 struct sock *nsk;
1290 struct request_sock **prev;
1291 /* Find possible connection requests. */
1292 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1293 iph->saddr, iph->daddr);
1294 if (req)
1295 return tcp_check_req(sk, skb, req, prev);
1296
1297 nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1298 th->source, skb->nh.iph->daddr,
1299 ntohs(th->dest), tcp_v4_iif(skb));
1300
1301 if (nsk) {
1302 if (nsk->sk_state != TCP_TIME_WAIT) {
1303 bh_lock_sock(nsk);
1304 return nsk;
1305 }
1306 inet_twsk_put((struct inet_timewait_sock *)nsk);
1307 return NULL;
1308 }
1309
1310#ifdef CONFIG_SYN_COOKIES
1311 if (!th->rst && !th->syn && th->ack)
1312 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1313#endif
1314 return sk;
1315}
1316
1317static int tcp_v4_checksum_init(struct sk_buff *skb)
1318{
1319 if (skb->ip_summed == CHECKSUM_HW) {
1320 skb->ip_summed = CHECKSUM_UNNECESSARY;
1321 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1322 skb->nh.iph->daddr, skb->csum))
1323 return 0;
1324
1325 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1326 skb->ip_summed = CHECKSUM_NONE;
1327 }
1328 if (skb->len <= 76) {
1329 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1330 skb->nh.iph->daddr,
1331 skb_checksum(skb, 0, skb->len, 0)))
1332 return -1;
1333 skb->ip_summed = CHECKSUM_UNNECESSARY;
1334 } else {
1335 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1336 skb->nh.iph->saddr,
1337 skb->nh.iph->daddr, 0);
1338 }
1339 return 0;
1340}
1341
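/* Note (illustrative): segments of at most 76 bytes are cheap enough to
 * verify in full right here via skb_checksum(); longer ones only get
 * the pseudo-header sum seeded into skb->csum so the checksum can be
 * completed later, e.g. while copying the data to user space.
 */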
1342
1343/* The socket must have its spinlock held when we get
1344 * here.
1345 *
1346 * We have a potential double-lock case here, so even when
1347 * doing backlog processing we use the BH locking scheme.
1348 * This is because we cannot sleep with the original spinlock
1349 * held.
1350 */
1351int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1352{
1353 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1354 TCP_CHECK_TIMER(sk);
1355 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1356 goto reset;
1357 TCP_CHECK_TIMER(sk);
1358 return 0;
1359 }
1360
1361 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1362 goto csum_err;
1363
1364 if (sk->sk_state == TCP_LISTEN) {
1365 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1366 if (!nsk)
1367 goto discard;
1368
1369 if (nsk != sk) {
1370 if (tcp_child_process(sk, nsk, skb))
1371 goto reset;
1372 return 0;
1373 }
1374 }
1375
1376 TCP_CHECK_TIMER(sk);
1377 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1378 goto reset;
1379 TCP_CHECK_TIMER(sk);
1380 return 0;
1381
1382reset:
1383 tcp_v4_send_reset(skb);
1384discard:
1385 kfree_skb(skb);
1386 /* Be careful here. If this function gets more complicated and
1387 * gcc suffers from register pressure on the x86, sk (in %ebx)
1388 * might be destroyed here. This current version compiles correctly,
1389 * but you have been warned.
1390 */
1391 return 0;
1392
1393csum_err:
1394 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1395 goto discard;
1396}
1397
1398/*
1399 * From tcp_input.c
1400 */
1401
1402int tcp_v4_rcv(struct sk_buff *skb)
1403{
1404 struct tcphdr *th;
1405 struct sock *sk;
1406 int ret;
1407
1408 if (skb->pkt_type != PACKET_HOST)
1409 goto discard_it;
1410
1411 /* Count it even if it's bad */
1412 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1413
1414 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1415 goto discard_it;
1416
1417 th = skb->h.th;
1418
1419 if (th->doff < sizeof(struct tcphdr) / 4)
1420 goto bad_packet;
1421 if (!pskb_may_pull(skb, th->doff * 4))
1422 goto discard_it;
1423
1424 /* An explanation is required here, I think.
1425 * Packet length and doff are validated by header prediction,
1426 * provided the case of th->doff == 0 is eliminated.
1427 * So, we defer the checks. */
1428 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1429 tcp_v4_checksum_init(skb) < 0))
1430 goto bad_packet;
1431
1432 th = skb->h.th;
1433 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1434 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1435 skb->len - th->doff * 4);
1436 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1437 TCP_SKB_CB(skb)->when = 0;
1438 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1439 TCP_SKB_CB(skb)->sacked = 0;
1440
1441 sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1442 skb->nh.iph->daddr, ntohs(th->dest),
1443 tcp_v4_iif(skb));
1444
1445 if (!sk)
1446 goto no_tcp_socket;
1447
1448process:
1449 if (sk->sk_state == TCP_TIME_WAIT)
1450 goto do_time_wait;
1451
1452 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1453 goto discard_and_relse;
1454
1455 if (sk_filter(sk, skb, 0))
1456 goto discard_and_relse;
1457
1458 skb->dev = NULL;
1459
1460 bh_lock_sock(sk);
1461 ret = 0;
1462 if (!sock_owned_by_user(sk)) {
1463 if (!tcp_prequeue(sk, skb))
1464 ret = tcp_v4_do_rcv(sk, skb);
1465 } else
1466 sk_add_backlog(sk, skb);
1467 bh_unlock_sock(sk);
1468
1469 sock_put(sk);
1470
1471 return ret;
1472
1473no_tcp_socket:
1474 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1475 goto discard_it;
1476
1477 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1478bad_packet:
1479 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1480 } else {
1481 tcp_v4_send_reset(skb);
1482 }
1483
1484discard_it:
1485 /* Discard frame. */
1486 kfree_skb(skb);
1487 return 0;
1488
1489discard_and_relse:
1490 sock_put(sk);
1491 goto discard_it;
1492
1493do_time_wait:
1494 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1495 inet_twsk_put((struct inet_timewait_sock *) sk);
1496 goto discard_it;
1497 }
1498
1499 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1500 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1501 inet_twsk_put((struct inet_timewait_sock *) sk);
1502 goto discard_it;
1503 }
1504 switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1505 skb, th)) {
1506 case TCP_TW_SYN: {
1507 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1508 skb->nh.iph->daddr,
1509 ntohs(th->dest),
1510 tcp_v4_iif(skb));
1511 if (sk2) {
1512 tcp_tw_deschedule((struct inet_timewait_sock *)sk);
1513 inet_twsk_put((struct inet_timewait_sock *)sk);
1514 sk = sk2;
1515 goto process;
1516 }
1517 /* Fall through to ACK */
1518 }
1519 case TCP_TW_ACK:
1520 tcp_v4_timewait_ack(sk, skb);
1521 break;
1522 case TCP_TW_RST:
1523 goto no_tcp_socket;
1524 case TCP_TW_SUCCESS:;
1525 }
1526 goto discard_it;
1527}
1528
1529static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1530{
1531 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1532 struct inet_sock *inet = inet_sk(sk);
1533
1534 sin->sin_family = AF_INET;
1535 sin->sin_addr.s_addr = inet->daddr;
1536 sin->sin_port = inet->dport;
1537}
1538
1539/* VJ's idea. Save last timestamp seen from this destination
1540 * and hold it at least for normal timewait interval to use for duplicate
1541 * segment detection in subsequent connections, before they enter synchronized
1542 * state.
1543 */
1544
1545int tcp_v4_remember_stamp(struct sock *sk)
1546{
1547 struct inet_sock *inet = inet_sk(sk);
1548 struct tcp_sock *tp = tcp_sk(sk);
1549 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1550 struct inet_peer *peer = NULL;
1551 int release_it = 0;
1552
1553 if (!rt || rt->rt_dst != inet->daddr) {
1554 peer = inet_getpeer(inet->daddr, 1);
1555 release_it = 1;
1556 } else {
1557 if (!rt->peer)
1558 rt_bind_peer(rt, 1);
1559 peer = rt->peer;
1560 }
1561
1562 if (peer) {
1563 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1564 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1565 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1566 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1567 peer->tcp_ts = tp->rx_opt.ts_recent;
1568 }
1569 if (release_it)
1570 inet_putpeer(peer);
1571 return 1;
1572 }
1573
1574 return 0;
1575}
1576
1577int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1578{
1579 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1580
1581 if (peer) {
1582 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1583
1584 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1585 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1586 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1587 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1588 peer->tcp_ts = tcptw->tw_ts_recent;
1589 }
1590 inet_putpeer(peer);
1591 return 1;
1592 }
1593
1594 return 0;
1595}
1596
1597struct tcp_func ipv4_specific = {
1598 .queue_xmit = ip_queue_xmit,
1599 .send_check = tcp_v4_send_check,
1600 .rebuild_header = inet_sk_rebuild_header,
1601 .conn_request = tcp_v4_conn_request,
1602 .syn_recv_sock = tcp_v4_syn_recv_sock,
1603 .remember_stamp = tcp_v4_remember_stamp,
1604 .net_header_len = sizeof(struct iphdr),
1605 .setsockopt = ip_setsockopt,
1606 .getsockopt = ip_getsockopt,
1607 .addr2sockaddr = v4_addr2sockaddr,
1608 .sockaddr_len = sizeof(struct sockaddr_in),
1609};
1610
1611/* NOTE: A lot of things are set to zero explicitly by the call to
1612 * sk_alloc(), so they need not be done here.
1613 */
1614static int tcp_v4_init_sock(struct sock *sk)
1615{
1616 struct tcp_sock *tp = tcp_sk(sk);
1617
1618 skb_queue_head_init(&tp->out_of_order_queue);
1619 tcp_init_xmit_timers(sk);
1620 tcp_prequeue_init(tp);
1621
1622 tp->rto = TCP_TIMEOUT_INIT;
1623 tp->mdev = TCP_TIMEOUT_INIT;
1624
1625 /* So many TCP implementations out there (incorrectly) count the
1626 * initial SYN frame in their delayed-ACK and congestion control
1627 * algorithms that we must have the following bandaid to talk
1628 * efficiently to them. -DaveM
1629 */
1630 tp->snd_cwnd = 2;
1631
1632 /* See draft-stevens-tcpca-spec-01 for discussion of the
1633 * initialization of these values.
1634 */
1635 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1636 tp->snd_cwnd_clamp = ~0;
1637 tp->mss_cache = 536;
1638
1639 tp->reordering = sysctl_tcp_reordering;
1640 tp->ca_ops = &tcp_init_congestion_ops;
1641
1642 sk->sk_state = TCP_CLOSE;
1643
1644 sk->sk_write_space = sk_stream_write_space;
1645 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1646
1647 tp->af_specific = &ipv4_specific;
1648
1649 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1650 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1651
1652 atomic_inc(&tcp_sockets_allocated);
1653
1654 return 0;
1655}
1656
1657int tcp_v4_destroy_sock(struct sock *sk)
1658{
1659 struct tcp_sock *tp = tcp_sk(sk);
1660
1661 tcp_clear_xmit_timers(sk);
1662
1663 tcp_cleanup_congestion_control(tp);
1664
1665 /* Clean up the write buffer. */
1666 sk_stream_writequeue_purge(sk);
1667
1668 /* Cleans up our, hopefully empty, out_of_order_queue. */
1669 __skb_queue_purge(&tp->out_of_order_queue);
1670
1671 /* Clean the prequeue; it really must be empty. */
1672 __skb_queue_purge(&tp->ucopy.prequeue);
1673
1674 /* Clean up a referenced TCP bind bucket. */
1675 if (inet_sk(sk)->bind_hash)
1676 inet_put_port(&tcp_hashinfo, sk);
1677
1678 /*
1679 * If sendmsg cached page exists, toss it.
1680 */
1681 if (sk->sk_sndmsg_page) {
1682 __free_page(sk->sk_sndmsg_page);
1683 sk->sk_sndmsg_page = NULL;
1684 }
1685
1686 atomic_dec(&tcp_sockets_allocated);
1687
1688 return 0;
1689}
1690
1691EXPORT_SYMBOL(tcp_v4_destroy_sock);
1692
1693#ifdef CONFIG_PROC_FS
1694/* Proc filesystem TCP sock list dumping. */
1695
1696static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1697{
1698 return hlist_empty(head) ? NULL :
1699 list_entry(head->first, struct inet_timewait_sock, tw_node);
1700}
1701
1702static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1703{
1704 return tw->tw_node.next ?
1705 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1706}
1707
1708static void *listening_get_next(struct seq_file *seq, void *cur)
1709{
1710 struct tcp_sock *tp;
1711 struct hlist_node *node;
1712 struct sock *sk = cur;
1713 struct tcp_iter_state* st = seq->private;
1714
1715 if (!sk) {
1716 st->bucket = 0;
1717 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1718 goto get_sk;
1719 }
1720
1721 ++st->num;
1722
1723 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1724 struct request_sock *req = cur;
1725
1726 tp = tcp_sk(st->syn_wait_sk);
1727 req = req->dl_next;
1728 while (1) {
1729 while (req) {
1730 if (req->rsk_ops->family == st->family) {
1731 cur = req;
1732 goto out;
1733 }
1734 req = req->dl_next;
1735 }
1736 if (++st->sbucket >= TCP_SYNQ_HSIZE)
1737 break;
1738get_req:
1739 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
1740 }
1741 sk = sk_next(st->syn_wait_sk);
1742 st->state = TCP_SEQ_STATE_LISTENING;
1743 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1744 } else {
1745 tp = tcp_sk(sk);
1746 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1747 if (reqsk_queue_len(&tp->accept_queue))
1748 goto start_req;
1749 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1750 sk = sk_next(sk);
1751 }
1752get_sk:
1753 sk_for_each_from(sk, node) {
1754 if (sk->sk_family == st->family) {
1755 cur = sk;
1756 goto out;
1757 }
1758 tp = tcp_sk(sk);
1759 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1760 if (reqsk_queue_len(&tp->accept_queue)) {
1761start_req:
1762 st->uid = sock_i_uid(sk);
1763 st->syn_wait_sk = sk;
1764 st->state = TCP_SEQ_STATE_OPENREQ;
1765 st->sbucket = 0;
1766 goto get_req;
1767 }
1768 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1769 }
1770 if (++st->bucket < INET_LHTABLE_SIZE) {
1771 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1772 goto get_sk;
1773 }
1774 cur = NULL;
1775out:
1776 return cur;
1777}
1778
1779static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1780{
1781 void *rc = listening_get_next(seq, NULL);
1782
1783 while (rc && *pos) {
1784 rc = listening_get_next(seq, rc);
1785 --*pos;
1786 }
1787 return rc;
1788}
1789
1790static void *established_get_first(struct seq_file *seq)
1791{
1792 struct tcp_iter_state* st = seq->private;
1793 void *rc = NULL;
1794
1795 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1796 struct sock *sk;
1797 struct hlist_node *node;
1798 struct inet_timewait_sock *tw;
1799
1800 /* We can reschedule _before_ having picked the target: */
1801 cond_resched_softirq();
1802
1803 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1804 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1da177e4
LT
1805 if (sk->sk_family != st->family) {
1806 continue;
1807 }
1808 rc = sk;
1809 goto out;
1810 }
1811 st->state = TCP_SEQ_STATE_TIME_WAIT;
1812 inet_twsk_for_each(tw, node,
1813 &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1814 if (tw->tw_family != st->family) {
1815 continue;
1816 }
1817 rc = tw;
1818 goto out;
1819 }
1820 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1821 st->state = TCP_SEQ_STATE_ESTABLISHED;
1822 }
1823out:
1824 return rc;
1825}
1826
1827static void *established_get_next(struct seq_file *seq, void *cur)
1828{
1829 struct sock *sk = cur;
1830 struct inet_timewait_sock *tw;
1831 struct hlist_node *node;
1832 struct tcp_iter_state* st = seq->private;
1833
1834 ++st->num;
1835
1836 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1837 tw = cur;
1838 tw = tw_next(tw);
1839get_tw:
1840 while (tw && tw->tw_family != st->family) {
1841 tw = tw_next(tw);
1842 }
1843 if (tw) {
1844 cur = tw;
1845 goto out;
1846 }
1847 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1848 st->state = TCP_SEQ_STATE_ESTABLISHED;
1849
1850 /* We can reschedule between buckets: */
1851 cond_resched_softirq();
1852
1853 if (++st->bucket < tcp_hashinfo.ehash_size) {
1854 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1855 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1856 } else {
1857 cur = NULL;
1858 goto out;
1859 }
1860 } else
1861 sk = sk_next(sk);
1862
1863 sk_for_each_from(sk, node) {
1864 if (sk->sk_family == st->family)
1865 goto found;
1866 }
1867
1868 st->state = TCP_SEQ_STATE_TIME_WAIT;
1869 tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1870 goto get_tw;
1871found:
1872 cur = sk;
1873out:
1874 return cur;
1875}
1876
1877static void *established_get_idx(struct seq_file *seq, loff_t pos)
1878{
1879 void *rc = established_get_first(seq);
1880
1881 while (rc && pos) {
1882 rc = established_get_next(seq, rc);
1883 --pos;
1884 }
1885 return rc;
1886}
1887
1888static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1889{
1890 void *rc;
1891 struct tcp_iter_state* st = seq->private;
1892
1893 inet_listen_lock(&tcp_hashinfo);
1894 st->state = TCP_SEQ_STATE_LISTENING;
1895 rc = listening_get_idx(seq, &pos);
1896
1897 if (!rc) {
1898 inet_listen_unlock(&tcp_hashinfo);
1899 local_bh_disable();
1900 st->state = TCP_SEQ_STATE_ESTABLISHED;
1901 rc = established_get_idx(seq, pos);
1902 }
1903
1904 return rc;
1905}
1906
1907static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1908{
1909 struct tcp_iter_state* st = seq->private;
1910 st->state = TCP_SEQ_STATE_LISTENING;
1911 st->num = 0;
1912 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1913}
1914
1915static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1916{
1917 void *rc = NULL;
1918 struct tcp_iter_state* st;
1919
1920 if (v == SEQ_START_TOKEN) {
1921 rc = tcp_get_idx(seq, 0);
1922 goto out;
1923 }
1924 st = seq->private;
1925
1926 switch (st->state) {
1927 case TCP_SEQ_STATE_OPENREQ:
1928 case TCP_SEQ_STATE_LISTENING:
1929 rc = listening_get_next(seq, v);
1930 if (!rc) {
1931 inet_listen_unlock(&tcp_hashinfo);
1932 local_bh_disable();
1933 st->state = TCP_SEQ_STATE_ESTABLISHED;
1934 rc = established_get_first(seq);
1935 }
1936 break;
1937 case TCP_SEQ_STATE_ESTABLISHED:
1938 case TCP_SEQ_STATE_TIME_WAIT:
1939 rc = established_get_next(seq, v);
1940 break;
1941 }
1942out:
1943 ++*pos;
1944 return rc;
1945}
1946
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
			read_unlock_bh(&tp->accept_queue.syn_wait_lock);
		}
		/* Fall through: an open request walk also holds the listen lock. */
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}

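/*
 * Allocate per-reader iterator state and point the generic seq_ops at
 * the address-family specific show routine carried in tcp_seq_afinfo.
 */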
static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family	 = afinfo->family;
	s->seq_ops.start = tcp_seq_start;
	s->seq_ops.next	 = tcp_seq_next;
	s->seq_ops.show	 = afinfo->seq_show;
	s->seq_ops.stop	 = tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

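/*
 * Register a /proc/net entry for one address family; this file calls it
 * for "tcp", and the IPv6 side registers "tcp6" the same way.
 */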
int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner	  = afinfo->owner;
	afinfo->seq_fops->open	  = tcp_seq_open;
	afinfo->seq_fops->read	  = seq_read;
	afinfo->seq_fops->llseek  = seq_lseek;
	afinfo->seq_fops->release = seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}

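/*
 * Formatters for the three kinds of entries a walk can yield: open
 * requests, full sockets, and TIME_WAIT sockets.  Each emits one
 * fixed-width line; fields that do not apply to a given kind are
 * printed as zero so the column layout stays identical.
 */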
static void get_openreq4(struct sock *sk, struct request_sock *req,
			 char *tmpbuf, int i, int uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}

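/*
 * The "tr" column encodes which timer is pending: 0 - none,
 * 1 - retransmit, 2 - another timer such as keepalive (sk_timer),
 * 4 - zero window probe.  TIME_WAIT entries report 3.
 */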
static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sp);
	struct inet_sock *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (tp->pending == TCP_TIME_RETRANS) {
		timer_active  = 1;
		timer_expires = tp->timeout;
	} else if (tp->pending == TCP_TIME_PROBE0) {
		timer_active  = 4;
		timer_expires = tp->timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active  = 2;
		timer_expires = sp->sk_timer.expires;
	} else {
		timer_active  = 0;
		timer_expires = jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
		"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		tp->retransmits,
		sock_i_uid(sp),
		tp->probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}

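/*
 * TIME_WAIT sockets carry only addressing state, so most columns are
 * reported as zero; tw_ttd is the jiffy at which the entry expires.
 */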
static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

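/*
 * Illustrative (not captured) example of one resulting row, here a
 * listener on loopback port 22, wrapped for width:
 *
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000
 *       00000000     0        0 12345 1 c0d1e2f0 200 40 0 10 -1
 *
 * The real output is a single line padded to TMPSZ - 1 characters, the
 * same width the header below is padded to.
 */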
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}

static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */

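/*
 * The AF_INET TCP protocol descriptor: every operation the socket layer
 * dispatches through struct proto lands in the tcp_* handlers below.
 */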
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= tcp_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.sendmsg		= tcp_sendmsg,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= tcp_v4_hash,
	.unhash			= tcp_unhash,
	.get_port		= tcp_v4_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.twsk_obj_size		= sizeof(struct tcp_timewait_sock),
	.rsk_prot		= &tcp_request_sock_ops,
};

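/*
 * Create the kernel-internal socket that TCP uses to emit control
 * segments such as RSTs; it is unhashed below so that no inbound
 * traffic can ever be matched to it.
 */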
void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);

	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even
	 * see it, we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}

EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);