1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26/*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
55#include <linux/config.h>
56
57#include <linux/types.h>
58#include <linux/fcntl.h>
59#include <linux/module.h>
60#include <linux/random.h>
61#include <linux/cache.h>
62#include <linux/jhash.h>
63#include <linux/init.h>
64#include <linux/times.h>
65
66#include <net/icmp.h>
67#include <net/inet_hashtables.h>
68#include <net/tcp.h>
69#include <net/ipv6.h>
70#include <net/inet_common.h>
71#include <net/xfrm.h>
72
73#include <linux/inet.h>
74#include <linux/ipv6.h>
75#include <linux/stddef.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78
79extern int sysctl_ip_dynaddr;
80int sysctl_tcp_tw_reuse;
81int sysctl_tcp_low_latency;
82
83/* Check TCP sequence numbers in ICMP packets. */
84#define ICMP_MIN_LENGTH 8
85
86/* Socket used for sending RSTs */
87static struct socket *tcp_socket;
88
89void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90 struct sk_buff *skb);
91
92struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
97};
98
99/*
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
102 * 32768-61000
103 */
104int sysctl_local_port_range[2] = { 1024, 4999 };
105int tcp_port_rover = 1024 - 1;
106
107/* Caller must disable local BH processing. */
108static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
109{
110 struct inet_bind_hashbucket *head =
111 &tcp_bhash[inet_bhashfn(inet_sk(child)->num,
112 tcp_bhash_size)];
113 struct inet_bind_bucket *tb;
114
115 spin_lock(&head->lock);
116 tb = inet_sk(sk)->bind_hash;
117 sk_add_bind_node(child, &tb->owners);
118 inet_sk(child)->bind_hash = tb;
119 spin_unlock(&head->lock);
120}
121
122inline void tcp_inherit_port(struct sock *sk, struct sock *child)
123{
124 local_bh_disable();
125 __tcp_inherit_port(sk, child);
126 local_bh_enable();
127}
128
129void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
130 const unsigned short snum)
131{
132 struct inet_sock *inet = inet_sk(sk);
133 inet->num = snum;
134 sk_add_bind_node(sk, &tb->owners);
135 inet->bind_hash = tb;
136}
137
138static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
139{
140 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
141 struct sock *sk2;
142 struct hlist_node *node;
143 int reuse = sk->sk_reuse;
144
145 sk_for_each_bound(sk2, node, &tb->owners) {
146 if (sk != sk2 &&
147 !tcp_v6_ipv6only(sk2) &&
148 (!sk->sk_bound_dev_if ||
149 !sk2->sk_bound_dev_if ||
150 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
151 if (!reuse || !sk2->sk_reuse ||
152 sk2->sk_state == TCP_LISTEN) {
153 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
154 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
155 sk2_rcv_saddr == sk_rcv_saddr)
156 break;
157 }
158 }
159 }
160 return node != NULL;
161}
162
163/* Obtain a reference to a local port for the given sock,
164 * if snum is zero it means select any available local port.
165 */
166static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
167{
168 struct inet_bind_hashbucket *head;
169 struct hlist_node *node;
170 struct inet_bind_bucket *tb;
171 int ret;
172
173 local_bh_disable();
174 if (!snum) {
175 int low = sysctl_local_port_range[0];
176 int high = sysctl_local_port_range[1];
177 int remaining = (high - low) + 1;
178 int rover;
179
180 spin_lock(&tcp_portalloc_lock);
181 if (tcp_port_rover < low)
182 rover = low;
183 else
184 rover = tcp_port_rover;
185 do {
186 rover++;
187 if (rover > high)
188 rover = low;
189 head = &tcp_bhash[inet_bhashfn(rover, tcp_bhash_size)];
190 spin_lock(&head->lock);
191 inet_bind_bucket_for_each(tb, node, &head->chain)
192 if (tb->port == rover)
193 goto next;
194 break;
195 next:
196 spin_unlock(&head->lock);
197 } while (--remaining > 0);
198 tcp_port_rover = rover;
199 spin_unlock(&tcp_portalloc_lock);
200
201 /* Exhausted local port range during search? It is not
202 * possible for us to be holding one of the bind hash
203 * locks if this test triggers, because if 'remaining'
204 * drops to zero, we broke out of the do/while loop at
205 * the top level, not from the 'break;' statement.
206 */
207 ret = 1;
208 if (unlikely(remaining <= 0))
209 goto fail;
210
211 /* OK, here is the one we will use. HEAD is
212 * non-NULL and we hold its mutex.
213 */
214 snum = rover;
215 } else {
216 head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
217 spin_lock(&head->lock);
218 inet_bind_bucket_for_each(tb, node, &head->chain)
219 if (tb->port == snum)
220 goto tb_found;
221 }
222 tb = NULL;
223 goto tb_not_found;
224tb_found:
225 if (!hlist_empty(&tb->owners)) {
226 if (sk->sk_reuse > 1)
227 goto success;
228 if (tb->fastreuse > 0 &&
229 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
230 goto success;
231 } else {
232 ret = 1;
233 if (tcp_bind_conflict(sk, tb))
234 goto fail_unlock;
235 }
236 }
237tb_not_found:
238 ret = 1;
239 if (!tb && (tb = inet_bind_bucket_create(tcp_bucket_cachep, head, snum)) == NULL)
240 goto fail_unlock;
241 if (hlist_empty(&tb->owners)) {
242 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
243 tb->fastreuse = 1;
244 else
245 tb->fastreuse = 0;
246 } else if (tb->fastreuse &&
247 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
248 tb->fastreuse = 0;
249success:
250 if (!inet_sk(sk)->bind_hash)
251 tcp_bind_hash(sk, tb, snum);
252 BUG_TRAP(inet_sk(sk)->bind_hash == tb);
253 ret = 0;
254
255fail_unlock:
256 spin_unlock(&head->lock);
257fail:
258 local_bh_enable();
259 return ret;
260}
261
262/* Get rid of any references to a local port held by the
263 * given sock.
264 */
265static void __tcp_put_port(struct sock *sk)
266{
267 struct inet_sock *inet = inet_sk(sk);
268 struct inet_bind_hashbucket *head = &tcp_bhash[inet_bhashfn(inet->num,
269 tcp_bhash_size)];
270 struct inet_bind_bucket *tb;
271
272 spin_lock(&head->lock);
273 tb = inet->bind_hash;
274 __sk_del_bind_node(sk);
275 inet->bind_hash = NULL;
276 inet->num = 0;
277 inet_bind_bucket_destroy(tcp_bucket_cachep, tb);
278 spin_unlock(&head->lock);
279}
280
281void tcp_put_port(struct sock *sk)
282{
283 local_bh_disable();
284 __tcp_put_port(sk);
285 local_bh_enable();
286}
287
288/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
289 * Look, when several writers sleep and reader wakes them up, all but one
290 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
291 * this, _but_ remember, it adds useless work on UP machines (wake up each
292 * exclusive lock release). It should be ifdefed really.
293 */
294
295void tcp_listen_wlock(void)
296{
297 write_lock(&tcp_lhash_lock);
298
299 if (atomic_read(&tcp_lhash_users)) {
300 DEFINE_WAIT(wait);
301
302 for (;;) {
303 prepare_to_wait_exclusive(&tcp_lhash_wait,
304 &wait, TASK_UNINTERRUPTIBLE);
305 if (!atomic_read(&tcp_lhash_users))
306 break;
307 write_unlock_bh(&tcp_lhash_lock);
308 schedule();
309 write_lock_bh(&tcp_lhash_lock);
310 }
311
312 finish_wait(&tcp_lhash_wait, &wait);
313 }
314}
315
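/* Hash a socket into the TCP lookup tables: a listening socket goes into
 * tcp_listening_hash under the listener lock, anything else goes into the
 * established hash (tcp_ehash) under the per-bucket lock, bumping the
 * protocol use count either way.
 */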
316static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
317{
318 struct hlist_head *list;
319 rwlock_t *lock;
320
321 BUG_TRAP(sk_unhashed(sk));
322 if (listen_possible && sk->sk_state == TCP_LISTEN) {
323 list = &tcp_listening_hash[inet_sk_listen_hashfn(sk)];
324 lock = &tcp_lhash_lock;
325 tcp_listen_wlock();
326 } else {
327 sk->sk_hashent = inet_sk_ehashfn(sk, tcp_ehash_size);
328 list = &tcp_ehash[sk->sk_hashent].chain;
329 lock = &tcp_ehash[sk->sk_hashent].lock;
330 write_lock(lock);
331 }
332 __sk_add_node(sk, list);
333 sock_prot_inc_use(sk->sk_prot);
334 write_unlock(lock);
335 if (listen_possible && sk->sk_state == TCP_LISTEN)
336 wake_up(&tcp_lhash_wait);
337}
338
339static void tcp_v4_hash(struct sock *sk)
340{
341 if (sk->sk_state != TCP_CLOSE) {
342 local_bh_disable();
343 __tcp_v4_hash(sk, 1);
344 local_bh_enable();
345 }
346}
347
348void tcp_unhash(struct sock *sk)
349{
350 rwlock_t *lock;
351
352 if (sk_unhashed(sk))
353 goto ende;
354
355 if (sk->sk_state == TCP_LISTEN) {
356 local_bh_disable();
357 tcp_listen_wlock();
358 lock = &tcp_lhash_lock;
359 } else {
360 struct inet_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
361 lock = &head->lock;
362 write_lock_bh(&head->lock);
363 }
364
365 if (__sk_del_node_init(sk))
366 sock_prot_dec_use(sk->sk_prot);
367 write_unlock_bh(lock);
368
369 ende:
370 if (sk->sk_state == TCP_LISTEN)
371 wake_up(&tcp_lhash_wait);
372}
373
374/* Don't inline this cruft. There are some nice properties to
375 * exploit here. The BSD API does not allow a listening TCP
376 * to specify the remote port nor the remote address for the
377 * connection. So always assume those are both wildcarded
378 * during the search since they can never be otherwise.
379 */
380static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
381 const u32 daddr,
382 const unsigned short hnum,
383 const int dif)
384{
385 struct sock *result = NULL, *sk;
386 struct hlist_node *node;
387 int score, hiscore;
388
389 hiscore=-1;
390 sk_for_each(sk, node, head) {
391 struct inet_sock *inet = inet_sk(sk);
392
393 if (inet->num == hnum && !ipv6_only_sock(sk)) {
394 __u32 rcv_saddr = inet->rcv_saddr;
395
396 score = (sk->sk_family == PF_INET ? 1 : 0);
397 if (rcv_saddr) {
398 if (rcv_saddr != daddr)
399 continue;
400 score+=2;
401 }
402 if (sk->sk_bound_dev_if) {
403 if (sk->sk_bound_dev_if != dif)
404 continue;
405 score+=2;
406 }
407 if (score == 5)
408 return sk;
409 if (score > hiscore) {
410 hiscore = score;
411 result = sk;
412 }
413 }
414 }
415 return result;
416}
417
418/* Optimize the common listener case. */
419static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
420 const unsigned short hnum,
421 const int dif)
422{
423 struct sock *sk = NULL;
424 struct hlist_head *head;
425
426 read_lock(&tcp_lhash_lock);
427 head = &tcp_listening_hash[inet_lhashfn(hnum)];
428 if (!hlist_empty(head)) {
429 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
430
431 if (inet->num == hnum && !sk->sk_node.next &&
432 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
433 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
434 !sk->sk_bound_dev_if)
435 goto sherry_cache;
436 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
437 }
438 if (sk) {
439sherry_cache:
440 sock_hold(sk);
441 }
442 read_unlock(&tcp_lhash_lock);
443 return sk;
444}
445
446/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
447 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
448 *
449 * Local BH must be disabled here.
450 */
451
452static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
453 const u16 sport,
454 const u32 daddr,
455 const u16 hnum,
456 const int dif)
457{
458 struct inet_ehash_bucket *head;
459 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
460 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
461 struct sock *sk;
462 struct hlist_node *node;
463 /* Optimize here for direct hit, only listening connections can
464 * have wildcards anyways.
465 */
466 const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_ehash_size);
467 head = &tcp_ehash[hash];
468 read_lock(&head->lock);
469 sk_for_each(sk, node, &head->chain) {
470 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
471 goto hit; /* You sunk my battleship! */
472 }
473
474 /* Must check for a TIME_WAIT'er before going to listener hash. */
475 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
476 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
477 goto hit;
478 }
479 sk = NULL;
480out:
481 read_unlock(&head->lock);
482 return sk;
483hit:
484 sock_hold(sk);
485 goto out;
486}
487
488static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
489 u32 daddr, u16 hnum, int dif)
490{
491 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
492 daddr, hnum, dif);
493
494 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
495}
496
497inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
498 u16 dport, int dif)
499{
500 struct sock *sk;
501
502 local_bh_disable();
503 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
504 local_bh_enable();
505
506 return sk;
507}
508
509EXPORT_SYMBOL_GPL(tcp_v4_lookup);
510
511static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
512{
513 return secure_tcp_sequence_number(skb->nh.iph->daddr,
514 skb->nh.iph->saddr,
515 skb->h.th->dest,
516 skb->h.th->source);
517}
518
519/* called with local bh disabled */
520static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
521 struct tcp_tw_bucket **twp)
522{
523 struct inet_sock *inet = inet_sk(sk);
524 u32 daddr = inet->rcv_saddr;
525 u32 saddr = inet->daddr;
526 int dif = sk->sk_bound_dev_if;
527 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
528 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
529 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size);
530 struct inet_ehash_bucket *head = &tcp_ehash[hash];
531 struct sock *sk2;
532 struct hlist_node *node;
533 struct tcp_tw_bucket *tw;
534
535 write_lock(&head->lock);
536
537 /* Check TIME-WAIT sockets first. */
538 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
539 tw = (struct tcp_tw_bucket *)sk2;
540
541 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
542 struct tcp_sock *tp = tcp_sk(sk);
543
544 /* With PAWS, it is safe from the viewpoint
545 of data integrity. Even without PAWS it
546 is safe provided sequence spaces do not
547 overlap i.e. at data rates <= 80Mbit/sec.
548
549 Actually, the idea is close to VJ's one,
550 only timestamp cache is held not per host,
551 but per port pair and TW bucket is used
552 as state holder.
553
554 If TW bucket has been already destroyed we
555 fall back to VJ's scheme and use initial
556 timestamp retrieved from peer table.
557 */
558 if (tw->tw_ts_recent_stamp &&
559 (!twp || (sysctl_tcp_tw_reuse &&
560 xtime.tv_sec -
561 tw->tw_ts_recent_stamp > 1))) {
562 if ((tp->write_seq =
563 tw->tw_snd_nxt + 65535 + 2) == 0)
564 tp->write_seq = 1;
565 tp->rx_opt.ts_recent = tw->tw_ts_recent;
566 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
567 sock_hold(sk2);
568 goto unique;
569 } else
570 goto not_unique;
571 }
572 }
573 tw = NULL;
574
575 /* And established part... */
576 sk_for_each(sk2, node, &head->chain) {
577 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
578 goto not_unique;
579 }
580
581unique:
582 /* Must record num and sport now. Otherwise we will see
583 * a socket with a funny identity in the hash table. */
584 inet->num = lport;
585 inet->sport = htons(lport);
586 sk->sk_hashent = hash;
587 BUG_TRAP(sk_unhashed(sk));
588 __sk_add_node(sk, &head->chain);
589 sock_prot_inc_use(sk->sk_prot);
590 write_unlock(&head->lock);
591
592 if (twp) {
593 *twp = tw;
594 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
595 } else if (tw) {
596 /* Silly. Should hash-dance instead... */
597 tcp_tw_deschedule(tw);
598 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
599
600 tcp_tw_put(tw);
601 }
602
603 return 0;
604
605not_unique:
606 write_unlock(&head->lock);
607 return -EADDRNOTAVAIL;
608}
609
610static inline u32 connect_port_offset(const struct sock *sk)
611{
612 const struct inet_sock *inet = inet_sk(sk);
613
614 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
615 inet->dport);
616}
617
618/*
619 * Bind a port for a connect operation and hash it.
620 */
621static inline int tcp_v4_hash_connect(struct sock *sk)
622{
623 const unsigned short snum = inet_sk(sk)->num;
624 struct inet_bind_hashbucket *head;
625 struct inet_bind_bucket *tb;
626 int ret;
627
628 if (!snum) {
629 int low = sysctl_local_port_range[0];
630 int high = sysctl_local_port_range[1];
631 int range = high - low;
632 int i;
633 int port;
634 static u32 hint;
635 u32 offset = hint + connect_port_offset(sk);
636 struct hlist_node *node;
637 struct tcp_tw_bucket *tw = NULL;
638
639 local_bh_disable();
640 for (i = 1; i <= range; i++) {
641 port = low + (i + offset) % range;
642 head = &tcp_bhash[inet_bhashfn(port, tcp_bhash_size)];
643 spin_lock(&head->lock);
644
645 /* Does not bother with rcv_saddr checks,
646 * because the established check is already
647 * unique enough.
648 */
649 inet_bind_bucket_for_each(tb, node, &head->chain) {
650 if (tb->port == port) {
651 BUG_TRAP(!hlist_empty(&tb->owners));
652 if (tb->fastreuse >= 0)
653 goto next_port;
654 if (!__tcp_v4_check_established(sk,
655 port,
656 &tw))
657 goto ok;
658 goto next_port;
659 }
660 }
661
662 tb = inet_bind_bucket_create(tcp_bucket_cachep, head, port);
663 if (!tb) {
664 spin_unlock(&head->lock);
665 break;
666 }
667 tb->fastreuse = -1;
668 goto ok;
669
670 next_port:
671 spin_unlock(&head->lock);
672 }
673 local_bh_enable();
674
675 return -EADDRNOTAVAIL;
676
677ok:
678 hint += i;
679
680 /* Head lock still held and bh's disabled */
681 tcp_bind_hash(sk, tb, port);
682 if (sk_unhashed(sk)) {
683 inet_sk(sk)->sport = htons(port);
684 __tcp_v4_hash(sk, 0);
685 }
686 spin_unlock(&head->lock);
687
688 if (tw) {
689 tcp_tw_deschedule(tw);
690 tcp_tw_put(tw);
691 }
692
693 ret = 0;
694 goto out;
695 }
696
697 head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
698 tb = inet_sk(sk)->bind_hash;
699 spin_lock_bh(&head->lock);
700 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
701 __tcp_v4_hash(sk, 0);
702 spin_unlock_bh(&head->lock);
703 return 0;
704 } else {
705 spin_unlock(&head->lock);
706 /* No definite answer... Walk to established hash table */
707 ret = __tcp_v4_check_established(sk, snum, NULL);
708out:
709 local_bh_enable();
710 return ret;
711 }
712}
713
714/* This will initiate an outgoing connection. */
715int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
716{
717 struct inet_sock *inet = inet_sk(sk);
718 struct tcp_sock *tp = tcp_sk(sk);
719 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
720 struct rtable *rt;
721 u32 daddr, nexthop;
722 int tmp;
723 int err;
724
725 if (addr_len < sizeof(struct sockaddr_in))
726 return -EINVAL;
727
728 if (usin->sin_family != AF_INET)
729 return -EAFNOSUPPORT;
730
731 nexthop = daddr = usin->sin_addr.s_addr;
732 if (inet->opt && inet->opt->srr) {
733 if (!daddr)
734 return -EINVAL;
735 nexthop = inet->opt->faddr;
736 }
737
738 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
739 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
740 IPPROTO_TCP,
741 inet->sport, usin->sin_port, sk);
742 if (tmp < 0)
743 return tmp;
744
745 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
746 ip_rt_put(rt);
747 return -ENETUNREACH;
748 }
749
750 if (!inet->opt || !inet->opt->srr)
751 daddr = rt->rt_dst;
752
753 if (!inet->saddr)
754 inet->saddr = rt->rt_src;
755 inet->rcv_saddr = inet->saddr;
756
757 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
758 /* Reset inherited state */
759 tp->rx_opt.ts_recent = 0;
760 tp->rx_opt.ts_recent_stamp = 0;
761 tp->write_seq = 0;
762 }
763
764 if (sysctl_tcp_tw_recycle &&
765 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
766 struct inet_peer *peer = rt_get_peer(rt);
767
768 /* VJ's idea. We save last timestamp seen from
769 * the destination in peer table, when entering state TIME-WAIT
770 * and initialize rx_opt.ts_recent from it, when trying new connection.
771 */
772
773 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
774 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
775 tp->rx_opt.ts_recent = peer->tcp_ts;
776 }
777 }
778
779 inet->dport = usin->sin_port;
780 inet->daddr = daddr;
781
782 tp->ext_header_len = 0;
783 if (inet->opt)
784 tp->ext_header_len = inet->opt->optlen;
785
786 tp->rx_opt.mss_clamp = 536;
787
788 /* Socket identity is still unknown (sport may be zero).
789 * However we set state to SYN-SENT and, without releasing the socket
790 * lock, select a source port, enter ourselves into the hash tables and
791 * complete initialization after this.
792 */
793 tcp_set_state(sk, TCP_SYN_SENT);
794 err = tcp_v4_hash_connect(sk);
795 if (err)
796 goto failure;
797
798 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
799 if (err)
800 goto failure;
801
802 /* OK, now commit destination to socket. */
803 sk_setup_caps(sk, &rt->u.dst);
804
805 if (!tp->write_seq)
806 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
807 inet->daddr,
808 inet->sport,
809 usin->sin_port);
810
811 inet->id = tp->write_seq ^ jiffies;
812
813 err = tcp_connect(sk);
814 rt = NULL;
815 if (err)
816 goto failure;
817
818 return 0;
819
820failure:
821 /* This unhashes the socket and releases the local port, if necessary. */
822 tcp_set_state(sk, TCP_CLOSE);
823 ip_rt_put(rt);
824 sk->sk_route_caps = 0;
825 inet->dport = 0;
826 return err;
827}
828
829static __inline__ int tcp_v4_iif(struct sk_buff *skb)
830{
831 return ((struct rtable *)skb->dst)->rt_iif;
832}
833
834static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
835{
836 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
837}
838
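/* Walk the listener's SYN table looking for a pending request_sock that
 * matches the remote port, remote address and local address; *prevp is
 * set to the preceding link so the caller can unlink the entry.
 */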
839static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
840 struct request_sock ***prevp,
841 __u16 rport,
842 __u32 raddr, __u32 laddr)
843{
844 struct listen_sock *lopt = tp->accept_queue.listen_opt;
845 struct request_sock *req, **prev;
846
847 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
848 (req = *prev) != NULL;
849 prev = &req->dl_next) {
850 const struct inet_request_sock *ireq = inet_rsk(req);
851
852 if (ireq->rmt_port == rport &&
853 ireq->rmt_addr == raddr &&
854 ireq->loc_addr == laddr &&
855 TCP_INET_FAMILY(req->rsk_ops->family)) {
856 BUG_TRAP(!req->sk);
857 *prevp = prev;
858 break;
859 }
860 }
861
862 return req;
863}
864
865static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
866{
867 struct tcp_sock *tp = tcp_sk(sk);
868 struct listen_sock *lopt = tp->accept_queue.listen_opt;
869 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
870
871 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
872 tcp_synq_added(sk);
873}
874
875
876/*
877 * This routine does path mtu discovery as defined in RFC1191.
878 */
879static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
880 u32 mtu)
881{
882 struct dst_entry *dst;
883 struct inet_sock *inet = inet_sk(sk);
884 struct tcp_sock *tp = tcp_sk(sk);
885
886 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
887 * sent out by Linux are always < 576 bytes so they should go through
888 * unfragmented).
889 */
890 if (sk->sk_state == TCP_LISTEN)
891 return;
892
893 /* We don't check in the dst entry if pmtu discovery is forbidden
894 * on this route. We just assume that no packet-too-big packets
895 * are sent back when pmtu discovery is not active.
896 * There is a small race when the user changes this flag in the
897 * route, but I think that's acceptable.
898 */
899 if ((dst = __sk_dst_check(sk, 0)) == NULL)
900 return;
901
902 dst->ops->update_pmtu(dst, mtu);
903
904 /* Something is about to go wrong... Remember the soft error
905 * in case this connection will not be able to recover.
906 */
907 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
908 sk->sk_err_soft = EMSGSIZE;
909
910 mtu = dst_mtu(dst);
911
912 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
913 tp->pmtu_cookie > mtu) {
914 tcp_sync_mss(sk, mtu);
915
916 /* Resend the TCP packet because it's
917 * clear that the old packet has been
918 * dropped. This is the new "fast" path mtu
919 * discovery.
920 */
921 tcp_simple_retransmit(sk);
922 } /* else let the usual retransmit timer handle it */
923}
924
925/*
926 * This routine is called by the ICMP module when it gets some
927 * sort of error condition. If err < 0 then the socket should
928 * be closed and the error returned to the user. If err > 0
929 * it's just the icmp type << 8 | icmp code. After adjustment
930 * header points to the first 8 bytes of the tcp header. We need
931 * to find the appropriate port.
932 *
933 * The locking strategy used here is very "optimistic". When
934 * someone else accesses the socket the ICMP is just dropped
935 * and for some paths there is no check at all.
936 * A more general error queue to queue errors for later handling
937 * is probably better.
938 *
939 */
940
941void tcp_v4_err(struct sk_buff *skb, u32 info)
942{
943 struct iphdr *iph = (struct iphdr *)skb->data;
944 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
945 struct tcp_sock *tp;
946 struct inet_sock *inet;
947 int type = skb->h.icmph->type;
948 int code = skb->h.icmph->code;
949 struct sock *sk;
950 __u32 seq;
951 int err;
952
953 if (skb->len < (iph->ihl << 2) + 8) {
954 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
955 return;
956 }
957
958 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
959 th->source, tcp_v4_iif(skb));
960 if (!sk) {
961 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
962 return;
963 }
964 if (sk->sk_state == TCP_TIME_WAIT) {
965 tcp_tw_put((struct tcp_tw_bucket *)sk);
966 return;
967 }
968
969 bh_lock_sock(sk);
970 /* If too many ICMPs get dropped on busy
971 * servers this needs to be solved differently.
972 */
973 if (sock_owned_by_user(sk))
974 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
975
976 if (sk->sk_state == TCP_CLOSE)
977 goto out;
978
979 tp = tcp_sk(sk);
980 seq = ntohl(th->seq);
981 if (sk->sk_state != TCP_LISTEN &&
982 !between(seq, tp->snd_una, tp->snd_nxt)) {
983 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
984 goto out;
985 }
986
987 switch (type) {
988 case ICMP_SOURCE_QUENCH:
989 /* Just silently ignore these. */
990 goto out;
991 case ICMP_PARAMETERPROB:
992 err = EPROTO;
993 break;
994 case ICMP_DEST_UNREACH:
995 if (code > NR_ICMP_UNREACH)
996 goto out;
997
998 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
999 if (!sock_owned_by_user(sk))
1000 do_pmtu_discovery(sk, iph, info);
1001 goto out;
1002 }
1003
1004 err = icmp_err_convert[code].errno;
1005 break;
1006 case ICMP_TIME_EXCEEDED:
1007 err = EHOSTUNREACH;
1008 break;
1009 default:
1010 goto out;
1011 }
1012
1013 switch (sk->sk_state) {
1014 struct request_sock *req, **prev;
1015 case TCP_LISTEN:
1016 if (sock_owned_by_user(sk))
1017 goto out;
1018
1019 req = tcp_v4_search_req(tp, &prev, th->dest,
1020 iph->daddr, iph->saddr);
1021 if (!req)
1022 goto out;
1023
1024 /* ICMPs are not backlogged, hence we cannot get
1025 an established socket here.
1026 */
1027 BUG_TRAP(!req->sk);
1028
1029 if (seq != tcp_rsk(req)->snt_isn) {
1030 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1031 goto out;
1032 }
1033
1034 /*
1035 * Still in SYN_RECV, just remove it silently.
1036 * There is no good way to pass the error to the newly
1037 * created socket, and POSIX does not want network
1038 * errors returned from accept().
1039 */
1040 tcp_synq_drop(sk, req, prev);
1041 goto out;
1042
1043 case TCP_SYN_SENT:
1044 case TCP_SYN_RECV: /* Cannot happen.
1045 It can happen, e.g., if SYNs crossed.
1046 */
1047 if (!sock_owned_by_user(sk)) {
1048 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1049 sk->sk_err = err;
1050
1051 sk->sk_error_report(sk);
1052
1053 tcp_done(sk);
1054 } else {
1055 sk->sk_err_soft = err;
1056 }
1057 goto out;
1058 }
1059
1060 /* If we've already connected we will keep trying
1061 * until we time out, or the user gives up.
1062 *
1063 * rfc1122 4.2.3.9 allows us to consider as hard errors
1064 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1065 * but it is obsoleted by pmtu discovery).
1066 *
1067 * Note, that in modern internet, where routing is unreliable
1068 * and in each dark corner broken firewalls sit, sending random
1069 * errors ordered by their masters even these two messages finally lose
1070 * their original sense (even Linux sends invalid PORT_UNREACHs)
1071 *
1072 * Now we are in compliance with RFCs.
1073 * --ANK (980905)
1074 */
1075
1076 inet = inet_sk(sk);
1077 if (!sock_owned_by_user(sk) && inet->recverr) {
1078 sk->sk_err = err;
1079 sk->sk_error_report(sk);
1080 } else { /* Only an error on timeout */
1081 sk->sk_err_soft = err;
1082 }
1083
1084out:
1085 bh_unlock_sock(sk);
1086 sock_put(sk);
1087}
1088
1089/* This routine computes an IPv4 TCP checksum. */
1090void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1091 struct sk_buff *skb)
1092{
1093 struct inet_sock *inet = inet_sk(sk);
1094
1095 if (skb->ip_summed == CHECKSUM_HW) {
1096 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1097 skb->csum = offsetof(struct tcphdr, check);
1098 } else {
1099 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1100 csum_partial((char *)th,
1101 th->doff << 2,
1102 skb->csum));
1103 }
1104}
1105
1106/*
1107 * This routine will send an RST to the other tcp.
1108 *
1109 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1110 * for reset.
1111 * Answer: if a packet caused RST, it is not for a socket
1112 * existing in our system; if it is matched to a socket,
1113 * it is just a duplicate segment or a bug in the other side's TCP.
1114 * So we build the reply based only on parameters
1115 * that arrived with the segment.
1116 * Exception: precedence violation. We do not implement it in any case.
1117 */
1118
1119static void tcp_v4_send_reset(struct sk_buff *skb)
1120{
1121 struct tcphdr *th = skb->h.th;
1122 struct tcphdr rth;
1123 struct ip_reply_arg arg;
1124
1125 /* Never send a reset in response to a reset. */
1126 if (th->rst)
1127 return;
1128
1129 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1130 return;
1131
1132 /* Swap the send and the receive. */
1133 memset(&rth, 0, sizeof(struct tcphdr));
1134 rth.dest = th->source;
1135 rth.source = th->dest;
1136 rth.doff = sizeof(struct tcphdr) / 4;
1137 rth.rst = 1;
1138
1139 if (th->ack) {
1140 rth.seq = th->ack_seq;
1141 } else {
1142 rth.ack = 1;
1143 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1144 skb->len - (th->doff << 2));
1145 }
1146
1147 memset(&arg, 0, sizeof arg);
1148 arg.iov[0].iov_base = (unsigned char *)&rth;
1149 arg.iov[0].iov_len = sizeof rth;
1150 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1151 skb->nh.iph->saddr, /*XXX*/
1152 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1153 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1154
1155 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1156
1157 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1158 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1159}
1160
1161/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1162 outside socket context, is certainly ugly. What can I do?
1163 */
1164
1165static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1166 u32 win, u32 ts)
1167{
1168 struct tcphdr *th = skb->h.th;
1169 struct {
1170 struct tcphdr th;
1171 u32 tsopt[3];
1172 } rep;
1173 struct ip_reply_arg arg;
1174
1175 memset(&rep.th, 0, sizeof(struct tcphdr));
1176 memset(&arg, 0, sizeof arg);
1177
1178 arg.iov[0].iov_base = (unsigned char *)&rep;
1179 arg.iov[0].iov_len = sizeof(rep.th);
1180 if (ts) {
1181 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1182 (TCPOPT_TIMESTAMP << 8) |
1183 TCPOLEN_TIMESTAMP);
1184 rep.tsopt[1] = htonl(tcp_time_stamp);
1185 rep.tsopt[2] = htonl(ts);
1186 arg.iov[0].iov_len = sizeof(rep);
1187 }
1188
1189 /* Swap the send and the receive. */
1190 rep.th.dest = th->source;
1191 rep.th.source = th->dest;
1192 rep.th.doff = arg.iov[0].iov_len / 4;
1193 rep.th.seq = htonl(seq);
1194 rep.th.ack_seq = htonl(ack);
1195 rep.th.ack = 1;
1196 rep.th.window = htons(win);
1197
1198 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1199 skb->nh.iph->saddr, /*XXX*/
1200 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1201 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1202
1203 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1204
1205 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1206}
1207
1208static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1209{
1210 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1211
1212 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1213 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1214
1215 tcp_tw_put(tw);
1216}
1217
1218static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1219{
1220 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1221 req->ts_recent);
1222}
1223
1224static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1225 struct request_sock *req)
1226{
1227 struct rtable *rt;
1228 const struct inet_request_sock *ireq = inet_rsk(req);
1229 struct ip_options *opt = inet_rsk(req)->opt;
1230 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1231 .nl_u = { .ip4_u =
1232 { .daddr = ((opt && opt->srr) ?
1233 opt->faddr :
1234 ireq->rmt_addr),
1235 .saddr = ireq->loc_addr,
1236 .tos = RT_CONN_FLAGS(sk) } },
1237 .proto = IPPROTO_TCP,
1238 .uli_u = { .ports =
1239 { .sport = inet_sk(sk)->sport,
1240 .dport = ireq->rmt_port } } };
1241
1242 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1243 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1244 return NULL;
1245 }
1246 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1247 ip_rt_put(rt);
1248 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1249 return NULL;
1250 }
1251 return &rt->u.dst;
1252}
1253
1254/*
1255 * Send a SYN-ACK after having received an ACK.
1256 * This still operates on a request_sock only, not on a big
1257 * socket.
1258 */
1259static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1260 struct dst_entry *dst)
1261{
1262 const struct inet_request_sock *ireq = inet_rsk(req);
1263 int err = -1;
1264 struct sk_buff * skb;
1265
1266 /* First, grab a route. */
1267 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1268 goto out;
1269
1270 skb = tcp_make_synack(sk, dst, req);
1271
1272 if (skb) {
1273 struct tcphdr *th = skb->h.th;
1274
1275 th->check = tcp_v4_check(th, skb->len,
1276 ireq->loc_addr,
1277 ireq->rmt_addr,
1278 csum_partial((char *)th, skb->len,
1279 skb->csum));
1280
1281 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1282 ireq->rmt_addr,
1283 ireq->opt);
1284 if (err == NET_XMIT_CN)
1285 err = 0;
1286 }
1287
1288out:
1289 dst_release(dst);
1290 return err;
1291}
1292
1293/*
1294 * IPv4 request_sock destructor.
1295 */
1296static void tcp_v4_reqsk_destructor(struct request_sock *req)
1297{
1298 if (inet_rsk(req)->opt)
1299 kfree(inet_rsk(req)->opt);
1300}
1301
1302static inline void syn_flood_warning(struct sk_buff *skb)
1303{
1304 static unsigned long warntime;
1305
1306 if (time_after(jiffies, (warntime + HZ * 60))) {
1307 warntime = jiffies;
1308 printk(KERN_INFO
1309 "possible SYN flooding on port %d. Sending cookies.\n",
1310 ntohs(skb->h.th->dest));
1311 }
1312}
1313
1314/*
1315 * Save and compile IPv4 options into the request_sock if needed.
1316 */
1317static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1318 struct sk_buff *skb)
1319{
1320 struct ip_options *opt = &(IPCB(skb)->opt);
1321 struct ip_options *dopt = NULL;
1322
1323 if (opt && opt->optlen) {
1324 int opt_size = optlength(opt);
1325 dopt = kmalloc(opt_size, GFP_ATOMIC);
1326 if (dopt) {
1327 if (ip_options_echo(dopt, skb)) {
1328 kfree(dopt);
1329 dopt = NULL;
1330 }
1331 }
1332 }
1333 return dopt;
1334}
1335
1336struct request_sock_ops tcp_request_sock_ops = {
1337 .family = PF_INET,
1338 .obj_size = sizeof(struct tcp_request_sock),
1339 .rtx_syn_ack = tcp_v4_send_synack,
1340 .send_ack = tcp_v4_reqsk_send_ack,
1341 .destructor = tcp_v4_reqsk_destructor,
1342 .send_reset = tcp_v4_send_reset,
1343};
1344
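/* Handle an incoming SYN on a listening socket: drop SYNs to broadcast or
 * multicast addresses, fall back to syncookies (when configured) if the
 * SYN queue is full, otherwise allocate a request_sock, record the peer's
 * options and addresses, pick an initial sequence number and reply with a
 * SYN-ACK.
 */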
1345int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1346{
1347 struct inet_request_sock *ireq;
1348 struct tcp_options_received tmp_opt;
1349 struct request_sock *req;
1350 __u32 saddr = skb->nh.iph->saddr;
1351 __u32 daddr = skb->nh.iph->daddr;
1352 __u32 isn = TCP_SKB_CB(skb)->when;
1353 struct dst_entry *dst = NULL;
1354#ifdef CONFIG_SYN_COOKIES
1355 int want_cookie = 0;
1356#else
1357#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1358#endif
1359
1360 /* Never answer SYNs sent to broadcast or multicast */
1361 if (((struct rtable *)skb->dst)->rt_flags &
1362 (RTCF_BROADCAST | RTCF_MULTICAST))
1363 goto drop;
1364
1365 /* TW buckets are converted to open requests without
1366 * limitations, they conserve resources and the peer is
1367 * evidently a real one.
1368 */
1369 if (tcp_synq_is_full(sk) && !isn) {
1370#ifdef CONFIG_SYN_COOKIES
1371 if (sysctl_tcp_syncookies) {
1372 want_cookie = 1;
1373 } else
1374#endif
1375 goto drop;
1376 }
1377
1378 /* Accept backlog is full. If we have already queued enough
1379 * of warm entries in syn queue, drop request. It is better than
1380 * clogging syn queue with openreqs with exponentially increasing
1381 * timeout.
1382 */
1383 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1384 goto drop;
1385
1386 req = reqsk_alloc(&tcp_request_sock_ops);
1387 if (!req)
1388 goto drop;
1389
1390 tcp_clear_options(&tmp_opt);
1391 tmp_opt.mss_clamp = 536;
1392 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1393
1394 tcp_parse_options(skb, &tmp_opt, 0);
1395
1396 if (want_cookie) {
1397 tcp_clear_options(&tmp_opt);
1398 tmp_opt.saw_tstamp = 0;
1399 }
1400
1401 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1402 /* Some OSes (unknown ones, but I see them on web servers, which
1403 * contain information interesting only for windows'
1404 * users) do not send their stamp in SYN. It is an easy case.
1405 * We simply do not advertise TS support.
1406 */
1407 tmp_opt.saw_tstamp = 0;
1408 tmp_opt.tstamp_ok = 0;
1409 }
1410 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1411
1412 tcp_openreq_init(req, &tmp_opt, skb);
1413
1414 ireq = inet_rsk(req);
1415 ireq->loc_addr = daddr;
1416 ireq->rmt_addr = saddr;
1417 ireq->opt = tcp_v4_save_options(sk, skb);
1418 if (!want_cookie)
1419 TCP_ECN_create_request(req, skb->h.th);
1420
1421 if (want_cookie) {
1422#ifdef CONFIG_SYN_COOKIES
1423 syn_flood_warning(skb);
1424#endif
1425 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1426 } else if (!isn) {
1427 struct inet_peer *peer = NULL;
1428
1429 /* VJ's idea. We save last timestamp seen
1430 * from the destination in peer table, when entering
1431 * state TIME-WAIT, and check against it before
1432 * accepting new connection request.
1433 *
1434 * If "isn" is not zero, this request hit alive
1435 * timewait bucket, so that all the necessary checks
1436 * are made in the function processing timewait state.
1437 */
1438 if (tmp_opt.saw_tstamp &&
1439 sysctl_tcp_tw_recycle &&
1440 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1441 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1442 peer->v4daddr == saddr) {
1443 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1444 (s32)(peer->tcp_ts - req->ts_recent) >
1445 TCP_PAWS_WINDOW) {
1446 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1447 dst_release(dst);
1448 goto drop_and_free;
1449 }
1450 }
1451 /* Kill the following clause, if you dislike this way. */
1452 else if (!sysctl_tcp_syncookies &&
1453 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1454 (sysctl_max_syn_backlog >> 2)) &&
1455 (!peer || !peer->tcp_ts_stamp) &&
1456 (!dst || !dst_metric(dst, RTAX_RTT))) {
1457 /* Without syncookies last quarter of
1458 * backlog is filled with destinations,
1459 * proven to be alive.
1460 * It means that we continue to communicate
1461 * to destinations, already remembered
1462 * to the moment of synflood.
1463 */
1464 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1465 "request from %u.%u."
1466 "%u.%u/%u\n",
1467 NIPQUAD(saddr),
1468 ntohs(skb->h.th->source)));
1469 dst_release(dst);
1470 goto drop_and_free;
1471 }
1472
1473 isn = tcp_v4_init_sequence(sk, skb);
1474 }
1475 tcp_rsk(req)->snt_isn = isn;
1476
1477 if (tcp_v4_send_synack(sk, req, dst))
1478 goto drop_and_free;
1479
1480 if (want_cookie) {
1481 reqsk_free(req);
1482 } else {
1483 tcp_v4_synq_add(sk, req);
1484 }
1485 return 0;
1486
1487drop_and_free:
1488 reqsk_free(req);
1489drop:
1490 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1491 return 0;
1492}
1493
1494
1495/*
1496 * The three way handshake has completed - we got a valid synack -
1497 * now create the new socket.
1498 */
1499struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1500 struct request_sock *req,
1501 struct dst_entry *dst)
1502{
1503 struct inet_request_sock *ireq;
1504 struct inet_sock *newinet;
1505 struct tcp_sock *newtp;
1506 struct sock *newsk;
1507
1508 if (sk_acceptq_is_full(sk))
1509 goto exit_overflow;
1510
1511 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1512 goto exit;
1513
1514 newsk = tcp_create_openreq_child(sk, req, skb);
1515 if (!newsk)
1516 goto exit;
1517
1518 sk_setup_caps(newsk, dst);
1519
1520 newtp = tcp_sk(newsk);
1521 newinet = inet_sk(newsk);
1522 ireq = inet_rsk(req);
1523 newinet->daddr = ireq->rmt_addr;
1524 newinet->rcv_saddr = ireq->loc_addr;
1525 newinet->saddr = ireq->loc_addr;
1526 newinet->opt = ireq->opt;
1527 ireq->opt = NULL;
1528 newinet->mc_index = tcp_v4_iif(skb);
1529 newinet->mc_ttl = skb->nh.iph->ttl;
1530 newtp->ext_header_len = 0;
1531 if (newinet->opt)
1532 newtp->ext_header_len = newinet->opt->optlen;
1533 newinet->id = newtp->write_seq ^ jiffies;
1534
1535 tcp_sync_mss(newsk, dst_mtu(dst));
1536 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1537 tcp_initialize_rcv_mss(newsk);
1538
1539 __tcp_v4_hash(newsk, 0);
1540 __tcp_inherit_port(sk, newsk);
1541
1542 return newsk;
1543
1544exit_overflow:
1545 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1546exit:
1547 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1548 dst_release(dst);
1549 return NULL;
1550}
1551
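/* For a segment arriving on a listening socket, find the matching pending
 * connection request or an already established child socket (or validate a
 * syncookie ACK) and return the socket that should process the segment.
 */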
1552static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1553{
1554 struct tcphdr *th = skb->h.th;
1555 struct iphdr *iph = skb->nh.iph;
1556 struct tcp_sock *tp = tcp_sk(sk);
1557 struct sock *nsk;
1558 struct request_sock **prev;
1559 /* Find possible connection requests. */
1560 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1561 iph->saddr, iph->daddr);
1562 if (req)
1563 return tcp_check_req(sk, skb, req, prev);
1564
1565 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1566 th->source,
1567 skb->nh.iph->daddr,
1568 ntohs(th->dest),
1569 tcp_v4_iif(skb));
1570
1571 if (nsk) {
1572 if (nsk->sk_state != TCP_TIME_WAIT) {
1573 bh_lock_sock(nsk);
1574 return nsk;
1575 }
1576 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1577 return NULL;
1578 }
1579
1580#ifdef CONFIG_SYN_COOKIES
1581 if (!th->rst && !th->syn && th->ack)
1582 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1583#endif
1584 return sk;
1585}
1586
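/* Set up checksum state for an incoming segment: accept a good hardware
 * checksum, verify short packets immediately, and arrange for the rest to
 * be completed later via tcp_checksum_complete().
 */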
1587static int tcp_v4_checksum_init(struct sk_buff *skb)
1588{
1589 if (skb->ip_summed == CHECKSUM_HW) {
1590 skb->ip_summed = CHECKSUM_UNNECESSARY;
1591 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1592 skb->nh.iph->daddr, skb->csum))
1593 return 0;
1594
1595 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1596 skb->ip_summed = CHECKSUM_NONE;
1597 }
1598 if (skb->len <= 76) {
1599 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1600 skb->nh.iph->daddr,
1601 skb_checksum(skb, 0, skb->len, 0)))
1602 return -1;
1603 skb->ip_summed = CHECKSUM_UNNECESSARY;
1604 } else {
1605 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1606 skb->nh.iph->saddr,
1607 skb->nh.iph->daddr, 0);
1608 }
1609 return 0;
1610}
1611
1612
1613/* The socket must have its spinlock held when we get
1614 * here.
1615 *
1616 * We have a potential double-lock case here, so even when
1617 * doing backlog processing we use the BH locking scheme.
1618 * This is because we cannot sleep with the original spinlock
1619 * held.
1620 */
1621int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1622{
1623 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1624 TCP_CHECK_TIMER(sk);
1625 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1626 goto reset;
1627 TCP_CHECK_TIMER(sk);
1628 return 0;
1629 }
1630
1631 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1632 goto csum_err;
1633
1634 if (sk->sk_state == TCP_LISTEN) {
1635 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1636 if (!nsk)
1637 goto discard;
1638
1639 if (nsk != sk) {
1640 if (tcp_child_process(sk, nsk, skb))
1641 goto reset;
1642 return 0;
1643 }
1644 }
1645
1646 TCP_CHECK_TIMER(sk);
1647 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1648 goto reset;
1649 TCP_CHECK_TIMER(sk);
1650 return 0;
1651
1652reset:
1653 tcp_v4_send_reset(skb);
1654discard:
1655 kfree_skb(skb);
1656 /* Be careful here. If this function gets more complicated and
1657 * gcc suffers from register pressure on the x86, sk (in %ebx)
1658 * might be destroyed here. This current version compiles correctly,
1659 * but you have been warned.
1660 */
1661 return 0;
1662
1663csum_err:
1664 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1665 goto discard;
1666}
1667
1668/*
1669 * From tcp_input.c
1670 */
1671
1672int tcp_v4_rcv(struct sk_buff *skb)
1673{
1674 struct tcphdr *th;
1675 struct sock *sk;
1676 int ret;
1677
1678 if (skb->pkt_type != PACKET_HOST)
1679 goto discard_it;
1680
1681 /* Count it even if it's bad */
1682 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1683
1684 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1685 goto discard_it;
1686
1687 th = skb->h.th;
1688
1689 if (th->doff < sizeof(struct tcphdr) / 4)
1690 goto bad_packet;
1691 if (!pskb_may_pull(skb, th->doff * 4))
1692 goto discard_it;
1693
1694 /* An explanation is required here, I think.
1695 * Packet length and doff are validated by header prediction,
1696 * provided the case of th->doff==0 is eliminated.
1697 * So, we defer the checks. */
1698 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1699 tcp_v4_checksum_init(skb) < 0))
1700 goto bad_packet;
1701
1702 th = skb->h.th;
1703 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1704 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1705 skb->len - th->doff * 4);
1706 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1707 TCP_SKB_CB(skb)->when = 0;
1708 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1709 TCP_SKB_CB(skb)->sacked = 0;
1710
1711 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1712 skb->nh.iph->daddr, ntohs(th->dest),
1713 tcp_v4_iif(skb));
1714
1715 if (!sk)
1716 goto no_tcp_socket;
1717
1718process:
1719 if (sk->sk_state == TCP_TIME_WAIT)
1720 goto do_time_wait;
1721
1722 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1723 goto discard_and_relse;
1724
1725 if (sk_filter(sk, skb, 0))
1726 goto discard_and_relse;
1727
1728 skb->dev = NULL;
1729
1730 bh_lock_sock(sk);
1731 ret = 0;
1732 if (!sock_owned_by_user(sk)) {
1733 if (!tcp_prequeue(sk, skb))
1734 ret = tcp_v4_do_rcv(sk, skb);
1735 } else
1736 sk_add_backlog(sk, skb);
1737 bh_unlock_sock(sk);
1738
1739 sock_put(sk);
1740
1741 return ret;
1742
1743no_tcp_socket:
1744 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1745 goto discard_it;
1746
1747 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1748bad_packet:
1749 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1750 } else {
1751 tcp_v4_send_reset(skb);
1752 }
1753
1754discard_it:
1755 /* Discard frame. */
1756 kfree_skb(skb);
1757 return 0;
1758
1759discard_and_relse:
1760 sock_put(sk);
1761 goto discard_it;
1762
1763do_time_wait:
1764 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1765 tcp_tw_put((struct tcp_tw_bucket *) sk);
1766 goto discard_it;
1767 }
1768
1769 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1770 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1771 tcp_tw_put((struct tcp_tw_bucket *) sk);
1772 goto discard_it;
1773 }
1774 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1775 skb, th, skb->len)) {
1776 case TCP_TW_SYN: {
1777 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1778 ntohs(th->dest),
1779 tcp_v4_iif(skb));
1780 if (sk2) {
1781 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1782 tcp_tw_put((struct tcp_tw_bucket *)sk);
1783 sk = sk2;
1784 goto process;
1785 }
1786 /* Fall through to ACK */
1787 }
1788 case TCP_TW_ACK:
1789 tcp_v4_timewait_ack(sk, skb);
1790 break;
1791 case TCP_TW_RST:
1792 goto no_tcp_socket;
1793 case TCP_TW_SUCCESS:;
1794 }
1795 goto discard_it;
1796}
1797
1798static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1799{
1800 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1801 struct inet_sock *inet = inet_sk(sk);
1802
1803 sin->sin_family = AF_INET;
1804 sin->sin_addr.s_addr = inet->daddr;
1805 sin->sin_port = inet->dport;
1806}
1807
1808/* VJ's idea. Save last timestamp seen from this destination
1809 * and hold it at least for normal timewait interval to use for duplicate
1810 * segment detection in subsequent connections, before they enter synchronized
1811 * state.
1812 */
1813
1814int tcp_v4_remember_stamp(struct sock *sk)
1815{
1816 struct inet_sock *inet = inet_sk(sk);
1817 struct tcp_sock *tp = tcp_sk(sk);
1818 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1819 struct inet_peer *peer = NULL;
1820 int release_it = 0;
1821
1822 if (!rt || rt->rt_dst != inet->daddr) {
1823 peer = inet_getpeer(inet->daddr, 1);
1824 release_it = 1;
1825 } else {
1826 if (!rt->peer)
1827 rt_bind_peer(rt, 1);
1828 peer = rt->peer;
1829 }
1830
1831 if (peer) {
1832 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1833 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1834 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1835 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1836 peer->tcp_ts = tp->rx_opt.ts_recent;
1837 }
1838 if (release_it)
1839 inet_putpeer(peer);
1840 return 1;
1841 }
1842
1843 return 0;
1844}
1845
1846int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1847{
1848 struct inet_peer *peer = NULL;
1849
1850 peer = inet_getpeer(tw->tw_daddr, 1);
1851
1852 if (peer) {
1853 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1854 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1855 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1856 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1857 peer->tcp_ts = tw->tw_ts_recent;
1858 }
1859 inet_putpeer(peer);
1860 return 1;
1861 }
1862
1863 return 0;
1864}
1865
1866struct tcp_func ipv4_specific = {
1867 .queue_xmit = ip_queue_xmit,
1868 .send_check = tcp_v4_send_check,
1869 .rebuild_header = inet_sk_rebuild_header,
1870 .conn_request = tcp_v4_conn_request,
1871 .syn_recv_sock = tcp_v4_syn_recv_sock,
1872 .remember_stamp = tcp_v4_remember_stamp,
1873 .net_header_len = sizeof(struct iphdr),
1874 .setsockopt = ip_setsockopt,
1875 .getsockopt = ip_getsockopt,
1876 .addr2sockaddr = v4_addr2sockaddr,
1877 .sockaddr_len = sizeof(struct sockaddr_in),
1878};
1879
1880/* NOTE: A lot of things are set to zero explicitly by the call to
1881 * sk_alloc() so they need not be done here.
1882 */
1883static int tcp_v4_init_sock(struct sock *sk)
1884{
1885 struct tcp_sock *tp = tcp_sk(sk);
1886
1887 skb_queue_head_init(&tp->out_of_order_queue);
1888 tcp_init_xmit_timers(sk);
1889 tcp_prequeue_init(tp);
1890
1891 tp->rto = TCP_TIMEOUT_INIT;
1892 tp->mdev = TCP_TIMEOUT_INIT;
1893
1894 /* So many TCP implementations out there (incorrectly) count the
1895 * initial SYN frame in their delayed-ACK and congestion control
1896 * algorithms that we must have the following bandaid to talk
1897 * efficiently to them. -DaveM
1898 */
1899 tp->snd_cwnd = 2;
1900
1901 /* See draft-stevens-tcpca-spec-01 for discussion of the
1902 * initialization of these values.
1903 */
1904 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1905 tp->snd_cwnd_clamp = ~0;
1906 tp->mss_cache = 536;
1907
1908 tp->reordering = sysctl_tcp_reordering;
1909 tp->ca_ops = &tcp_init_congestion_ops;
1910
1911 sk->sk_state = TCP_CLOSE;
1912
1913 sk->sk_write_space = sk_stream_write_space;
1914 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1915
1916 tp->af_specific = &ipv4_specific;
1917
1918 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1919 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1920
1921 atomic_inc(&tcp_sockets_allocated);
1922
1923 return 0;
1924}
1925
1926int tcp_v4_destroy_sock(struct sock *sk)
1927{
1928 struct tcp_sock *tp = tcp_sk(sk);
1929
1930 tcp_clear_xmit_timers(sk);
1931
1932 tcp_cleanup_congestion_control(tp);
1933
1934 /* Clean up the write buffer. */
1935 sk_stream_writequeue_purge(sk);
1936
1937 /* Cleans up our, hopefully empty, out_of_order_queue. */
1938 __skb_queue_purge(&tp->out_of_order_queue);
1939
1940 /* Clean prequeue, it must be empty really */
1941 __skb_queue_purge(&tp->ucopy.prequeue);
1942
1943 /* Clean up a referenced TCP bind bucket. */
1944 if (inet_sk(sk)->bind_hash)
1945 tcp_put_port(sk);
1946
1947 /*
1948 * If sendmsg cached page exists, toss it.
1949 */
1950 if (sk->sk_sndmsg_page) {
1951 __free_page(sk->sk_sndmsg_page);
1952 sk->sk_sndmsg_page = NULL;
1953 }
1954
1955 atomic_dec(&tcp_sockets_allocated);
1956
1957 return 0;
1958}
1959
1960EXPORT_SYMBOL(tcp_v4_destroy_sock);
1961
1962#ifdef CONFIG_PROC_FS
1963/* Proc filesystem TCP sock list dumping. */
1964
1965static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1966{
1967 return hlist_empty(head) ? NULL :
1968 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1969}
1970
1971static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1972{
1973 return tw->tw_node.next ?
1974 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1975}
1976
1977static void *listening_get_next(struct seq_file *seq, void *cur)
1978{
1979 struct tcp_sock *tp;
1980 struct hlist_node *node;
1981 struct sock *sk = cur;
1982 struct tcp_iter_state* st = seq->private;
1983
1984 if (!sk) {
1985 st->bucket = 0;
1986 sk = sk_head(&tcp_listening_hash[0]);
1987 goto get_sk;
1988 }
1989
1990 ++st->num;
1991
1992 if (st->state == TCP_SEQ_STATE_OPENREQ) {
60236fdd 1993 struct request_sock *req = cur;
1da177e4
LT
1994
1995 tp = tcp_sk(st->syn_wait_sk);
1996 req = req->dl_next;
1997 while (1) {
1998 while (req) {
60236fdd 1999 if (req->rsk_ops->family == st->family) {
1da177e4
LT
2000 cur = req;
2001 goto out;
2002 }
2003 req = req->dl_next;
2004 }
2005 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2006 break;
2007get_req:
0e87506f 2008 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
1da177e4
LT
2009 }
2010 sk = sk_next(st->syn_wait_sk);
2011 st->state = TCP_SEQ_STATE_LISTENING;
0e87506f 2012 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1da177e4
LT
2013 } else {
2014 tp = tcp_sk(sk);
0e87506f
ACM
2015 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2016 if (reqsk_queue_len(&tp->accept_queue))
1da177e4 2017 goto start_req;
0e87506f 2018 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1da177e4
LT
2019 sk = sk_next(sk);
2020 }
2021get_sk:
2022 sk_for_each_from(sk, node) {
2023 if (sk->sk_family == st->family) {
2024 cur = sk;
2025 goto out;
2026 }
2027 tp = tcp_sk(sk);
0e87506f
ACM
2028 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2029 if (reqsk_queue_len(&tp->accept_queue)) {
1da177e4
LT
2030start_req:
2031 st->uid = sock_i_uid(sk);
2032 st->syn_wait_sk = sk;
2033 st->state = TCP_SEQ_STATE_OPENREQ;
2034 st->sbucket = 0;
2035 goto get_req;
2036 }
0e87506f 2037 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1da177e4 2038 }
0f7ff927 2039 if (++st->bucket < INET_LHTABLE_SIZE) {
1da177e4
LT
2040 sk = sk_head(&tcp_listening_hash[st->bucket]);
2041 goto get_sk;
2042 }
2043 cur = NULL;
2044out:
2045 return cur;
2046}
2047
2048static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2049{
2050 void *rc = listening_get_next(seq, NULL);
2051
2052 while (rc && *pos) {
2053 rc = listening_get_next(seq, rc);
2054 --*pos;
2055 }
2056 return rc;
2057}
2058
2059static void *established_get_first(struct seq_file *seq)
2060{
2061 struct tcp_iter_state* st = seq->private;
2062 void *rc = NULL;
2063
2064 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2065 struct sock *sk;
2066 struct hlist_node *node;
2067 struct tcp_tw_bucket *tw;
2068
2069 /* We can reschedule _before_ having picked the target: */
2070 cond_resched_softirq();
2071
2072 read_lock(&tcp_ehash[st->bucket].lock);
2073 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2074 if (sk->sk_family != st->family) {
2075 continue;
2076 }
2077 rc = sk;
2078 goto out;
2079 }
2080 st->state = TCP_SEQ_STATE_TIME_WAIT;
2081 tw_for_each(tw, node,
2082 &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2083 if (tw->tw_family != st->family) {
2084 continue;
2085 }
2086 rc = tw;
2087 goto out;
2088 }
2089 read_unlock(&tcp_ehash[st->bucket].lock);
2090 st->state = TCP_SEQ_STATE_ESTABLISHED;
2091 }
2092out:
2093 return rc;
2094}
2095
2096static void *established_get_next(struct seq_file *seq, void *cur)
2097{
2098 struct sock *sk = cur;
2099 struct tcp_tw_bucket *tw;
2100 struct hlist_node *node;
2101 struct tcp_iter_state* st = seq->private;
2102
2103 ++st->num;
2104
2105 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2106 tw = cur;
2107 tw = tw_next(tw);
2108get_tw:
2109 while (tw && tw->tw_family != st->family) {
2110 tw = tw_next(tw);
2111 }
2112 if (tw) {
2113 cur = tw;
2114 goto out;
2115 }
2116 read_unlock(&tcp_ehash[st->bucket].lock);
2117 st->state = TCP_SEQ_STATE_ESTABLISHED;
2118
2119 /* We can reschedule between buckets: */
2120 cond_resched_softirq();
2121
2122 if (++st->bucket < tcp_ehash_size) {
2123 read_lock(&tcp_ehash[st->bucket].lock);
2124 sk = sk_head(&tcp_ehash[st->bucket].chain);
2125 } else {
2126 cur = NULL;
2127 goto out;
2128 }
2129 } else
2130 sk = sk_next(sk);
2131
2132 sk_for_each_from(sk, node) {
2133 if (sk->sk_family == st->family)
2134 goto found;
2135 }
2136
2137 st->state = TCP_SEQ_STATE_TIME_WAIT;
2138 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2139 goto get_tw;
2140found:
2141 cur = sk;
2142out:
2143 return cur;
2144}
2145
2146static void *established_get_idx(struct seq_file *seq, loff_t pos)
2147{
2148 void *rc = established_get_first(seq);
2149
2150 while (rc && pos) {
2151 rc = established_get_next(seq, rc);
2152 --pos;
2153 }
2154 return rc;
2155}
2156
2157static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2158{
2159 void *rc;
2160 struct tcp_iter_state* st = seq->private;
2161
2162 tcp_listen_lock();
2163 st->state = TCP_SEQ_STATE_LISTENING;
2164 rc = listening_get_idx(seq, &pos);
2165
2166 if (!rc) {
2167 tcp_listen_unlock();
2168 local_bh_disable();
2169 st->state = TCP_SEQ_STATE_ESTABLISHED;
2170 rc = established_get_idx(seq, pos);
2171 }
2172
2173 return rc;
2174}
2175
2176static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2177{
2178 struct tcp_iter_state* st = seq->private;
2179 st->state = TCP_SEQ_STATE_LISTENING;
2180 st->num = 0;
2181 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2182}
2183
2184static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2185{
2186 void *rc = NULL;
2187 struct tcp_iter_state* st;
2188
2189 if (v == SEQ_START_TOKEN) {
2190 rc = tcp_get_idx(seq, 0);
2191 goto out;
2192 }
2193 st = seq->private;
2194
2195 switch (st->state) {
2196 case TCP_SEQ_STATE_OPENREQ:
2197 case TCP_SEQ_STATE_LISTENING:
2198 rc = listening_get_next(seq, v);
2199 if (!rc) {
2200 tcp_listen_unlock();
2201 local_bh_disable();
2202 st->state = TCP_SEQ_STATE_ESTABLISHED;
2203 rc = established_get_first(seq);
2204 }
2205 break;
2206 case TCP_SEQ_STATE_ESTABLISHED:
2207 case TCP_SEQ_STATE_TIME_WAIT:
2208 rc = established_get_next(seq, v);
2209 break;
2210 }
2211out:
2212 ++*pos;
2213 return rc;
2214}
2215
2216static void tcp_seq_stop(struct seq_file *seq, void *v)
2217{
2218 struct tcp_iter_state* st = seq->private;
2219
2220 switch (st->state) {
2221 case TCP_SEQ_STATE_OPENREQ:
2222 if (v) {
2223 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
0e87506f 2224 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1da177e4
LT
2225 }
2226 case TCP_SEQ_STATE_LISTENING:
2227 if (v != SEQ_START_TOKEN)
2228 tcp_listen_unlock();
2229 break;
2230 case TCP_SEQ_STATE_TIME_WAIT:
2231 case TCP_SEQ_STATE_ESTABLISHED:
2232 if (v)
2233 read_unlock(&tcp_ehash[st->bucket].lock);
2234 local_bh_enable();
2235 break;
2236 }
2237}
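/*
 * Editor's note (illustrative sketch, not part of the original file):
 * tcp_seq_start/tcp_seq_next/tcp_seq_stop above implement the kernel
 * seq_file iterator contract: start() positions a cursor at *pos,
 * next() advances it and bumps *pos, and stop() drops whatever locks
 * were taken along the way.  A rough userspace analogue of that calling
 * sequence over a fixed array (all names below are invented):
 */
#include <stdio.h>
#include <stddef.h>

static const char *table[] = { "listening", "established", "time-wait" };
#define TABLE_LEN (sizeof(table) / sizeof(table[0]))

static const void *demo_start(size_t *pos)
{
	return *pos < TABLE_LEN ? &table[*pos] : NULL;
}

static const void *demo_next(const void *v, size_t *pos)
{
	(void)v;
	++*pos;
	return *pos < TABLE_LEN ? &table[*pos] : NULL;
}

static void demo_show(const void *v)
{
	printf("%s\n", *(const char * const *)v);
}

static void demo_stop(const void *v)
{
	(void)v;	/* nothing to unlock in this toy version */
}

int main(void)
{
	size_t pos = 0;
	const void *v;

	/* This mirrors how seq_read() drives the start/show/next/stop callbacks. */
	for (v = demo_start(&pos); v; v = demo_next(v, &pos))
		demo_show(v);
	demo_stop(v);
	return 0;
}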
2238
2239static int tcp_seq_open(struct inode *inode, struct file *file)
2240{
2241 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2242 struct seq_file *seq;
2243 struct tcp_iter_state *s;
2244 int rc;
2245
2246 if (unlikely(afinfo == NULL))
2247 return -EINVAL;
2248
2249 s = kmalloc(sizeof(*s), GFP_KERNEL);
2250 if (!s)
2251 return -ENOMEM;
2252 memset(s, 0, sizeof(*s));
2253 s->family = afinfo->family;
2254 s->seq_ops.start = tcp_seq_start;
2255 s->seq_ops.next = tcp_seq_next;
2256 s->seq_ops.show = afinfo->seq_show;
2257 s->seq_ops.stop = tcp_seq_stop;
2258
2259 rc = seq_open(file, &s->seq_ops);
2260 if (rc)
2261 goto out_kfree;
2262 seq = file->private_data;
2263 seq->private = s;
2264out:
2265 return rc;
2266out_kfree:
2267 kfree(s);
2268 goto out;
2269}
2270
2271int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2272{
2273 int rc = 0;
2274 struct proc_dir_entry *p;
2275
2276 if (!afinfo)
2277 return -EINVAL;
2278 afinfo->seq_fops->owner = afinfo->owner;
2279 afinfo->seq_fops->open = tcp_seq_open;
2280 afinfo->seq_fops->read = seq_read;
2281 afinfo->seq_fops->llseek = seq_lseek;
2282 afinfo->seq_fops->release = seq_release_private;
2283
2284 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2285 if (p)
2286 p->data = afinfo;
2287 else
2288 rc = -ENOMEM;
2289 return rc;
2290}
2291
2292void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2293{
2294 if (!afinfo)
2295 return;
2296 proc_net_remove(afinfo->name);
2297 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2298}
2299
60236fdd 2300static void get_openreq4(struct sock *sk, struct request_sock *req,
1da177e4
LT
2301 char *tmpbuf, int i, int uid)
2302{
2e6599cb 2303 const struct inet_request_sock *ireq = inet_rsk(req);
1da177e4
LT
2304 int ttd = req->expires - jiffies;
2305
2306 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2307 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2308 i,
2e6599cb 2309 ireq->loc_addr,
1da177e4 2310 ntohs(inet_sk(sk)->sport),
2e6599cb
ACM
2311 ireq->rmt_addr,
2312 ntohs(ireq->rmt_port),
1da177e4
LT
2313 TCP_SYN_RECV,
2314 0, 0, /* could print option size, but that is af dependent. */
2315 1, /* timers active (only the expire timer) */
2316 jiffies_to_clock_t(ttd),
2317 req->retrans,
2318 uid,
2319		0, /* non-standard timer */
2320 0, /* open_requests have no inode */
2321 atomic_read(&sk->sk_refcnt),
2322 req);
2323}
2324
2325static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2326{
2327 int timer_active;
2328 unsigned long timer_expires;
2329 struct tcp_sock *tp = tcp_sk(sp);
2330 struct inet_sock *inet = inet_sk(sp);
2331 unsigned int dest = inet->daddr;
2332 unsigned int src = inet->rcv_saddr;
2333 __u16 destp = ntohs(inet->dport);
2334 __u16 srcp = ntohs(inet->sport);
2335
2336 if (tp->pending == TCP_TIME_RETRANS) {
2337 timer_active = 1;
2338 timer_expires = tp->timeout;
2339 } else if (tp->pending == TCP_TIME_PROBE0) {
2340 timer_active = 4;
2341 timer_expires = tp->timeout;
2342 } else if (timer_pending(&sp->sk_timer)) {
2343 timer_active = 2;
2344 timer_expires = sp->sk_timer.expires;
2345 } else {
2346 timer_active = 0;
2347 timer_expires = jiffies;
2348 }
2349
2350 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2351 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2352 i, src, srcp, dest, destp, sp->sk_state,
2353 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2354 timer_active,
2355 jiffies_to_clock_t(timer_expires - jiffies),
2356 tp->retransmits,
2357 sock_i_uid(sp),
2358 tp->probes_out,
2359 sock_i_ino(sp),
2360 atomic_read(&sp->sk_refcnt), sp,
2361 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2362 tp->snd_cwnd,
2363 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2364}
2365
2366static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2367{
2368 unsigned int dest, src;
2369 __u16 destp, srcp;
2370 int ttd = tw->tw_ttd - jiffies;
2371
2372 if (ttd < 0)
2373 ttd = 0;
2374
2375 dest = tw->tw_daddr;
2376 src = tw->tw_rcv_saddr;
2377 destp = ntohs(tw->tw_dport);
2378 srcp = ntohs(tw->tw_sport);
2379
2380 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2381 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2382 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2383 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2384 atomic_read(&tw->tw_refcnt), tw);
2385}
2386
2387#define TMPSZ 150
2388
2389static int tcp4_seq_show(struct seq_file *seq, void *v)
2390{
2391 struct tcp_iter_state* st;
2392 char tmpbuf[TMPSZ + 1];
2393
2394 if (v == SEQ_START_TOKEN) {
2395 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2396 " sl local_address rem_address st tx_queue "
2397 "rx_queue tr tm->when retrnsmt uid timeout "
2398 "inode");
2399 goto out;
2400 }
2401 st = seq->private;
2402
2403 switch (st->state) {
2404 case TCP_SEQ_STATE_LISTENING:
2405 case TCP_SEQ_STATE_ESTABLISHED:
2406 get_tcp4_sock(v, tmpbuf, st->num);
2407 break;
2408 case TCP_SEQ_STATE_OPENREQ:
2409 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2410 break;
2411 case TCP_SEQ_STATE_TIME_WAIT:
2412 get_timewait4_sock(v, tmpbuf, st->num);
2413 break;
2414 }
2415 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2416out:
2417 return 0;
2418}
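/*
 * Editor's note (illustrative sketch, not part of the original file):
 * get_tcp4_sock() above prints each endpoint as "%08X:%04X", i.e. the
 * 32-bit address exactly as stored in the socket followed by the port
 * in host order, so on a little-endian machine 127.0.0.1:80 appears in
 * /proc/net/tcp as "0100007F:0050".  A small parser for one such field
 * (the sample string is hard-coded for illustration):
 */
#include <stdio.h>
#include <arpa/inet.h>	/* inet_ntoa */
#include <netinet/in.h>

int main(void)
{
	const char *field = "0100007F:0050";	/* sample /proc/net/tcp field */
	unsigned int addr, port;
	struct in_addr in;

	if (sscanf(field, "%X:%X", &addr, &port) == 2) {
		in.s_addr = addr;	/* raw value, already in the kernel's stored byte order */
		/* Prints "127.0.0.1:80" on a little-endian host. */
		printf("%s:%u\n", inet_ntoa(in), port);
	}
	return 0;
}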
2419
2420static struct file_operations tcp4_seq_fops;
2421static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2422 .owner = THIS_MODULE,
2423 .name = "tcp",
2424 .family = AF_INET,
2425 .seq_show = tcp4_seq_show,
2426 .seq_fops = &tcp4_seq_fops,
2427};
2428
2429int __init tcp4_proc_init(void)
2430{
2431 return tcp_proc_register(&tcp4_seq_afinfo);
2432}
2433
2434void tcp4_proc_exit(void)
2435{
2436 tcp_proc_unregister(&tcp4_seq_afinfo);
2437}
2438#endif /* CONFIG_PROC_FS */
2439
2440struct proto tcp_prot = {
2441 .name = "TCP",
2442 .owner = THIS_MODULE,
2443 .close = tcp_close,
2444 .connect = tcp_v4_connect,
2445 .disconnect = tcp_disconnect,
2446 .accept = tcp_accept,
2447 .ioctl = tcp_ioctl,
2448 .init = tcp_v4_init_sock,
2449 .destroy = tcp_v4_destroy_sock,
2450 .shutdown = tcp_shutdown,
2451 .setsockopt = tcp_setsockopt,
2452 .getsockopt = tcp_getsockopt,
2453 .sendmsg = tcp_sendmsg,
2454 .recvmsg = tcp_recvmsg,
2455 .backlog_rcv = tcp_v4_do_rcv,
2456 .hash = tcp_v4_hash,
2457 .unhash = tcp_unhash,
2458 .get_port = tcp_v4_get_port,
2459 .enter_memory_pressure = tcp_enter_memory_pressure,
2460 .sockets_allocated = &tcp_sockets_allocated,
2461 .memory_allocated = &tcp_memory_allocated,
2462 .memory_pressure = &tcp_memory_pressure,
2463 .sysctl_mem = sysctl_tcp_mem,
2464 .sysctl_wmem = sysctl_tcp_wmem,
2465 .sysctl_rmem = sysctl_tcp_rmem,
2466 .max_header = MAX_TCP_HEADER,
2467 .obj_size = sizeof(struct tcp_sock),
60236fdd 2468 .rsk_prot = &tcp_request_sock_ops,
1da177e4
LT
2469};
2470
2471
2472
2473void __init tcp_v4_init(struct net_proto_family *ops)
2474{
2475 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2476 if (err < 0)
2477 panic("Failed to create the TCP control socket.\n");
2478 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2479 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2480
2481 /* Unhash it so that IP input processing does not even
2482	 * see it; we do not wish this socket to see incoming
2483 * packets.
2484 */
2485 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2486}
2487
2488EXPORT_SYMBOL(ipv4_specific);
2489EXPORT_SYMBOL(tcp_bind_hash);
0f7ff927 2490EXPORT_SYMBOL(inet_bind_bucket_create);
1da177e4
LT
2491EXPORT_SYMBOL(tcp_hashinfo);
2492EXPORT_SYMBOL(tcp_inherit_port);
2493EXPORT_SYMBOL(tcp_listen_wlock);
2494EXPORT_SYMBOL(tcp_port_rover);
2495EXPORT_SYMBOL(tcp_prot);
2496EXPORT_SYMBOL(tcp_put_port);
2497EXPORT_SYMBOL(tcp_unhash);
2498EXPORT_SYMBOL(tcp_v4_conn_request);
2499EXPORT_SYMBOL(tcp_v4_connect);
2500EXPORT_SYMBOL(tcp_v4_do_rcv);
1da177e4
LT
2501EXPORT_SYMBOL(tcp_v4_remember_stamp);
2502EXPORT_SYMBOL(tcp_v4_send_check);
2503EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2504
2505#ifdef CONFIG_PROC_FS
2506EXPORT_SYMBOL(tcp_proc_register);
2507EXPORT_SYMBOL(tcp_proc_unregister);
2508#endif
2509EXPORT_SYMBOL(sysctl_local_port_range);
1da177e4
LT
2510EXPORT_SYMBOL(sysctl_tcp_low_latency);
2511EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2512