[mirror_ubuntu-kernels.git] / net / ipv4 / tcp_timer.c

/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <linux/module.h>
#include <net/tcp.h>

/* TCP timer parameters. Every one except sysctl_tcp_orphan_retries starts
 * at its TCP_* compile-time default.
 */
int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 
int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
/* No initialiser, so this starts at zero; tcp_orphan_retries() handles
 * the zero case explicitly.
 */
int sysctl_tcp_orphan_retries;

/* Timer callbacks defined further down; declared early because
 * tcp_init_xmit_timers() installs them before their definitions appear.
 */
static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_keepalive_timer (unsigned long data);

/* Diagnostic text only built when TCP_DEBUG is defined. It is not used in
 * this file; presumably a timer helper elsewhere prints it (TODO confirm).
 */
#ifdef TCP_DEBUG
const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
EXPORT_SYMBOL(tcp_timer_bug_msg);
#endif

/*
 * Using different timers for retransmit, delayed acks and probes
 * We may wish to use just one timer maintaining a list of expiry jiffies
 * to optimize.
 */

/* Set up the three timers a TCP socket owns and clear the pending-event
 * flags. Each timer receives its handler and the socket pointer (as an
 * unsigned long) for the callback argument; nothing is armed here.
 */
void tcp_init_xmit_timers(struct sock *sk)
{
	struct tcp_sock *tcp = tcp_sk(sk);
	unsigned long cookie = (unsigned long)sk;

	/* Retransmit timer, handled by tcp_write_timer(). */
	init_timer(&tcp->retransmit_timer);
	tcp->retransmit_timer.data = cookie;
	tcp->retransmit_timer.function = &tcp_write_timer;
	tcp->pending = 0;

	/* Delayed-ACK timer. */
	init_timer(&tcp->delack_timer);
	tcp->delack_timer.data = cookie;
	tcp->delack_timer.function = &tcp_delack_timer;
	tcp->ack.pending = 0;

	/* Generic socket timer, handled by tcp_keepalive_timer(). */
	init_timer(&sk->sk_timer);
	sk->sk_timer.data = cookie;
	sk->sk_timer.function = &tcp_keepalive_timer;
}

/* Cancel every timer owned by the socket and forget any event that was
 * still pending on them.
 */
void tcp_clear_xmit_timers(struct sock *sk)
{
	struct tcp_sock *tcp = tcp_sk(sk);

	/* Retransmit / probe timer. */
	tcp->pending = 0;
	sk_stop_timer(sk, &tcp->retransmit_timer);

	/* Delayed-ACK timer, including its "blocked" marker. */
	tcp->ack.pending = 0;
	tcp->ack.blocked = 0;
	sk_stop_timer(sk, &tcp->delack_timer);

	/* Keepalive (generic socket) timer. */
	sk_stop_timer(sk, &sk->sk_timer);
}

/* Abort the connection after a timeout: report the recorded soft error
 * if there is one, otherwise ETIMEDOUT, then finish the socket off and
 * count the abort.
 */
static void tcp_write_err(struct sock *sk)
{
	int err = sk->sk_err_soft;

	if (!err)
		err = ETIMEDOUT;
	sk->sk_err = err;
	sk->sk_error_report(sk);

	tcp_done(sk);
	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
}

/* Do not allow orphaned sockets to eat all our resources.
 * This is a direct violation of the TCP specs, but it is required
 * to prevent DoS attacks. It is called when a retransmission timeout
 * or zero-window probe timeout occurs on an orphaned socket.
 *
 * The criterion is still not confirmed experimentally and may change.
 * We kill the socket if:
 * 1. The number of orphaned sockets exceeds an administratively configured
 *    limit.
 * 2. We are under strong memory pressure.
 */
static int tcp_out_of_resources(struct sock *sk, int do_reset)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int weighted = atomic_read(&tcp_orphan_count);

	/* Count this socket double if nothing was sent for a long time or
	 * the caller did not ask for a reset.
	 */
	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
		weighted <<= 1;

	/* A recorded soft error doubles the weight once more. */
	if (sk->sk_err_soft)
		weighted <<= 1;

	/* Resources are fine: below the orphan limit, and either this socket
	 * queues little data or global TCP memory is within sysctl_tcp_mem[2].
	 */
	if (weighted < sysctl_tcp_max_orphans &&
	    (sk->sk_wmem_queued <= SOCK_MIN_SNDBUF ||
	     atomic_read(&tcp_memory_allocated) <= sysctl_tcp_mem[2]))
		return 0;

	if (net_ratelimit())
		printk(KERN_INFO "Out of socket memory\n");

	/* Even when the caller did not request it, a reset is sent if the
	 * last segment went out recently or the window is closed with
	 * nothing in flight.
	 */
	if (!do_reset) {
		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN)
			do_reset = 1;
		else if (!tp->snd_wnd && !tp->packets_out)
			do_reset = 1;
	}
	if (do_reset)
		tcp_send_active_reset(sk, GFP_ATOMIC);

	tcp_done(sk);
	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
	return 1;
}

/* Calculate the maximal number of retries on an orphaned socket. */
static int tcp_orphan_retries(struct sock *sk, int alive)
{
	int limit = sysctl_tcp_orphan_retries; /* zero is a legal setting */

	/* A socket still considered alive always gets some retries:
	 * the configured count, or 8 when that is zero (8 corresponds to
	 * >100 seconds with the minimal RTO of 200msec).
	 */
	if (alive)
		return limit ? limit : 8;

	/* Not alive: a recorded soft error means no further retries. */
	return sk->sk_err_soft ? 0 : limit;
}

/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const int handshaking = (1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV);
	int retry_until;

	if (handshaking) {
		/* Any earlier SYN retransmit casts doubt on the route. */
		if (tp->retransmits)
			dst_negative_advice(&sk->sk_dst_cache);

		/* Per-socket limit if set, otherwise the global one. */
		retry_until = tp->syn_retries;
		if (!retry_until)
			retry_until = sysctl_tcp_syn_retries;
	} else {
		/* NOTE: draft-ietf-tcpimpl-pmtud-01.txt asks for PMTU black
		 * hole detection at this point. It is deliberately not
		 * implemented; the original author argued, citing the draft
		 * itself, that it would be far nicer to have the black holes
		 * fixed than to fix every TCP implementation. Only the
		 * route is given negative advice.
		 */
		if (tp->retransmits >= sysctl_tcp_retries1)
			dst_negative_advice(&sk->sk_dst_cache);

		if (!sock_flag(sk, SOCK_DEAD)) {
			retry_until = sysctl_tcp_retries2;
		} else {
			/* Orphan: alive while RTO has not hit its ceiling. */
			int alive = (tp->rto < TCP_RTO_MAX);

			retry_until = tcp_orphan_retries(sk, alive);

			if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
				return 1;
		}
	}

	/* Still within the retry allowance: let the caller retransmit. */
	if (tp->retransmits < retry_until)
		return 0;

	/* Gone too far: abort the connection. */
	tcp_write_err(sk);
	return 1;
}

static void tcp_delack_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	struct tcp_sock *tp = tcp_sk(sk);

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		tp->ack.blocked = 1;
		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
		sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
		goto out_unlock;
	}

	sk_stream_mem_reclaim(sk);

	if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))
		goto out;

	if (time_after(tp->ack.timeout, jiffies)) {
		sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
		goto out;
	}
	tp->ack.pending &= ~TCP_ACK_TIMER;

	if (skb_queue_len(&tp->ucopy.prequeue)) {
		struct sk_buff *skb;

		NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED, 
				 skb_queue_len(&tp->ucopy.prequeue));

		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk->sk_backlog_rcv(sk, skb);

		tp->ucopy.memory = 0;
	}

	if (tcp_ack_scheduled(tp)) {
		if (!tp->ack.pingpong) {
			/* Delayed ACK missed: inflate ATO. */
			tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
		} else {
			/* Delayed ACK missed: leave pingpong mode and
			 * deflate ATO.
			 */
			tp->ack.pingpong = 0;
			tp->ack.ato = TCP_ATO_MIN;
		}
		tcp_send_ack(sk);
		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
	}
	TCP_CHECK_TIMER(sk);

out:
	if (tcp_memory_pressure)
		sk_stream_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* Zero-window probe timeout for one socket.
 *
 * Does nothing except clear probes_out when segments are in flight or
 * there is nothing left to send. Otherwise the connection is aborted
 * once probes_out exceeds the allowed number of probes, or another
 * probe is sent. Orphaned sockets use the (usually smaller) orphan
 * retry limit and may be killed early by tcp_out_of_resources().
 */
static void tcp_probe_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int max_probes;

	if (tp->packets_out || !sk->sk_send_head) {
		tp->probes_out = 0;
		return;
	}

	/* *WARNING* RFC 1122 forbids this
	 *
	 * It doesn't AFAIK, because we kill the retransmit timer -AK
	 *
	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
	 * this behaviour in Solaris down as a bug fix. [AC]
	 *
	 * Let me explain. probes_out is zeroed by incoming ACKs
	 * even if they advertise zero window. Hence, connection is killed only
	 * if we received no ACKs for normal connection timeout. It is not killed
	 * only because window stays zero for some time, window may be zero
	 * until armageddon and even later. We are in full accordance
	 * with RFCs, only probe timer combines both retransmission timeout
	 * and probe timeout in one bottle.				--ANK
	 */
	max_probes = sysctl_tcp_retries2;

	if (sock_flag(sk, SOCK_DEAD)) {
		/* Widen before shifting: in 32 bits rto << backoff can
		 * wrap to a small value and make a heavily backed-off
		 * orphan look alive.
		 */
		int alive = (((unsigned long long)tp->rto << tp->backoff) < TCP_RTO_MAX);

		max_probes = tcp_orphan_retries(sk, alive);

		/* Returns non-zero when the socket has already been killed. */
		if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
			return;
	}

	if (tp->probes_out > max_probes) {
		tcp_write_err(sk);
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
	}
}

/*
 *	The TCP retransmit timer.
 */

/* Retransmission timeout for one socket.
 *
 * Returns immediately when nothing is in flight. Otherwise it either
 * aborts the connection (via tcp_write_err() or tcp_write_timeout()) or
 * retransmits the head of the write queue and re-arms the retransmit
 * timer, normally with a doubled RTO capped at TCP_RTO_MAX.
 */
static void tcp_retransmit_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->packets_out)
		goto out;

	BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));

	/* Zero send window on a live (non-orphan) socket that is past the
	 * SYN states: handled separately, without the retry accounting of
	 * tcp_write_timeout().
	 */
	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
	    !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
		/* Receiver dastardly shrinks window. Our retransmits
		 * become zero probes, but we should not timeout this
		 * connection. If the socket is an orphan, time it out,
		 * we cannot allow such beasts to hang infinitely.
		 */
#ifdef TCP_DEBUG
		if (net_ratelimit()) {
			struct inet_sock *inet = inet_sk(sk);
			printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
			       NIPQUAD(inet->daddr), htons(inet->dport),
			       inet->num, tp->snd_una, tp->snd_nxt);
		}
#endif
		/* Nothing received for more than TCP_RTO_MAX: give up. */
		if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
			tcp_write_err(sk);
			goto out;
		}
		tcp_enter_loss(sk, 0);
		tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
		__sk_dst_reset(sk);
		/* Skips the backoff/retransmits increment below. */
		goto out_reset_timer;
	}

	/* Non-zero means the socket has already been aborted. */
	if (tcp_write_timeout(sk))
		goto out;

	/* First timeout of this episode: bump the statistic matching the
	 * congestion state (and SACK vs. Reno) we timed out in.
	 */
	if (tp->retransmits == 0) {
		if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
			if (tp->rx_opt.sack_ok) {
				if (tp->ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
				else
					NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
			} else {
				if (tp->ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
				else
					NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
			}
		} else if (tp->ca_state == TCP_CA_Loss) {
			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
		} else {
			NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
		}
	}

	if (tcp_use_frto(sk)) {
		tcp_enter_frto(sk);
	} else {
		tcp_enter_loss(sk, 0);
	}

	if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
		/* Retransmission failed because of local congestion,
		 * do not backoff.
		 */
		if (!tp->retransmits)
			tp->retransmits=1;
		/* Retry after at most TCP_RESOURCE_PROBE_INTERVAL. */
		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
				     min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
		goto out;
	}

	/* Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
	tp->backoff++;
	tp->retransmits++;

out_reset_timer:
	/* Double the RTO, capped at TCP_RTO_MAX, and re-arm the timer. */
	tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
	/* Past retries1: drop the socket's cached route. */
	if (tp->retransmits > sysctl_tcp_retries1)
		__sk_dst_reset(sk);

out:;
}

/* Callback of the retransmit timer; 'data' is the socket pointer stored
 * by tcp_init_xmit_timers(). Dispatches the pending event to the
 * retransmit or zero-window probe handler and drops one socket reference
 * on every path.
 */
static void tcp_write_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct tcp_sock *tp = tcp_sk(sk);
	int fired;

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Socket is owned by the user side: retry in HZ/20 ticks. */
		sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
		goto out_unlock;
	}

	if (sk->sk_state != TCP_CLOSE && tp->pending) {
		if (time_after(tp->timeout, jiffies)) {
			/* Deadline not reached yet: re-arm for it. */
			sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
		} else {
			/* Consume the event before running its handler. */
			fired = tp->pending;
			tp->pending = 0;

			if (fired == TCP_TIME_RETRANS)
				tcp_retransmit_timer(sk);
			else if (fired == TCP_TIME_PROBE0)
				tcp_probe_timer(sk);

			TCP_CHECK_TIMER(sk);
		}
	}

	sk_stream_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/*
 *	Timer for listening sockets
 */

/* Periodic scan of a listening socket's table of pending open requests.
 *
 * Each call walks a bounded number of syn_table buckets starting at
 * lopt->clock_hand. An expired request either gets its SYN-ACK
 * retransmitted with a longer timeout or is unlinked and freed. The
 * keepalive timer is re-armed while any request remains queued.
 */
static void tcp_synack_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_listen_opt *lopt = tp->listen_opt;
	/* Per-socket retry limit if set, otherwise the global one. */
	int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
	int thresh = max_retries;
	unsigned long now = jiffies;
	struct open_request **reqp, *req;
	int i, budget;

	if (lopt == NULL || lopt->qlen == 0)
		return;

	/* Normally all the openreqs are young and become mature
	 * (i.e. converted to established socket) for first timeout.
	 * If synack was not acknowledged for 3 seconds, it means
	 * one of the following things: synack was lost, ack was lost,
	 * rtt is high or nobody planned to ack (i.e. synflood).
	 * When server is a bit loaded, queue is populated with old
	 * open requests, reducing effective size of queue.
	 * When server is well loaded, queue size reduces to zero
	 * after several minutes of work. It is not synflood,
	 * it is normal operation. The solution is pruning
	 * too old entries overriding normal timeout, when
	 * situation becomes dangerous.
	 *
	 * Essentially, we reserve half of room for young
	 * embryos; and abort old ones without pity, if old
	 * ones are about to clog our table.
	 */
	/* True once qlen has reached 2^(max_qlen_log - 1). */
	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
		int young = (lopt->qlen_young<<1);

		/* Lower thresh (never below 2) while the queue length is
		 * at least the doubling 'young' estimate.
		 */
		while (thresh > 2) {
			if (lopt->qlen < young)
				break;
			thresh--;
			young <<= 1;
		}
	}

	/* A non-zero defer_accept replaces the limit for acked requests. */
	if (tp->defer_accept)
		max_retries = tp->defer_accept;

	/* Number of hash buckets examined in this call. */
	budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
	i = lopt->clock_hand;

	do {
		/* reqp always points at the link that refers to req, so an
		 * entry can be unlinked in place.
		 */
		reqp=&lopt->syn_table[i];
		while ((req = *reqp) != NULL) {
			if (time_after_eq(now, req->expires)) {
				/* Retransmit if still under thresh, or acked
				 * and under max_retries; a zero return from
				 * rtx_syn_ack keeps the request queued.
				 */
				if ((req->retrans < thresh ||
				     (req->acked && req->retrans < max_retries))
				    && !req->class->rtx_syn_ack(sk, req, NULL)) {
					unsigned long timeo;

					/* First retransmit: no longer young. */
					if (req->retrans++ == 0)
						lopt->qlen_young--;
					timeo = min((TCP_TIMEOUT_INIT << req->retrans),
						    TCP_RTO_MAX);
					req->expires = now + timeo;
					reqp = &req->dl_next;
					continue;
				}

				/* Drop this request */
				write_lock(&tp->syn_wait_lock);
				*reqp = req->dl_next;
				write_unlock(&tp->syn_wait_lock);
				lopt->qlen--;
				if (req->retrans == 0)
					lopt->qlen_young--;
				tcp_openreq_free(req);
				/* reqp is unchanged: it now refers to the
				 * successor of the freed entry.
				 */
				continue;
			}
			reqp = &req->dl_next;
		}

		/* Advance to the next bucket, wrapping at TCP_SYNQ_HSIZE. */
		i = (i+1)&(TCP_SYNQ_HSIZE-1);

	} while (--budget > 0);

	/* Remember where to resume on the next call. */
	lopt->clock_hand = i;

	if (lopt->qlen)
		tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
}

/* Stop the socket's sk_timer (used here as keepalive timer) through
 * sk_stop_timer().
 */
void tcp_delete_keepalive_timer (struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}

/* (Re)arm the socket's sk_timer to expire 'len' jiffies from now. */
void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
	unsigned long deadline = jiffies + len;

	sk_reset_timer(sk, &sk->sk_timer, deadline);
}

/* Apply a change of the keepalive setting to the timer.
 *
 * Closed and listening sockets are left alone. Turning keepalive off
 * stops the timer; turning it on arms the timer only if SOCK_KEEPOPEN
 * was not already set on the socket.
 */
void tcp_set_keepalive(struct sock *sk, int val)
{
	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
		return;

	if (!val) {
		tcp_delete_keepalive_timer(sk);
		return;
	}

	if (!sock_flag(sk, SOCK_KEEPOPEN))
		tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
}


/* Callback of sk->sk_timer; 'data' is the socket pointer stored by
 * tcp_init_xmit_timers().
 *
 * Depending on the socket state this drives the SYN-ACK scan for
 * listeners, the FIN_WAIT2 timeout for orphaned sockets, or keepalive
 * probing. One socket reference is dropped on every path.
 */
static void tcp_keepalive_timer (unsigned long data)
{
	struct sock *sk = (struct sock *) data;
	struct tcp_sock *tp = tcp_sk(sk);
	__u32 elapsed;

	/* Only process if socket is not in use. */
	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */ 
		tcp_reset_keepalive_timer (sk, HZ/20);
		goto out;
	}

	/* Listening sockets use this timer for their open-request queue. */
	if (sk->sk_state == TCP_LISTEN) {
		tcp_synack_timer(sk);
		goto out;
	}

	/* Orphaned socket in FIN_WAIT2: hand the time remaining beyond
	 * TCP_TIMEWAIT_LEN to tcp_time_wait(), otherwise reset and destroy.
	 */
	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
		if (tp->linger2 >= 0) {
			int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;

			if (tmo > 0) {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
		tcp_send_active_reset(sk, GFP_ATOMIC);
		goto death;
	}

	/* Keepalive not enabled, or socket closed: do not re-arm. */
	if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
		goto out;

	/* Default delay if we reschedule without probing. */
	elapsed = keepalive_time_when(tp);

	/* It is alive without keepalive 8) */
	if (tp->packets_out || sk->sk_send_head)
		goto resched;

	/* Time since tp->rcv_tstamp. */
	elapsed = tcp_time_stamp - tp->rcv_tstamp;

	if (elapsed >= keepalive_time_when(tp)) {
		/* Probe limit reached (per-socket keepalive_probes if
		 * non-zero, else the global one): reset and abort.
		 */
		if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
		     (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
			tcp_send_active_reset(sk, GFP_ATOMIC);
			tcp_write_err(sk);
			goto out;
		}
		/* A result <= 0 counts as a probe; next one after the
		 * keepalive interval.
		 */
		if (tcp_write_wakeup(sk) <= 0) {
			tp->probes_out++;
			elapsed = keepalive_intvl_when(tp);
		} else {
			/* If keepalive was lost due to local congestion,
			 * try harder.
			 */
			elapsed = TCP_RESOURCE_PROBE_INTERVAL;
		}
	} else {
		/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
		elapsed = keepalive_time_when(tp) - elapsed;
	}

	TCP_CHECK_TIMER(sk);
	sk_stream_mem_reclaim(sk);

resched:
	/* Re-arm to fire 'elapsed' jiffies from now. */
	tcp_reset_keepalive_timer (sk, elapsed);
	goto out;

death:	
	tcp_done(sk);

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* Timer setup/teardown helpers defined in this file and exported
 * through EXPORT_SYMBOL.
 */
EXPORT_SYMBOL(tcp_clear_xmit_timers);
EXPORT_SYMBOL(tcp_delete_keepalive_timer);
EXPORT_SYMBOL(tcp_init_xmit_timers);
EXPORT_SYMBOL(tcp_reset_keepalive_timer);
Commit	Line	Data
1da177e4 LT	1	/*
	2	* INET An implementation of the TCP/IP protocol suite for the LINUX
	3	* operating system. INET is implemented using the BSD Socket
	4	* interface as the means of communication with the user level.
	5	*
	6	* Implementation of the Transmission Control Protocol(TCP).
	7	*
	8	* Version: $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
	9	*
02c30a84	10	* Authors: Ross Biro
1da177e4 LT	11	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
	12	* Mark Evans, <evansmp@uhura.aston.ac.uk>
	13	* Corey Minyard <wf-rch!minyard@relay.EU.net>
	14	* Florian La Roche, <flla@stud.uni-sb.de>
	15	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
	16	* Linus Torvalds, <torvalds@cs.helsinki.fi>
	17	* Alan Cox, <gw4pts@gw4pts.ampr.org>
	18	* Matthew Dillon, <dillon@apollo.west.oic.com>
	19	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
	20	* Jorge Cwik, <jorge@laser.satlink.net>
	21	*/
	22
	23	#include <linux/module.h>
	24	#include <net/tcp.h>
	25
	26	int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	27	int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	28	int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	29	int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	30	int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
	31	int sysctl_tcp_retries1 = TCP_RETR1;
	32	int sysctl_tcp_retries2 = TCP_RETR2;
	33	int sysctl_tcp_orphan_retries;
	34
	35	static void tcp_write_timer(unsigned long);
	36	static void tcp_delack_timer(unsigned long);
	37	static void tcp_keepalive_timer (unsigned long data);
	38
	39	#ifdef TCP_DEBUG
	40	const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
	41	EXPORT_SYMBOL(tcp_timer_bug_msg);
	42	#endif
	43
	44	/*
	45	* Using different timers for retransmit, delayed acks and probes
	46	* We may wish use just one timer maintaining a list of expire jiffies
	47	* to optimize.
	48	*/
	49
	50	void tcp_init_xmit_timers(struct sock *sk)
	51	{
	52	struct tcp_sock *tp = tcp_sk(sk);
	53
	54	init_timer(&tp->retransmit_timer);
	55	tp->retransmit_timer.function=&tcp_write_timer;
	56	tp->retransmit_timer.data = (unsigned long) sk;
	57	tp->pending = 0;
	58
	59	init_timer(&tp->delack_timer);
	60	tp->delack_timer.function=&tcp_delack_timer;
	61	tp->delack_timer.data = (unsigned long) sk;
	62	tp->ack.pending = 0;
	63
	64	init_timer(&sk->sk_timer);
	65	sk->sk_timer.function = &tcp_keepalive_timer;
	66	sk->sk_timer.data = (unsigned long)sk;
	67	}
	68
	69	void tcp_clear_xmit_timers(struct sock *sk)
	70	{
	71	struct tcp_sock *tp = tcp_sk(sk);
	72
	73	tp->pending = 0;
	74	sk_stop_timer(sk, &tp->retransmit_timer);
75
76	tp->ack.pending = 0;
77	tp->ack.blocked = 0;
78	sk_stop_timer(sk, &tp->delack_timer);
79
80	sk_stop_timer(sk, &sk->sk_timer);
81	}
82
83	static void tcp_write_err(struct sock *sk)
84	{
85	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
86	sk->sk_error_report(sk);
87
88	tcp_done(sk);
89	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
90	}
91
92	/* Do not allow orphaned sockets to eat all our resources.
93	* This is direct violation of TCP specs, but it is required
94	* to prevent DoS attacks. It is called when a retransmission timeout
95	* or zero probe timeout occurs on orphaned socket.
96	*
97	* Criterium is still not confirmed experimentally and may change.
98	* We kill the socket, if:
99	* 1. If number of orphaned sockets exceeds an administratively configured
100	* limit.
101	* 2. If we have strong memory pressure.
102	*/
103	static int tcp_out_of_resources(struct sock *sk, int do_reset)
104	{
105	struct tcp_sock *tp = tcp_sk(sk);
106	int orphans = atomic_read(&tcp_orphan_count);
107
108	/* If peer does not open window for long time, or did not transmit
109	* anything for long time, penalize it. */
110	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX \|\| !do_reset)
111	orphans <<= 1;
112
113	/* If some dubious ICMP arrived, penalize even more. */
114	if (sk->sk_err_soft)
115	orphans <<= 1;
116
117	if (orphans >= sysctl_tcp_max_orphans \|\|
118	(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
119	atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
120	if (net_ratelimit())
121	printk(KERN_INFO "Out of socket memory\n");
122
123	/* Catch exceptional cases, when connection requires reset.
124	* 1. Last segment was sent recently. */
125	if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN \|\|
126	/* 2. Window is closed. */
127	(!tp->snd_wnd && !tp->packets_out))
128	do_reset = 1;
129	if (do_reset)
130	tcp_send_active_reset(sk, GFP_ATOMIC);
131	tcp_done(sk);
132	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
133	return 1;
134	}
135	return 0;
136	}
137
138	/* Calculate maximal number or retries on an orphaned socket. */
139	static int tcp_orphan_retries(struct sock *sk, int alive)
140	{
141	int retries = sysctl_tcp_orphan_retries; /* May be zero. */
142
143	/* We know from an ICMP that something is wrong. */
144	if (sk->sk_err_soft && !alive)
145	retries = 0;
146
147	/* However, if socket sent something recently, select some safe
148	* number of retries. 8 corresponds to >100 seconds with minimal
149	* RTO of 200msec. */
150	if (retries == 0 && alive)
151	retries = 8;
152	return retries;
153	}
154
155	/* A write timeout has occurred. Process the after effects. */
156	static int tcp_write_timeout(struct sock *sk)
157	{
158	struct tcp_sock *tp = tcp_sk(sk);
159	int retry_until;
160
161	if ((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV)) {
162	if (tp->retransmits)
163	dst_negative_advice(&sk->sk_dst_cache);
164	retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
165	} else {
166	if (tp->retransmits >= sysctl_tcp_retries1) {
167	/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
168	hole detection. :-(
169
170	It is place to make it. It is not made. I do not want
171	to make it. It is disguisting. It does not work in any
172	case. Let me to cite the same draft, which requires for
173	us to implement this:
174
175	"The one security concern raised by this memo is that ICMP black holes
176	are often caused by over-zealous security administrators who block
177	all ICMP messages. It is vitally important that those who design and
178	deploy security systems understand the impact of strict filtering on
179	upper-layer protocols. The safest web site in the world is worthless
180	if most TCP implementations cannot transfer data from it. It would
181	be far nicer to have all of the black holes fixed rather than fixing
182	all of the TCP implementations."
183
184	Golden words :-).
185	*/
186
187	dst_negative_advice(&sk->sk_dst_cache);
188	}
189
190	retry_until = sysctl_tcp_retries2;
191	if (sock_flag(sk, SOCK_DEAD)) {
192	int alive = (tp->rto < TCP_RTO_MAX);
193
194	retry_until = tcp_orphan_retries(sk, alive);
195
196	if (tcp_out_of_resources(sk, alive \|\| tp->retransmits < retry_until))
197	return 1;
198	}
199	}
200
201	if (tp->retransmits >= retry_until) {
202	/* Has it gone just too far? */
203	tcp_write_err(sk);
204	return 1;
205	}
206	return 0;
207	}
208
209	static void tcp_delack_timer(unsigned long data)
210	{
211	struct sock sk = (struct sock)data;
212	struct tcp_sock *tp = tcp_sk(sk);
213
214	bh_lock_sock(sk);
215	if (sock_owned_by_user(sk)) {
216	/* Try again later. */
217	tp->ack.blocked = 1;
218	NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
219	sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
220	goto out_unlock;
221	}
222
223	sk_stream_mem_reclaim(sk);
224
225	if (sk->sk_state == TCP_CLOSE \|\| !(tp->ack.pending & TCP_ACK_TIMER))
226	goto out;
227
228	if (time_after(tp->ack.timeout, jiffies)) {
229	sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
230	goto out;
231	}
232	tp->ack.pending &= ~TCP_ACK_TIMER;
233
234	if (skb_queue_len(&tp->ucopy.prequeue)) {
235	struct sk_buff *skb;
236
237	NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED,
238	skb_queue_len(&tp->ucopy.prequeue));
239
240	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
241	sk->sk_backlog_rcv(sk, skb);
242
243	tp->ucopy.memory = 0;
244	}
245
246	if (tcp_ack_scheduled(tp)) {
247	if (!tp->ack.pingpong) {
248	/* Delayed ACK missed: inflate ATO. */
249	tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
250	} else {
251	/* Delayed ACK missed: leave pingpong mode and
252	* deflate ATO.
253	*/
254	tp->ack.pingpong = 0;
255	tp->ack.ato = TCP_ATO_MIN;
256	}
257	tcp_send_ack(sk);
258	NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
259	}
260	TCP_CHECK_TIMER(sk);
261
262	out:
263	if (tcp_memory_pressure)
264	sk_stream_mem_reclaim(sk);
265	out_unlock:
266	bh_unlock_sock(sk);
267	sock_put(sk);
268	}
269
270	static void tcp_probe_timer(struct sock *sk)
271	{
272	struct tcp_sock *tp = tcp_sk(sk);
273	int max_probes;
274
275	if (tp->packets_out \|\| !sk->sk_send_head) {
276	tp->probes_out = 0;
277	return;
278	}
279
280	/* WARNING RFC 1122 forbids this
281	*
282	* It doesn't AFAIK, because we kill the retransmit timer -AK
283	*
284	* FIXME: We ought not to do it, Solaris 2.5 actually has fixing
285	* this behaviour in Solaris down as a bug fix. [AC]
286	*
287	* Let me to explain. probes_out is zeroed by incoming ACKs
288	* even if they advertise zero window. Hence, connection is killed only
289	* if we received no ACKs for normal connection timeout. It is not killed
290	* only because window stays zero for some time, window may be zero
291	* until armageddon and even later. We are in full accordance
292	* with RFCs, only probe timer combines both retransmission timeout
293	* and probe timeout in one bottle. --ANK
294	*/
295	max_probes = sysctl_tcp_retries2;
296
297	if (sock_flag(sk, SOCK_DEAD)) {
298	int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
299
300	max_probes = tcp_orphan_retries(sk, alive);
301
302	if (tcp_out_of_resources(sk, alive \|\| tp->probes_out <= max_probes))
303	return;
304	}
305
306	if (tp->probes_out > max_probes) {
307	tcp_write_err(sk);
308	} else {
309	/* Only send another probe if we didn't close things up. */
310	tcp_send_probe0(sk);
311	}
312	}
313
314	/*
315	* The TCP retransmit timer.
316	*/
317
318	static void tcp_retransmit_timer(struct sock *sk)
319	{
320	struct tcp_sock *tp = tcp_sk(sk);
321
322	if (!tp->packets_out)
323	goto out;
324
325	BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));
326
327	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
328	!((1 << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))) {
329	/* Receiver dastardly shrinks window. Our retransmits
330	* become zero probes, but we should not timeout this
331	* connection. If the socket is an orphan, time it out,
332	* we cannot allow such beasts to hang infinitely.
333	*/
334	#ifdef TCP_DEBUG
335	if (net_ratelimit()) {
336	struct inet_sock *inet = inet_sk(sk);
337	printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
338	NIPQUAD(inet->daddr), htons(inet->dport),
339	inet->num, tp->snd_una, tp->snd_nxt);
340	}
341	#endif
342	if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
343	tcp_write_err(sk);
344	goto out;
345	}
346	tcp_enter_loss(sk, 0);
347	tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
348	__sk_dst_reset(sk);
349	goto out_reset_timer;
350	}
351
352	if (tcp_write_timeout(sk))
353	goto out;
354
355	if (tp->retransmits == 0) {
356	if (tp->ca_state == TCP_CA_Disorder \|\| tp->ca_state == TCP_CA_Recovery) {
357	if (tp->rx_opt.sack_ok) {
358	if (tp->ca_state == TCP_CA_Recovery)
359	NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
360	else
361	NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
362	} else {
363	if (tp->ca_state == TCP_CA_Recovery)
364	NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
365	else
366	NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
367	}
368	} else if (tp->ca_state == TCP_CA_Loss) {
369	NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
370	} else {
371	NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
372	}
373	}
374
375	if (tcp_use_frto(sk)) {
376	tcp_enter_frto(sk);
377	} else {
378	tcp_enter_loss(sk, 0);
379	}
380
381	if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
382	/* Retransmission failed because of local congestion,
383	* do not backoff.
384	*/
385	if (!tp->retransmits)
386	tp->retransmits=1;
387	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
388	min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
389	goto out;
390	}
391
392	/* Increase the timeout each time we retransmit. Note that
393	* we do not increase the rtt estimate. rto is initialized
394	* from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
395	* that doubling rto each time is the least we can get away with.
396	* In KA9Q, Karn uses this for the first few times, and then
397	* goes to quadratic. netBSD doubles, but only goes up to *64,
398	* and clamps at 1 to 64 sec afterwards. Note that 120 sec is
399	* defined in the protocol as the maximum possible RTT. I guess
400	* we'll have to use something other than TCP to talk to the
401	* University of Mars.
402	*
403	* PAWS allows us longer timeouts and large windows, so once
404	* implemented ftp to mars will work nicely. We will have to fix
405	* the 120 second clamps though!
406	*/
407	tp->backoff++;
408	tp->retransmits++;
409
410	out_reset_timer:
411	tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
412	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
413	if (tp->retransmits > sysctl_tcp_retries1)
414	__sk_dst_reset(sk);
415
416	out:;
417	}
418
419	static void tcp_write_timer(unsigned long data)
420	{
421	struct sock sk = (struct sock)data;
422	struct tcp_sock *tp = tcp_sk(sk);
423	int event;
424
425	bh_lock_sock(sk);
426	if (sock_owned_by_user(sk)) {
427	/* Try again later */
428	sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
429	goto out_unlock;
430	}
431
432	if (sk->sk_state == TCP_CLOSE \|\| !tp->pending)
433	goto out;
434
435	if (time_after(tp->timeout, jiffies)) {
436	sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
437	goto out;
438	}
439
440	event = tp->pending;
441	tp->pending = 0;
442
443	switch (event) {
444	case TCP_TIME_RETRANS:
445	tcp_retransmit_timer(sk);
446	break;
447	case TCP_TIME_PROBE0:
448	tcp_probe_timer(sk);
449	break;
450	}
451	TCP_CHECK_TIMER(sk);
452
453	out:
454	sk_stream_mem_reclaim(sk);
455	out_unlock:
456	bh_unlock_sock(sk);
457	sock_put(sk);
458	}
459
460	/*
461	* Timer for listening sockets
462	*/
463
464	static void tcp_synack_timer(struct sock *sk)
465	{
466	struct tcp_sock *tp = tcp_sk(sk);
467	struct tcp_listen_opt *lopt = tp->listen_opt;
468	int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
469	int thresh = max_retries;
470	unsigned long now = jiffies;
471	struct open_request *reqp, req;
472	int i, budget;
473
474	if (lopt == NULL \|\| lopt->qlen == 0)
475	return;
476
477	/* Normally all the openreqs are young and become mature
478	* (i.e. converted to established socket) for first timeout.
479	* If synack was not acknowledged for 3 seconds, it means
480	* one of the following things: synack was lost, ack was lost,
481	* rtt is high or nobody planned to ack (i.e. synflood).
482	* When server is a bit loaded, queue is populated with old
483	* open requests, reducing effective size of queue.
484	* When server is well loaded, queue size reduces to zero
485	* after several minutes of work. It is not synflood,
486	* it is normal operation. The solution is pruning
487	* too old entries overriding normal timeout, when
488	* situation becomes dangerous.
489	*
490	* Essentially, we reserve half of room for young
491	* embrions; and abort old ones without pity, if old
492	* ones are about to clog our table.
493	*/
494	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
495	int young = (lopt->qlen_young<<1);
496
497	while (thresh > 2) {
498	if (lopt->qlen < young)
499	break;
500	thresh--;
501	young <<= 1;
502	}
503	}
504
505	if (tp->defer_accept)
506	max_retries = tp->defer_accept;
507
508	budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
509	i = lopt->clock_hand;
510
511	do {
512	reqp=&lopt->syn_table[i];
513	while ((req = *reqp) != NULL) {
514	if (time_after_eq(now, req->expires)) {
515	if ((req->retrans < thresh \|\|
516	(req->acked && req->retrans < max_retries))
517	&& !req->class->rtx_syn_ack(sk, req, NULL)) {
518	unsigned long timeo;
519
520	if (req->retrans++ == 0)
521	lopt->qlen_young--;
522	timeo = min((TCP_TIMEOUT_INIT << req->retrans),
523	TCP_RTO_MAX);
524	req->expires = now + timeo;
525	reqp = &req->dl_next;
526	continue;
527	}
528
529	/* Drop this request */
530	write_lock(&tp->syn_wait_lock);
531	*reqp = req->dl_next;
532	write_unlock(&tp->syn_wait_lock);
533	lopt->qlen--;
534	if (req->retrans == 0)
535	lopt->qlen_young--;
536	tcp_openreq_free(req);
537	continue;
538	}
539	reqp = &req->dl_next;
540	}
541
542	i = (i+1)&(TCP_SYNQ_HSIZE-1);
543
544	} while (--budget > 0);
545
546	lopt->clock_hand = i;
547
548	if (lopt->qlen)
549	tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
550	}
551
552	void tcp_delete_keepalive_timer (struct sock *sk)
553	{
554	sk_stop_timer(sk, &sk->sk_timer);
555	}
556
557	void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
558	{
559	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
560	}
561
562	void tcp_set_keepalive(struct sock *sk, int val)
563	{
564	if ((1 << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN))
565	return;
566
567	if (val && !sock_flag(sk, SOCK_KEEPOPEN))
568	tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
569	else if (!val)
570	tcp_delete_keepalive_timer(sk);
571	}
572
573
574	static void tcp_keepalive_timer (unsigned long data)
575	{
576	struct sock sk = (struct sock ) data;
577	struct tcp_sock *tp = tcp_sk(sk);
578	__u32 elapsed;
579
580	/* Only process if socket is not in use. */
581	bh_lock_sock(sk);
582	if (sock_owned_by_user(sk)) {
583	/* Try again later. */
584	tcp_reset_keepalive_timer (sk, HZ/20);
585	goto out;
586	}
587
588	if (sk->sk_state == TCP_LISTEN) {
589	tcp_synack_timer(sk);
590	goto out;
591	}
592
593	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
594	if (tp->linger2 >= 0) {
595	int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
596
597	if (tmo > 0) {
598	tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
599	goto out;
600	}
601	}
602	tcp_send_active_reset(sk, GFP_ATOMIC);
603	goto death;
604	}
605
606	if (!sock_flag(sk, SOCK_KEEPOPEN) \|\| sk->sk_state == TCP_CLOSE)
607	goto out;
608
609	elapsed = keepalive_time_when(tp);
610
611	/* It is alive without keepalive 8) */
612	if (tp->packets_out \|\| sk->sk_send_head)
613	goto resched;
614
615	elapsed = tcp_time_stamp - tp->rcv_tstamp;
616
617	if (elapsed >= keepalive_time_when(tp)) {
618	if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) \|\|
619	(tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
620	tcp_send_active_reset(sk, GFP_ATOMIC);
621	tcp_write_err(sk);
622	goto out;
623	}
624	if (tcp_write_wakeup(sk) <= 0) {
625	tp->probes_out++;
626	elapsed = keepalive_intvl_when(tp);
627	} else {
628	/* If keepalive was lost due to local congestion,
629	* try harder.
630	*/
631	elapsed = TCP_RESOURCE_PROBE_INTERVAL;
632	}
633	} else {
634	/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
635	elapsed = keepalive_time_when(tp) - elapsed;
636	}
637
638	TCP_CHECK_TIMER(sk);
639	sk_stream_mem_reclaim(sk);
640
641	resched:
642	tcp_reset_keepalive_timer (sk, elapsed);
643	goto out;
644
645	death:
646	tcp_done(sk);
647
648	out:
649	bh_unlock_sock(sk);
650	sock_put(sk);
651	}
652
/* Timer helpers exported with EXPORT_SYMBOL (presumably for modular users
 * of the TCP timers -- confirm against callers outside this file).
 */
653	EXPORT_SYMBOL(tcp_clear_xmit_timers);
654	EXPORT_SYMBOL(tcp_delete_keepalive_timer);
655	EXPORT_SYMBOL(tcp_init_xmit_timers);
656	EXPORT_SYMBOL(tcp_reset_keepalive_timer);