/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id$
 *
 * Authors:	Ross Biro, 
 *		Fred N. van Kempen, 
 *		Mark Evans, 
 *		Corey Minyard 
 *		Florian La Roche, 
 *		Charles Hedrick, 
 *		Linus Torvalds, 
 *		Alan Cox, 
 *		Matthew Dillon, 
 *		Arnt Gulbrandsen, 
 *		Jorge Cwik, 
 */

#include <net/tcp.h>

int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;

static void tcp_sltimer_handler(unsigned long);
static void tcp_syn_recv_timer(unsigned long);
static void tcp_keepalive(unsigned long data);
static void tcp_twkill(unsigned long);

struct timer_list	tcp_slow_timer = {
	NULL, NULL,
	0, 0,
	tcp_sltimer_handler,
};


struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
	{ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK	*/
	{ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive},  /* KEEPALIVE	*/
	{ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill}         /* TWKILL	*/
};

const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";

/*
 * Using different timers for retransmit, delayed acks and probes
 * We may wish to use just one timer maintaining a list of expiry
 * jiffies to optimize.
 */

void tcp_init_xmit_timers(struct sock *sk)
{
	init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer);
	sk->tp_pinfo.af_tcp.retransmit_timer.function=&tcp_retransmit_timer;
	sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk;
	
	init_timer(&sk->tp_pinfo.af_tcp.delack_timer);
	sk->tp_pinfo.af_tcp.delack_timer.function=&tcp_delack_timer;
	sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk;

	init_timer(&sk->tp_pinfo.af_tcp.probe_timer);
	sk->tp_pinfo.af_tcp.probe_timer.function=&tcp_probe_timer;
	sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk;
}

/*
 *	Reset the retransmission timer
 */
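/*
 *	Typical use, e.g. when backing off after a retransmission:
 *
 *		tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 */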
 
void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	switch (what) {
	case TIME_RETRANS:
		/* When setting the retransmit timer the probe timer
		 * should not be set.
		 * The delayed ack timer can be set if we are changing the
		 * retransmit timer when removing acked frames.
		 */
		if(tp->probe_timer.prev)
			del_timer(&tp->probe_timer);
		mod_timer(&tp->retransmit_timer, jiffies+when);
		break;

	case TIME_DACK:
		mod_timer(&tp->delack_timer, jiffies+when);
		break;

	case TIME_PROBE0:
		mod_timer(&tp->probe_timer, jiffies+when);
		break;	

	case TIME_WRITE:
		printk(KERN_DEBUG "bug: tcp_reset_xmit_timer TIME_WRITE\n");
		break;

	default:
		printk(KERN_DEBUG "bug: unknown timer value\n");
	};
}

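/*
 *	Stop all transmit-side timers.  Called when the connection is
 *	being torn down; a timer is only deleted if it is actually
 *	pending (its 'prev' link is non-NULL while it is queued).
 */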
void tcp_clear_xmit_timers(struct sock *sk)
{	
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	if(tp->retransmit_timer.prev)
		del_timer(&tp->retransmit_timer);
	if(tp->delack_timer.prev)
		del_timer(&tp->delack_timer);
	if(tp->probe_timer.prev)
		del_timer(&tp->probe_timer);
}

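/*
 *	Deliver the pending soft error (or ETIMEDOUT) to the user and shut
 *	the connection down.  Unless 'force' is set, sockets that were
 *	already closing (FIN_WAIT1/FIN_WAIT2/CLOSING) go into TIME_WAIT;
 *	everything else is closed outright.  Returns 0 when the socket was
 *	closed, 1 when it went into TIME_WAIT.
 */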
static int tcp_write_err(struct sock *sk, int force)
{
	sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT;
	sk->error_report(sk);
	
	tcp_clear_xmit_timers(sk);
	
	/* Time wait the socket. */
	if (!force && ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING))) {
		tcp_time_wait(sk);
	} else {
		/* Clean up time. */
		tcp_set_state(sk, TCP_CLOSE);
		return 0;
	}
	return 1;
}

/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Look for a 'soft' timeout. */
	if ((sk->state == TCP_ESTABLISHED &&
	     tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) ||
	    (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) {
		dst_negative_advice(&sk->dst_cache);
	}
	
	/* Have we tried to SYN too many times (repent repent 8)) */
	if(tp->retransmits > sysctl_tcp_syn_retries && sk->state==TCP_SYN_SENT) {
		tcp_write_err(sk, 1);
		/* Don't FIN, we got nothing back */
		return 0;
	}

	/* Has it gone just too far? */
	if (tp->retransmits > sysctl_tcp_retries2) 
		return tcp_write_err(sk, 0);

	return 1;
}

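/*
 *	Delayed ACK timer.  If the socket is not locked by a user context
 *	the pending ACK is sent now, otherwise the ACK is re-deferred for
 *	a short while.
 */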
void tcp_delack_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;

	if(!sk->zapped &&
	   sk->tp_pinfo.af_tcp.delayed_acks &&
	   sk->state != TCP_CLOSE) {
		/* If socket is currently locked, defer the ACK. */
		if (!atomic_read(&sk->sock_readers))
			tcp_send_ack(sk);
		else
			tcp_send_delayed_ack(&(sk->tp_pinfo.af_tcp), HZ/10);
	}
}

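/*
 *	Zero window probe timer.  Either give up on the connection after
 *	too many unanswered probes, or send another probe.
 */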
void tcp_probe_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	if(sk->zapped) 
		return;
	
	if (atomic_read(&sk->sock_readers)) {
		/* Try again later. */
		tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5);
		return;
	}

	/* *WARNING* RFC 1122 forbids this.
	 * It doesn't AFAIK, because we kill the retransmit timer -AK
	 * FIXME: We ought not to do it; Solaris 2.5 actually lists fixing
	 * this behaviour as a bug fix. [AC]
	 */
	if (tp->probes_out > sysctl_tcp_retries2) {
		if(sk->err_soft)
			sk->err = sk->err_soft;
		else
			sk->err = ETIMEDOUT;
		sk->error_report(sk);

		if ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) {
			/* Time wait the socket. */
			tcp_time_wait(sk);
		} else {
			/* Clean up time. */
			tcp_set_state(sk, TCP_CLOSE);
		}
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
	}
}

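/*
 *	Run the keepalive machinery for one socket.  Returns 1 if a
 *	keepalive probe was sent, so the caller can limit the number of
 *	probes generated per timer tick.
 */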
static __inline__ int tcp_keepopen_proc(struct sock *sk)
{
	int res = 0;

	if ((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2)) {
		struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
		__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;

		if (elapsed >= sysctl_tcp_keepalive_time) {
			if (tp->probes_out > sysctl_tcp_keepalive_probes) {
				if(sk->err_soft)
					sk->err = sk->err_soft;
				else
					sk->err = ETIMEDOUT;

				tcp_set_state(sk, TCP_CLOSE);
				sk->shutdown = SHUTDOWN_MASK;
				if (!sk->dead)
					sk->state_change(sk);
			} else {
				tp->probes_out++;
				tp->pending = TIME_KEEPOPEN;
				tcp_write_wakeup(sk);
				res = 1;
			}
		}
	}
	return res;
}

/* Kill off TIME_WAIT sockets once their lifetime has expired. */
int tcp_tw_death_row_slot = 0;
static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] =
	{ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };

extern void tcp_timewait_kill(struct tcp_tw_bucket *tw);

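/*
 *	Reap every TIME_WAIT socket hanging off the current death row
 *	slot, drop one slow-timer reference per socket killed, then
 *	advance to the next slot.
 */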
static void tcp_twkill(unsigned long data)
{
	struct tcp_tw_bucket *tw;
	int killed = 0;

	tw = tcp_tw_death_row[tcp_tw_death_row_slot];
	tcp_tw_death_row[tcp_tw_death_row_slot] = NULL;
	while(tw != NULL) {
		struct tcp_tw_bucket *next = tw->next_death;

		tcp_timewait_kill(tw);
		killed++;
		tw = next;
	}
	if(killed != 0) {
		struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data;
		atomic_sub(killed, &slt->count);
	}
	tcp_tw_death_row_slot =
	  ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
}

/* These are always called from BH context.  See callers in
 * tcp_input.c to verify this.
 */
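/* A bucket is queued on the slot "behind" the one tcp_twkill will run
 * next, so it waits close to a full revolution of the wheel
 * (TCP_TWKILL_SLOTS * TCP_TWKILL_PERIOD) before being reaped.
 */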
void tcp_tw_schedule(struct tcp_tw_bucket *tw)
{
	int slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
	struct tcp_tw_bucket **tpp = &tcp_tw_death_row[slot];

	if((tw->next_death = *tpp) != NULL)
		(*tpp)->pprev_death = &tw->next_death;
	*tpp = tw;
	tw->pprev_death = tpp;

	tw->death_slot = slot;

	tcp_inc_slow_timer(TCP_SLT_TWKILL);
}

/* Happens rarely if at all, no care about scalability here. */
void tcp_tw_reschedule(struct tcp_tw_bucket *tw)
{
	struct tcp_tw_bucket **tpp;
	int slot;

	if(tw->next_death)
		tw->next_death->pprev_death = tw->pprev_death;
	*tw->pprev_death = tw->next_death;
	tw->pprev_death = NULL;

	slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
	tpp = &tcp_tw_death_row[slot];
	if((tw->next_death = *tpp) != NULL)
		(*tpp)->pprev_death = &tw->next_death;
	*tpp = tw;
	tw->pprev_death = tpp;

	tw->death_slot = slot;
	/* Timer was incremented when we first entered the table. */
}

/* This is for handling early-kills of TIME_WAIT sockets. */
void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
{
	if(tw->next_death)
		tw->next_death->pprev_death = tw->pprev_death;
	*tw->pprev_death = tw->next_death;
	tw->pprev_death = NULL;
	tcp_dec_slow_timer(TCP_SLT_TWKILL);
}

/*
 *	Check all sockets for keepalive timer
 *	Called every 75 seconds
 *	This timer is started by af_inet init routine and is constantly
 *	running.
 *
 *	It might be better to maintain a count of sockets that need it using
 *	setsockopt/tcp_destroy_sk and only set the timer when needed.
 */

/*
 *	don't send over 5 keepopens at a time to avoid burstiness 
 *	on big servers [AC]
 */
#define MAX_KA_PROBES	5

int sysctl_tcp_max_ka_probes = MAX_KA_PROBES;

/* Keepopen's are only valid for "established" TCP's, nicely our listener
 * hash gets rid of most of the useless testing, so we run through a couple
 * of the established hash chains each clock tick.  -DaveM
 *
 * And now, even more magic... TIME_WAIT TCP's cannot have keepalive probes
 * going off for them, so we only need check the first half of the established
 * hash table, even less testing under heavy load.
 *
 * I _really_ would rather do this by adding a new timer_struct to struct sock,
 * and this way only those who set the keepalive option will get the overhead.
 * The idea is you set it for 2 hours when the sock is first connected, when it
 * does fire off (if at all, most sockets die earlier) you check for the keepalive
 * option and also if the sock has been idle long enough to start probing.
 */
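/* Each pass below scans ((tcp_ehash_size/2) >> 2) chains, i.e. one
 * quarter of the established half of the hash table, and remembers
 * where it stopped, so that half is covered every four ticks.
 */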
static void tcp_keepalive(unsigned long data)
{
	static int chain_start = 0;
	int count = 0;
	int i;
	
	for(i = chain_start; i < (chain_start + ((tcp_ehash_size/2) >> 2)); i++) {
		struct sock *sk = tcp_ehash[i];
		while(sk) {
			if(!atomic_read(&sk->sock_readers) && sk->keepopen) {
				count += tcp_keepopen_proc(sk);
				if(count == sysctl_tcp_max_ka_probes)
					goto out;
			}
			sk = sk->next;
		}
	}
out:
	chain_start = ((chain_start + ((tcp_ehash_size/2)>>2)) &
		       ((tcp_ehash_size/2) - 1));
}

/*
 *	The TCP retransmit timer. This lacks a few small details.
 *
 *	1. 	An initial rtt timeout on the probe0 should cause as much as
 *		we can of the first write queue buffer to be split and sent.
 *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
 *		ETIMEDOUT if we know an additional 'soft' error caused this.
 *		tcp_err should save a 'soft error' for us.
 *	[Unless someone has broken it then it does, except for one 2.0 
 *	broken case of a send when the route/device is directly unreachable,
 *	and we error but should retry! - FIXME] [AC]
 */

void tcp_retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	/* We are reset. We will send no more retransmits. */
	if(sk->zapped) {
		tcp_clear_xmit_timer(sk, TIME_RETRANS);
		return;
	}

	if (atomic_read(&sk->sock_readers)) {
		/* Try again later */  
		tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20);
		return;
	}

	/* Clear delay ack timer. */
	tcp_clear_xmit_timer(sk, TIME_DACK);

	/* RFC 2018: clear all 'sacked' flags in the retransmission queue.
	 * The receiver may have reneged on out-of-order frames it SACKed
	 * and we must send them out should this timer fire on us.
	 */
	if(tp->sack_ok) {
		struct sk_buff *skb = skb_peek(&sk->write_queue);

		while((skb != NULL) &&
		      (skb != tp->send_head) &&
		      (skb != (struct sk_buff *)&sk->write_queue)) {
			TCP_SKB_CB(skb)->sacked &=
				~(TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS);
			skb = skb->next;
		}
	}

	/* Retransmission. */
	tp->retrans_head = NULL;
	tp->rexmt_done = 0;
	tp->fackets_out = 0;
	tp->retrans_out = 0;
	if (tp->retransmits == 0) {
		/* Remember window where we lost:
		 * "one half of the current window but at least 2 segments"
		 *
		 * Here "current window" means the effective one, which
		 * means it must be an accurate representation of our current
		 * sending rate _and_ the snd_wnd.
		 */
		tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
		tp->snd_cwnd_cnt = 0;
		tp->snd_cwnd = 1;
	}

	tp->retransmits++;

	tp->dup_acks = 0;
	tp->high_seq = tp->snd_nxt;
	tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));

	/* Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
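	/* E.g. starting from an rto of, say, 3*HZ the retransmission
	 * intervals run 3, 6, 12, 24, 48, 96 and then stay clamped at
	 * 120 seconds.
	 */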
	tp->backoff++;
	tp->rto = min(tp->rto << 1, 120*HZ);
	tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);

	tcp_write_timeout(sk);
}

/*
 *	Slow timer for SYN-RECV sockets
 */

/* This now scales very nicely. -DaveM */
static void tcp_syn_recv_timer(unsigned long data)
{
	struct sock *sk;
	unsigned long now = jiffies;
	int i;

	for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
		sk = tcp_listening_hash[i];

		while(sk) {
			struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
			
			/* TCP_LISTEN is implied. */
			if (!atomic_read(&sk->sock_readers) && tp->syn_wait_queue) {
				struct open_request *prev = (struct open_request *)(&tp->syn_wait_queue);
				struct open_request *req = tp->syn_wait_queue;
				do {
					struct open_request *conn;
				  
					conn = req;
					req = req->dl_next;

					if (conn->sk ||
					    ((long)(now - conn->expires)) <= 0) {
						prev = conn; 
						continue; 
					}

					tcp_synq_unlink(tp, conn, prev);
					if (conn->retrans >= sysctl_tcp_retries1) {
#ifdef TCP_DEBUG
						printk(KERN_DEBUG "syn_recv: "
						       "too many retransmits\n");
#endif
						(*conn->class->destructor)(conn);
						tcp_dec_slow_timer(TCP_SLT_SYNACK);
						tp->syn_backlog--;
						tcp_openreq_free(conn);

						if (!tp->syn_wait_queue)
							break;
					} else {
						unsigned long timeo;
						struct open_request *op; 

						(*conn->class->rtx_syn_ack)(sk, conn);

						conn->retrans++;
#ifdef TCP_DEBUG
						printk(KERN_DEBUG "syn_ack rtx %d\n",
						       conn->retrans);
#endif
						timeo = min((TCP_TIMEOUT_INIT 
							     << conn->retrans),
							    120*HZ);
						conn->expires = now + timeo;
						op = prev->dl_next; 
						tcp_synq_queue(tp, conn);
						if (op != prev->dl_next)
							prev = prev->dl_next;
					}
					/* old prev still valid here */
				} while (req);
			}
			sk = sk->next;
		}
	}
}

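/*
 *	Master handler for the shared slow timer: run every sub-timer
 *	whose period has elapsed and re-arm tcp_slow_timer for the
 *	earliest remaining expiry, if any sub-timer still has work.
 */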
void tcp_sltimer_handler(unsigned long data)
{
	struct tcp_sl_timer *slt = tcp_slt_array;
	unsigned long next = ~0UL;
	unsigned long now = jiffies;
	int i;

	for (i=0; i < TCP_SLT_MAX; i++, slt++) {
		if (atomic_read(&slt->count)) {
			long trigger;

			trigger = slt->period - ((long)(now - slt->last));

			if (trigger <= 0) {
				(*slt->handler)((unsigned long) slt);
				slt->last = now;
				trigger = slt->period;
			}

			/* Only reschedule if some events remain. */
			if (atomic_read(&slt->count))
				next = min(next, trigger);
		}
	}
	if (next != ~0UL)
		mod_timer(&tcp_slow_timer, (now + next));
}

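/*
 *	Arm (or pull forward) the shared slow timer so that it fires no
 *	later than one period of 'slt' from now.
 */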
void __tcp_inc_slow_timer(struct tcp_sl_timer *slt)
{
	unsigned long now = jiffies;
	unsigned long when;

	slt->last = now;

	when = now + slt->period;

	if (tcp_slow_timer.prev) {
		if ((long)(tcp_slow_timer.expires - when) >= 0)
			mod_timer(&tcp_slow_timer, when);
	} else {
		tcp_slow_timer.expires = when;
		add_timer(&tcp_slow_timer);
	}
}