path: root/pfinet/linux-src/net/ipv4/tcp_timer.c
author	Roland McGrath <roland@gnu.org>	2000-02-04 03:21:18 +0000
committer	Roland McGrath <roland@gnu.org>	2000-02-04 03:21:18 +0000
commit	9fd51e9b0ad33a89a83fdbbb66bd20d85f7893fb (patch)
tree	8845b79f170028cb4380045c50277bbf075b5b7d /pfinet/linux-src/net/ipv4/tcp_timer.c
Import of Linux 2.2.12 subset (ipv4 stack and related)
Diffstat (limited to 'pfinet/linux-src/net/ipv4/tcp_timer.c')
-rw-r--r--	pfinet/linux-src/net/ipv4/tcp_timer.c	595
1 file changed, 595 insertions(+), 0 deletions(-)
diff --git a/pfinet/linux-src/net/ipv4/tcp_timer.c b/pfinet/linux-src/net/ipv4/tcp_timer.c
new file mode 100644
index 00000000..21029f8e
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/tcp_timer.c
@@ -0,0 +1,595 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version: $Id: tcp_timer.c,v 1.62.2.3 1999/06/20 20:14:30 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <net/tcp.h>
+
+int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
+int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
+int sysctl_tcp_retries1 = TCP_RETR1;
+int sysctl_tcp_retries2 = TCP_RETR2;
+
+static void tcp_sltimer_handler(unsigned long);
+static void tcp_syn_recv_timer(unsigned long);
+static void tcp_keepalive(unsigned long data);
+static void tcp_twkill(unsigned long);
+
+struct timer_list tcp_slow_timer = {
+ NULL, NULL,
+ 0, 0,
+ tcp_sltimer_handler,
+};
+
+
+struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
+ {ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */
+ {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive}, /* KEEPALIVE */
+ {ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill} /* TWKILL */
+};
+
+const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
+
+/*
+ * Using different timers for retransmit, delayed acks and probes.
+ * We may wish to use just one timer maintaining a list of expiry
+ * jiffies to optimize.
+ */
+
+void tcp_init_xmit_timers(struct sock *sk)
+{
+ init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer);
+ sk->tp_pinfo.af_tcp.retransmit_timer.function=&tcp_retransmit_timer;
+ sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk;
+
+ init_timer(&sk->tp_pinfo.af_tcp.delack_timer);
+ sk->tp_pinfo.af_tcp.delack_timer.function=&tcp_delack_timer;
+ sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk;
+
+ init_timer(&sk->tp_pinfo.af_tcp.probe_timer);
+ sk->tp_pinfo.af_tcp.probe_timer.function=&tcp_probe_timer;
+ sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk;
+}
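+
+/* A minimal sketch of the 2.2-era timer pattern used above; the handler
+ * name below is hypothetical.  A timer_list carries an opaque `data'
+ * word that is handed back to `function' when the timer fires, so each
+ * handler recovers its socket with a cast:
+ *
+ *	static void my_timer(unsigned long data)
+ *	{
+ *		struct sock *sk = (struct sock *) data;
+ *		...
+ *	}
+ *
+ *	init_timer(&timer);
+ *	timer.function = &my_timer;
+ *	timer.data = (unsigned long) sk;
+ *	mod_timer(&timer, jiffies + HZ);	(fire one second out)
+ */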
+
+/*
+ * Reset the retransmission timer
+ */
+
+void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ switch (what) {
+ case TIME_RETRANS:
+ /* When setting the retransmit timer, the probe timer
+ * should not be set.
+ * The delayed ack timer can be set if we are changing the
+ * retransmit timer when removing acked frames.
+ */
+ if(tp->probe_timer.prev)
+ del_timer(&tp->probe_timer);
+ mod_timer(&tp->retransmit_timer, jiffies+when);
+ break;
+
+ case TIME_DACK:
+ mod_timer(&tp->delack_timer, jiffies+when);
+ break;
+
+ case TIME_PROBE0:
+ mod_timer(&tp->probe_timer, jiffies+when);
+ break;
+
+ case TIME_WRITE:
+ printk(KERN_DEBUG "bug: tcp_reset_xmit_timer TIME_WRITE\n");
+ break;
+
+ default:
+ printk(KERN_DEBUG "bug: unknown timer value\n");
+ };
+}
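+
+/* Usage sketch: `when' is relative; each case above adds it to jiffies.
+ * E.g. the retransmit path later in this file re-arms for the current
+ * RTO, and a delayed ACK could be armed a tenth of a second out:
+ *
+ *	tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ *	tcp_reset_xmit_timer(sk, TIME_DACK, HZ/10);
+ */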
+
+void tcp_clear_xmit_timers(struct sock *sk)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ if(tp->retransmit_timer.prev)
+ del_timer(&tp->retransmit_timer);
+ if(tp->delack_timer.prev)
+ del_timer(&tp->delack_timer);
+ if(tp->probe_timer.prev)
+ del_timer(&tp->probe_timer);
+}
+
+static int tcp_write_err(struct sock *sk, int force)
+{
+ sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT;
+ sk->error_report(sk);
+
+ tcp_clear_xmit_timers(sk);
+
+ /* Time wait the socket. */
+ if (!force && ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING))) {
+ tcp_time_wait(sk);
+ } else {
+ /* Clean up time. */
+ tcp_set_state(sk, TCP_CLOSE);
+ return 0;
+ }
+ return 1;
+}
+
+/* A write timeout has occurred. Process the after effects. */
+static int tcp_write_timeout(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* Look for a 'soft' timeout. */
+ if ((sk->state == TCP_ESTABLISHED &&
+ tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) ||
+ (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) {
+ dst_negative_advice(&sk->dst_cache);
+ }
+
+ /* Have we tried to SYN too many times (repent repent 8)) */
+ if(tp->retransmits > sysctl_tcp_syn_retries && sk->state==TCP_SYN_SENT) {
+ tcp_write_err(sk, 1);
+ /* Don't FIN, we got nothing back */
+ return 0;
+ }
+
+ /* Has it gone just too far? */
+ if (tp->retransmits > sysctl_tcp_retries2)
+ return tcp_write_err(sk, 0);
+
+ return 1;
+}
+
+void tcp_delack_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock*)data;
+
+ if(!sk->zapped &&
+ sk->tp_pinfo.af_tcp.delayed_acks &&
+ sk->state != TCP_CLOSE) {
+ /* If socket is currently locked, defer the ACK. */
+ if (!atomic_read(&sk->sock_readers))
+ tcp_send_ack(sk);
+ else
+ tcp_send_delayed_ack(&(sk->tp_pinfo.af_tcp), HZ/10);
+ }
+}
+
+void tcp_probe_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock*)data;
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ if(sk->zapped)
+ return;
+
+ if (atomic_read(&sk->sock_readers)) {
+ /* Try again later. */
+ tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5);
+ return;
+ }
+
+ /* *WARNING* RFC 1122 forbids this.
+ * It doesn't AFAIK, because we kill the retransmit timer. -AK
+ * FIXME: We ought not to do it; Solaris 2.5 actually lists fixing
+ * this behaviour as a bug fix. [AC]
+ */
+ if (tp->probes_out > sysctl_tcp_retries2) {
+ if(sk->err_soft)
+ sk->err = sk->err_soft;
+ else
+ sk->err = ETIMEDOUT;
+ sk->error_report(sk);
+
+ if ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) {
+ /* Time wait the socket. */
+ tcp_time_wait(sk);
+ } else {
+ /* Clean up time. */
+ tcp_set_state(sk, TCP_CLOSE);
+ }
+ } else {
+ /* Only send another probe if we didn't close things up. */
+ tcp_send_probe0(sk);
+ }
+}
+
+static __inline__ int tcp_keepopen_proc(struct sock *sk)
+{
+ int res = 0;
+
+ if ((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2)) {
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
+
+ if (elapsed >= sysctl_tcp_keepalive_time) {
+ if (tp->probes_out > sysctl_tcp_keepalive_probes) {
+ if(sk->err_soft)
+ sk->err = sk->err_soft;
+ else
+ sk->err = ETIMEDOUT;
+
+ tcp_set_state(sk, TCP_CLOSE);
+ sk->shutdown = SHUTDOWN_MASK;
+ if (!sk->dead)
+ sk->state_change(sk);
+ } else {
+ tp->probes_out++;
+ tp->pending = TIME_KEEPOPEN;
+ tcp_write_wakeup(sk);
+ res = 1;
+ }
+ }
+ }
+ return res;
+}
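+
+/* Worked example, assuming the usual defaults (two hour keepalive time,
+ * nine probes, with the keepalive handler below firing every 75 seconds):
+ * an established socket idle for two hours gets its first probe on the
+ * next keepalive tick, and if no probe is ever answered the connection
+ * is reset roughly 9 * 75 seconds after that.
+ */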
+
+/* Kill off TIME_WAIT sockets once their lifetime has expired. */
+int tcp_tw_death_row_slot = 0;
+static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] =
+ { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
+
+extern void tcp_timewait_kill(struct tcp_tw_bucket *tw);
+
+static void tcp_twkill(unsigned long data)
+{
+ struct tcp_tw_bucket *tw;
+ int killed = 0;
+
+ tw = tcp_tw_death_row[tcp_tw_death_row_slot];
+ tcp_tw_death_row[tcp_tw_death_row_slot] = NULL;
+ while(tw != NULL) {
+ struct tcp_tw_bucket *next = tw->next_death;
+
+ tcp_timewait_kill(tw);
+ killed++;
+ tw = next;
+ }
+ if(killed != 0) {
+ struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data;
+ atomic_sub(killed, &slt->count);
+ }
+ tcp_tw_death_row_slot =
+ ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
+}
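+
+/* Sketch of the wheel arithmetic (taking TCP_TWKILL_SLOTS to be 8, as
+ * the eight-entry initializer above suggests): the reaper advances
+ *
+ *	slot = (slot + 1) & (TCP_TWKILL_SLOTS - 1);	(0, 1, ... 7, 0, ...)
+ *
+ * one slot per period, while tcp_tw_schedule() below files new buckets
+ * one slot behind the cursor, so a freshly scheduled TIME_WAIT socket
+ * survives nearly a full revolution of the wheel, i.e. about
+ * TCP_TWKILL_SLOTS * TCP_TWKILL_PERIOD jiffies, before it is reaped.
+ */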
+
+/* These are always called from BH context. See callers in
+ * tcp_input.c to verify this.
+ */
+void tcp_tw_schedule(struct tcp_tw_bucket *tw)
+{
+ int slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
+ struct tcp_tw_bucket **tpp = &tcp_tw_death_row[slot];
+
+ if((tw->next_death = *tpp) != NULL)
+ (*tpp)->pprev_death = &tw->next_death;
+ *tpp = tw;
+ tw->pprev_death = tpp;
+
+ tw->death_slot = slot;
+
+ tcp_inc_slow_timer(TCP_SLT_TWKILL);
+}
+
+/* Happens rarely if at all, no care about scalability here. */
+void tcp_tw_reschedule(struct tcp_tw_bucket *tw)
+{
+ struct tcp_tw_bucket **tpp;
+ int slot;
+
+ if(tw->next_death)
+ tw->next_death->pprev_death = tw->pprev_death;
+ *tw->pprev_death = tw->next_death;
+ tw->pprev_death = NULL;
+
+ slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
+ tpp = &tcp_tw_death_row[slot];
+ if((tw->next_death = *tpp) != NULL)
+ (*tpp)->pprev_death = &tw->next_death;
+ *tpp = tw;
+ tw->pprev_death = tpp;
+
+ tw->death_slot = slot;
+ /* Timer was incremented when we first entered the table. */
+}
+
+/* This is for handling early-kills of TIME_WAIT sockets. */
+void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
+{
+ if(tw->next_death)
+ tw->next_death->pprev_death = tw->pprev_death;
+ *tw->pprev_death = tw->next_death;
+ tw->pprev_death = NULL;
+ tcp_dec_slow_timer(TCP_SLT_TWKILL);
+}
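+
+/* The pprev_death links above are the classic "address of the previous
+ * next pointer" idiom: unlink never walks the chain.
+ *
+ *	if (tw->next_death)
+ *		tw->next_death->pprev_death = tw->pprev_death;
+ *	*tw->pprev_death = tw->next_death;
+ *
+ * works whether tw is first in its slot (pprev_death then points at the
+ * tcp_tw_death_row[] entry itself) or sits mid-chain (it points at the
+ * previous bucket's next_death field).
+ */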
+
+/*
+ * Check all sockets for the keepalive timer.
+ * Called every 75 seconds.
+ * This timer is started by the af_inet init routine and runs
+ * constantly.
+ *
+ * It might be better to maintain a count of sockets that need it, using
+ * setsockopt/tcp_destroy_sk, and only set the timer when needed.
+ */
+
+/*
+ * Don't send more than 5 keepopens at a time, to avoid burstiness
+ * on big servers. [AC]
+ */
+#define MAX_KA_PROBES 5
+
+int sysctl_tcp_max_ka_probes = MAX_KA_PROBES;
+
+/* Keepopens are only valid for "established" TCPs; nicely, our listener
+ * hash gets rid of most of the useless testing, so we run through a couple
+ * of the established hash chains each clock tick. -DaveM
+ *
+ * And now, even more magic... TIME_WAIT TCPs cannot have keepalive probes
+ * going off for them, so we only need check the first half of the established
+ * hash table; even less testing under heavy load.
+ *
+ * I _really_ would rather do this by adding a new timer_struct to struct sock,
+ * as that way only those who set the keepalive option get the overhead.
+ * The idea is you set it for 2 hours when the sock is first connected; when it
+ * does fire (if at all; most sockets die earlier) you check for the keepalive
+ * option and also whether the sock has been idle long enough to start probing.
+ */
+static void tcp_keepalive(unsigned long data)
+{
+ static int chain_start = 0;
+ int count = 0;
+ int i;
+
+ for(i = chain_start; i < (chain_start + ((TCP_HTABLE_SIZE/2) >> 2)); i++) {
+ struct sock *sk = tcp_established_hash[i];
+ while(sk) {
+ if(!atomic_read(&sk->sock_readers) && sk->keepopen) {
+ count += tcp_keepopen_proc(sk);
+ if(count == sysctl_tcp_max_ka_probes)
+ goto out;
+ }
+ sk = sk->next;
+ }
+ }
+out:
+ chain_start = ((chain_start + ((TCP_HTABLE_SIZE/2)>>2)) &
+ ((TCP_HTABLE_SIZE/2) - 1));
+}
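+
+/* Worked example of the scan above, assuming TCP_HTABLE_SIZE of 256:
+ * each firing scans (256/2) >> 2 == 32 chains and chain_start advances
+ * by the same amount modulo 128, so the keepalive-relevant half of the
+ * established hash is swept once every 4 firings of the slow timer.
+ */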
+
+/*
+ * The TCP retransmit timer. This lacks a few small details.
+ *
+ * 1. An initial rtt timeout on the probe0 should cause as much as we can
+ *    of the first write queue buffer to be split and sent.
+ * 2. On a 'major timeout' as defined by RFC 1122 we shouldn't report
+ *    ETIMEDOUT if we know an additional 'soft' error caused this;
+ *    tcp_err should save a 'soft error' for us.
+ *    [Unless someone has broken it, it does, except for one broken 2.0
+ *    case of a send when the route/device is directly unreachable,
+ *    where we error but should retry. - FIXME] [AC]
+ */
+
+void tcp_retransmit_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock*)data;
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ /* We are reset. We will send no more retransmits. */
+ if(sk->zapped) {
+ tcp_clear_xmit_timer(sk, TIME_RETRANS);
+ return;
+ }
+
+ if (atomic_read(&sk->sock_readers)) {
+ /* Try again later */
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20);
+ return;
+ }
+
+ /* Clear delay ack timer. */
+ tcp_clear_xmit_timer(sk, TIME_DACK);
+
+ /* Per RFC 2018, clear all 'sacked' flags in the retransmission queue:
+ * the receiver may have dropped out-of-order frames and we must
+ * send them out should this timer fire on us.
+ */
+ if(tp->sack_ok) {
+ struct sk_buff *skb = skb_peek(&sk->write_queue);
+
+ while((skb != NULL) &&
+ (skb != tp->send_head) &&
+ (skb != (struct sk_buff *)&sk->write_queue)) {
+ TCP_SKB_CB(skb)->sacked &=
+ ~(TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS);
+ skb = skb->next;
+ }
+ }
+
+ /* Retransmission. */
+ tp->retrans_head = NULL;
+ tp->rexmt_done = 0;
+ tp->fackets_out = 0;
+ tp->retrans_out = 0;
+ if (tp->retransmits == 0) {
+ /* Remember window where we lost:
+ * "one half of the current window but at least 2 segments"
+ *
+ * Here "current window" means the effective one, which
+ * means it must be an accurate representation of our current
+ * sending rate _and_ the snd_wnd.
+ */
+ tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+ tp->snd_cwnd_cnt = 0;
+ tp->snd_cwnd = 1;
+ }
+
+ tp->retransmits++;
+
+ tp->dup_acks = 0;
+ tp->high_seq = tp->snd_nxt;
+ tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
+
+ /* Increase the timeout each time we retransmit. Note that
+ * we do not increase the rtt estimate. rto is initialized
+ * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
+ * that doubling rto each time is the least we can get away with.
+ * In KA9Q, Karn uses this for the first few times, and then
+ * goes to quadratic. NetBSD doubles, but only goes up to *64,
+ * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
+ * defined in the protocol as the maximum possible RTT. I guess
+ * we'll have to use something other than TCP to talk to the
+ * University of Mars.
+ *
+ * PAWS allows us longer timeouts and large windows, so once
+ * implemented ftp to mars will work nicely. We will have to fix
+ * the 120 second clamps though!
+ */
+ tp->backoff++;
+ tp->rto = min(tp->rto << 1, 120*HZ);
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+
+ tcp_write_timeout(sk);
+}
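+
+/* Backoff sketch, assuming an initial rto of 3*HZ (the conventional
+ * TCP_TIMEOUT_INIT): successive expirations re-arm the timer at about
+ *
+ *	3, 6, 12, 24, 48, 96, 120, 120, ... seconds
+ *
+ * doubling each time until the 120 second clamp above takes hold.
+ */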
+
+/*
+ * Slow timer for SYN-RECV sockets
+ */
+
+/* This now scales very nicely. -DaveM */
+static void tcp_syn_recv_timer(unsigned long data)
+{
+ struct sock *sk;
+ unsigned long now = jiffies;
+ int i;
+
+ for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
+ sk = tcp_listening_hash[i];
+
+ while(sk) {
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ /* TCP_LISTEN is implied. */
+ if (!atomic_read(&sk->sock_readers) && tp->syn_wait_queue) {
+ struct open_request *prev = (struct open_request *)(&tp->syn_wait_queue);
+ struct open_request *req = tp->syn_wait_queue;
+ do {
+ struct open_request *conn;
+
+ conn = req;
+ req = req->dl_next;
+
+ if (conn->sk ||
+ ((long)(now - conn->expires)) <= 0) {
+ prev = conn;
+ continue;
+ }
+
+ tcp_synq_unlink(tp, conn, prev);
+ if (conn->retrans >= sysctl_tcp_retries1) {
+#ifdef TCP_DEBUG
+ printk(KERN_DEBUG "syn_recv: "
+ "too many retransmits\n");
+#endif
+ (*conn->class->destructor)(conn);
+ tcp_dec_slow_timer(TCP_SLT_SYNACK);
+ tp->syn_backlog--;
+ tcp_openreq_free(conn);
+
+ if (!tp->syn_wait_queue)
+ break;
+ } else {
+ unsigned long timeo;
+ struct open_request *op;
+
+ (*conn->class->rtx_syn_ack)(sk, conn);
+
+ conn->retrans++;
+#ifdef TCP_DEBUG
+ printk(KERN_DEBUG "syn_ack rtx %d\n",
+ conn->retrans);
+#endif
+ timeo = min((TCP_TIMEOUT_INIT
+ << conn->retrans),
+ 120*HZ);
+ conn->expires = now + timeo;
+ op = prev->dl_next;
+ tcp_synq_queue(tp, conn);
+ if (op != prev->dl_next)
+ prev = prev->dl_next;
+ }
+ /* old prev still valid here */
+ } while (req);
+ }
+ sk = sk->next;
+ }
+ }
+}
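+
+/* Note the expiry test used above (and again in tcp_sltimer_handler):
+ *
+ *	if ((long)(now - conn->expires) <= 0)
+ *		continue;	(not yet expired)
+ *
+ * Comparing jiffies via a signed difference stays correct across
+ * wraparound of the jiffies counter, where a plain `now < expires'
+ * comparison would not.
+ */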
+
+void tcp_sltimer_handler(unsigned long data)
+{
+ struct tcp_sl_timer *slt = tcp_slt_array;
+ unsigned long next = ~0UL;
+ unsigned long now = jiffies;
+ int i;
+
+ for (i=0; i < TCP_SLT_MAX; i++, slt++) {
+ if (atomic_read(&slt->count)) {
+ long trigger;
+
+ trigger = slt->period - ((long)(now - slt->last));
+
+ if (trigger <= 0) {
+ (*slt->handler)((unsigned long) slt);
+ slt->last = now;
+ trigger = slt->period;
+ }
+
+ /* Only reschedule if some events remain. */
+ if (atomic_read(&slt->count))
+ next = min(next, trigger);
+ }
+ }
+ if (next != ~0UL)
+ mod_timer(&tcp_slow_timer, (now + next));
+}
+
+void __tcp_inc_slow_timer(struct tcp_sl_timer *slt)
+{
+ unsigned long now = jiffies;
+ unsigned long when;
+
+ slt->last = now;
+
+ when = now + slt->period;
+
+ if (tcp_slow_timer.prev) {
+ if ((long)(tcp_slow_timer.expires - when) >= 0)
+ mod_timer(&tcp_slow_timer, when);
+ } else {
+ tcp_slow_timer.expires = when;
+ add_timer(&tcp_slow_timer);
+ }
+}
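+
+/* Note on the re-arm above: when the slow timer is already pending,
+ * mod_timer() is called only if the pending expiry is at or after
+ * `when', so arming a new slow-timer class can pull the shared timer
+ * earlier but never push it later; otherwise the timer is started
+ * afresh with add_timer().
+ */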