Diffstat (limited to 'pfinet.old/linux-src/net/ipv4/tcp_input.c~')
-rw-r--r-- | pfinet.old/linux-src/net/ipv4/tcp_input.c~ | 2449
1 files changed, 0 insertions, 2449 deletions
diff --git a/pfinet.old/linux-src/net/ipv4/tcp_input.c~ b/pfinet.old/linux-src/net/ipv4/tcp_input.c~
deleted file mode 100644
index c5095624..00000000
--- a/pfinet.old/linux-src/net/ipv4/tcp_input.c~
+++ /dev/null
@@ -1,2449 +0,0 @@
-/*
- * INET		An implementation of the TCP/IP protocol suite for the LINUX
- *		operating system.  INET is implemented using the BSD Socket
- *		interface as the means of communication with the user level.
- *
- *		Implementation of the Transmission Control Protocol(TCP).
- *
- * Version:	$Id: tcp_input.c,v 1.164.2.8 1999/09/23 19:21:23 davem Exp $
- *
- * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
- *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
- *		Mark Evans, <evansmp@uhura.aston.ac.uk>
- *		Corey Minyard <wf-rch!minyard@relay.EU.net>
- *		Florian La Roche, <flla@stud.uni-sb.de>
- *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
- *		Linus Torvalds, <torvalds@cs.helsinki.fi>
- *		Alan Cox, <gw4pts@gw4pts.ampr.org>
- *		Matthew Dillon, <dillon@apollo.west.oic.com>
- *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
- *		Jorge Cwik, <jorge@laser.satlink.net>
- */
-
-/*
- * Changes:
- *		Pedro Roque	:	Fast Retransmit/Recovery.
- *					Two receive queues.
- *					Retransmit queue handled by TCP.
- *					Better retransmit timer handling.
- *					New congestion avoidance.
- *					Header prediction.
- *					Variable renaming.
- *
- *		Eric		:	Fast Retransmit.
- *		Randy Scott	:	MSS option defines.
- *		Eric Schenk	:	Fixes to slow start algorithm.
- *		Eric Schenk	:	Yet another double ACK bug.
- *		Eric Schenk	:	Delayed ACK bug fixes.
- *		Eric Schenk	:	Floyd style fast retrans war avoidance.
- *		David S. Miller	:	Don't allow zero congestion window.
- *		Eric Schenk	:	Fix retransmitter so that it sends
- *					next packet on ack of previous packet.
- *		Andi Kleen	:	Moved open_request checking here
- *					and process RSTs for open_requests.
- *		Andi Kleen	:	Better prune_queue, and other fixes.
- *		Andrey Savochkin:	Fix RTT measurements in the presence of
- *					timestamps.
- *		Andrey Savochkin:	Check sequence numbers correctly when
- *					removing SACKs due to in sequence incoming
- *					data segments.
- *		Andi Kleen:		Make sure we never ack data for which there
- *					is not enough room. Also make this condition
- *					a fatal error if it might still happen.
- *		Andi Kleen:		Add tcp_measure_rcv_mss to make
- *					connections with MSS<min(MTU,ann. MSS)
- *					work without delayed acks.
- *		Andi Kleen:		Process packets with PSH set in the
- *					fast path.
- */
-
-#include <linux/config.h>
-#include <linux/mm.h>
-#include <linux/sysctl.h>
-#include <net/tcp.h>
-#include <linux/ipsec.h>
-
-#ifdef CONFIG_SYSCTL
-#define SYNC_INIT 0 /* let the user enable it */
-#else
-#define SYNC_INIT 1
-#endif
-
-extern int sysctl_tcp_fin_timeout;
-
-/* These are on by default so the code paths get tested.
- * For the final 2.2 this may be undone at our discretion. -DaveM
- */
-int sysctl_tcp_timestamps = 1;
-int sysctl_tcp_window_scaling = 1;
-int sysctl_tcp_sack = 1;
-
-int sysctl_tcp_syncookies = SYNC_INIT;
-int sysctl_tcp_stdurg;
-int sysctl_tcp_rfc1337;
-
-static int prune_queue(struct sock *sk);
-
-/* There is something which you must keep in mind when you analyze the
- * behavior of the tp->ato delayed ack timeout interval.  When a
- * connection starts up, we want to ack as quickly as possible.  The
- * problem is that "good" TCP's do slow start at the beginning of data
- * transmission.  This means that until we send the first few ACK's the
- * sender will sit on his end and only queue most of his data, because
- * he can only send snd_cwnd unacked packets at any given time.  For
- * each ACK we send, he increments snd_cwnd and transmits more of his
- * queue.  -DaveM
- */
-static void tcp_delack_estimator(struct tcp_opt *tp)
-{
-	if(tp->ato == 0) {
-		tp->lrcvtime = tcp_time_stamp;
-
-		/* Help sender leave slow start quickly,
-		 * and also makes sure we do not take this
-		 * branch ever again for this connection.
-		 */
-		tp->ato = 1;
-		tcp_enter_quickack_mode(tp);
-	} else {
-		int m = tcp_time_stamp - tp->lrcvtime;
-
-		tp->lrcvtime = tcp_time_stamp;
-		if(m <= 0)
-			m = 1;
-		if(m > tp->rto)
-			tp->ato = tp->rto;
-		else {
-			/* This funny shift makes sure we
-			 * clear the "quick ack mode" bit.
-			 */
-			tp->ato = ((tp->ato << 1) >> 2) + m;
-		}
-	}
-}
-
-/*
- * Remember to send an ACK later.
- */
-static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
-					struct sk_buff *skb)
-{
-	tp->delayed_acks++;
-
-	/* Tiny-grams with PSH set artificially deflate our
-	 * ato measurement, but with a lower bound.
-	 */
-	if(th->psh && (skb->len < (tp->mss_cache >> 1))) {
-		/* Preserve the quickack state. */
-		if((tp->ato & 0x7fffffff) > HZ/50)
-			tp->ato = ((tp->ato & 0x80000000) |
-				   (HZ/50));
-	}
-}
-
-/* Called to compute a smoothed rtt estimate. The data fed to this
- * routine either comes from timestamps, or from segments that were
- * known _not_ to have been retransmitted [see Karn/Partridge
- * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
- * piece by Van Jacobson.
- * NOTE: the next three routines used to be one big routine.
- * To save cycles in the RFC 1323 implementation it was better to break
- * it up into three procedures. -- erics
- */
-static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
-{
-	long m = mrtt; /* RTT */
-
-	/* The following amusing code comes from Jacobson's
-	 * article in SIGCOMM '88. Note that rtt and mdev
-	 * are scaled versions of rtt and mean deviation.
-	 * This is designed to be as fast as possible
-	 * m stands for "measurement".
-	 *
-	 * In a 1990 paper the rto value is changed to:
-	 * RTO = rtt + 4 * mdev
-	 */
-	if(m == 0)
-		m = 1;
-	if (tp->srtt != 0) {
-		m -= (tp->srtt >> 3);	/* m is now error in rtt est */
-		tp->srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
-		if (m < 0)
-			m = -m;		/* m is now abs(error) */
-		m -= (tp->mdev >> 2);	/* similar update on mdev */
-		tp->mdev += m;		/* mdev = 3/4 mdev + 1/4 new */
-	} else {
-		/* no previous measure. */
-		tp->srtt = m<<3;	/* take the measured time to be rtt */
-		tp->mdev = m<<2;	/* make sure rto = 3*rtt */
-	}
-}
-
-/* Calculate rto without backoff. This is the second half of Van Jacobson's
- * routine referred to above.
- */
-static __inline__ void tcp_set_rto(struct tcp_opt *tp)
-{
-	tp->rto = (tp->srtt >> 3) + tp->mdev;
-	tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
-}
-
-/* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
- * on packet lifetime in the internet. We need the HZ/5 lower
- * bound to behave correctly against BSD stacks with a fixed
- * delayed ack.
- * FIXME: It's not entirely clear this lower bound is the best
- * way to avoid the problem. Is it possible to drop the lower
- * bound and still avoid trouble with BSD stacks? Perhaps
- * some modification to the RTO calculation that takes delayed
- * ack bias into account? This needs serious thought.
-- erics - */ -static __inline__ void tcp_bound_rto(struct tcp_opt *tp) -{ - if (tp->rto > 120*HZ) - tp->rto = 120*HZ; - if (tp->rto < HZ/5) - tp->rto = HZ/5; -} - -/* WARNING: this must not be called if tp->saw_timestamp was false. */ -extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, - __u32 start_seq, __u32 end_seq) -{ - /* It is start_seq <= last_ack_seq combined - with in window check. If start_seq<=last_ack_seq<=rcv_nxt, - then segment is in window if end_seq>=rcv_nxt. - */ - if (!after(start_seq, tp->last_ack_sent) && - !before(end_seq, tp->rcv_nxt)) { - /* PAWS bug workaround wrt. ACK frames, the PAWS discard - * extra check below makes sure this can only happen - * for pure ACK frames. -DaveM - * - * Plus: expired timestamps. - * - * Plus: resets failing PAWS. - */ - if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) { - tp->ts_recent = tp->rcv_tsval; - tp->ts_recent_stamp = tcp_time_stamp; - } - } -} - -#define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24) - -extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len) -{ - return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 && - (s32)(tcp_time_stamp - tp->ts_recent_stamp) < PAWS_24DAYS && - /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */ - len != (th->doff * 4)); -} - - -static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) -{ - u32 end_window = tp->rcv_wup + tp->rcv_wnd; - - if (tp->rcv_wnd && - after(end_seq, tp->rcv_nxt) && - before(seq, end_window)) - return 1; - if (seq != end_window) - return 0; - return (seq == end_seq); -} - -/* This functions checks to see if the tcp header is actually acceptable. */ -extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) -{ - if (seq == tp->rcv_nxt) - return (tp->rcv_wnd || (end_seq == seq)); - - return __tcp_sequence(tp, seq, end_seq); -} - -/* When we get a reset we do this. */ -static void tcp_reset(struct sock *sk) -{ - sk->zapped = 1; - - /* We want the right error as BSD sees it (and indeed as we do). */ - switch (sk->state) { - case TCP_SYN_SENT: - sk->err = ECONNREFUSED; - break; - case TCP_CLOSE_WAIT: - sk->err = EPIPE; - break; - default: - sk->err = ECONNRESET; - }; - tcp_set_state(sk, TCP_CLOSE); - sk->shutdown = SHUTDOWN_MASK; - if (!sk->dead) - sk->state_change(sk); -} - -/* This tags the retransmission queue when SACKs arrive. */ -static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int i = nsacks; - - while(i--) { - struct sk_buff *skb = skb_peek(&sk->write_queue); - __u32 start_seq = ntohl(sp->start_seq); - __u32 end_seq = ntohl(sp->end_seq); - int fack_count = 0; - - while((skb != NULL) && - (skb != tp->send_head) && - (skb != (struct sk_buff *)&sk->write_queue)) { - /* The retransmission queue is always in order, so - * we can short-circuit the walk early. - */ - if(after(TCP_SKB_CB(skb)->seq, end_seq)) - break; - - /* We play conservative, we don't allow SACKS to partially - * tag a sequence space. - */ - fack_count++; - if(!after(start_seq, TCP_SKB_CB(skb)->seq) && - !before(end_seq, TCP_SKB_CB(skb)->end_seq)) { - /* If this was a retransmitted frame, account for it. */ - if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) && - tp->retrans_out) - tp->retrans_out--; - TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; - - /* RULE: All new SACKs will either decrease retrans_out - * or advance fackets_out. 
- */ - if(fack_count > tp->fackets_out) - tp->fackets_out = fack_count; - } - skb = skb->next; - } - sp++; /* Move on to the next SACK block. */ - } -} - -/* Look for tcp options. Normally only called on SYN and SYNACK packets. - * But, this can also be called on packets in the established flow when - * the fast version below fails. - */ -void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy) -{ - unsigned char *ptr; - int length=(th->doff*4)-sizeof(struct tcphdr); - int saw_mss = 0; - - ptr = (unsigned char *)(th + 1); - tp->saw_tstamp = 0; - - while(length>0) { - int opcode=*ptr++; - int opsize; - - switch (opcode) { - case TCPOPT_EOL: - return; - case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ - length--; - continue; - default: - opsize=*ptr++; - if (opsize < 2) /* "silly options" */ - return; - if (opsize > length) - break; /* don't parse partial options */ - switch(opcode) { - case TCPOPT_MSS: - if(opsize==TCPOLEN_MSS && th->syn) { - u16 in_mss = ntohs(*(__u16 *)ptr); - if (in_mss == 0) - in_mss = 536; - if (tp->mss_clamp > in_mss) - tp->mss_clamp = in_mss; - saw_mss = 1; - } - break; - case TCPOPT_WINDOW: - if(opsize==TCPOLEN_WINDOW && th->syn) - if (!no_fancy && sysctl_tcp_window_scaling) { - tp->wscale_ok = 1; - tp->snd_wscale = *(__u8 *)ptr; - if(tp->snd_wscale > 14) { - if(net_ratelimit()) - printk("tcp_parse_options: Illegal window " - "scaling value %d >14 received.", - tp->snd_wscale); - tp->snd_wscale = 14; - } - } - break; - case TCPOPT_TIMESTAMP: - if(opsize==TCPOLEN_TIMESTAMP) { - if (sysctl_tcp_timestamps && !no_fancy) { - tp->tstamp_ok = 1; - tp->saw_tstamp = 1; - tp->rcv_tsval = ntohl(*(__u32 *)ptr); - tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); - } - } - break; - case TCPOPT_SACK_PERM: - if(opsize==TCPOLEN_SACK_PERM && th->syn) { - if (sysctl_tcp_sack && !no_fancy) { - tp->sack_ok = 1; - tp->num_sacks = 0; - } - } - break; - - case TCPOPT_SACK: - if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && - sysctl_tcp_sack && (sk != NULL) && !th->syn) { - int sack_bytes = opsize - TCPOLEN_SACK_BASE; - - if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) { - int num_sacks = sack_bytes >> 3; - struct tcp_sack_block *sackp; - - sackp = (struct tcp_sack_block *)ptr; - tcp_sacktag_write_queue(sk, sackp, num_sacks); - } - } - }; - ptr+=opsize-2; - length-=opsize; - }; - } - if(th->syn && saw_mss == 0) - tp->mss_clamp = 536; -} - -/* Fast parse options. This hopes to only see timestamps. - * If it is wrong it falls back on tcp_parse_options(). - */ -static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp) -{ - /* If we didn't send out any options ignore them all. */ - if (tp->tcp_header_len == sizeof(struct tcphdr)) - return 0; - if (th->doff == sizeof(struct tcphdr)>>2) { - tp->saw_tstamp = 0; - return 0; - } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { - __u32 *ptr = (__u32 *)(th + 1); - if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { - tp->saw_tstamp = 1; - tp->rcv_tsval = ntohl(*++ptr); - tp->rcv_tsecr = ntohl(*++ptr); - return 1; - } - } - tcp_parse_options(sk, th, tp, 0); - return 1; -} - -#define FLAG_DATA 0x01 /* Incoming frame contained data. */ -#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ -#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ -#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. 
*/ - -static __inline__ void clear_fast_retransmit(struct tcp_opt *tp) -{ - if (tp->dup_acks > 3) - tp->snd_cwnd = (tp->snd_ssthresh); - - tp->dup_acks = 0; -} - -/* NOTE: This code assumes that tp->dup_acks gets cleared when a - * retransmit timer fires. - */ -static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* Note: If not_dup is set this implies we got a - * data carrying packet or a window update. - * This carries no new information about possible - * lost packets, so we have to ignore it for the purposes - * of counting duplicate acks. Ideally this does not imply we - * should stop our fast retransmit phase, more acks may come - * later without data to help us. Unfortunately this would make - * the code below much more complex. For now if I see such - * a packet I clear the fast retransmit phase. - */ - if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) { - /* This is the standard reno style fast retransmit branch. */ - - /* 1. When the third duplicate ack is received, set ssthresh - * to one half the current congestion window, but no less - * than two segments. Retransmit the missing segment. - */ - if (tp->high_seq == 0 || after(ack, tp->high_seq)) { - tp->dup_acks++; - if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) { - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); - tp->snd_cwnd = (tp->snd_ssthresh + 3); - tp->high_seq = tp->snd_nxt; - if(!tp->fackets_out) - tcp_retransmit_skb(sk, - skb_peek(&sk->write_queue)); - else - tcp_fack_retransmit(sk); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } - } else if (++tp->dup_acks > 3) { - /* 2. Each time another duplicate ACK arrives, increment - * cwnd by the segment size. [...] Transmit a packet... - * - * Packet transmission will be done on normal flow processing - * since we're not in "retransmit mode". We do not use - * duplicate ACKs to artificially inflate the congestion - * window when doing FACK. - */ - if(!tp->fackets_out) { - tp->snd_cwnd++; - } else { - /* Fill any further holes which may have - * appeared. - * - * We may want to change this to run every - * further multiple-of-3 dup ack increments, - * to be more robust against out-of-order - * packet delivery. -DaveM - */ - tcp_fack_retransmit(sk); - } - } - } else if (tp->high_seq != 0) { - /* In this branch we deal with clearing the Floyd style - * block on duplicate fast retransmits, and if requested - * we do Hoe style secondary fast retransmits. - */ - if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) { - /* Once we have acked all the packets up to high_seq - * we are done this fast retransmit phase. - * Alternatively data arrived. In this case we - * Have to abort the fast retransmit attempt. - * Note that we do want to accept a window - * update since this is expected with Hoe's algorithm. - */ - clear_fast_retransmit(tp); - - /* After we have cleared up to high_seq we can - * clear the Floyd style block. - */ - if (!before(ack, tp->high_seq)) { - tp->high_seq = 0; - tp->fackets_out = 0; - } - } else if (tp->dup_acks >= 3) { - if (!tp->fackets_out) { - /* Hoe Style. We didn't ack the whole - * window. Take this as a cue that - * another packet was lost and retransmit it. - * Don't muck with the congestion window here. - * Note that we have to be careful not to - * act if this was a window update and it - * didn't ack new data, since this does - * not indicate a packet left the system. 
- * We can test this by just checking - * if ack changed from snd_una, since - * the only way to get here without advancing - * from snd_una is if this was a window update. - */ - if (ack != tp->snd_una && before(ack, tp->high_seq)) { - tcp_retransmit_skb(sk, - skb_peek(&sk->write_queue)); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } - } else { - /* FACK style, fill any remaining holes in - * receiver's queue. - */ - tcp_fack_retransmit(sk); - } - } - } -} - -/* This is Jacobson's slow start and congestion avoidance. - * SIGCOMM '88, p. 328. - */ -static __inline__ void tcp_cong_avoid(struct tcp_opt *tp) -{ - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - tp->snd_cwnd++; - tp->snd_cwnd_cnt=0; - } else - tp->snd_cwnd_cnt++; - } -} - -/* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, - __u32 *seq, __u32 *seq_rtt) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff *skb; - __u32 now = tcp_time_stamp; - int acked = 0; - - /* If we are retransmitting, and this ACK clears up to - * the retransmit head, or further, then clear our state. - */ - if (tp->retrans_head != NULL && - !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq)) - tp->retrans_head = NULL; - - while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { - struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - __u8 sacked = scb->sacked; - - /* If our packet is before the ack sequence we can - * discard it as it's confirmed to have arrived at - * the other end. - */ - if (after(scb->end_seq, ack)) - break; - - /* Initial outgoing SYN's get put onto the write_queue - * just like anything else we transmit. It is not - * true data, and if we misinform our callers that - * this ACK acks real data, we will erroneously exit - * connection startup slow start one packet too - * quickly. This is severely frowned upon behavior. - */ - if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out) - tp->retrans_out--; - if(!(scb->flags & TCPCB_FLAG_SYN)) { - acked |= FLAG_DATA_ACKED; - if(sacked & TCPCB_SACKED_RETRANS) - acked |= FLAG_RETRANS_DATA_ACKED; - if(tp->fackets_out) - tp->fackets_out--; - } else { - /* This is pure paranoia. */ - tp->retrans_head = NULL; - } - tp->packets_out--; - *seq = scb->seq; - *seq_rtt = now - scb->when; - __skb_unlink(skb, skb->list); - kfree_skb(skb); - } - return acked; -} - -static void tcp_ack_probe(struct sock *sk, __u32 ack) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* Our probe was answered. */ - tp->probes_out = 0; - - /* Was it a usable window open? */ - - /* should always be non-null */ - if (tp->send_head != NULL && - !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) { - tp->backoff = 0; - tp->pending = 0; - tcp_clear_xmit_timer(sk, TIME_PROBE0); - } else { - tcp_reset_xmit_timer(sk, TIME_PROBE0, - min(tp->rto << tp->backoff, 120*HZ)); - } -} - -/* Should we open up the congestion window? */ -static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag) -{ - /* Data must have been acked. */ - if ((flag & FLAG_DATA_ACKED) == 0) - return 0; - - /* Some of the data acked was retransmitted somehow? */ - if ((flag & FLAG_RETRANS_DATA_ACKED) != 0) { - /* We advance in all cases except during - * non-FACK fast retransmit/recovery. 
- */ - if (tp->fackets_out != 0 || - tp->retransmits != 0) - return 1; - - /* Non-FACK fast retransmit does it's own - * congestion window management, don't get - * in the way. - */ - return 0; - } - - /* New non-retransmitted data acked, always advance. */ - return 1; -} - -/* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Superceeds RFC1323) - */ -static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, - u32 seq, u32 ack, int flag) -{ - __u32 seq_rtt; - - /* RTTM Rule: A TSecr value received in a segment is used to - * update the averaged RTT measurement only if the segment - * acknowledges some new data, i.e., only if it advances the - * left edge of the send window. - * - * See draft-ietf-tcplw-high-performance-00, section 3.3. - * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> - */ - if (!(flag & FLAG_DATA_ACKED)) - return; - - seq_rtt = tcp_time_stamp - tp->rcv_tsecr; - tcp_rtt_estimator(tp, seq_rtt); - if (tp->retransmits) { - if (tp->packets_out == 0) { - tp->retransmits = 0; - tp->fackets_out = 0; - tp->retrans_out = 0; - tp->backoff = 0; - tcp_set_rto(tp); - } else { - /* Still retransmitting, use backoff */ - tcp_set_rto(tp); - tp->rto = tp->rto << tp->backoff; - } - } else { - tcp_set_rto(tp); - } - - tcp_bound_rto(tp); -} - -static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) -{ - struct sk_buff *skb = skb_peek(&sk->write_queue); - - /* Some data was ACK'd, if still retransmitting (due to a - * timeout), resend more of the retransmit queue. The - * congestion window is handled properly by that code. - */ - if (tp->retransmits) { - tcp_xmit_retransmit_queue(sk); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } else { - __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when); - if ((__s32)when < 0) - when = 1; - tcp_reset_xmit_timer(sk, TIME_RETRANS, when); - } -} - -/* This routine deals with incoming acks, but not outgoing ones. */ -static int tcp_ack(struct sock *sk, struct tcphdr *th, - u32 ack_seq, u32 ack, int len) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int flag = 0; - u32 seq = 0; - u32 seq_rtt = 0; - - if(sk->zapped) - return(1); /* Dead, can't ack any more so why bother */ - - if (tp->pending == TIME_KEEPOPEN) - tp->probes_out = 0; - - tp->rcv_tstamp = tcp_time_stamp; - - /* If the ack is newer than sent or older than previous acks - * then we can probably ignore it. - */ - if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una)) - goto uninteresting_ack; - - /* If there is data set flag 1 */ - if (len != th->doff*4) { - flag |= FLAG_DATA; - tcp_delack_estimator(tp); - } - - /* Update our send window. */ - - /* This is the window update code as per RFC 793 - * snd_wl{1,2} are used to prevent unordered - * segments from shrinking the window - */ - if (before(tp->snd_wl1, ack_seq) || - (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) { - u32 nwin = ntohs(th->window) << tp->snd_wscale; - - if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) { - flag |= FLAG_WIN_UPDATE; - tp->snd_wnd = nwin; - - tp->snd_wl1 = ack_seq; - tp->snd_wl2 = ack; - - if (nwin > tp->max_window) - tp->max_window = nwin; - } - } - - /* We passed data and got it acked, remove any soft error - * log. Something worked... - */ - sk->err_soft = 0; - - /* If this ack opens up a zero window, clear backoff. It was - * being used to time the probes, and is probably far higher than - * it needs to be for normal retransmission. 
- */ - if (tp->pending == TIME_PROBE0) - tcp_ack_probe(sk, ack); - - /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); - - /* We must do this here, before code below clears out important - * state contained in tp->fackets_out and tp->retransmits. -DaveM - */ - if (should_advance_cwnd(tp, flag)) - tcp_cong_avoid(tp); - - /* If we have a timestamp, we always do rtt estimates. */ - if (tp->saw_tstamp) { - tcp_ack_saw_tstamp(sk, tp, seq, ack, flag); - } else { - /* If we were retransmiting don't count rtt estimate. */ - if (tp->retransmits) { - if (tp->packets_out == 0) { - tp->retransmits = 0; - tp->fackets_out = 0; - tp->retrans_out = 0; - } - } else { - /* We don't have a timestamp. Can only use - * packets that are not retransmitted to determine - * rtt estimates. Also, we must not reset the - * backoff for rto until we get a non-retransmitted - * packet. This allows us to deal with a situation - * where the network delay has increased suddenly. - * I.e. Karn's algorithm. (SIGCOMM '87, p5.) - */ - if (flag & FLAG_DATA_ACKED) { - if(!(flag & FLAG_RETRANS_DATA_ACKED)) { - tp->backoff = 0; - tcp_rtt_estimator(tp, seq_rtt); - tcp_set_rto(tp); - tcp_bound_rto(tp); - } - } - } - } - - if (tp->packets_out) { - if (flag & FLAG_DATA_ACKED) - tcp_ack_packets_out(sk, tp); - } else { - tcp_clear_xmit_timer(sk, TIME_RETRANS); - } - - flag &= (FLAG_DATA | FLAG_WIN_UPDATE); - if ((ack == tp->snd_una && tp->packets_out && flag == 0) || - (tp->high_seq != 0)) { - tcp_fast_retrans(sk, ack, flag); - } else { - /* Clear any aborted fast retransmit starts. */ - tp->dup_acks = 0; - } - /* It is not a brain fart, I thought a bit now. 8) - * - * Forward progress is indicated, if: - * 1. the ack acknowledges new data. - * 2. or the ack is duplicate, but it is caused by new segment - * arrival. This case is filtered by: - * - it contains no data, syn or fin. - * - it does not update window. - * 3. or new SACK. It is difficult to check, so that we ignore it. - * - * Forward progress is also indicated by arrival new data, - * which was caused by window open from our side. This case is more - * difficult and it is made (alas, incorrectly) in tcp_data_queue(). - * --ANK (990513) - */ - if (ack != tp->snd_una || (flag == 0 && !th->fin)) - dst_confirm(sk->dst_cache); - - /* Remember the highest ack received. */ - tp->snd_una = ack; - return 1; - -uninteresting_ack: - SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt); - return 0; -} - -/* New-style handling of TIME_WAIT sockets. */ -extern void tcp_tw_schedule(struct tcp_tw_bucket *tw); -extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw); -extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw); - -void tcp_timewait_kill(struct tcp_tw_bucket *tw) -{ - struct tcp_bind_bucket *tb = tw->tb; - - /* Disassociate with bind bucket. */ - if(tw->bind_next) - tw->bind_next->bind_pprev = tw->bind_pprev; - *(tw->bind_pprev) = tw->bind_next; - if (tb->owners == NULL) { - if (tb->next) - tb->next->pprev = tb->pprev; - *(tb->pprev) = tb->next; - kmem_cache_free(tcp_bucket_cachep, tb); - } - - /* Unlink from established hashes. */ - if(tw->next) - tw->next->pprev = tw->pprev; - *tw->pprev = tw->next; - - /* We decremented the prot->inuse count when we entered TIME_WAIT - * and the sock from which this came was destroyed. - */ - tw->sklist_next->sklist_prev = tw->sklist_prev; - tw->sklist_prev->sklist_next = tw->sklist_next; - - /* Ok, now free it up. 
*/ - kmem_cache_free(tcp_timewait_cachep, tw); -} - -/* We come here as a special case from the AF specific TCP input processing, - * and the SKB has no owner. Essentially handling this is very simple, - * we just keep silently eating rx'd packets, acking them if necessary, - * until none show up for the entire timeout period. - * - * Return 0, TCP_TW_ACK, TCP_TW_RST - */ -enum tcp_tw_status -tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, - struct tcphdr *th, unsigned len) -{ - /* RFC 1122: - * "When a connection is [...] on TIME-WAIT state [...] - * [a TCP] MAY accept a new SYN from the remote TCP to - * reopen the connection directly, if it: - * - * (1) assigns its initial sequence number for the new - * connection to be larger than the largest sequence - * number it used on the previous connection incarnation, - * and - * - * (2) returns to TIME-WAIT state if the SYN turns out - * to be an old duplicate". - */ - if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) { - struct sock *sk; - struct tcp_func *af_specific = tw->af_specific; - __u32 isn; - - isn = tw->snd_nxt + 128000; - if(isn == 0) - isn++; - tcp_tw_deschedule(tw); - tcp_timewait_kill(tw); - sk = af_specific->get_sock(skb, th); - if(sk == NULL || - !ipsec_sk_policy(sk,skb) || - atomic_read(&sk->sock_readers) != 0) - return 0; - skb_set_owner_r(skb, sk); - af_specific = sk->tp_pinfo.af_tcp.af_specific; - if(af_specific->conn_request(sk, skb, isn) < 0) - return TCP_TW_RST; /* Toss a reset back. */ - return 0; /* Discard the frame. */ - } - - /* Check RST or SYN */ - if(th->rst || th->syn) { - /* This is TIME_WAIT assasination, in two flavors. - * Oh well... nobody has a sufficient solution to this - * protocol bug yet. - */ - if(sysctl_tcp_rfc1337 == 0) { - tcp_tw_deschedule(tw); - tcp_timewait_kill(tw); - } - if(!th->rst) - return TCP_TW_RST; /* toss a reset back */ - return 0; - } else { - /* In this case we must reset the TIMEWAIT timer. */ - if(th->ack) - tcp_tw_reschedule(tw); - } - /* Ack old packets if necessary */ - if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt) && - (th->doff * 4) > len) - return TCP_TW_ACK; - return 0; -} - -/* Enter the time wait state. This is always called from BH - * context. Essentially we whip up a timewait bucket, copy the - * relevant info into it from the SK, and mess with hash chains - * and list linkage. - */ -static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) -{ - struct sock **head, *sktw; - - /* Step 1: Remove SK from established hash. */ - if(sk->next) - sk->next->pprev = sk->pprev; - *sk->pprev = sk->next; - sk->pprev = NULL; - tcp_reg_zap(sk); - - /* Step 2: Put TW into bind hash where SK was. */ - tw->tb = (struct tcp_bind_bucket *)sk->prev; - if((tw->bind_next = sk->bind_next) != NULL) - sk->bind_next->bind_pprev = &tw->bind_next; - tw->bind_pprev = sk->bind_pprev; - *sk->bind_pprev = (struct sock *)tw; - sk->prev = NULL; - - /* Step 3: Same for the protocol sklist. */ - (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw; - (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw; - sk->sklist_next = NULL; - sk->prot->inuse--; - - /* Step 4: Hash TW into TIMEWAIT half of established hash table. 
*/ - head = &tcp_ehash[sk->hashent + (tcp_ehash_size/2)]; - sktw = (struct sock *)tw; - if((sktw->next = *head) != NULL) - (*head)->pprev = &sktw->next; - *head = sktw; - sktw->pprev = head; -} - -void tcp_time_wait(struct sock *sk) -{ - struct tcp_tw_bucket *tw; - - tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); - if(tw != NULL) { - /* Give us an identity. */ - tw->daddr = sk->daddr; - tw->rcv_saddr = sk->rcv_saddr; - tw->bound_dev_if= sk->bound_dev_if; - tw->num = sk->num; - tw->state = TCP_TIME_WAIT; - tw->sport = sk->sport; - tw->dport = sk->dport; - tw->family = sk->family; - tw->reuse = sk->reuse; - tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt; - tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt; - tw->window = tcp_select_window(sk); - tw->af_specific = sk->tp_pinfo.af_tcp.af_specific; - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - if(tw->family == PF_INET6) { - memcpy(&tw->v6_daddr, - &sk->net_pinfo.af_inet6.daddr, - sizeof(struct in6_addr)); - memcpy(&tw->v6_rcv_saddr, - &sk->net_pinfo.af_inet6.rcv_saddr, - sizeof(struct in6_addr)); - } -#endif - /* Linkage updates. */ - tcp_tw_hashdance(sk, tw); - - /* Get the TIME_WAIT timeout firing. */ - tcp_tw_schedule(tw); - - /* CLOSE the SK. */ - if(sk->state == TCP_ESTABLISHED) - tcp_statistics.TcpCurrEstab--; - sk->state = TCP_CLOSE; - net_reset_timer(sk, TIME_DONE, - min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME)); - } else { - /* Sorry, we're out of memory, just CLOSE this - * socket up. We've got bigger problems than - * non-graceful socket closings. - */ - tcp_set_state(sk, TCP_CLOSE); - } - - /* Prevent rcvmsg/sndmsg calls, and wake people up. */ - sk->shutdown = SHUTDOWN_MASK; - if(!sk->dead) - sk->state_change(sk); -} - -/* - * Process the FIN bit. This now behaves as it is supposed to work - * and the FIN takes effect when it is validly part of sequence - * space. Not before when we get holes. - * - * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT - * (and thence onto LAST-ACK and finally, CLOSE, we never enter - * TIME-WAIT) - * - * If we are in FINWAIT-1, a received FIN indicates simultaneous - * close and we go into CLOSING (and later onto TIME-WAIT) - * - * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. - */ - -static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) -{ - sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq; - - tcp_send_ack(sk); - - if (!sk->dead) { - sk->state_change(sk); - sock_wake_async(sk->socket, 1); - } - - switch(sk->state) { - case TCP_SYN_RECV: - case TCP_ESTABLISHED: - /* Move to CLOSE_WAIT */ - tcp_set_state(sk, TCP_CLOSE_WAIT); - if (th->rst) - sk->shutdown = SHUTDOWN_MASK; - break; - - case TCP_CLOSE_WAIT: - case TCP_CLOSING: - /* Received a retransmission of the FIN, do - * nothing. - */ - break; - case TCP_LAST_ACK: - /* RFC793: Remain in the LAST-ACK state. */ - break; - - case TCP_FIN_WAIT1: - /* This case occurs when a simultaneous close - * happens, we must ack the received FIN and - * enter the CLOSING state. - * - * This causes a WRITE timeout, which will either - * move on to TIME_WAIT when we timeout, or resend - * the FIN properly (maybe we get rid of that annoying - * FIN lost hang). The TIME_WRITE code is already - * correct for handling this timeout. - */ - tcp_set_state(sk, TCP_CLOSING); - break; - case TCP_FIN_WAIT2: - /* Received a FIN -- send ACK and enter TIME_WAIT. 
*/ - tcp_time_wait(sk); - break; - default: - /* Only TCP_LISTEN and TCP_CLOSE are left, in these - * cases we should never reach this piece of code. - */ - printk("tcp_fin: Impossible, sk->state=%d\n", sk->state); - break; - }; -} - -/* These routines update the SACK block as out-of-order packets arrive or - * in-order packets close up the sequence space. - */ -static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp) -{ - int this_sack, num_sacks = tp->num_sacks; - struct tcp_sack_block *swalk = &tp->selective_acks[0]; - - /* If more than one SACK block, see if the recent change to SP eats into - * or hits the sequence space of other SACK blocks, if so coalesce. - */ - if(num_sacks != 1) { - for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) { - if(swalk == sp) - continue; - - /* First case, bottom of SP moves into top of the - * sequence space of SWALK. - */ - if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) { - sp->start_seq = swalk->start_seq; - goto coalesce; - } - /* Second case, top of SP moves into bottom of the - * sequence space of SWALK. - */ - if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) { - sp->end_seq = swalk->end_seq; - goto coalesce; - } - } - } - /* SP is the only SACK, or no coalescing cases found. */ - return; - -coalesce: - /* Zap SWALK, by moving every further SACK up by one slot. - * Decrease num_sacks. - */ - for(; this_sack < num_sacks-1; this_sack++, swalk++) { - struct tcp_sack_block *next = (swalk + 1); - swalk->start_seq = next->start_seq; - swalk->end_seq = next->end_seq; - } - tp->num_sacks--; -} - -static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) -{ - __u32 tmp; - - tmp = sack1->start_seq; - sack1->start_seq = sack2->start_seq; - sack2->start_seq = tmp; - - tmp = sack1->end_seq; - sack1->end_seq = sack2->end_seq; - sack2->end_seq = tmp; -} - -static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct tcp_sack_block *sp = &tp->selective_acks[0]; - int cur_sacks = tp->num_sacks; - - if (!cur_sacks) - goto new_sack; - - /* Optimize for the common case, new ofo frames arrive - * "in order". ;-) This also satisfies the requirements - * of RFC2018 about ordering of SACKs. - */ - if(sp->end_seq == TCP_SKB_CB(skb)->seq) { - sp->end_seq = TCP_SKB_CB(skb)->end_seq; - tcp_sack_maybe_coalesce(tp, sp); - } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) { - /* Re-ordered arrival, in this case, can be optimized - * as well. - */ - sp->start_seq = TCP_SKB_CB(skb)->seq; - tcp_sack_maybe_coalesce(tp, sp); - } else { - struct tcp_sack_block *swap = sp + 1; - int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4); - - /* Oh well, we have to move things around. - * Try to find a SACK we can tack this onto. - */ - - for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) { - if((swap->end_seq == TCP_SKB_CB(skb)->seq) || - (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) { - if(swap->end_seq == TCP_SKB_CB(skb)->seq) - swap->end_seq = TCP_SKB_CB(skb)->end_seq; - else - swap->start_seq = TCP_SKB_CB(skb)->seq; - tcp_sack_swap(sp, swap); - tcp_sack_maybe_coalesce(tp, sp); - return; - } - } - - /* Could not find an adjacent existing SACK, build a new one, - * put it at the front, and shift everyone else down. We - * always know there is at least one SACK present already here. - * - * If the sack array is full, forget about the last one. 
- */ - if (cur_sacks >= max_sacks) { - cur_sacks--; - tp->num_sacks--; - } - while(cur_sacks >= 1) { - struct tcp_sack_block *this = &tp->selective_acks[cur_sacks]; - struct tcp_sack_block *prev = (this - 1); - this->start_seq = prev->start_seq; - this->end_seq = prev->end_seq; - cur_sacks--; - } - - new_sack: - /* Build the new head SACK, and we're done. */ - sp->start_seq = TCP_SKB_CB(skb)->seq; - sp->end_seq = TCP_SKB_CB(skb)->end_seq; - tp->num_sacks++; - } -} - -static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb) -{ - struct tcp_sack_block *sp = &tp->selective_acks[0]; - int num_sacks = tp->num_sacks; - int this_sack; - - /* This is an in order data segment _or_ an out-of-order SKB being - * moved to the receive queue, so we know this removed SKB will eat - * from the front of a SACK. - */ - for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { - /* Check if the start of the sack is covered by skb. */ - if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) && - before(sp->start_seq, TCP_SKB_CB(skb)->end_seq)) - break; - } - - /* This should only happen if so many SACKs get built that some get - * pushed out before we get here, or we eat some in sequence packets - * which are before the first SACK block. - */ - if(this_sack >= num_sacks) - return; - - sp->start_seq = TCP_SKB_CB(skb)->end_seq; - if(!before(sp->start_seq, sp->end_seq)) { - /* Zap this SACK, by moving forward any other SACKS. */ - for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) { - struct tcp_sack_block *next = (sp + 1); - sp->start_seq = next->start_seq; - sp->end_seq = next->end_seq; - } - tp->num_sacks--; - } -} - -static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb) -{ - struct tcp_sack_block *sp = &tp->selective_acks[0]; - int num_sacks = tp->num_sacks; - int this_sack; - - for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { - if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq) - break; - } - if(this_sack >= num_sacks) - return; - sp->end_seq = TCP_SKB_CB(new_skb)->end_seq; -} - -/* This one checks to see if we can put data from the - * out_of_order queue into the receive_queue. - */ -static void tcp_ofo_queue(struct sock *sk) -{ - struct sk_buff *skb; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - while ((skb = skb_peek(&tp->out_of_order_queue))) { - if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) - break; - - if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { - SOCK_DEBUG(sk, "ofo packet was already received \n"); - __skb_unlink(skb, skb->list); - kfree_skb(skb); - continue; - } - SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq); - - if(tp->sack_ok) - tcp_sack_remove_skb(tp, skb); - __skb_unlink(skb, skb->list); - __skb_queue_tail(&sk->receive_queue, skb); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->h.th->fin) - tcp_fin(skb, sk, skb->h.th); - } -} - -static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) -{ - struct sk_buff *skb1; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* Queue data for delivery to the user. - * Packets in sequence go to the receive queue. - * Out of sequence packets to the out_of_order_queue. - */ - if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - /* Ok. In sequence. 
*/ - queue_and_out: - dst_confirm(sk->dst_cache); - __skb_queue_tail(&sk->receive_queue, skb); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->h.th->fin) { - tcp_fin(skb, sk, skb->h.th); - } else { - tcp_remember_ack(tp, skb->h.th, skb); - } - /* This may have eaten into a SACK block. */ - if(tp->sack_ok && tp->num_sacks) - tcp_sack_remove_skb(tp, skb); - tcp_ofo_queue(sk); - - /* Turn on fast path. */ - if (skb_queue_len(&tp->out_of_order_queue) == 0) - tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) | - (0x10 << 16) | - tp->snd_wnd); - return; - } - - /* An old packet, either a retransmit or some packet got lost. */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { - /* A retransmit, 2nd most common case. Force an imediate ack. */ - SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq); - tcp_enter_quickack_mode(tp); - kfree_skb(skb); - return; - } - - if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - /* Partial packet, seq < rcv_next < end_seq */ - SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq); - - goto queue_and_out; - } - - /* Ok. This is an out_of_order segment, force an ack. */ - tp->delayed_acks++; - tcp_enter_quickack_mode(tp); - - /* Disable header prediction. */ - tp->pred_flags = 0; - - SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - - if (skb_peek(&tp->out_of_order_queue) == NULL) { - /* Initial out of order segment, build 1 SACK. */ - if(tp->sack_ok) { - tp->num_sacks = 1; - tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; - tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq; - } - __skb_queue_head(&tp->out_of_order_queue,skb); - } else { - for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) { - /* Already there. */ - if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) { - if (skb->len >= skb1->len) { - if(tp->sack_ok) - tcp_sack_extend(tp, skb1, skb); - __skb_append(skb1, skb); - __skb_unlink(skb1, skb1->list); - kfree_skb(skb1); - } else { - /* A duplicate, smaller than what is in the - * out-of-order queue right now, toss it. - */ - kfree_skb(skb); - } - break; - } - - if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) { - __skb_append(skb1, skb); - if(tp->sack_ok) - tcp_sack_new_ofo_skb(sk, skb); - break; - } - - /* See if we've hit the start. If so insert. */ - if (skb1 == skb_peek(&tp->out_of_order_queue)) { - __skb_queue_head(&tp->out_of_order_queue,skb); - if(tp->sack_ok) - tcp_sack_new_ofo_skb(sk, skb); - break; - } - } - } -} - - -/* - * This routine handles the data. If there is room in the buffer, - * it will be have already been moved into it. If there is no - * room, then we will just have to discard the packet. - */ - -static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) -{ - struct tcphdr *th; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - char *str1 = "pfinet: tcp_data check point 1.\n"; - char *str2 = "pfinet: tcp_data check point 2.\n"; - char *str3 = "pfinet: tcp_data check point 3.\n"; - int stderr_fd = fileno (stderr); - - th = skb->h.th; - skb_pull(skb, th->doff*4); - skb_trim(skb, len - (th->doff*4)); - - if (skb->len == 0 && !th->fin) - return(0); - - write (stderr_fd, str1, strlen (str1) + 1); - fflush (stderr); - /* - * If our receive queue has grown past its limits shrink it. - * Make sure to do this before moving snd_nxt, otherwise - * data might be acked for that we don't have enough room. 
- */ - if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { - if (prune_queue(sk) < 0) { - /* Still not enough room. That can happen when - * skb->true_size differs significantly from skb->len. - */ - return 0; - } - } - - tcp_data_queue(sk, skb); - - write (stderr_fd, str2, strlen (str2) + 1); - fflush (stderr); - if (before(tp->rcv_nxt, tp->copied_seq)) { - printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n"); - tp->rcv_nxt = tp->copied_seq; - } - - /* Above, tcp_data_queue() increments delayed_acks appropriately. - * Now tell the user we may have some data. - */ - if (!sk->dead) { - sk->data_ready(sk,0); - } - write (stderr_fd, str3, strlen (str3) + 1); - fflush (stderr); - return(1); -} - -static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) && - tcp_packets_in_flight(tp) < tp->snd_cwnd) { - /* Put more data onto the wire. */ - tcp_write_xmit(sk); - } else if (tp->packets_out == 0 && !tp->pending) { - /* Start probing the receivers window. */ - tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); - } -} - -static __inline__ void tcp_data_snd_check(struct sock *sk) -{ - struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head; - - if (skb != NULL) - __tcp_data_snd_check(sk, skb); -} - -/* - * Adapt the MSS value used to make delayed ack decision to the - * real world. - */ -static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - unsigned int len = skb->len, lss; - - if (len > tp->rcv_mss) - tp->rcv_mss = len; - lss = tp->last_seg_size; - tp->last_seg_size = 0; - if (len >= 536) { - if (len == lss) - tp->rcv_mss = len; - tp->last_seg_size = len; - } -} - -/* - * Check if sending an ack is needed. - */ -static __inline__ void __tcp_ack_snd_check(struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* This also takes care of updating the window. - * This if statement needs to be simplified. - * - * Rules for delaying an ack: - * - delay time <= 0.5 HZ - * - we don't have a window update to send - * - must send at least every 2 full sized packets - * - must send an ACK if we have any out of order data - * - * With an extra heuristic to handle loss of packet - * situations and also helping the sender leave slow - * start in an expediant manner. - */ - - /* Two full frames received or... */ - if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) || - /* We will update the window "significantly" or... */ - tcp_raise_window(sk) || - /* We entered "quick ACK" mode or... */ - tcp_in_quickack_mode(tp) || - /* We have out of order data */ - (skb_peek(&tp->out_of_order_queue) != NULL)) { - /* Then ack it now */ - tcp_send_ack(sk); - } else { - /* Else, send delayed ack. */ - tcp_send_delayed_ack(tp, HZ/2); - } -} - -static __inline__ void tcp_ack_snd_check(struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (tp->delayed_acks == 0) { - /* We sent a data segment already. */ - return; - } - __tcp_ack_snd_check(sk); -} - - -/* - * This routine is only called when we have urgent data - * signalled. Its the 'slow' part of tcp_urg. It could be - * moved inline now as tcp_urg is only called from one - * place. We handle URGent data wrong. We have to - as - * BSD still doesn't use the correction from RFC961. - * For 1003.1g we should support a new option TCP_STDURG to permit - * either form (or just set the sysctl tcp_stdurg). 
- */ - -static void tcp_check_urg(struct sock * sk, struct tcphdr * th) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - u32 ptr = ntohs(th->urg_ptr); - - if (ptr && !sysctl_tcp_stdurg) - ptr--; - ptr += ntohl(th->seq); - - /* Ignore urgent data that we've already seen and read. */ - if (after(tp->copied_seq, ptr)) - return; - - /* Do we already have a newer (or duplicate) urgent pointer? */ - if (tp->urg_data && !after(ptr, tp->urg_seq)) - return; - - /* Tell the world about our new urgent pointer. */ - if (sk->proc != 0) { - if (sk->proc > 0) - kill_proc(sk->proc, SIGURG, 1); - else - kill_pg(-sk->proc, SIGURG, 1); - } - - /* We may be adding urgent data when the last byte read was - * urgent. To do this requires some care. We cannot just ignore - * tp->copied_seq since we would read the last urgent byte again - * as data, nor can we alter copied_seq until this data arrives - * or we break the sematics of SIOCATMARK (and thus sockatmark()) - */ - if (tp->urg_seq == tp->copied_seq) - tp->copied_seq++; /* Move the copied sequence on correctly */ - tp->urg_data = URG_NOTYET; - tp->urg_seq = ptr; - - /* Disable header prediction. */ - tp->pred_flags = 0; -} - -/* This is the 'fast' part of urgent handling. */ -static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* Check if we get a new urgent pointer - normally not. */ - if (th->urg) - tcp_check_urg(sk,th); - - /* Do we wait for any urgent data? - normally not... */ - if (tp->urg_data == URG_NOTYET) { - u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4); - - /* Is the urgent pointer pointing into this packet? */ - if (ptr < len) { - tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th); - if (!sk->dead) - sk->data_ready(sk,0); - } - } -} - -/* Clean the out_of_order queue if we can, trying to get - * the socket within its memory limits again. - * - * Return less than zero if we should start dropping frames - * until the socket owning process reads some of the data - * to stabilize the situation. - */ -static int prune_queue(struct sock *sk) -{ - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - struct sk_buff * skb; - - SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); - - net_statistics.PruneCalled++; - - /* First, purge the out_of_order queue. */ - skb = __skb_dequeue_tail(&tp->out_of_order_queue); - if(skb != NULL) { - /* Free it all. */ - do { net_statistics.OfoPruned += skb->len; - kfree_skb(skb); - skb = __skb_dequeue_tail(&tp->out_of_order_queue); - } while(skb != NULL); - - /* Reset SACK state. A conforming SACK implementation will - * do the same at a timeout based retransmit. When a connection - * is in a sad state like this, we care only about integrity - * of the connection not performance. - */ - if(tp->sack_ok) - tp->num_sacks = 0; - } - - /* If we are really being abused, tell the caller to silently - * drop receive data on the floor. It will get retransmitted - * and hopefully then we'll have sufficient space. - * - * We used to try to purge the in-order packets too, but that - * turns out to be deadly and fraught with races. Consider: - * - * 1) If we acked the data, we absolutely cannot drop the - * packet. This data would then never be retransmitted. - * 2) It is possible, with a proper sequence of events involving - * delayed acks and backlog queue handling, to have the user - * read the data before it gets acked. The previous code - * here got this wrong, and it lead to data corruption. 
- * 3) Too much state changes happen when the FIN arrives, so once - * we've seen that we can't remove any in-order data safely. - * - * The net result is that removing in-order receive data is too - * complex for anyones sanity. So we don't do it anymore. But - * if we are really having our buffer space abused we stop accepting - * new receive data. - */ - if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1)) - return 0; - - /* Massive buffer overcommit. */ - return -1; -} - -/* - * TCP receive function for the ESTABLISHED state. - * - * It is split into a fast path and a slow path. The fast path is - * disabled when: - * - A zero window was announced from us - zero window probing - * is only handled properly in the slow path. - * - Out of order segments arrived. - * - Urgent data is expected. - * - There is no buffer space left - * - Unexpected TCP flags/window values/header lengths are received - * (detected by checking the TCP header against pred_flags) - * - Data is sent in both directions. Fast path only supports pure senders - * or pure receivers (this means either the sequence number or the ack - * value must stay constant) - * - * When these conditions are not satisfied it drops into a standard - * receive procedure patterned after RFC793 to handle all cases. - * The first three cases are guaranteed by proper pred_flags setting, - * the rest is checked inline. Fast processing is turned on in - * tcp_data_queue when everything is OK. - */ -int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - struct tcphdr *th, unsigned len) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int queued; - u32 flg; - char *str1 = "pfinet tcp_rcv_established check point 1\n"; - char *str2 = "pfinet tcp_rcv_established check point 2\n"; - int stderr_fd = fileno (stderr); - - /* - * Header prediction. - * The code follows the one in the famous - * "30 instruction TCP receive" Van Jacobson mail. - * - * Van's trick is to deposit buffers into socket queue - * on a device interrupt, to call tcp_recv function - * on the receive process context and checksum and copy - * the buffer to user space. smart... - * - * Our current scheme is not silly either but we take the - * extra cost of the net_bh soft interrupt processing... - * We do checksum and copy also but from device to kernel. - */ - - /* - * RFC1323: H1. Apply PAWS check first. - */ - if (tcp_fast_parse_options(sk, th, tp)) { - if (tp->saw_tstamp) { - if (tcp_paws_discard(tp, th, len)) { - tcp_statistics.TcpInErrs++; - if (!th->rst) { - tcp_send_ack(sk); - goto discard; - } - } - tcp_replace_ts_recent(sk, tp, - TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq); - } - } - - flg = *(((u32 *)th) + 3) & ~htonl(0xFC8 << 16); - - /* pred_flags is 0xS?10 << 16 + snd_wnd - * if header_predition is to be made - * 'S' will always be tp->tcp_header_len >> 2 - * '?' will be 0 else it will be !0 - * (when there are holes in the receive - * space for instance) - * PSH flag is ignored. 
- */ - - if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - if (len <= th->doff*4) { - /* Bulk data transfer: sender */ - if (len == th->doff*4) { - tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->ack_seq, len); - kfree_skb(skb); - tcp_data_snd_check(sk); - return 0; - } else { /* Header too small */ - tcp_statistics.TcpInErrs++; - goto discard; - } - } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una && - atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) { - /* Bulk data transfer: receiver */ - __skb_pull(skb,th->doff*4); - - tcp_measure_rcv_mss(sk, skb); - - /* DO NOT notify forward progress here. - * It saves dozen of CPU instructions in fast path. --ANK - */ - __skb_queue_tail(&sk->receive_queue, skb); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - - /* FIN bit check is not done since if FIN is set in - * this frame, the pred_flags won't match up. -DaveM - */ - sk->data_ready(sk, 0); - tcp_delack_estimator(tp); - - tcp_remember_ack(tp, th, skb); - - __tcp_ack_snd_check(sk); - return 0; - } - } - - /* - * Standard slow path. - */ - - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { - /* RFC793, page 37: "In all states except SYN-SENT, all reset - * (RST) segments are validated by checking their SEQ-fields." - * And page 69: "If an incoming segment is not acceptable, - * an acknowledgment should be sent in reply (unless the RST bit - * is set, if so drop the segment and return)". - */ - if (th->rst) - goto discard; - if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n", - TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tp->rcv_wup, tp->rcv_wnd); - } - tcp_send_ack(sk); - goto discard; - } - - if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) { - SOCK_DEBUG(sk, "syn in established state\n"); - tcp_statistics.TcpInErrs++; - tcp_reset(sk); - return 1; - } - - if(th->rst) { - tcp_reset(sk); - goto discard; - } - - if(th->ack) - tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len); - - /* Process urgent data. */ - tcp_urg(sk, th, len); - - /* step 7: process the segment text */ - queued = tcp_data(skb, sk, len); - - /* This must be after tcp_data() does the skb_pull() to - * remove the header size from skb->len. - * - * Dave!!! Phrase above (and all about rcv_mss) has - * nothing to do with reality. rcv_mss must measure TOTAL - * size, including sacks, IP options etc. Hence, measure_rcv_mss - * must occure before pulling etc, otherwise it will flap - * like hell. Even putting it before tcp_data is wrong, - * it should use skb->tail - skb->nh.raw instead. - * --ANK (980805) - * - * BTW I broke it. Now all TCP options are handled equally - * in mss_clamp calculations (i.e. ignored, rfc1122), - * and mss_cache does include all of them (i.e. tstamps) - * except for sacks, to calulate effective mss faster. - * --ANK (980805) - */ - tcp_measure_rcv_mss(sk, skb); - - write (stderr_fd, str1, strlen (str1) + 1); - fflush (stderr_fd); - /* Be careful, tcp_data() may have put this into TIME_WAIT. */ - if(sk->state != TCP_CLOSE) { - tcp_data_snd_check(sk); - tcp_ack_snd_check(sk); - } - write (stderr_fd, str2, strlen (str2) + 1); - fflush (stderr_fd); - - if (!queued) { - discard: - kfree_skb(skb); - } - - return 0; -} - -/* - * Process an incoming SYN or SYN-ACK for SYN_RECV sockets represented - * as an open_request. 
- */
-
-struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
- struct open_request *req)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- u32 flg;
-
- /* Assumption: the socket is not in use,
- * as we checked the user count on tcp_rcv and we're
- * running from a soft interrupt.
- */
-
- /* Check for SYN retransmission */
- flg = *(((u32 *)skb->h.th) + 3);
-
- flg &= __constant_htonl(0x00170000);
- /* Only SYN set? */
- if (flg == __constant_htonl(0x00020000)) {
- if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
- /* Retransmitted SYN.
- */
- req->class->rtx_syn_ack(sk, req);
- return NULL;
- } else {
- return sk; /* Pass new SYN to the listen socket. */
- }
- }
-
- /* We know it's an ACK here */
- if (req->sk) {
- /* socket already created but not
- * yet accepted()...
- */
- sk = req->sk;
- } else {
- /* In theory the packet could be for a cookie, but
- * TIME_WAIT should guard us against this.
- * XXX: Nevertheless check for cookies?
- * This sequence number check is done again later,
- * but we do it here to prevent syn flood attackers
- * from creating big SYN_RECV sockets.
- */
- if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) ||
- !between(TCP_SKB_CB(skb)->seq, req->rcv_isn,
- req->rcv_isn+1+req->rcv_wnd)) {
- req->class->send_reset(skb);
- return NULL;
- }
-
- sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
- tcp_dec_slow_timer(TCP_SLT_SYNACK);
- if (sk == NULL)
- return NULL;
-
- req->expires = 0UL;
- req->sk = sk;
- }
- skb_orphan(skb);
- skb_set_owner_r(skb, sk);
- return sk;
-}
-
-/*
- * This function implements the receiving procedure of RFC 793 for
- * all states except ESTABLISHED and TIME_WAIT.
- * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
- * address independent.
- */
-
-int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- int queued = 0;
-
- switch (sk->state) {
- case TCP_CLOSE:
- /* When state == CLOSED, hash lookup always fails.
- *
- * But, there is a back door, the backlog queue.
- * If we have a sequence of packets in the backlog
- * during __release_sock() which have a sequence such
- * that:
- * packet X causes entry to TCP_CLOSE state
- * ...
- * packet X + N has FIN bit set
- *
- * We report a (luckily) harmless error in this case.
- * The issue is that backlog queue processing bypasses
- * any hash lookups (we know which socket packets are for).
- * The correct behavior here is what 2.0.x did, since
- * a TCP_CLOSE socket does not exist. Drop the frame
- * and send a RST back to the other end.
- */
- return 1;
-
- case TCP_LISTEN:
- /* These use the socket TOS...
- * it might want to be the received TOS instead.
- */
- if(th->ack)
- return 1;
-
- if(th->syn) {
- if(tp->af_specific->conn_request(sk, skb, 0) < 0)
- return 1;
-
- /* Now we have several options: In theory there is
- * nothing else in the frame. KA9Q has an option to
- * send data with the syn, BSD accepts data with the
- * syn up to the [to be] advertised window and
- * Solaris 2.1 gives you a protocol error. For now
- * we just ignore it, that fits the spec precisely
- * and avoids incompatibilities. It would be nice in
- * the future to drop through and process the data.
- *
- * Now that TTCP is starting to be used we ought to
- * queue this data.
- * But, this leaves one open to an easy denial of
- * service attack, and SYN cookies can't defend
- * against this problem. So, we drop the data
- * in the interest of security over speed.
- */
- goto discard;
- }
-
- goto discard;
- break;
-
- case TCP_SYN_SENT:
- /* SYN sent means we have to look for a suitable ack and
- * either reset for bad matches or go to connected.
- * The SYN_SENT case is unusual and should
- * not be in line code. [AC]
- */
- if(th->ack) {
- /* rfc793:
- * "If the state is SYN-SENT then
- * first check the ACK bit
- * If the ACK bit is set
- * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
- * a reset (unless the RST bit is set, if so drop
- * the segment and return)"
- *
- * I cite this place to emphasize one essential
- * detail: this check is different from the one
- * in the established state: SND.UNA <= SEG.ACK <= SND.NXT.
- * SEG.ACK == SND.UNA == ISS is invalid in SYN-SENT,
- * because we have no previous data sent before SYN.
- * --ANK(990513)
- *
- * We do not send data with SYN, so that RFC-correct
- * test reduces to:
- */
- if (sk->zapped ||
- TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
- return 1;
-
- /* Now ACK is acceptable.
- *
- * "If the RST bit is set
- * If the ACK was acceptable then signal the user "error:
- * connection reset", drop the segment, enter CLOSED state,
- * delete TCB, and return."
- */
-
- if (th->rst) {
- tcp_reset(sk);
- goto discard;
- }
-
- /* rfc793:
- * "fifth, if neither of the SYN or RST bits is set then
- * drop the segment and return."
- *
- * See note below!
- * --ANK(990513)
- */
-
- if (!th->syn)
- goto discard;
-
- /* rfc793:
- * "If the SYN bit is on ...
- * are acceptable then ...
- * (our SYN has been ACKed), change the connection
- * state to ESTABLISHED..."
- *
- * Do you see? SYN-less ACKs in SYN-SENT state are
- * completely ignored.
- *
- * The bug causing stalled SYN-SENT sockets
- * was here: tcp_ack advanced snd_una and canceled
- * the retransmit timer, so that a bare ACK received
- * in SYN-SENT state (even with an invalid ack==ISS,
- * because the tcp_ack check is too weak for SYN-SENT)
- * moved the socket to an invalid semi-SYN-SENT,
- * semi-ESTABLISHED state and the connection hung.
- *
- * There exist buggy stacks which really do send
- * such ACKs: e.g. 202.226.91.94 (okigate.oki.co.jp)
- * Actually, if this host had not tried to get something
- * from ftp.inr.ac.ru I'd never have found this bug 8)
- *
- * --ANK (990514)
- */
-
- tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
- tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->ack_seq, len);
-
- /* Ok.. it's good. Set up sequence numbers and
- * move to established.
- */
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
- tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
-
- /* RFC1323: The window in SYN & SYN/ACK segments is
- * never scaled.
- */
- tp->snd_wnd = ntohs(th->window);
- tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
- tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
- tp->fin_seq = TCP_SKB_CB(skb)->seq;
-
- tcp_set_state(sk, TCP_ESTABLISHED);
- tcp_parse_options(sk, th, tp, 0);
-
- if (tp->wscale_ok == 0) {
- tp->snd_wscale = tp->rcv_wscale = 0;
- tp->window_clamp = min(tp->window_clamp, 65535);
- }
-
- if (tp->tstamp_ok) {
- tp->tcp_header_len =
- sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
- } else
- tp->tcp_header_len = sizeof(struct tcphdr);
- if (tp->saw_tstamp) {
- tp->ts_recent = tp->rcv_tsval;
- tp->ts_recent_stamp = tcp_time_stamp;
- }
-
- /* Can't be earlier, doff would be wrong. */
- tcp_send_ack(sk);
-
- sk->dport = th->source;
- tp->copied_seq = tp->rcv_nxt;
-
- if(!sk->dead) {
- sk->state_change(sk);
- sock_wake_async(sk->socket, 0);
- }
- } else {
- if(th->syn && !th->rst) {
- /* The previous version of the code
- * checked for "connecting to self"
- * here.
That check is done now in
- * tcp_connect.
- */
- tcp_set_state(sk, TCP_SYN_RECV);
- tcp_parse_options(sk, th, tp, 0);
- if (tp->saw_tstamp) {
- tp->ts_recent = tp->rcv_tsval;
- tp->ts_recent_stamp = tcp_time_stamp;
- }
-
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
- tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
-
- /* RFC1323: The window in SYN & SYN/ACK segments is
- * never scaled.
- */
- tp->snd_wnd = ntohs(th->window);
- tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
-
- tcp_send_synack(sk);
- } else
- break;
- }
-
- /* tp->tcp_header_len and tp->mss_clamp
- probably changed, synchronize mss.
- */
- tcp_sync_mss(sk, tp->pmtu_cookie);
- tp->rcv_mss = tp->mss_cache;
-
- if (sk->state == TCP_SYN_RECV)
- goto discard;
-
- goto step6;
- }
-
- /* Parse the tcp_options present on this header.
- * By this point we really only expect timestamps.
- * Note that this really has to be here and not later for PAWS
- * (RFC1323) to work.
- */
- if (tcp_fast_parse_options(sk, th, tp)) {
- /* NOTE: assumes saw_tstamp is never set if we didn't
- * negotiate the option. tcp_fast_parse_options() must
- * guarantee this.
- */
- if (tp->saw_tstamp) {
- if (tcp_paws_discard(tp, th, len)) {
- tcp_statistics.TcpInErrs++;
- if (!th->rst) {
- tcp_send_ack(sk);
- goto discard;
- }
- }
- tcp_replace_ts_recent(sk, tp,
- TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->end_seq);
- }
- }
-
- /* The silly FIN test here is necessary to see an advancing ACK in
- * retransmitted FIN frames properly. Consider the following sequence:
- *
- * host1 --> host2 FIN XSEQ:XSEQ(0) ack YSEQ
- * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ
- * host1 --> host2 XSEQ:XSEQ(0) ack YSEQ+1
- * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ+1 (fails tcp_sequence test)
- *
- * At this point the connection will deadlock with host1 believing
- * that its FIN is never ACK'd, and thus it will retransmit its FIN
- * forever. The following fix is from Taral (taral@taral.net).
- */
-
- /* step 1: check sequence number */
- if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq) &&
- !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)) {
- if (!th->rst) {
- tcp_send_ack(sk);
- }
- goto discard;
- }
-
- /* step 2: check RST bit */
- if(th->rst) {
- tcp_reset(sk);
- goto discard;
- }
-
- /* step 3: check security and precedence [ignored] */
-
- /* step 4:
- *
- * Check for a SYN, and ensure it matches the SYN we were
- * first sent. We have to handle the rather unusual (but valid)
- * sequence that KA9Q derived products may generate of
- *
- * SYN
- * SYN|ACK Data
- * ACK (lost)
- * SYN|ACK Data + More Data
- * .. we must ACK not RST...
- *
- * We keep syn_seq as the sequence space occupied by the
- * original SYN.
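- *
- * For instance (editor's illustration): if the peer's original
- * SYN occupied sequence number 1000, syn_seq stays 1000, so a
- * KA9Q-style retransmitted SYN|ACK with seq == 1000 falls through
- * the test below and is ACKed, while a SYN at any other sequence
- * number aborts the connection.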
- */
-
- if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
- tcp_reset(sk);
- return 1;
- }
-
- /* step 5: check the ACK field */
- if (th->ack) {
- int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->ack_seq, len);
-
- switch(sk->state) {
- case TCP_SYN_RECV:
- if (acceptable) {
- tcp_set_state(sk, TCP_ESTABLISHED);
- sk->dport = th->source;
- tp->copied_seq = tp->rcv_nxt;
-
- if(!sk->dead)
- sk->state_change(sk);
-
- tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
- tp->snd_wnd = ntohs(th->window) << tp->snd_wscale;
- tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
- tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
-
- } else {
- SOCK_DEBUG(sk, "bad ack\n");
- return 1;
- }
- break;
-
- case TCP_FIN_WAIT1:
- if (tp->snd_una == tp->write_seq) {
- sk->shutdown |= SEND_SHUTDOWN;
- tcp_set_state(sk, TCP_FIN_WAIT2);
- if (!sk->dead)
- sk->state_change(sk);
- else
- tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
- }
- break;
-
- case TCP_CLOSING:
- if (tp->snd_una == tp->write_seq) {
- tcp_time_wait(sk);
- goto discard;
- }
- break;
-
- case TCP_LAST_ACK:
- if (tp->snd_una == tp->write_seq) {
- sk->shutdown = SHUTDOWN_MASK;
- tcp_set_state(sk, TCP_CLOSE);
- if (!sk->dead)
- sk->state_change(sk);
- goto discard;
- }
- break;
- }
- } else
- goto discard;
-
-step6:
- /* step 6: check the URG bit */
- tcp_urg(sk, th, len);
-
- /* step 7: process the segment text */
- switch (sk->state) {
- case TCP_CLOSE_WAIT:
- case TCP_CLOSING:
- if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
- break;
- /* fall through */
-
- case TCP_FIN_WAIT1:
- case TCP_FIN_WAIT2:
- /* RFC 793 says to queue data in these states,
- * RFC 1122 says we MUST send a reset.
- * BSD 4.4 also does reset.
- */
- if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
- if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
- tcp_reset(sk);
- return 1;
- }
- }
- /* fall through */
-
- case TCP_ESTABLISHED:
- queued = tcp_data(skb, sk, len);
-
- /* This must be after tcp_data() does the skb_pull() to
- * remove the header size from skb->len.
- */
- tcp_measure_rcv_mss(sk, skb);
- break;
- }
-
- tcp_data_snd_check(sk);
- tcp_ack_snd_check(sk);
-
- if (!queued) {
-discard:
- kfree_skb(skb);
- }
- return 0;
-}
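
The validity tests above -- tcp_sequence() in the slow path, the
snt_isn/rcv_isn window check in tcp_check_req(), and the before() and
after() calls throughout -- all rest on wraparound-safe comparison of
32-bit sequence numbers. The sketch below is an editor's illustration,
not part of the deleted file: before(), after() and between()
paraphrase the helpers defined in include/net/tcp.h of this kernel,
and the main() driver (with its made-up snt_isn value) only
demonstrates the wraparound behaviour.

/* Standalone demo of TCP sequence-number arithmetic. */
#include <stdio.h>

typedef unsigned int u32;
typedef int s32;

/* seq1 precedes seq2 in sequence space (signed difference). */
static int before(u32 seq1, u32 seq2)
{
	return (s32)(seq1 - seq2) < 0;
}

/* seq1 follows seq2 in sequence space. */
static int after(u32 seq1, u32 seq2)
{
	return (s32)(seq2 - seq1) < 0;
}

/* is seq2 <= seq1 <= seq3, modulo 2^32 ? */
static int between(u32 seq1, u32 seq2, u32 seq3)
{
	return seq3 - seq2 >= seq1 - seq2;
}

int main(void)
{
	u32 snt_isn = 0xfffffffeu;	/* an ISN chosen just below the wrap */

	/* The ACK of our SYN must lie in [snt_isn, snt_isn+1], even
	 * though the counter wraps: prints 1.
	 */
	printf("%d\n", between(snt_isn + 1, snt_isn, snt_isn + 1));

	/* A stale ACK from before the ISN is rejected: prints 0. */
	printf("%d\n", between(snt_isn - 5, snt_isn, snt_isn + 1));

	/* after() also survives the wrap: prints 1. */
	printf("%d\n", after(snt_isn + 10, snt_isn));
	return 0;
}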