From c7923f6aa252a29ccb4f16bd91469c9000a2bd94 Mon Sep 17 00:00:00 2001 From: "Michael I. Bushnell" Date: Wed, 12 Jul 1995 15:42:49 +0000 Subject: Initial revision --- pfinet/linux-inet/tcp.c | 5100 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 5100 insertions(+) create mode 100644 pfinet/linux-inet/tcp.c (limited to 'pfinet/linux-inet/tcp.c') diff --git a/pfinet/linux-inet/tcp.c b/pfinet/linux-inet/tcp.c new file mode 100644 index 00000000..c73ad07f --- /dev/null +++ b/pfinet/linux-inet/tcp.c @@ -0,0 +1,5100 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: @(#)tcp.c 1.0.16 05/25/93 + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * + * Fixes: + * Alan Cox : Numerous verify_area() calls + * Alan Cox : Set the ACK bit on a reset + * Alan Cox : Stopped it crashing if it closed while sk->inuse=1 + * and was trying to connect (tcp_err()). + * Alan Cox : All icmp error handling was broken + * pointers passed where wrong and the + * socket was looked up backwards. Nobody + * tested any icmp error code obviously. + * Alan Cox : tcp_err() now handled properly. It wakes people + * on errors. select behaves and the icmp error race + * has gone by moving it into sock.c + * Alan Cox : tcp_reset() fixed to work for everything not just + * packets for unknown sockets. + * Alan Cox : tcp option processing. + * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong] + * Herp Rosmanith : More reset fixes + * Alan Cox : No longer acks invalid rst frames. Acking + * any kind of RST is right out. 
+ * Alan Cox : Sets an ignore me flag on an rst receive + * otherwise odd bits of prattle escape still + * Alan Cox : Fixed another acking RST frame bug. Should stop + * LAN workplace lockups. + * Alan Cox : Some tidyups using the new skb list facilities + * Alan Cox : sk->keepopen now seems to work + * Alan Cox : Pulls options out correctly on accepts + * Alan Cox : Fixed assorted sk->rqueue->next errors + * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops. + * Alan Cox : Tidied tcp_data to avoid a potential nasty. + * Alan Cox : Added some better commenting, as the tcp is hard to follow + * Alan Cox : Removed incorrect check for 20 * psh + * Michael O'Reilly : ack < copied bug fix. + * Johannes Stille : Misc tcp fixes (not all in yet). + * Alan Cox : FIN with no memory -> CRASH + * Alan Cox : Added socket option proto entries. Also added awareness of them to accept. + * Alan Cox : Added TCP options (SOL_TCP) + * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets. + * Alan Cox : Use ip_tos/ip_ttl settings. + * Alan Cox : Handle FIN (more) properly (we hope). + * Alan Cox : RST frames sent on unsynchronised state ack error/ + * Alan Cox : Put in missing check for SYN bit. + * Alan Cox : Added tcp_select_window() aka NET2E + * window non shrink trick. + * Alan Cox : Added a couple of small NET2E timer fixes + * Charles Hedrick : TCP fixes + * Toomas Tamm : TCP window fixes + * Alan Cox : Small URG fix to rlogin ^C ack fight + * Charles Hedrick : Rewrote most of it to actually work + * Linus : Rewrote tcp_read() and URG handling + * completely + * Gerhard Koerting: Fixed some missing timer handling + * Matthew Dillon : Reworked TCP machine states as per RFC + * Gerhard Koerting: PC/TCP workarounds + * Adam Caldwell : Assorted timer/timing errors + * Matthew Dillon : Fixed another RST bug + * Alan Cox : Move to kernel side addressing changes. 
+ * Alan Cox : Beginning work on TCP fastpathing (not yet usable) + * Arnt Gulbrandsen: Turbocharged tcp_check() routine. + * Alan Cox : TCP fast path debugging + * Alan Cox : Window clamping + * Michael Riepe : Bug in tcp_check() + * Matt Dillon : More TCP improvements and RST bug fixes + * Matt Dillon : Yet more small nasties remove from the TCP code + * (Be very nice to this man if tcp finally works 100%) 8) + * Alan Cox : BSD accept semantics. + * Alan Cox : Reset on closedown bug. + * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). + * Michael Pall : Handle select() after URG properly in all cases. + * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin). + * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now. + * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api. + * Alan Cox : Changed the semantics of sk->socket to + * fix a race and a signal problem with + * accept() and async I/O. + * Alan Cox : Relaxed the rules on tcp_sendto(). + * Yury Shevchuk : Really fixed accept() blocking problem. + * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for + * clients/servers which listen in on + * fixed ports. + * Alan Cox : Cleaned the above up and shrank it to + * a sensible code size. + * Alan Cox : Self connect lockup fix. + * Alan Cox : No connect to multicast. + * Ross Biro : Close unaccepted children on master + * socket close. + * Alan Cox : Reset tracing code. + * Alan Cox : Spurious resets on shutdown. + * Alan Cox : Giant 15 minute/60 second timer error + * Alan Cox : Small whoops in selecting before an accept. + * Alan Cox : Kept the state trace facility since it's + * handy for debugging. + * Alan Cox : More reset handler fixes. 
+ * Alan Cox : Started rewriting the code based on the RFC's + * for other useful protocol references see: + * Comer, KA9Q NOS, and for a reference on the + * difference between specifications and how BSD + * works see the 4.4lite source. + * A.N.Kuznetsov : Don't time wait on completion of tidy + * close. + * Linus Torvalds : Fin/Shutdown & copied_seq changes. + * Linus Torvalds : Fixed BSD port reuse to work first syn + * Alan Cox : Reimplemented timers as per the RFC and using multiple + * timers for sanity. + * Alan Cox : Small bug fixes, and a lot of new + * comments. + * Alan Cox : Fixed dual reader crash by locking + * the buffers (much like datagram.c) + * Alan Cox : Fixed stuck sockets in probe. A probe + * now gets fed up of retrying without + * (even a no space) answer. + * Alan Cox : Extracted closing code better + * Alan Cox : Fixed the closing state machine to + * resemble the RFC. + * Alan Cox : More 'per spec' fixes. + * Alan Cox : tcp_data() doesn't ack illegal PSH + * only frames. At least one pc tcp stack + * generates them. + * + * + * To Fix: + * Fast path the code. Two things here - fix the window calculation + * so it doesn't iterate over the queue, also spot packets with no funny + * options arriving in order and process directly. + * + * Implement RFC 1191 [Path MTU discovery] + * Look at the effect of implementing RFC 1337 suggestions and their impact. + * Rewrite output state machine to use a single queue and do low window + * situations as per the spec (RFC 1122) + * Speed up input assembly algorithm. + * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we + * could do with it working on IPv4 + * User settable/learned rtt/max window/mtu + * Cope with MTU/device switches when retransmitting in tcp. + * Fix the window handling to use PR's new code. + * + * Change the fundamental structure to a single send queue maintained + * by TCP (removing the bogus ip stuff [thus fixing mtu drops on + * active routes too]). 
Cut the queue off in tcp_retransmit/ + * tcp_transmit. + * Change the receive queue to assemble as it goes. This lets us + * dispose of most of tcp_sequence, half of tcp_ack and chunks of + * tcp_data/tcp_read as well as the window shrink crud. + * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack + * tcp_queue_skb seem obvious routines to extract. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + * + * Description of States: + * + * TCP_SYN_SENT sent a connection request, waiting for ack + * + * TCP_SYN_RECV received a connection request, sent ack, + * waiting for final ack in three-way handshake. + * + * TCP_ESTABLISHED connection established + * + * TCP_FIN_WAIT1 our side has shutdown, waiting to complete + * transmission of remaining buffered data + * + * TCP_FIN_WAIT2 all buffered data sent, waiting for remote + * to shutdown + * + * TCP_CLOSING both sides have shutdown but we still have + * data we have to finish sending + * + * TCP_TIME_WAIT timeout to catch resent junk before entering + * closed, can only be entered from FIN_WAIT2 + * or CLOSING. Required because the other end + * may not have gotten our last ACK causing it + * to retransmit the data packet (which we ignore) + * + * TCP_CLOSE_WAIT remote side has shutdown and is waiting for + * us to finish writing our data and to shutdown + * (we have to close() to move on to LAST_ACK) + * + * TCP_LAST_ACK out side has shutdown after remote has + * shutdown. 
There may still be data in our
+ *				buffer that we have to finish sending
+ *
+ *	TCP_CLOSE		socket is finished
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/time.h>
+#include <linux/string.h>
+#include <linux/config.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/termios.h>
+#include <linux/in.h>
+#include <linux/fcntl.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include "snmp.h"
+#include "ip.h"
+#include "protocol.h"
+#include "icmp.h"
+#include "tcp.h"
+#include "arp.h"
+#include <linux/skbuff.h>
+#include "sock.h"
+#include "route.h"
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <asm/system.h>
+#include <asm/segment.h>
+#include <linux/mm.h>
+
+/*
+ *	The MSL timer is the 'normal' timer.
+ */
+
+#define reset_msl_timer(x,y,z)	reset_timer(x,y,z)
+
+#define SEQ_TICK 3
+unsigned long seq_offset;
+struct tcp_mib	tcp_statistics;
+
+static void tcp_close(struct sock *sk, int timeout);
+
+
+/*
+ *	The less said about this the better, but it works and will do for 1.2
+ */
+
+static struct wait_queue *master_select_wakeup;
+
+static __inline__ int min(unsigned int a, unsigned int b)
+{
+	if (a < b)
+		return(a);
+	return(b);
+}
+
+#undef STATE_TRACE
+
+#ifdef STATE_TRACE
+static char *statename[]={
+	"Unused","Established","Syn Sent","Syn Recv",
+	"Fin Wait 1","Fin Wait 2","Time Wait", "Close",
+	"Close Wait","Last ACK","Listen","Closing"
+};
+#endif
+
+static __inline__ void tcp_set_state(struct sock *sk, int state)
+{
+	if(sk->state==TCP_ESTABLISHED)
+		tcp_statistics.TcpCurrEstab--;
+#ifdef STATE_TRACE
+	if(sk->debug)
+		printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]);
+#endif
+	/* This is a hack but it doesn't occur often and it's going to
+	   be a real to fix nicely */
+
+	if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV)
+	{
+		wake_up_interruptible(&master_select_wakeup);
+	}
+	sk->state=state;
+	if(state==TCP_ESTABLISHED)
+		tcp_statistics.TcpCurrEstab++;
+}
+
+/*
+ *	This routine picks a TCP windows for a socket based on
+ *	the following constraints
+ *
+ *	1. The window can never be shrunk once it is offered (RFC 793)
+ *	2. 
We limit memory per socket + * + * For now we use NET2E3's heuristic of offering half the memory + * we have handy. All is not as bad as this seems however because + * of two things. Firstly we will bin packets even within the window + * in order to get the data we are waiting for into the memory limit. + * Secondly we bin common duplicate forms at receive time + * Better heuristics welcome + */ + +int tcp_select_window(struct sock *sk) +{ + int new_window = sk->prot->rspace(sk); + + if(sk->window_clamp) + new_window=min(sk->window_clamp,new_window); + /* + * Two things are going on here. First, we don't ever offer a + * window less than min(sk->mss, MAX_WINDOW/2). This is the + * receiver side of SWS as specified in RFC1122. + * Second, we always give them at least the window they + * had before, in order to avoid retracting window. This + * is technically allowed, but RFC1122 advises against it and + * in practice it causes trouble. + * + * Fixme: This doesn't correctly handle the case where + * new_window > sk->window but not by enough to allow for the + * shift in sequence space. + */ + if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window) + return(sk->window); + return(new_window); +} + +/* + * Find someone to 'accept'. Must be called with + * sk->inuse=1 or cli() + */ + +static struct sk_buff *tcp_find_established(struct sock *s) +{ + struct sk_buff *p=skb_peek(&s->receive_queue); + if(p==NULL) + return NULL; + do + { + if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1) + return p; + p=p->next; + } + while(p!=(struct sk_buff *)&s->receive_queue); + return NULL; +} + +/* + * Remove a completed connection and return it. This is used by + * tcp_accept() to get connections from the queue. 
+ */ + +static struct sk_buff *tcp_dequeue_established(struct sock *s) +{ + struct sk_buff *skb; + unsigned long flags; + save_flags(flags); + cli(); + skb=tcp_find_established(s); + if(skb!=NULL) + skb_unlink(skb); /* Take it off the queue */ + restore_flags(flags); + return skb; +} + +/* + * This routine closes sockets which have been at least partially + * opened, but not yet accepted. Currently it is only called by + * tcp_close, and timeout mirrors the value there. + */ + +static void tcp_close_pending (struct sock *sk) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) + { + skb->sk->dead=1; + tcp_close(skb->sk, 0); + kfree_skb(skb, FREE_READ); + } + return; +} + +/* + * Enter the time wait state. + */ + +static void tcp_time_wait(struct sock *sk) +{ + tcp_set_state(sk,TCP_TIME_WAIT); + sk->shutdown = SHUTDOWN_MASK; + if (!sk->dead) + sk->state_change(sk); + reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); +} + +/* + * A socket has timed out on its send queue and wants to do a + * little retransmitting. Currently this means TCP. + */ + +void tcp_do_retransmit(struct sock *sk, int all) +{ + struct sk_buff * skb; + struct proto *prot; + struct device *dev; + int ct=0; + + prot = sk->prot; + skb = sk->send_head; + + while (skb != NULL) + { + struct tcphdr *th; + struct iphdr *iph; + int size; + + dev = skb->dev; + IS_SKB(skb); + skb->when = jiffies; + + /* + * In general it's OK just to use the old packet. However we + * need to use the current ack and window fields. Urg and + * urg_ptr could possibly stand to be updated as well, but we + * don't keep the necessary data. That shouldn't be a problem, + * if the other end is doing the right thing. Since we're + * changing the packet, we have to issue a new IP identifier. 
+ */ + + iph = (struct iphdr *)(skb->data + dev->hard_header_len); + th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2)); + size = skb->len - (((unsigned char *) th) - skb->data); + + /* + * Note: We ought to check for window limits here but + * currently this is done (less efficiently) elsewhere. + * We do need to check for a route change but can't handle + * that until we have the new 1.3.x buffers in. + * + */ + + iph->id = htons(ip_id_count++); + ip_send_check(iph); + + /* + * This is not the right way to handle this. We have to + * issue an up to date window and ack report with this + * retransmit to keep the odd buggy tcp that relies on + * the fact BSD does this happy. + * We don't however need to recalculate the entire + * checksum, so someone wanting a small problem to play + * with might like to implement RFC1141/RFC1624 and speed + * this up by avoiding a full checksum. + */ + + th->ack_seq = ntohl(sk->acked_seq); + th->window = ntohs(tcp_select_window(sk)); + tcp_send_check(th, sk->saddr, sk->daddr, size, sk); + + /* + * If the interface is (still) up and running, kick it. + */ + + if (dev->flags & IFF_UP) + { + /* + * If the packet is still being sent by the device/protocol + * below then don't retransmit. This is both needed, and good - + * especially with connected mode AX.25 where it stops resends + * occurring of an as yet unsent anyway frame! + * We still add up the counts as the round trip time wants + * adjusting. + */ + if (sk && !skb_device_locked(skb)) + { + /* Remove it from any existing driver queue first! */ + skb_unlink(skb); + /* Now queue it */ + ip_statistics.IpOutRequests++; + dev_queue_xmit(skb, dev, sk->priority); + } + } + + /* + * Count retransmissions + */ + + ct++; + sk->prot->retransmits ++; + + /* + * Only one retransmit requested. + */ + + if (!all) + break; + + /* + * This should cut it off before we send too many packets. 
+ */ + + if (ct >= sk->cong_window) + break; + skb = skb->link3; + } +} + +/* + * Reset the retransmission timer + */ + +static void reset_xmit_timer(struct sock *sk, int why, unsigned long when) +{ + del_timer(&sk->retransmit_timer); + sk->ip_xmit_timeout = why; + if((int)when < 0) + { + when=3; + printk("Error: Negative timer in xmit_timer\n"); + } + sk->retransmit_timer.expires=when; + add_timer(&sk->retransmit_timer); +} + +/* + * This is the normal code called for timeouts. It does the retransmission + * and then does backoff. tcp_do_retransmit is separated out because + * tcp_ack needs to send stuff from the retransmit queue without + * initiating a backoff. + */ + + +void tcp_retransmit_time(struct sock *sk, int all) +{ + tcp_do_retransmit(sk, all); + + /* + * Increase the timeout each time we retransmit. Note that + * we do not increase the rtt estimate. rto is initialized + * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests + * that doubling rto each time is the least we can get away with. + * In KA9Q, Karn uses this for the first few times, and then + * goes to quadratic. netBSD doubles, but only goes up to *64, + * and clamps at 1 to 64 sec afterwards. Note that 120 sec is + * defined in the protocol as the maximum possible RTT. I guess + * we'll have to use something other than TCP to talk to the + * University of Mars. + * + * PAWS allows us longer timeouts and large windows, so once + * implemented ftp to mars will work nicely. We will have to fix + * the 120 second clamps though! + */ + + sk->retransmits++; + sk->backoff++; + sk->rto = min(sk->rto << 1, 120*HZ); + reset_xmit_timer(sk, TIME_WRITE, sk->rto); +} + + +/* + * A timer event has trigger a tcp retransmit timeout. The + * socket xmit queue is ready and set up to send. Because + * the ack receive code keeps the queue straight we do + * nothing clever here. 
+ */ + +static void tcp_retransmit(struct sock *sk, int all) +{ + if (all) + { + tcp_retransmit_time(sk, all); + return; + } + + sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ + /* sk->ssthresh in theory can be zero. I guess that's OK */ + sk->cong_count = 0; + + sk->cong_window = 1; + + /* Do the actual retransmit. */ + tcp_retransmit_time(sk, all); +} + +/* + * A write timeout has occurred. Process the after effects. + */ + +static int tcp_write_timeout(struct sock *sk) +{ + /* + * Look for a 'soft' timeout. + */ + if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7)) + || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) + { + /* + * Attempt to recover if arp has changed (unlikely!) or + * a route has shifted (not supported prior to 1.3). + */ + arp_destroy (sk->daddr, 0); + ip_route_check (sk->daddr); + } + /* + * Has it gone just too far ? + */ + if (sk->retransmits > TCP_RETR2) + { + sk->err = ETIMEDOUT; + sk->error_report(sk); + del_timer(&sk->retransmit_timer); + /* + * Time wait the socket + */ + if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) + { + tcp_set_state(sk,TCP_TIME_WAIT); + reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); + } + else + { + /* + * Clean up time. + */ + tcp_set_state(sk, TCP_CLOSE); + return 0; + } + } + return 1; +} + +/* + * The TCP retransmit timer. This lacks a few small details. + * + * 1. An initial rtt timeout on the probe0 should cause what we can + * of the first write queue buffer to be split and sent. + * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report + * ETIMEDOUT if we know an additional 'soft' error caused this. + * tcp_err should save a 'soft error' for us. 
+ */ + +static void retransmit_timer(unsigned long data) +{ + struct sock *sk = (struct sock*)data; + int why = sk->ip_xmit_timeout; + + /* + * only process if socket is not in use + */ + + cli(); + if (sk->inuse || in_bh) + { + /* Try again in 1 second */ + sk->retransmit_timer.expires = HZ; + add_timer(&sk->retransmit_timer); + sti(); + return; + } + + sk->inuse = 1; + sti(); + + /* Always see if we need to send an ack. */ + + if (sk->ack_backlog && !sk->zapped) + { + sk->prot->read_wakeup (sk); + if (! sk->dead) + sk->data_ready(sk,0); + } + + /* Now we need to figure out why the socket was on the timer. */ + + switch (why) + { + /* Window probing */ + case TIME_PROBE0: + tcp_send_probe0(sk); + tcp_write_timeout(sk); + break; + /* Retransmitting */ + case TIME_WRITE: + /* It could be we got here because we needed to send an ack. + * So we need to check for that. + */ + { + struct sk_buff *skb; + unsigned long flags; + + save_flags(flags); + cli(); + skb = sk->send_head; + if (!skb) + { + restore_flags(flags); + } + else + { + /* + * Kicked by a delayed ack. Reset timer + * correctly now + */ + if (jiffies < skb->when + sk->rto) + { + reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies); + restore_flags(flags); + break; + } + restore_flags(flags); + /* + * Retransmission + */ + sk->prot->retransmit (sk, 0); + tcp_write_timeout(sk); + } + break; + } + /* Sending Keepalives */ + case TIME_KEEPOPEN: + /* + * this reset_timer() call is a hack, this is not + * how KEEPOPEN is supposed to work. + */ + reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN); + + /* Send something to keep the connection open. */ + if (sk->prot->write_wakeup) + sk->prot->write_wakeup (sk); + sk->retransmits++; + tcp_write_timeout(sk); + break; + default: + printk ("rexmit_timer: timer expired - reason unknown\n"); + break; + } + release_sock(sk); +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. 
If err < 0 then the socket should + * be closed and the error returned to the user. If err > 0 + * it's just the icmp type << 8 | icmp code. After adjustment + * header points to the first 8 bytes of the tcp header. We need + * to find the appropriate port. + */ + +void tcp_err(int err, unsigned char *header, unsigned long daddr, + unsigned long saddr, struct inet_protocol *protocol) +{ + struct tcphdr *th; + struct sock *sk; + struct iphdr *iph=(struct iphdr *)header; + + header+=4*iph->ihl; + + + th =(struct tcphdr *)header; + sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr); + + if (sk == NULL) + return; + + if(err<0) + { + sk->err = -err; + sk->error_report(sk); + return; + } + + if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) + { + /* + * FIXME: + * For now we will just trigger a linear backoff. + * The slow start code should cause a real backoff here. + */ + if (sk->cong_window > 4) + sk->cong_window--; + return; + } + +/* sk->err = icmp_err_convert[err & 0xff].errno; -- moved as TCP should hide non fatals internally (and does) */ + + /* + * If we've already connected we will keep trying + * until we time out, or the user gives up. + */ + + if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT) + { + if (sk->state == TCP_SYN_SENT) + { + tcp_statistics.TcpAttemptFails++; + tcp_set_state(sk,TCP_CLOSE); + sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ + } + sk->err = icmp_err_convert[err & 0xff].errno; + } + return; +} + + +/* + * Walk down the receive queue counting readable data until we hit the end or we find a gap + * in the received data queue (ie a frame missing that needs sending to us). Not + * sorting using two queues as data arrives makes life so much harder. 
+ */ + +static int tcp_readable(struct sock *sk) +{ + unsigned long counted; + unsigned long amount; + struct sk_buff *skb; + int sum; + unsigned long flags; + + if(sk && sk->debug) + printk("tcp_readable: %p - ",sk); + + save_flags(flags); + cli(); + if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL) + { + restore_flags(flags); + if(sk && sk->debug) + printk("empty\n"); + return(0); + } + + counted = sk->copied_seq; /* Where we are at the moment */ + amount = 0; + + /* + * Do until a push or until we are out of data. + */ + + do + { + if (before(counted, skb->h.th->seq)) /* Found a hole so stops here */ + break; + sum = skb->len -(counted - skb->h.th->seq); /* Length - header but start from where we are up to (avoid overlaps) */ + if (skb->h.th->syn) + sum++; + if (sum > 0) + { /* Add it up, move on */ + amount += sum; + if (skb->h.th->syn) + amount--; + counted += sum; + } + /* + * Don't count urg data ... but do it in the right place! + * Consider: "old_data (ptr is here) URG PUSH data" + * The old code would stop at the first push because + * it counted the urg (amount==1) and then does amount-- + * *after* the loop. This means tcp_readable() always + * returned zero if any URG PUSH was in the queue, even + * though there was normal data available. If we subtract + * the urg data right here, we even get it to work for more + * than one URG PUSH skb without normal data. + * This means that select() finally works now with urg data + * in the queue. Note that rlogin was never affected + * because it doesn't use select(); it uses two processes + * and a blocking read(). And the queue scan in tcp_read() + * was correct. Mike + */ + if (skb->h.th->urg) + amount--; /* don't count urg data */ + if (amount && skb->h.th->psh) break; + skb = skb->next; + } + while(skb != (struct sk_buff *)&sk->receive_queue); + + restore_flags(flags); + if(sk->debug) + printk("got %lu bytes.\n",amount); + return(amount); +} + +/* + * LISTEN is a special case for select.. 
+ */ +static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait) +{ + if (sel_type == SEL_IN) { + int retval; + + sk->inuse = 1; + retval = (tcp_find_established(sk) != NULL); + release_sock(sk); + if (!retval) + select_wait(&master_select_wakeup,wait); + return retval; + } + return 0; +} + + +/* + * Wait for a TCP event. + * + * Note that we don't need to set "sk->inuse", as the upper select layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. + */ +static int tcp_select(struct sock *sk, int sel_type, select_table *wait) +{ + if (sk->state == TCP_LISTEN) + return tcp_listen_select(sk, sel_type, wait); + + switch(sel_type) { + case SEL_IN: + if (sk->err) + return 1; + if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) + break; + + if (sk->shutdown & RCV_SHUTDOWN) + return 1; + + if (sk->acked_seq == sk->copied_seq) + break; + + if (sk->urg_seq != sk->copied_seq || + sk->acked_seq != sk->copied_seq+1 || + sk->urginline || !sk->urg_data) + return 1; + break; + + case SEL_OUT: + if (sk->shutdown & SEND_SHUTDOWN) + return 0; + if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) + break; + /* + * This is now right thanks to a small fix + * by Matt Dillon. 
+ */ + + if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header) + break; + return 1; + + case SEL_EX: + if (sk->err || sk->urg_data) + return 1; + break; + } + select_wait(sk->sleep, wait); + return 0; +} + +int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + int err; + switch(cmd) + { + + case TIOCINQ: +#ifdef FIXME /* FIXME: */ + case FIONREAD: +#endif + { + unsigned long amount; + + if (sk->state == TCP_LISTEN) + return(-EINVAL); + + sk->inuse = 1; + amount = tcp_readable(sk); + release_sock(sk); + err=verify_area(VERIFY_WRITE,(void *)arg, + sizeof(unsigned long)); + if(err) + return err; + put_fs_long(amount,(unsigned long *)arg); + return(0); + } + case SIOCATMARK: + { + int answ = sk->urg_data && sk->urg_seq == sk->copied_seq; + + err = verify_area(VERIFY_WRITE,(void *) arg, + sizeof(unsigned long)); + if (err) + return err; + put_fs_long(answ,(int *) arg); + return(0); + } + case TIOCOUTQ: + { + unsigned long amount; + + if (sk->state == TCP_LISTEN) return(-EINVAL); + amount = sk->prot->wspace(sk); + err=verify_area(VERIFY_WRITE,(void *)arg, + sizeof(unsigned long)); + if(err) + return err; + put_fs_long(amount,(unsigned long *)arg); + return(0); + } + default: + return(-EINVAL); + } +} + + +/* + * This routine computes a TCP checksum. 
+ */ + +unsigned short tcp_check(struct tcphdr *th, int len, + unsigned long saddr, unsigned long daddr) +{ + unsigned long sum; + + if (saddr == 0) saddr = ip_my_addr(); + +/* + * stupid, gcc complains when I use just one __asm__ block, + * something about too many reloads, but this is just two + * instructions longer than what I want + */ + __asm__(" + addl %%ecx, %%ebx + adcl %%edx, %%ebx + adcl $0, %%ebx + " + : "=b"(sum) + : "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256) + : "bx", "cx", "dx" ); + __asm__(" + movl %%ecx, %%edx + cld + cmpl $32, %%ecx + jb 2f + shrl $5, %%ecx + clc +1: lodsl + adcl %%eax, %%ebx + lodsl + adcl %%eax, %%ebx + lodsl + adcl %%eax, %%ebx + lodsl + adcl %%eax, %%ebx + lodsl + adcl %%eax, %%ebx + lodsl + adcl %%eax, %%ebx + lodsl + adcl %%eax, %%ebx + lodsl + adcl %%eax, %%ebx + loop 1b + adcl $0, %%ebx + movl %%edx, %%ecx +2: andl $28, %%ecx + je 4f + shrl $2, %%ecx + clc +3: lodsl + adcl %%eax, %%ebx + loop 3b + adcl $0, %%ebx +4: movl $0, %%eax + testw $2, %%dx + je 5f + lodsw + addl %%eax, %%ebx + adcl $0, %%ebx + movw $0, %%ax +5: test $1, %%edx + je 6f + lodsb + addl %%eax, %%ebx + adcl $0, %%ebx +6: movl %%ebx, %%eax + shrl $16, %%eax + addw %%ax, %%bx + adcw $0, %%bx + " + : "=b"(sum) + : "0"(sum), "c"(len), "S"(th) + : "ax", "bx", "cx", "dx", "si" ); + + /* We only want the bottom 16 bits, but we never cleared the top 16. */ + + return((~sum) & 0xffff); +} + + + +void tcp_send_check(struct tcphdr *th, unsigned long saddr, + unsigned long daddr, int len, struct sock *sk) +{ + th->check = 0; + th->check = tcp_check(th, len, saddr, daddr); + return; +} + +/* + * This is the main buffer sending routine. We queue the buffer + * having checked it is sane seeming. 
+ */
+
+/*
+ *	Transmit one fully-built TCP segment, or queue it on the socket's
+ *	write queue when the window, Nagle-style retransmit rule or the
+ *	congestion window forbids sending now.  The skb already carries
+ *	IP + TCP headers; only ack/window/checksum are filled in here on
+ *	the direct-send path (the queued path leaves that to the write
+ *	timer code).  Frees the skb itself on sanity-check failure.
+ */
+static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int size;
+	struct tcphdr * th = skb->h.th;
+
+	/*
+	 * length of packet (not counting length of pre-tcp headers) 
+	 */
+	 
+	size = skb->len - ((unsigned char *) th - skb->data);
+
+	/*
+	 * Sanity check it.. 
+	 */
+	 
+	if (size < sizeof(struct tcphdr) || size > skb->len) 
+	{
+		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
+			skb, skb->data, th, skb->len);
+		kfree_skb(skb, FREE_WRITE);
+		return;
+	}
+
+	/*
+	 * If we have queued a header size packet.. (these crash a few
+	 * tcp stacks if ack is not set)
+	 */
+	 
+	if (size == sizeof(struct tcphdr)) 
+	{
+		/* If it's got a syn or fin it's notionally included in the size..*/
+		if(!th->syn && !th->fin) 
+		{
+			printk("tcp_send_skb: attempt to queue a bogon.\n");
+			kfree_skb(skb,FREE_WRITE);
+			return;
+		}
+	}
+
+	/*
+	 * Actual processing.
+	 */
+	 
+	tcp_statistics.TcpOutSegs++;  
+	/*
+	 * h.seq becomes the host-order sequence number just past this
+	 * segment's payload (size - 4*doff = data bytes after the header).
+	 */
+	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;
+
+	/*
+	 * We must queue if
+	 *
+	 * a) The right edge of this frame exceeds the window
+	 * b) We are retransmitting (Nagle's rule)
+	 * c) We have too many packets 'in flight'
+	 */
+	 
+	if (after(skb->h.seq, sk->window_seq) ||
+	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
+	     sk->packets_out >= sk->cong_window) 
+	{
+		/* checksum will be supplied by tcp_write_xmit.  So
+		 * we shouldn't need to set it at all.  I'm being paranoid */
+		th->check = 0;
+		if (skb->next != NULL) 
+		{
+			printk("tcp_send_partial: next != NULL\n");
+			skb_unlink(skb);
+		}
+		skb_queue_tail(&sk->write_queue, skb);
+		
+		/*
+		 * If we don't fit we have to start the zero window
+		 * probes. This is broken - we really need to do a partial
+		 * send _first_ (This is what causes the Cisco and PC/TCP
+		 * grief).
+		 */
+		 
+		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
+		    sk->send_head == NULL && sk->ack_backlog == 0)
+			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
+	} 
+	else 
+	{
+		/*
+		 * This is going straight out
+		 */
+		 
+		/* NOTE(review): ntohl/ntohs used where htonl/htons is meant;
+		 * harmless since the 32/16-bit byte-order swaps are their
+		 * own inverses, but htonl/htons would read correctly. */
+		th->ack_seq = ntohl(sk->acked_seq);
+		th->window = ntohs(tcp_select_window(sk));
+
+		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);
+
+		sk->sent_seq = sk->write_seq;
+		
+		/*
+		 * This is mad. The tcp retransmit queue is put together
+		 * by the ip layer. This causes half the problems with
+		 * unroutable FIN's and other things.
+		 */
+		 
+		sk->prot->queue_xmit(sk, skb->dev, skb, 0);
+		
+		/*
+		 * Set for next retransmit based on expected ACK time.
+		 * FIXME: We set this every time which means our 
+		 * retransmits are really about a window behind.
+		 */
+
+		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+	}
+}
+
+/*
+ *	Locking problems lead us to a messy situation where we can have
+ *	multiple partially complete buffers queued up. This is really bad
+ *	as we don't want to be sending partial buffers. Fix this with
+ *	a semaphore or similar to lock tcp_write per socket.
+ *
+ *	These routines are pretty self descriptive.
+ */
+
+/*
+ *	Atomically detach and return the socket's pending partial frame
+ *	(or NULL), cancelling its flush timer.  Interrupts are blocked so
+ *	the timer handler cannot race with us over sk->partial.
+ */
+struct sk_buff * tcp_dequeue_partial(struct sock * sk)
+{
+	struct sk_buff * skb;
+	unsigned long flags;
+
+	save_flags(flags);
+	cli();
+	skb = sk->partial;
+	if (skb) {
+		sk->partial = NULL;
+		del_timer(&sk->partial_timer);
+	}
+	restore_flags(flags);
+	return skb;
+}
+
+/*
+ *	Empty the partial queue: flush any pending partial frame(s) out
+ *	through tcp_send_skb().  Also used as the partial_timer handler
+ *	(cast through the timer's data argument).
+ */
+ 
+static void tcp_send_partial(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	if (sk == NULL)
+		return;
+	while ((skb = tcp_dequeue_partial(sk)) != NULL)
+		tcp_send_skb(sk, skb);
+}
+
+/*
+ *	Queue a partial frame, displacing (and then sending) any frame
+ *	that was already pending.  A timer is armed so the frame is
+ *	flushed even if no more data arrives to fill it.
+ */
+ 
+void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
+{
+	struct sk_buff * tmp;
+	unsigned long flags;
+
+	save_flags(flags);
+	cli();
+	tmp = sk->partial;
+	if (tmp)
+		del_timer(&sk->partial_timer);
+	sk->partial = skb;
+	init_timer(&sk->partial_timer);
+	/*
+	 * Wait up to 1 second for the buffer to fill.
+	 * NOTE(review): expires is set to HZ with no jiffies base, i.e.
+	 * treated as a relative timeout -- assumes this kernel's timer
+	 * layer interprets it that way; confirm against add_timer().
+	 */
+	sk->partial_timer.expires = HZ;
+	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
+	sk->partial_timer.data = (unsigned long) sk;
+	add_timer(&sk->partial_timer);
+	restore_flags(flags);
+	/* Send the displaced frame outside the cli() section. */
+	if (tmp)
+		tcp_send_skb(sk, tmp);
+}
+
+
+/*
+ *	This routine sends an ack and also updates the window.
+ *	'sequence'/'ack' are host-order seq and ack numbers; 'th' is the
+ *	received header whose ports are echoed back swapped.  If no memory
+ *	is available the ACK is deferred by bumping ack_backlog and arming
+ *	the write timer rather than being dropped outright.
+ */
+ 
+static void tcp_send_ack(unsigned long sequence, unsigned long ack,
+	     struct sock *sk,
+	     struct tcphdr *th, unsigned long daddr)
+{
+	struct sk_buff *buff;
+	struct tcphdr *t1;
+	struct device *dev = NULL;
+	int tmp;
+
+	if(sk->zapped)
+		return;		/* We have been reset, we may not send again */
+		
+	/*
+	 * We need to grab some memory, and put together an ack,
+	 * and then put it into the queue to be sent.
+	 */
+
+	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
+	if (buff == NULL) 
+	{
+		/* 
+		 *	Force it to send an ack. We don't have to do this
+		 *	(ACK is unreliable) but it's much better use of 
+		 *	bandwidth on slow links to send a spare ack than
+		 *	resend packets. 
+		 */
+		 
+		sk->ack_backlog++;
+		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) 
+		{
+			reset_xmit_timer(sk, TIME_WRITE, HZ);
+		}
+		return;
+	}
+
+	/*
+	 *	Assemble a suitable TCP frame
+	 */
+	 
+	buff->len = sizeof(struct tcphdr);
+	buff->sk = sk;
+	buff->localroute = sk->localroute;
+	t1 =(struct tcphdr *) buff->data;
+
+	/* 
+	 *	Put in the IP header and routing stuff. 
+	 */
+	 
+	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
+				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
+	if (tmp < 0) 
+	{
+		/* No route: give the buffer back and forget the ACK. */
+  		buff->free = 1;
+		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
+		return;
+	}
+	buff->len += tmp;
+	t1 =(struct tcphdr *)((char *)t1 +tmp);
+
+	memcpy(t1, th, sizeof(*t1));
+
+	/*
+	 *	Swap the send and the receive. 
+	 */
+	 
+	t1->dest = th->source;
+	t1->source = th->dest;
+	/* ntohl/ntohs here are byte-swaps identical to htonl/htons. */
+	t1->seq = ntohl(sequence);
+	t1->ack = 1;
+	sk->window = tcp_select_window(sk);
+	t1->window = ntohs(sk->window);
+	t1->res1 = 0;
+	t1->res2 = 0;
+	t1->rst = 0;
+	t1->urg = 0;
+	t1->syn = 0;
+	t1->psh = 0;
+	t1->fin = 0;
+	
+	/*
+	 *	If we have nothing queued for transmit and the transmit timer
+	 *	is on we are just doing an ACK timeout and need to switch
+	 *	to a keepalive.
+	 */
+	 
+	if (ack == sk->acked_seq) 
+	{
+		sk->ack_backlog = 0;
+		sk->bytes_rcv = 0;
+		sk->ack_timed = 0;
+		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
+				  && sk->ip_xmit_timeout == TIME_WRITE) 
+		{
+			if(sk->keepopen) {
+				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
+			} else {
+				delete_timer(sk);
+			}
+		}
+  	}
+  	
+	/*
+	 *	Fill in the packet and send it
+	 */
+	 
+	t1->ack_seq = ntohl(ack);
+	t1->doff = sizeof(*t1)/4;
+	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
+	if (sk->debug)
+		 printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
+	tcp_statistics.TcpOutSegs++;
+	sk->prot->queue_xmit(sk, dev, buff, 1);
+}
+
+
+/*
+ *	This routine builds a generic TCP header. 
+ */ + +extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push) +{ + + memcpy(th,(void *) &(sk->dummy_th), sizeof(*th)); + th->seq = htonl(sk->write_seq); + th->psh =(push == 0) ? 1 : 0; + th->doff = sizeof(*th)/4; + th->ack = 1; + th->fin = 0; + sk->ack_backlog = 0; + sk->bytes_rcv = 0; + sk->ack_timed = 0; + th->ack_seq = htonl(sk->acked_seq); + sk->window = tcp_select_window(sk); + th->window = htons(sk->window); + + return(sizeof(*th)); +} + +/* + * This routine copies from a user buffer into a socket, + * and starts the transmit system. + */ + +static int tcp_write(struct sock *sk, unsigned char *from, + int len, int nonblock, unsigned flags) +{ + int copied = 0; + int copy; + int tmp; + struct sk_buff *skb; + struct sk_buff *send_tmp; + unsigned char *buff; + struct proto *prot; + struct device *dev = NULL; + + sk->inuse=1; + prot = sk->prot; + while(len > 0) + { + if (sk->err) + { /* Stop on an error */ + release_sock(sk); + if (copied) + return(copied); + tmp = -sk->err; + sk->err = 0; + return(tmp); + } + + /* + * First thing we do is make sure that we are established. + */ + + if (sk->shutdown & SEND_SHUTDOWN) + { + release_sock(sk); + sk->err = EPIPE; + if (copied) + return(copied); + sk->err = 0; + return(-EPIPE); + } + + /* + * Wait for a connection to finish. 
+ */ + + while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) + { + if (sk->err) + { + release_sock(sk); + if (copied) + return(copied); + tmp = -sk->err; + sk->err = 0; + return(tmp); + } + + if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) + { + release_sock(sk); + if (copied) + return(copied); + + if (sk->err) + { + tmp = -sk->err; + sk->err = 0; + return(tmp); + } + + if (sk->keepopen) + { + send_sig(SIGPIPE, current, 0); + } + return(-EPIPE); + } + + if (nonblock || copied) + { + release_sock(sk); + if (copied) + return(copied); + return(-EAGAIN); + } + + release_sock(sk); + cli(); + + if (sk->state != TCP_ESTABLISHED && + sk->state != TCP_CLOSE_WAIT && sk->err == 0) + { + interruptible_sleep_on(sk->sleep); + if (current->signal & ~current->blocked) + { + sti(); + if (copied) + return(copied); + return(-ERESTARTSYS); + } + } + sk->inuse = 1; + sti(); + } + + /* + * The following code can result in copy <= if sk->mss is ever + * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window). + * sk->mtu is constant once SYN processing is finished. I.e. we + * had better not get here until we've seen his SYN and at least one + * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.) + * But ESTABLISHED should guarantee that. sk->max_window is by definition + * non-decreasing. Note that any ioctl to set user_mss must be done + * before the exchange of SYN's. If the initial ack from the other + * end has a window of 0, max_window and thus mss will both be 0. + */ + + /* + * Now we need to check if we have a half built packet. + */ + + if ((skb = tcp_dequeue_partial(sk)) != NULL) + { + int hdrlen; + + /* IP header + TCP header */ + hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data) + + sizeof(struct tcphdr); + + /* Add more stuff to the end of skb->len */ + if (!(flags & MSG_OOB)) + { + copy = min(sk->mss - (skb->len - hdrlen), len); + /* FIXME: this is really a bug. 
*/ + if (copy <= 0) + { + printk("TCP: **bug**: \"copy\" <= 0!!\n"); + copy = 0; + } + + memcpy_fromfs(skb->data + skb->len, from, copy); + skb->len += copy; + from += copy; + copied += copy; + len -= copy; + sk->write_seq += copy; + } + if ((skb->len - hdrlen) >= sk->mss || + (flags & MSG_OOB) || !sk->packets_out) + tcp_send_skb(sk, skb); + else + tcp_enqueue_partial(skb, sk); + continue; + } + + /* + * We also need to worry about the window. + * If window < 1/2 the maximum window we've seen from this + * host, don't use it. This is sender side + * silly window prevention, as specified in RFC1122. + * (Note that this is different than earlier versions of + * SWS prevention, e.g. RFC813.). What we actually do is + * use the whole MSS. Since the results in the right + * edge of the packet being outside the window, it will + * be queued for later rather than sent. + */ + + copy = sk->window_seq - sk->write_seq; + if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss) + copy = sk->mss; + if (copy > len) + copy = len; + + /* + * We should really check the window here also. + */ + + send_tmp = NULL; + if (copy < sk->mss && !(flags & MSG_OOB)) + { + /* + * We will release the socket in case we sleep here. + */ + release_sock(sk); + /* + * NB: following must be mtu, because mss can be increased. + * mss is always <= mtu + */ + skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL); + sk->inuse = 1; + send_tmp = skb; + } + else + { + /* + * We will release the socket in case we sleep here. + */ + release_sock(sk); + skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL); + sk->inuse = 1; + } + + /* + * If we didn't get any memory, we need to sleep. + */ + + if (skb == NULL) + { + sk->socket->flags |= SO_NOSPACE; + if (nonblock) + { + release_sock(sk); + if (copied) + return(copied); + return(-EAGAIN); + } + + /* + * FIXME: here is another race condition. 
+ */ + + tmp = sk->wmem_alloc; + release_sock(sk); + cli(); + /* + * Again we will try to avoid it. + */ + if (tmp <= sk->wmem_alloc && + (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT) + && sk->err == 0) + { + sk->socket->flags &= ~SO_NOSPACE; + interruptible_sleep_on(sk->sleep); + if (current->signal & ~current->blocked) + { + sti(); + if (copied) + return(copied); + return(-ERESTARTSYS); + } + } + sk->inuse = 1; + sti(); + continue; + } + + skb->len = 0; + skb->sk = sk; + skb->free = 0; + skb->localroute = sk->localroute|(flags&MSG_DONTROUTE); + + buff = skb->data; + + /* + * FIXME: we need to optimize this. + * Perhaps some hints here would be good. + */ + + tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev, + IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl); + if (tmp < 0 ) + { + prot->wfree(sk, skb->mem_addr, skb->mem_len); + release_sock(sk); + if (copied) + return(copied); + return(tmp); + } + skb->len += tmp; + skb->dev = dev; + buff += tmp; + skb->h.th =(struct tcphdr *) buff; + tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy); + if (tmp < 0) + { + prot->wfree(sk, skb->mem_addr, skb->mem_len); + release_sock(sk); + if (copied) + return(copied); + return(tmp); + } + + if (flags & MSG_OOB) + { + ((struct tcphdr *)buff)->urg = 1; + ((struct tcphdr *)buff)->urg_ptr = ntohs(copy); + } + skb->len += tmp; + memcpy_fromfs(buff+tmp, from, copy); + + from += copy; + copied += copy; + len -= copy; + skb->len += copy; + skb->free = 0; + sk->write_seq += copy; + + if (send_tmp != NULL && sk->packets_out) + { + tcp_enqueue_partial(send_tmp, sk); + continue; + } + tcp_send_skb(sk, skb); + } + sk->err = 0; + +/* + * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly + * interactive fast network servers. 
It's meant to be on and + * it really improves the throughput though not the echo time + * on my slow slip link - Alan + */ + +/* + * Avoid possible race on send_tmp - c/o Johannes Stille + */ + + if(sk->partial && ((!sk->packets_out) + /* If not nagling we can send on the before case too.. */ + || (sk->nonagle && before(sk->write_seq , sk->window_seq)) + )) + tcp_send_partial(sk); + + release_sock(sk); + return(copied); +} + +/* + * This is just a wrapper. + */ + +static int tcp_sendto(struct sock *sk, unsigned char *from, + int len, int nonblock, unsigned flags, + struct sockaddr_in *addr, int addr_len) +{ + if (flags & ~(MSG_OOB|MSG_DONTROUTE)) + return -EINVAL; + if (sk->state == TCP_CLOSE) + return -ENOTCONN; + if (addr_len < sizeof(*addr)) + return -EINVAL; + if (addr->sin_family && addr->sin_family != AF_INET) + return -EINVAL; + if (addr->sin_port != sk->dummy_th.dest) + return -EISCONN; + if (addr->sin_addr.s_addr != sk->daddr) + return -EISCONN; + return tcp_write(sk, from, len, nonblock, flags); +} + + +/* + * Send an ack if one is backlogged at this point. Ought to merge + * this with tcp_send_ack(). + */ + +static void tcp_read_wakeup(struct sock *sk) +{ + int tmp; + struct device *dev = NULL; + struct tcphdr *t1; + struct sk_buff *buff; + + if (!sk->ack_backlog) + return; + + /* + * FIXME: we need to put code here to prevent this routine from + * being called. Being called once in a while is ok, so only check + * if this is the second time in a row. + */ + + /* + * We need to grab some memory, and put together an ack, + * and then put it into the queue to be sent. + */ + + buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC); + if (buff == NULL) + { + /* Try again real soon. */ + reset_xmit_timer(sk, TIME_WRITE, HZ); + return; + } + + buff->len = sizeof(struct tcphdr); + buff->sk = sk; + buff->localroute = sk->localroute; + + /* + * Put in the IP header and routing stuff. 
+ */ + + tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev, + IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl); + if (tmp < 0) + { + buff->free = 1; + sk->prot->wfree(sk, buff->mem_addr, buff->mem_len); + return; + } + + buff->len += tmp; + t1 =(struct tcphdr *)(buff->data +tmp); + + memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1)); + t1->seq = htonl(sk->sent_seq); + t1->ack = 1; + t1->res1 = 0; + t1->res2 = 0; + t1->rst = 0; + t1->urg = 0; + t1->syn = 0; + t1->psh = 0; + sk->ack_backlog = 0; + sk->bytes_rcv = 0; + sk->window = tcp_select_window(sk); + t1->window = ntohs(sk->window); + t1->ack_seq = ntohl(sk->acked_seq); + t1->doff = sizeof(*t1)/4; + tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk); + sk->prot->queue_xmit(sk, dev, buff, 1); + tcp_statistics.TcpOutSegs++; +} + + +/* + * FIXME: + * This routine frees used buffers. + * It should consider sending an ACK to let the + * other end know we now have a bigger window. + */ + +static void cleanup_rbuf(struct sock *sk) +{ + unsigned long flags; + unsigned long left; + struct sk_buff *skb; + unsigned long rspace; + + if(sk->debug) + printk("cleaning rbuf for sk=%p\n", sk); + + save_flags(flags); + cli(); + + left = sk->prot->rspace(sk); + + /* + * We have to loop through all the buffer headers, + * and try to free up all the space we can. + */ + + while((skb=skb_peek(&sk->receive_queue)) != NULL) + { + if (!skb->used || skb->users) + break; + skb_unlink(skb); + skb->sk = sk; + kfree_skb(skb, FREE_READ); + } + + restore_flags(flags); + + /* + * FIXME: + * At this point we should send an ack if the difference + * in the window, and the amount of space is bigger than + * TCP_WINDOW_DIFF. + */ + + if(sk->debug) + printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk), + left); + if ((rspace=sk->prot->rspace(sk)) != left) + { + /* + * This area has caused the most trouble. 
The current strategy + * is to simply do nothing if the other end has room to send at + * least 3 full packets, because the ack from those will auto- + * matically update the window. If the other end doesn't think + * we have much space left, but we have room for at least 1 more + * complete packet than it thinks we do, we will send an ack + * immediately. Otherwise we will wait up to .5 seconds in case + * the user reads some more. + */ + sk->ack_backlog++; + /* + * It's unclear whether to use sk->mtu or sk->mss here. They differ only + * if the other end is offering a window smaller than the agreed on MSS + * (called sk->mtu here). In theory there's no connection between send + * and receive, and so no reason to think that they're going to send + * small packets. For the moment I'm using the hack of reducing the mss + * only on the send side, so I'm putting mtu here. + */ + + if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) + { + /* Send an ack right now. */ + tcp_read_wakeup(sk); + } + else + { + /* Force it to send an ack soon. */ + int was_active = del_timer(&sk->retransmit_timer); + if (!was_active || TCP_ACK_TIME < sk->timer.expires) + { + reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME); + } + else + add_timer(&sk->retransmit_timer); + } + } +} + + +/* + * Handle reading urgent data. BSD has very simple semantics for + * this, no blocking and very strange errors 8) + */ + +static int tcp_read_urg(struct sock * sk, int nonblock, + unsigned char *to, int len, unsigned flags) +{ + /* + * No URG data to read + */ + if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ) + return -EINVAL; /* Yes this is right ! 
*/ + + if (sk->err) + { + int tmp = -sk->err; + sk->err = 0; + return tmp; + } + + if (sk->state == TCP_CLOSE || sk->done) + { + if (!sk->done) { + sk->done = 1; + return 0; + } + return -ENOTCONN; + } + + if (sk->shutdown & RCV_SHUTDOWN) + { + sk->done = 1; + return 0; + } + sk->inuse = 1; + if (sk->urg_data & URG_VALID) + { + char c = sk->urg_data; + if (!(flags & MSG_PEEK)) + sk->urg_data = URG_READ; + put_fs_byte(c, to); + release_sock(sk); + return 1; + } + release_sock(sk); + + /* + * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and + * the available implementations agree in this case: + * this call should never block, independent of the + * blocking state of the socket. + * Mike + */ + return -EAGAIN; +} + + +/* + * This routine copies from a sock struct into the user buffer. + */ + +static int tcp_read(struct sock *sk, unsigned char *to, + int len, int nonblock, unsigned flags) +{ + struct wait_queue wait = { current, NULL }; + int copied = 0; + unsigned long peek_seq; + volatile unsigned long *seq; /* So gcc doesn't overoptimise */ + unsigned long used; + + /* + * This error should be checked. + */ + + if (sk->state == TCP_LISTEN) + return -ENOTCONN; + + /* + * Urgent data needs to be handled specially. + */ + + if (flags & MSG_OOB) + return tcp_read_urg(sk, nonblock, to, len, flags); + + /* + * Copying sequence to update. This is volatile to handle + * the multi-reader case neatly (memcpy_to/fromfs might be + * inline and thus not flush cached variables otherwise). + */ + + peek_seq = sk->copied_seq; + seq = &sk->copied_seq; + if (flags & MSG_PEEK) + seq = &peek_seq; + + add_wait_queue(sk->sleep, &wait); + sk->inuse = 1; + while (len > 0) + { + struct sk_buff * skb; + unsigned long offset; + + /* + * Are we at urgent data? Stop if we have read anything. + */ + + if (copied && sk->urg_data && sk->urg_seq == *seq) + break; + + /* + * Next get a buffer. 
+ */ + + current->state = TASK_INTERRUPTIBLE; + + skb = skb_peek(&sk->receive_queue); + do + { + if (!skb) + break; + if (before(*seq, skb->h.th->seq)) + break; + offset = *seq - skb->h.th->seq; + if (skb->h.th->syn) + offset--; + if (offset < skb->len) + goto found_ok_skb; + if (skb->h.th->fin) + goto found_fin_ok; + if (!(flags & MSG_PEEK)) + skb->used = 1; + skb = skb->next; + } + while (skb != (struct sk_buff *)&sk->receive_queue); + + if (copied) + break; + + if (sk->err) + { + copied = -sk->err; + sk->err = 0; + break; + } + + if (sk->state == TCP_CLOSE) + { + if (!sk->done) + { + sk->done = 1; + break; + } + copied = -ENOTCONN; + break; + } + + if (sk->shutdown & RCV_SHUTDOWN) + { + sk->done = 1; + break; + } + + if (nonblock) + { + copied = -EAGAIN; + break; + } + + cleanup_rbuf(sk); + release_sock(sk); + sk->socket->flags |= SO_WAITDATA; + schedule(); + sk->socket->flags &= ~SO_WAITDATA; + sk->inuse = 1; + + if (current->signal & ~current->blocked) + { + copied = -ERESTARTSYS; + break; + } + continue; + + found_ok_skb: + /* + * Lock the buffer. We can be fairly relaxed as + * an interrupt will never steal a buffer we are + * using unless I've missed something serious in + * tcp_data. + */ + + skb->users++; + + /* + * Ok so how much can we use ? + */ + + used = skb->len - offset; + if (len < used) + used = len; + /* + * Do we have urgent data here? + */ + + if (sk->urg_data) + { + unsigned long urg_offset = sk->urg_seq - *seq; + if (urg_offset < used) + { + if (!urg_offset) + { + if (!sk->urginline) + { + ++*seq; + offset++; + used--; + } + } + else + used = urg_offset; + } + } + + /* + * Copy it - We _MUST_ update *seq first so that we + * don't ever double read when we have dual readers + */ + + *seq += used; + + /* + * This memcpy_tofs can sleep. If it sleeps and we + * do a second read it relies on the skb->users to avoid + * a crash when cleanup_rbuf() gets called. 
+ */ + + memcpy_tofs(to,((unsigned char *)skb->h.th) + + skb->h.th->doff*4 + offset, used); + copied += used; + len -= used; + to += used; + + /* + * We now will not sleep again until we are finished + * with skb. Sorry if you are doing the SMP port + * but you'll just have to fix it neatly ;) + */ + + skb->users --; + + if (after(sk->copied_seq,sk->urg_seq)) + sk->urg_data = 0; + if (used + offset < skb->len) + continue; + + /* + * Process the FIN. + */ + + if (skb->h.th->fin) + goto found_fin_ok; + if (flags & MSG_PEEK) + continue; + skb->used = 1; + continue; + + found_fin_ok: + ++*seq; + if (flags & MSG_PEEK) + break; + + /* + * All is done + */ + + skb->used = 1; + sk->shutdown |= RCV_SHUTDOWN; + break; + + } + remove_wait_queue(sk->sleep, &wait); + current->state = TASK_RUNNING; + + /* Clean up data we have read: This will do ACK frames */ + cleanup_rbuf(sk); + release_sock(sk); + return copied; +} + +/* + * State processing on a close. This implements the state shift for + * sending our FIN frame. Note that we only send a FIN for some + * states. A shutdown() may have already sent the FIN, or we may be + * closed. + */ + +static int tcp_close_state(struct sock *sk, int dead) +{ + int ns=TCP_CLOSE; + int send_fin=0; + switch(sk->state) + { + case TCP_SYN_SENT: /* No SYN back, no FIN needed */ + break; + case TCP_SYN_RECV: + case TCP_ESTABLISHED: /* Closedown begin */ + ns=TCP_FIN_WAIT1; + send_fin=1; + break; + case TCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */ + case TCP_FIN_WAIT2: + case TCP_CLOSING: + ns=sk->state; + break; + case TCP_CLOSE: + case TCP_LISTEN: + break; + case TCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and + wait only for the ACK */ + ns=TCP_LAST_ACK; + send_fin=1; + } + + tcp_set_state(sk,ns); + + /* + * This is a (useful) BSD violating of the RFC. There is a + * problem with TCP as specified in that the other end could + * keep a socket open forever with no application left this end. 
+ * We use a 3 minute timeout (about the same as BSD) then kill + * our end. If they send after that then tough - BUT: long enough + * that we won't make the old 4*rto = almost no time - whoops + * reset mistake. + */ + if(dead && ns==TCP_FIN_WAIT2) + { + int timer_active=del_timer(&sk->timer); + if(timer_active) + add_timer(&sk->timer); + else + reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT); + } + + return send_fin; +} + +/* + * Send a fin. + */ + +static void tcp_send_fin(struct sock *sk) +{ + struct proto *prot =(struct proto *)sk->prot; + struct tcphdr *th =(struct tcphdr *)&sk->dummy_th; + struct tcphdr *t1; + struct sk_buff *buff; + struct device *dev=NULL; + int tmp; + + release_sock(sk); /* in case the malloc sleeps. */ + + buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL); + sk->inuse = 1; + + if (buff == NULL) + { + /* This is a disaster if it occurs */ + printk("tcp_send_fin: Impossible malloc failure"); + return; + } + + /* + * Administrivia + */ + + buff->sk = sk; + buff->len = sizeof(*t1); + buff->localroute = sk->localroute; + t1 =(struct tcphdr *) buff->data; + + /* + * Put in the IP header and routing stuff. + */ + + tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev, + IPPROTO_TCP, sk->opt, + sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl); + if (tmp < 0) + { + int t; + /* + * Finish anyway, treat this as a send that got lost. + * (Not good). + */ + + buff->free = 1; + prot->wfree(sk,buff->mem_addr, buff->mem_len); + sk->write_seq++; + t=del_timer(&sk->timer); + if(t) + add_timer(&sk->timer); + else + reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); + return; + } + + /* + * We ought to check if the end of the queue is a buffer and + * if so simply add the fin to that buffer, not send it ahead. 
+ */ + + t1 =(struct tcphdr *)((char *)t1 +tmp); + buff->len += tmp; + buff->dev = dev; + memcpy(t1, th, sizeof(*t1)); + t1->seq = ntohl(sk->write_seq); + sk->write_seq++; + buff->h.seq = sk->write_seq; + t1->ack = 1; + t1->ack_seq = ntohl(sk->acked_seq); + t1->window = ntohs(sk->window=tcp_select_window(sk)); + t1->fin = 1; + t1->rst = 0; + t1->doff = sizeof(*t1)/4; + tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk); + + /* + * If there is data in the write queue, the fin must be appended to + * the write queue. + */ + + if (skb_peek(&sk->write_queue) != NULL) + { + buff->free = 0; + if (buff->next != NULL) + { + printk("tcp_send_fin: next != NULL\n"); + skb_unlink(buff); + } + skb_queue_tail(&sk->write_queue, buff); + } + else + { + sk->sent_seq = sk->write_seq; + sk->prot->queue_xmit(sk, dev, buff, 0); + reset_xmit_timer(sk, TIME_WRITE, sk->rto); + } +} + +/* + * Shutdown the sending side of a connection. Much like close except + * that we don't receive shut down or set sk->dead=1. + */ + +void tcp_shutdown(struct sock *sk, int how) +{ + /* + * We need to grab some memory, and put together a FIN, + * and then put it into the queue to be sent. + * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. + */ + + if (!(how & SEND_SHUTDOWN)) + return; + + /* + * If we've already sent a FIN, or it's a closed state + */ + + if (sk->state == TCP_FIN_WAIT1 || + sk->state == TCP_FIN_WAIT2 || + sk->state == TCP_CLOSING || + sk->state == TCP_LAST_ACK || + sk->state == TCP_TIME_WAIT || + sk->state == TCP_CLOSE || + sk->state == TCP_LISTEN + ) + { + return; + } + sk->inuse = 1; + + /* + * flag that the sender has shutdown + */ + + sk->shutdown |= SEND_SHUTDOWN; + + /* + * Clear out any half completed packets. 
+ */ + + if (sk->partial) + tcp_send_partial(sk); + + /* + * FIN if needed + */ + + if(tcp_close_state(sk,0)) + tcp_send_fin(sk); + + release_sock(sk); +} + + +static int +tcp_recvfrom(struct sock *sk, unsigned char *to, + int to_len, int nonblock, unsigned flags, + struct sockaddr_in *addr, int *addr_len) +{ + int result; + + /* + * Have to check these first unlike the old code. If + * we check them after we lose data on an error + * which is wrong + */ + + if(addr_len) + *addr_len = sizeof(*addr); + result=tcp_read(sk, to, to_len, nonblock, flags); + + if (result < 0) + return(result); + + if(addr) + { + addr->sin_family = AF_INET; + addr->sin_port = sk->dummy_th.dest; + addr->sin_addr.s_addr = sk->daddr; + } + return(result); +} + + +/* + * This routine will send an RST to the other tcp. + */ + +static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th, + struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl) +{ + struct sk_buff *buff; + struct tcphdr *t1; + int tmp; + struct device *ndev=NULL; + + /* + * Cannot reset a reset (Think about it). + */ + + if(th->rst) + return; + + /* + * We need to grab some memory, and put together an RST, + * and then put it into the queue to be sent. + */ + + buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC); + if (buff == NULL) + return; + + buff->len = sizeof(*t1); + buff->sk = NULL; + buff->dev = dev; + buff->localroute = 0; + + t1 =(struct tcphdr *) buff->data; + + /* + * Put in the IP header and routing stuff. + */ + + tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt, + sizeof(struct tcphdr),tos,ttl); + if (tmp < 0) + { + buff->free = 1; + prot->wfree(NULL, buff->mem_addr, buff->mem_len); + return; + } + + t1 =(struct tcphdr *)((char *)t1 +tmp); + buff->len += tmp; + memcpy(t1, th, sizeof(*t1)); + + /* + * Swap the send and the receive. 
+ */ + + t1->dest = th->source; + t1->source = th->dest; + t1->rst = 1; + t1->window = 0; + + if(th->ack) + { + t1->ack = 0; + t1->seq = th->ack_seq; + t1->ack_seq = 0; + } + else + { + t1->ack = 1; + if(!th->syn) + t1->ack_seq=htonl(th->seq); + else + t1->ack_seq=htonl(th->seq+1); + t1->seq=0; + } + + t1->syn = 0; + t1->urg = 0; + t1->fin = 0; + t1->psh = 0; + t1->doff = sizeof(*t1)/4; + tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL); + prot->queue_xmit(NULL, ndev, buff, 1); + tcp_statistics.TcpOutSegs++; +} + + +/* + * Look for tcp options. Parses everything but only knows about MSS. + * This routine is always called with the packet containing the SYN. + * However it may also be called with the ack to the SYN. So you + * can't assume this is always the SYN. It's always called after + * we have set up sk->mtu to our own MTU. + * + * We need at minimum to add PAWS support here. Possibly large windows + * as Linux gets deployed on 100Mb/sec networks. + */ + +static void tcp_options(struct sock *sk, struct tcphdr *th) +{ + unsigned char *ptr; + int length=(th->doff*4)-sizeof(struct tcphdr); + int mss_seen = 0; + + ptr = (unsigned char *)(th + 1); + + while(length>0) + { + int opcode=*ptr++; + int opsize=*ptr++; + switch(opcode) + { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + ptr--; /* the opsize=*ptr++ above was a mistake */ + continue; + + default: + if(opsize<=2) /* Avoid silly options looping forever */ + return; + switch(opcode) + { + case TCPOPT_MSS: + if(opsize==4 && th->syn) + { + sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr)); + mss_seen = 1; + } + break; + /* Add other options here as people feel the urge to implement stuff like large windows */ + } + ptr+=opsize-2; + length-=opsize; + } + } + if (th->syn) + { + if (! 
mss_seen) + sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */ + } +#ifdef CONFIG_INET_PCTCP + sk->mss = min(sk->max_window >> 1, sk->mtu); +#else + sk->mss = min(sk->max_window, sk->mtu); +#endif +} + +static inline unsigned long default_mask(unsigned long dst) +{ + dst = ntohl(dst); + if (IN_CLASSA(dst)) + return htonl(IN_CLASSA_NET); + if (IN_CLASSB(dst)) + return htonl(IN_CLASSB_NET); + return htonl(IN_CLASSC_NET); +} + +/* + * Default sequence number picking algorithm. + * As close as possible to RFC 793, which + * suggests using a 250kHz clock. + * Further reading shows this assumes 2MB/s networks. + * For 10MB/s ethernet, a 1MHz clock is appropriate. + * That's funny, Linux has one built in! Use it! + */ + +extern inline unsigned long tcp_init_seq(void) +{ + struct timeval tv; + do_gettimeofday(&tv); + return tv.tv_usec+tv.tv_sec*1000000; +} + +/* + * This routine handles a connection request. + * It should make sure we haven't already responded. + * Because of the way BSD works, we have to send a syn/ack now. + * This also means it will be harder to close a socket which is + * listening. + */ + +static void tcp_conn_request(struct sock *sk, struct sk_buff *skb, + unsigned long daddr, unsigned long saddr, + struct options *opt, struct device *dev, unsigned long seq) +{ + struct sk_buff *buff; + struct tcphdr *t1; + unsigned char *ptr; + struct sock *newsk; + struct tcphdr *th; + struct device *ndev=NULL; + int tmp; + struct rtable *rt; + + th = skb->h.th; + + /* If the socket is dead, don't accept the connection. */ + if (!sk->dead) + { + sk->data_ready(sk,0); + } + else + { + if(sk->debug) + printk("Reset on %p: Connect on dead socket.\n",sk); + tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl); + tcp_statistics.TcpAttemptFails++; + kfree_skb(skb, FREE_READ); + return; + } + + /* + * Make sure we can accept more. This will prevent a + * flurry of syns from eating up all our memory. 
+ */ + + if (sk->ack_backlog >= sk->max_ack_backlog) + { + tcp_statistics.TcpAttemptFails++; + kfree_skb(skb, FREE_READ); + return; + } + + /* + * We need to build a new sock struct. + * It is sort of bad to have a socket without an inode attached + * to it, but the wake_up's will just wake up the listening socket, + * and if the listening socket is destroyed before this is taken + * off of the queue, this will take care of it. + */ + + newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC); + if (newsk == NULL) + { + /* just ignore the syn. It will get retransmitted. */ + tcp_statistics.TcpAttemptFails++; + kfree_skb(skb, FREE_READ); + return; + } + + memcpy(newsk, sk, sizeof(*newsk)); + skb_queue_head_init(&newsk->write_queue); + skb_queue_head_init(&newsk->receive_queue); + newsk->send_head = NULL; + newsk->send_tail = NULL; + skb_queue_head_init(&newsk->back_log); + newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/ + newsk->rto = TCP_TIMEOUT_INIT; + newsk->mdev = 0; + newsk->max_window = 0; + newsk->cong_window = 1; + newsk->cong_count = 0; + newsk->ssthresh = 0; + newsk->backoff = 0; + newsk->blog = 0; + newsk->intr = 0; + newsk->proc = 0; + newsk->done = 0; + newsk->partial = NULL; + newsk->pair = NULL; + newsk->wmem_alloc = 0; + newsk->rmem_alloc = 0; + newsk->localroute = sk->localroute; + + newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF; + + newsk->err = 0; + newsk->shutdown = 0; + newsk->ack_backlog = 0; + newsk->acked_seq = skb->h.th->seq+1; + newsk->copied_seq = skb->h.th->seq+1; + newsk->fin_seq = skb->h.th->seq; + newsk->state = TCP_SYN_RECV; + newsk->timeout = 0; + newsk->ip_xmit_timeout = 0; + newsk->write_seq = seq; + newsk->window_seq = newsk->write_seq; + newsk->rcv_ack_seq = newsk->write_seq; + newsk->urg_data = 0; + newsk->retransmits = 0; + newsk->linger=0; + newsk->destroy = 0; + init_timer(&newsk->timer); + newsk->timer.data = (unsigned long)newsk; + newsk->timer.function = &net_timer; + init_timer(&newsk->retransmit_timer); + 
newsk->retransmit_timer.data = (unsigned long)newsk; + newsk->retransmit_timer.function=&retransmit_timer; + newsk->dummy_th.source = skb->h.th->dest; + newsk->dummy_th.dest = skb->h.th->source; + + /* + * Swap these two, they are from our point of view. + */ + + newsk->daddr = saddr; + newsk->saddr = daddr; + + put_sock(newsk->num,newsk); + newsk->dummy_th.res1 = 0; + newsk->dummy_th.doff = 6; + newsk->dummy_th.fin = 0; + newsk->dummy_th.syn = 0; + newsk->dummy_th.rst = 0; + newsk->dummy_th.psh = 0; + newsk->dummy_th.ack = 0; + newsk->dummy_th.urg = 0; + newsk->dummy_th.res2 = 0; + newsk->acked_seq = skb->h.th->seq + 1; + newsk->copied_seq = skb->h.th->seq + 1; + newsk->socket = NULL; + + /* + * Grab the ttl and tos values and use them + */ + + newsk->ip_ttl=sk->ip_ttl; + newsk->ip_tos=skb->ip_hdr->tos; + + /* + * Use 512 or whatever user asked for + */ + + /* + * Note use of sk->user_mss, since user has no direct access to newsk + */ + + rt=ip_rt_route(saddr, NULL,NULL); + + if(rt!=NULL && (rt->rt_flags&RTF_WINDOW)) + newsk->window_clamp = rt->rt_window; + else + newsk->window_clamp = 0; + + if (sk->user_mss) + newsk->mtu = sk->user_mss; + else if(rt!=NULL && (rt->rt_flags&RTF_MSS)) + newsk->mtu = rt->rt_mss - HEADER_SIZE; + else + { +#ifdef CONFIG_INET_SNARL /* Sub Nets Are Local */ + if ((saddr ^ daddr) & default_mask(saddr)) +#else + if ((saddr ^ daddr) & dev->pa_mask) +#endif + newsk->mtu = 576 - HEADER_SIZE; + else + newsk->mtu = MAX_WINDOW; + } + + /* + * But not bigger than device MTU + */ + + newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE); + + /* + * This will min with what arrived in the packet + */ + + tcp_options(newsk,skb->h.th); + + buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC); + if (buff == NULL) + { + sk->err = -ENOMEM; + newsk->dead = 1; + newsk->state = TCP_CLOSE; + /* And this will destroy it */ + release_sock(newsk); + kfree_skb(skb, FREE_READ); + tcp_statistics.TcpAttemptFails++; + return; + } + + buff->len = 
sizeof(struct tcphdr)+4; + buff->sk = newsk; + buff->localroute = newsk->localroute; + + t1 =(struct tcphdr *) buff->data; + + /* + * Put in the IP header and routing stuff. + */ + + tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev, + IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl); + + /* + * Something went wrong. + */ + + if (tmp < 0) + { + sk->err = tmp; + buff->free = 1; + kfree_skb(buff,FREE_WRITE); + newsk->dead = 1; + newsk->state = TCP_CLOSE; + release_sock(newsk); + skb->sk = sk; + kfree_skb(skb, FREE_READ); + tcp_statistics.TcpAttemptFails++; + return; + } + + buff->len += tmp; + t1 =(struct tcphdr *)((char *)t1 +tmp); + + memcpy(t1, skb->h.th, sizeof(*t1)); + buff->h.seq = newsk->write_seq; + /* + * Swap the send and the receive. + */ + t1->dest = skb->h.th->source; + t1->source = newsk->dummy_th.source; + t1->seq = ntohl(newsk->write_seq++); + t1->ack = 1; + newsk->window = tcp_select_window(newsk); + newsk->sent_seq = newsk->write_seq; + t1->window = ntohs(newsk->window); + t1->res1 = 0; + t1->res2 = 0; + t1->rst = 0; + t1->urg = 0; + t1->psh = 0; + t1->syn = 1; + t1->ack_seq = ntohl(skb->h.th->seq+1); + t1->doff = sizeof(*t1)/4+1; + ptr =(unsigned char *)(t1+1); + ptr[0] = 2; + ptr[1] = 4; + ptr[2] = ((newsk->mtu) >> 8) & 0xff; + ptr[3] =(newsk->mtu) & 0xff; + + tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk); + newsk->prot->queue_xmit(newsk, ndev, buff, 0); + reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT); + skb->sk = newsk; + + /* + * Charge the sock_buff to newsk. + */ + + sk->rmem_alloc -= skb->mem_len; + newsk->rmem_alloc += skb->mem_len; + + skb_queue_tail(&sk->receive_queue,skb); + sk->ack_backlog++; + release_sock(newsk); + tcp_statistics.TcpOutSegs++; +} + + +static void tcp_close(struct sock *sk, int timeout) +{ + /* + * We need to grab some memory, and put together a FIN, + * and then put it into the queue to be sent. 
 */

	sk->inuse = 1;

	if(sk->state == TCP_LISTEN)
	{
		/* Special case: a listener drops straight to CLOSE and
		   discards any not-yet-accepted connections. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		return;
	}

	sk->keepopen = 1;
	sk->shutdown = SHUTDOWN_MASK;

	if (!sk->dead)
		sk->state_change(sk);

	if (timeout == 0)
	{
		struct sk_buff *skb;

		/*
		 *  We need to flush the recv. buffs.  We do this only on the
		 *  descriptor close, not protocol-sourced closes, because the
		 *  reader process may not have drained the data yet!
		 */

		while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
			kfree_skb(skb, FREE_READ);
		/*
		 *	Get rid of any half-completed packets.
		 */

		if (sk->partial)
			tcp_send_partial(sk);
	}


	/*
	 *	Timeout is not the same thing - however the code likes
	 *	to send both the same way (sigh).
	 */

	if(timeout)
	{
		tcp_set_state(sk, TCP_CLOSE);	/* Dead */
	}
	else
	{
		/* Walk the close state machine; if it says a FIN is now
		   owed to the peer, send it. */
		if(tcp_close_state(sk,1)==1)
		{
			tcp_send_fin(sk);
		}
	}
	release_sock(sk);
}


/*
 *	This routine takes stuff off of the write queue,
 *	and puts it in the xmit queue. This happens as incoming acks
 *	open up the remote window for us.
 */

static void tcp_write_xmit(struct sock *sk)
{
	struct sk_buff *skb;

	/*
	 *	The bytes will have to remain here. In time closedown will
	 *	empty the write queue and all will be happy
	 */

	if(sk->zapped)
		return;

	/*
	 *	Anything on the transmit queue that fits the window can
	 *	be added providing we are not
	 *
	 *	a) retransmitting (Nagle's rule)
	 *	b) exceeding our congestion window.
	 */

	while((skb = skb_peek(&sk->write_queue)) != NULL &&
		before(skb->h.seq, sk->window_seq + 1) &&
		(sk->retransmits == 0 ||
		 sk->ip_xmit_timeout != TIME_WRITE ||
		 before(skb->h.seq, sk->rcv_ack_seq + 1))
		&& sk->packets_out < sk->cong_window)
	{
		IS_SKB(skb);
		skb_unlink(skb);

		/*
		 *	See if we really need to send the packet.
		 */

		if (before(skb->h.seq, sk->rcv_ack_seq +1))
		{
			/*
			 *	This is acked data. We can discard it.
This + * cannot currently occur. + */ + + sk->retransmits = 0; + kfree_skb(skb, FREE_WRITE); + if (!sk->dead) + sk->write_space(sk); + } + else + { + struct tcphdr *th; + struct iphdr *iph; + int size; +/* + * put in the ack seq and window at this point rather than earlier, + * in order to keep them monotonic. We really want to avoid taking + * back window allocations. That's legal, but RFC1122 says it's frowned on. + * Ack and window will in general have changed since this packet was put + * on the write queue. + */ + iph = (struct iphdr *)(skb->data + + skb->dev->hard_header_len); + th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2)); + size = skb->len - (((unsigned char *) th) - skb->data); + + th->ack_seq = ntohl(sk->acked_seq); + th->window = ntohs(tcp_select_window(sk)); + + tcp_send_check(th, sk->saddr, sk->daddr, size, sk); + + sk->sent_seq = skb->h.seq; + + /* + * IP manages our queue for some crazy reason + */ + + sk->prot->queue_xmit(sk, skb->dev, skb, skb->free); + + /* + * Again we slide the timer wrongly + */ + + reset_xmit_timer(sk, TIME_WRITE, sk->rto); + } + } +} + + +/* + * This routine deals with incoming acks, but not outgoing ones. 
+ */ + +extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len) +{ + unsigned long ack; + int flag = 0; + + /* + * 1 - there was data in packet as well as ack or new data is sent or + * in shutdown state + * 2 - data from retransmit queue was acked and removed + * 4 - window shrunk or data from retransmit queue was acked and removed + */ + + if(sk->zapped) + return(1); /* Dead, cant ack any more so why bother */ + + /* + * Have we discovered a larger window + */ + + ack = ntohl(th->ack_seq); + + if (ntohs(th->window) > sk->max_window) + { + sk->max_window = ntohs(th->window); +#ifdef CONFIG_INET_PCTCP + /* Hack because we don't send partial packets to non SWS + handling hosts */ + sk->mss = min(sk->max_window>>1, sk->mtu); +#else + sk->mss = min(sk->max_window, sk->mtu); +#endif + } + + /* + * We have dropped back to keepalive timeouts. Thus we have + * no retransmits pending. + */ + + if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN) + sk->retransmits = 0; + + /* + * If the ack is newer than sent or older than previous acks + * then we can probably ignore it. + */ + + if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) + { + if(sk->debug) + printk("Ack ignored %lu %lu\n",ack,sk->sent_seq); + + /* + * Keepalive processing. + */ + + if (after(ack, sk->sent_seq)) + { + return(0); + } + + /* + * Restart the keepalive timer. + */ + + if (sk->keepopen) + { + if(sk->ip_xmit_timeout==TIME_KEEPOPEN) + reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN); + } + return(1); + } + + /* + * If there is data set flag 1 + */ + + if (len != th->doff*4) + flag |= 1; + + /* + * See if our window has been shrunk. + */ + + if (after(sk->window_seq, ack+ntohs(th->window))) + { + /* + * We may need to move packets from the send queue + * to the write queue, if the window has been shrunk on us. 
+ * The RFC says you are not allowed to shrink your window + * like this, but if the other end does, you must be able + * to deal with it. + */ + struct sk_buff *skb; + struct sk_buff *skb2; + struct sk_buff *wskb = NULL; + + skb2 = sk->send_head; + sk->send_head = NULL; + sk->send_tail = NULL; + + /* + * This is an artifact of a flawed concept. We want one + * queue and a smarter send routine when we send all. + */ + + flag |= 4; /* Window changed */ + + sk->window_seq = ack + ntohs(th->window); + cli(); + while (skb2 != NULL) + { + skb = skb2; + skb2 = skb->link3; + skb->link3 = NULL; + if (after(skb->h.seq, sk->window_seq)) + { + if (sk->packets_out > 0) + sk->packets_out--; + /* We may need to remove this from the dev send list. */ + if (skb->next != NULL) + { + skb_unlink(skb); + } + /* Now add it to the write_queue. */ + if (wskb == NULL) + skb_queue_head(&sk->write_queue,skb); + else + skb_append(wskb,skb); + wskb = skb; + } + else + { + if (sk->send_head == NULL) + { + sk->send_head = skb; + sk->send_tail = skb; + } + else + { + sk->send_tail->link3 = skb; + sk->send_tail = skb; + } + skb->link3 = NULL; + } + } + sti(); + } + + /* + * Pipe has emptied + */ + + if (sk->send_tail == NULL || sk->send_head == NULL) + { + sk->send_head = NULL; + sk->send_tail = NULL; + sk->packets_out= 0; + } + + /* + * Update the right hand window edge of the host + */ + + sk->window_seq = ack + ntohs(th->window); + + /* + * We don't want too many packets out there. + */ + + if (sk->ip_xmit_timeout == TIME_WRITE && + sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) + { + /* + * This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. Because we keep cong_window in integral + * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a + * counter and increment it once every cwnd times. It's possible + * that this should be done only if sk->retransmits == 0. 
I'm + * interpreting "new data is acked" as including data that has + * been retransmitted but is just now being acked. + */ + if (sk->cong_window < sk->ssthresh) + /* + * In "safe" area, increase + */ + sk->cong_window++; + else + { + /* + * In dangerous area, increase slowly. In theory this is + * sk->cong_window += 1 / sk->cong_window + */ + if (sk->cong_count >= sk->cong_window) + { + sk->cong_window++; + sk->cong_count = 0; + } + else + sk->cong_count++; + } + } + + /* + * Remember the highest ack received. + */ + + sk->rcv_ack_seq = ack; + + /* + * If this ack opens up a zero window, clear backoff. It was + * being used to time the probes, and is probably far higher than + * it needs to be for normal retransmission. + */ + + if (sk->ip_xmit_timeout == TIME_PROBE0) + { + sk->retransmits = 0; /* Our probe was answered */ + + /* + * Was it a usable window open ? + */ + + if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */ + ! before (sk->window_seq, sk->write_queue.next->h.seq)) + { + sk->backoff = 0; + + /* + * Recompute rto from rtt. this eliminates any backoff. + */ + + sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1; + if (sk->rto > 120*HZ) + sk->rto = 120*HZ; + if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about + .2 of a second because of BSD delayed acks - on a 100Mb/sec link + .2 of a second is going to need huge windows (SIGH) */ + sk->rto = 20; + } + } + + /* + * See if we can take anything off of the retransmit queue. + */ + + while(sk->send_head != NULL) + { + /* Check for a bug. */ + if (sk->send_head->link3 && + after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) + printk("INET: tcp.c: *** bug send_list out of order.\n"); + + /* + * If our packet is before the ack sequence we can + * discard it as it's confirmed to have arrived the other end. + */ + + if (before(sk->send_head->h.seq, ack+1)) + { + struct sk_buff *oskb; + if (sk->retransmits) + { + /* + * We were retransmitting. 
don't count this in RTT est + */ + flag |= 2; + + /* + * even though we've gotten an ack, we're still + * retransmitting as long as we're sending from + * the retransmit queue. Keeping retransmits non-zero + * prevents us from getting new data interspersed with + * retransmissions. + */ + + if (sk->send_head->link3) /* Any more queued retransmits? */ + sk->retransmits = 1; + else + sk->retransmits = 0; + } + /* + * Note that we only reset backoff and rto in the + * rtt recomputation code. And that doesn't happen + * if there were retransmissions in effect. So the + * first new packet after the retransmissions is + * sent with the backoff still in effect. Not until + * we get an ack from a non-retransmitted packet do + * we reset the backoff and rto. This allows us to deal + * with a situation where the network delay has increased + * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.) + */ + + /* + * We have one less packet out there. + */ + + if (sk->packets_out > 0) + sk->packets_out --; + /* + * Wake up the process, it can probably write more. + */ + if (!sk->dead) + sk->write_space(sk); + oskb = sk->send_head; + + if (!(flag&2)) /* Not retransmitting */ + { + long m; + + /* + * The following amusing code comes from Jacobson's + * article in SIGCOMM '88. Note that rtt and mdev + * are scaled versions of rtt and mean deviation. + * This is designed to be as fast as possible + * m stands for "measurement". + */ + + m = jiffies - oskb->when; /* RTT */ + if(m<=0) + m=1; /* IS THIS RIGHT FOR <0 ??? */ + m -= (sk->rtt >> 3); /* m is now error in rtt est */ + sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (m < 0) + m = -m; /* m is now abs(error) */ + m -= (sk->mdev >> 2); /* similar update on mdev */ + sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ + + /* + * Now update timeout. Note that this removes any backoff. 
+ */ + + sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1; + if (sk->rto > 120*HZ) + sk->rto = 120*HZ; + if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */ + sk->rto = 20; + sk->backoff = 0; + } + flag |= (2|4); /* 2 is really more like 'don't adjust the rtt + In this case as we just set it up */ + cli(); + oskb = sk->send_head; + IS_SKB(oskb); + sk->send_head = oskb->link3; + if (sk->send_head == NULL) + { + sk->send_tail = NULL; + } + + /* + * We may need to remove this from the dev send list. + */ + + if (oskb->next) + skb_unlink(oskb); + sti(); + kfree_skb(oskb, FREE_WRITE); /* write. */ + if (!sk->dead) + sk->write_space(sk); + } + else + { + break; + } + } + + /* + * XXX someone ought to look at this too.. at the moment, if skb_peek() + * returns non-NULL, we complete ignore the timer stuff in the else + * clause. We ought to organize the code so that else clause can + * (should) be executed regardless, possibly moving the PROBE timer + * reset over. The skb_peek() thing should only move stuff to the + * write queue, NOT also manage the timer functions. + */ + + /* + * Maybe we can take some stuff off of the write queue, + * and put it onto the xmit queue. + */ + if (skb_peek(&sk->write_queue) != NULL) + { + if (after (sk->window_seq+1, sk->write_queue.next->h.seq) && + (sk->retransmits == 0 || + sk->ip_xmit_timeout != TIME_WRITE || + before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1)) + && sk->packets_out < sk->cong_window) + { + /* + * Add more data to the send queue. + */ + flag |= 1; + tcp_write_xmit(sk); + } + else if (before(sk->window_seq, sk->write_queue.next->h.seq) && + sk->send_head == NULL && + sk->ack_backlog == 0 && + sk->state != TCP_TIME_WAIT) + { + /* + * Data to queue but no room. 
+ */ + reset_xmit_timer(sk, TIME_PROBE0, sk->rto); + } + } + else + { + /* + * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets + * from TCP_CLOSE we don't do anything + * + * from anything else, if there is write data (or fin) pending, + * we use a TIME_WRITE timeout, else if keepalive we reset to + * a KEEPALIVE timeout, else we delete the timer. + * + * We do not set flag for nominal write data, otherwise we may + * force a state where we start to write itsy bitsy tidbits + * of data. + */ + + switch(sk->state) { + case TCP_TIME_WAIT: + /* + * keep us in TIME_WAIT until we stop getting packets, + * reset the timeout. + */ + reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); + break; + case TCP_CLOSE: + /* + * don't touch the timer. + */ + break; + default: + /* + * Must check send_head, write_queue, and ack_backlog + * to determine which timeout to use. + */ + if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) { + reset_xmit_timer(sk, TIME_WRITE, sk->rto); + } else if (sk->keepopen) { + reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN); + } else { + del_timer(&sk->retransmit_timer); + sk->ip_xmit_timeout = 0; + } + break; + } + } + + /* + * We have nothing queued but space to send. Send any partial + * packets immediately (end of Nagle rule application). + */ + + if (sk->packets_out == 0 && sk->partial != NULL && + skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) + { + flag |= 1; + tcp_send_partial(sk); + } + + /* + * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and + * we are now waiting for an acknowledge to our FIN. The other end is + * already in TIME_WAIT. + * + * Move to TCP_CLOSE on success. 
+ */ + + if (sk->state == TCP_LAST_ACK) + { + if (!sk->dead) + sk->state_change(sk); + if(sk->debug) + printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n", + sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq); + if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) + { + flag |= 1; + tcp_set_state(sk,TCP_CLOSE); + sk->shutdown = SHUTDOWN_MASK; + } + } + + /* + * Incoming ACK to a FIN we sent in the case of our initiating the close. + * + * Move to FIN_WAIT2 to await a FIN from the other end. Set + * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in. + */ + + if (sk->state == TCP_FIN_WAIT1) + { + + if (!sk->dead) + sk->state_change(sk); + if (sk->rcv_ack_seq == sk->write_seq) + { + flag |= 1; + sk->shutdown |= SEND_SHUTDOWN; + tcp_set_state(sk, TCP_FIN_WAIT2); + } + } + + /* + * Incoming ACK to a FIN we sent in the case of a simultaneous close. + * + * Move to TIME_WAIT + */ + + if (sk->state == TCP_CLOSING) + { + + if (!sk->dead) + sk->state_change(sk); + if (sk->rcv_ack_seq == sk->write_seq) + { + flag |= 1; + tcp_time_wait(sk); + } + } + + /* + * Final ack of a three way shake + */ + + if(sk->state==TCP_SYN_RECV) + { + tcp_set_state(sk, TCP_ESTABLISHED); + tcp_options(sk,th); + sk->dummy_th.dest=th->source; + sk->copied_seq = sk->acked_seq; + if(!sk->dead) + sk->state_change(sk); + if(sk->max_window==0) + { + sk->max_window=32; /* Sanity check */ + sk->mss=min(sk->max_window,sk->mtu); + } + } + + /* + * I make no guarantees about the first clause in the following + * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under + * what conditions "!flag" would be true. However I think the rest + * of the conditions would prevent that from causing any + * unnecessary retransmission. + * Clearly if the first packet has expired it should be + * retransmitted. The other alternative, "flag&2 && retransmits", is + * harder to explain: You have to look carefully at how and when the + * timer is set and with what timeout. 
The most recent transmission always + * sets the timer. So in general if the most recent thing has timed + * out, everything before it has as well. So we want to go ahead and + * retransmit some more. If we didn't explicitly test for this + * condition with "flag&2 && retransmits", chances are "when + rto < jiffies" + * would not be true. If you look at the pattern of timing, you can + * show that rto is increased fast enough that the next packet would + * almost never be retransmitted immediately. Then you'd end up + * waiting for a timeout to send each packet on the retransmission + * queue. With my implementation of the Karn sampling algorithm, + * the timeout would double each time. The net result is that it would + * take a hideous amount of time to recover from a single dropped packet. + * It's possible that there should also be a test for TIME_WRITE, but + * I think as long as "send_head != NULL" and "retransmit" is on, we've + * got to be in real retransmission mode. + * Note that tcp_do_retransmit is called with all==1. Setting cong_window + * back to 1 at the timeout will cause us to send 1, then 2, etc. packets. + * As long as no further losses occur, this seems reasonable. + */ + + if (((!flag) || (flag&4)) && sk->send_head != NULL && + (((flag&2) && sk->retransmits) || + (sk->send_head->when + sk->rto < jiffies))) + { + if(sk->send_head->when + sk->rto < jiffies) + tcp_retransmit(sk,0); + else + { + tcp_do_retransmit(sk, 1); + reset_xmit_timer(sk, TIME_WRITE, sk->rto); + } + } + + return(1); +} + + +/* + * Process the FIN bit. This now behaves as it is supposed to work + * and the FIN takes effect when it is validly part of sequence + * space. Not before when we get holes. 
 *
 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
 * TIME-WAIT)
 *
 * If we are in FINWAIT-1, a received FIN indicates simultaneous
 * close and we go into CLOSING (and later onto TIME-WAIT)
 *
 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 *
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/* Record where the FIN falls in sequence space: data length,
	   plus one each for SYN and FIN when present. */
	sk->fin_seq = th->seq + skb->len + th->syn + th->fin;

	if (!sk->dead)
	{
		/* Wake sleepers and async users: EOF is now pending. */
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
		case TCP_FIN_WAIT1:
			/*
			 * This case occurs when a simultaneous close
			 * happens, we must ack the received FIN and
			 * enter the CLOSING state.
			 *
			 * This causes a WRITE timeout, which will either
			 * move on to TIME_WAIT when we timeout, or resend
			 * the FIN properly (maybe we get rid of that annoying
			 * FIN lost hang). The TIME_WRITE code is already correct
			 * for handling this timeout.
			 */

			if(sk->ip_xmit_timeout != TIME_WRITE)
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			tcp_set_state(sk,TCP_CLOSING);
			break;
		case TCP_FIN_WAIT2:
			/*
			 * received a FIN -- send ACK and enter TIME_WAIT
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			sk->shutdown|=SHUTDOWN_MASK;
			tcp_set_state(sk,TCP_TIME_WAIT);
			break;
		case TCP_CLOSE:
			/*
			 * already in CLOSE
			 */
			break;
		default:
			tcp_set_state(sk,TCP_LAST_ACK);

			/* Start the timers. */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			return(0);
	}

	return(0);
}



/*
 *	This routine handles the data.  If there is room in the buffer,
 *	it will have already been moved into it.  If there is no
 *	room, then we will just have to discard the packet.
 */

extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk,
	 unsigned long saddr, unsigned short len)
{
	struct sk_buff *skb1, *skb2;
	struct tcphdr *th;
	int dup_dumped=0;
	unsigned long new_seq;
	unsigned long shut_seq;

	th = skb->h.th;
	/* Trim skb->len down to the payload only (strip the TCP header). */
	skb->len = len -(th->doff*4);

	/*
	 *	The bytes in the receive read/assembly queue has increased. Needed for the
	 *	low memory discard algorithm
	 */

	sk->bytes_rcv += skb->len;

	if (skb->len == 0 && !th->fin)
	{
		/*
		 *	Don't want to keep passing ack's back and forth.
		 *	(someone sent us dataless, boring frame)
		 */
		if (!th->ack)
			tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr);
		kfree_skb(skb, FREE_READ);
		return(0);
	}

	/*
	 *	We no longer have anyone receiving data on this connection.
	 */

#ifndef TCP_DONT_RST_SHUTDOWN

	if(sk->shutdown & RCV_SHUTDOWN)
	{
		/*
		 *	FIXME: BSD has some magic to avoid sending resets to
		 *	broken 4.2 BSD keepalives. Much to my surprise a few non
		 *	BSD stacks still have broken keepalives so we want to
		 *	cope with it.
+ */ + + if(skb->len) /* We don't care if it's just an ack or + a keepalive/window probe */ + { + new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */ + + /* Do this the way 4.4BSD treats it. Not what I'd + regard as the meaning of the spec but it's what BSD + does and clearly they know everything 8) */ + + /* + * This is valid because of two things + * + * a) The way tcp_data behaves at the bottom. + * b) A fin takes effect when read not when received. + */ + + shut_seq=sk->acked_seq+1; /* Last byte */ + + if(after(new_seq,shut_seq)) + { + if(sk->debug) + printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n", + sk, new_seq, shut_seq, sk->blog); + if(sk->dead) + { + sk->acked_seq = new_seq + th->fin; + tcp_reset(sk->saddr, sk->daddr, skb->h.th, + sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl); + tcp_statistics.TcpEstabResets++; + tcp_set_state(sk,TCP_CLOSE); + sk->err = EPIPE; + sk->shutdown = SHUTDOWN_MASK; + kfree_skb(skb, FREE_READ); + return 0; + } + } + } + } + +#endif + + /* + * Now we have to walk the chain, and figure out where this one + * goes into it. This is set up so that the last packet we received + * will be the first one we look at, that way if everything comes + * in order, there will be no performance loss, and if they come + * out of order we will be able to fit things in nicely. + * + * [AC: This is wrong. We should assume in order first and then walk + * forwards from the first hole based upon real traffic patterns.] 
+ * + */ + + if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */ + { + skb_queue_head(&sk->receive_queue,skb); + skb1= NULL; + } + else + { + for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) + { + if(sk->debug) + { + printk("skb1=%p :", skb1); + printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq); + printk("skb->h.th->seq = %ld\n",skb->h.th->seq); + printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq, + sk->acked_seq); + } + + /* + * Optimisation: Duplicate frame or extension of previous frame from + * same sequence point (lost ack case). + * The frame contains duplicate data or replaces a previous frame + * discard the previous frame (safe as sk->inuse is set) and put + * the new one in its place. + */ + + if (th->seq==skb1->h.th->seq && skb->len>= skb1->len) + { + skb_append(skb1,skb); + skb_unlink(skb1); + kfree_skb(skb1,FREE_READ); + dup_dumped=1; + skb1=NULL; + break; + } + + /* + * Found where it fits + */ + + if (after(th->seq+1, skb1->h.th->seq)) + { + skb_append(skb1,skb); + break; + } + + /* + * See if we've hit the start. If so insert. + */ + if (skb1 == skb_peek(&sk->receive_queue)) + { + skb_queue_head(&sk->receive_queue, skb); + break; + } + } + } + + /* + * Figure out what the ack value for this frame is + */ + + th->ack_seq = th->seq + skb->len; + if (th->syn) + th->ack_seq++; + if (th->fin) + th->ack_seq++; + + if (before(sk->acked_seq, sk->copied_seq)) + { + printk("*** tcp.c:tcp_data bug acked < copied\n"); + sk->acked_seq = sk->copied_seq; + } + + /* + * Now figure out if we can ack anything. This is very messy because we really want two + * receive queues, a completed and an assembly queue. We also want only one transmit + * queue. 
+ */ + + if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) + { + if (before(th->seq, sk->acked_seq+1)) + { + int newwindow; + + if (after(th->ack_seq, sk->acked_seq)) + { + newwindow = sk->window-(th->ack_seq - sk->acked_seq); + if (newwindow < 0) + newwindow = 0; + sk->window = newwindow; + sk->acked_seq = th->ack_seq; + } + skb->acked = 1; + + /* + * When we ack the fin, we do the FIN + * processing. + */ + + if (skb->h.th->fin) + { + tcp_fin(skb,sk,skb->h.th); + } + + for(skb2 = skb->next; + skb2 != (struct sk_buff *)&sk->receive_queue; + skb2 = skb2->next) + { + if (before(skb2->h.th->seq, sk->acked_seq+1)) + { + if (after(skb2->h.th->ack_seq, sk->acked_seq)) + { + newwindow = sk->window - + (skb2->h.th->ack_seq - sk->acked_seq); + if (newwindow < 0) + newwindow = 0; + sk->window = newwindow; + sk->acked_seq = skb2->h.th->ack_seq; + } + skb2->acked = 1; + /* + * When we ack the fin, we do + * the fin handling. + */ + if (skb2->h.th->fin) + { + tcp_fin(skb,sk,skb->h.th); + } + + /* + * Force an immediate ack. + */ + + sk->ack_backlog = sk->max_ack_backlog; + } + else + { + break; + } + } + + /* + * This also takes care of updating the window. + * This if statement needs to be simplified. + */ + if (!sk->delay_acks || + sk->ack_backlog >= sk->max_ack_backlog || + sk->bytes_rcv > sk->max_unacked || th->fin) { + /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */ + } + else + { + sk->ack_backlog++; + if(sk->debug) + printk("Ack queued.\n"); + reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME); + } + } + } + + /* + * If we've missed a packet, send an ack. + * Also start a timer to send another. + */ + + if (!skb->acked) + { + + /* + * This is important. If we don't have much room left, + * we need to throw out a few packets so we have a good + * window. Note that mtu is used, not mss, because mss is really + * for the send side. He could be sending us stuff as large as mtu. 
+	 */
+
+		/* (review) Prune unacked segments from the head of receive_queue
+		   until rspace() can hold at least one more MTU-sized segment. */
+		while (sk->prot->rspace(sk) < sk->mtu)
+		{
+			skb1 = skb_peek(&sk->receive_queue);
+			if (skb1 == NULL)
+			{
+				printk("INET: tcp.c:tcp_data memory leak detected.\n");
+				break;
+			}
+
+			/*
+			 *	Don't throw out something that has been acked.
+			 */
+
+			if (skb1->acked)
+			{
+				break;
+			}
+
+			skb_unlink(skb1);
+			kfree_skb(skb1, FREE_READ);
+		}
+		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
+		sk->ack_backlog++;
+		reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
+	}
+	else
+	{
+		tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
+	}
+
+	/*
+	 *	Now tell the user we may have some data.
+	 */
+
+	if (!sk->dead)
+	{
+		if(sk->debug)
+			printk("Data wakeup.\n");
+		sk->data_ready(sk,0);
+	}
+	return(0);
+}
+
+
+/*
+ *	This routine is only called when we have urgent data
+ *	signalled. Its the 'slow' part of tcp_urg. It could be
+ *	moved inline now as tcp_urg is only called from one
+ *	place. We handle URGent data wrong. We have to - as
+ *	BSD still doesn't use the correction from RFC961.
+ */
+
+static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
+{
+	/* Convert the urgent pointer offset to an absolute sequence number.
+	   The "ptr--" applies the BSD (pre-RFC961) interpretation noted above. */
+	unsigned long ptr = ntohs(th->urg_ptr);
+
+	if (ptr)
+		ptr--;
+	ptr += th->seq;
+
+	/* ignore urgent data that we've already seen and read */
+	if (after(sk->copied_seq, ptr))
+		return;
+
+	/* do we already have a newer (or duplicate) urgent pointer? */
+	if (sk->urg_data && !after(ptr, sk->urg_seq))
+		return;
+
+	/* tell the world about our new urgent pointer */
+	if (sk->proc != 0) {
+		if (sk->proc > 0) {
+			kill_proc(sk->proc, SIGURG, 1);
+		} else {
+			kill_pg(-sk->proc, SIGURG, 1);
+		}
+	}
+	/* Mark that urgent data is pending but the byte itself has not
+	   arrived yet; tcp_urg() fills it in when the segment shows up. */
+	sk->urg_data = URG_NOTYET;
+	sk->urg_seq = ptr;
+}
+
+/*
+ *	This is the 'fast' part of urgent handling.
+ */
+
+extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th,
+	unsigned long saddr, unsigned long len)
+{
+	unsigned long ptr;
+
+	/*
+	 *	Check if we get a new urgent pointer - normally not
+	 */
+
+	if (th->urg)
+		tcp_check_urg(sk,th);
+
+	/*
+	 *	Do we wait for any urgent data?
 - normally not
+	 */
+
+	if (sk->urg_data != URG_NOTYET)
+		return 0;
+
+	/*
+	 *	Is the urgent pointer pointing into this packet?
+	 */
+
+	ptr = sk->urg_seq - th->seq + th->doff*4;
+	if (ptr >= len)
+		return 0;
+
+	/*
+	 *	Ok, got the correct packet, update info
+	 */
+
+	/* Stash the single urgent byte alongside the URG_VALID flag. */
+	sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
+	if (!sk->dead)
+		sk->data_ready(sk,0);
+	return 0;
+}
+
+/*
+ *	This will accept the next outstanding connection.
+ */
+
+static struct sock *tcp_accept(struct sock *sk, int flags)
+{
+	struct sock *newsk;
+	struct sk_buff *skb;
+
+	/*
+	 * We need to make sure that this socket is listening,
+	 * and that it has something pending.
+	 */
+
+	if (sk->state != TCP_LISTEN)
+	{
+		sk->err = EINVAL;
+		return(NULL);
+	}
+
+	/* Avoid the race. */
+	cli();
+	sk->inuse = 1;
+
+	/* Block (unless O_NONBLOCK) until an established connection is
+	   queued; release_sock() before sleeping so tcp_rcv can progress. */
+	while((skb = tcp_dequeue_established(sk)) == NULL)
+	{
+		if (flags & O_NONBLOCK)
+		{
+			sti();
+			release_sock(sk);
+			sk->err = EAGAIN;
+			return(NULL);
+		}
+
+		release_sock(sk);
+		interruptible_sleep_on(sk->sleep);
+		if (current->signal & ~current->blocked)
+		{
+			sti();
+			sk->err = ERESTARTSYS;
+			return(NULL);
+		}
+		sk->inuse = 1;
+	}
+	sti();
+
+	/*
+	 *	Now all we need to do is return skb->sk.
+	 */
+
+	/* The dequeued skb carries the child socket created by
+	   tcp_conn_request(); only the skb itself is freed here. */
+	newsk = skb->sk;
+
+	kfree_skb(skb, FREE_READ);
+	sk->ack_backlog--;
+	release_sock(sk);
+	return(newsk);
+}
+
+
+/*
+ *	This will initiate an outgoing connection.
+ */
+
+static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
+{
+	struct sk_buff *buff;
+	struct device *dev=NULL;
+	unsigned char *ptr;
+	int tmp;
+	int atype;
+	struct tcphdr *t1;
+	struct rtable *rt;
+
+	if (sk->state != TCP_CLOSE)
+	{
+		return(-EISCONN);
+	}
+
+	/* NOTE(review): 8 is presumably the minimum useful sockaddr_in prefix
+	   (family + port + address) rather than sizeof(struct sockaddr_in) —
+	   confirm against the other protocol connect handlers. */
+	if (addr_len < 8)
+		return(-EINVAL);
+
+	if (usin->sin_family && usin->sin_family != AF_INET)
+		return(-EAFNOSUPPORT);
+
+	/*
+	 *	connect() to INADDR_ANY means loopback (BSD'ism).
+	 */
+
+	if(usin->sin_addr.s_addr==INADDR_ANY)
+		usin->sin_addr.s_addr=ip_my_addr();
+
+	/*
+	 *	Don't want a TCP connection going to a broadcast address
+	 */
+
+	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
+		return -ENETUNREACH;
+
+	/* Initialise the connection identity and sequence space before any
+	   packet is built. */
+	sk->inuse = 1;
+	sk->daddr = usin->sin_addr.s_addr;
+	sk->write_seq = tcp_init_seq();
+	sk->window_seq = sk->write_seq;
+	sk->rcv_ack_seq = sk->write_seq -1;
+	sk->err = 0;
+	sk->dummy_th.dest = usin->sin_port;
+	release_sock(sk);
+
+	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
+	if (buff == NULL)
+	{
+		return(-ENOMEM);
+	}
+	sk->inuse = 1;
+	buff->len = 24;		/* TCP header (20) + 4 bytes of MSS option */
+	buff->sk = sk;
+	buff->free = 0;
+	buff->localroute = sk->localroute;
+
+	t1 = (struct tcphdr *) buff->data;
+
+	/*
+	 *	Put in the IP header and routing stuff.
+	 */
+
+	rt=ip_rt_route(sk->daddr, NULL, NULL);
+
+
+	/*
+	 *	We need to build the routing stuff from the things saved in skb.
+	 */
+
+	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
+					IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
+	if (tmp < 0)
+	{
+		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
+		release_sock(sk);
+		return(-ENETUNREACH);
+	}
+
+	buff->len += tmp;
+	t1 = (struct tcphdr *)((char *)t1 +tmp);
+
+	/* Build the SYN: flags cleared individually, sequence consumed from
+	   write_seq, data offset 6 words to cover the MSS option below. */
+	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
+	t1->seq = ntohl(sk->write_seq++);
+	sk->sent_seq = sk->write_seq;
+	buff->h.seq = sk->write_seq;
+	t1->ack = 0;
+	t1->window = 2;
+	t1->res1=0;
+	t1->res2=0;
+	t1->rst = 0;
+	t1->urg = 0;
+	t1->psh = 0;
+	t1->syn = 1;
+	t1->urg_ptr = 0;
+	t1->doff = 6;
+	/* use 512 or whatever user asked for */
+
+	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
+		sk->window_clamp=rt->rt_window;
+	else
+		sk->window_clamp=0;
+
+	/* MSS selection priority: explicit user setting, then route metric,
+	   then a subnet-locality heuristic. */
+	if (sk->user_mss)
+		sk->mtu = sk->user_mss;
+	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
+		sk->mtu = rt->rt_mss;
+	else
+	{
+#ifdef CONFIG_INET_SNARL
+		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
+#else
+		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
+#endif
+			sk->mtu =
576 - HEADER_SIZE;	/* classic conservative default for off-subnet peers */
+		else
+			sk->mtu = MAX_WINDOW;
+	}
+	/*
+	 *	but not bigger than device MTU
+	 */
+
+	if(sk->mtu <32)
+		sk->mtu = 32;	/* Sanity limit */
+
+	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);
+
+	/*
+	 *	Put in the TCP options to say MTU.
+	 */
+
+	/* MSS option: kind 2, length 4, value in network byte order. */
+	ptr = (unsigned char *)(t1+1);
+	ptr[0] = 2;
+	ptr[1] = 4;
+	ptr[2] = (sk->mtu) >> 8;
+	ptr[3] = (sk->mtu) & 0xff;
+	tcp_send_check(t1, sk->saddr, sk->daddr,
+		  sizeof(struct tcphdr) + 4, sk);
+
+	/*
+	 *	This must go first otherwise a really quick response will get reset.
+	 */
+
+	tcp_set_state(sk,TCP_SYN_SENT);
+	sk->rto = TCP_TIMEOUT_INIT;
+#if 0 /* we already did this */
+	init_timer(&sk->retransmit_timer);
+#endif
+	sk->retransmit_timer.function=&retransmit_timer;
+	sk->retransmit_timer.data = (unsigned long)sk;
+	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
+	sk->retransmits = TCP_SYN_RETRIES;
+
+	sk->prot->queue_xmit(sk, dev, buff, 0);
+	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
+	tcp_statistics.TcpActiveOpens++;
+	tcp_statistics.TcpOutSegs++;
+
+	release_sock(sk);
+	return(0);
+}
+
+
+/* This functions checks to see if the tcp header is actually acceptable. */
+extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
+	     struct options *opt, unsigned long saddr, struct device *dev)
+{
+	unsigned long next_seq;
+
+	/* next_seq starts as the segment's sequence-space length (payload
+	   plus one if FIN), then becomes the sequence number just past it. */
+	next_seq = len - 4*th->doff;
+	if (th->fin)
+		next_seq++;
+	/* if we have a zero window, we can't have any data in the packet.. */
+	if (next_seq && !sk->window)
+		goto ignore_it;
+	next_seq += th->seq;
+
+	/*
+	 *	This isn't quite right.  sk->acked_seq could be more recent
+	 *	than sk->window.  This is however close enough.  We will accept
+	 *	slightly more packets than we should, but it should not cause
+	 *	problems unless someone is trying to forge packets.
+	 */
+
+	/* have we already seen all of this packet? */
+	if (!after(next_seq+1, sk->acked_seq))
+		goto ignore_it;
+	/* or does it start beyond the window?
 */
+	if (!before(th->seq, sk->acked_seq + sk->window + 1))
+		goto ignore_it;
+
+	/* ok, at least part of this packet would seem interesting.. */
+	return 1;
+
+ignore_it:
+	if (th->rst)
+		return 0;
+
+	/*
+	 *	Send a reset if we get something not ours and we are
+	 *	unsynchronized. Note: We don't do anything to our end. We
+	 *	are just killing the bogus remote connection then we will
+	 *	connect again and it will work (with luck).
+	 */
+
+	if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
+	{
+		tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
+		return 1;
+	}
+
+	/* Try to resync things. */
+	tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
+	return 0;
+}
+
+/*
+ *	When we get a reset we do this.
+ */
+
+static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
+{
+	/* Map the reset onto the errno the connection state implies:
+	   refused while connecting, broken pipe after the peer closed. */
+	sk->zapped = 1;
+	sk->err = ECONNRESET;
+	if (sk->state == TCP_SYN_SENT)
+		sk->err = ECONNREFUSED;
+	if (sk->state == TCP_CLOSE_WAIT)
+		sk->err = EPIPE;
+#ifdef TCP_DO_RFC1337
+	/*
+	 *	Time wait assassination protection [RFC1337]
+	 */
+	if(sk->state!=TCP_TIME_WAIT)
+	{
+		tcp_set_state(sk,TCP_CLOSE);
+		sk->shutdown = SHUTDOWN_MASK;
+	}
+#else
+	tcp_set_state(sk,TCP_CLOSE);
+	sk->shutdown = SHUTDOWN_MASK;
+#endif
+	if (!sk->dead)
+		sk->state_change(sk);
+	kfree_skb(skb, FREE_READ);
+	release_sock(sk);
+	return(0);
+}
+
+/*
+ *	A TCP packet has arrived.
+ */
+
+int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
+	unsigned long daddr, unsigned short len,
+	unsigned long saddr, int redo, struct inet_protocol * protocol)
+{
+	struct tcphdr *th;
+	struct sock *sk;
+	int syn_ok=0;
+
+	if (!skb)
+	{
+		printk("IMPOSSIBLE 1\n");
+		return(0);
+	}
+
+	if (!dev)
+	{
+		printk("IMPOSSIBLE 2\n");
+		return(0);
+	}
+
+	tcp_statistics.TcpInSegs++;
+
+	/* Drop frames not addressed to this host (promiscuous capture etc.) */
+	if(skb->pkt_type!=PACKET_HOST)
+	{
+		kfree_skb(skb,FREE_READ);
+		return(0);
+	}
+
+	th = skb->h.th;
+
+	/*
+	 *	Find the socket.
+	 */
+
+	sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
+
+	/*
+	 *	If this socket has got a reset it's to all intents and purposes
+	 *	really dead. Count closed sockets as dead.
+	 *
+	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
+	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
+	 *	exist so should cause resets as if the port was unreachable.
+	 */
+
+	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
+		sk=NULL;
+
+	/* redo==0 means first delivery: verify checksum, byte-swap the
+	   sequence number and maybe park the skb on the backlog. A redo
+	   (backlog replay) skips all of that. */
+	if (!redo)
+	{
+		if (tcp_check(th, len, saddr, daddr ))
+		{
+			skb->sk = NULL;
+			kfree_skb(skb,FREE_READ);
+			/*
+			 *	We don't release the socket because it was
+			 *	never marked in use.
+			 */
+			return(0);
+		}
+		th->seq = ntohl(th->seq);
+
+		/* See if we know about the socket. */
+		if (sk == NULL)
+		{
+			/*
+			 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
+			 */
+			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
+			skb->sk = NULL;
+			/*
+			 *	Discard frame
+			 */
+			kfree_skb(skb, FREE_READ);
+			return(0);
+		}
+
+		skb->len = len;
+		skb->acked = 0;
+		skb->used = 0;
+		skb->free = 0;
+		skb->saddr = daddr;
+		skb->daddr = saddr;
+
+		/* We may need to add it to the backlog here. */
+		cli();
+		if (sk->inuse)
+		{
+			skb_queue_tail(&sk->back_log, skb);
+			sti();
+			return(0);
+		}
+		sk->inuse = 1;
+		sti();
+	}
+	else
+	{
+		if (sk==NULL)
+		{
+			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
+			skb->sk = NULL;
+			kfree_skb(skb, FREE_READ);
+			return(0);
+		}
+	}
+
+
+	/* NOTE(review): this path returns without kfree_skb/release_sock —
+	   presumably considered unreachable ("IMPOSSIBLE"); if it ever fired
+	   it would leak the skb and leave the socket marked in use. */
+	if (!sk->prot)
+	{
+		printk("IMPOSSIBLE 3\n");
+		return(0);
+	}
+
+
+	/*
+	 *	Charge the memory to the socket.
+	 */
+
+	if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)
+	{
+		kfree_skb(skb, FREE_READ);
+		release_sock(sk);
+		return(0);
+	}
+
+	skb->sk=sk;
+	sk->rmem_alloc += skb->mem_len;
+
+	/*
+	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
+	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
+	 *	compatibility.
 We also set up variables more thoroughly [Karn notes in the
+	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
+	 */
+
+	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
+	{
+
+		/*
+		 *	Now deal with unusual cases.
+		 */
+
+		if(sk->state==TCP_LISTEN)
+		{
+			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
+				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
+
+			/*
+			 *	We don't care for RST, and non SYN are absorbed (old segments)
+			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
+			 *	netmask on a running connection it can go broadcast. Even Sun's have
+			 *	this problem so I'm ignoring it
+			 */
+
+			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
+			{
+				kfree_skb(skb, FREE_READ);
+				release_sock(sk);
+				return 0;
+			}
+
+			/*
+			 *	Guess we need to make a new socket up
+			 */
+
+			/* A clean SYN to a listener: spawn the child socket and
+			   send SYN|ACK inside tcp_conn_request(). */
+			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());
+
+			/*
+			 *	Now we have several options: In theory there is nothing else
+			 *	in the frame. KA9Q has an option to send data with the syn,
+			 *	BSD accepts data with the syn up to the [to be] advertised window
+			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
+			 *	it, that fits the spec precisely and avoids incompatibilities. It
+			 *	would be nice in future to drop through and process the data.
+			 */
+
+			release_sock(sk);
+			return 0;
+		}
+
+		/* retransmitted SYN?
 */
+		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
+		{
+			kfree_skb(skb, FREE_READ);
+			release_sock(sk);
+			return 0;
+		}
+
+		/*
+		 *	SYN sent means we have to look for a suitable ack and either reset
+		 *	for bad matches or go to connected
+		 */
+
+		if(sk->state==TCP_SYN_SENT)
+		{
+			/* Crossed SYN or previous junk segment */
+			if(th->ack)
+			{
+				/* We got an ack, but it's not a good ack */
+				if(!tcp_ack(sk,th,saddr,len))
+				{
+					/* Reset the ack - its an ack from a
+					   different connection  [ th->rst is checked in tcp_reset()] */
+					tcp_statistics.TcpAttemptFails++;
+					tcp_reset(daddr, saddr, th,
+						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
+					kfree_skb(skb, FREE_READ);
+					release_sock(sk);
+					return(0);
+				}
+				if(th->rst)
+					return tcp_std_reset(sk,skb);
+				if(!th->syn)
+				{
+					/* A valid ack from a different connection
+					   start. Shouldn't happen but cover it */
+					kfree_skb(skb, FREE_READ);
+					release_sock(sk);
+					return 0;
+				}
+				/*
+				 *	Ok.. it's good. Set up sequence numbers and
+				 *	move to established.
+				 */
+				/* Valid SYN|ACK: complete the three-way handshake,
+				   adopt the peer's options and wake the connector. */
+				syn_ok=1;	/* Don't reset this connection for the syn */
+				sk->acked_seq=th->seq+1;
+				sk->fin_seq=th->seq;
+				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
+				tcp_set_state(sk, TCP_ESTABLISHED);
+				tcp_options(sk,th);
+				sk->dummy_th.dest=th->source;
+				sk->copied_seq = sk->acked_seq;
+				if(!sk->dead)
+				{
+					sk->state_change(sk);
+					sock_wake_async(sk->socket, 0);
+				}
+				if(sk->max_window==0)
+				{
+					sk->max_window = 32;
+					sk->mss = min(sk->max_window, sk->mtu);
+				}
+			}
+			else
+			{
+				/* See if SYN's cross. Drop if boring */
+				if(th->syn && !th->rst)
+				{
+					/* Crossed SYN's are fine - but talking to
+					   yourself is right out...
 */
+					if(sk->saddr==saddr && sk->daddr==daddr &&
+						sk->dummy_th.source==th->source &&
+						sk->dummy_th.dest==th->dest)
+					{
+						tcp_statistics.TcpAttemptFails++;
+						return tcp_std_reset(sk,skb);
+					}
+					tcp_set_state(sk,TCP_SYN_RECV);
+
+					/*
+					 *	FIXME:
+					 *	Must send SYN|ACK here
+					 */
+				}
+				/* Discard junk segment */
+				kfree_skb(skb, FREE_READ);
+				release_sock(sk);
+				return 0;
+			}
+			/*
+			 *	SYN_RECV with data maybe.. drop through
+			 */
+			goto rfc_step6;
+		}
+
+		/*
+		 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
+		 *	a more complex suggestion for fixing these reuse issues in RFC1644
+		 *	but not yet ready for general use. Also see RFC1379.
+		 */
+
+#define BSD_TIME_WAIT
+#ifdef BSD_TIME_WAIT
+		/* A fresh SYN beyond our sequence while dead in TIME_WAIT:
+		   tear this socket down and re-deliver the SYN to a listener
+		   on the same port, with an initial sequence safely above the
+		   old connection's. */
+		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
+			after(th->seq, sk->acked_seq) && !th->rst)
+		{
+			long seq=sk->write_seq;
+			if(sk->debug)
+				printk("Doing a BSD time wait\n");
+			tcp_statistics.TcpEstabResets++;
+			sk->rmem_alloc -= skb->mem_len;
+			skb->sk = NULL;
+			sk->err=ECONNRESET;
+			tcp_set_state(sk, TCP_CLOSE);
+			sk->shutdown = SHUTDOWN_MASK;
+			release_sock(sk);
+			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
+			if (sk && sk->state==TCP_LISTEN)
+			{
+				sk->inuse=1;
+				skb->sk = sk;
+				sk->rmem_alloc += skb->mem_len;
+				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
+				release_sock(sk);
+				return 0;
+			}
+			kfree_skb(skb, FREE_READ);
+			return 0;
+		}
+#endif
+	}
+
+	/*
+	 *	We are now in normal data flow (see the step list in the RFC)
+	 *	Note most of these are inline now. I'll inline the lot when
+	 *	I have time to test it hard and look at what gcc outputs
+	 */
+
+	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
+	{
+		kfree_skb(skb, FREE_READ);
+		release_sock(sk);
+		return 0;
+	}
+
+	if(th->rst)
+		return tcp_std_reset(sk,skb);
+
+	/*
+	 *	!syn_ok is effectively the state test in RFC793.
+	 */
+
+	/* An unexpected SYN in a synchronised state: reset both ends. */
+	if(th->syn && !syn_ok)
+	{
+		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
+		return tcp_std_reset(sk,skb);
+	}
+
+	/*
+	 *	Process the ACK
+	 */
+
+
+	if(th->ack && !tcp_ack(sk,th,saddr,len))
+	{
+		/*
+		 *	Our three way handshake failed.
+		 */
+
+		if(sk->state==TCP_SYN_RECV)
+		{
+			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
+		}
+		kfree_skb(skb, FREE_READ);
+		release_sock(sk);
+		return 0;
+	}
+
+rfc_step6:		/* I'll clean this up later */
+
+	/*
+	 *	Process urgent data
+	 */
+
+	if(tcp_urg(sk, th, saddr, len))
+	{
+		kfree_skb(skb, FREE_READ);
+		release_sock(sk);
+		return 0;
+	}
+
+
+	/*
+	 *	Process the encapsulated data
+	 */
+
+	if(tcp_data(skb,sk, saddr, len))
+	{
+		kfree_skb(skb, FREE_READ);
+		release_sock(sk);
+		return 0;
+	}
+
+	/*
+	 *	And done
+	 */
+
+	release_sock(sk);
+	return 0;
+}
+
+/*
+ *	This routine sends a packet with an out of date sequence
+ *	number. It assumes the other end will try to ack it.
+ */
+
+static void tcp_write_wakeup(struct sock *sk)
+{
+	struct sk_buff *buff;
+	struct tcphdr *t1;
+	struct device *dev=NULL;
+	int tmp;
+
+	if (sk->zapped)
+		return;	/* After a valid reset we can send no more */
+
+	/*
+	 *	Write data can still be transmitted/retransmitted in the
+	 *	following states.  If any other state is encountered, return.
+	 *	[listen/close will never occur here anyway]
+	 */
+
+	if (sk->state != TCP_ESTABLISHED &&
+	    sk->state != TCP_CLOSE_WAIT &&
+	    sk->state != TCP_FIN_WAIT1 &&
+	    sk->state != TCP_LAST_ACK &&
+	    sk->state != TCP_CLOSING
+	)
+	{
+		return;
+	}
+
+	/* GFP_ATOMIC: may be called from timer context, must not sleep. */
+	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
+	if (buff == NULL)
+		return;
+
+	buff->len = sizeof(struct tcphdr);
+	buff->free = 1;
+	buff->sk = sk;
+	buff->localroute = sk->localroute;
+
+	t1 = (struct tcphdr *) buff->data;
+
+	/* Put in the IP header and routing stuff.
 */
+	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
+				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
+	if (tmp < 0)
+	{
+		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
+		return;
+	}
+
+	buff->len += tmp;
+	t1 = (struct tcphdr *)((char *)t1 +tmp);
+
+	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
+
+	/*
+	 *	Use a previous sequence.
+	 *	This should cause the other end to send an ack.
+	 */
+
+	t1->seq = htonl(sk->sent_seq-1);
+	t1->ack = 1;
+	t1->res1= 0;
+	t1->res2= 0;
+	t1->rst = 0;
+	t1->urg = 0;
+	t1->psh = 0;
+	t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
+	t1->syn = 0;
+	t1->ack_seq = ntohl(sk->acked_seq);
+	t1->window = ntohs(tcp_select_window(sk));
+	t1->doff = sizeof(*t1)/4;
+	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
+	/*
+	 *	Send it and free it.
+	 *	This will prevent the timer from automatically being restarted.
+	 */
+	sk->prot->queue_xmit(sk, dev, buff, 1);
+	tcp_statistics.TcpOutSegs++;
+}
+
+/*
+ *	A window probe timeout has occurred.
+ */
+
+void tcp_send_probe0(struct sock *sk)
+{
+	if (sk->zapped)
+		return;		/* After a valid reset we can send no more */
+
+	tcp_write_wakeup(sk);
+
+	/* Exponential backoff of the probe interval, capped at two minutes. */
+	sk->backoff++;
+	sk->rto = min(sk->rto << 1, 120*HZ);
+	reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
+	sk->retransmits++;
+	sk->prot->retransmits ++;
+}
+
+/*
+ *	Socket option code for TCP.
+ */
+
+int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
+{
+	int val,err;
+
+	/* Non-TCP levels are delegated to the IP layer. */
+	if(level!=SOL_TCP)
+		return ip_setsockopt(sk,level,optname,optval,optlen);
+
+	if (optval == NULL)
+		return(-EINVAL);
+
+	err=verify_area(VERIFY_READ, optval, sizeof(int));
+	if(err)
+		return err;
+
+	val = get_fs_long((unsigned long *)optval);
+
+	switch(optname)
+	{
+		case TCP_MAXSEG:
+/*
+ * values greater than interface MTU won't take effect.
 however at
+ * the point when this call is done we typically don't yet know
+ * which interface is going to be used
+ */
+			if(val<1||val>MAX_WINDOW)
+				return -EINVAL;
+			sk->user_mss=val;
+			return 0;
+		case TCP_NODELAY:
+			/* Any non-zero value disables the Nagle algorithm. */
+			sk->nonagle=(val==0)?0:1;
+			return 0;
+		default:
+			return(-ENOPROTOOPT);
+	}
+}
+
+int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
+{
+	int val,err;
+
+	/* Non-TCP levels are delegated to the IP layer. */
+	if(level!=SOL_TCP)
+		return ip_getsockopt(sk,level,optname,optval,optlen);
+
+	switch(optname)
+	{
+		case TCP_MAXSEG:
+			val=sk->user_mss;
+			break;
+		case TCP_NODELAY:
+			val=sk->nonagle;
+			break;
+		default:
+			return(-ENOPROTOOPT);
+	}
+	/* Copy the result and its size out to user space. */
+	err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
+	if(err)
+		return err;
+	put_fs_long(sizeof(int),(unsigned long *) optlen);
+
+	err=verify_area(VERIFY_WRITE, optval, sizeof(int));
+	if(err)
+		return err;
+	put_fs_long(val,(unsigned long *)optval);
+
+	return(0);
+}
+
+
+/* Protocol operations table for TCP. NOTE: the initializer is positional,
+   so the ordering below must match the declaration of struct proto. */
+struct proto tcp_prot = {
+	sock_wmalloc,
+	sock_rmalloc,
+	sock_wfree,
+	sock_rfree,
+	sock_rspace,
+	sock_wspace,
+	tcp_close,
+	tcp_read,
+	tcp_write,
+	tcp_sendto,
+	tcp_recvfrom,
+	ip_build_header,
+	tcp_connect,
+	tcp_accept,
+	ip_queue_xmit,
+	tcp_retransmit,
+	tcp_write_wakeup,
+	tcp_read_wakeup,
+	tcp_rcv,
+	tcp_select,
+	tcp_ioctl,
+	NULL,
+	tcp_shutdown,
+	tcp_setsockopt,
+	tcp_getsockopt,
+	128,
+	0,
+	{NULL,},
+	"TCP",
+	0, 0
+};
-- 
cgit v1.2.3