/* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Version: @(#)tcp.c 1.0.16 05/25/93 * * Authors: Ross Biro, * Fred N. van Kempen, * Mark Evans, * Corey Minyard * Florian La Roche, * Charles Hedrick, * Linus Torvalds, * Alan Cox, * Matthew Dillon, * Arnt Gulbrandsen, * * Fixes: * Alan Cox : Numerous verify_area() calls * Alan Cox : Set the ACK bit on a reset * Alan Cox : Stopped it crashing if it closed while sk->inuse=1 * and was trying to connect (tcp_err()). * Alan Cox : All icmp error handling was broken * pointers passed where wrong and the * socket was looked up backwards. Nobody * tested any icmp error code obviously. * Alan Cox : tcp_err() now handled properly. It wakes people * on errors. select behaves and the icmp error race * has gone by moving it into sock.c * Alan Cox : tcp_reset() fixed to work for everything not just * packets for unknown sockets. * Alan Cox : tcp option processing. * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong] * Herp Rosmanith : More reset fixes * Alan Cox : No longer acks invalid rst frames. Acking * any kind of RST is right out. * Alan Cox : Sets an ignore me flag on an rst receive * otherwise odd bits of prattle escape still * Alan Cox : Fixed another acking RST frame bug. Should stop * LAN workplace lockups. * Alan Cox : Some tidyups using the new skb list facilities * Alan Cox : sk->keepopen now seems to work * Alan Cox : Pulls options out correctly on accepts * Alan Cox : Fixed assorted sk->rqueue->next errors * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops. * Alan Cox : Tidied tcp_data to avoid a potential nasty. * Alan Cox : Added some better commenting, as the tcp is hard to follow * Alan Cox : Removed incorrect check for 20 * psh * Michael O'Reilly : ack < copied bug fix. 
* Johannes Stille : Misc tcp fixes (not all in yet). * Alan Cox : FIN with no memory -> CRASH * Alan Cox : Added socket option proto entries. Also added awareness of them to accept. * Alan Cox : Added TCP options (SOL_TCP) * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets. * Alan Cox : Use ip_tos/ip_ttl settings. * Alan Cox : Handle FIN (more) properly (we hope). * Alan Cox : RST frames sent on unsynchronised state ack error/ * Alan Cox : Put in missing check for SYN bit. * Alan Cox : Added tcp_select_window() aka NET2E * window non shrink trick. * Alan Cox : Added a couple of small NET2E timer fixes * Charles Hedrick : TCP fixes * Toomas Tamm : TCP window fixes * Alan Cox : Small URG fix to rlogin ^C ack fight * Charles Hedrick : Rewrote most of it to actually work * Linus : Rewrote tcp_read() and URG handling * completely * Gerhard Koerting: Fixed some missing timer handling * Matthew Dillon : Reworked TCP machine states as per RFC * Gerhard Koerting: PC/TCP workarounds * Adam Caldwell : Assorted timer/timing errors * Matthew Dillon : Fixed another RST bug * Alan Cox : Move to kernel side addressing changes. * Alan Cox : Beginning work on TCP fastpathing (not yet usable) * Arnt Gulbrandsen: Turbocharged tcp_check() routine. * Alan Cox : TCP fast path debugging * Alan Cox : Window clamping * Michael Riepe : Bug in tcp_check() * Matt Dillon : More TCP improvements and RST bug fixes * Matt Dillon : Yet more small nasties remove from the TCP code * (Be very nice to this man if tcp finally works 100%) 8) * Alan Cox : BSD accept semantics. * Alan Cox : Reset on closedown bug. * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). * Michael Pall : Handle select() after URG properly in all cases. * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin). * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now. 
* Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api. * Alan Cox : Changed the semantics of sk->socket to * fix a race and a signal problem with * accept() and async I/O. * Alan Cox : Relaxed the rules on tcp_sendto(). * Yury Shevchuk : Really fixed accept() blocking problem. * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for * clients/servers which listen in on * fixed ports. * Alan Cox : Cleaned the above up and shrank it to * a sensible code size. * Alan Cox : Self connect lockup fix. * Alan Cox : No connect to multicast. * Ross Biro : Close unaccepted children on master * socket close. * Alan Cox : Reset tracing code. * Alan Cox : Spurious resets on shutdown. * Alan Cox : Giant 15 minute/60 second timer error * Alan Cox : Small whoops in selecting before an accept. * Alan Cox : Kept the state trace facility since it's * handy for debugging. * Alan Cox : More reset handler fixes. * Alan Cox : Started rewriting the code based on the RFC's * for other useful protocol references see: * Comer, KA9Q NOS, and for a reference on the * difference between specifications and how BSD * works see the 4.4lite source. * A.N.Kuznetsov : Don't time wait on completion of tidy * close. * Linus Torvalds : Fin/Shutdown & copied_seq changes. * Linus Torvalds : Fixed BSD port reuse to work first syn * Alan Cox : Reimplemented timers as per the RFC and using multiple * timers for sanity. * Alan Cox : Small bug fixes, and a lot of new * comments. * Alan Cox : Fixed dual reader crash by locking * the buffers (much like datagram.c) * Alan Cox : Fixed stuck sockets in probe. A probe * now gets fed up of retrying without * (even a no space) answer. * Alan Cox : Extracted closing code better * Alan Cox : Fixed the closing state machine to * resemble the RFC. * Alan Cox : More 'per spec' fixes. * Alan Cox : tcp_data() doesn't ack illegal PSH * only frames. At least one pc tcp stack * generates them. * * * To Fix: * Fast path the code. 
Two things here - fix the window calculation * so it doesn't iterate over the queue, also spot packets with no funny * options arriving in order and process directly. * * Implement RFC 1191 [Path MTU discovery] * Look at the effect of implementing RFC 1337 suggestions and their impact. * Rewrite output state machine to use a single queue and do low window * situations as per the spec (RFC 1122) * Speed up input assembly algorithm. * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we * could do with it working on IPv4 * User settable/learned rtt/max window/mtu * Cope with MTU/device switches when retransmitting in tcp. * Fix the window handling to use PR's new code. * * Change the fundamental structure to a single send queue maintained * by TCP (removing the bogus ip stuff [thus fixing mtu drops on * active routes too]). Cut the queue off in tcp_retransmit/ * tcp_transmit. * Change the receive queue to assemble as it goes. This lets us * dispose of most of tcp_sequence, half of tcp_ack and chunks of * tcp_data/tcp_read as well as the window shrink crud. * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack * tcp_queue_skb seem obvious routines to extract. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or(at your option) any later version. * * Description of States: * * TCP_SYN_SENT sent a connection request, waiting for ack * * TCP_SYN_RECV received a connection request, sent ack, * waiting for final ack in three-way handshake. 
* * TCP_ESTABLISHED connection established * * TCP_FIN_WAIT1 our side has shutdown, waiting to complete * transmission of remaining buffered data * * TCP_FIN_WAIT2 all buffered data sent, waiting for remote * to shutdown * * TCP_CLOSING both sides have shutdown but we still have * data we have to finish sending * * TCP_TIME_WAIT timeout to catch resent junk before entering * closed, can only be entered from FIN_WAIT2 * or CLOSING. Required because the other end * may not have gotten our last ACK causing it * to retransmit the data packet (which we ignore) * * TCP_CLOSE_WAIT remote side has shutdown and is waiting for * us to finish writing our data and to shutdown * (we have to close() to move on to LAST_ACK) * * TCP_LAST_ACK out side has shutdown after remote has * shutdown. There may still be data in our * buffer that we have to finish sending * * TCP_CLOSE socket is finished */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "snmp.h" #include "ip.h" #include "protocol.h" #include "icmp.h" #include "tcp.h" #include "arp.h" #include #include "sock.h" #include "route.h" #include #include #include #include #include /* * The MSL timer is the 'normal' timer. 
*/ #define reset_msl_timer(x,y,z) reset_timer(x,y,z) #define SEQ_TICK 3 unsigned long seq_offset; struct tcp_mib tcp_statistics; static void tcp_close(struct sock *sk, int timeout); /* * The less said about this the better, but it works and will do for 1.2 */ static struct wait_queue *master_select_wakeup; static __inline__ int min(unsigned int a, unsigned int b) { if (a < b) return(a); return(b); } #undef STATE_TRACE #ifdef STATE_TRACE static char *statename[]={ "Unused","Established","Syn Sent","Syn Recv", "Fin Wait 1","Fin Wait 2","Time Wait", "Close", "Close Wait","Last ACK","Listen","Closing" }; #endif static __inline__ void tcp_set_state(struct sock *sk, int state) { if(sk->state==TCP_ESTABLISHED) tcp_statistics.TcpCurrEstab--; #ifdef STATE_TRACE if(sk->debug) printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]); #endif /* This is a hack but it doesn't occur often and it's going to be a real to fix nicely */ if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV) { wake_up_interruptible(&master_select_wakeup); } sk->state=state; if(state==TCP_ESTABLISHED) tcp_statistics.TcpCurrEstab++; } /* * This routine picks a TCP windows for a socket based on * the following constraints * * 1. The window can never be shrunk once it is offered (RFC 793) * 2. We limit memory per socket * * For now we use NET2E3's heuristic of offering half the memory * we have handy. All is not as bad as this seems however because * of two things. Firstly we will bin packets even within the window * in order to get the data we are waiting for into the memory limit. * Secondly we bin common duplicate forms at receive time * Better heuristics welcome */ int tcp_select_window(struct sock *sk) { int new_window = sk->prot->rspace(sk); if(sk->window_clamp) new_window=min(sk->window_clamp,new_window); /* * Two things are going on here. First, we don't ever offer a * window less than min(sk->mss, MAX_WINDOW/2). This is the * receiver side of SWS as specified in RFC1122. 
* Second, we always give them at least the window they * had before, in order to avoid retracting window. This * is technically allowed, but RFC1122 advises against it and * in practice it causes trouble. * * Fixme: This doesn't correctly handle the case where * new_window > sk->window but not by enough to allow for the * shift in sequence space. */ if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window) return(sk->window); return(new_window); } /* * Find someone to 'accept'. Must be called with * sk->inuse=1 or cli() */ static struct sk_buff *tcp_find_established(struct sock *s) { struct sk_buff *p=skb_peek(&s->receive_queue); if(p==NULL) return NULL; do { if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1) return p; p=p->next; } while(p!=(struct sk_buff *)&s->receive_queue); return NULL; } /* * Remove a completed connection and return it. This is used by * tcp_accept() to get connections from the queue. */ static struct sk_buff *tcp_dequeue_established(struct sock *s) { struct sk_buff *skb; unsigned long flags; save_flags(flags); cli(); skb=tcp_find_established(s); if(skb!=NULL) skb_unlink(skb); /* Take it off the queue */ restore_flags(flags); return skb; } /* * This routine closes sockets which have been at least partially * opened, but not yet accepted. Currently it is only called by * tcp_close, and timeout mirrors the value there. */ static void tcp_close_pending (struct sock *sk) { struct sk_buff *skb; while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { skb->sk->dead=1; tcp_close(skb->sk, 0); kfree_skb(skb, FREE_READ); } return; } /* * Enter the time wait state. */ static void tcp_time_wait(struct sock *sk) { tcp_set_state(sk,TCP_TIME_WAIT); sk->shutdown = SHUTDOWN_MASK; if (!sk->dead) sk->state_change(sk); reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); } /* * A socket has timed out on its send queue and wants to do a * little retransmitting. Currently this means TCP. 
*/ void tcp_do_retransmit(struct sock *sk, int all) { struct sk_buff * skb; struct proto *prot; struct device *dev; int ct=0; prot = sk->prot; skb = sk->send_head; while (skb != NULL) { struct tcphdr *th; struct iphdr *iph; int size; dev = skb->dev; IS_SKB(skb); skb->when = jiffies; /* * In general it's OK just to use the old packet. However we * need to use the current ack and window fields. Urg and * urg_ptr could possibly stand to be updated as well, but we * don't keep the necessary data. That shouldn't be a problem, * if the other end is doing the right thing. Since we're * changing the packet, we have to issue a new IP identifier. */ iph = (struct iphdr *)(skb->data + dev->hard_header_len); th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2)); size = skb->len - (((unsigned char *) th) - skb->data); /* * Note: We ought to check for window limits here but * currently this is done (less efficiently) elsewhere. * We do need to check for a route change but can't handle * that until we have the new 1.3.x buffers in. * */ iph->id = htons(ip_id_count++); ip_send_check(iph); /* * This is not the right way to handle this. We have to * issue an up to date window and ack report with this * retransmit to keep the odd buggy tcp that relies on * the fact BSD does this happy. * We don't however need to recalculate the entire * checksum, so someone wanting a small problem to play * with might like to implement RFC1141/RFC1624 and speed * this up by avoiding a full checksum. */ th->ack_seq = ntohl(sk->acked_seq); th->window = ntohs(tcp_select_window(sk)); tcp_send_check(th, sk->saddr, sk->daddr, size, sk); /* * If the interface is (still) up and running, kick it. */ if (dev->flags & IFF_UP) { /* * If the packet is still being sent by the device/protocol * below then don't retransmit. This is both needed, and good - * especially with connected mode AX.25 where it stops resends * occurring of an as yet unsent anyway frame! 
* We still add up the counts as the round trip time wants * adjusting. */ if (sk && !skb_device_locked(skb)) { /* Remove it from any existing driver queue first! */ skb_unlink(skb); /* Now queue it */ ip_statistics.IpOutRequests++; dev_queue_xmit(skb, dev, sk->priority); } } /* * Count retransmissions */ ct++; sk->prot->retransmits ++; /* * Only one retransmit requested. */ if (!all) break; /* * This should cut it off before we send too many packets. */ if (ct >= sk->cong_window) break; skb = skb->link3; } } /* * Reset the retransmission timer */ static void reset_xmit_timer(struct sock *sk, int why, unsigned long when) { del_timer(&sk->retransmit_timer); sk->ip_xmit_timeout = why; if((int)when < 0) { when=3; printk("Error: Negative timer in xmit_timer\n"); } sk->retransmit_timer.expires=when; add_timer(&sk->retransmit_timer); } /* * This is the normal code called for timeouts. It does the retransmission * and then does backoff. tcp_do_retransmit is separated out because * tcp_ack needs to send stuff from the retransmit queue without * initiating a backoff. */ void tcp_retransmit_time(struct sock *sk, int all) { tcp_do_retransmit(sk, all); /* * Increase the timeout each time we retransmit. Note that * we do not increase the rtt estimate. rto is initialized * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests * that doubling rto each time is the least we can get away with. * In KA9Q, Karn uses this for the first few times, and then * goes to quadratic. netBSD doubles, but only goes up to *64, * and clamps at 1 to 64 sec afterwards. Note that 120 sec is * defined in the protocol as the maximum possible RTT. I guess * we'll have to use something other than TCP to talk to the * University of Mars. * * PAWS allows us longer timeouts and large windows, so once * implemented ftp to mars will work nicely. We will have to fix * the 120 second clamps though! 
*/ sk->retransmits++; sk->backoff++; sk->rto = min(sk->rto << 1, 120*HZ); reset_xmit_timer(sk, TIME_WRITE, sk->rto); } /* * A timer event has trigger a tcp retransmit timeout. The * socket xmit queue is ready and set up to send. Because * the ack receive code keeps the queue straight we do * nothing clever here. */ static void tcp_retransmit(struct sock *sk, int all) { if (all) { tcp_retransmit_time(sk, all); return; } sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ /* sk->ssthresh in theory can be zero. I guess that's OK */ sk->cong_count = 0; sk->cong_window = 1; /* Do the actual retransmit. */ tcp_retransmit_time(sk, all); } /* * A write timeout has occurred. Process the after effects. */ static int tcp_write_timeout(struct sock *sk) { /* * Look for a 'soft' timeout. */ if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7)) || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) { /* * Attempt to recover if arp has changed (unlikely!) or * a route has shifted (not supported prior to 1.3). */ arp_destroy (sk->daddr, 0); ip_route_check (sk->daddr); } /* * Has it gone just too far ? */ if (sk->retransmits > TCP_RETR2) { sk->err = ETIMEDOUT; sk->error_report(sk); del_timer(&sk->retransmit_timer); /* * Time wait the socket */ if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) { tcp_set_state(sk,TCP_TIME_WAIT); reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); } else { /* * Clean up time. */ tcp_set_state(sk, TCP_CLOSE); return 0; } } return 1; } /* * The TCP retransmit timer. This lacks a few small details. * * 1. An initial rtt timeout on the probe0 should cause what we can * of the first write queue buffer to be split and sent. * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report * ETIMEDOUT if we know an additional 'soft' error caused this. * tcp_err should save a 'soft error' for us. 
*/

/*
 *	Timer handler.
 *
 *	data - the owning socket, passed as an unsigned long and cast
 *	       back to struct sock *.
 *
 *	The reason the timer was armed is read from sk->ip_xmit_timeout
 *	before anything else.  If the socket is in use (or in_bh is set)
 *	the timer is simply re-armed and the work is retried later.
 *	Otherwise the socket is marked in use for the duration and
 *	release_sock() is called on the way out.
 *
 *	NOTE(review): the return value of tcp_write_timeout() is ignored
 *	in all three cases below.
 */
static void retransmit_timer(unsigned long data)
{
	struct sock *sk = (struct sock*)data;
	int why = sk->ip_xmit_timeout;

	/*
	 * only process if socket is not in use
	 */

	/* Test and set of sk->inuse is done with interrupts off. */
	cli();
	if (sk->inuse || in_bh)
	{
		/* Try again in 1 second */
		sk->retransmit_timer.expires = HZ;
		add_timer(&sk->retransmit_timer);
		sti();
		return;
	}

	sk->inuse = 1;
	sti();

	/* Always see if we need to send an ack. */

	if (sk->ack_backlog && !sk->zapped)
	{
		sk->prot->read_wakeup (sk);
		if (! sk->dead)
			sk->data_ready(sk,0);
	}

	/* Now we need to figure out why the socket was on the timer. */

	switch (why)
	{
		/* Window probing */
		case TIME_PROBE0:
			tcp_send_probe0(sk);
			tcp_write_timeout(sk);
			break;
		/* Retransmitting */
		case TIME_WRITE:
			/* It could be we got here because we needed to send an ack.
			 * So we need to check for that.
			 */
		{
			struct sk_buff *skb;
			unsigned long flags;

			/* Look at the head of the retransmit list with interrupts off. */
			save_flags(flags);
			cli();
			skb = sk->send_head;
			if (!skb)
			{
				/* Nothing awaiting retransmission. */
				restore_flags(flags);
			}
			else
			{
				/*
				 *	Kicked by a delayed ack. Reset timer
				 *	correctly now
				 */
				/* The oldest buffer's timeout (when + rto) has not been reached yet. */
				if (jiffies < skb->when + sk->rto)
				{
					reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies);
					restore_flags(flags);
					break;
				}
				restore_flags(flags);
				/*
				 *	Retransmission
				 */
				sk->prot->retransmit (sk, 0);
				tcp_write_timeout(sk);
			}
			break;
		}
		/* Sending Keepalives */
		case TIME_KEEPOPEN:
			/*
			 * this reset_timer() call is a hack, this is not
			 * how KEEPOPEN is supposed to work.
			 */
			reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);

			/* Send something to keep the connection open. */
			if (sk->prot->write_wakeup)
				  sk->prot->write_wakeup (sk);
			sk->retransmits++;
			tcp_write_timeout(sk);
			break;
		default:
			printk ("rexmit_timer: timer expired - reason unknown\n");
			break;
	}
	release_sock(sk);
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.
We need * to find the appropriate port. */ void tcp_err(int err, unsigned char *header, unsigned long daddr, unsigned long saddr, struct inet_protocol *protocol) { struct tcphdr *th; struct sock *sk; struct iphdr *iph=(struct iphdr *)header; header+=4*iph->ihl; th =(struct tcphdr *)header; sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr); if (sk == NULL) return; if(err<0) { sk->err = -err; sk->error_report(sk); return; } if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) { /* * FIXME: * For now we will just trigger a linear backoff. * The slow start code should cause a real backoff here. */ if (sk->cong_window > 4) sk->cong_window--; return; } /* sk->err = icmp_err_convert[err & 0xff].errno; -- moved as TCP should hide non fatals internally (and does) */ /* * If we've already connected we will keep trying * until we time out, or the user gives up. */ if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT) { if (sk->state == TCP_SYN_SENT) { tcp_statistics.TcpAttemptFails++; tcp_set_state(sk,TCP_CLOSE); sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ } sk->err = icmp_err_convert[err & 0xff].error; } return; } /* * Walk down the receive queue counting readable data until we hit the end or we find a gap * in the received data queue (ie a frame missing that needs sending to us). Not * sorting using two queues as data arrives makes life so much harder. */ static int tcp_readable(struct sock *sk) { unsigned long counted; unsigned long amount; struct sk_buff *skb; int sum; unsigned long flags; if(sk && sk->debug) printk("tcp_readable: %p - ",sk); save_flags(flags); cli(); if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL) { restore_flags(flags); if(sk && sk->debug) printk("empty\n"); return(0); } counted = sk->copied_seq; /* Where we are at the moment */ amount = 0; /* * Do until a push or until we are out of data. 
*/ do { if (before(counted, skb->h.th->seq)) /* Found a hole so stops here */ break; sum = skb->len -(counted - skb->h.th->seq); /* Length - header but start from where we are up to (avoid overlaps) */ if (skb->h.th->syn) sum++; if (sum > 0) { /* Add it up, move on */ amount += sum; if (skb->h.th->syn) amount--; counted += sum; } /* * Don't count urg data ... but do it in the right place! * Consider: "old_data (ptr is here) URG PUSH data" * The old code would stop at the first push because * it counted the urg (amount==1) and then does amount-- * *after* the loop. This means tcp_readable() always * returned zero if any URG PUSH was in the queue, even * though there was normal data available. If we subtract * the urg data right here, we even get it to work for more * than one URG PUSH skb without normal data. * This means that select() finally works now with urg data * in the queue. Note that rlogin was never affected * because it doesn't use select(); it uses two processes * and a blocking read(). And the queue scan in tcp_read() * was correct. Mike */ if (skb->h.th->urg) amount--; /* don't count urg data */ if (amount && skb->h.th->psh) break; skb = skb->next; } while(skb != (struct sk_buff *)&sk->receive_queue); restore_flags(flags); if(sk->debug) printk("got %lu bytes.\n",amount); return(amount); } /* * LISTEN is a special case for select.. */ static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait) { if (sel_type == SEL_IN) { int retval; sk->inuse = 1; retval = (tcp_find_established(sk) != NULL); release_sock(sk); if (!retval) select_wait(&master_select_wakeup,wait); return retval; } return 0; } /* * Wait for a TCP event. * * Note that we don't need to set "sk->inuse", as the upper select layers * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. 
*/ static int tcp_select(struct sock *sk, int sel_type, select_table *wait) { if (sk->state == TCP_LISTEN) return tcp_listen_select(sk, sel_type, wait); switch(sel_type) { case SEL_IN: if (sk->err) return 1; if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) break; if (sk->shutdown & RCV_SHUTDOWN) return 1; if (sk->acked_seq == sk->copied_seq) break; if (sk->urg_seq != sk->copied_seq || sk->acked_seq != sk->copied_seq+1 || sk->urginline || !sk->urg_data) return 1; break; case SEL_OUT: if (sk->shutdown & SEND_SHUTDOWN) return 0; if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) break; /* * This is now right thanks to a small fix * by Matt Dillon. */ if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header) break; return 1; case SEL_EX: if (sk->err || sk->urg_data) return 1; break; } select_wait(sk->sleep, wait); return 0; } #ifndef _HURD_ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { int err; switch(cmd) { case TIOCINQ: #ifdef FIXME /* FIXME: */ case FIONREAD: #endif { unsigned long amount; if (sk->state == TCP_LISTEN) return(-EINVAL); sk->inuse = 1; amount = tcp_readable(sk); release_sock(sk); err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(unsigned long)); if(err) return err; put_fs_long(amount,(unsigned long *)arg); return(0); } case SIOCATMARK: { int answ = sk->urg_data && sk->urg_seq == sk->copied_seq; err = verify_area(VERIFY_WRITE,(void *) arg, sizeof(unsigned long)); if (err) return err; put_fs_long(answ,(int *) arg); return(0); } case TIOCOUTQ: { unsigned long amount; if (sk->state == TCP_LISTEN) return(-EINVAL); amount = sk->prot->wspace(sk); err=verify_area(VERIFY_WRITE,(void *)arg, sizeof(unsigned long)); if(err) return err; put_fs_long(amount,(unsigned long *)arg); return(0); } default: return(-EINVAL); } } #endif /* * This routine computes a TCP checksum. 
*/

/*
 *	th    - start of the TCP header; len bytes from here are summed.
 *	len   - TCP header plus data length in bytes.
 *	saddr - source address; 0 means "use ip_my_addr()".
 *	daddr - destination address.
 *
 *	Returns the low 16 bits of the complemented sum.
 *
 *	The first asm block adds daddr, saddr and a word built from the
 *	byte-swapped length and IPPROTO_TCP, folding the carry back in.
 *	The second adds the data: 32 bytes per pass of the unrolled loop,
 *	then the remaining 32-bit words, then an odd 16-bit word and an
 *	odd byte, and finally folds the upper half into the lower 16 bits.
 *
 *	NOTE(review): as they stand in this file the asm strings hold
 *	several instructions with no newline or ';' between them.  That
 *	looks like lost whitespace - check against a pristine copy.
 */
unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr)
{
	unsigned long sum;

	if (saddr == 0) saddr = ip_my_addr();

/*
 * stupid, gcc complains when I use just one __asm__ block,
 * something about too many reloads, but this is just two
 * instructions longer than what I want
 */
	__asm__(" addl %%ecx, %%ebx adcl %%edx, %%ebx adcl $0, %%ebx "
	: "=b"(sum)
	: "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
	: "bx", "cx", "dx" );
	__asm__(" movl %%ecx, %%edx cld cmpl $32, %%ecx jb 2f shrl $5, %%ecx clc 1: lodsl adcl %%eax, %%ebx lodsl adcl %%eax, %%ebx lodsl adcl %%eax, %%ebx lodsl adcl %%eax, %%ebx lodsl adcl %%eax, %%ebx lodsl adcl %%eax, %%ebx lodsl adcl %%eax, %%ebx lodsl adcl %%eax, %%ebx loop 1b adcl $0, %%ebx movl %%edx, %%ecx 2: andl $28, %%ecx je 4f shrl $2, %%ecx clc 3: lodsl adcl %%eax, %%ebx loop 3b adcl $0, %%ebx 4: movl $0, %%eax testw $2, %%dx je 5f lodsw addl %%eax, %%ebx adcl $0, %%ebx movw $0, %%ax 5: test $1, %%edx je 6f lodsb addl %%eax, %%ebx adcl $0, %%ebx 6: movl %%ebx, %%eax shrl $16, %%eax addw %%ax, %%bx adcw $0, %%bx "
	: "=b"(sum)
	: "0"(sum), "c"(len), "S"(th)
	: "ax", "bx", "cx", "dx", "si" );

  	/* We only want the bottom 16 bits, but we never cleared the top 16. */

  	return((~sum) & 0xffff);
}

/*
 *	Fill in the checksum field of a TCP header: zero it, then store
 *	tcp_check() over len bytes.  The sk argument is not used.
 */
void tcp_send_check(struct tcphdr *th, unsigned long saddr,
		unsigned long daddr, int len, struct sock *sk)
{
	th->check = 0;
	th->check = tcp_check(th, len, saddr, daddr);
	return;
}

/*
 *	This is the main buffer sending routine. We queue the buffer
 *	having checked it is sane seeming.
 *
 *	sk  - owning socket.
 *	skb - buffer to send; skb->h.th must point at its TCP header.
 *
 *	A malformed buffer is logged and freed.  Otherwise the buffer is
 *	either appended to sk->write_queue or stamped with current
 *	ack/window/checksum and passed to sk->prot->queue_xmit().
 */
static void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
{
	int size;
	struct tcphdr * th = skb->h.th;

	/*
	 *	length of packet (not counting length of pre-tcp headers)
	 */

	size = skb->len - ((unsigned char *) th - skb->data);

	/*
	 *	Sanity check it..
	 */

	if (size < sizeof(struct tcphdr) || size > skb->len)
	{
		printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n",
			skb, skb->data, th, skb->len);
		kfree_skb(skb, FREE_WRITE);
		return;
	}

	/*
	 *	If we have queued a header size packet.. (these crash a few
	 *	tcp stacks if ack is not set)
	 */

	if (size == sizeof(struct tcphdr))
	{
		/* If it's got a syn or fin it's notionally included in the size..*/
		if(!th->syn && !th->fin)
		{
			printk("tcp_send_skb: attempt to queue a bogon.\n");
			kfree_skb(skb,FREE_WRITE);
			return;
		}
	}

	/*
	 *	Actual processing.
	 */

	tcp_statistics.TcpOutSegs++;
	/* Sequence number just past this frame's data (size minus TCP header length). */
	skb->h.seq = ntohl(th->seq) + size - 4*th->doff;

	/*
	 *	We must queue if
	 *
	 *	a) The right edge of this frame exceeds the window
	 *	b) We are retransmitting (Nagle's rule)
	 *	c) We have too many packets 'in flight'
	 */

	if (after(skb->h.seq, sk->window_seq) ||
	    (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) ||
	     sk->packets_out >= sk->cong_window)
	{
		/* checksum will be supplied by tcp_write_xmit.  So
		 * we shouldn't need to set it at all.  I'm being paranoid */
		th->check = 0;
		if (skb->next != NULL)
		{
			/* NOTE(review): message names tcp_send_partial but this is tcp_send_skb. */
			printk("tcp_send_partial: next != NULL\n");
			skb_unlink(skb);
		}
		skb_queue_tail(&sk->write_queue, skb);

		/*
		 *	If we don't fit we have to start the zero window
		 *	probes. This is broken - we really need to do a partial
		 *	send _first_ (This is what causes the Cisco and PC/TCP
		 *	grief).
		 */

		if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
		    sk->send_head == NULL && sk->ack_backlog == 0)
			reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
	}
	else
	{
		/*
		 *	This is going straight out
		 */

		th->ack_seq = ntohl(sk->acked_seq);
		th->window = ntohs(tcp_select_window(sk));

		tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

		sk->sent_seq = sk->write_seq;

		/*
		 *	This is mad. The tcp retransmit queue is put together
		 *	by the ip layer. This causes half the problems with
		 *	unroutable FIN's and other things.
		 */

		sk->prot->queue_xmit(sk, skb->dev, skb, 0);

		/*
		 *	Set for next retransmit based on expected ACK time.
		 *	FIXME: We set this every time which means our
		 *	retransmits are really about a window behind.
		 */

		reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	}
}

/*
 *	Locking problems lead us to a messy situation where we can have
 *	multiple partially complete buffers queued up. This is really bad
 *	as we don't want to be sending partial buffers. Fix this with
 *	a semaphore or similar to lock tcp_write per socket.
 *
 *	These routines are pretty self descriptive.
 *
 *	tcp_dequeue_partial: detach and return sk->partial (NULL if
 *	none), cancelling the partial timer when a buffer was present.
 *	Runs with interrupts disabled.
 */
struct sk_buff * tcp_dequeue_partial(struct sock * sk)
{
	struct sk_buff * skb;
	unsigned long flags;

	save_flags(flags);
	cli();
	skb = sk->partial;
	if (skb) {
		sk->partial = NULL;
		del_timer(&sk->partial_timer);
	}
	restore_flags(flags);
	return skb;
}

/*
 *	Empty the partial queue
 *
 *	Sends every pending partial buffer via tcp_send_skb().  Also
 *	installed (through a cast) as the partial timer's handler by
 *	tcp_enqueue_partial(); a NULL sk is tolerated.
 */
static void tcp_send_partial(struct sock *sk)
{
	struct sk_buff *skb;

	if (sk == NULL)
		return;
	while ((skb = tcp_dequeue_partial(sk)) != NULL)
		tcp_send_skb(sk, skb);
}

/*
 *	Queue a partial frame
 *
 *	skb becomes the socket's partial buffer and the partial timer is
 *	(re)armed.  A previously queued partial buffer, if any, is sent
 *	once interrupts have been restored.
 */
void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk)
{
	struct sk_buff * tmp;
	unsigned long flags;

	save_flags(flags);
	cli();
	tmp = sk->partial;
	if (tmp)
		del_timer(&sk->partial_timer);
	sk->partial = skb;
	init_timer(&sk->partial_timer);
	/*
	 *	Wait up to 1 second for the buffer to fill.
	 */
	sk->partial_timer.expires = HZ;
	sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial;
	sk->partial_timer.data = (unsigned long) sk;
	add_timer(&sk->partial_timer);
	restore_flags(flags);
	/* Flush the buffer that was displaced, outside the critical section. */
	if (tmp)
		tcp_send_skb(sk, tmp);
}

/*
 *	This routine sends an ack and also updates the window.
 *
 *	sequence - value for the outgoing seq field.
 *	ack      - value for the outgoing ack_seq field.
 *	sk       - socket the ack is sent for; nothing is sent when
 *	           sk->zapped is set.
 *	th       - header used as the template; its ports are swapped.
 *	daddr    - destination address.
 *
 *	If no buffer can be allocated, the ack is counted in
 *	sk->ack_backlog and, for a connected socket not already on the
 *	write timer, that timer is started so the ack is retried.
 */
static void tcp_send_ack(unsigned long sequence, unsigned long ack,
	     struct sock *sk,
	     struct tcphdr *th, unsigned long daddr)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev = NULL;
	int tmp;

	if(sk->zapped)
		return;		/* We have been reset, we may not send again */

	/*
	 * We need to grab some memory, and put together an ack,
	 * and then put it into the queue to be sent.
	 */

	buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL)
	{
		/*
		 *	Force it to send an ack. We don't have to do this
		 *	(ACK is unreliable) but it's much better use of
		 *	bandwidth on slow links to send a spare ack than
		 *	resend packets.
		 */

		sk->ack_backlog++;
		if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state))
		{
			reset_xmit_timer(sk, TIME_WRITE, HZ);
		}
		return;
	}

	/*
	 *	Assemble a suitable TCP frame
	 */

	buff->len = sizeof(struct tcphdr);
	buff->sk = sk;
	buff->localroute = sk->localroute;
	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* Header could not be built: give the buffer back. */
  		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}
	/* tmp is the length of the headers placed in front of the TCP header. */
	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);

	memcpy(t1, th, sizeof(*t1));

	/*
	 *	Swap the send and the receive.
	 */

	t1->dest = th->source;
	t1->source = th->dest;
	t1->seq = ntohl(sequence);
	t1->ack = 1;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	t1->fin = 0;

	/*
	 *	If we have nothing queued for transmit and the transmit timer
	 *	is on we are just doing an ACK timeout and need to switch
	 *	to a keepalive.
	 */

	if (ack == sk->acked_seq)
	{
		/* This ack covers everything received: clear the pending-ack bookkeeping. */
		sk->ack_backlog = 0;
		sk->bytes_rcv = 0;
		sk->ack_timed = 0;
		if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL
				  && sk->ip_xmit_timeout == TIME_WRITE)
		{
			if(sk->keepopen) {
				reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN);
			} else {
				delete_timer(sk);
			}
		}
  	}

  	/*
  	 *	Fill in the packet and send it
  	 */

  	t1->ack_seq = ntohl(ack);
  	t1->doff = sizeof(*t1)/4;
  	tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk);
  	if (sk->debug)
  		 printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack);
  	tcp_statistics.TcpOutSegs++;
  	sk->prot->queue_xmit(sk, dev, buff, 1);
}

/*
 *	This routine builds a generic TCP header.
*/

/*
 *	th   - header to fill in; it starts as a copy of sk->dummy_th
 *	sk   - socket supplying write_seq, acked_seq and the window
 *	push - PSH is set in the header when this is 0
 *
 *	Side effects on sk: ack_backlog, bytes_rcv and ack_timed are
 *	cleared and sk->window is refreshed from tcp_select_window().
 *	Returns sizeof(*th).
 */
extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{

	memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
	th->seq = htonl(sk->write_seq);
	th->psh =(push == 0) ? 1 : 0;
	th->doff = sizeof(*th)/4;
	th->ack = 1;
	th->fin = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->ack_timed = 0;
	th->ack_seq = htonl(sk->acked_seq);
	sk->window = tcp_select_window(sk);
	th->window = htons(sk->window);

	return(sizeof(*th));
}

/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 *
 *	sk	 - socket to write to
 *	from	 - user buffer, read with memcpy_fromfs()
 *	len	 - number of bytes to write
 *	nonblock - when set, return instead of sleeping
 *	flags	 - MSG_OOB and MSG_DONTROUTE are examined
 *
 *	Returns the number of bytes copied.  When nothing was copied the
 *	return value is a negative error instead: -sk->err (sk->err is then
 *	cleared), -EPIPE, -EAGAIN, -ERESTARTSYS or the negative value from
 *	build_header().
 *
 *	sk->inuse is set on entry and again after every sleep or
 *	allocation; release_sock() is called before each of those and
 *	before returning.
 */
static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	unsigned char *buff;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0)
	{
		if (sk->err)
		{			/* Stop on an error */
			release_sock(sk);
			if (copied)
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established.
		 */

		if (sk->shutdown & SEND_SHUTDOWN)
		{
			release_sock(sk);
			/* sk->err is left at EPIPE only when a partial count is returned. */
			sk->err = EPIPE;
			if (copied)
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/*
		 *	Wait for a connection to finish.
		 */

		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
		{
			if (sk->err)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}

			/* Any state other than SYN_SENT/SYN_RECV cannot become writable. */
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			{
				release_sock(sk);
				if (copied)
					return(copied);

				if (sk->err)
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}

				if (sk->keepopen)
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			release_sock(sk);
			/* The state is re-tested with interrupts off before sleeping. */
			cli();

			if (sk->state != TCP_ESTABLISHED &&
				sk->state != TCP_CLOSE_WAIT && sk->err == 0)
			{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

	/*
	 * The following code can result in copy <= 0 if sk->mss is ever
	 * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
	 * sk->mtu is constant once SYN processing is finished.  I.e. we
	 * had better not get here until we've seen his SYN and at least one
	 * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 * But ESTABLISHED should guarantee that.  sk->max_window is by definition
	 * non-decreasing.  Note that any ioctl to set user_mss must be done
	 * before the exchange of SYN's.  If the initial ack from the other
	 * end has a window of 0, max_window and thus mss will both be 0.
	 */

	/*
	 *	Now we need to check if we have a half built packet.
	 */

		if ((skb = tcp_dequeue_partial(sk)) != NULL)
		{
			int hdrlen;

			 /* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
				 + sizeof(struct tcphdr);

			/* Add more stuff to the end of skb->len */
			if (!(flags & MSG_OOB))
			{
				/* Room left in this segment, limited by what the caller still has. */
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0)
				{
					printk("TCP: **bug**: \"copy\" <= 0!!\n");
					copy = 0;
				}

				memcpy_fromfs(skb->data + skb->len, from, copy);
				skb->len += copy;
				from += copy;
				copied += copy;
				len -= copy;
				sk->write_seq += copy;
			}
			/*
			 * Send now if the segment is full, the data is urgent or
			 * nothing is outstanding; otherwise keep it as the partial.
			 */
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				tcp_enqueue_partial(skb, sk);
			continue;
		}

	/*
	 * We also need to worry about the window.
	 * If window < 1/2 the maximum window we've seen from this
	 *   host, don't use it.  This is sender side
	 *   silly window prevention, as specified in RFC1122.
	 *   (Note that this is different than earlier versions of
	 *   SWS prevention, e.g. RFC813.).  What we actually do is
	 *   use the whole MSS.  Since the results in the right
	 *   edge of the packet being outside the window, it will
	 *   be queued for later rather than sent.
	 */

		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		if (copy > len)
			copy = len;

	/*
	 *	We should really check the window here also.
	 */

		send_tmp = NULL;
		if (copy < sk->mss && !(flags & MSG_OOB))
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
			/* A short segment is a candidate for the partial queue. */
			send_tmp = skb;
		}
		else
		{
			/*
			 *	We will release the socket in case we sleep here.
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep.
		 */

		if (skb == NULL)
		{
			sk->socket->flags |= SO_NOSPACE;
			if (nonblock)
			{
				release_sock(sk);
				if (copied)
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition.
			 */

			tmp = sk->wmem_alloc;
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it.
			 */
			if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0)
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked)
				{
					sti();
					if (copied)
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->len = 0;
		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);

		buff = skb->data;

		/*
		 * FIXME: we need to optimize this.
		 * Perhaps some hints here would be good.
		 */

		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 )
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}
		skb->len += tmp;
		skb->dev = dev;
		buff += tmp;
		skb->h.th =(struct tcphdr *) buff;
		/* len-copy is 0, and so PSH is set, when this segment takes the last of the data. */
		tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
		if (tmp < 0)
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied)
				return(copied);
			return(tmp);
		}

		if (flags & MSG_OOB)
		{
			((struct tcphdr *)buff)->urg = 1;
			((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
		}
		skb->len += tmp;
		memcpy_fromfs(buff+tmp, from, copy);

		from += copy;
		copied += copy;
		len -= copy;
		skb->len += copy;
		skb->free = 0;
		sk->write_seq += copy;

		/* Hold a short segment back while other packets are still outstanding. */
		if (send_tmp != NULL && sk->packets_out)
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

/*
 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *	interactive fast network servers. It's meant to be on and
 *	it really improves the throughput though not the echo time
 *	on my slow slip link - Alan
 */

/*
 *	Avoid possible race on send_tmp - c/o Johannes Stille
 */

	if(sk->partial && ((!sk->packets_out)
     /* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
	))
		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}

/*
 *	This is just a wrapper.
*/ static int tcp_sendto(struct sock *sk, unsigned char *from, int len, int nonblock, unsigned flags, struct sockaddr_in *addr, int addr_len) { if (flags & ~(MSG_OOB|MSG_DONTROUTE)) return -EINVAL; if (sk->state == TCP_CLOSE) return -ENOTCONN; if (addr_len < sizeof(*addr)) return -EINVAL; if (addr->sin_family && addr->sin_family != AF_INET) return -EINVAL; if (addr->sin_port != sk->dummy_th.dest) return -EISCONN; if (addr->sin_addr.s_addr != sk->daddr) return -EISCONN; return tcp_write(sk, from, len, nonblock, flags); } /* * Send an ack if one is backlogged at this point. Ought to merge * this with tcp_send_ack(). */ static void tcp_read_wakeup(struct sock *sk) { int tmp; struct device *dev = NULL; struct tcphdr *t1; struct sk_buff *buff; if (!sk->ack_backlog) return; /* * FIXME: we need to put code here to prevent this routine from * being called. Being called once in a while is ok, so only check * if this is the second time in a row. */ /* * We need to grab some memory, and put together an ack, * and then put it into the queue to be sent. */ buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC); if (buff == NULL) { /* Try again real soon. */ reset_xmit_timer(sk, TIME_WRITE, HZ); return; } buff->len = sizeof(struct tcphdr); buff->sk = sk; buff->localroute = sk->localroute; /* * Put in the IP header and routing stuff. 
*/

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
			       IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		/* Could not build the header: give the buffer back. */
		buff->free = 1;
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)(buff->data +tmp);

	/* Start from the socket's template header and turn it into a bare ACK. */
	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
	t1->seq = htonl(sk->sent_seq);
	t1->ack = 1;
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->syn = 0;
	t1->psh = 0;
	sk->ack_backlog = 0;
	sk->bytes_rcv = 0;
	sk->window = tcp_select_window(sk);
	t1->window = ntohs(sk->window);
	t1->ack_seq = ntohl(sk->acked_seq);
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}


/*
 * 	FIXME:
 * 	This routine frees used buffers.
 * 	It should consider sending an ACK to let the
 * 	other end know we now have a bigger window.
 *
 *	Buffers are taken from the head of sk->receive_queue for as long
 *	as they are marked used and have no users; the first buffer that
 *	is unused or still has users stops the scan.  If the receive space
 *	reported by prot->rspace() changed, an ack is either sent at once
 *	through tcp_read_wakeup() or scheduled on the transmit timer.
 */

static void cleanup_rbuf(struct sock *sk)
{
	unsigned long flags;
	unsigned long left;
	struct sk_buff *skb;
	unsigned long rspace;

	if(sk->debug)
	  	printk("cleaning rbuf for sk=%p\n", sk);

	save_flags(flags);
	cli();

	/* Receive space before anything is freed. */
	left = sk->prot->rspace(sk);

	/*
	 *	We have to loop through all the buffer headers,
	 *	and try to free up all the space we can.
	 */

	while((skb=skb_peek(&sk->receive_queue)) != NULL)
	{
		if (!skb->used || skb->users)
			break;
		skb_unlink(skb);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
	}

	restore_flags(flags);

	/*
	 *	FIXME:
	 *	At this point we should send an ack if the difference
	 *	in the window, and the amount of space is bigger than
	 *	TCP_WINDOW_DIFF.
	 */

	if(sk->debug)
		printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk),
  					    left);
	if ((rspace=sk->prot->rspace(sk)) != left)
	{
		/*
		 * This area has caused the most trouble.  The current strategy
		 * is to simply do nothing if the other end has room to send at
		 * least 3 full packets, because the ack from those will auto-
		 * matically update the window.  If the other end doesn't think
		 * we have much space left, but we have room for at least 1 more
		 * complete packet than it thinks we do, we will send an ack
		 * immediately.  Otherwise we will wait up to .5 seconds in case
		 * the user reads some more.
		 */
		sk->ack_backlog++;
	/*
	 * It's unclear whether to use sk->mtu or sk->mss here.  They differ only
	 * if the other end is offering a window smaller than the agreed on MSS
	 * (called sk->mtu here).  In theory there's no connection between send
	 * and receive, and so no reason to think that they're going to send
	 * small packets.  For the moment I'm using the hack of reducing the mss
	 * only on the send side, so I'm putting mtu here.
	 */

		if (rspace > (sk->window - sk->bytes_rcv + sk->mtu))
		{
			/* Send an ack right now. */
			tcp_read_wakeup(sk);
		}
		else
		{
			/* Force it to send an ack soon. */
			int was_active = del_timer(&sk->retransmit_timer);
			/*
			 * NOTE(review): the timer deleted and re-added here is
			 * sk->retransmit_timer, but the expiry compared is
			 * sk->timer.expires - confirm which timer was intended.
			 */
			if (!was_active || TCP_ACK_TIME < sk->timer.expires)
			{
				reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME);
			}
			else
				add_timer(&sk->retransmit_timer);
		}
	}
}


/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */

static int tcp_read_urg(struct sock * sk, int nonblock,
	     unsigned char *to, int len, unsigned flags)
{
	/*
	 *	No URG data to read
	 */
	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->err)
	{
		int tmp = -sk->err;
		sk->err = 0;
		return tmp;
	}

	if (sk->state == TCP_CLOSE || sk->done)
	{
		/* The first read after close reports 0, later ones -ENOTCONN. */
		if (!sk->done)
		{
			sk->done = 1;
			return 0;
		}
		return -ENOTCONN;
	}

	if (sk->shutdown & RCV_SHUTDOWN)
	{
		sk->done = 1;
		return 0;
	}
	sk->inuse = 1;
	if (sk->urg_data & URG_VALID)
	{
		char c = sk->urg_data;
		/* A peek leaves the urgent byte available for a later read. */
		if (!(flags & MSG_PEEK))
			sk->urg_data = URG_READ;
		put_fs_byte(c, to);
		release_sock(sk);
		return 1;
	}
	release_sock(sk);

	/*
	 * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
*	Mike
	 */
	return -EAGAIN;
}


/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	sk	 - socket to read from
 *	to	 - user buffer, written with memcpy_tofs()
 *	len	 - maximum number of bytes to copy
 *	nonblock - when set, return -EAGAIN instead of sleeping
 *	flags	 - MSG_OOB diverts to tcp_read_urg(); MSG_PEEK reads
 *		   without advancing sk->copied_seq or marking buffers used
 *
 *	Returns the number of bytes copied, or when nothing was copied one
 *	of: -sk->err (sk->err is then cleared), -ENOTCONN, -EAGAIN,
 *	-ERESTARTSYS, or 0 at end of data.
 */

static int tcp_read(struct sock *sk, unsigned char *to,
	int len, int nonblock, unsigned flags)
{
#ifndef _HURD_
	struct wait_queue wait = { current, NULL };
#endif
	int copied = 0;
	unsigned long peek_seq;
	volatile unsigned long *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;

	/*
	 *	This error should be checked.
	 */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/*
	 *	Urgent data needs to be handled specially.
	 */

	if (flags & MSG_OOB)
		return tcp_read_urg(sk, nonblock, to, len, flags);

	/*
	 *	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 */

	peek_seq = sk->copied_seq;
	seq = &sk->copied_seq;
	/* When peeking, advance a private copy instead of the socket's counter. */
	if (flags & MSG_PEEK)
		seq = &peek_seq;

#ifndef _HURD_
	add_wait_queue(sk->sleep, &wait);
#endif
	sk->inuse = 1;
	while (len > 0)
	{
		struct sk_buff * skb;
		unsigned long offset;

		/*
		 * Are we at urgent data? Stop if we have read anything.
		 */

		if (copied && sk->urg_data && sk->urg_seq == *seq)
			break;

		/*
		 *	Next get a buffer.
		 */

#ifndef _HURD_
		current->state = TASK_INTERRUPTIBLE;
#endif

		skb = skb_peek(&sk->receive_queue);
		do
		{
			if (!skb)
				break;
			/* Stop when the next buffer starts beyond the read position. */
			if (before(*seq, skb->h.th->seq))
				break;
			offset = *seq - skb->h.th->seq;
			if (skb->h.th->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			/* Nothing left to read in this buffer: let cleanup_rbuf() free it. */
			if (!(flags & MSG_PEEK))
				skb->used = 1;
			skb = skb->next;
		}
		while (skb != (struct sk_buff *)&sk->receive_queue);

		/* No readable data was found on the queue. */
		if (copied)
			break;

		if (sk->err)
		{
			copied = -sk->err;
			sk->err = 0;
			break;
		}

		if (sk->state == TCP_CLOSE)
		{
			/* The first read after close reports 0, later ones -ENOTCONN. */
			if (!sk->done)
			{
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN)
		{
			sk->done = 1;
			break;
		}

		if (nonblock)
		{
			copied = -EAGAIN;
			break;
		}

		/* Free consumed buffers and drop the socket before sleeping. */
		cleanup_rbuf(sk);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
#ifdef _HURD_
		interruptible_sleep_on (sk->sleep);
#else
		schedule();
#endif
		sk->socket->flags &= ~SO_WAITDATA;
		sk->inuse = 1;

		if (current->signal & ~current->blocked)
		{
			copied = -ERESTARTSYS;
			break;
		}
		continue;

	found_ok_skb:
		/*
		 *	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */

		skb->users++;

		/*
		 *	Ok so how much can we use ?
		 */

		used = skb->len - offset;
		if (len < used)
			used = len;
		/*
		 *	Do we have urgent data here?
		 */

		if (sk->urg_data)
		{
			unsigned long urg_offset = sk->urg_seq - *seq;
			if (urg_offset < used)
			{
				if (!urg_offset)
				{
					/* Positioned on the urgent byte: skip it unless it is inline. */
					if (!sk->urginline)
					{
						++*seq;
						offset++;
						used--;
					}
				}
				else
					/* Copy only up to the urgent byte this time round. */
					used = urg_offset;
			}
		}

		/*
		 *	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */

		*seq += used;

		/*
		 *	This memcpy_tofs can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */

		memcpy_tofs(to,((unsigned char *)skb->h.th) +
			skb->h.th->doff*4 + offset, used);
		copied += used;
		len -= used;
		to += used;

		/*
		 *	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */

		skb->users --;

		if (after(sk->copied_seq,sk->urg_seq))
			sk->urg_data = 0;
		/* Data remains in this buffer, so it is not marked used yet. */
		if (used + offset < skb->len)
			continue;

		/*
		 *	Process the FIN.
		 */

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		continue;

	found_fin_ok:
		/* The read position is advanced by one for the FIN. */
		++*seq;
		if (flags & MSG_PEEK)
			break;

		/*
		 *	All is done
		 */

		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;

	}
#ifndef _HURD_
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;
#endif

	/* Clean up data we have read: This will do ACK frames */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}

/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed.
 *
 *	Returns 1 when the caller should send a FIN, 0 otherwise.
 */

static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;
	int send_fin=0;
	switch(sk->state)
	{
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
		case TCP_ESTABLISHED:	/* Closedown begin */
			ns=TCP_FIN_WAIT1;
			send_fin=1;
			break;
		case TCP_FIN_WAIT1:	/* Already closing, or FIN sent: no change */
		case TCP_FIN_WAIT2:
		case TCP_CLOSING:
			ns=sk->state;
			break;
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
	}

	tcp_set_state(sk,ns);

	/*
	 *	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
*/ if(dead && ns==TCP_FIN_WAIT2) { int timer_active=del_timer(&sk->timer); if(timer_active) add_timer(&sk->timer); else reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT); } return send_fin; } /* * Send a fin. */ static void tcp_send_fin(struct sock *sk) { struct proto *prot =(struct proto *)sk->prot; struct tcphdr *th =(struct tcphdr *)&sk->dummy_th; struct tcphdr *t1; struct sk_buff *buff; struct device *dev=NULL; int tmp; release_sock(sk); /* in case the malloc sleeps. */ buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL); sk->inuse = 1; if (buff == NULL) { /* This is a disaster if it occurs */ printk("tcp_send_fin: Impossible malloc failure"); return; } /* * Administrivia */ buff->sk = sk; buff->len = sizeof(*t1); buff->localroute = sk->localroute; t1 =(struct tcphdr *) buff->data; /* * Put in the IP header and routing stuff. */ tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev, IPPROTO_TCP, sk->opt, sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl); if (tmp < 0) { int t; /* * Finish anyway, treat this as a send that got lost. * (Not good). */ buff->free = 1; prot->wfree(sk,buff->mem_addr, buff->mem_len); sk->write_seq++; t=del_timer(&sk->timer); if(t) add_timer(&sk->timer); else reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); return; } /* * We ought to check if the end of the queue is a buffer and * if so simply add the fin to that buffer, not send it ahead. */ t1 =(struct tcphdr *)((char *)t1 +tmp); buff->len += tmp; buff->dev = dev; memcpy(t1, th, sizeof(*t1)); t1->seq = ntohl(sk->write_seq); sk->write_seq++; buff->h.seq = sk->write_seq; t1->ack = 1; t1->ack_seq = ntohl(sk->acked_seq); t1->window = ntohs(sk->window=tcp_select_window(sk)); t1->fin = 1; t1->rst = 0; t1->doff = sizeof(*t1)/4; tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk); /* * If there is data in the write queue, the fin must be appended to * the write queue. 
*/ if (skb_peek(&sk->write_queue) != NULL) { buff->free = 0; if (buff->next != NULL) { printk("tcp_send_fin: next != NULL\n"); skb_unlink(buff); } skb_queue_tail(&sk->write_queue, buff); } else { sk->sent_seq = sk->write_seq; sk->prot->queue_xmit(sk, dev, buff, 0); reset_xmit_timer(sk, TIME_WRITE, sk->rto); } } /* * Shutdown the sending side of a connection. Much like close except * that we don't receive shut down or set sk->dead=1. */ void tcp_shutdown(struct sock *sk, int how) { /* * We need to grab some memory, and put together a FIN, * and then put it into the queue to be sent. * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. */ if (!(how & SEND_SHUTDOWN)) return; /* * If we've already sent a FIN, or it's a closed state */ if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING || sk->state == TCP_LAST_ACK || sk->state == TCP_TIME_WAIT || sk->state == TCP_CLOSE || sk->state == TCP_LISTEN ) { return; } sk->inuse = 1; /* * flag that the sender has shutdown */ sk->shutdown |= SEND_SHUTDOWN; /* * Clear out any half completed packets. */ if (sk->partial) tcp_send_partial(sk); /* * FIN if needed */ if(tcp_close_state(sk,0)) tcp_send_fin(sk); release_sock(sk); } static int tcp_recvfrom(struct sock *sk, unsigned char *to, int to_len, int nonblock, unsigned flags, struct sockaddr_in *addr, int *addr_len) { int result; /* * Have to check these first unlike the old code. If * we check them after we lose data on an error * which is wrong */ if(addr_len) *addr_len = sizeof(*addr); result=tcp_read(sk, to, to_len, nonblock, flags); if (result < 0) return(result); if(addr) { addr->sin_family = AF_INET; addr->sin_port = sk->dummy_th.dest; addr->sin_addr.s_addr = sk->daddr; } return(result); } /* * This routine will send an RST to the other tcp. 
*/

/*
 *	saddr, daddr - addresses handed to build_header() and the checksum
 *	th	     - the offending segment; used as the header template
 *	prot	     - protocol operations used to allocate, build and queue
 *	opt	     - IP options passed to build_header()
 *	dev	     - device recorded in the reply buffer
 *	tos, ttl     - IP type of service and time to live for the reply
 *
 *	The reply is silently dropped if th is itself a reset, if no
 *	buffer is available or if the IP header cannot be built.
 */
static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th,
	  struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl)
{
	struct sk_buff *skb;
	struct tcphdr *rth;
	struct device *out_dev = NULL;
	int ip_len;

	/*
	 *	A reset is never answered with another reset.
	 */
	if (th->rst)
		return;

	/*
	 *	Get a buffer to build the RST in; without one we give up.
	 */
	skb = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC);
	if (skb == NULL)
		return;

	skb->len = sizeof(*rth);
	skb->sk = NULL;
	skb->dev = dev;
	skb->localroute = 0;

	rth = (struct tcphdr *) skb->data;

	/*
	 *	IP header and routing first.
	 */
	ip_len = prot->build_header(skb, saddr, daddr, &out_dev, IPPROTO_TCP, opt,
			   sizeof(struct tcphdr), tos, ttl);
	if (ip_len < 0)
	{
		skb->free = 1;
		prot->wfree(NULL, skb->mem_addr, skb->mem_len);
		return;
	}

	/*
	 *	The TCP header follows the IP header; begin with a copy of
	 *	the segment being answered.
	 */
	rth = (struct tcphdr *)((char *)rth + ip_len);
	skb->len += ip_len;
	memcpy(rth, th, sizeof(*rth));

	/*
	 *	The reply travels the other way, so exchange the ports.
	 */
	rth->dest = th->source;
	rth->source = th->dest;

	rth->rst = 1;
	rth->window = 0;

	if (!th->ack)
	{
		/*
		 *	No ACK on the incoming segment: send sequence 0 and
		 *	acknowledge its sequence number, plus one for a SYN.
		 */
		rth->ack = 1;
		if (th->syn)
			rth->ack_seq = htonl(th->seq+1);
		else
			rth->ack_seq = htonl(th->seq);
		rth->seq = 0;
	}
	else
	{
		/*
		 *	The incoming segment carried an ACK: its ack_seq
		 *	becomes our sequence number and we acknowledge nothing.
		 */
		rth->ack = 0;
		rth->seq = th->ack_seq;
		rth->ack_seq = 0;
	}

	/*
	 *	Clear the remaining flags; the header carries no options.
	 */
	rth->syn = 0;
	rth->urg = 0;
	rth->fin = 0;
	rth->psh = 0;
	rth->doff = sizeof(*rth)/4;

	tcp_send_check(rth, saddr, daddr, sizeof(*rth), NULL);
	prot->queue_xmit(NULL, out_dev, skb, 1);
	tcp_statistics.TcpOutSegs++;
}


/*
 *	Look for tcp options. Parses everything but only knows about MSS.
 *	This routine is always called with the packet containing the SYN.
 *	However it may also be called with the ack to the SYN.  So you
 *	can't assume this is always the SYN.  It's always called after
 *	we have set up sk->mtu to our own MTU.
 *
 *	We need at minimum to add PAWS support here. Possibly large windows
 *	as Linux gets deployed on 100Mb/sec networks.
*/ static void tcp_options(struct sock *sk, struct tcphdr *th) { unsigned char *ptr; int length=(th->doff*4)-sizeof(struct tcphdr); int mss_seen = 0; ptr = (unsigned char *)(th + 1); while(length>0) { int opcode=*ptr++; int opsize=*ptr++; switch(opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; ptr--; /* the opsize=*ptr++ above was a mistake */ continue; default: if(opsize<=2) /* Avoid silly options looping forever */ return; switch(opcode) { case TCPOPT_MSS: if(opsize==4 && th->syn) { sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr)); mss_seen = 1; } break; /* Add other options here as people feel the urge to implement stuff like large windows */ } ptr+=opsize-2; length-=opsize; } } if (th->syn) { if (! mss_seen) sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */ } #ifdef CONFIG_INET_PCTCP sk->mss = min(sk->max_window >> 1, sk->mtu); #else sk->mss = min(sk->max_window, sk->mtu); #endif } static inline unsigned long default_mask(unsigned long dst) { dst = ntohl(dst); if (IN_CLASSA(dst)) return htonl(IN_CLASSA_NET); if (IN_CLASSB(dst)) return htonl(IN_CLASSB_NET); return htonl(IN_CLASSC_NET); } /* * Default sequence number picking algorithm. * As close as possible to RFC 793, which * suggests using a 250kHz clock. * Further reading shows this assumes 2MB/s networks. * For 10MB/s ethernet, a 1MHz clock is appropriate. * That's funny, Linux has one built in! Use it! */ extern inline unsigned long tcp_init_seq(void) { struct timeval tv; do_gettimeofday(&tv); return tv.tv_usec+tv.tv_sec*1000000; } /* * This routine handles a connection request. * It should make sure we haven't already responded. * Because of the way BSD works, we have to send a syn/ack now. * This also means it will be harder to close a socket which is * listening. 
*/ static void tcp_conn_request(struct sock *sk, struct sk_buff *skb, unsigned long daddr, unsigned long saddr, struct options *opt, struct device *dev, unsigned long seq) { struct sk_buff *buff; struct tcphdr *t1; unsigned char *ptr; struct sock *newsk; struct tcphdr *th; struct device *ndev=NULL; int tmp; struct rtable *rt; th = skb->h.th; /* If the socket is dead, don't accept the connection. */ if (!sk->dead) { sk->data_ready(sk,0); } else { if(sk->debug) printk("Reset on %p: Connect on dead socket.\n",sk); tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl); tcp_statistics.TcpAttemptFails++; kfree_skb(skb, FREE_READ); return; } /* * Make sure we can accept more. This will prevent a * flurry of syns from eating up all our memory. */ if (sk->ack_backlog >= sk->max_ack_backlog) { tcp_statistics.TcpAttemptFails++; kfree_skb(skb, FREE_READ); return; } /* * We need to build a new sock struct. * It is sort of bad to have a socket without an inode attached * to it, but the wake_up's will just wake up the listening socket, * and if the listening socket is destroyed before this is taken * off of the queue, this will take care of it. */ newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC); if (newsk == NULL) { /* just ignore the syn. It will get retransmitted. 
*/ tcp_statistics.TcpAttemptFails++; kfree_skb(skb, FREE_READ); return; } memcpy(newsk, sk, sizeof(*newsk)); skb_queue_head_init(&newsk->write_queue); skb_queue_head_init(&newsk->receive_queue); newsk->send_head = NULL; newsk->send_tail = NULL; skb_queue_head_init(&newsk->back_log); newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/ newsk->rto = TCP_TIMEOUT_INIT; newsk->mdev = 0; newsk->max_window = 0; newsk->cong_window = 1; newsk->cong_count = 0; newsk->ssthresh = 0; newsk->backoff = 0; newsk->blog = 0; newsk->intr = 0; newsk->proc = 0; newsk->done = 0; newsk->partial = NULL; newsk->pair = NULL; newsk->wmem_alloc = 0; newsk->rmem_alloc = 0; newsk->localroute = sk->localroute; newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF; newsk->err = 0; newsk->shutdown = 0; newsk->ack_backlog = 0; newsk->acked_seq = skb->h.th->seq+1; newsk->copied_seq = skb->h.th->seq+1; newsk->fin_seq = skb->h.th->seq; newsk->state = TCP_SYN_RECV; newsk->timeout = 0; newsk->ip_xmit_timeout = 0; newsk->write_seq = seq; newsk->window_seq = newsk->write_seq; newsk->rcv_ack_seq = newsk->write_seq; newsk->urg_data = 0; newsk->retransmits = 0; newsk->linger=0; newsk->destroy = 0; init_timer(&newsk->timer); newsk->timer.data = (unsigned long)newsk; newsk->timer.function = &net_timer; init_timer(&newsk->retransmit_timer); newsk->retransmit_timer.data = (unsigned long)newsk; newsk->retransmit_timer.function=&retransmit_timer; newsk->dummy_th.source = skb->h.th->dest; newsk->dummy_th.dest = skb->h.th->source; /* * Swap these two, they are from our point of view. 
*/ newsk->daddr = saddr; newsk->saddr = daddr; put_sock(newsk->num,newsk); newsk->dummy_th.res1 = 0; newsk->dummy_th.doff = 6; newsk->dummy_th.fin = 0; newsk->dummy_th.syn = 0; newsk->dummy_th.rst = 0; newsk->dummy_th.psh = 0; newsk->dummy_th.ack = 0; newsk->dummy_th.urg = 0; newsk->dummy_th.res2 = 0; newsk->acked_seq = skb->h.th->seq + 1; newsk->copied_seq = skb->h.th->seq + 1; newsk->socket = NULL; /* * Grab the ttl and tos values and use them */ newsk->ip_ttl=sk->ip_ttl; newsk->ip_tos=skb->ip_hdr->tos; /* * Use 512 or whatever user asked for */ /* * Note use of sk->user_mss, since user has no direct access to newsk */ rt=ip_rt_route(saddr, NULL,NULL); if(rt!=NULL && (rt->rt_flags&RTF_WINDOW)) newsk->window_clamp = rt->rt_window; else newsk->window_clamp = 0; if (sk->user_mss) newsk->mtu = sk->user_mss; else if(rt!=NULL && (rt->rt_flags&RTF_MSS)) newsk->mtu = rt->rt_mss - HEADER_SIZE; else { #ifdef CONFIG_INET_SNARL /* Sub Nets Are Local */ if ((saddr ^ daddr) & default_mask(saddr)) #else if ((saddr ^ daddr) & dev->pa_mask) #endif newsk->mtu = 576 - HEADER_SIZE; else newsk->mtu = MAX_WINDOW; } /* * But not bigger than device MTU */ newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE); /* * This will min with what arrived in the packet */ tcp_options(newsk,skb->h.th); buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC); if (buff == NULL) { sk->err = -ENOMEM; newsk->dead = 1; newsk->state = TCP_CLOSE; /* And this will destroy it */ release_sock(newsk); kfree_skb(skb, FREE_READ); tcp_statistics.TcpAttemptFails++; return; } buff->len = sizeof(struct tcphdr)+4; buff->sk = newsk; buff->localroute = newsk->localroute; t1 =(struct tcphdr *) buff->data; /* * Put in the IP header and routing stuff. */ tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev, IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl); /* * Something went wrong. 
*/ if (tmp < 0) { sk->err = tmp; buff->free = 1; kfree_skb(buff,FREE_WRITE); newsk->dead = 1; newsk->state = TCP_CLOSE; release_sock(newsk); skb->sk = sk; kfree_skb(skb, FREE_READ); tcp_statistics.TcpAttemptFails++; return; } buff->len += tmp; t1 =(struct tcphdr *)((char *)t1 +tmp); memcpy(t1, skb->h.th, sizeof(*t1)); buff->h.seq = newsk->write_seq; /* * Swap the send and the receive. */ t1->dest = skb->h.th->source; t1->source = newsk->dummy_th.source; t1->seq = ntohl(newsk->write_seq++); t1->ack = 1; newsk->window = tcp_select_window(newsk); newsk->sent_seq = newsk->write_seq; t1->window = ntohs(newsk->window); t1->res1 = 0; t1->res2 = 0; t1->rst = 0; t1->urg = 0; t1->psh = 0; t1->syn = 1; t1->ack_seq = ntohl(skb->h.th->seq+1); t1->doff = sizeof(*t1)/4+1; ptr =(unsigned char *)(t1+1); ptr[0] = 2; ptr[1] = 4; ptr[2] = ((newsk->mtu) >> 8) & 0xff; ptr[3] =(newsk->mtu) & 0xff; tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk); newsk->prot->queue_xmit(newsk, ndev, buff, 0); reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT); skb->sk = newsk; /* * Charge the sock_buff to newsk. */ sk->rmem_alloc -= skb->mem_len; newsk->rmem_alloc += skb->mem_len; skb_queue_tail(&sk->receive_queue,skb); sk->ack_backlog++; release_sock(newsk); tcp_statistics.TcpOutSegs++; } static void tcp_close(struct sock *sk, int timeout) { /* * We need to grab some memory, and put together a FIN, * and then put it into the queue to be sent. */ sk->inuse = 1; if(sk->state == TCP_LISTEN) { /* Special case */ tcp_set_state(sk, TCP_CLOSE); tcp_close_pending(sk); release_sock(sk); return; } sk->keepopen = 1; sk->shutdown = SHUTDOWN_MASK; if (!sk->dead) sk->state_change(sk); if (timeout == 0) { struct sk_buff *skb; /* * We need to flush the recv. buffs. We do this only on the * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! 
*/ while((skb=skb_dequeue(&sk->receive_queue))!=NULL) kfree_skb(skb, FREE_READ); /* * Get rid off any half-completed packets. */ if (sk->partial) tcp_send_partial(sk); } /* * Timeout is not the same thing - however the code likes * to send both the same way (sigh). */ if(timeout) { tcp_set_state(sk, TCP_CLOSE); /* Dead */ } else { if(tcp_close_state(sk,1)==1) { tcp_send_fin(sk); } } release_sock(sk); } /* * This routine takes stuff off of the write queue, * and puts it in the xmit queue. This happens as incoming acks * open up the remote window for us. */ static void tcp_write_xmit(struct sock *sk) { struct sk_buff *skb; /* * The bytes will have to remain here. In time closedown will * empty the write queue and all will be happy */ if(sk->zapped) return; /* * Anything on the transmit queue that fits the window can * be added providing we are not * * a) retransmitting (Nagle's rule) * b) exceeding our congestion window. */ while((skb = skb_peek(&sk->write_queue)) != NULL && before(skb->h.seq, sk->window_seq + 1) && (sk->retransmits == 0 || sk->ip_xmit_timeout != TIME_WRITE || before(skb->h.seq, sk->rcv_ack_seq + 1)) && sk->packets_out < sk->cong_window) { IS_SKB(skb); skb_unlink(skb); /* * See if we really need to send the packet. */ if (before(skb->h.seq, sk->rcv_ack_seq +1)) { /* * This is acked data. We can discard it. This * cannot currently occur. */ sk->retransmits = 0; kfree_skb(skb, FREE_WRITE); if (!sk->dead) sk->write_space(sk); } else { struct tcphdr *th; struct iphdr *iph; int size; /* * put in the ack seq and window at this point rather than earlier, * in order to keep them monotonic. We really want to avoid taking * back window allocations. That's legal, but RFC1122 says it's frowned on. * Ack and window will in general have changed since this packet was put * on the write queue. 
 */
			/* Locate the IP header behind the link header, then the TCP header behind it. */
			iph = (struct iphdr *)(skb->data + skb->dev->hard_header_len);
			th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2));
			/* Bytes from the start of the TCP header to the end of the buffer. */
			size = skb->len - (((unsigned char *) th) - skb->data);

			th->ack_seq = ntohl(sk->acked_seq);
			th->window = ntohs(tcp_select_window(sk));

			tcp_send_check(th, sk->saddr, sk->daddr, size, sk);

			sk->sent_seq = skb->h.seq;

			/*
			 *	IP manages our queue for some crazy reason
			 */

			sk->prot->queue_xmit(sk, skb->dev, skb, skb->free);

			/*
			 *	Again we slide the timer wrongly
			 */

			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}
}


/*
 *	This routine deals with incoming acks, but not outgoing ones.
 *
 *	sk    - socket the segment belongs to
 *	th    - TCP header of the received segment
 *	saddr - not referenced in the body
 *	len   - compared with th->doff*4 to tell whether the segment
 *	        carries anything beyond the header
 *
 *	Returns 0 when the ack number is after sk->sent_seq, and 1 in
 *	every other case (including a zapped socket and an ack older
 *	than sk->rcv_ack_seq).
 */

extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len)
{
	unsigned long ack;
	int flag = 0;

	/*
	 * 1 - there was data in packet as well as ack or new data is sent or
	 *     in shutdown state
	 * 2 - data from retransmit queue was acked and removed
	 * 4 - window shrunk or data from retransmit queue was acked and removed
	 */

	if(sk->zapped)
		return(1);	/* Dead, cant ack any more so why bother */

	/*
	 *	Have we discovered a larger window
	 */

	ack = ntohl(th->ack_seq);

	if (ntohs(th->window) > sk->max_window)
	{
		sk->max_window = ntohs(th->window);
#ifdef CONFIG_INET_PCTCP
		/* Hack because we don't send partial packets to non SWS
		   handling hosts */
		sk->mss = min(sk->max_window>>1, sk->mtu);
#else
		sk->mss = min(sk->max_window, sk->mtu);
#endif
	}

	/*
	 *	We have dropped back to keepalive timeouts. Thus we have
	 *	no retransmits pending.
	 */

	if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN)
		sk->retransmits = 0;

	/*
	 *	If the ack is newer than sent or older than previous acks
	 *	then we can probably ignore it.
	 */

	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq))
	{
		if(sk->debug)
			printk("Ack ignored %lu %lu\n",ack,sk->sent_seq);

		/*
		 *	Keepalive processing.
		 */

		if (after(ack, sk->sent_seq))
		{
			return(0);
		}

		/*
		 *	Restart the keepalive timer.
		 */

		if (sk->keepopen)
		{
			if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
		}
		return(1);
	}

	/*
	 *	If there is data set flag 1
	 */

	if (len != th->doff*4)
		flag |= 1;

	/*
	 *	See if our window has been shrunk.
	 */

	if (after(sk->window_seq, ack+ntohs(th->window)))
	{
		/*
		 * We may need to move packets from the send queue
		 * to the write queue, if the window has been shrunk on us.
		 * The RFC says you are not allowed to shrink your window
		 * like this, but if the other end does, you must be able
		 * to deal with it.
		 */
		struct sk_buff *skb;
		struct sk_buff *skb2;
		struct sk_buff *wskb = NULL;

		/* Detach the whole link3 chain; it is rebuilt in the loop below. */
		skb2 = sk->send_head;
		sk->send_head = NULL;
		sk->send_tail = NULL;

		/*
		 *	This is an artifact of a flawed concept. We want one
		 *	queue and a smarter send routine when we send all.
		 */

		flag |= 4;	/* Window changed */

		sk->window_seq = ack + ntohs(th->window);
		cli();
		while (skb2 != NULL)
		{
			skb = skb2;
			skb2 = skb->link3;
			skb->link3 = NULL;
			if (after(skb->h.seq, sk->window_seq))
			{
				if (sk->packets_out > 0)
					sk->packets_out--;
				/* We may need to remove this from the dev send list. */
				if (skb->next != NULL)
				{
					skb_unlink(skb);
				}
				/* Now add it to the write_queue. */
				/* wskb is the last buffer moved so far, so the moved buffers keep their order. */
				if (wskb == NULL)
					skb_queue_head(&sk->write_queue,skb);
				else
					skb_append(wskb,skb);
				wskb = skb;
			}
			else
			{
				/* Still inside the window: put it back on the tail of the send list. */
				if (sk->send_head == NULL)
				{
					sk->send_head = skb;
					sk->send_tail = skb;
				}
				else
				{
					sk->send_tail->link3 = skb;
					sk->send_tail = skb;
				}
				skb->link3 = NULL;
			}
		}
		sti();
	}

	/*
	 *	Pipe has emptied
	 */

	if (sk->send_tail == NULL || sk->send_head == NULL)
	{
		sk->send_head = NULL;
		sk->send_tail = NULL;
		sk->packets_out= 0;
	}

	/*
	 *	Update the right hand window edge of the host
	 */

	sk->window_seq = ack + ntohs(th->window);

	/*
	 *	We don't want too many packets out there.
	 */

	if (sk->ip_xmit_timeout == TIME_WRITE &&
		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq))
	{
		/*
		 * This is Jacobson's slow start and congestion avoidance.
		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a
		 * counter and increment it once every cwnd times.  It's possible
		 * that this should be done only if sk->retransmits == 0.  I'm
		 * interpreting "new data is acked" as including data that has
		 * been retransmitted but is just now being acked.
		 */
		if (sk->cong_window < sk->ssthresh)
			/*
			 *	In "safe" area, increase
			 */
			sk->cong_window++;
		else
		{
			/*
			 *	In dangerous area, increase slowly.  In theory this is
			 *  	sk->cong_window += 1 / sk->cong_window
			 */
			if (sk->cong_count >= sk->cong_window)
			{
				sk->cong_window++;
				sk->cong_count = 0;
			}
			else
				sk->cong_count++;
		}
	}

	/*
	 *	Remember the highest ack received.
	 */

	sk->rcv_ack_seq = ack;

	/*
	 *	If this ack opens up a zero window, clear backoff.  It was
	 *	being used to time the probes, and is probably far higher than
	 *	it needs to be for normal retransmission.
	 */

	if (sk->ip_xmit_timeout == TIME_PROBE0)
	{
		sk->retransmits = 0;	/* Our probe was answered */

		/*
		 *	Was it a usable window open ?
		 */

		if (skb_peek(&sk->write_queue) != NULL &&   /* should always be non-null */
		    ! before (sk->window_seq, sk->write_queue.next->h.seq))
		{
			sk->backoff = 0;

			/*
			 *	Recompute rto from rtt.  this eliminates any backoff.
			 */

			sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
			if (sk->rto > 120*HZ)
				sk->rto = 120*HZ;
			if (sk->rto < 20)	/* Was 1*HZ, then 1 - turns out we must allow about
						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
						   .2 of a second is going to need huge windows (SIGH) */
			sk->rto = 20;
		}
	}

	/*
	 *	See if we can take anything off of the retransmit queue.
	 */

	while(sk->send_head != NULL)
	{
		/* Check for a bug. */
		if (sk->send_head->link3 &&
		    after(sk->send_head->h.seq, sk->send_head->link3->h.seq))
			printk("INET: tcp.c: *** bug send_list out of order.\n");

		/*
		 *	If our packet is before the ack sequence we can
		 *	discard it as it's confirmed to have arrived the other end.
		 */

		if (before(sk->send_head->h.seq, ack+1))
		{
			struct sk_buff *oskb;
			if (sk->retransmits)
			{
				/*
				 *	We were retransmitting.  don't count this in RTT est
				 */
				flag |= 2;

				/*
				 * even though we've gotten an ack, we're still
				 * retransmitting as long as we're sending from
				 * the retransmit queue.  Keeping retransmits non-zero
				 * prevents us from getting new data interspersed with
				 * retransmissions.
				 */

				if (sk->send_head->link3)	/* Any more queued retransmits? */
					sk->retransmits = 1;
				else
					sk->retransmits = 0;
			}
			/*
			 * Note that we only reset backoff and rto in the
			 * rtt recomputation code.  And that doesn't happen
			 * if there were retransmissions in effect.  So the
			 * first new packet after the retransmissions is
			 * sent with the backoff still in effect.  Not until
			 * we get an ack from a non-retransmitted packet do
			 * we reset the backoff and rto.  This allows us to deal
			 * with a situation where the network delay has increased
			 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
			 */

			/*
			 *	We have one less packet out there.
			 */

			if (sk->packets_out > 0)
				sk->packets_out --;
			/*
			 *	Wake up the process, it can probably write more.
			 */
			if (!sk->dead)
				sk->write_space(sk);
			oskb = sk->send_head;

			if (!(flag&2)) 	/* Not retransmitting */
			{
				long m;

				/*
				 *	The following amusing code comes from Jacobson's
				 *	article in SIGCOMM '88.  Note that rtt and mdev
				 *	are scaled versions of rtt and mean deviation.
				 *	This is designed to be as fast as possible
				 *	m stands for "measurement".
				 */

				m = jiffies - oskb->when;  /* RTT */
				if(m<=0)
					m=1;		/* IS THIS RIGHT FOR <0 ??? */
				m -= (sk->rtt >> 3);    /* m is now error in rtt est */
				sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
				if (m < 0)
					m = -m;		/* m is now abs(error) */
				m -= (sk->mdev >> 2);   /* similar update on mdev */
				sk->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */

				/*
				 *	Now update timeout.  Note that this removes any backoff.
				 */

				sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1;
				if (sk->rto > 120*HZ)
					sk->rto = 120*HZ;
				if (sk->rto < 20)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
					sk->rto = 20;
				sk->backoff = 0;
			}
			flag |= (2|4);	/* 2 is really more like 'don't adjust the rtt
			                   In this case as we just set it up */
			cli();
			/* Unhook the head of the link3 chain with interrupts masked. */
			oskb = sk->send_head;
			IS_SKB(oskb);
			sk->send_head = oskb->link3;
			if (sk->send_head == NULL)
			{
				sk->send_tail = NULL;
			}

		/*
		 *	We may need to remove this from the dev send list.
		 */

			if (oskb->next)
				skb_unlink(oskb);
			sti();
			kfree_skb(oskb, FREE_WRITE); /* write. */
			if (!sk->dead)
				sk->write_space(sk);
		}
		else
		{
			break;
		}
	}

	/*
	 * XXX someone ought to look at this too.. at the moment, if skb_peek()
	 * returns non-NULL, we complete ignore the timer stuff in the else
	 * clause.  We ought to organize the code so that else clause can
	 * (should) be executed regardless, possibly moving the PROBE timer
	 * reset over.  The skb_peek() thing should only move stuff to the
	 * write queue, NOT also manage the timer functions.
	 */

	/*
	 * Maybe we can take some stuff off of the write queue,
	 * and put it onto the xmit queue.
	 */
	if (skb_peek(&sk->write_queue) != NULL)
	{
		if (after (sk->window_seq+1, sk->write_queue.next->h.seq) &&
		        (sk->retransmits == 0 ||
			 sk->ip_xmit_timeout != TIME_WRITE ||
			 before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1))
			&& sk->packets_out < sk->cong_window)
		{
			/*
			 *	Add more data to the send queue.
			 */
			flag |= 1;
			tcp_write_xmit(sk);
		}
		else if (before(sk->window_seq, sk->write_queue.next->h.seq) &&
 			sk->send_head == NULL &&
 			sk->ack_backlog == 0 &&
 			sk->state != TCP_TIME_WAIT)
 		{
 			/*
 			 *	Data to queue but no room.
 			 */
 	        	reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
 		}
	}
	else
	{
		/*
		 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
		 * from TCP_CLOSE we don't do anything
		 *
		 * from anything else, if there is write data (or fin) pending,
		 * we use a TIME_WRITE timeout, else if keepalive we reset to
		 * a KEEPALIVE timeout, else we delete the timer.
		 *
		 * We do not set flag for nominal write data, otherwise we may
		 * force a state where we start to write itsy bitsy tidbits
		 * of data.
		 */

		switch(sk->state) {
		case TCP_TIME_WAIT:
			/*
			 * keep us in TIME_WAIT until we stop getting packets,
			 * reset the timeout.
			 */
			reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
			break;
		case TCP_CLOSE:
			/*
			 * don't touch the timer.
			 */
			break;
		default:
			/*
			 * 	Must check send_head, write_queue, and ack_backlog
			 * 	to determine which timeout to use.
			 */
			if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) {
				reset_xmit_timer(sk, TIME_WRITE, sk->rto);
			} else if (sk->keepopen) {
				reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
			} else {
				del_timer(&sk->retransmit_timer);
				sk->ip_xmit_timeout = 0;
			}
			break;
		}
	}

	/*
	 *	We have nothing queued but space to send. Send any partial
	 *	packets immediately (end of Nagle rule application).
	 */

	if (sk->packets_out == 0 && sk->partial != NULL &&
		skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL)
	{
		flag |= 1;
		tcp_send_partial(sk);
	}

	/*
	 * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
	 * we are now waiting for an acknowledge to our FIN.  The other end is
	 * already in TIME_WAIT.
	 *
	 * Move to TCP_CLOSE on success.
	 */

	if (sk->state == TCP_LAST_ACK)
	{
		if (!sk->dead)
			sk->state_change(sk);
		if(sk->debug)
			printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n",
				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/)
		{
			flag |= 1;
			tcp_set_state(sk,TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of our initiating the close.
	 *
	 *	Move to FIN_WAIT2 to await a FIN from the other end. Set
	 *	SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
	 */

	if (sk->state == TCP_FIN_WAIT1)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			sk->shutdown |= SEND_SHUTDOWN;
			tcp_set_state(sk, TCP_FIN_WAIT2);
		}
	}

	/*
	 *	Incoming ACK to a FIN we sent in the case of a simultaneous close.
	 *
	 *	Move to TIME_WAIT
	 */

	if (sk->state == TCP_CLOSING)
	{

		if (!sk->dead)
			sk->state_change(sk);
		if (sk->rcv_ack_seq == sk->write_seq)
		{
			flag |= 1;
			tcp_time_wait(sk);
		}
	}

	/*
	 *	Final ack of a three way shake
	 */

	if(sk->state==TCP_SYN_RECV)
	{
		tcp_set_state(sk, TCP_ESTABLISHED);
		tcp_options(sk,th);
		sk->dummy_th.dest=th->source;
		sk->copied_seq = sk->acked_seq;
		if(!sk->dead)
			sk->state_change(sk);
		if(sk->max_window==0)
		{
			sk->max_window=32;	/* Sanity check */
			sk->mss=min(sk->max_window,sk->mtu);
		}
	}

	/*
	 * I make no guarantees about the first clause in the following
	 * test, i.e. "(!flag) || (flag&4)".  I'm not entirely sure under
	 * what conditions "!flag" would be true.  However I think the rest
	 * of the conditions would prevent that from causing any
	 * unnecessary retransmission.
	 *   Clearly if the first packet has expired it should be
	 * retransmitted.  The other alternative, "flag&2 && retransmits", is
	 * harder to explain:  You have to look carefully at how and when the
	 * timer is set and with what timeout.  The most recent transmission always
	 * sets the timer.  So in general if the most recent thing has timed
	 * out, everything before it has as well.  So we want to go ahead and
	 * retransmit some more.  If we didn't explicitly test for this
	 * condition with "flag&2 && retransmits", chances are "when + rto < jiffies"
	 * would not be true.  If you look at the pattern of timing, you can
	 * show that rto is increased fast enough that the next packet would
	 * almost never be retransmitted immediately.  Then you'd end up
	 * waiting for a timeout to send each packet on the retransmission
	 * queue.  With my implementation of the Karn sampling algorithm,
	 * the timeout would double each time.  The net result is that it would
	 * take a hideous amount of time to recover from a single dropped packet.
	 * It's possible that there should also be a test for TIME_WRITE, but
	 * I think as long as "send_head != NULL" and "retransmit" is on, we've
	 * got to be in real retransmission mode.
	 *   Note that tcp_do_retransmit is called with all==1.  Setting cong_window
	 * back to 1 at the timeout will cause us to send 1, then 2, etc. packets.
	 * As long as no further losses occur, this seems reasonable.
	 */

	if (((!flag) || (flag&4)) && sk->send_head != NULL &&
	       (((flag&2) && sk->retransmits) ||
	       (sk->send_head->when + sk->rto < jiffies)))
	{
		if(sk->send_head->when + sk->rto < jiffies)
			tcp_retransmit(sk,0);
		else
		{
			tcp_do_retransmit(sk, 1);
			reset_xmit_timer(sk, TIME_WRITE, sk->rto);
		}
	}

	return(1);
}


/*
 * 	Process the FIN bit. This now behaves as it is supposed to work
 *	and the FIN takes effect when it is validly part of sequence
 *	space. Not before when we get holes.
 *
 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
 *	TIME-WAIT)
 *
 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
 *	close and we go into CLOSING (and later onto TIME-WAIT)
 *
 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 *
 *	skb - buffer holding the segment; its len is used for fin_seq
 *	sk  - socket the segment belongs to
 *	th  - TCP header of that segment (seq, syn, fin and rst are read)
 *
 *	Every return in this function returns 0.
 */

static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
	/* Sequence number just past this segment, counting SYN and FIN. */
	sk->fin_seq = th->seq + skb->len + th->syn + th->fin;

	if (!sk->dead)
	{
		sk->state_change(sk);
		sock_wake_async(sk->socket, 1);
	}

	switch(sk->state)
	{
		case TCP_SYN_RECV:
		case TCP_SYN_SENT:
		case TCP_ESTABLISHED:
			/*
			 * move to CLOSE_WAIT, tcp_data() already handled
			 * sending the ack.
			 */
			tcp_set_state(sk,TCP_CLOSE_WAIT);
			if (th->rst)
				sk->shutdown = SHUTDOWN_MASK;
			break;

		case TCP_CLOSE_WAIT:
		case TCP_CLOSING:
			/*
			 * received a retransmission of the FIN, do
			 * nothing.
			 */
			break;
		case TCP_TIME_WAIT:
			/*
			 * received a retransmission of the FIN,
			 * restart the TIME_WAIT timer.
*/ reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); return(0); case TCP_FIN_WAIT1: /* * This case occurs when a simultaneous close * happens, we must ack the received FIN and * enter the CLOSING state. * * This causes a WRITE timeout, which will either * move on to TIME_WAIT when we timeout, or resend * the FIN properly (maybe we get rid of that annoying * FIN lost hang). The TIME_WRITE code is already correct * for handling this timeout. */ if(sk->ip_xmit_timeout != TIME_WRITE) reset_xmit_timer(sk, TIME_WRITE, sk->rto); tcp_set_state(sk,TCP_CLOSING); break; case TCP_FIN_WAIT2: /* * received a FIN -- send ACK and enter TIME_WAIT */ reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); sk->shutdown|=SHUTDOWN_MASK; tcp_set_state(sk,TCP_TIME_WAIT); break; case TCP_CLOSE: /* * already in CLOSE */ break; default: tcp_set_state(sk,TCP_LAST_ACK); /* Start the timers. */ reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); return(0); } return(0); } /* * This routine handles the data. If there is room in the buffer, * it will be have already been moved into it. If there is no * room, then we will just have to discard the packet. */ extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned long saddr, unsigned short len) { struct sk_buff *skb1, *skb2; struct tcphdr *th; int dup_dumped=0; unsigned long new_seq; unsigned long shut_seq; th = skb->h.th; skb->len = len -(th->doff*4); /* * The bytes in the receive read/assembly queue has increased. Needed for the * low memory discard algorithm */ sk->bytes_rcv += skb->len; if (skb->len == 0 && !th->fin) { /* * Don't want to keep passing ack's back and forth. * (someone sent us dataless, boring frame) */ if (!th->ack) tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr); kfree_skb(skb, FREE_READ); return(0); } /* * We no longer have anyone receiving data on this connection. 
*/ #ifndef TCP_DONT_RST_SHUTDOWN if(sk->shutdown & RCV_SHUTDOWN) { /* * FIXME: BSD has some magic to avoid sending resets to * broken 4.2 BSD keepalives. Much to my surprise a few non * BSD stacks still have broken keepalives so we want to * cope with it. */ if(skb->len) /* We don't care if it's just an ack or a keepalive/window probe */ { new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */ /* Do this the way 4.4BSD treats it. Not what I'd regard as the meaning of the spec but it's what BSD does and clearly they know everything 8) */ /* * This is valid because of two things * * a) The way tcp_data behaves at the bottom. * b) A fin takes effect when read not when received. */ shut_seq=sk->acked_seq+1; /* Last byte */ if(after(new_seq,shut_seq)) { if(sk->debug) printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n", sk, new_seq, shut_seq, sk->blog); if(sk->dead) { sk->acked_seq = new_seq + th->fin; tcp_reset(sk->saddr, sk->daddr, skb->h.th, sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl); tcp_statistics.TcpEstabResets++; tcp_set_state(sk,TCP_CLOSE); sk->err = EPIPE; sk->shutdown = SHUTDOWN_MASK; kfree_skb(skb, FREE_READ); return 0; } } } } #endif /* * Now we have to walk the chain, and figure out where this one * goes into it. This is set up so that the last packet we received * will be the first one we look at, that way if everything comes * in order, there will be no performance loss, and if they come * out of order we will be able to fit things in nicely. * * [AC: This is wrong. We should assume in order first and then walk * forwards from the first hole based upon real traffic patterns.] 
* */ if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */ { skb_queue_head(&sk->receive_queue,skb); skb1= NULL; } else { for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) { if(sk->debug) { printk("skb1=%p :", skb1); printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq); printk("skb->h.th->seq = %ld\n",skb->h.th->seq); printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq, sk->acked_seq); } /* * Optimisation: Duplicate frame or extension of previous frame from * same sequence point (lost ack case). * The frame contains duplicate data or replaces a previous frame * discard the previous frame (safe as sk->inuse is set) and put * the new one in its place. */ if (th->seq==skb1->h.th->seq && skb->len>= skb1->len) { skb_append(skb1,skb); skb_unlink(skb1); kfree_skb(skb1,FREE_READ); dup_dumped=1; skb1=NULL; break; } /* * Found where it fits */ if (after(th->seq+1, skb1->h.th->seq)) { skb_append(skb1,skb); break; } /* * See if we've hit the start. If so insert. */ if (skb1 == skb_peek(&sk->receive_queue)) { skb_queue_head(&sk->receive_queue, skb); break; } } } /* * Figure out what the ack value for this frame is */ th->ack_seq = th->seq + skb->len; if (th->syn) th->ack_seq++; if (th->fin) th->ack_seq++; if (before(sk->acked_seq, sk->copied_seq)) { printk("*** tcp.c:tcp_data bug acked < copied\n"); sk->acked_seq = sk->copied_seq; } /* * Now figure out if we can ack anything. This is very messy because we really want two * receive queues, a completed and an assembly queue. We also want only one transmit * queue. */ if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) { if (before(th->seq, sk->acked_seq+1)) { int newwindow; if (after(th->ack_seq, sk->acked_seq)) { newwindow = sk->window-(th->ack_seq - sk->acked_seq); if (newwindow < 0) newwindow = 0; sk->window = newwindow; sk->acked_seq = th->ack_seq; } skb->acked = 1; /* * When we ack the fin, we do the FIN * processing. 
*/ if (skb->h.th->fin) { tcp_fin(skb,sk,skb->h.th); } for(skb2 = skb->next; skb2 != (struct sk_buff *)&sk->receive_queue; skb2 = skb2->next) { if (before(skb2->h.th->seq, sk->acked_seq+1)) { if (after(skb2->h.th->ack_seq, sk->acked_seq)) { newwindow = sk->window - (skb2->h.th->ack_seq - sk->acked_seq); if (newwindow < 0) newwindow = 0; sk->window = newwindow; sk->acked_seq = skb2->h.th->ack_seq; } skb2->acked = 1; /* * When we ack the fin, we do * the fin handling. */ if (skb2->h.th->fin) { tcp_fin(skb,sk,skb->h.th); } /* * Force an immediate ack. */ sk->ack_backlog = sk->max_ack_backlog; } else { break; } } /* * This also takes care of updating the window. * This if statement needs to be simplified. */ if (!sk->delay_acks || sk->ack_backlog >= sk->max_ack_backlog || sk->bytes_rcv > sk->max_unacked || th->fin) { /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */ } else { sk->ack_backlog++; if(sk->debug) printk("Ack queued.\n"); reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME); } } } /* * If we've missed a packet, send an ack. * Also start a timer to send another. */ if (!skb->acked) { /* * This is important. If we don't have much room left, * we need to throw out a few packets so we have a good * window. Note that mtu is used, not mss, because mss is really * for the send side. He could be sending us stuff as large as mtu. */ while (sk->prot->rspace(sk) < sk->mtu) { skb1 = skb_peek(&sk->receive_queue); if (skb1 == NULL) { printk("INET: tcp.c:tcp_data memory leak detected.\n"); break; } /* * Don't throw out something that has been acked. */ if (skb1->acked) { break; } skb_unlink(skb1); kfree_skb(skb1, FREE_READ); } tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr); sk->ack_backlog++; reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME); } else { tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr); } /* * Now tell the user we may have some data. 
*/ if (!sk->dead) { if(sk->debug) printk("Data wakeup.\n"); sk->data_ready(sk,0); } return(0); } /* * This routine is only called when we have urgent data * signalled. Its the 'slow' part of tcp_urg. It could be * moved inline now as tcp_urg is only called from one * place. We handle URGent data wrong. We have to - as * BSD still doesn't use the correction from RFC961. */ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) { unsigned long ptr = ntohs(th->urg_ptr); if (ptr) ptr--; ptr += th->seq; /* ignore urgent data that we've already seen and read */ if (after(sk->copied_seq, ptr)) return; /* do we already have a newer (or duplicate) urgent pointer? */ if (sk->urg_data && !after(ptr, sk->urg_seq)) return; /* tell the world about our new urgent pointer */ if (sk->proc != 0) { if (sk->proc > 0) { kill_proc(sk->proc, SIGURG, 1); } else { kill_pg(-sk->proc, SIGURG, 1); } } sk->urg_data = URG_NOTYET; sk->urg_seq = ptr; } /* * This is the 'fast' part of urgent handling. */ extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long saddr, unsigned long len) { unsigned long ptr; /* * Check if we get a new urgent pointer - normally not */ if (th->urg) tcp_check_urg(sk,th); /* * Do we wait for any urgent data? - normally not */ if (sk->urg_data != URG_NOTYET) return 0; /* * Is the urgent pointer pointing into this packet? */ ptr = sk->urg_seq - th->seq + th->doff*4; if (ptr >= len) return 0; /* * Ok, got the correct packet, update info */ sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th); if (!sk->dead) sk->data_ready(sk,0); return 0; } /* * This will accept the next outstanding connection. */ static struct sock *tcp_accept(struct sock *sk, int flags) { struct sock *newsk; struct sk_buff *skb; /* * We need to make sure that this socket is listening, * and that it has something pending. */ if (sk->state != TCP_LISTEN) { sk->err = EINVAL; return(NULL); } /* Avoid the race. 
 */
	cli();
	sk->inuse = 1;

	/* Loop until tcp_dequeue_established() hands back a buffer. */
	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		if (flags & O_NONBLOCK)
		{
			sti();
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}

		/* Drop the socket while sleeping; it is marked in use again below. */
		release_sock(sk);
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked)
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;
	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk.
	 */

	newsk = skb->sk;

	kfree_skb(skb, FREE_READ);
	sk->ack_backlog--;
	release_sock(sk);
	return(newsk);
}


/*
 *	This will initiate an outgoing connection.
 *
 *	sk       - socket to connect; must be in TCP_CLOSE
 *	usin     - destination address and port
 *	addr_len - length of *usin as supplied by the caller
 *
 *	Builds a SYN carrying one 4 byte option (kind 2, holding sk->mtu),
 *	moves the socket to TCP_SYN_SENT, arms the TIME_WRITE timer and
 *	queues the segment for transmission.
 *
 *	Returns 0 once the SYN has been queued, otherwise:
 *	  -EISCONN       socket is not in TCP_CLOSE
 *	  -EINVAL        addr_len is below 8
 *	  -EAFNOSUPPORT  sin_family is non-zero and not AF_INET
 *	  -ENETUNREACH   destination is broadcast/multicast, or
 *	                 build_header() failed
 *	  -ENOMEM        no buffer could be allocated
 */

static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{
	struct sk_buff *buff;
	struct device *dev=NULL;
	unsigned char *ptr;
	int tmp;
	int atype;
	struct tcphdr *t1;
	struct rtable *rt;

	if (sk->state != TCP_CLOSE)
	{
		return(-EISCONN);
	}

	if (addr_len < 8)
		return(-EINVAL);

	if (usin->sin_family && usin->sin_family != AF_INET)
		return(-EAFNOSUPPORT);

	/*
	 *	connect() to INADDR_ANY means loopback (BSD'ism).
	 */

	if(usin->sin_addr.s_addr==INADDR_ANY)
		usin->sin_addr.s_addr=ip_my_addr();

	/*
	 *	Don't want a TCP connection going to a broadcast address
	 */

	if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
		return -ENETUNREACH;

	sk->inuse = 1;
	sk->daddr = usin->sin_addr.s_addr;
	sk->write_seq = tcp_init_seq();
	sk->window_seq = sk->write_seq;
	sk->rcv_ack_seq = sk->write_seq -1;
	sk->err = 0;
	sk->dummy_th.dest = usin->sin_port;
	/* The socket is released around the allocation and marked in use again after it. */
	release_sock(sk);

	buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
	if (buff == NULL)
	{
		return(-ENOMEM);
	}
	sk->inuse = 1;
	buff->len = 24;
	buff->sk = sk;
	buff->free = 0;
	buff->localroute = sk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff.
	 */

	rt=ip_rt_route(sk->daddr, NULL, NULL);


	/*
	 *	We need to build the routing stuff from the things saved in skb.
	 */

	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
					IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		release_sock(sk);
		return(-ENETUNREACH);
	}

	/* tmp is the number of header bytes build_header() put in front of the TCP header. */
	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
	t1->seq = ntohl(sk->write_seq++);
	sk->sent_seq = sk->write_seq;
	buff->h.seq = sk->write_seq;
	t1->ack = 0;
	/*
	 * NOTE(review): window is stored without a byte-order conversion,
	 * unlike the other multi-byte header fields written here - confirm
	 * the intended on-the-wire value.
	 */
	t1->window = 2;
	t1->res1=0;
	t1->res2=0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;
	t1->urg_ptr = 0;
	t1->doff = 6;
	/* use 512 or whatever user asked for */

	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		sk->window_clamp=rt->rt_window;
	else
		sk->window_clamp=0;

	/* Choice of sk->mtu: explicit user value, then the route's value, then a default. */
	if (sk->user_mss)
		sk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MTU))
		sk->mtu = rt->rt_mss;
	else
	{
#ifdef CONFIG_INET_SNARL
		if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#else
		if ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endif
			sk->mtu = 576 - HEADER_SIZE;
		else
			sk->mtu = MAX_WINDOW;
	}
	/*
	 *	but not bigger than device MTU
	 */

	if(sk->mtu <32)
		sk->mtu = 32;	/* Sanity limit */

	sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	Put in the TCP options to say MTU.
	 */

	ptr = (unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = (sk->mtu) >> 8;
	ptr[3] = (sk->mtu) & 0xff;
	tcp_send_check(t1, sk->saddr, sk->daddr,
		  sizeof(struct tcphdr) + 4, sk);

	/*
	 *	This must go first otherwise a really quick response will get reset.
	 */

	tcp_set_state(sk,TCP_SYN_SENT);
	sk->rto = TCP_TIMEOUT_INIT;
#if 0 /* we already did this */
	init_timer(&sk->retransmit_timer);
#endif
	sk->retransmit_timer.function=&retransmit_timer;
	sk->retransmit_timer.data = (unsigned long)sk;
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);	/* Timer for repeating the SYN until an answer */
	sk->retransmits = TCP_SYN_RETRIES;

	sk->prot->queue_xmit(sk, dev, buff, 0);
	reset_xmit_timer(sk, TIME_WRITE, sk->rto);
	tcp_statistics.TcpActiveOpens++;
	tcp_statistics.TcpOutSegs++;

	release_sock(sk);
	return(0);
}


/*
 *	This functions checks to see if the tcp header is actually acceptable.
 *
 *	sk    - socket the segment was received for
 *	th    - TCP header of the segment
 *	len   - length of TCP header plus payload
 *	opt   - not referenced in the body
 *	saddr - passed to tcp_send_ack() on the resync path
 *	dev   - passed to tcp_reset()
 *
 *	Returns 1 when at least part of the segment falls inside the
 *	receive window, and also after a reset has been sent for an
 *	unacceptable segment in SYN_SENT or SYN_RECV.  Returns 0 for an
 *	unacceptable segment that carries RST, or after an ack has been
 *	sent to resynchronise.
 */
extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len,
	     struct options *opt, unsigned long saddr, struct device *dev)
{
	unsigned long next_seq;

	/* Payload length, plus one if FIN is set. */
	next_seq = len - 4*th->doff;
	if (th->fin)
		next_seq++;
	/* if we have a zero window, we can't have any data in the packet.. */
	if (next_seq && !sk->window)
		goto ignore_it;
	next_seq += th->seq;

	/*
	 * This isn't quite right.  sk->acked_seq could be more recent
	 * than sk->window.  This is however close enough.  We will accept
	 * slightly more packets than we should, but it should not cause
	 * problems unless someone is trying to forge packets.
	 */

	/* have we already seen all of this packet? */
	if (!after(next_seq+1, sk->acked_seq))
		goto ignore_it;
	/* or does it start beyond the window? */
	if (!before(th->seq, sk->acked_seq + sk->window + 1))
		goto ignore_it;

	/* ok, at least part of this packet would seem interesting.. */
	return 1;

ignore_it:
	if (th->rst)
		return 0;

	/*
	 *	Send a reset if we get something not ours and we are
	 *	unsynchronized. Note: We don't do anything to our end. We
	 *	are just killing the bogus remote connection then we will
	 *	connect again and it will work (with luck).
	 */

	if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV)
	{
		tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
		return 1;
	}

	/* Try to resync things.
*/
	tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr);
	return 0;
}

/*
 *	When we get a reset we do this.
 *
 *	Marks the socket as zapped and records the error the user will see:
 *	ECONNRESET by default, ECONNREFUSED if the socket was in SYN_SENT,
 *	EPIPE if it was in CLOSE_WAIT. The socket is then moved to TCP_CLOSE
 *	with both directions shut down (unless TCP_DO_RFC1337 is defined and
 *	the socket is in TIME_WAIT), and state_change() is called when the
 *	socket is not dead.
 *
 *	The skb is freed with kfree_skb() and release_sock() is called on sk,
 *	so callers must not touch either afterwards. Always returns 0.
 */

static int tcp_std_reset(struct sock *sk, struct sk_buff *skb)
{
	sk->zapped = 1;
	sk->err = ECONNRESET;
	if (sk->state == TCP_SYN_SENT)
		sk->err = ECONNREFUSED;
	if (sk->state == TCP_CLOSE_WAIT)
		sk->err = EPIPE;
#ifdef TCP_DO_RFC1337
	/*
	 *	Time wait assassination protection [RFC1337]
	 */
	if(sk->state!=TCP_TIME_WAIT)
	{
		tcp_set_state(sk,TCP_CLOSE);
		sk->shutdown = SHUTDOWN_MASK;
	}
#else
	tcp_set_state(sk,TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
#endif
	if (!sk->dead)
		sk->state_change(sk);
	kfree_skb(skb, FREE_READ);
	release_sock(sk);
	return(0);
}

/*
 *	A TCP packet has arrived.
 *
 *	skb	 - the received buffer; skb->h.th is used as the TCP header.
 *	dev	 - receiving device; only checked for NULL here and handed on
 *		   to tcp_reset(), tcp_conn_request() and tcp_sequence().
 *	opt	 - IP options, handed on unchanged.
 *	daddr	 - destination address of the segment (a SYN to a listener is
 *		   only accepted when ip_chk_addr(daddr) is IS_MYADDR).
 *	len	 - segment length as passed to tcp_check(); stored in skb->len.
 *	saddr	 - source address of the segment.
 *	redo	 - when non-zero the checksum test, the byte swap of th->seq,
 *		   the skb field set-up and the inuse/backlog test are all
 *		   skipped (presumably because the frame is being replayed
 *		   from the backlog with the socket already marked in use -
 *		   confirm against the caller).
 *	protocol - not used by this function.
 *
 *	Every path returns 0 (tcp_std_reset() also returns 0).
 */

int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	unsigned long daddr, unsigned short len,
	unsigned long saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;
	int syn_ok=0;

	if (!skb)
	{
		printk("IMPOSSIBLE 1\n");
		return(0);
	}

	/*
	 *	NOTE(review): this exit does not free skb, unlike the
	 *	pkt_type test just below. Left as is because the condition
	 *	is not expected to occur.
	 */
	if (!dev)
	{
		printk("IMPOSSIBLE 2\n");
		return(0);
	}

	tcp_statistics.TcpInSegs++;

	/*
	 *	Only frames addressed to this host are processed.
	 */
	if(skb->pkt_type!=PACKET_HOST)
	{
		kfree_skb(skb,FREE_READ);
		return(0);
	}

	th = skb->h.th;

	/*
	 *	Find the socket.
	 */

	sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);

	/*
	 *	If this socket has got a reset it's to all intents and purposes
	 *	really dead. Count closed sockets as dead.
	 *
	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
	 *	exist so should cause resets as if the port was unreachable.
	 */

	if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))
		sk=NULL;

	if (!redo)
	{
		if (tcp_check(th, len, saddr, daddr ))
		{
			skb->sk = NULL;
			kfree_skb(skb,FREE_READ);
			/*
			 *	We don't release the socket because it was
			 *	never marked in use.
			 */
			return(0);
		}

		/*
		 *	From here on th->seq is kept in host byte order.
		 */
		th->seq = ntohl(th->seq);

		/* See if we know about the socket. */
		if (sk == NULL)
		{
			/*
			 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)
			 */
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			/*
			 *	Discard frame
			 */
			kfree_skb(skb, FREE_READ);
			return(0);
		}

		skb->len = len;
		skb->acked = 0;
		skb->used = 0;
		skb->free = 0;
		skb->saddr = daddr;
		skb->daddr = saddr;

		/*
		 *	We may need to add it to the backlog here. The test of
		 *	sk->inuse and the claim of the socket are done between
		 *	cli() and sti().
		 */
		cli();
		if (sk->inuse)
		{
			skb_queue_tail(&sk->back_log, skb);
			sti();
			return(0);
		}
		sk->inuse = 1;
		sti();
	}
	else
	{
		if (sk==NULL)
		{
			tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
			skb->sk = NULL;
			kfree_skb(skb, FREE_READ);
			return(0);
		}
	}

	/*
	 *	NOTE(review): this exit neither frees skb nor calls
	 *	release_sock(), although sk->inuse has been set above on the
	 *	!redo path. Left as is because the condition is not expected
	 *	to occur.
	 */
	if (!sk->prot)
	{
		printk("IMPOSSIBLE 3\n");
		return(0);
	}

	/*
	 *	Charge the memory to the socket. The frame is dropped when it
	 *	would take the socket up to or past its receive buffer limit.
	 */

	if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return(0);
	}

	skb->sk=sk;
	sk->rmem_alloc += skb->mem_len;

	/*
	 *	This basically follows the flow suggested by RFC793, with the
	 *	corrections in RFC1122. We don't implement precedence and we
	 *	process URG incorrectly (deliberately so) for BSD bug
	 *	compatibility. We also set up variables more thoroughly [Karn
	 *	notes in the KA9Q code the RFC793 incoming segment rules don't
	 *	initialise the variables for all paths].
	 */

	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
	{

		/*
		 *	Now deal with unusual cases.
		 */

		if(sk->state==TCP_LISTEN)
		{
			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
				tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);

			/*
			 *	We don't care for RST, and non SYN are absorbed (old segments)
			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
			 *	netmask on a running connection it can go broadcast. Even Suns have
			 *	this problem so I'm ignoring it
			 */

			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
			{
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}

			/*
			 *	Guess we need to make a new socket up
			 */

			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());

			/*
			 *	Now we have several options: In theory there is nothing else
			 *	in the frame. KA9Q has an option to send data with the syn,
			 *	BSD accepts data with the syn up to the [to be] advertised window
			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
			 *	it, that fits the spec precisely and avoids incompatibilities. It
			 *	would be nice in future to drop through and process the data.
			 */

			release_sock(sk);
			return 0;
		}

		/* retransmitted SYN? */
		if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)
		{
			kfree_skb(skb, FREE_READ);
			release_sock(sk);
			return 0;
		}

		/*
		 *	SYN sent means we have to look for a suitable ack and either reset
		 *	for bad matches or go to connected
		 */

		if(sk->state==TCP_SYN_SENT)
		{
			/* Crossed SYN or previous junk segment */
			if(th->ack)
			{
				/* We got an ack, but it's not a good ack */
				if(!tcp_ack(sk,th,saddr,len))
				{
					/* Reset the ack - it's an ack from a
					   different connection  [ th->rst is checked in tcp_reset()] */
					tcp_statistics.TcpAttemptFails++;
					tcp_reset(daddr, saddr, th,
						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return(0);
				}
				if(th->rst)
					return tcp_std_reset(sk,skb);
				if(!th->syn)
				{
					/* A valid ack from a different connection
					   start. Shouldn't happen but cover it */
					kfree_skb(skb, FREE_READ);
					release_sock(sk);
					return 0;
				}
				/*
				 *	Ok.. it's good. Set up sequence numbers and
				 *	move to established.
				 */
				syn_ok=1;	/* Don't reset this connection for the syn */
				sk->acked_seq=th->seq+1;
				sk->fin_seq=th->seq;
				tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);
				tcp_set_state(sk, TCP_ESTABLISHED);
				tcp_options(sk,th);
				sk->dummy_th.dest=th->source;
				sk->copied_seq = sk->acked_seq;
				if(!sk->dead)
				{
					sk->state_change(sk);
					sock_wake_async(sk->socket, 0);
				}
				if(sk->max_window==0)
				{
					sk->max_window = 32;
					sk->mss = min(sk->max_window, sk->mtu);
				}
			}
			else
			{
				/* See if SYN's cross. Drop if boring */
				if(th->syn && !th->rst)
				{
					/* Crossed SYN's are fine - but talking to
					   yourself is right out... */
					if(sk->saddr==saddr && sk->daddr==daddr &&
						sk->dummy_th.source==th->source &&
						sk->dummy_th.dest==th->dest)
					{
						tcp_statistics.TcpAttemptFails++;
						return tcp_std_reset(sk,skb);
					}
					tcp_set_state(sk,TCP_SYN_RECV);

					/*
					 *	FIXME:
					 *	Must send SYN|ACK here
					 */
				}
				/* Discard junk segment */
				kfree_skb(skb, FREE_READ);
				release_sock(sk);
				return 0;
			}
			/*
			 *	SYN_RECV with data maybe.. drop through
			 *
			 *	NOTE(review): this jump lands after the
			 *	"th->syn && !syn_ok" test below, so that test
			 *	is never reached with syn_ok set.
			 */
			goto rfc_step6;
		}

	/*
	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
	 *	a more complex suggestion for fixing these reuse issues in RFC1644
	 *	but not yet ready for general use. Also see RFC1379.
	 */

#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAIT
		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead &&
			after(th->seq, sk->acked_seq) && !th->rst)
		{
			long seq=sk->write_seq;
			if(sk->debug)
				printk("Doing a BSD time wait\n");
			tcp_statistics.TcpEstabResets++;

			/*
			 *	Uncharge the frame from the old socket and
			 *	close it, then look the address pair up again
			 *	in case a listener can take the SYN.
			 */
			sk->rmem_alloc -= skb->mem_len;
			skb->sk = NULL;
			sk->err=ECONNRESET;
			tcp_set_state(sk, TCP_CLOSE);
			sk->shutdown = SHUTDOWN_MASK;
			release_sock(sk);
			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);
			if (sk && sk->state==TCP_LISTEN)
			{
				sk->inuse=1;
				skb->sk = sk;
				sk->rmem_alloc += skb->mem_len;
				/* Initial sequence is the old write_seq plus 128000 */
				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
				release_sock(sk);
				return 0;
			}
			kfree_skb(skb, FREE_READ);
			return 0;
		}
#endif
	}

	/*
	 *	We are now in normal data flow (see the step list in the RFC)
	 *	Note most of these are inline now. I'll inline the lot when
	 *	I have time to test it hard and look at what gcc outputs
	 */

	if(!tcp_sequence(sk,th,len,opt,saddr,dev))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	if(th->rst)
		return tcp_std_reset(sk,skb);

	/*
	 *	!syn_ok is effectively the state test in RFC793.
	 */

	if(th->syn && !syn_ok)
	{
		tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
		return tcp_std_reset(sk,skb);
	}

	/*
	 *	Process the ACK
	 */

	if(th->ack && !tcp_ack(sk,th,saddr,len))
	{
		/*
		 *	Our three way handshake failed.
		 */

		if(sk->state==TCP_SYN_RECV)
		{
			tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
		}
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

rfc_step6:		/* I'll clean this up later */

	/*
	 *	Process urgent data
	 */

	if(tcp_urg(sk, th, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	Process the encapsulated data. A non-zero return from
	 *	tcp_data() means the frame is to be discarded here; on a zero
	 *	return the skb is not freed by this function.
	 */

	if(tcp_data(skb,sk, saddr, len))
	{
		kfree_skb(skb, FREE_READ);
		release_sock(sk);
		return 0;
	}

	/*
	 *	And done
	 */

	release_sock(sk);
	return 0;
}

/*
 *	This routine sends a packet with an out of date sequence
 *	number. It assumes the other end will try to ack it.
 *
 *	Nothing is sent when the socket has been zapped, when it is not in
 *	one of the states listed below, when no buffer can be allocated, or
 *	when build_header() fails (the buffer is then given back with wfree()).
 */

static void tcp_write_wakeup(struct sock *sk)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	struct device *dev=NULL;
	int tmp;

	if (sk->zapped)
		return;	/* After a valid reset we can send no more */

	/*
	 *	Write data can still be transmitted/retransmitted in the
	 *	following states.  If any other state is encountered, return.
	 *	[listen/close will never occur here anyway]
	 */

	if (sk->state != TCP_ESTABLISHED &&
	    sk->state != TCP_CLOSE_WAIT &&
	    sk->state != TCP_FIN_WAIT1 &&
	    sk->state != TCP_LAST_ACK &&
	    sk->state != TCP_CLOSING
	)
	{
		return;
	}

	buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC);
	if (buff == NULL)
		return;

	buff->len = sizeof(struct tcphdr);
	buff->free = 1;
	buff->sk = sk;
	buff->localroute = sk->localroute;

	t1 = (struct tcphdr *) buff->data;

	/* Put in the IP header and routing stuff. */
	tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
				IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl);
	if (tmp < 0)
	{
		sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);
		return;
	}

	/*
	 *	build_header() returned the number of bytes it used; the TCP
	 *	header goes immediately after them.
	 */
	buff->len += tmp;
	t1 = (struct tcphdr *)((char *)t1 +tmp);

	memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));

	/*
	 *	Use a previous sequence.
	 *	This should cause the other end to send an ack.
	 */

	t1->seq = htonl(sk->sent_seq-1);
	t1->ack = 1;
	t1->res1= 0;
	t1->res2= 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->fin = 0;	/* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
	t1->syn = 0;
	/*
	 *	These are host to network conversions, so use htonl/htons
	 *	like the seq field above.
	 */
	t1->ack_seq = htonl(sk->acked_seq);
	t1->window = htons(tcp_select_window(sk));
	t1->doff = sizeof(*t1)/4;
	tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk);

	/*
	 *	Send it and free it.
	 *	This will prevent the timer from automatically being restarted.
	 */
	sk->prot->queue_xmit(sk, dev, buff, 1);
	tcp_statistics.TcpOutSegs++;
}

/*
 *	A window probe timeout has occurred.
 *
 *	Sends a probe with tcp_write_wakeup(), doubles the retransmit timeout
 *	(capped at 120*HZ), re-arms the TIME_PROBE0 timer with the new value
 *	and bumps the backoff and retransmit counters. Does nothing on a
 *	zapped socket.
 */

void tcp_send_probe0(struct sock *sk)
{
	if (sk->zapped)
		return;		/* After a valid reset we can send no more */

	tcp_write_wakeup(sk);

	sk->backoff++;
	sk->rto = min(sk->rto << 1, 120*HZ);
	reset_xmit_timer (sk, TIME_PROBE0, sk->rto);
	sk->retransmits++;
	sk->prot->retransmits ++;
}

/*
 *	Socket option code for TCP.
 *
 *	Levels other than SOL_TCP are passed to ip_setsockopt(). For SOL_TCP
 *	one int is read from optval (optlen is not examined here) and applied:
 *
 *	TCP_MAXSEG  - stored in sk->user_mss; must be in 1..MAX_WINDOW.
 *	TCP_NODELAY - sk->nonagle is set to 1 for any non-zero value, else 0.
 *
 *	Returns 0 on success, -EINVAL for a NULL optval or an out of range
 *	TCP_MAXSEG, the verify_area() error if optval is not readable, and
 *	-ENOPROTOOPT for any other option name.
 */

int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
{
	int val,err;

	if(level!=SOL_TCP)
		return ip_setsockopt(sk,level,optname,optval,optlen);

	if (optval == NULL)
		return(-EINVAL);

	err=verify_area(VERIFY_READ, optval, sizeof(int));
	if(err)
		return err;

	val = get_fs_long((unsigned long *)optval);

	switch(optname)
	{
		case TCP_MAXSEG:
			/*
			 *	values greater than interface MTU won't take effect.  however at
			 *	the point when this call is done we typically don't yet know
			 *	which interface is going to be used
			 */
			if(val<1||val>MAX_WINDOW)
				return -EINVAL;
			sk->user_mss=val;
			return 0;
		case TCP_NODELAY:
			sk->nonagle=(val==0)?0:1;
			return 0;
		default:
			return(-ENOPROTOOPT);
	}
}

/*
 *	Read back a TCP socket option.
 *
 *	Levels other than SOL_TCP are passed to ip_getsockopt(). For SOL_TCP,
 *	TCP_MAXSEG reports sk->user_mss and TCP_NODELAY reports sk->nonagle;
 *	any other option name gives -ENOPROTOOPT before user memory is touched.
 *
 *	On success sizeof(int) is written to *optlen, the value is written to
 *	optval, and 0 is returned. A verify_area() failure on either pointer
 *	is returned as is; optlen is checked and written before optval is
 *	checked.
 */

int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
{
	int val,err;

	if(level!=SOL_TCP)
		return ip_getsockopt(sk,level,optname,optval,optlen);

	switch(optname)
	{
		case TCP_MAXSEG:
			val=sk->user_mss;
			break;
		case TCP_NODELAY:
			val=sk->nonagle;
			break;
		default:
			return(-ENOPROTOOPT);
	}
	err=verify_area(VERIFY_WRITE, optlen, sizeof(int));
	if(err)
		return err;
	put_fs_long(sizeof(int),(unsigned long *) optlen);

	err=verify_area(VERIFY_WRITE, optval, sizeof(int));
	if(err)
		return err;
	put_fs_long(val,(unsigned long *)optval);

	return(0);
}

/*
 *	The TCP protocol operations table. The initialiser is positional, so
 *	the order of the entries must match the declaration of struct proto
 *	(declared elsewhere; the meaning of the numeric and trailing entries
 *	should be checked against that declaration).
 */

struct proto tcp_prot = {
	sock_wmalloc,
	sock_rmalloc,
	sock_wfree,
	sock_rfree,
	sock_rspace,
	sock_wspace,
	tcp_close,
	tcp_read,
	tcp_write,
	tcp_sendto,
	tcp_recvfrom,
	ip_build_header,
	tcp_connect,
	tcp_accept,
	ip_queue_xmit,
	tcp_retransmit,
	tcp_write_wakeup,
	tcp_read_wakeup,
	tcp_rcv,
	tcp_select,
#ifdef _HURD_
	NULL,
#else
	tcp_ioctl,
#endif
	NULL,
	tcp_shutdown,
	tcp_setsockopt,
	tcp_getsockopt,
	128,
	0,
	{NULL,},
	"TCP",
	0, 0
};