Diffstat (limited to 'pfinet/linux-src/net/ipv4/tcp.c')
-rw-r--r-- | pfinet/linux-src/net/ipv4/tcp.c | 1826
1 file changed, 1826 insertions, 0 deletions
diff --git a/pfinet/linux-src/net/ipv4/tcp.c b/pfinet/linux-src/net/ipv4/tcp.c new file mode 100644 index 00000000..65763215 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/tcp.c @@ -0,0 +1,1826 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp.c,v 1.140.2.4 1999/08/09 03:13:12 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * Corey Minyard <wf-rch!minyard@relay.EU.net> + * Florian La Roche, <flla@stud.uni-sb.de> + * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> + * Linus Torvalds, <torvalds@cs.helsinki.fi> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Matthew Dillon, <dillon@apollo.west.oic.com> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Jorge Cwik, <jorge@laser.satlink.net> + * + * Fixes: + * Alan Cox : Numerous verify_area() calls + * Alan Cox : Set the ACK bit on a reset + * Alan Cox : Stopped it crashing if it closed while + * sk->inuse=1 and was trying to connect + * (tcp_err()). + * Alan Cox : All icmp error handling was broken + * pointers passed where wrong and the + * socket was looked up backwards. Nobody + * tested any icmp error code obviously. + * Alan Cox : tcp_err() now handled properly. It + * wakes people on errors. poll + * behaves and the icmp error race + * has gone by moving it into sock.c + * Alan Cox : tcp_send_reset() fixed to work for + * everything not just packets for + * unknown sockets. + * Alan Cox : tcp option processing. + * Alan Cox : Reset tweaked (still not 100%) [Had + * syn rule wrong] + * Herp Rosmanith : More reset fixes + * Alan Cox : No longer acks invalid rst frames. + * Acking any kind of RST is right out. + * Alan Cox : Sets an ignore me flag on an rst + * receive otherwise odd bits of prattle + * escape still + * Alan Cox : Fixed another acking RST frame bug. + * Should stop LAN workplace lockups. + * Alan Cox : Some tidyups using the new skb list + * facilities + * Alan Cox : sk->keepopen now seems to work + * Alan Cox : Pulls options out correctly on accepts + * Alan Cox : Fixed assorted sk->rqueue->next errors + * Alan Cox : PSH doesn't end a TCP read. Switched a + * bit to skb ops. + * Alan Cox : Tidied tcp_data to avoid a potential + * nasty. + * Alan Cox : Added some better commenting, as the + * tcp is hard to follow + * Alan Cox : Removed incorrect check for 20 * psh + * Michael O'Reilly : ack < copied bug fix. + * Johannes Stille : Misc tcp fixes (not all in yet). + * Alan Cox : FIN with no memory -> CRASH + * Alan Cox : Added socket option proto entries. + * Also added awareness of them to accept. + * Alan Cox : Added TCP options (SOL_TCP) + * Alan Cox : Switched wakeup calls to callbacks, + * so the kernel can layer network + * sockets. + * Alan Cox : Use ip_tos/ip_ttl settings. + * Alan Cox : Handle FIN (more) properly (we hope). + * Alan Cox : RST frames sent on unsynchronised + * state ack error. + * Alan Cox : Put in missing check for SYN bit. + * Alan Cox : Added tcp_select_window() aka NET2E + * window non shrink trick. 
+ * Alan Cox : Added a couple of small NET2E timer + * fixes + * Charles Hedrick : TCP fixes + * Toomas Tamm : TCP window fixes + * Alan Cox : Small URG fix to rlogin ^C ack fight + * Charles Hedrick : Rewrote most of it to actually work + * Linus : Rewrote tcp_read() and URG handling + * completely + * Gerhard Koerting: Fixed some missing timer handling + * Matthew Dillon : Reworked TCP machine states as per RFC + * Gerhard Koerting: PC/TCP workarounds + * Adam Caldwell : Assorted timer/timing errors + * Matthew Dillon : Fixed another RST bug + * Alan Cox : Move to kernel side addressing changes. + * Alan Cox : Beginning work on TCP fastpathing + * (not yet usable) + * Arnt Gulbrandsen: Turbocharged tcp_check() routine. + * Alan Cox : TCP fast path debugging + * Alan Cox : Window clamping + * Michael Riepe : Bug in tcp_check() + * Matt Dillon : More TCP improvements and RST bug fixes + * Matt Dillon : Yet more small nasties remove from the + * TCP code (Be very nice to this man if + * tcp finally works 100%) 8) + * Alan Cox : BSD accept semantics. + * Alan Cox : Reset on closedown bug. + * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). + * Michael Pall : Handle poll() after URG properly in + * all cases. + * Michael Pall : Undo the last fix in tcp_read_urg() + * (multi URG PUSH broke rlogin). + * Michael Pall : Fix the multi URG PUSH problem in + * tcp_readable(), poll() after URG + * works now. + * Michael Pall : recv(...,MSG_OOB) never blocks in the + * BSD api. + * Alan Cox : Changed the semantics of sk->socket to + * fix a race and a signal problem with + * accept() and async I/O. + * Alan Cox : Relaxed the rules on tcp_sendto(). + * Yury Shevchuk : Really fixed accept() blocking problem. + * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for + * clients/servers which listen in on + * fixed ports. + * Alan Cox : Cleaned the above up and shrank it to + * a sensible code size. + * Alan Cox : Self connect lockup fix. + * Alan Cox : No connect to multicast. + * Ross Biro : Close unaccepted children on master + * socket close. + * Alan Cox : Reset tracing code. + * Alan Cox : Spurious resets on shutdown. + * Alan Cox : Giant 15 minute/60 second timer error + * Alan Cox : Small whoops in polling before an + * accept. + * Alan Cox : Kept the state trace facility since + * it's handy for debugging. + * Alan Cox : More reset handler fixes. + * Alan Cox : Started rewriting the code based on + * the RFC's for other useful protocol + * references see: Comer, KA9Q NOS, and + * for a reference on the difference + * between specifications and how BSD + * works see the 4.4lite source. + * A.N.Kuznetsov : Don't time wait on completion of tidy + * close. + * Linus Torvalds : Fin/Shutdown & copied_seq changes. + * Linus Torvalds : Fixed BSD port reuse to work first syn + * Alan Cox : Reimplemented timers as per the RFC + * and using multiple timers for sanity. + * Alan Cox : Small bug fixes, and a lot of new + * comments. + * Alan Cox : Fixed dual reader crash by locking + * the buffers (much like datagram.c) + * Alan Cox : Fixed stuck sockets in probe. A probe + * now gets fed up of retrying without + * (even a no space) answer. + * Alan Cox : Extracted closing code better + * Alan Cox : Fixed the closing state machine to + * resemble the RFC. + * Alan Cox : More 'per spec' fixes. + * Jorge Cwik : Even faster checksumming. + * Alan Cox : tcp_data() doesn't ack illegal PSH + * only frames. At least one pc tcp stack + * generates them. + * Alan Cox : Cache last socket. 
+ * Alan Cox : Per route irtt. + * Matt Day : poll()->select() match BSD precisely on error + * Alan Cox : New buffers + * Marc Tamsky : Various sk->prot->retransmits and + * sk->retransmits misupdating fixed. + * Fixed tcp_write_timeout: stuck close, + * and TCP syn retries gets used now. + * Mark Yarvis : In tcp_read_wakeup(), don't send an + * ack if state is TCP_CLOSED. + * Alan Cox : Look up device on a retransmit - routes may + * change. Doesn't yet cope with MSS shrink right + * but its a start! + * Marc Tamsky : Closing in closing fixes. + * Mike Shaver : RFC1122 verifications. + * Alan Cox : rcv_saddr errors. + * Alan Cox : Block double connect(). + * Alan Cox : Small hooks for enSKIP. + * Alexey Kuznetsov: Path MTU discovery. + * Alan Cox : Support soft errors. + * Alan Cox : Fix MTU discovery pathological case + * when the remote claims no mtu! + * Marc Tamsky : TCP_CLOSE fix. + * Colin (G3TNE) : Send a reset on syn ack replies in + * window but wrong (fixes NT lpd problems) + * Pedro Roque : Better TCP window handling, delayed ack. + * Joerg Reuter : No modification of locked buffers in + * tcp_do_retransmit() + * Eric Schenk : Changed receiver side silly window + * avoidance algorithm to BSD style + * algorithm. This doubles throughput + * against machines running Solaris, + * and seems to result in general + * improvement. + * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD + * Willy Konynenberg : Transparent proxying support. + * Mike McLagan : Routing by source + * Keith Owens : Do proper merging with partial SKB's in + * tcp_do_sendmsg to avoid burstiness. + * Eric Schenk : Fix fast close down bug with + * shutdown() followed by close(). + * Andi Kleen : Make poll agree with SIGIO + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + * + * Description of States: + * + * TCP_SYN_SENT sent a connection request, waiting for ack + * + * TCP_SYN_RECV received a connection request, sent ack, + * waiting for final ack in three-way handshake. + * + * TCP_ESTABLISHED connection established + * + * TCP_FIN_WAIT1 our side has shutdown, waiting to complete + * transmission of remaining buffered data + * + * TCP_FIN_WAIT2 all buffered data sent, waiting for remote + * to shutdown + * + * TCP_CLOSING both sides have shutdown but we still have + * data we have to finish sending + * + * TCP_TIME_WAIT timeout to catch resent junk before entering + * closed, can only be entered from FIN_WAIT2 + * or CLOSING. Required because the other end + * may not have gotten our last ACK causing it + * to retransmit the data packet (which we ignore) + * + * TCP_CLOSE_WAIT remote side has shutdown and is waiting for + * us to finish writing our data and to shutdown + * (we have to close() to move on to LAST_ACK) + * + * TCP_LAST_ACK out side has shutdown after remote has + * shutdown. There may still be data in our + * buffer that we have to finish sending + * + * TCP_CLOSE socket is finished + */ + +/* + * RFC1122 status: + * NOTE: I'm not going to be doing comments in the code for this one except + * for violations and the like. tcp.c is just too big... If I say something + * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out + * with Alan. -- MS 950903 + * [Note: Most of the TCP code has been rewriten/redesigned since this + * RFC1122 check. 
It is probably not correct anymore. It should be redone + * before 2.2. -AK] + * + * Use of PSH (4.2.2.2) + * MAY aggregate data sent without the PSH flag. (does) + * MAY queue data received without the PSH flag. (does) + * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) + * MAY implement PSH on send calls. (doesn't, thus:) + * MUST NOT buffer data indefinitely (doesn't [1 second]) + * MUST set PSH on last segment (does) + * MAY pass received PSH to application layer (doesn't) + * SHOULD send maximum-sized segment whenever possible. (almost always does) + * + * Window Size (4.2.2.3, 4.2.2.16) + * MUST treat window size as an unsigned number (does) + * SHOULD treat window size as a 32-bit number (does not) + * MUST NOT shrink window once it is offered (does not normally) + * + * Urgent Pointer (4.2.2.4) + * **MUST point urgent pointer to last byte of urgent data (not right + * after). (doesn't, to be like BSD. That's configurable, but defaults + * to off) + * MUST inform application layer asynchronously of incoming urgent + * data. (does) + * MUST provide application with means of determining the amount of + * urgent data pending. (does) + * **MUST support urgent data sequence of arbitrary length. (doesn't, but + * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) + * [Follows BSD 1 byte of urgent data] + * + * TCP Options (4.2.2.5) + * MUST be able to receive TCP options in any segment. (does) + * MUST ignore unsupported options (does) + * + * Maximum Segment Size Option (4.2.2.6) + * MUST implement both sending and receiving MSS. (does, but currently + * only uses the smaller of both of them) + * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send + * it always). (does, even when MSS == 536, which is legal) + * MUST assume MSS == 536 if no MSS received at connection setup (does) + * MUST calculate "effective send MSS" correctly: + * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) + * (does - but allows operator override) + * + * TCP Checksum (4.2.2.7) + * MUST generate and check TCP checksum. (does) + * + * Initial Sequence Number Selection (4.2.2.8) + * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's + * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is + * necessary for 10Mbps networks - and harder than BSD to spoof! + * With syncookies we don't) + * + * Simultaneous Open Attempts (4.2.2.10) + * MUST support simultaneous open attempts (does) + * + * Recovery from Old Duplicate SYN (4.2.2.11) + * MUST keep track of active vs. passive open (does) + * + * RST segment (4.2.2.12) + * SHOULD allow an RST segment to contain data (does, but doesn't do + * anything with it, which is standard) + * + * Closing a Connection (4.2.2.13) + * MUST inform application of whether connection was closed by RST or + * normal close. (does) + * MAY allow "half-duplex" close (treat connection as closed for the + * local app, even before handshake is done). (does) + * MUST linger in TIME_WAIT for 2 * MSL (does) + * + * Retransmission Timeout (4.2.2.15) + * MUST implement Jacobson's slow start and congestion avoidance + * stuff. (does) + * + * Probing Zero Windows (4.2.2.17) + * MUST support probing of zero windows. (does) + * MAY keep offered window closed indefinitely. (does) + * MUST allow remote window to stay closed indefinitely. (does) + * + * Passive Open Calls (4.2.2.18) + * MUST NOT let new passive open affect other connections. (doesn't) + * MUST support passive opens (LISTENs) concurrently. 
(does) + * + * Time to Live (4.2.2.19) + * MUST make TCP TTL configurable. (does - IP_TTL option) + * + * Event Processing (4.2.2.20) + * SHOULD queue out-of-order segments. (does) + * MUST aggregate ACK segments whenever possible. (does but badly) + * + * Retransmission Timeout Calculation (4.2.3.1) + * MUST implement Karn's algorithm and Jacobson's algorithm for RTO + * calculation. (does, or at least explains them in the comments 8*b) + * SHOULD initialize RTO to 0 and RTT to 3. (does) + * + * When to Send an ACK Segment (4.2.3.2) + * SHOULD implement delayed ACK. (does) + * MUST keep ACK delay < 0.5 sec. (does) + * + * When to Send a Window Update (4.2.3.3) + * MUST implement receiver-side SWS. (does) + * + * When to Send Data (4.2.3.4) + * MUST implement sender-side SWS. (does) + * SHOULD implement Nagle algorithm. (does) + * + * TCP Connection Failures (4.2.3.5) + * MUST handle excessive retransmissions "properly" (see the RFC). (does) + * SHOULD inform application layer of soft errors. (does) + * + * TCP Keep-Alives (4.2.3.6) + * MAY provide keep-alives. (does) + * MUST make keep-alives configurable on a per-connection basis. (does) + * MUST default to no keep-alives. (does) + * MUST make keep-alive interval configurable. (does) + * MUST make default keep-alive interval > 2 hours. (does) + * MUST NOT interpret failure to ACK keep-alive packet as dead + * connection. (doesn't) + * SHOULD send keep-alive with no data. (does) + * + * TCP Multihoming (4.2.3.7) + * MUST get source address from IP layer before sending first + * SYN. (does) + * MUST use same local address for all segments of a connection. (does) + * + * IP Options (4.2.3.8) + * MUST ignore unsupported IP options. (does) + * MAY support Time Stamp and Record Route. (does) + * MUST allow application to specify a source route. (does) + * MUST allow received Source Route option to set route for all future + * segments on this connection. (does not (security issues)) + * + * ICMP messages (4.2.3.9) + * MUST act on ICMP errors. (does) + * MUST slow transmission upon receipt of a Source Quench. (doesn't anymore + * because that is deprecated now by the IETF, can be turned on) + * MUST NOT abort connection upon receipt of soft Destination + * Unreachables (0, 1, 5), Time Exceededs and Parameter + * Problems. (doesn't) + * SHOULD report soft Destination Unreachables etc. to the + * application. (does, except during SYN_RECV and may drop messages + * in some rare cases before accept() - ICMP is unreliable) + * SHOULD abort connection upon receipt of hard Destination Unreachable + * messages (2, 3, 4). (does, but see above) + * + * Remote Address Validation (4.2.3.10) + * MUST reject as an error OPEN for invalid remote IP address. (does) + * MUST ignore SYN with invalid source address. (does) + * MUST silently discard incoming SYN for broadcast/multicast + * address. (does) + * + * Asynchronous Reports (4.2.4.1) + * MUST provide mechanism for reporting soft errors to application + * layer. (does) + * + * Type of Service (4.2.4.2) + * MUST allow application layer to set Type of Service. (does IP_TOS) + * + * (Whew. -- MS 950903) + * (Updated by AK, but not complete yet.) 
+ **/ + +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/poll.h> +#include <linux/init.h> + +#include <net/icmp.h> +#include <net/tcp.h> + +#include <asm/uaccess.h> + +int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; + +struct tcp_mib tcp_statistics; + +kmem_cache_t *tcp_openreq_cachep; +kmem_cache_t *tcp_bucket_cachep; +kmem_cache_t *tcp_timewait_cachep; + +/* + * Find someone to 'accept'. Must be called with + * the socket locked or with interrupts disabled + */ + +static struct open_request *tcp_find_established(struct tcp_opt *tp, + struct open_request **prevp) +{ + struct open_request *req = tp->syn_wait_queue; + struct open_request *prev = (struct open_request *)&tp->syn_wait_queue; + while(req) { + if (req->sk && + ((1 << req->sk->state) & + ~(TCPF_SYN_SENT|TCPF_SYN_RECV))) + break; + prev = req; + req = req->dl_next; + } + *prevp = prev; + return req; +} + +/* + * Walk down the receive queue counting readable data. + * + * Must be called with the socket lock held. + */ + +static int tcp_readable(struct sock *sk) +{ + unsigned long counted; + unsigned long amount; + struct sk_buff *skb; + int sum; + + SOCK_DEBUG(sk, "tcp_readable: %p - ",sk); + + skb = skb_peek(&sk->receive_queue); + if (skb == NULL) { + SOCK_DEBUG(sk, "empty\n"); + return(0); + } + + counted = sk->tp_pinfo.af_tcp.copied_seq; /* Where we are at the moment */ + amount = 0; + + /* Do until a push or until we are out of data. */ + do { + /* Found a hole so stops here. */ + if (before(counted, TCP_SKB_CB(skb)->seq)) /* should not happen */ + break; + + /* Length - header but start from where we are up to + * avoid overlaps. + */ + sum = skb->len - (counted - TCP_SKB_CB(skb)->seq); + if (sum >= 0) { + /* Add it up, move on. */ + amount += sum; + counted += sum; + if (skb->h.th->syn) + counted++; + } + + /* Don't count urg data ... but do it in the right place! + * Consider: "old_data (ptr is here) URG PUSH data" + * The old code would stop at the first push because + * it counted the urg (amount==1) and then does amount-- + * *after* the loop. This means tcp_readable() always + * returned zero if any URG PUSH was in the queue, even + * though there was normal data available. If we subtract + * the urg data right here, we even get it to work for more + * than one URG PUSH skb without normal data. + * This means that poll() finally works now with urg data + * in the queue. Note that rlogin was never affected + * because it doesn't use poll(); it uses two processes + * and a blocking read(). And the queue scan in tcp_read() + * was correct. Mike <pall@rz.uni-karlsruhe.de> + */ + + /* Don't count urg data. */ + if (skb->h.th->urg) + amount--; +#if 0 + if (amount && skb->h.th->psh) break; +#endif + skb = skb->next; + } while(skb != (struct sk_buff *)&sk->receive_queue); + + SOCK_DEBUG(sk, "got %lu bytes.\n",amount); + return(amount); +} + +/* + * LISTEN is a special case for poll.. + */ +static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) +{ + struct open_request *req, *dummy; + + lock_sock(sk); + req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy); + release_sock(sk); + if (req) + return POLLIN | POLLRDNORM; + return 0; +} + +/* + * Compute minimal free write space needed to queue new packets. + */ +#define tcp_min_write_space(__sk) \ + (atomic_read(&(__sk)->wmem_alloc) / 2) + +/* + * Wait for a TCP event. 
+ * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. + */ +unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) +{ + unsigned int mask; + struct sock *sk = sock->sk; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + poll_wait(file, sk->sleep, wait); + if (sk->state == TCP_LISTEN) + return tcp_listen_poll(sk, wait); + + mask = 0; + if (sk->err) + mask = POLLERR; + + /* + * POLLHUP is certainly not done right. But poll() doesn't + * have a notion of HUP in just one direction, and for a + * socket the read side is more interesting. + * + * Some poll() documentation says that POLLHUP is incompatible + * with the POLLOUT/POLLWR flags, so somebody should check this + * all. But careful, it tends to be safer to return too many + * bits than too few, and you can easily break real applications + * if you don't tell them that something has hung up! + * + * Check-me. + */ + if (sk->shutdown & RCV_SHUTDOWN) + mask |= POLLHUP; + + /* Connected? */ + if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) { + if ((tp->rcv_nxt != tp->copied_seq) && + (tp->urg_seq != tp->copied_seq || + tp->rcv_nxt != tp->copied_seq+1 || + sk->urginline || !tp->urg_data)) + mask |= POLLIN | POLLRDNORM; + + if (!(sk->shutdown & SEND_SHUTDOWN)) { + if (sock_wspace(sk) >= tcp_min_write_space(sk)) { + mask |= POLLOUT | POLLWRNORM; + } else { /* send SIGIO later */ + sk->socket->flags |= SO_NOSPACE; + } + } + + if (tp->urg_data & URG_VALID) + mask |= POLLPRI; + } + return mask; +} + +/* + * Socket write_space callback. + * This (or rather the sock_wake_async) should agree with poll. + */ +void tcp_write_space(struct sock *sk) +{ + if (sk->dead) + return; + + wake_up_interruptible(sk->sleep); + if (sock_wspace(sk) >= + tcp_min_write_space(sk)) + sock_wake_async(sk->socket, 2); +} + + +int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + int answ; + + switch(cmd) { + case TIOCINQ: +#ifdef FIXME /* FIXME: */ + case FIONREAD: +#endif + if (sk->state == TCP_LISTEN) + return(-EINVAL); + lock_sock(sk); + answ = tcp_readable(sk); + release_sock(sk); + break; + case SIOCATMARK: + { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + answ = tp->urg_data && tp->urg_seq == tp->copied_seq; + break; + } + case TIOCOUTQ: + if (sk->state == TCP_LISTEN) + return(-EINVAL); + answ = sock_wspace(sk); + break; + default: + return(-ENOIOCTLCMD); + }; + + return put_user(answ, (int *)arg); +} + +/* + * Wait for a socket to get into the connected state + * + * Note: must be called with the socket locked. 
+ */ +static int wait_for_tcp_connect(struct sock * sk, int flags) +{ + struct task_struct *tsk = current; + struct wait_queue wait = { tsk, NULL }; + + while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { + if(sk->err) + return sock_error(sk); + if((1 << sk->state) & + ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { + if(sk->keepopen && !(flags&MSG_NOSIGNAL)) + send_sig(SIGPIPE, tsk, 0); + return -EPIPE; + } + if(flags & MSG_DONTWAIT) + return -EAGAIN; + if(signal_pending(tsk)) + return -ERESTARTSYS; + + tsk->state = TASK_INTERRUPTIBLE; + add_wait_queue(sk->sleep, &wait); + release_sock(sk); + + if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) && + sk->err == 0) + schedule(); + + tsk->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + lock_sock(sk); + } + return 0; +} + +static inline int tcp_memory_free(struct sock *sk) +{ + return atomic_read(&sk->wmem_alloc) < sk->sndbuf; +} + +/* + * Wait for more memory for a socket + */ +static void wait_for_tcp_memory(struct sock * sk) +{ + release_sock(sk); + if (!tcp_memory_free(sk)) { + struct wait_queue wait = { current, NULL }; + + sk->socket->flags &= ~SO_NOSPACE; + add_wait_queue(sk->sleep, &wait); + for (;;) { + if (signal_pending(current)) + break; + current->state = TASK_INTERRUPTIBLE; + if (tcp_memory_free(sk)) + break; + if (sk->shutdown & SEND_SHUTDOWN) + break; + if (sk->err) + break; + schedule(); + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + } + lock_sock(sk); +} + +/* + * Wait for a buffer. + */ +static int wait_for_buffer(struct sock *sk) +{ + struct wait_queue wait = { current, NULL }; + + release_sock(sk); + add_wait_queue(sk->sleep, &wait); + current->state = TASK_INTERRUPTIBLE; + schedule(); + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + lock_sock(sk); + return 0; +} + +/* When all user supplied data has been queued set the PSH bit */ +#define PSH_NEEDED (seglen == 0 && iovlen == 0) + +/* + * This routine copies from a user buffer into a socket, + * and starts the transmit system. + * + * Note: must be called with the socket locked. + */ + +int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) +{ + struct iovec *iov; + struct tcp_opt *tp; + struct sk_buff *skb; + int iovlen, flags; + int mss_now; + int err, copied; + + lock_sock(sk); + + err = 0; + tp = &(sk->tp_pinfo.af_tcp); + + /* Wait for a connection to finish. */ + flags = msg->msg_flags; + if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + if((err = wait_for_tcp_connect(sk, flags)) != 0) + goto out; + + /* This should be in poll */ + sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */ + + mss_now = tcp_current_mss(sk); + + /* Ok commence sending. */ + iovlen = msg->msg_iovlen; + iov = msg->msg_iov; + copied = 0; + + while(--iovlen >= 0) { + int seglen=iov->iov_len; + unsigned char * from=iov->iov_base; + + iov++; + + while(seglen > 0) { + int copy, tmp, queue_it, psh; + + if (err) + goto do_fault2; + + /* Stop on errors. */ + if (sk->err) + goto do_sock_err; + + /* Make sure that we are established. */ + if (sk->shutdown & SEND_SHUTDOWN) + goto do_shutdown; + + /* Now we need to check if we have a half + * built packet we can tack some data onto. + */ + if (tp->send_head && !(flags & MSG_OOB)) { + skb = sk->write_queue.prev; + copy = skb->len; + /* If the remote does SWS avoidance we should + * queue the best we can if not we should in + * fact send multiple packets... + * A method for detecting this would be most + * welcome. 
+ */ + if (skb_tailroom(skb) > 0 && + (mss_now - copy) > 0 && + tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) { + int last_byte_was_odd = (copy % 4); + + /* + * Check for parallel writers sleeping in user access. + */ + if (tp->partial_writers++ > 0) { + wait_for_buffer(sk); + tp->partial_writers--; + continue; + } + + copy = mss_now - copy; + if(copy > skb_tailroom(skb)) + copy = skb_tailroom(skb); + if(copy > seglen) + copy = seglen; + + if(last_byte_was_odd) { + if(copy_from_user(skb_put(skb, copy), + from, copy)) + err = -EFAULT; + skb->csum = csum_partial(skb->data, + skb->len, 0); + } else { + skb->csum = + csum_and_copy_from_user( + from, skb_put(skb, copy), + copy, skb->csum, &err); + } + + /* + * FIXME: the *_user functions should + * return how much data was + * copied before the fault + * occurred and then a partial + * packet with this data should + * be sent. Unfortunately + * csum_and_copy_from_user doesn't + * return this information. + * ATM it might send partly zeroed + * data in this case. + */ + tp->write_seq += copy; + TCP_SKB_CB(skb)->end_seq += copy; + from += copy; + copied += copy; + seglen -= copy; + if (PSH_NEEDED) + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + + if (--tp->partial_writers > 0) + wake_up_interruptible(sk->sleep); + + continue; + } + } + + /* We also need to worry about the window. If + * window < 1/2 the maximum window we've seen + * from this host, don't use it. This is + * sender side silly window prevention, as + * specified in RFC1122. (Note that this is + * different than earlier versions of SWS + * prevention, e.g. RFC813.). What we + * actually do is use the whole MSS. Since + * the results in the right edge of the packet + * being outside the window, it will be queued + * for later rather than sent. + */ + psh = 0; + copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); + if(copy > (tp->max_window >> 1)) { + copy = min(copy, mss_now); + psh = 1; + } else { + copy = mss_now; + } + if(copy > seglen) + copy = seglen; + + /* Determine how large of a buffer to allocate. */ + tmp = MAX_HEADER + sk->prot->max_header; + if (copy < min(mss_now, tp->max_window >> 1) && + !(flags & MSG_OOB)) { + tmp += min(mss_now, tp->max_window); + + /* What is happening here is that we want to + * tack on later members of the users iovec + * if possible into a single frame. When we + * leave this loop our caller checks to see if + * we can send queued frames onto the wire. + * See tcp_v[46]_sendmsg() for this. + */ + queue_it = 1; + } else { + tmp += copy; + queue_it = 0; + } + skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL); + + /* If we didn't get any memory, we need to sleep. */ + if (skb == NULL) { + sk->socket->flags |= SO_NOSPACE; + if (flags&MSG_DONTWAIT) { + err = -EAGAIN; + goto do_interrupted; + } + if (signal_pending(current)) { + err = -ERESTARTSYS; + goto do_interrupted; + } + tcp_push_pending_frames(sk, tp); + wait_for_tcp_memory(sk); + + /* If SACK's were formed or PMTU events happened, + * we must find out about it. + */ + mss_now = tcp_current_mss(sk); + continue; + } + + seglen -= copy; + + /* Prepare control bits for TCP header creation engine. */ + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | + ((PSH_NEEDED || psh) ? + TCPCB_FLAG_PSH : 0)); + TCP_SKB_CB(skb)->sacked = 0; + if (flags & MSG_OOB) { + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG; + TCP_SKB_CB(skb)->urg_ptr = copy; + } else + TCP_SKB_CB(skb)->urg_ptr = 0; + + /* TCP data bytes are SKB_PUT() on top, later + * TCP+IP+DEV headers are SKB_PUSH()'d beneath. + * Reserve header space and checksum the data. 
+ */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb->csum = csum_and_copy_from_user(from, + skb_put(skb, copy), copy, 0, &err); + + if (err) + goto do_fault; + + from += copy; + copied += copy; + + TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy; + + /* This advances tp->write_seq for us. */ + tcp_send_skb(sk, skb, queue_it); + } + } + sk->err = 0; + err = copied; + goto out; + +do_sock_err: + if(copied) + err = copied; + else + err = sock_error(sk); + goto out; +do_shutdown: + if(copied) + err = copied; + else { + if (!(flags&MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + err = -EPIPE; + } + goto out; +do_interrupted: + if(copied) + err = copied; + goto out; +do_fault: + kfree_skb(skb); +do_fault2: + err = -EFAULT; +out: + tcp_push_pending_frames(sk, tp); + release_sock(sk); + return err; +} + +#undef PSH_NEEDED + +/* + * Send an ack if one is backlogged at this point. Ought to merge + * this with tcp_send_ack(). + * This is called for delayed acks also. + */ + +void tcp_read_wakeup(struct sock *sk) +{ + /* If we're closed, don't send an ack, or we'll get a RST + * from the closed destination. + */ + if (sk->state != TCP_CLOSE) + tcp_send_ack(sk); +} + +/* + * Handle reading urgent data. BSD has very simple semantics for + * this, no blocking and very strange errors 8) + */ + +static int tcp_recv_urg(struct sock * sk, int nonblock, + struct msghdr *msg, int len, int flags, + int *addr_len) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* No URG data to read. */ + if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ) + return -EINVAL; /* Yes this is right ! */ + + if (sk->err) + return sock_error(sk); + + if (sk->done) + return -ENOTCONN; + + if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) { + sk->done = 1; + return 0; + } + + lock_sock(sk); + if (tp->urg_data & URG_VALID) { + int err = 0; + char c = tp->urg_data; + + if (!(flags & MSG_PEEK)) + tp->urg_data = URG_READ; + + if(msg->msg_name) + tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) + msg->msg_name); + + if(addr_len) + *addr_len = tp->af_specific->sockaddr_len; + + /* Read urgent data. */ + msg->msg_flags|=MSG_OOB; + release_sock(sk); + + if(len>0) + { + err = memcpy_toiovec(msg->msg_iov, &c, 1); + /* N.B. already set above ... */ + msg->msg_flags|=MSG_OOB; + } + else + msg->msg_flags|=MSG_TRUNC; + + /* N.B. Is this right?? If len == 0 we didn't read any data */ + return err ? -EFAULT : 1; + } + release_sock(sk); + + /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and + * the available implementations agree in this case: + * this call should never block, independent of the + * blocking state of the socket. + * Mike <pall@rz.uni-karlsruhe.de> + */ + return -EAGAIN; +} + +/* + * Release a skb if it is no longer needed. This routine + * must be called with interrupts disabled or with the + * socket locked so that the sk_buff queue operation is ok. + */ + +static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb) +{ + __skb_unlink(skb, &sk->receive_queue); + kfree_skb(skb); +} + +/* Clean up the receive buffer for full frames taken by the user, + * then send an ACK if necessary. COPIED is the number of bytes + * tcp_recvmsg has given to the user so far, it speeds up the + * calculation of whether or not we must ACK for the sake of + * a window update. + */ +static void cleanup_rbuf(struct sock *sk, int copied) +{ + struct sk_buff *skb; + + /* NOTE! 
The socket must be locked, so that we don't get + * a messed-up receive queue. + */ + while ((skb=skb_peek(&sk->receive_queue)) != NULL) { + if (!skb->used || atomic_read(&skb->users) > 1) + break; + tcp_eat_skb(sk, skb); + } + + /* We send an ACK if we can now advertise a non-zero window + * which has been raised "significantly". + */ + if(copied > 0) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + __u32 rcv_window_now = tcp_receive_window(tp); + __u32 new_window = __tcp_select_window(sk); + + /* We won't be raising the window any further than + * the window-clamp allows. Our window selection + * also keeps things a nice multiple of MSS. These + * checks are necessary to prevent spurious ACKs + * which don't advertize a larger window. + */ + if((new_window && (new_window >= rcv_window_now * 2)) && + ((rcv_window_now + tp->mss_cache) <= tp->window_clamp)) + tcp_read_wakeup(sk); + } +} + + +/* + * This routine copies from a sock struct into the user buffer. + */ + +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, + int len, int nonblock, int flags, int *addr_len) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct wait_queue wait = { current, NULL }; + int copied = 0; + u32 peek_seq; + volatile u32 *seq; /* So gcc doesn't overoptimise */ + unsigned long used; + int err = 0; + int target = 1; /* Read at least this many bytes */ + + if (sk->err) + return sock_error(sk); + + if (sk->state == TCP_LISTEN) + return -ENOTCONN; + + /* Urgent data needs to be handled specially. */ + if (flags & MSG_OOB) + return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len); + + /* Copying sequence to update. This is volatile to handle + * the multi-reader case neatly (memcpy_to/fromfs might be + * inline and thus not flush cached variables otherwise). + */ + peek_seq = tp->copied_seq; + seq = &tp->copied_seq; + if (flags & MSG_PEEK) + seq = &peek_seq; + + /* Handle the POSIX bogosity MSG_WAITALL. */ + if (flags & MSG_WAITALL) + target=len; + + add_wait_queue(sk->sleep, &wait); + lock_sock(sk); + + /* + * BUG BUG BUG + * This violates 1003.1g compliance. We must wait for + * data to exist even if we read none! + */ + + while (len > 0) { + struct sk_buff * skb; + u32 offset; + + /* Are we at urgent data? Stop if we have read anything. */ + if (copied && tp->urg_data && tp->urg_seq == *seq) + break; + + /* We need to check signals first, to get correct SIGURG + * handling. FIXME: Need to check this doesnt impact 1003.1g + * and move it down to the bottom of the loop + */ + if (signal_pending(current)) { + if (copied) + break; + copied = -ERESTARTSYS; + if (nonblock) + copied = -EAGAIN; + break; + } + + /* Next get a buffer. */ + current->state = TASK_INTERRUPTIBLE; + + skb = skb_peek(&sk->receive_queue); + do { + if (!skb) + break; + + /* Now that we have two receive queues this + * shouldn't happen. + */ + if (before(*seq, TCP_SKB_CB(skb)->seq)) { + printk(KERN_INFO "recvmsg bug: copied %X seq %X\n", + *seq, TCP_SKB_CB(skb)->seq); + break; + } + offset = *seq - TCP_SKB_CB(skb)->seq; + if (skb->h.th->syn) + offset--; + if (offset < skb->len) + goto found_ok_skb; + if (skb->h.th->fin) + goto found_fin_ok; + if (!(flags & MSG_PEEK)) + skb->used = 1; + skb = skb->next; + } while (skb != (struct sk_buff *)&sk->receive_queue); + + if (copied >= target) + break; + + /* + These three lines and clause if (sk->state == TCP_CLOSE) + are unlikely to be correct, if target > 1. + I DO NOT FIX IT, because I have no idea, what + POSIX prescribes to make here. 
Probably, it really + wants to lose data 8), if not all target is received. + --ANK + */ + if (sk->err && !(flags&MSG_PEEK)) { + copied = sock_error(sk); + break; + } + + if (sk->shutdown & RCV_SHUTDOWN) { + sk->done = 1; + break; + } + + if (sk->state == TCP_CLOSE) { + if (!sk->done) { + sk->done = 1; + break; + } + copied = -ENOTCONN; + break; + } + + if (nonblock) { + copied = -EAGAIN; + break; + } + + cleanup_rbuf(sk, copied); + release_sock(sk); + sk->socket->flags |= SO_WAITDATA; + schedule(); + sk->socket->flags &= ~SO_WAITDATA; + lock_sock(sk); + continue; + + found_ok_skb: + /* Lock the buffer. We can be fairly relaxed as + * an interrupt will never steal a buffer we are + * using unless I've missed something serious in + * tcp_data. + */ + atomic_inc(&skb->users); + + /* Ok so how much can we use? */ + used = skb->len - offset; + if (len < used) + used = len; + + /* Do we have urgent data here? */ + if (tp->urg_data) { + u32 urg_offset = tp->urg_seq - *seq; + if (urg_offset < used) { + if (!urg_offset) { + if (!sk->urginline) { + ++*seq; + offset++; + used--; + } + } else + used = urg_offset; + } + } + + /* Copy it - We _MUST_ update *seq first so that we + * don't ever double read when we have dual readers + */ + *seq += used; + + /* This memcpy_toiovec can sleep. If it sleeps and we + * do a second read it relies on the skb->users to avoid + * a crash when cleanup_rbuf() gets called. + */ + err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used); + if (err) { + /* Exception. Bailout! */ + atomic_dec(&skb->users); + copied = -EFAULT; + break; + } + + copied += used; + len -= used; + + /* We now will not sleep again until we are finished + * with skb. Sorry if you are doing the SMP port + * but you'll just have to fix it neatly ;) + */ + atomic_dec(&skb->users); + + if (after(tp->copied_seq,tp->urg_seq)) + tp->urg_data = 0; + if (used + offset < skb->len) + continue; + + /* Process the FIN. We may also need to handle PSH + * here and make it break out of MSG_WAITALL. + */ + if (skb->h.th->fin) + goto found_fin_ok; + if (flags & MSG_PEEK) + continue; + skb->used = 1; + if (atomic_read(&skb->users) == 1) + tcp_eat_skb(sk, skb); + continue; + + found_fin_ok: + ++*seq; + if (flags & MSG_PEEK) + break; + + /* All is done. */ + skb->used = 1; + sk->shutdown |= RCV_SHUTDOWN; + break; + } + + if(copied >= 0 && msg->msg_name) { + tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) + msg->msg_name); + if(addr_len) + *addr_len = tp->af_specific->sockaddr_len; + } + + remove_wait_queue(sk->sleep, &wait); + current->state = TASK_RUNNING; + + /* Clean up data we have read: This will do ACK frames. */ + cleanup_rbuf(sk, copied); + release_sock(sk); + return copied; +} + +/* + * Check whether to renew the timer. + */ +static inline void tcp_check_fin_timer(struct sock *sk) +{ + if (sk->state == TCP_FIN_WAIT2 && !sk->timer.prev) + tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); +} + +/* + * State processing on a close. This implements the state shift for + * sending our FIN frame. Note that we only send a FIN for some + * states. A shutdown() may have already sent the FIN, or we may be + * closed. 
+ */ + +static unsigned char new_state[16] = { + /* current state: new state: action: */ + /* (Invalid) */ TCP_CLOSE, + /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_SYN_SENT */ TCP_CLOSE, + /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, + /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, + /* TCP_TIME_WAIT */ TCP_CLOSE, + /* TCP_CLOSE */ TCP_CLOSE, + /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, + /* TCP_LAST_ACK */ TCP_LAST_ACK, + /* TCP_LISTEN */ TCP_CLOSE, + /* TCP_CLOSING */ TCP_CLOSING, +}; + +static int tcp_close_state(struct sock *sk, int dead) +{ + int next = (int) new_state[sk->state]; + int ns = (next & TCP_STATE_MASK); + + tcp_set_state(sk, ns); + + /* This is a (useful) BSD violating of the RFC. There is a + * problem with TCP as specified in that the other end could + * keep a socket open forever with no application left this end. + * We use a 3 minute timeout (about the same as BSD) then kill + * our end. If they send after that then tough - BUT: long enough + * that we won't make the old 4*rto = almost no time - whoops + * reset mistake. + */ + if (dead) + tcp_check_fin_timer(sk); + + return (next & TCP_ACTION_FIN); +} + +/* + * Shutdown the sending side of a connection. Much like close except + * that we don't receive shut down or set sk->dead. + */ + +void tcp_shutdown(struct sock *sk, int how) +{ + /* We need to grab some memory, and put together a FIN, + * and then put it into the queue to be sent. + * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. + */ + if (!(how & SEND_SHUTDOWN)) + return; + + /* If we've already sent a FIN, or it's a closed state, skip this. */ + if ((1 << sk->state) & + (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) { + lock_sock(sk); + + /* Clear out any half completed packets. FIN if needed. */ + if (tcp_close_state(sk,0)) + tcp_send_fin(sk); + + release_sock(sk); + } +} + + +/* + * Return 1 if we still have things to send in our buffers. + */ + +static inline int closing(struct sock * sk) +{ + return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK)); +} + +/* + * This routine closes sockets which have been at least partially + * opened, but not yet accepted. Currently it is only called by + * tcp_close, and timeout mirrors the value there. + */ + +static void tcp_close_pending (struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct open_request *req = tp->syn_wait_queue; + + while(req) { + struct open_request *iter; + + if (req->sk) + tcp_close(req->sk, 0); + + iter = req; + req = req->dl_next; + + (*iter->class->destructor)(iter); + tcp_dec_slow_timer(TCP_SLT_SYNACK); + sk->ack_backlog--; + tcp_openreq_free(iter); + } + + tcp_synq_init(tp); +} + +void tcp_close(struct sock *sk, long timeout) +{ + struct sk_buff *skb; + int data_was_unread = 0; + + /* + * Check whether the socket is locked ... supposedly + * it's impossible to tcp_close() a locked socket. + */ + if (atomic_read(&sk->sock_readers)) + printk("tcp_close: socket already locked!\n"); + + /* We need to grab some memory, and put together a FIN, + * and then put it into the queue to be sent. + */ + lock_sock(sk); + if(sk->state == TCP_LISTEN) { + /* Special case. */ + tcp_set_state(sk, TCP_CLOSE); + tcp_close_pending(sk); + release_sock(sk); + sk->dead = 1; + return; + } + + /* It is questionable, what the role of this is now. + * In any event either it should be removed, or + * increment of SLT_KEEPALIVE be done, this is causing + * big problems. For now I comment it out. 
-DaveM + */ + /* sk->keepopen = 1; */ + sk->shutdown = SHUTDOWN_MASK; + + if (!sk->dead) + sk->state_change(sk); + + /* We need to flush the recv. buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + * reader process may not have drained the data yet! + */ + while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) { + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin; + data_was_unread += len; + kfree_skb(skb); + } + + /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section + * 3.10, we send a RST here because data was lost. To + * witness the awful effects of the old behavior of always + * doing a FIN, run an older 2.1.x kernel or 2.0.x, start + * a bulk GET in an FTP client, suspend the process, wait + * for the client to advertise a zero window, then kill -9 + * the FTP client, wheee... Note: timeout is always zero + * in such a case. + */ + if(data_was_unread != 0) { + /* Unread data was tossed, zap the connection. */ + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk); + } else if (tcp_close_state(sk,1)) { + /* We FIN if the application ate all the data before + * zapping the connection. + */ + tcp_send_fin(sk); + } + + if (timeout) { + struct task_struct *tsk = current; + struct wait_queue wait = { tsk, NULL }; + + add_wait_queue(sk->sleep, &wait); + release_sock(sk); + + while (1) { + tsk->state = TASK_INTERRUPTIBLE; + if (!closing(sk)) + break; + timeout = schedule_timeout(timeout); + if (signal_pending(tsk) || !timeout) + break; + } + + tsk->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + + lock_sock(sk); + } + + /* Now that the socket is dead, if we are in the FIN_WAIT2 state + * we may need to set up a timer. + */ + tcp_check_fin_timer(sk); + + release_sock(sk); + sk->dead = 1; +} + +/* + * Wait for an incoming connection, avoid race + * conditions. This must be called with the socket locked. + */ +static struct open_request * wait_for_connect(struct sock * sk, + struct open_request **pprev) +{ + struct wait_queue wait = { current, NULL }; + struct open_request *req; + + add_wait_queue(sk->sleep, &wait); + for (;;) { + current->state = TASK_INTERRUPTIBLE; + release_sock(sk); + schedule(); + lock_sock(sk); + req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev); + if (req) + break; + if (signal_pending(current)) + break; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + return req; +} + +/* + * This will accept the next outstanding connection. + * + * Be careful about race conditions here - this is subtle. + */ + +struct sock *tcp_accept(struct sock *sk, int flags) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct open_request *req, *prev; + struct sock *newsk = NULL; + int error; + + lock_sock(sk); + + /* We need to make sure that this socket is listening, + * and that it has something pending. 
+ */
+	error = EINVAL;
+	if (sk->state != TCP_LISTEN)
+		goto out;
+
+	/* Find already established connection */
+	req = tcp_find_established(tp, &prev);
+	if (!req) {
+		/* If this is a non-blocking socket don't sleep */
+		error = EAGAIN;
+		if (flags & O_NONBLOCK)
+			goto out;
+
+		error = ERESTARTSYS;
+		req = wait_for_connect(sk, &prev);
+		if (!req)
+			goto out;
+	}
+
+	tcp_synq_unlink(tp, req, prev);
+	newsk = req->sk;
+	req->class->destructor(req);
+	tcp_openreq_free(req);
+	sk->ack_backlog--;
+	if(sk->keepopen)
+		tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
+
+	release_sock(sk);
+	return newsk;
+
+out:
+	/* sk should be in LISTEN state, thus accept can use sk->err for
+	 * internal purposes without stomping on anyone's feet.
+	 */
+	sk->err = error;
+	release_sock(sk);
+	return newsk;
+}
+
+/*
+ *	Socket option code for TCP.
+ */
+
+int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
+		   int optlen)
+{
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	int val;
+
+	if (level != SOL_TCP)
+		return tp->af_specific->setsockopt(sk, level, optname,
+						   optval, optlen);
+
+	if(optlen<sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int *)optval))
+		return -EFAULT;
+
+	switch(optname) {
+	case TCP_MAXSEG:
+		/* Values greater than the interface MTU won't take effect.
+		 * However, at the point when this call is done we typically
+		 * don't yet know which interface is going to be used.
+		 */
+		if(val < 1 || val > MAX_WINDOW)
+			return -EINVAL;
+		tp->user_mss = val;
+		return 0;
+
+	case TCP_NODELAY:
+		/* You cannot try to use this and TCP_CORK in
+		 * tandem, so let the user know.
+		 */
+		if (sk->nonagle == 2)
+			return -EINVAL;
+		sk->nonagle = (val == 0) ? 0 : 1;
+		return 0;
+
+	case TCP_CORK:
+		/* When set indicates to always queue non-full frames.
+		 * Later the user clears this option and we transmit
+		 * any pending partial frames in the queue.  This is
+		 * meant to be used alongside sendfile() to get properly
+		 * filled frames when the user (for example) must write
+		 * out headers with a write() call first and then use
+		 * sendfile to send out the data parts.
+		 *
+		 * You cannot try to use TCP_NODELAY and this mechanism
+		 * at the same time, so let the user know.
+		 */
+		if (sk->nonagle == 1)
+			return -EINVAL;
+		if (val != 0) {
+			sk->nonagle = 2;
+		} else {
+			sk->nonagle = 0;
+
+			lock_sock(sk);
+			tcp_push_pending_frames(sk, tp);
+			release_sock(sk);
+		}
+		return 0;
+
+	default:
+		return -ENOPROTOOPT;
+	};
+}
+
+int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
+		   int *optlen)
+{
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	int val, len;
+
+	if(level != SOL_TCP)
+		return tp->af_specific->getsockopt(sk, level, optname,
+						   optval, optlen);
+
+	if(get_user(len,optlen))
+		return -EFAULT;
+
+	len = min(len, sizeof(int));
+
+	switch(optname) {
+	case TCP_MAXSEG:
+		val = tp->user_mss;
+		break;
+	case TCP_NODELAY:
+		val = (sk->nonagle == 1);
+		break;
+	case TCP_CORK:
+		val = (sk->nonagle == 2);
+		break;
+	default:
+		return -ENOPROTOOPT;
+	};
+
+	if(put_user(len, optlen))
+		return -EFAULT;
+	if(copy_to_user(optval, &val,len))
+		return -EFAULT;
+	return 0;
+}
+
+void tcp_set_keepalive(struct sock *sk, int val)
+{
+	if (!sk->keepopen && val)
+		tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
+	else if (sk->keepopen && !val)
+		tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
+}
+
+extern void __skb_cb_too_small_for_tcp(int, int);
+
+void __init tcp_init(void)
+{
+	struct sk_buff *skb = NULL;
+
+	if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
+		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
+					   sizeof(skb->cb));
+
+	tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
+					       sizeof(struct open_request),
+					       0, SLAB_HWCACHE_ALIGN,
+					       NULL, NULL);
+	if(!tcp_openreq_cachep)
+		panic("tcp_init: Cannot alloc open_request cache.");
+
+	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
+					      sizeof(struct tcp_bind_bucket),
+					      0, SLAB_HWCACHE_ALIGN,
+					      NULL, NULL);
+	if(!tcp_bucket_cachep)
+		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
+
+	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
+						sizeof(struct tcp_tw_bucket),
+						0, SLAB_HWCACHE_ALIGN,
+						NULL, NULL);
+	if(!tcp_timewait_cachep)
+		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
+}
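For readers coming to this code from user space, the TCP_CORK path in tcp_setsockopt() above pairs with write() or sendfile() exactly as its comment describes: cork the socket while a response is being assembled, then uncork once so tcp_push_pending_frames() flushes the remaining partial frame. A minimal user-level sketch of that pattern follows; it is an editor's illustration, not part of this commit, the helper name send_corked and the connected socket fd are assumed for the example, and error handling (including partial writes) is abbreviated:

#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Queue a short header plus a body as one wire-efficient burst:
 * cork the socket, write both pieces, then uncork so the kernel's
 * tcp_push_pending_frames() sends whatever partial frame remains. */
static int send_corked(int fd, const char *hdr, const char *body)
{
	int on = 1, off = 0;

	/* The setsockopt handler above returns EINVAL if TCP_NODELAY
	 * is already set: the two options are mutually exclusive. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on)) < 0)
		return -1;

	if (write(fd, hdr, strlen(hdr)) < 0 ||
	    write(fd, body, strlen(body)) < 0)
		return -1;

	/* Clearing the option is what triggers the flush. */
	return setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
}

Because this implementation rejects mixing TCP_NODELAY and TCP_CORK, a caller must pick one strategy per socket; portable code should also guard the option with #ifdef TCP_CORK, since it is a Linux/pfinet extension rather than POSIX.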