From a03e2da13bfbed30eedd589977595a5f3398a478 Mon Sep 17 00:00:00 2001
From: Zheng Da
Date: Tue, 4 May 2010 16:08:14 +0200
Subject: remove all temporary files created by vim.

---
 pfinet.old/linux-src/net/core/dev.c~ | 2092 -----------------------
 pfinet.old/linux-src/net/ipv4/ip_output.c~ | 1000 -----------
 pfinet.old/linux-src/net/ipv4/tcp_input.c~ | 2449 ---------------------------
 pfinet.old/linux-src/net/ipv4/tcp_output.c~ | 1150 -------------
 4 files changed, 6691 deletions(-)
 delete mode 100644 pfinet.old/linux-src/net/core/dev.c~
 delete mode 100644 pfinet.old/linux-src/net/ipv4/ip_output.c~
 delete mode 100644 pfinet.old/linux-src/net/ipv4/tcp_input.c~
 delete mode 100644 pfinet.old/linux-src/net/ipv4/tcp_output.c~

diff --git a/pfinet.old/linux-src/net/core/dev.c~ b/pfinet.old/linux-src/net/core/dev.c~
deleted file mode 100644
index 7d0658c5..00000000
--- a/pfinet.old/linux-src/net/core/dev.c~
+++ /dev/null
@@ -1,2092 +0,0 @@
-/*
- * NET3 Protocol independent device support routines.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Derived from the non IP parts of dev.c 1.0.19
- * Authors: Ross Biro,
- * Fred N. van Kempen,
- * Mark Evans,
- *
- * Additional Authors:
- * Florian la Roche
- * Alan Cox
- * David Hinds
- * Alexey Kuznetsov
- * Adam Sulmicki
- *
- * Changes:
- * Marcelo Tosatti : dont accept mtu 0 or <
- * Alan Cox : device private ioctl copies fields back.
- * Alan Cox : Transmit queue code does relevant stunts to
- * keep the queue safe.
- * Alan Cox : Fixed double lock.
- * Alan Cox : Fixed promisc NULL pointer trap
- * ???????? : Support the full private ioctl range
- * Alan Cox : Moved ioctl permission check into drivers
- * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
- * Alan Cox : 100 backlog just doesn't cut it when
- * you start doing multicast video 8)
- * Alan Cox : Rewrote net_bh and list manager.
- * Alan Cox : Fix ETH_P_ALL echoback lengths.
- * Alan Cox : Took out transmit every packet pass
- * Saved a few bytes in the ioctl handler
- * Alan Cox : Network driver sets packet type before calling netif_rx. Saves
- * a function call a packet.
- * Alan Cox : Hashed net_bh()
- * Richard Kooijman: Timestamp fixes.
- * Alan Cox : Wrong field in SIOCGIFDSTADDR
- * Alan Cox : Device lock protection.
- * Alan Cox : Fixed nasty side effect of device close changes.
- * Rudi Cilibrasi : Pass the right thing to set_mac_address()
- * Dave Miller : 32bit quantity for the device lock to make it work out
- * on a Sparc.
- * Bjorn Ekwall : Added KERNELD hack.
- * Alan Cox : Cleaned up the backlog initialise.
- * Craig Metz : SIOCGIFCONF fix if space for under
- * 1 device.
- * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
- * is no device open function.
- * Andi Kleen : Fix error reporting for SIOCGIFCONF
- * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
- * Cyrus Durgin : Cleaned for KMOD
- * Adam Sulmicki : Bug Fix : Network Device Unload
- * A network device unload needs to purge
- * the backlog queue.
- * Paul Rusty Russel : SIOCSIFNAME
- * Andrea Arcangeli : dev_clear_backlog() needs the
- * skb_queue_lock held.
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef CONFIG_NET_RADIO -#include -#endif /* CONFIG_NET_RADIO */ -#ifdef CONFIG_PLIP -extern int plip_init(void); -#endif - -NET_PROFILE_DEFINE(dev_queue_xmit) -NET_PROFILE_DEFINE(net_bh) -NET_PROFILE_DEFINE(net_bh_skb) - - -const char *if_port_text[] = { - "unknown", - "BNC", - "10baseT", - "AUI", - "100baseT", - "100baseTX", - "100baseFX" -}; - -/* - * The list of packet types we will receive (as opposed to discard) - * and the routines to invoke. - * - * Why 16. Because with 16 the only overlap we get on a hash of the - * low nibble of the protocol value is RARP/SNAP/X.25. - * - * 0800 IP - * 0001 802.3 - * 0002 AX.25 - * 0004 802.2 - * 8035 RARP - * 0005 SNAP - * 0805 X.25 - * 0806 ARP - * 8137 IPX - * 0009 Localtalk - * 86DD IPv6 - */ - -struct packet_type *ptype_base[16]; /* 16 way hashed list */ -struct packet_type *ptype_all = NULL; /* Taps */ - -/* - * Device list lock. Setting it provides that interface - * will not disappear unexpectedly while kernel sleeps. - */ - -atomic_t dev_lockct = ATOMIC_INIT(0); - -/* - * Our notifier list - */ - -#ifdef _HURD_ -struct notifier_block *netdev_chain=NULL; -#else -static struct notifier_block *netdev_chain=NULL; -#endif - -/* - * Device drivers call our routines to queue packets here. We empty the - * queue in the bottom half handler. - */ - -static struct sk_buff_head backlog; - -#ifdef CONFIG_NET_FASTROUTE -int netdev_fastroute; -int netdev_fastroute_obstacles; -struct net_fastroute_stats dev_fastroute_stat; -#endif - -static void dev_clear_backlog(struct device *dev); - - -/****************************************************************************************** - - Protocol management and registration routines - -*******************************************************************************************/ - -/* - * For efficiency - */ - -int netdev_nit=0; - -/* - * Add a protocol ID to the list. Now that the input handler is - * smarter we can dispense with all the messy stuff that used to be - * here. - * - * BEWARE!!! Protocol handlers, mangling input packets, - * MUST BE last in hash buckets and checking protocol handlers - * MUST start from promiscous ptype_all chain in net_bh. - * It is true now, do not change it. - * Explantion follows: if protocol handler, mangling packet, will - * be the first on list, it is not able to sense, that packet - * is cloned and should be copied-on-write, so that it will - * change it and subsequent readers will get broken packet. - * --ANK (980803) - */ - -void dev_add_pack(struct packet_type *pt) -{ - int hash; -#ifdef CONFIG_NET_FASTROUTE - /* Hack to detect packet socket */ - if (pt->data) { - netdev_fastroute_obstacles++; - dev_clear_fastroute(pt->dev); - } -#endif - if(pt->type==htons(ETH_P_ALL)) - { - netdev_nit++; - pt->next=ptype_all; - ptype_all=pt; - } - else - { - hash=ntohs(pt->type)&15; - pt->next = ptype_base[hash]; - ptype_base[hash] = pt; - } -} - - -/* - * Remove a protocol ID from the list. 
- */ - -void dev_remove_pack(struct packet_type *pt) -{ - struct packet_type **pt1; - if(pt->type==htons(ETH_P_ALL)) - { - netdev_nit--; - pt1=&ptype_all; - } - else - pt1=&ptype_base[ntohs(pt->type)&15]; - for(; (*pt1)!=NULL; pt1=&((*pt1)->next)) - { - if(pt==(*pt1)) - { - *pt1=pt->next; - synchronize_bh(); -#ifdef CONFIG_NET_FASTROUTE - if (pt->data) - netdev_fastroute_obstacles--; -#endif - return; - } - } - printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); -} - -/***************************************************************************************** - - Device Interface Subroutines - -******************************************************************************************/ - -/* - * Find an interface by name. - */ - -struct device *dev_get(const char *name) -{ - struct device *dev; - - for (dev = dev_base; dev != NULL; dev = dev->next) - { - if (strcmp(dev->name, name) == 0) - return(dev); - } - return NULL; -} - -struct device * dev_get_by_index(int ifindex) -{ - struct device *dev; - - for (dev = dev_base; dev != NULL; dev = dev->next) - { - if (dev->ifindex == ifindex) - return(dev); - } - return NULL; -} - -struct device *dev_getbyhwaddr(unsigned short type, char *ha) -{ - struct device *dev; - - for (dev = dev_base; dev != NULL; dev = dev->next) - { - if (dev->type == type && - memcmp(dev->dev_addr, ha, dev->addr_len) == 0) - return(dev); - } - return(NULL); -} - -/* - * Passed a format string - eg "lt%d" it will try and find a suitable - * id. Not efficient for many devices, not called a lot.. - */ - -int dev_alloc_name(struct device *dev, const char *name) -{ - int i; - /* - * If you need over 100 please also fix the algorithm... - */ - for(i=0;i<100;i++) - { - sprintf(dev->name,name,i); - if(dev_get(dev->name)==NULL) - return i; - } - return -ENFILE; /* Over 100 of the things .. bail out! */ -} - -struct device *dev_alloc(const char *name, int *err) -{ - struct device *dev=kmalloc(sizeof(struct device)+16, GFP_KERNEL); - if(dev==NULL) - { - *err=-ENOBUFS; - return NULL; - } - dev->name=(char *)(dev+1); /* Name string space */ - *err=dev_alloc_name(dev,name); - if(*err<0) - { - kfree(dev); - return NULL; - } - return dev; -} - -void netdev_state_change(struct device *dev) -{ - if (dev->flags&IFF_UP) - notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); -} - - -/* - * Find and possibly load an interface. - */ - -#ifdef CONFIG_KMOD - -void dev_load(const char *name) -{ - if(!dev_get(name) && capable(CAP_SYS_MODULE)) - request_module(name); -} - -#else - -extern inline void dev_load(const char *unused){;} - -#endif - -static int default_rebuild_header(struct sk_buff *skb) -{ - printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", skb->dev ? skb->dev->name : "NULL!!!"); - kfree_skb(skb); - return 1; -} - -/* - * Prepare an interface for use. - */ - -int dev_open(struct device *dev) -{ - int ret = 0; - - /* - * Is it already up? - */ - - if (dev->flags&IFF_UP) - return 0; - - /* - * Call device private open method - */ - - if (dev->open) - ret = dev->open(dev); - - /* - * If it went open OK then: - */ - - if (ret == 0) - { - /* - * nil rebuild_header routine, - * that should be never called and used as just bug trap. - */ - - if (dev->rebuild_header == NULL) - dev->rebuild_header = default_rebuild_header; - - /* - * Set the flags. - */ - dev->flags |= (IFF_UP | IFF_RUNNING); - - /* - * Initialize multicasting status - */ - dev_mc_upload(dev); - - /* - * Wakeup transmit queue engine - */ - dev_activate(dev); - - /* - * ... and announce new interface. 
- */ - notifier_call_chain(&netdev_chain, NETDEV_UP, dev); - - } - return(ret); -} - -#ifdef CONFIG_NET_FASTROUTE - -static __inline__ void dev_do_clear_fastroute(struct device *dev) -{ - if (dev->accept_fastpath) { - int i; - - for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++) - dst_release_irqwait(xchg(dev->fastpath+i, NULL)); - } -} - -void dev_clear_fastroute(struct device *dev) -{ - if (dev) { - dev_do_clear_fastroute(dev); - } else { - for (dev = dev_base; dev; dev = dev->next) - dev_do_clear_fastroute(dev); - } -} -#endif - -/* - * Completely shutdown an interface. - */ - -int dev_close(struct device *dev) -{ - if (!(dev->flags&IFF_UP)) - return 0; - - dev_deactivate(dev); - - dev_lock_wait(); - - /* - * Call the device specific close. This cannot fail. - * Only if device is UP - */ - - if (dev->stop) - dev->stop(dev); - - if (dev->start) - printk("dev_close: bug %s still running\n", dev->name); - - /* - * Device is now down. - */ - dev_clear_backlog(dev); - - dev->flags&=~(IFF_UP|IFF_RUNNING); -#ifdef CONFIG_NET_FASTROUTE - dev_clear_fastroute(dev); -#endif - - /* - * Tell people we are going down - */ - notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); - - return(0); -} - - -/* - * Device change register/unregister. These are not inline or static - * as we export them to the world. - */ - -int register_netdevice_notifier(struct notifier_block *nb) -{ - return notifier_chain_register(&netdev_chain, nb); -} - -int unregister_netdevice_notifier(struct notifier_block *nb) -{ - return notifier_chain_unregister(&netdev_chain,nb); -} - -/* - * Support routine. Sends outgoing frames to any network - * taps currently in use. - */ - -void dev_queue_xmit_nit(struct sk_buff *skb, struct device *dev) -{ - struct packet_type *ptype; - get_fast_time(&skb->stamp); - - for (ptype = ptype_all; ptype!=NULL; ptype = ptype->next) - { - /* Never send packets back to the socket - * they originated from - MvS (miquels@drinkel.ow.org) - */ - if ((ptype->dev == dev || !ptype->dev) && - ((struct sock *)ptype->data != skb->sk)) - { - struct sk_buff *skb2; - if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) - break; - - /* Code, following below is wrong. - - The only reason, why it does work is that - ONLY packet sockets receive outgoing - packets. If such a packet will be (occasionally) - received by normal packet handler, which expects - that mac header is pulled... - */ - - /* More sensible variant. skb->nh should be correctly - set by sender, so that the second statement is - just protection against buggy protocols. - */ - skb2->mac.raw = skb2->data; - - if (skb2->nh.raw < skb2->data || skb2->nh.raw >= skb2->tail) { - if (net_ratelimit()) - printk(KERN_DEBUG "protocol %04x is buggy, dev %s\n", skb2->protocol, dev->name); - skb2->nh.raw = skb2->data; - if (dev->hard_header) - skb2->nh.raw += dev->hard_header_len; - } - - skb2->h.raw = skb2->nh.raw; - skb2->pkt_type = PACKET_OUTGOING; - ptype->func(skb2, skb->dev, ptype); - } - } -} - -/* - * Fast path for loopback frames. 
- */ - -void dev_loopback_xmit(struct sk_buff *skb) -{ - struct sk_buff *newskb=skb_clone(skb, GFP_ATOMIC); - if (newskb==NULL) - return; - - newskb->mac.raw = newskb->data; - skb_pull(newskb, newskb->nh.raw - newskb->data); - newskb->pkt_type = PACKET_LOOPBACK; - newskb->ip_summed = CHECKSUM_UNNECESSARY; - if (newskb->dst==NULL) - printk(KERN_DEBUG "BUG: packet without dst looped back 1\n"); - netif_rx(newskb); -} - -int dev_queue_xmit(struct sk_buff *skb) -{ - struct device *dev = skb->dev; - struct Qdisc *q; - char *str1 = "pfinet: dev_queue_xmit check point 1.\n"; - char *str2 = "pfinet: dev_queue_xmit check point 2.\n"; - char *str3 = "pfinet: dev_queue_xmit check point 3.\n"; - char *str4 = "pfinet: dev_queue_xmit check point 4.\n"; - int stderr_fd = fileno (stderr); - - write (stderr_fd, str1, strlen (str1) + 1); - fflush (stderr); - -#ifdef CONFIG_NET_PROFILE - start_bh_atomic(); - NET_PROFILE_ENTER(dev_queue_xmit); -#endif - - start_bh_atomic(); - q = dev->qdisc; - write (stderr_fd, str2, strlen (str2) + 1); - fflush (stderr); - if (q->enqueue) { - q->enqueue(skb, q); - qdisc_wakeup(dev); - end_bh_atomic(); - -#ifdef CONFIG_NET_PROFILE - NET_PROFILE_LEAVE(dev_queue_xmit); - end_bh_atomic(); -#endif - - return 0; - } - write (stderr_fd, str3, strlen (str3) + 1); - fflush (stderr); - - /* The device has no queue. Common case for software devices: - loopback, all the sorts of tunnels... - - Really, it is unlikely that bh protection is necessary here: - virtual devices do not generate EOI events. - However, it is possible, that they rely on bh protection - made by us here. - */ - if (dev->flags&IFF_UP) { - write (stderr_fd, str4, strlen (str4) + 1); - fflush (stderr); - if (netdev_nit) - dev_queue_xmit_nit(skb,dev); - write (stderr_fd, str5, strlen (str5) + 1); - fflush (stderr); - if (dev->hard_start_xmit(skb, dev) == 0) { - end_bh_atomic(); - -#ifdef CONFIG_NET_PROFILE - NET_PROFILE_LEAVE(dev_queue_xmit); - end_bh_atomic(); -#endif - write (stderr_fd, str6, strlen (str6) + 1); - fflush (stderr); - - return 0; - } - if (net_ratelimit()) - printk(KERN_DEBUG "Virtual device %s asks to queue packet!\n", dev->name); - } - end_bh_atomic(); - write (stderr_fd, str6, strlen (str6) + 1); - fflush (stderr); - - kfree_skb(skb); - -#ifdef CONFIG_NET_PROFILE - NET_PROFILE_LEAVE(dev_queue_xmit); - end_bh_atomic(); -#endif - - return 0; -} - - -/*======================================================================= - Receiver rotutines - =======================================================================*/ - -int netdev_dropping = 0; -int netdev_max_backlog = 300; -atomic_t netdev_rx_dropped; -#ifdef CONFIG_CPU_IS_SLOW -int net_cpu_congestion; -#endif - -#ifdef CONFIG_NET_HW_FLOWCONTROL -int netdev_throttle_events; -static unsigned long netdev_fc_mask = 1; -unsigned long netdev_fc_xoff = 0; - -static struct -{ - void (*stimul)(struct device *); - struct device *dev; -} netdev_fc_slots[32]; - -int netdev_register_fc(struct device *dev, void (*stimul)(struct device *dev)) -{ - int bit = 0; - unsigned long flags; - - save_flags(flags); - cli(); - if (netdev_fc_mask != ~0UL) { - bit = ffz(netdev_fc_mask); - netdev_fc_slots[bit].stimul = stimul; - netdev_fc_slots[bit].dev = dev; - set_bit(bit, &netdev_fc_mask); - clear_bit(bit, &netdev_fc_xoff); - } - restore_flags(flags); - return bit; -} - -void netdev_unregister_fc(int bit) -{ - unsigned long flags; - - save_flags(flags); - cli(); - if (bit > 0) { - netdev_fc_slots[bit].stimul = NULL; - netdev_fc_slots[bit].dev = NULL; - clear_bit(bit, 
&netdev_fc_mask); - clear_bit(bit, &netdev_fc_xoff); - } - restore_flags(flags); -} - -static void netdev_wakeup(void) -{ - unsigned long xoff; - - cli(); - xoff = netdev_fc_xoff; - netdev_fc_xoff = 0; - netdev_dropping = 0; - netdev_throttle_events++; - while (xoff) { - int i = ffz(~xoff); - xoff &= ~(1<next) - if (curr->dev == dev) - { - __skb_unlink(curr, &backlog); - spin_unlock_irqrestore(&skb_queue_lock, flags); - kfree_skb(curr); - goto repeat; - } - spin_unlock_irqrestore(&skb_queue_lock, flags); -#ifdef CONFIG_NET_HW_FLOWCONTROL - if (netdev_dropping) - netdev_wakeup(); -#else - netdev_dropping = 0; -#endif - } -} - -/* - * Receive a packet from a device driver and queue it for the upper - * (protocol) levels. It always succeeds. - */ - -void netif_rx(struct sk_buff *skb) -{ -#ifndef CONFIG_CPU_IS_SLOW - if(skb->stamp.tv_sec==0) - get_fast_time(&skb->stamp); -#else - skb->stamp = xtime; -#endif - - /* The code is rearranged so that the path is the most - short when CPU is congested, but is still operating. - */ - - if (backlog.qlen <= netdev_max_backlog) { - if (backlog.qlen) { - if (netdev_dropping == 0) { - skb_queue_tail(&backlog,skb); - mark_bh(NET_BH); - return; - } - atomic_inc(&netdev_rx_dropped); - kfree_skb(skb); - return; - } -#ifdef CONFIG_NET_HW_FLOWCONTROL - if (netdev_dropping) - netdev_wakeup(); -#else - netdev_dropping = 0; -#endif - skb_queue_tail(&backlog,skb); - mark_bh(NET_BH); - return; - } - netdev_dropping = 1; - atomic_inc(&netdev_rx_dropped); - kfree_skb(skb); -} - -#ifdef CONFIG_BRIDGE -static inline void handle_bridge(struct sk_buff *skb, unsigned short type) -{ - /* - * The br_stats.flags is checked here to save the expense of a - * function call. - */ - if ((br_stats.flags & BR_UP) && br_call_bridge(skb, type)) - { - /* - * We pass the bridge a complete frame. This means - * recovering the MAC header first. - */ - - int offset; - - skb=skb_clone(skb, GFP_ATOMIC); - if(skb==NULL) - return; - - offset=skb->data-skb->mac.raw; - skb_push(skb,offset); /* Put header back on for bridge */ - - if(br_receive_frame(skb)) - return; - kfree_skb(skb); - } - return; -} -#endif - -/* - * When we are called the queue is ready to grab, the interrupts are - * on and hardware can interrupt and queue to the receive queue as we - * run with no problems. - * This is run as a bottom half after an interrupt handler that does - * mark_bh(NET_BH); - */ - -void net_bh(void) -{ - struct packet_type *ptype; - struct packet_type *pt_prev; - unsigned short type; -#ifndef _HURD_ - unsigned long start_time = jiffies; -#ifdef CONFIG_CPU_IS_SLOW - static unsigned long start_busy = 0; - static unsigned long ave_busy = 0; - - if (start_busy == 0) - start_busy = start_time; - net_cpu_congestion = ave_busy>>8; -#endif -#endif - - NET_PROFILE_ENTER(net_bh); - /* - * Can we send anything now? We want to clear the - * decks for any more sends that get done as we - * process the input. This also minimises the - * latency on a transmit interrupt bh. - */ - - if (qdisc_head.forw != &qdisc_head) - qdisc_run_queues(); - - /* - * Any data left to process. This may occur because a - * mark_bh() is done after we empty the queue including - * that from the device which does a mark_bh() just after - */ - - /* - * While the queue is not empty.. - * - * Note that the queue never shrinks due to - * an interrupt, so we can do this test without - * disabling interrupts. 
- */ - - while (!skb_queue_empty(&backlog)) - { - struct sk_buff * skb; - -#ifndef _HURD_ - /* Give chance to other bottom halves to run */ - if (jiffies - start_time > 1) - goto net_bh_break; -#endif - - /* - * We have a packet. Therefore the queue has shrunk - */ - skb = skb_dequeue(&backlog); - -#ifndef _HURD_ -#ifdef CONFIG_CPU_IS_SLOW - if (ave_busy > 128*16) { - kfree_skb(skb); - while ((skb = skb_dequeue(&backlog)) != NULL) - kfree_skb(skb); - break; - } -#endif -#endif - - -#if 0 - NET_PROFILE_SKB_PASSED(skb, net_bh_skb); -#endif -#ifdef CONFIG_NET_FASTROUTE - if (skb->pkt_type == PACKET_FASTROUTE) { - dev_queue_xmit(skb); - continue; - } -#endif - - /* - * Bump the pointer to the next structure. - * - * On entry to the protocol layer. skb->data and - * skb->nh.raw point to the MAC and encapsulated data - */ - - /* XXX until we figure out every place to modify.. */ - skb->h.raw = skb->nh.raw = skb->data; - - if (skb->mac.raw < skb->head || skb->mac.raw > skb->data) { - printk(KERN_CRIT "%s: wrong mac.raw ptr, proto=%04x\n", skb->dev->name, skb->protocol); - kfree_skb(skb); - continue; - } - - /* - * Fetch the packet protocol ID. - */ - - type = skb->protocol; - -#ifdef CONFIG_BRIDGE - /* - * If we are bridging then pass the frame up to the - * bridging code (if this protocol is to be bridged). - * If it is bridged then move on - */ - handle_bridge(skb, type); -#endif - - /* - * We got a packet ID. Now loop over the "known protocols" - * list. There are two lists. The ptype_all list of taps (normally empty) - * and the main protocol list which is hashed perfectly for normal protocols. - */ - - pt_prev = NULL; - for (ptype = ptype_all; ptype!=NULL; ptype=ptype->next) - { - if (!ptype->dev || ptype->dev == skb->dev) { - if(pt_prev) - { - struct sk_buff *skb2=skb_clone(skb, GFP_ATOMIC); - if(skb2) - pt_prev->func(skb2,skb->dev, pt_prev); - } - pt_prev=ptype; - } - } - - for (ptype = ptype_base[ntohs(type)&15]; ptype != NULL; ptype = ptype->next) - { - if (ptype->type == type && (!ptype->dev || ptype->dev==skb->dev)) - { - /* - * We already have a match queued. Deliver - * to it and then remember the new match - */ - if(pt_prev) - { - struct sk_buff *skb2; - - skb2=skb_clone(skb, GFP_ATOMIC); - - /* - * Kick the protocol handler. This should be fast - * and efficient code. - */ - - if(skb2) - pt_prev->func(skb2, skb->dev, pt_prev); - } - /* Remember the current last to do */ - pt_prev=ptype; - } - } /* End of protocol list loop */ - - /* - * Is there a last item to send to ? - */ - - if(pt_prev) { - pt_prev->func(skb, skb->dev, pt_prev); - } - /* - * Has an unknown packet has been received ? - */ - - else { - kfree_skb(skb); - } - } /* End of queue loop */ - - /* - * We have emptied the queue - */ - - /* - * One last output flush. 
- */ - - if (qdisc_head.forw != &qdisc_head) - qdisc_run_queues(); - -#ifndef _HURD_ -#ifdef CONFIG_CPU_IS_SLOW - if (1) { - unsigned long start_idle = jiffies; - ave_busy += ((start_idle - start_busy)<<3) - (ave_busy>>4); - start_busy = 0; - } -#endif -#endif -#ifdef CONFIG_NET_HW_FLOWCONTROL - if (netdev_dropping) - netdev_wakeup(); -#else - netdev_dropping = 0; -#endif - NET_PROFILE_LEAVE(net_bh); - return; - -#ifndef _HURD_ -net_bh_break: - mark_bh(NET_BH); - NET_PROFILE_LEAVE(net_bh); - return; -#endif -} - -/* Protocol dependent address dumping routines */ - -static gifconf_func_t * gifconf_list [NPROTO]; - -int register_gifconf(unsigned int family, gifconf_func_t * gifconf) -{ - if (family>=NPROTO) - return -EINVAL; - gifconf_list[family] = gifconf; - return 0; -} - - -/* - * Map an interface index to its name (SIOCGIFNAME) - */ - -/* - * This call is useful, but I'd remove it too. - * - * The reason is purely aestetical, it is the only call - * from SIOC* family using struct ifreq in reversed manner. - * Besides that, it is pretty silly to put "drawing" facility - * to kernel, it is useful only to print ifindices - * in readable form, is not it? --ANK - * - * We need this ioctl for efficient implementation of the - * if_indextoname() function required by the IPv6 API. Without - * it, we would have to search all the interfaces to find a - * match. --pb - */ - -static int dev_ifname(struct ifreq *arg) -{ - struct device *dev; - struct ifreq ifr; - int err; - - /* - * Fetch the caller's info block. - */ - - err = copy_from_user(&ifr, arg, sizeof(struct ifreq)); - if (err) - return -EFAULT; - - dev = dev_get_by_index(ifr.ifr_ifindex); - if (!dev) - return -ENODEV; - - strcpy(ifr.ifr_name, dev->name); - - err = copy_to_user(arg, &ifr, sizeof(struct ifreq)); - return (err)?-EFAULT:0; -} - -/* - * Perform a SIOCGIFCONF call. This structure will change - * size eventually, and there is nothing I can do about it. - * Thus we will need a 'compatibility mode'. - */ - -#ifdef _HURD_ -int dev_ifconf(char *arg) -#else -static int dev_ifconf(char *arg) -#endif -{ - struct ifconf ifc; - struct device *dev; - char *pos; - int len; - int total; - int i; - - /* - * Fetch the caller's info block. - */ - - if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) - return -EFAULT; - - pos = ifc.ifc_buf; - len = ifc.ifc_len; - - /* - * Loop over the interfaces, and write an info block for each. - */ - - total = 0; - for (dev = dev_base; dev != NULL; dev = dev->next) { - for (i=0; iget_stats ? dev->get_stats(dev): NULL); - int size; - - if (stats) - size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu %8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", - dev->name, - stats->rx_bytes, - stats->rx_packets, stats->rx_errors, - stats->rx_dropped + stats->rx_missed_errors, - stats->rx_fifo_errors, - stats->rx_length_errors + stats->rx_over_errors - + stats->rx_crc_errors + stats->rx_frame_errors, - stats->rx_compressed, stats->multicast, - stats->tx_bytes, - stats->tx_packets, stats->tx_errors, stats->tx_dropped, - stats->tx_fifo_errors, stats->collisions, - stats->tx_carrier_errors + stats->tx_aborted_errors - + stats->tx_window_errors + stats->tx_heartbeat_errors, - stats->tx_compressed); - else - size = sprintf(buffer, "%6s: No statistics available.\n", dev->name); - - return size; -} - -/* - * Called from the PROCfs module. 
This now uses the new arbitrary sized /proc/net interface - * to create /proc/net/dev - */ - -int dev_get_info(char *buffer, char **start, off_t offset, int length, int dummy) -{ - int len=0; - off_t begin=0; - off_t pos=0; - int size; - - struct device *dev; - - - size = sprintf(buffer, - "Inter-| Receive | Transmit\n" - " face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n"); - - pos+=size; - len+=size; - - - for (dev = dev_base; dev != NULL; dev = dev->next) - { - size = sprintf_stats(buffer+len, dev); - len+=size; - pos=begin+len; - - if(posoffset+length) - break; - } - - *start=buffer+(offset-begin); /* Start of wanted data */ - len-=(offset-begin); /* Start slop */ - if(len>length) - len=length; /* Ending slop */ - return len; -} - -static int dev_proc_stats(char *buffer, char **start, off_t offset, - int length, int *eof, void *data) -{ - int len; - - len = sprintf(buffer, "%08x %08x %08x %08x %08x\n", - atomic_read(&netdev_rx_dropped), -#ifdef CONFIG_NET_HW_FLOWCONTROL - netdev_throttle_events, -#else - 0, -#endif -#ifdef CONFIG_NET_FASTROUTE - dev_fastroute_stat.hits, - dev_fastroute_stat.succeed, - dev_fastroute_stat.deferred -#else - 0, 0, 0 -#endif - ); - - len -= offset; - - if (len > length) - len = length; - if(len < 0) - len = 0; - - *start = buffer + offset; - *eof = 1; - - return len; -} - -#endif /* CONFIG_PROC_FS */ - - -#ifdef CONFIG_NET_RADIO -#ifdef CONFIG_PROC_FS - -/* - * Print one entry of /proc/net/wireless - * This is a clone of /proc/net/dev (just above) - */ -static int sprintf_wireless_stats(char *buffer, struct device *dev) -{ - /* Get stats from the driver */ - struct iw_statistics *stats = (dev->get_wireless_stats ? - dev->get_wireless_stats(dev) : - (struct iw_statistics *) NULL); - int size; - - if(stats != (struct iw_statistics *) NULL) - { - size = sprintf(buffer, - "%6s: %04x %3d%c %3d%c %3d%c %6d %6d %6d\n", - dev->name, - stats->status, - stats->qual.qual, - stats->qual.updated & 1 ? '.' : ' ', - stats->qual.level, - stats->qual.updated & 2 ? '.' : ' ', - stats->qual.noise, - stats->qual.updated & 4 ? '.' 
: ' ', - stats->discard.nwid, - stats->discard.code, - stats->discard.misc); - stats->qual.updated = 0; - } - else - size = 0; - - return size; -} - -/* - * Print info for /proc/net/wireless (print all entries) - * This is a clone of /proc/net/dev (just above) - */ -int dev_get_wireless_info(char * buffer, char **start, off_t offset, - int length, int dummy) -{ - int len = 0; - off_t begin = 0; - off_t pos = 0; - int size; - - struct device * dev; - - size = sprintf(buffer, - "Inter-| sta-| Quality | Discarded packets\n" - " face | tus | link level noise | nwid crypt misc\n" - ); - - pos+=size; - len+=size; - - for(dev = dev_base; dev != NULL; dev = dev->next) - { - size = sprintf_wireless_stats(buffer+len, dev); - len+=size; - pos=begin+len; - - if(pos < offset) - { - len=0; - begin=pos; - } - if(pos > offset + length) - break; - } - - *start = buffer + (offset - begin); /* Start of wanted data */ - len -= (offset - begin); /* Start slop */ - if(len > length) - len = length; /* Ending slop */ - - return len; -} -#endif /* CONFIG_PROC_FS */ -#endif /* CONFIG_NET_RADIO */ - -void dev_set_promiscuity(struct device *dev, int inc) -{ - unsigned short old_flags = dev->flags; - - dev->flags |= IFF_PROMISC; - if ((dev->promiscuity += inc) == 0) - dev->flags &= ~IFF_PROMISC; - if (dev->flags^old_flags) { -#ifdef CONFIG_NET_FASTROUTE - if (dev->flags&IFF_PROMISC) { - netdev_fastroute_obstacles++; - dev_clear_fastroute(dev); - } else - netdev_fastroute_obstacles--; -#endif - dev_mc_upload(dev); - printk(KERN_INFO "device %s %s promiscuous mode\n", - dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "left"); - } -} - -void dev_set_allmulti(struct device *dev, int inc) -{ - unsigned short old_flags = dev->flags; - - dev->flags |= IFF_ALLMULTI; - if ((dev->allmulti += inc) == 0) - dev->flags &= ~IFF_ALLMULTI; - if (dev->flags^old_flags) - dev_mc_upload(dev); -} - -int dev_change_flags(struct device *dev, unsigned flags) -{ - int ret; - int old_flags = dev->flags; - - /* - * Set the flags on our device. - */ - - dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_RUNNING|IFF_NOARP| - IFF_SLAVE|IFF_MASTER|IFF_DYNAMIC| - IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) | - (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI)); - - /* - * Load in the correct multicast list now the flags have changed. - */ - - dev_mc_upload(dev); - - /* - * Have we downed the interface. We handle IFF_UP ourselves - * according to user attempts to set it, rather than blindly - * setting it. - */ - - ret = 0; - if ((old_flags^flags)&IFF_UP) /* Bit is different ? */ - { - ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); - - if (ret == 0) - dev_mc_upload(dev); - } - - if (dev->flags&IFF_UP && - ((old_flags^dev->flags)&~(IFF_UP|IFF_RUNNING|IFF_PROMISC|IFF_ALLMULTI|IFF_VOLATILE))) - notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); - - if ((flags^dev->gflags)&IFF_PROMISC) { - int inc = (flags&IFF_PROMISC) ? +1 : -1; - dev->gflags ^= IFF_PROMISC; - dev_set_promiscuity(dev, inc); - } - - /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI - is important. Some (broken) drivers set IFF_PROMISC, when - IFF_ALLMULTI is requested not asking us and not reporting. - */ - if ((flags^dev->gflags)&IFF_ALLMULTI) { - int inc = (flags&IFF_ALLMULTI) ? +1 : -1; - dev->gflags ^= IFF_ALLMULTI; - dev_set_allmulti(dev, inc); - } - - return ret; -} - -#ifdef _HURD_ - -#define dev_ioctl 0 - -#else - -/* - * Perform the SIOCxIFxxx calls. 
- */ - -static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) -{ - struct device *dev; - int err; - - if ((dev = dev_get(ifr->ifr_name)) == NULL) - return -ENODEV; - - switch(cmd) - { - case SIOCGIFFLAGS: /* Get interface flags */ - ifr->ifr_flags = (dev->flags&~(IFF_PROMISC|IFF_ALLMULTI)) - |(dev->gflags&(IFF_PROMISC|IFF_ALLMULTI)); - return 0; - - case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); - - case SIOCGIFMETRIC: /* Get the metric on the interface (currently unused) */ - ifr->ifr_metric = 0; - return 0; - - case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ - return -EOPNOTSUPP; - - case SIOCGIFMTU: /* Get the MTU of a device */ - ifr->ifr_mtu = dev->mtu; - return 0; - - case SIOCSIFMTU: /* Set the MTU of a device */ - if (ifr->ifr_mtu == dev->mtu) - return 0; - - /* - * MTU must be positive. - */ - - if (ifr->ifr_mtu<=0) - return -EINVAL; - - if (dev->change_mtu) - err = dev->change_mtu(dev, ifr->ifr_mtu); - else { - dev->mtu = ifr->ifr_mtu; - err = 0; - } - if (!err && dev->flags&IFF_UP) - notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev); - return err; - - case SIOCGIFHWADDR: - memcpy(ifr->ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN); - ifr->ifr_hwaddr.sa_family=dev->type; - return 0; - - case SIOCSIFHWADDR: - if(dev->set_mac_address==NULL) - return -EOPNOTSUPP; - if(ifr->ifr_hwaddr.sa_family!=dev->type) - return -EINVAL; - err=dev->set_mac_address(dev,&ifr->ifr_hwaddr); - if (!err) - notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); - return err; - - case SIOCSIFHWBROADCAST: - if(ifr->ifr_hwaddr.sa_family!=dev->type) - return -EINVAL; - memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, MAX_ADDR_LEN); - notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); - return 0; - - case SIOCGIFMAP: - ifr->ifr_map.mem_start=dev->mem_start; - ifr->ifr_map.mem_end=dev->mem_end; - ifr->ifr_map.base_addr=dev->base_addr; - ifr->ifr_map.irq=dev->irq; - ifr->ifr_map.dma=dev->dma; - ifr->ifr_map.port=dev->if_port; - return 0; - - case SIOCSIFMAP: - if (dev->set_config) - return dev->set_config(dev,&ifr->ifr_map); - return -EOPNOTSUPP; - - case SIOCADDMULTI: - if(dev->set_multicast_list==NULL || - ifr->ifr_hwaddr.sa_family!=AF_UNSPEC) - return -EINVAL; - dev_mc_add(dev,ifr->ifr_hwaddr.sa_data, dev->addr_len, 1); - return 0; - - case SIOCDELMULTI: - if(dev->set_multicast_list==NULL || - ifr->ifr_hwaddr.sa_family!=AF_UNSPEC) - return -EINVAL; - dev_mc_delete(dev,ifr->ifr_hwaddr.sa_data,dev->addr_len, 1); - return 0; - - case SIOCGIFINDEX: - ifr->ifr_ifindex = dev->ifindex; - return 0; - - case SIOCGIFTXQLEN: - ifr->ifr_qlen = dev->tx_queue_len; - return 0; - - case SIOCSIFTXQLEN: - if(ifr->ifr_qlen<0) - return -EINVAL; - dev->tx_queue_len = ifr->ifr_qlen; - return 0; - - case SIOCSIFNAME: - if (dev->flags&IFF_UP) - return -EBUSY; - if (dev_get(ifr->ifr_newname)) - return -EEXIST; - memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ); - dev->name[IFNAMSIZ-1] = 0; - notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); - return 0; - - /* - * Unknown or private ioctl - */ - - default: - if(cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15) { - if (dev->do_ioctl) - return dev->do_ioctl(dev, ifr, cmd); - return -EOPNOTSUPP; - } - -#ifdef CONFIG_NET_RADIO - if(cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { - if (dev->do_ioctl) - return dev->do_ioctl(dev, ifr, cmd); - return -EOPNOTSUPP; - } -#endif /* CONFIG_NET_RADIO */ - - } - return -EINVAL; -} - - -/* - * This function handles all "interface"-type I/O 
control requests. The actual - * 'doing' part of this is dev_ifsioc above. - */ - -int dev_ioctl(unsigned int cmd, void *arg) -{ - struct ifreq ifr; - int ret; - char *colon; - - /* One special case: SIOCGIFCONF takes ifconf argument - and requires shared lock, because it sleeps writing - to user space. - */ - - if (cmd == SIOCGIFCONF) { - rtnl_shlock(); - ret = dev_ifconf((char *) arg); - rtnl_shunlock(); - return ret; - } - if (cmd == SIOCGIFNAME) { - return dev_ifname((struct ifreq *)arg); - } - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; - - ifr.ifr_name[IFNAMSIZ-1] = 0; - - colon = strchr(ifr.ifr_name, ':'); - if (colon) - *colon = 0; - - /* - * See which interface the caller is talking about. - */ - - switch(cmd) - { - /* - * These ioctl calls: - * - can be done by all. - * - atomic and do not require locking. - * - return a value - */ - - case SIOCGIFFLAGS: - case SIOCGIFMETRIC: - case SIOCGIFMTU: - case SIOCGIFHWADDR: - case SIOCGIFSLAVE: - case SIOCGIFMAP: - case SIOCGIFINDEX: - case SIOCGIFTXQLEN: - dev_load(ifr.ifr_name); - ret = dev_ifsioc(&ifr, cmd); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) - return -EFAULT; - } - return ret; - - /* - * These ioctl calls: - * - require superuser power. - * - require strict serialization. - * - do not return a value - */ - - case SIOCSIFFLAGS: - case SIOCSIFMETRIC: - case SIOCSIFMTU: - case SIOCSIFMAP: - case SIOCSIFHWADDR: - case SIOCSIFSLAVE: - case SIOCADDMULTI: - case SIOCDELMULTI: - case SIOCSIFHWBROADCAST: - case SIOCSIFTXQLEN: - case SIOCSIFNAME: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - dev_load(ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(&ifr, cmd); - rtnl_unlock(); - return ret; - - case SIOCGIFMEM: - /* Get the per device memory space. We can add this but currently - do not support it */ - case SIOCSIFMEM: - /* Set the per device memory buffer space. Not applicable in our case */ - case SIOCSIFLINK: - return -EINVAL; - - /* - * Unknown or private ioctl. - */ - - default: - if (cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15) { - dev_load(ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(&ifr, cmd); - rtnl_unlock(); - if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) - return -EFAULT; - return ret; - } -#ifdef CONFIG_NET_RADIO - if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { - dev_load(ifr.ifr_name); - if (IW_IS_SET(cmd)) { - if (!suser()) - return -EPERM; - rtnl_lock(); - } - ret = dev_ifsioc(&ifr, cmd); - if (IW_IS_SET(cmd)) - rtnl_unlock(); - if (!ret && IW_IS_GET(cmd) && - copy_to_user(arg, &ifr, sizeof(struct ifreq))) - return -EFAULT; - return ret; - } -#endif /* CONFIG_NET_RADIO */ - return -EINVAL; - } -} - -#endif - -int dev_new_index(void) -{ - static int ifindex; - for (;;) { - if (++ifindex <= 0) - ifindex=1; - if (dev_get_by_index(ifindex) == NULL) - return ifindex; - } -} - -static int dev_boot_phase = 1; - - -int register_netdevice(struct device *dev) -{ - struct device *d, **dp; - - if (dev_boot_phase) { - /* This is NOT bug, but I am not sure, that all the - devices, initialized before netdev module is started - are sane. - - Now they are chained to device boot list - and probed later. If a module is initialized - before netdev, but assumes that dev->init - is really called by register_netdev(), it will fail. - - So that this message should be printed for a while. 
- */ - printk(KERN_INFO "early initialization of device %s is deferred\n", dev->name); - - /* Check for existence, and append to tail of chain */ - for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) { - if (d == dev || strcmp(d->name, dev->name) == 0) - return -EEXIST; - } - dev->next = NULL; - *dp = dev; - return 0; - } - - dev->iflink = -1; - - /* Init, if this function is available */ - if (dev->init && dev->init(dev) != 0) - return -EIO; - - /* Check for existence, and append to tail of chain */ - for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) { - if (d == dev || strcmp(d->name, dev->name) == 0) - return -EEXIST; - } - dev->next = NULL; - dev_init_scheduler(dev); - dev->ifindex = dev_new_index(); - if (dev->iflink == -1) - dev->iflink = dev->ifindex; - *dp = dev; - - /* Notify protocols, that a new device appeared. */ - notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); - - return 0; -} - -int unregister_netdevice(struct device *dev) -{ - struct device *d, **dp; - - if (dev_boot_phase == 0) { - /* If device is running, close it. - It is very bad idea, really we should - complain loudly here, but random hackery - in linux/drivers/net likes it. - */ - if (dev->flags & IFF_UP) - dev_close(dev); - -#ifdef CONFIG_NET_FASTROUTE - dev_clear_fastroute(dev); -#endif - - /* Shutdown queueing discipline. */ - dev_shutdown(dev); - - /* Notify protocols, that we are about to destroy - this device. They should clean all the things. - */ - notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); - - /* - * Flush the multicast chain - */ - dev_mc_discard(dev); - - /* To avoid pointers looking to nowhere, - we wait for end of critical section */ - dev_lock_wait(); - } - - /* And unlink it from device chain. */ - for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) { - if (d == dev) { - *dp = d->next; - synchronize_bh(); - d->next = NULL; - - if (dev->destructor) - dev->destructor(dev); - return 0; - } - } - return -ENODEV; -} - - -/* - * Initialize the DEV module. At boot time this walks the device list and - * unhooks any devices that fail to initialise (normally hardware not - * present) and leaves us with a valid list of present and active devices. - * - */ -extern int lance_init(void); -extern int bpq_init(void); -extern int scc_init(void); -extern void sdla_setup(void); -extern void sdla_c_setup(void); -extern void dlci_setup(void); -extern int dmascc_init(void); -extern int sm_init(void); - -extern int baycom_ser_fdx_init(void); -extern int baycom_ser_hdx_init(void); -extern int baycom_par_init(void); - -extern int lapbeth_init(void); -extern int comx_init(void); -extern void arcnet_init(void); -extern void ip_auto_config(void); -#ifdef CONFIG_8xx -extern int cpm_enet_init(void); -#endif /* CONFIG_8xx */ - -#ifdef CONFIG_PROC_FS -static struct proc_dir_entry proc_net_dev = { - PROC_NET_DEV, 3, "dev", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - dev_get_info -}; -#endif - -#ifdef CONFIG_NET_RADIO -#ifdef CONFIG_PROC_FS -static struct proc_dir_entry proc_net_wireless = { - PROC_NET_WIRELESS, 8, "wireless", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - dev_get_wireless_info -}; -#endif /* CONFIG_PROC_FS */ -#endif /* CONFIG_NET_RADIO */ - -__initfunc(int net_dev_init(void)) -{ - struct device *dev, **dp; - -#ifdef CONFIG_NET_SCHED - pktsched_init(); -#endif - - /* - * Initialise the packet receive queue. 
- */ - - skb_queue_head_init(&backlog); - - /* - * The bridge has to be up before the devices - */ - -#ifdef CONFIG_BRIDGE - br_init(); -#endif - - /* - * This is Very Ugly(tm). - * - * Some devices want to be initialized early.. - */ - -#if defined(CONFIG_SCC) - scc_init(); -#endif -#if defined(CONFIG_DMASCC) - dmascc_init(); -#endif -#if defined(CONFIG_BPQETHER) - bpq_init(); -#endif -#if defined(CONFIG_DLCI) - dlci_setup(); -#endif -#if defined(CONFIG_SDLA) - sdla_c_setup(); -#endif -#if defined(CONFIG_BAYCOM_PAR) - baycom_par_init(); -#endif -#if defined(CONFIG_BAYCOM_SER_FDX) - baycom_ser_fdx_init(); -#endif -#if defined(CONFIG_BAYCOM_SER_HDX) - baycom_ser_hdx_init(); -#endif -#if defined(CONFIG_SOUNDMODEM) - sm_init(); -#endif -#if defined(CONFIG_LAPBETHER) - lapbeth_init(); -#endif -#if defined(CONFIG_PLIP) - plip_init(); -#endif -#if defined(CONFIG_ARCNET) - arcnet_init(); -#endif -#if defined(CONFIG_8xx) - cpm_enet_init(); -#endif -#if defined(CONFIG_COMX) - comx_init(); -#endif - /* - * SLHC if present needs attaching so other people see it - * even if not opened. - */ - -#ifdef CONFIG_INET -#if (defined(CONFIG_SLIP) && defined(CONFIG_SLIP_COMPRESSED)) \ - || defined(CONFIG_PPP) \ - || (defined(CONFIG_ISDN) && defined(CONFIG_ISDN_PPP)) - slhc_install(); -#endif -#endif - -#ifdef CONFIG_NET_PROFILE - net_profile_init(); - NET_PROFILE_REGISTER(dev_queue_xmit); - NET_PROFILE_REGISTER(net_bh); -#if 0 - NET_PROFILE_REGISTER(net_bh_skb); -#endif -#endif - /* - * Add the devices. - * If the call to dev->init fails, the dev is removed - * from the chain disconnecting the device until the - * next reboot. - */ - - dp = &dev_base; - while ((dev = *dp) != NULL) - { - dev->iflink = -1; - if (dev->init && dev->init(dev)) - { - /* - * It failed to come up. Unhook it. - */ - *dp = dev->next; - synchronize_bh(); - } - else - { - dp = &dev->next; - dev->ifindex = dev_new_index(); - if (dev->iflink == -1) - dev->iflink = dev->ifindex; - dev_init_scheduler(dev); - } - } - -#ifdef CONFIG_PROC_FS - proc_net_register(&proc_net_dev); - { - struct proc_dir_entry *ent = create_proc_entry("net/dev_stat", 0, 0); - ent->read_proc = dev_proc_stats; - } -#endif - -#ifdef CONFIG_NET_RADIO -#ifdef CONFIG_PROC_FS - proc_net_register(&proc_net_wireless); -#endif /* CONFIG_PROC_FS */ -#endif /* CONFIG_NET_RADIO */ - - init_bh(NET_BH, net_bh); - - dev_boot_phase = 0; - - dev_mcast_init(); - -#ifdef CONFIG_BRIDGE - /* - * Register any statically linked ethernet devices with the bridge - */ - br_spacedevice_register(); -#endif - -#ifdef CONFIG_IP_PNP - ip_auto_config(); -#endif - - return 0; -} diff --git a/pfinet.old/linux-src/net/ipv4/ip_output.c~ b/pfinet.old/linux-src/net/ipv4/ip_output.c~ deleted file mode 100644 index 89272d6b..00000000 --- a/pfinet.old/linux-src/net/ipv4/ip_output.c~ +++ /dev/null @@ -1,1000 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * The Internet Protocol (IP) output module. - * - * Version: $Id: ip_output.c,v 1.67.2.1 1999/09/07 02:25:23 davem Exp $ - * - * Authors: Ross Biro, - * Fred N. van Kempen, - * Donald Becker, - * Alan Cox, - * Richard Underwood - * Stefan Becker, - * Jorge Cwik, - * Arnt Gulbrandsen, - * - * See ip_input.c for original log - * - * Fixes: - * Alan Cox : Missing nonblock feature in ip_build_xmit. - * Mike Kilburn : htons() missing in ip_build_xmit. 
- * Bradford Johnson: Fix faulty handling of some frames when - * no route is found. - * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit - * (in case if packet not accepted by - * output firewall rules) - * Mike McLagan : Routing by source - * Alexey Kuznetsov: use new route cache - * Andi Kleen: Fix broken PMTU recovery and remove - * some redundant tests. - * Vitaly E. Lavrov : Transparent proxy revived after year coma. - * Andi Kleen : Replace ip_reply with ip_send_reply. - * Andi Kleen : Split fast and slow ip_build_xmit path - * for decreased register pressure on x86 - * and more readibility. - * Marc Boucher : When call_out_firewall returns FW_QUEUE, - * silently drop skb instead of failing with -EPERM. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Shall we try to damage output packets if routing dev changes? - */ - -int sysctl_ip_dynaddr = 0; - - -int ip_id_count = 0; - -/* Generate a checksum for an outgoing IP datagram. */ -__inline__ void ip_send_check(struct iphdr *iph) -{ - iph->check = 0; - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); -} - -/* - * Add an ip header to a skbuff and send it out. - */ -void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, - u32 saddr, u32 daddr, struct ip_options *opt) -{ - struct rtable *rt = (struct rtable *)skb->dst; - struct iphdr *iph; - struct device *dev; - - /* Build the IP header. */ - if (opt) - iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen); - else - iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr)); - - iph->version = 4; - iph->ihl = 5; - iph->tos = sk->ip_tos; - iph->frag_off = 0; - if (ip_dont_fragment(sk, &rt->u.dst)) - iph->frag_off |= htons(IP_DF); - iph->ttl = sk->ip_ttl; - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; - iph->protocol = sk->protocol; - iph->tot_len = htons(skb->len); - iph->id = htons(ip_id_count++); - skb->nh.iph = iph; - - if (opt && opt->optlen) { - iph->ihl += opt->optlen>>2; - ip_options_build(skb, opt, daddr, rt, 0); - } - - dev = rt->u.dst.dev; - -#ifdef CONFIG_FIREWALL - /* Now we have no better mechanism to notify about error. */ - switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) { - case FW_REJECT: - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - /* Fall thru... */ - case FW_BLOCK: - case FW_QUEUE: - kfree_skb(skb); - return; - } -#endif - - ip_send_check(iph); - - /* Send it out. */ - skb->dst->output(skb); - return; -} - -int __ip_finish_output(struct sk_buff *skb) -{ - return ip_finish_output(skb); -} - -int ip_mc_output(struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - struct rtable *rt = (struct rtable*)skb->dst; - struct device *dev = rt->u.dst.dev; - - /* - * If the indicated interface is up and running, send the packet. 
- */ - - ip_statistics.IpOutRequests++; -#ifdef CONFIG_IP_ROUTE_NAT - if (rt->rt_flags & RTCF_NAT) - ip_do_nat(skb); -#endif - - skb->dev = dev; - skb->protocol = __constant_htons(ETH_P_IP); - - /* - * Multicasts are looped back for other local users - */ - - if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->ip_mc_loop)) { -#ifdef CONFIG_IP_MROUTE - /* Small optimization: do not loopback not local frames, - which returned after forwarding; they will be dropped - by ip_mr_input in any case. - Note, that local frames are looped back to be delivered - to local recipients. - - This check is duplicated in ip_mr_input at the moment. - */ - if ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED)) -#endif - dev_loopback_xmit(skb); - - /* Multicasts with ttl 0 must not go beyond the host */ - - if (skb->nh.iph->ttl == 0) { - kfree_skb(skb); - return 0; - } - } - - if (rt->rt_flags&RTCF_BROADCAST) - dev_loopback_xmit(skb); - - return ip_finish_output(skb); -} - -int ip_output(struct sk_buff *skb) -{ - char *str1 = "pfinet ip_output check point 1\n"; - char *str2 = "pfinet ip_output check point 2\n"; - int stderr_fd = fileno (stderr); - int ret; - -#ifdef CONFIG_IP_ROUTE_NAT - struct rtable *rt = (struct rtable*)skb->dst; -#endif - - ip_statistics.IpOutRequests++; - -#ifdef CONFIG_IP_ROUTE_NAT - if (rt->rt_flags&RTCF_NAT) - ip_do_nat(skb); -#endif - - write (stderr_fd, str1, strlen (str1) + 1); - fflush (stderr); - ret = ip_finish_output(skb); - return ret; -} - -/* Queues a packet to be sent, and starts the transmitter if necessary. - * This routine also needs to put in the total length and compute the - * checksum. We use to do this in two stages, ip_build_header() then - * this, but that scheme created a mess when routes disappeared etc. - * So we do it all here, and the TCP send engine has been changed to - * match. (No more unroutable FIN disasters, etc. wheee...) This will - * most likely make other reliable transport layers above IP easier - * to implement under Linux. - */ -void ip_queue_xmit(struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - struct ip_options *opt = sk->opt; - struct rtable *rt; - struct device *dev; - struct iphdr *iph; - unsigned int tot_len; - - /* Make sure we can route this packet. */ - rt = (struct rtable *) sk->dst_cache; - if(rt == NULL || rt->u.dst.obsolete) { - u32 daddr; - - sk->dst_cache = NULL; - ip_rt_put(rt); - - /* Use correct destination address if we have options. */ - daddr = sk->daddr; - if(opt && opt->srr) - daddr = opt->faddr; - - /* If this fails, retransmit mechanism of transport layer will - * keep trying until route appears or the connection times itself - * out. - */ - if(ip_route_output(&rt, daddr, sk->saddr, - RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute, - sk->bound_dev_if)) - goto drop; - sk->dst_cache = &rt->u.dst; - } - if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) - goto no_route; - - /* We have a route, so grab a reference. */ - skb->dst = dst_clone(sk->dst_cache); - - /* OK, we know where to send it, allocate and build IP header. */ - iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); - iph->version = 4; - iph->ihl = 5; - iph->tos = sk->ip_tos; - iph->frag_off = 0; - iph->ttl = sk->ip_ttl; - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; - iph->protocol = sk->protocol; - skb->nh.iph = iph; - /* Transport layer set skb->h.foo itself. 
*/ - - if(opt && opt->optlen) { - iph->ihl += opt->optlen >> 2; - ip_options_build(skb, opt, sk->daddr, rt, 0); - } - - tot_len = skb->len; - iph->tot_len = htons(tot_len); - iph->id = htons(ip_id_count++); - - dev = rt->u.dst.dev; - -#ifdef CONFIG_FIREWALL - /* Now we have no better mechanism to notify about error. */ - switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) { - case FW_REJECT: - start_bh_atomic(); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - end_bh_atomic(); - /* Fall thru... */ - case FW_BLOCK: - case FW_QUEUE: - goto drop; - } -#endif - - /* This can happen when the transport layer has segments queued - * with a cached route, and by the time we get here things are - * re-routed to a device with a different MTU than the original - * device. Sick, but we must cover it. - */ - if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) { - struct sk_buff *skb2; - - skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15); - kfree_skb(skb); - if (skb2 == NULL) - return; - if (sk) - skb_set_owner_w(skb2, sk); - skb = skb2; - iph = skb->nh.iph; - } - - /* Do we need to fragment. Again this is inefficient. We - * need to somehow lock the original buffer and use bits of it. - */ - if (tot_len > rt->u.dst.pmtu) - goto fragment; - - if (ip_dont_fragment(sk, &rt->u.dst)) - iph->frag_off |= __constant_htons(IP_DF); - - /* Add an IP checksum. */ - ip_send_check(iph); - - skb->priority = sk->priority; - skb->dst->output(skb); - return; - -fragment: - if (ip_dont_fragment(sk, &rt->u.dst) && - tot_len > (iph->ihl<<2) + sizeof(struct tcphdr)+16) { - /* Reject packet ONLY if TCP might fragment - it itself, if were careful enough. - Test is not precise (f.e. it does not take sacks - into account). Actually, tcp should make it. --ANK (980801) - */ - iph->frag_off |= __constant_htons(IP_DF); - NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big to self\n")); - - /* icmp_send is not reenterable, so that bh_atomic... --ANK */ - start_bh_atomic(); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(rt->u.dst.pmtu)); - end_bh_atomic(); - goto drop; - } - ip_fragment(skb, skb->dst->output); - return; - -no_route: - sk->dst_cache = NULL; - ip_rt_put(rt); - ip_statistics.IpOutNoRoutes++; - /* Fall through... */ -drop: - kfree_skb(skb); -} - -/* - * Build and send a packet, with as little as one copy - * - * Doesn't care much about ip options... option length can be - * different for fragment at 0 and other fragments. - * - * Note that the fragment at the highest offset is sent first, - * so the getfrag routine can fill in the TCP/UDP checksum header - * field in the last fragment it sends... actually it also helps - * the reassemblers, they can put most packets in at the head of - * the fragment queue, and they know the total size in advance. This - * last feature will measurably improve the Linux fragment handler one - * day. - * - * The callback has five args, an arbitrary pointer (copy of frag), - * the source IP address (may depend on the routing table), the - * destination address (char *), the offset to copy from, and the - * length to be copied. 
- */ - -int ip_build_xmit_slow(struct sock *sk, - int getfrag (const void *, - char *, - unsigned int, - unsigned int), - const void *frag, - unsigned length, - struct ipcm_cookie *ipc, - struct rtable *rt, - int flags) -{ - unsigned int fraglen, maxfraglen, fragheaderlen; - int err; - int offset, mf; - int mtu; - unsigned short id; - - int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15; - int nfrags=0; - struct ip_options *opt = ipc->opt; - int df = 0; - - mtu = rt->u.dst.pmtu; - if (ip_dont_fragment(sk, &rt->u.dst)) - df = htons(IP_DF); - - length -= sizeof(struct iphdr); - - if (opt) { - fragheaderlen = sizeof(struct iphdr) + opt->optlen; - maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen; - } else { - fragheaderlen = sizeof(struct iphdr); - - /* - * Fragheaderlen is the size of 'overhead' on each buffer. Now work - * out the size of the frames to send. - */ - - maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen; - } - - if (length + fragheaderlen > 0xFFFF) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu); - return -EMSGSIZE; - } - - /* - * Start at the end of the frame by handling the remainder. - */ - - offset = length - (length % (maxfraglen - fragheaderlen)); - - /* - * Amount of memory to allocate for final fragment. - */ - - fraglen = length - offset + fragheaderlen; - - if (length-offset==0) { - fraglen = maxfraglen; - offset -= maxfraglen-fragheaderlen; - } - - - /* - * The last fragment will not have MF (more fragments) set. - */ - - mf = 0; - - /* - * Don't fragment packets for path mtu discovery. - */ - - if (offset > 0 && df) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu); - return(-EMSGSIZE); - } - - /* - * Lock the device lists. - */ - - dev_lock_list(); - - /* - * Get an identifier - */ - - id = htons(ip_id_count++); - - /* - * Begin outputting the bytes. - */ - - do { - char *data; - struct sk_buff * skb; - - /* - * Get the memory we require with some space left for alignment. - */ - - skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &err); - if (skb == NULL) - goto error; - - /* - * Fill in the control structures - */ - - skb->priority = sk->priority; - skb->dst = dst_clone(&rt->u.dst); - skb_reserve(skb, hh_len); - - /* - * Find where to start putting bytes. - */ - - data = skb_put(skb, fraglen); - skb->nh.iph = (struct iphdr *)data; - - /* - * Only write IP header onto non-raw packets - */ - - { - struct iphdr *iph = (struct iphdr *)data; - - iph->version = 4; - iph->ihl = 5; - if (opt) { - iph->ihl += opt->optlen>>2; - ip_options_build(skb, opt, - ipc->addr, rt, offset); - } - iph->tos = sk->ip_tos; - iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4); - iph->id = id; - iph->frag_off = htons(offset>>3); - iph->frag_off |= mf|df; - if (rt->rt_type == RTN_MULTICAST) - iph->ttl = sk->ip_mc_ttl; - else - iph->ttl = sk->ip_ttl; - iph->protocol = sk->protocol; - iph->check = 0; - iph->saddr = rt->rt_src; - iph->daddr = rt->rt_dst; - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - data += iph->ihl*4; - - /* - * Any further fragments will have MF set. 
- */ - - mf = htons(IP_MF); - } - - /* - * User data callback - */ - - if (getfrag(frag, data, offset, fraglen-fragheaderlen)) { - err = -EFAULT; - kfree_skb(skb); - goto error; - } - - offset -= (maxfraglen-fragheaderlen); - fraglen = maxfraglen; - - nfrags++; - -#ifdef CONFIG_FIREWALL - switch (call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb)) { - case FW_QUEUE: - kfree_skb(skb); - continue; - case FW_BLOCK: - case FW_REJECT: - kfree_skb(skb); - err = -EPERM; - goto error; - } -#endif - - err = -ENETDOWN; - if (rt->u.dst.output(skb)) - goto error; - } while (offset >= 0); - - if (nfrags>1) - ip_statistics.IpFragCreates += nfrags; - dev_unlock_list(); - return 0; - -error: - ip_statistics.IpOutDiscards++; - if (nfrags>1) - ip_statistics.IpFragCreates += nfrags; - dev_unlock_list(); - return err; -} - - -/* - * Fast path for unfragmented packets. - */ -int ip_build_xmit(struct sock *sk, - int getfrag (const void *, - char *, - unsigned int, - unsigned int), - const void *frag, - unsigned length, - struct ipcm_cookie *ipc, - struct rtable *rt, - int flags) -{ - int err; - struct sk_buff *skb; - int df; - struct iphdr *iph; - - /* - * Try the simple case first. This leaves fragmented frames, and by - * choice RAW frames within 20 bytes of maximum size(rare) to the long path - */ - - if (!sk->ip_hdrincl) { - length += sizeof(struct iphdr); - - /* - * Check for slow path. - */ - if (length > rt->u.dst.pmtu || ipc->opt != NULL) - return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags); - } else { - if (length > rt->u.dst.dev->mtu) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu); - return -EMSGSIZE; - } - } - - /* - * Do path mtu discovery if needed. - */ - df = 0; - if (ip_dont_fragment(sk, &rt->u.dst)) - df = htons(IP_DF); - - /* - * Fast path for unfragmented frames without options. - */ - { - int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15; - - skb = sock_alloc_send_skb(sk, length+hh_len+15, - 0, flags&MSG_DONTWAIT, &err); - if(skb==NULL) - goto error; - skb_reserve(skb, hh_len); - } - - skb->priority = sk->priority; - skb->dst = dst_clone(&rt->u.dst); - - skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length); - - dev_lock_list(); - - if(!sk->ip_hdrincl) { - iph->version=4; - iph->ihl=5; - iph->tos=sk->ip_tos; - iph->tot_len = htons(length); - iph->id=htons(ip_id_count++); - iph->frag_off = df; - iph->ttl=sk->ip_mc_ttl; - if (rt->rt_type != RTN_MULTICAST) - iph->ttl=sk->ip_ttl; - iph->protocol=sk->protocol; - iph->saddr=rt->rt_src; - iph->daddr=rt->rt_dst; - iph->check=0; - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4); - } - else - err = getfrag(frag, (void *)iph, 0, length); - - dev_unlock_list(); - - if (err) - goto error_fault; - -#ifdef CONFIG_FIREWALL - switch (call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb)) { - case FW_QUEUE: - kfree_skb(skb); - return 0; - case FW_BLOCK: - case FW_REJECT: - kfree_skb(skb); - err = -EPERM; - goto error; - } -#endif - - return rt->u.dst.output(skb); - -error_fault: - err = -EFAULT; - kfree_skb(skb); -error: - ip_statistics.IpOutDiscards++; - return err; -} - - - -/* - * This IP datagram is too large to be sent in one piece. Break it up into - * smaller pieces (each of size equal to IP header plus - * a block of the data of the original IP data part) that will yet fit in a - * single device frame, and queue such a frame for sending. - * - * Yes this is inefficient, feel free to submit a quicker one. 
- */ - -void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) -{ - struct iphdr *iph; - unsigned char *raw; - unsigned char *ptr; - struct device *dev; - struct sk_buff *skb2; - unsigned int mtu, hlen, left, len; - int offset; - int not_last_frag; - struct rtable *rt = (struct rtable*)skb->dst; - - dev = rt->u.dst.dev; - - /* - * Point into the IP datagram header. - */ - - raw = skb->nh.raw; - iph = (struct iphdr*)raw; - - /* - * Setup starting values. - */ - - hlen = iph->ihl * 4; - left = ntohs(iph->tot_len) - hlen; /* Space per frame */ - mtu = rt->u.dst.pmtu - hlen; /* Size of data space */ - ptr = raw + hlen; /* Where to start from */ - - /* - * The protocol doesn't seem to say what to do in the case that the - * frame + options doesn't fit the mtu. As it used to fall down dead - * in this case we were fortunate it didn't happen - * - * It is impossible, because mtu>=68. --ANK (980801) - */ - -#ifdef CONFIG_NET_PARANOIA - if (mtu<8) - goto fail; -#endif - - /* - * Fragment the datagram. - */ - - offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; - not_last_frag = iph->frag_off & htons(IP_MF); - - /* - * Keep copying data until we run out. - */ - - while(left > 0) { - len = left; - /* IF: it doesn't fit, use 'mtu' - the data space left */ - if (len > mtu) - len = mtu; - /* IF: we are not sending upto and including the packet end - then align the next start on an eight byte boundary */ - if (len < left) { - len &= ~7; - } - /* - * Allocate buffer. - */ - - if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) { - NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); - goto fail; - } - - /* - * Set up data on packet - */ - - skb2->pkt_type = skb->pkt_type; - skb2->priority = skb->priority; - skb_reserve(skb2, (dev->hard_header_len+15)&~15); - skb_put(skb2, len + hlen); - skb2->nh.raw = skb2->data; - skb2->h.raw = skb2->data + hlen; - - /* - * Charge the memory for the fragment to any owner - * it might possess - */ - - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - skb2->dst = dst_clone(skb->dst); - - /* - * Copy the packet header into the new buffer. - */ - - memcpy(skb2->nh.raw, raw, hlen); - - /* - * Copy a block of the IP datagram. - */ - memcpy(skb2->h.raw, ptr, len); - left -= len; - - /* - * Fill in the new header fields. - */ - iph = skb2->nh.iph; - iph->frag_off = htons((offset >> 3)); - - /* ANK: dirty, but effective trick. Upgrade options only if - * the segment to be fragmented was THE FIRST (otherwise, - * options are already fixed) and make it ONCE - * on the initial skb, so that all the following fragments - * will inherit fixed options. - */ - if (offset == 0) - ip_options_fragment(skb); - - /* - * Added AC : If we are fragmenting a fragment that's not the - * last fragment then keep MF on each bit - */ - if (left > 0 || not_last_frag) - iph->frag_off |= htons(IP_MF); - ptr += len; - offset += len; - - /* - * Put this fragment into the sending queue. - */ - - ip_statistics.IpFragCreates++; - - iph->tot_len = htons(len + hlen); - - ip_send_check(iph); - - output(skb2); - } - kfree_skb(skb); - ip_statistics.IpFragOKs++; - return; - -fail: - kfree_skb(skb); - ip_statistics.IpFragFails++; -} - -/* - * Fetch data from kernel space and fill in checksum if needed. 
- */ -static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset, - unsigned int fraglen) -{ - struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr; - u16 *pktp = (u16 *)to; - struct iovec *iov; - int len; - int hdrflag = 1; - - iov = &dp->iov[0]; - if (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - hdrflag = 0; - } - len = iov->iov_len - offset; - if (fraglen > len) { /* overlapping. */ - dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len, - dp->csum); - offset = 0; - fraglen -= len; - to += len; - iov++; - } - - dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen, - dp->csum); - - if (hdrflag && dp->csumoffset) - *(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */ - return 0; -} - -/* - * Generic function to send a packet as reply to another packet. - * Used to send TCP resets so far. ICMP should use this function too. - * - * Should run single threaded per socket because it uses the sock - * structure to pass arguments. - */ -void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, - unsigned int len) -{ - struct { - struct ip_options opt; - char data[40]; - } replyopts; - struct ipcm_cookie ipc; - u32 daddr; - struct rtable *rt = (struct rtable*)skb->dst; - - if (ip_options_echo(&replyopts.opt, skb)) - return; - - sk->ip_tos = skb->nh.iph->tos; - sk->priority = skb->priority; - sk->protocol = skb->nh.iph->protocol; - - daddr = ipc.addr = rt->rt_src; - ipc.opt = &replyopts.opt; - - if (ipc.opt->srr) - daddr = replyopts.opt.faddr; - if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0)) - return; - - /* And let IP do all the hard work. */ - ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT); - ip_rt_put(rt); -} - -/* - * IP protocol layer initialiser - */ - -static struct packet_type ip_packet_type = -{ - __constant_htons(ETH_P_IP), - NULL, /* All devices */ - ip_rcv, - NULL, - NULL, -}; - - - -#ifdef CONFIG_PROC_FS -#ifdef CONFIG_IP_MULTICAST -static struct proc_dir_entry proc_net_igmp = { - PROC_NET_IGMP, 4, "igmp", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - ip_mc_procinfo -}; -#endif -#endif - -/* - * IP registers the packet type and then calls the subprotocol initialisers - */ - -__initfunc(void ip_init(void)) -{ - dev_add_pack(&ip_packet_type); - - ip_rt_init(); - -#ifdef CONFIG_PROC_FS -#ifdef CONFIG_IP_MULTICAST - proc_net_register(&proc_net_igmp); -#endif -#endif -} - diff --git a/pfinet.old/linux-src/net/ipv4/tcp_input.c~ b/pfinet.old/linux-src/net/ipv4/tcp_input.c~ deleted file mode 100644 index c5095624..00000000 --- a/pfinet.old/linux-src/net/ipv4/tcp_input.c~ +++ /dev/null @@ -1,2449 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * Implementation of the Transmission Control Protocol(TCP). - * - * Version: $Id: tcp_input.c,v 1.164.2.8 1999/09/23 19:21:23 davem Exp $ - * - * Authors: Ross Biro, - * Fred N. van Kempen, - * Mark Evans, - * Corey Minyard - * Florian La Roche, - * Charles Hedrick, - * Linus Torvalds, - * Alan Cox, - * Matthew Dillon, - * Arnt Gulbrandsen, - * Jorge Cwik, - */ - -/* - * Changes: - * Pedro Roque : Fast Retransmit/Recovery. - * Two receive queues. - * Retransmit queue handled by TCP. - * Better retransmit timer handling. - * New congestion avoidance. - * Header prediction. - * Variable renaming. 
- * - * Eric : Fast Retransmit. - * Randy Scott : MSS option defines. - * Eric Schenk : Fixes to slow start algorithm. - * Eric Schenk : Yet another double ACK bug. - * Eric Schenk : Delayed ACK bug fixes. - * Eric Schenk : Floyd style fast retrans war avoidance. - * David S. Miller : Don't allow zero congestion window. - * Eric Schenk : Fix retransmitter so that it sends - * next packet on ack of previous packet. - * Andi Kleen : Moved open_request checking here - * and process RSTs for open_requests. - * Andi Kleen : Better prune_queue, and other fixes. - * Andrey Savochkin: Fix RTT measurements in the presnce of - * timestamps. - * Andrey Savochkin: Check sequence numbers correctly when - * removing SACKs due to in sequence incoming - * data segments. - * Andi Kleen: Make sure we never ack data there is not - * enough room for. Also make this condition - * a fatal error if it might still happen. - * Andi Kleen: Add tcp_measure_rcv_mss to make - * connections with MSS -#include -#include -#include -#include - -#ifdef CONFIG_SYSCTL -#define SYNC_INIT 0 /* let the user enable it */ -#else -#define SYNC_INIT 1 -#endif - -extern int sysctl_tcp_fin_timeout; - -/* These are on by default so the code paths get tested. - * For the final 2.2 this may be undone at our discretion. -DaveM - */ -int sysctl_tcp_timestamps = 1; -int sysctl_tcp_window_scaling = 1; -int sysctl_tcp_sack = 1; - -int sysctl_tcp_syncookies = SYNC_INIT; -int sysctl_tcp_stdurg; -int sysctl_tcp_rfc1337; - -static int prune_queue(struct sock *sk); - -/* There is something which you must keep in mind when you analyze the - * behavior of the tp->ato delayed ack timeout interval. When a - * connection starts up, we want to ack as quickly as possible. The - * problem is that "good" TCP's do slow start at the beginning of data - * transmission. The means that until we send the first few ACK's the - * sender will sit on his end and only queue most of his data, because - * he can only send snd_cwnd unacked packets at any given time. For - * each ACK we send, he increments snd_cwnd and transmits more of his - * queue. -DaveM - */ -static void tcp_delack_estimator(struct tcp_opt *tp) -{ - if(tp->ato == 0) { - tp->lrcvtime = tcp_time_stamp; - - /* Help sender leave slow start quickly, - * and also makes sure we do not take this - * branch ever again for this connection. - */ - tp->ato = 1; - tcp_enter_quickack_mode(tp); - } else { - int m = tcp_time_stamp - tp->lrcvtime; - - tp->lrcvtime = tcp_time_stamp; - if(m <= 0) - m = 1; - if(m > tp->rto) - tp->ato = tp->rto; - else { - /* This funny shift makes sure we - * clear the "quick ack mode" bit. - */ - tp->ato = ((tp->ato << 1) >> 2) + m; - } - } -} - -/* - * Remember to send an ACK later. - */ -static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th, - struct sk_buff *skb) -{ - tp->delayed_acks++; - - /* Tiny-grams with PSH set artifically deflate our - * ato measurement, but with a lower bound. - */ - if(th->psh && (skb->len < (tp->mss_cache >> 1))) { - /* Preserve the quickack state. */ - if((tp->ato & 0x7fffffff) > HZ/50) - tp->ato = ((tp->ato & 0x80000000) | - (HZ/50)); - } -} - -/* Called to compute a smoothed rtt estimate. The data fed to this - * routine either comes from timestamps, or from segments that were - * known _not_ to have been retransmitted [see Karn/Partridge - * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 - * piece by Van Jacobson. - * NOTE: the next three routines used to be one big routine. 
- * To save cycles in the RFC 1323 implementation it was better to break - * it up into three procedures. -- erics - */ - -static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) -{ - long m = mrtt; /* RTT */ - - /* The following amusing code comes from Jacobson's - * article in SIGCOMM '88. Note that rtt and mdev - * are scaled versions of rtt and mean deviation. - * This is designed to be as fast as possible - * m stands for "measurement". - * - * On a 1990 paper the rto value is changed to: - * RTO = rtt + 4 * mdev - */ - if(m == 0) - m = 1; - if (tp->srtt != 0) { - m -= (tp->srtt >> 3); /* m is now error in rtt est */ - tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ - if (m < 0) - m = -m; /* m is now abs(error) */ - m -= (tp->mdev >> 2); /* similar update on mdev */ - tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ - } else { - /* no previous measure. */ - tp->srtt = m<<3; /* take the measured time to be rtt */ - tp->mdev = m<<2; /* make sure rto = 3*rtt */ - } -} - -/* Calculate rto without backoff. This is the second half of Van Jacobson's - * routine referred to above. - */ - -static __inline__ void tcp_set_rto(struct tcp_opt *tp) -{ - tp->rto = (tp->srtt >> 3) + tp->mdev; - tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1)); -} - - -/* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound - * on packet lifetime in the internet. We need the HZ/5 lower - * bound to behave correctly against BSD stacks with a fixed - * delayed ack. - * FIXME: It's not entirely clear this lower bound is the best - * way to avoid the problem. Is it possible to drop the lower - * bound and still avoid trouble with BSD stacks? Perhaps - * some modification to the RTO calculation that takes delayed - * ack bias into account? This needs serious thought. -- erics - */ -static __inline__ void tcp_bound_rto(struct tcp_opt *tp) -{ - if (tp->rto > 120*HZ) - tp->rto = 120*HZ; - if (tp->rto < HZ/5) - tp->rto = HZ/5; -} - -/* WARNING: this must not be called if tp->saw_timestamp was false. */ -extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, - __u32 start_seq, __u32 end_seq) -{ - /* It is start_seq <= last_ack_seq combined - with in window check. If start_seq<=last_ack_seq<=rcv_nxt, - then segment is in window if end_seq>=rcv_nxt. - */ - if (!after(start_seq, tp->last_ack_sent) && - !before(end_seq, tp->rcv_nxt)) { - /* PAWS bug workaround wrt. ACK frames, the PAWS discard - * extra check below makes sure this can only happen - * for pure ACK frames. -DaveM - * - * Plus: expired timestamps. - * - * Plus: resets failing PAWS. - */ - if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) { - tp->ts_recent = tp->rcv_tsval; - tp->ts_recent_stamp = tcp_time_stamp; - } - } -} - -#define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24) - -extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len) -{ - return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 && - (s32)(tcp_time_stamp - tp->ts_recent_stamp) < PAWS_24DAYS && - /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */ - len != (th->doff * 4)); -} - - -static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) -{ - u32 end_window = tp->rcv_wup + tp->rcv_wnd; - - if (tp->rcv_wnd && - after(end_seq, tp->rcv_nxt) && - before(seq, end_window)) - return 1; - if (seq != end_window) - return 0; - return (seq == end_seq); -} - -/* This functions checks to see if the tcp header is actually acceptable. 
*/ -extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) -{ - if (seq == tp->rcv_nxt) - return (tp->rcv_wnd || (end_seq == seq)); - - return __tcp_sequence(tp, seq, end_seq); -} - -/* When we get a reset we do this. */ -static void tcp_reset(struct sock *sk) -{ - sk->zapped = 1; - - /* We want the right error as BSD sees it (and indeed as we do). */ - switch (sk->state) { - case TCP_SYN_SENT: - sk->err = ECONNREFUSED; - break; - case TCP_CLOSE_WAIT: - sk->err = EPIPE; - break; - default: - sk->err = ECONNRESET; - }; - tcp_set_state(sk, TCP_CLOSE); - sk->shutdown = SHUTDOWN_MASK; - if (!sk->dead) - sk->state_change(sk); -} - -/* This tags the retransmission queue when SACKs arrive. */ -static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int i = nsacks; - - while(i--) { - struct sk_buff *skb = skb_peek(&sk->write_queue); - __u32 start_seq = ntohl(sp->start_seq); - __u32 end_seq = ntohl(sp->end_seq); - int fack_count = 0; - - while((skb != NULL) && - (skb != tp->send_head) && - (skb != (struct sk_buff *)&sk->write_queue)) { - /* The retransmission queue is always in order, so - * we can short-circuit the walk early. - */ - if(after(TCP_SKB_CB(skb)->seq, end_seq)) - break; - - /* We play conservative, we don't allow SACKS to partially - * tag a sequence space. - */ - fack_count++; - if(!after(start_seq, TCP_SKB_CB(skb)->seq) && - !before(end_seq, TCP_SKB_CB(skb)->end_seq)) { - /* If this was a retransmitted frame, account for it. */ - if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) && - tp->retrans_out) - tp->retrans_out--; - TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; - - /* RULE: All new SACKs will either decrease retrans_out - * or advance fackets_out. - */ - if(fack_count > tp->fackets_out) - tp->fackets_out = fack_count; - } - skb = skb->next; - } - sp++; /* Move on to the next SACK block. */ - } -} - -/* Look for tcp options. Normally only called on SYN and SYNACK packets. - * But, this can also be called on packets in the established flow when - * the fast version below fails. 
- */ -void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy) -{ - unsigned char *ptr; - int length=(th->doff*4)-sizeof(struct tcphdr); - int saw_mss = 0; - - ptr = (unsigned char *)(th + 1); - tp->saw_tstamp = 0; - - while(length>0) { - int opcode=*ptr++; - int opsize; - - switch (opcode) { - case TCPOPT_EOL: - return; - case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ - length--; - continue; - default: - opsize=*ptr++; - if (opsize < 2) /* "silly options" */ - return; - if (opsize > length) - break; /* don't parse partial options */ - switch(opcode) { - case TCPOPT_MSS: - if(opsize==TCPOLEN_MSS && th->syn) { - u16 in_mss = ntohs(*(__u16 *)ptr); - if (in_mss == 0) - in_mss = 536; - if (tp->mss_clamp > in_mss) - tp->mss_clamp = in_mss; - saw_mss = 1; - } - break; - case TCPOPT_WINDOW: - if(opsize==TCPOLEN_WINDOW && th->syn) - if (!no_fancy && sysctl_tcp_window_scaling) { - tp->wscale_ok = 1; - tp->snd_wscale = *(__u8 *)ptr; - if(tp->snd_wscale > 14) { - if(net_ratelimit()) - printk("tcp_parse_options: Illegal window " - "scaling value %d >14 received.", - tp->snd_wscale); - tp->snd_wscale = 14; - } - } - break; - case TCPOPT_TIMESTAMP: - if(opsize==TCPOLEN_TIMESTAMP) { - if (sysctl_tcp_timestamps && !no_fancy) { - tp->tstamp_ok = 1; - tp->saw_tstamp = 1; - tp->rcv_tsval = ntohl(*(__u32 *)ptr); - tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); - } - } - break; - case TCPOPT_SACK_PERM: - if(opsize==TCPOLEN_SACK_PERM && th->syn) { - if (sysctl_tcp_sack && !no_fancy) { - tp->sack_ok = 1; - tp->num_sacks = 0; - } - } - break; - - case TCPOPT_SACK: - if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && - sysctl_tcp_sack && (sk != NULL) && !th->syn) { - int sack_bytes = opsize - TCPOLEN_SACK_BASE; - - if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) { - int num_sacks = sack_bytes >> 3; - struct tcp_sack_block *sackp; - - sackp = (struct tcp_sack_block *)ptr; - tcp_sacktag_write_queue(sk, sackp, num_sacks); - } - } - }; - ptr+=opsize-2; - length-=opsize; - }; - } - if(th->syn && saw_mss == 0) - tp->mss_clamp = 536; -} - -/* Fast parse options. This hopes to only see timestamps. - * If it is wrong it falls back on tcp_parse_options(). - */ -static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp) -{ - /* If we didn't send out any options ignore them all. */ - if (tp->tcp_header_len == sizeof(struct tcphdr)) - return 0; - if (th->doff == sizeof(struct tcphdr)>>2) { - tp->saw_tstamp = 0; - return 0; - } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { - __u32 *ptr = (__u32 *)(th + 1); - if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { - tp->saw_tstamp = 1; - tp->rcv_tsval = ntohl(*++ptr); - tp->rcv_tsecr = ntohl(*++ptr); - return 1; - } - } - tcp_parse_options(sk, th, tp, 0); - return 1; -} - -#define FLAG_DATA 0x01 /* Incoming frame contained data. */ -#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ -#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ -#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ - -static __inline__ void clear_fast_retransmit(struct tcp_opt *tp) -{ - if (tp->dup_acks > 3) - tp->snd_cwnd = (tp->snd_ssthresh); - - tp->dup_acks = 0; -} - -/* NOTE: This code assumes that tp->dup_acks gets cleared when a - * retransmit timer fires. 
- */ -static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* Note: If not_dup is set this implies we got a - * data carrying packet or a window update. - * This carries no new information about possible - * lost packets, so we have to ignore it for the purposes - * of counting duplicate acks. Ideally this does not imply we - * should stop our fast retransmit phase, more acks may come - * later without data to help us. Unfortunately this would make - * the code below much more complex. For now if I see such - * a packet I clear the fast retransmit phase. - */ - if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) { - /* This is the standard reno style fast retransmit branch. */ - - /* 1. When the third duplicate ack is received, set ssthresh - * to one half the current congestion window, but no less - * than two segments. Retransmit the missing segment. - */ - if (tp->high_seq == 0 || after(ack, tp->high_seq)) { - tp->dup_acks++; - if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) { - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); - tp->snd_cwnd = (tp->snd_ssthresh + 3); - tp->high_seq = tp->snd_nxt; - if(!tp->fackets_out) - tcp_retransmit_skb(sk, - skb_peek(&sk->write_queue)); - else - tcp_fack_retransmit(sk); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } - } else if (++tp->dup_acks > 3) { - /* 2. Each time another duplicate ACK arrives, increment - * cwnd by the segment size. [...] Transmit a packet... - * - * Packet transmission will be done on normal flow processing - * since we're not in "retransmit mode". We do not use - * duplicate ACKs to artificially inflate the congestion - * window when doing FACK. - */ - if(!tp->fackets_out) { - tp->snd_cwnd++; - } else { - /* Fill any further holes which may have - * appeared. - * - * We may want to change this to run every - * further multiple-of-3 dup ack increments, - * to be more robust against out-of-order - * packet delivery. -DaveM - */ - tcp_fack_retransmit(sk); - } - } - } else if (tp->high_seq != 0) { - /* In this branch we deal with clearing the Floyd style - * block on duplicate fast retransmits, and if requested - * we do Hoe style secondary fast retransmits. - */ - if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) { - /* Once we have acked all the packets up to high_seq - * we are done this fast retransmit phase. - * Alternatively data arrived. In this case we - * Have to abort the fast retransmit attempt. - * Note that we do want to accept a window - * update since this is expected with Hoe's algorithm. - */ - clear_fast_retransmit(tp); - - /* After we have cleared up to high_seq we can - * clear the Floyd style block. - */ - if (!before(ack, tp->high_seq)) { - tp->high_seq = 0; - tp->fackets_out = 0; - } - } else if (tp->dup_acks >= 3) { - if (!tp->fackets_out) { - /* Hoe Style. We didn't ack the whole - * window. Take this as a cue that - * another packet was lost and retransmit it. - * Don't muck with the congestion window here. - * Note that we have to be careful not to - * act if this was a window update and it - * didn't ack new data, since this does - * not indicate a packet left the system. - * We can test this by just checking - * if ack changed from snd_una, since - * the only way to get here without advancing - * from snd_una is if this was a window update. 
- */ - if (ack != tp->snd_una && before(ack, tp->high_seq)) { - tcp_retransmit_skb(sk, - skb_peek(&sk->write_queue)); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } - } else { - /* FACK style, fill any remaining holes in - * receiver's queue. - */ - tcp_fack_retransmit(sk); - } - } - } -} - -/* This is Jacobson's slow start and congestion avoidance. - * SIGCOMM '88, p. 328. - */ -static __inline__ void tcp_cong_avoid(struct tcp_opt *tp) -{ - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - tp->snd_cwnd++; - tp->snd_cwnd_cnt=0; - } else - tp->snd_cwnd_cnt++; - } -} - -/* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, - __u32 *seq, __u32 *seq_rtt) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff *skb; - __u32 now = tcp_time_stamp; - int acked = 0; - - /* If we are retransmitting, and this ACK clears up to - * the retransmit head, or further, then clear our state. - */ - if (tp->retrans_head != NULL && - !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq)) - tp->retrans_head = NULL; - - while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { - struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - __u8 sacked = scb->sacked; - - /* If our packet is before the ack sequence we can - * discard it as it's confirmed to have arrived at - * the other end. - */ - if (after(scb->end_seq, ack)) - break; - - /* Initial outgoing SYN's get put onto the write_queue - * just like anything else we transmit. It is not - * true data, and if we misinform our callers that - * this ACK acks real data, we will erroneously exit - * connection startup slow start one packet too - * quickly. This is severely frowned upon behavior. - */ - if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out) - tp->retrans_out--; - if(!(scb->flags & TCPCB_FLAG_SYN)) { - acked |= FLAG_DATA_ACKED; - if(sacked & TCPCB_SACKED_RETRANS) - acked |= FLAG_RETRANS_DATA_ACKED; - if(tp->fackets_out) - tp->fackets_out--; - } else { - /* This is pure paranoia. */ - tp->retrans_head = NULL; - } - tp->packets_out--; - *seq = scb->seq; - *seq_rtt = now - scb->when; - __skb_unlink(skb, skb->list); - kfree_skb(skb); - } - return acked; -} - -static void tcp_ack_probe(struct sock *sk, __u32 ack) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* Our probe was answered. */ - tp->probes_out = 0; - - /* Was it a usable window open? */ - - /* should always be non-null */ - if (tp->send_head != NULL && - !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) { - tp->backoff = 0; - tp->pending = 0; - tcp_clear_xmit_timer(sk, TIME_PROBE0); - } else { - tcp_reset_xmit_timer(sk, TIME_PROBE0, - min(tp->rto << tp->backoff, 120*HZ)); - } -} - -/* Should we open up the congestion window? */ -static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag) -{ - /* Data must have been acked. */ - if ((flag & FLAG_DATA_ACKED) == 0) - return 0; - - /* Some of the data acked was retransmitted somehow? */ - if ((flag & FLAG_RETRANS_DATA_ACKED) != 0) { - /* We advance in all cases except during - * non-FACK fast retransmit/recovery. - */ - if (tp->fackets_out != 0 || - tp->retransmits != 0) - return 1; - - /* Non-FACK fast retransmit does it's own - * congestion window management, don't get - * in the way. 
- */ - return 0; - } - - /* New non-retransmitted data acked, always advance. */ - return 1; -} - -/* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Superceeds RFC1323) - */ -static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, - u32 seq, u32 ack, int flag) -{ - __u32 seq_rtt; - - /* RTTM Rule: A TSecr value received in a segment is used to - * update the averaged RTT measurement only if the segment - * acknowledges some new data, i.e., only if it advances the - * left edge of the send window. - * - * See draft-ietf-tcplw-high-performance-00, section 3.3. - * 1998/04/10 Andrey V. Savochkin - */ - if (!(flag & FLAG_DATA_ACKED)) - return; - - seq_rtt = tcp_time_stamp - tp->rcv_tsecr; - tcp_rtt_estimator(tp, seq_rtt); - if (tp->retransmits) { - if (tp->packets_out == 0) { - tp->retransmits = 0; - tp->fackets_out = 0; - tp->retrans_out = 0; - tp->backoff = 0; - tcp_set_rto(tp); - } else { - /* Still retransmitting, use backoff */ - tcp_set_rto(tp); - tp->rto = tp->rto << tp->backoff; - } - } else { - tcp_set_rto(tp); - } - - tcp_bound_rto(tp); -} - -static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) -{ - struct sk_buff *skb = skb_peek(&sk->write_queue); - - /* Some data was ACK'd, if still retransmitting (due to a - * timeout), resend more of the retransmit queue. The - * congestion window is handled properly by that code. - */ - if (tp->retransmits) { - tcp_xmit_retransmit_queue(sk); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } else { - __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when); - if ((__s32)when < 0) - when = 1; - tcp_reset_xmit_timer(sk, TIME_RETRANS, when); - } -} - -/* This routine deals with incoming acks, but not outgoing ones. */ -static int tcp_ack(struct sock *sk, struct tcphdr *th, - u32 ack_seq, u32 ack, int len) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int flag = 0; - u32 seq = 0; - u32 seq_rtt = 0; - - if(sk->zapped) - return(1); /* Dead, can't ack any more so why bother */ - - if (tp->pending == TIME_KEEPOPEN) - tp->probes_out = 0; - - tp->rcv_tstamp = tcp_time_stamp; - - /* If the ack is newer than sent or older than previous acks - * then we can probably ignore it. - */ - if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una)) - goto uninteresting_ack; - - /* If there is data set flag 1 */ - if (len != th->doff*4) { - flag |= FLAG_DATA; - tcp_delack_estimator(tp); - } - - /* Update our send window. */ - - /* This is the window update code as per RFC 793 - * snd_wl{1,2} are used to prevent unordered - * segments from shrinking the window - */ - if (before(tp->snd_wl1, ack_seq) || - (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) { - u32 nwin = ntohs(th->window) << tp->snd_wscale; - - if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) { - flag |= FLAG_WIN_UPDATE; - tp->snd_wnd = nwin; - - tp->snd_wl1 = ack_seq; - tp->snd_wl2 = ack; - - if (nwin > tp->max_window) - tp->max_window = nwin; - } - } - - /* We passed data and got it acked, remove any soft error - * log. Something worked... - */ - sk->err_soft = 0; - - /* If this ack opens up a zero window, clear backoff. It was - * being used to time the probes, and is probably far higher than - * it needs to be for normal retransmission. - */ - if (tp->pending == TIME_PROBE0) - tcp_ack_probe(sk, ack); - - /* See if we can take anything off of the retransmit queue. 
*/ - flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); - - /* We must do this here, before code below clears out important - * state contained in tp->fackets_out and tp->retransmits. -DaveM - */ - if (should_advance_cwnd(tp, flag)) - tcp_cong_avoid(tp); - - /* If we have a timestamp, we always do rtt estimates. */ - if (tp->saw_tstamp) { - tcp_ack_saw_tstamp(sk, tp, seq, ack, flag); - } else { - /* If we were retransmiting don't count rtt estimate. */ - if (tp->retransmits) { - if (tp->packets_out == 0) { - tp->retransmits = 0; - tp->fackets_out = 0; - tp->retrans_out = 0; - } - } else { - /* We don't have a timestamp. Can only use - * packets that are not retransmitted to determine - * rtt estimates. Also, we must not reset the - * backoff for rto until we get a non-retransmitted - * packet. This allows us to deal with a situation - * where the network delay has increased suddenly. - * I.e. Karn's algorithm. (SIGCOMM '87, p5.) - */ - if (flag & FLAG_DATA_ACKED) { - if(!(flag & FLAG_RETRANS_DATA_ACKED)) { - tp->backoff = 0; - tcp_rtt_estimator(tp, seq_rtt); - tcp_set_rto(tp); - tcp_bound_rto(tp); - } - } - } - } - - if (tp->packets_out) { - if (flag & FLAG_DATA_ACKED) - tcp_ack_packets_out(sk, tp); - } else { - tcp_clear_xmit_timer(sk, TIME_RETRANS); - } - - flag &= (FLAG_DATA | FLAG_WIN_UPDATE); - if ((ack == tp->snd_una && tp->packets_out && flag == 0) || - (tp->high_seq != 0)) { - tcp_fast_retrans(sk, ack, flag); - } else { - /* Clear any aborted fast retransmit starts. */ - tp->dup_acks = 0; - } - /* It is not a brain fart, I thought a bit now. 8) - * - * Forward progress is indicated, if: - * 1. the ack acknowledges new data. - * 2. or the ack is duplicate, but it is caused by new segment - * arrival. This case is filtered by: - * - it contains no data, syn or fin. - * - it does not update window. - * 3. or new SACK. It is difficult to check, so that we ignore it. - * - * Forward progress is also indicated by arrival new data, - * which was caused by window open from our side. This case is more - * difficult and it is made (alas, incorrectly) in tcp_data_queue(). - * --ANK (990513) - */ - if (ack != tp->snd_una || (flag == 0 && !th->fin)) - dst_confirm(sk->dst_cache); - - /* Remember the highest ack received. */ - tp->snd_una = ack; - return 1; - -uninteresting_ack: - SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt); - return 0; -} - -/* New-style handling of TIME_WAIT sockets. */ -extern void tcp_tw_schedule(struct tcp_tw_bucket *tw); -extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw); -extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw); - -void tcp_timewait_kill(struct tcp_tw_bucket *tw) -{ - struct tcp_bind_bucket *tb = tw->tb; - - /* Disassociate with bind bucket. */ - if(tw->bind_next) - tw->bind_next->bind_pprev = tw->bind_pprev; - *(tw->bind_pprev) = tw->bind_next; - if (tb->owners == NULL) { - if (tb->next) - tb->next->pprev = tb->pprev; - *(tb->pprev) = tb->next; - kmem_cache_free(tcp_bucket_cachep, tb); - } - - /* Unlink from established hashes. */ - if(tw->next) - tw->next->pprev = tw->pprev; - *tw->pprev = tw->next; - - /* We decremented the prot->inuse count when we entered TIME_WAIT - * and the sock from which this came was destroyed. - */ - tw->sklist_next->sklist_prev = tw->sklist_prev; - tw->sklist_prev->sklist_next = tw->sklist_next; - - /* Ok, now free it up. */ - kmem_cache_free(tcp_timewait_cachep, tw); -} - -/* We come here as a special case from the AF specific TCP input processing, - * and the SKB has no owner. 
Essentially handling this is very simple, - * we just keep silently eating rx'd packets, acking them if necessary, - * until none show up for the entire timeout period. - * - * Return 0, TCP_TW_ACK, TCP_TW_RST - */ -enum tcp_tw_status -tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, - struct tcphdr *th, unsigned len) -{ - /* RFC 1122: - * "When a connection is [...] on TIME-WAIT state [...] - * [a TCP] MAY accept a new SYN from the remote TCP to - * reopen the connection directly, if it: - * - * (1) assigns its initial sequence number for the new - * connection to be larger than the largest sequence - * number it used on the previous connection incarnation, - * and - * - * (2) returns to TIME-WAIT state if the SYN turns out - * to be an old duplicate". - */ - if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) { - struct sock *sk; - struct tcp_func *af_specific = tw->af_specific; - __u32 isn; - - isn = tw->snd_nxt + 128000; - if(isn == 0) - isn++; - tcp_tw_deschedule(tw); - tcp_timewait_kill(tw); - sk = af_specific->get_sock(skb, th); - if(sk == NULL || - !ipsec_sk_policy(sk,skb) || - atomic_read(&sk->sock_readers) != 0) - return 0; - skb_set_owner_r(skb, sk); - af_specific = sk->tp_pinfo.af_tcp.af_specific; - if(af_specific->conn_request(sk, skb, isn) < 0) - return TCP_TW_RST; /* Toss a reset back. */ - return 0; /* Discard the frame. */ - } - - /* Check RST or SYN */ - if(th->rst || th->syn) { - /* This is TIME_WAIT assasination, in two flavors. - * Oh well... nobody has a sufficient solution to this - * protocol bug yet. - */ - if(sysctl_tcp_rfc1337 == 0) { - tcp_tw_deschedule(tw); - tcp_timewait_kill(tw); - } - if(!th->rst) - return TCP_TW_RST; /* toss a reset back */ - return 0; - } else { - /* In this case we must reset the TIMEWAIT timer. */ - if(th->ack) - tcp_tw_reschedule(tw); - } - /* Ack old packets if necessary */ - if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt) && - (th->doff * 4) > len) - return TCP_TW_ACK; - return 0; -} - -/* Enter the time wait state. This is always called from BH - * context. Essentially we whip up a timewait bucket, copy the - * relevant info into it from the SK, and mess with hash chains - * and list linkage. - */ -static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) -{ - struct sock **head, *sktw; - - /* Step 1: Remove SK from established hash. */ - if(sk->next) - sk->next->pprev = sk->pprev; - *sk->pprev = sk->next; - sk->pprev = NULL; - tcp_reg_zap(sk); - - /* Step 2: Put TW into bind hash where SK was. */ - tw->tb = (struct tcp_bind_bucket *)sk->prev; - if((tw->bind_next = sk->bind_next) != NULL) - sk->bind_next->bind_pprev = &tw->bind_next; - tw->bind_pprev = sk->bind_pprev; - *sk->bind_pprev = (struct sock *)tw; - sk->prev = NULL; - - /* Step 3: Same for the protocol sklist. */ - (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw; - (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw; - sk->sklist_next = NULL; - sk->prot->inuse--; - - /* Step 4: Hash TW into TIMEWAIT half of established hash table. */ - head = &tcp_ehash[sk->hashent + (tcp_ehash_size/2)]; - sktw = (struct sock *)tw; - if((sktw->next = *head) != NULL) - (*head)->pprev = &sktw->next; - *head = sktw; - sktw->pprev = head; -} - -void tcp_time_wait(struct sock *sk) -{ - struct tcp_tw_bucket *tw; - - tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); - if(tw != NULL) { - /* Give us an identity. 
*/ - tw->daddr = sk->daddr; - tw->rcv_saddr = sk->rcv_saddr; - tw->bound_dev_if= sk->bound_dev_if; - tw->num = sk->num; - tw->state = TCP_TIME_WAIT; - tw->sport = sk->sport; - tw->dport = sk->dport; - tw->family = sk->family; - tw->reuse = sk->reuse; - tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt; - tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt; - tw->window = tcp_select_window(sk); - tw->af_specific = sk->tp_pinfo.af_tcp.af_specific; - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - if(tw->family == PF_INET6) { - memcpy(&tw->v6_daddr, - &sk->net_pinfo.af_inet6.daddr, - sizeof(struct in6_addr)); - memcpy(&tw->v6_rcv_saddr, - &sk->net_pinfo.af_inet6.rcv_saddr, - sizeof(struct in6_addr)); - } -#endif - /* Linkage updates. */ - tcp_tw_hashdance(sk, tw); - - /* Get the TIME_WAIT timeout firing. */ - tcp_tw_schedule(tw); - - /* CLOSE the SK. */ - if(sk->state == TCP_ESTABLISHED) - tcp_statistics.TcpCurrEstab--; - sk->state = TCP_CLOSE; - net_reset_timer(sk, TIME_DONE, - min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME)); - } else { - /* Sorry, we're out of memory, just CLOSE this - * socket up. We've got bigger problems than - * non-graceful socket closings. - */ - tcp_set_state(sk, TCP_CLOSE); - } - - /* Prevent rcvmsg/sndmsg calls, and wake people up. */ - sk->shutdown = SHUTDOWN_MASK; - if(!sk->dead) - sk->state_change(sk); -} - -/* - * Process the FIN bit. This now behaves as it is supposed to work - * and the FIN takes effect when it is validly part of sequence - * space. Not before when we get holes. - * - * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT - * (and thence onto LAST-ACK and finally, CLOSE, we never enter - * TIME-WAIT) - * - * If we are in FINWAIT-1, a received FIN indicates simultaneous - * close and we go into CLOSING (and later onto TIME-WAIT) - * - * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. - */ - -static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) -{ - sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq; - - tcp_send_ack(sk); - - if (!sk->dead) { - sk->state_change(sk); - sock_wake_async(sk->socket, 1); - } - - switch(sk->state) { - case TCP_SYN_RECV: - case TCP_ESTABLISHED: - /* Move to CLOSE_WAIT */ - tcp_set_state(sk, TCP_CLOSE_WAIT); - if (th->rst) - sk->shutdown = SHUTDOWN_MASK; - break; - - case TCP_CLOSE_WAIT: - case TCP_CLOSING: - /* Received a retransmission of the FIN, do - * nothing. - */ - break; - case TCP_LAST_ACK: - /* RFC793: Remain in the LAST-ACK state. */ - break; - - case TCP_FIN_WAIT1: - /* This case occurs when a simultaneous close - * happens, we must ack the received FIN and - * enter the CLOSING state. - * - * This causes a WRITE timeout, which will either - * move on to TIME_WAIT when we timeout, or resend - * the FIN properly (maybe we get rid of that annoying - * FIN lost hang). The TIME_WRITE code is already - * correct for handling this timeout. - */ - tcp_set_state(sk, TCP_CLOSING); - break; - case TCP_FIN_WAIT2: - /* Received a FIN -- send ACK and enter TIME_WAIT. */ - tcp_time_wait(sk); - break; - default: - /* Only TCP_LISTEN and TCP_CLOSE are left, in these - * cases we should never reach this piece of code. - */ - printk("tcp_fin: Impossible, sk->state=%d\n", sk->state); - break; - }; -} - -/* These routines update the SACK block as out-of-order packets arrive or - * in-order packets close up the sequence space. 
- */ -static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp) -{ - int this_sack, num_sacks = tp->num_sacks; - struct tcp_sack_block *swalk = &tp->selective_acks[0]; - - /* If more than one SACK block, see if the recent change to SP eats into - * or hits the sequence space of other SACK blocks, if so coalesce. - */ - if(num_sacks != 1) { - for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) { - if(swalk == sp) - continue; - - /* First case, bottom of SP moves into top of the - * sequence space of SWALK. - */ - if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) { - sp->start_seq = swalk->start_seq; - goto coalesce; - } - /* Second case, top of SP moves into bottom of the - * sequence space of SWALK. - */ - if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) { - sp->end_seq = swalk->end_seq; - goto coalesce; - } - } - } - /* SP is the only SACK, or no coalescing cases found. */ - return; - -coalesce: - /* Zap SWALK, by moving every further SACK up by one slot. - * Decrease num_sacks. - */ - for(; this_sack < num_sacks-1; this_sack++, swalk++) { - struct tcp_sack_block *next = (swalk + 1); - swalk->start_seq = next->start_seq; - swalk->end_seq = next->end_seq; - } - tp->num_sacks--; -} - -static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) -{ - __u32 tmp; - - tmp = sack1->start_seq; - sack1->start_seq = sack2->start_seq; - sack2->start_seq = tmp; - - tmp = sack1->end_seq; - sack1->end_seq = sack2->end_seq; - sack2->end_seq = tmp; -} - -static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct tcp_sack_block *sp = &tp->selective_acks[0]; - int cur_sacks = tp->num_sacks; - - if (!cur_sacks) - goto new_sack; - - /* Optimize for the common case, new ofo frames arrive - * "in order". ;-) This also satisfies the requirements - * of RFC2018 about ordering of SACKs. - */ - if(sp->end_seq == TCP_SKB_CB(skb)->seq) { - sp->end_seq = TCP_SKB_CB(skb)->end_seq; - tcp_sack_maybe_coalesce(tp, sp); - } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) { - /* Re-ordered arrival, in this case, can be optimized - * as well. - */ - sp->start_seq = TCP_SKB_CB(skb)->seq; - tcp_sack_maybe_coalesce(tp, sp); - } else { - struct tcp_sack_block *swap = sp + 1; - int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4); - - /* Oh well, we have to move things around. - * Try to find a SACK we can tack this onto. - */ - - for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) { - if((swap->end_seq == TCP_SKB_CB(skb)->seq) || - (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) { - if(swap->end_seq == TCP_SKB_CB(skb)->seq) - swap->end_seq = TCP_SKB_CB(skb)->end_seq; - else - swap->start_seq = TCP_SKB_CB(skb)->seq; - tcp_sack_swap(sp, swap); - tcp_sack_maybe_coalesce(tp, sp); - return; - } - } - - /* Could not find an adjacent existing SACK, build a new one, - * put it at the front, and shift everyone else down. We - * always know there is at least one SACK present already here. - * - * If the sack array is full, forget about the last one. - */ - if (cur_sacks >= max_sacks) { - cur_sacks--; - tp->num_sacks--; - } - while(cur_sacks >= 1) { - struct tcp_sack_block *this = &tp->selective_acks[cur_sacks]; - struct tcp_sack_block *prev = (this - 1); - this->start_seq = prev->start_seq; - this->end_seq = prev->end_seq; - cur_sacks--; - } - - new_sack: - /* Build the new head SACK, and we're done. 
*/ - sp->start_seq = TCP_SKB_CB(skb)->seq; - sp->end_seq = TCP_SKB_CB(skb)->end_seq; - tp->num_sacks++; - } -} - -static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb) -{ - struct tcp_sack_block *sp = &tp->selective_acks[0]; - int num_sacks = tp->num_sacks; - int this_sack; - - /* This is an in order data segment _or_ an out-of-order SKB being - * moved to the receive queue, so we know this removed SKB will eat - * from the front of a SACK. - */ - for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { - /* Check if the start of the sack is covered by skb. */ - if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) && - before(sp->start_seq, TCP_SKB_CB(skb)->end_seq)) - break; - } - - /* This should only happen if so many SACKs get built that some get - * pushed out before we get here, or we eat some in sequence packets - * which are before the first SACK block. - */ - if(this_sack >= num_sacks) - return; - - sp->start_seq = TCP_SKB_CB(skb)->end_seq; - if(!before(sp->start_seq, sp->end_seq)) { - /* Zap this SACK, by moving forward any other SACKS. */ - for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) { - struct tcp_sack_block *next = (sp + 1); - sp->start_seq = next->start_seq; - sp->end_seq = next->end_seq; - } - tp->num_sacks--; - } -} - -static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb) -{ - struct tcp_sack_block *sp = &tp->selective_acks[0]; - int num_sacks = tp->num_sacks; - int this_sack; - - for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { - if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq) - break; - } - if(this_sack >= num_sacks) - return; - sp->end_seq = TCP_SKB_CB(new_skb)->end_seq; -} - -/* This one checks to see if we can put data from the - * out_of_order queue into the receive_queue. - */ -static void tcp_ofo_queue(struct sock *sk) -{ - struct sk_buff *skb; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - while ((skb = skb_peek(&tp->out_of_order_queue))) { - if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) - break; - - if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { - SOCK_DEBUG(sk, "ofo packet was already received \n"); - __skb_unlink(skb, skb->list); - kfree_skb(skb); - continue; - } - SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq); - - if(tp->sack_ok) - tcp_sack_remove_skb(tp, skb); - __skb_unlink(skb, skb->list); - __skb_queue_tail(&sk->receive_queue, skb); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->h.th->fin) - tcp_fin(skb, sk, skb->h.th); - } -} - -static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) -{ - struct sk_buff *skb1; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* Queue data for delivery to the user. - * Packets in sequence go to the receive queue. - * Out of sequence packets to the out_of_order_queue. - */ - if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - /* Ok. In sequence. */ - queue_and_out: - dst_confirm(sk->dst_cache); - __skb_queue_tail(&sk->receive_queue, skb); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->h.th->fin) { - tcp_fin(skb, sk, skb->h.th); - } else { - tcp_remember_ack(tp, skb->h.th, skb); - } - /* This may have eaten into a SACK block. */ - if(tp->sack_ok && tp->num_sacks) - tcp_sack_remove_skb(tp, skb); - tcp_ofo_queue(sk); - - /* Turn on fast path. 
*/ - if (skb_queue_len(&tp->out_of_order_queue) == 0) - tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) | - (0x10 << 16) | - tp->snd_wnd); - return; - } - - /* An old packet, either a retransmit or some packet got lost. */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { - /* A retransmit, 2nd most common case. Force an imediate ack. */ - SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq); - tcp_enter_quickack_mode(tp); - kfree_skb(skb); - return; - } - - if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - /* Partial packet, seq < rcv_next < end_seq */ - SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq); - - goto queue_and_out; - } - - /* Ok. This is an out_of_order segment, force an ack. */ - tp->delayed_acks++; - tcp_enter_quickack_mode(tp); - - /* Disable header prediction. */ - tp->pred_flags = 0; - - SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - - if (skb_peek(&tp->out_of_order_queue) == NULL) { - /* Initial out of order segment, build 1 SACK. */ - if(tp->sack_ok) { - tp->num_sacks = 1; - tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; - tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq; - } - __skb_queue_head(&tp->out_of_order_queue,skb); - } else { - for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) { - /* Already there. */ - if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) { - if (skb->len >= skb1->len) { - if(tp->sack_ok) - tcp_sack_extend(tp, skb1, skb); - __skb_append(skb1, skb); - __skb_unlink(skb1, skb1->list); - kfree_skb(skb1); - } else { - /* A duplicate, smaller than what is in the - * out-of-order queue right now, toss it. - */ - kfree_skb(skb); - } - break; - } - - if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) { - __skb_append(skb1, skb); - if(tp->sack_ok) - tcp_sack_new_ofo_skb(sk, skb); - break; - } - - /* See if we've hit the start. If so insert. */ - if (skb1 == skb_peek(&tp->out_of_order_queue)) { - __skb_queue_head(&tp->out_of_order_queue,skb); - if(tp->sack_ok) - tcp_sack_new_ofo_skb(sk, skb); - break; - } - } - } -} - - -/* - * This routine handles the data. If there is room in the buffer, - * it will be have already been moved into it. If there is no - * room, then we will just have to discard the packet. - */ - -static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) -{ - struct tcphdr *th; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - char *str1 = "pfinet: tcp_data check point 1.\n"; - char *str2 = "pfinet: tcp_data check point 2.\n"; - char *str3 = "pfinet: tcp_data check point 3.\n"; - int stderr_fd = fileno (stderr); - - th = skb->h.th; - skb_pull(skb, th->doff*4); - skb_trim(skb, len - (th->doff*4)); - - if (skb->len == 0 && !th->fin) - return(0); - - write (stderr_fd, str1, strlen (str1) + 1); - fflush (stderr); - /* - * If our receive queue has grown past its limits shrink it. - * Make sure to do this before moving snd_nxt, otherwise - * data might be acked for that we don't have enough room. - */ - if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { - if (prune_queue(sk) < 0) { - /* Still not enough room. That can happen when - * skb->true_size differs significantly from skb->len. 
- */ - return 0; - } - } - - tcp_data_queue(sk, skb); - - write (stderr_fd, str2, strlen (str2) + 1); - fflush (stderr); - if (before(tp->rcv_nxt, tp->copied_seq)) { - printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n"); - tp->rcv_nxt = tp->copied_seq; - } - - /* Above, tcp_data_queue() increments delayed_acks appropriately. - * Now tell the user we may have some data. - */ - if (!sk->dead) { - sk->data_ready(sk,0); - } - write (stderr_fd, str3, strlen (str3) + 1); - fflush (stderr); - return(1); -} - -static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) && - tcp_packets_in_flight(tp) < tp->snd_cwnd) { - /* Put more data onto the wire. */ - tcp_write_xmit(sk); - } else if (tp->packets_out == 0 && !tp->pending) { - /* Start probing the receivers window. */ - tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); - } -} - -static __inline__ void tcp_data_snd_check(struct sock *sk) -{ - struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head; - - if (skb != NULL) - __tcp_data_snd_check(sk, skb); -} - -/* - * Adapt the MSS value used to make delayed ack decision to the - * real world. - */ -static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - unsigned int len = skb->len, lss; - - if (len > tp->rcv_mss) - tp->rcv_mss = len; - lss = tp->last_seg_size; - tp->last_seg_size = 0; - if (len >= 536) { - if (len == lss) - tp->rcv_mss = len; - tp->last_seg_size = len; - } -} - -/* - * Check if sending an ack is needed. - */ -static __inline__ void __tcp_ack_snd_check(struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* This also takes care of updating the window. - * This if statement needs to be simplified. - * - * Rules for delaying an ack: - * - delay time <= 0.5 HZ - * - we don't have a window update to send - * - must send at least every 2 full sized packets - * - must send an ACK if we have any out of order data - * - * With an extra heuristic to handle loss of packet - * situations and also helping the sender leave slow - * start in an expediant manner. - */ - - /* Two full frames received or... */ - if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) || - /* We will update the window "significantly" or... */ - tcp_raise_window(sk) || - /* We entered "quick ACK" mode or... */ - tcp_in_quickack_mode(tp) || - /* We have out of order data */ - (skb_peek(&tp->out_of_order_queue) != NULL)) { - /* Then ack it now */ - tcp_send_ack(sk); - } else { - /* Else, send delayed ack. */ - tcp_send_delayed_ack(tp, HZ/2); - } -} - -static __inline__ void tcp_ack_snd_check(struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (tp->delayed_acks == 0) { - /* We sent a data segment already. */ - return; - } - __tcp_ack_snd_check(sk); -} - - -/* - * This routine is only called when we have urgent data - * signalled. Its the 'slow' part of tcp_urg. It could be - * moved inline now as tcp_urg is only called from one - * place. We handle URGent data wrong. We have to - as - * BSD still doesn't use the correction from RFC961. - * For 1003.1g we should support a new option TCP_STDURG to permit - * either form (or just set the sysctl tcp_stdurg). 
- */ - -static void tcp_check_urg(struct sock * sk, struct tcphdr * th) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - u32 ptr = ntohs(th->urg_ptr); - - if (ptr && !sysctl_tcp_stdurg) - ptr--; - ptr += ntohl(th->seq); - - /* Ignore urgent data that we've already seen and read. */ - if (after(tp->copied_seq, ptr)) - return; - - /* Do we already have a newer (or duplicate) urgent pointer? */ - if (tp->urg_data && !after(ptr, tp->urg_seq)) - return; - - /* Tell the world about our new urgent pointer. */ - if (sk->proc != 0) { - if (sk->proc > 0) - kill_proc(sk->proc, SIGURG, 1); - else - kill_pg(-sk->proc, SIGURG, 1); - } - - /* We may be adding urgent data when the last byte read was - * urgent. To do this requires some care. We cannot just ignore - * tp->copied_seq since we would read the last urgent byte again - * as data, nor can we alter copied_seq until this data arrives - * or we break the sematics of SIOCATMARK (and thus sockatmark()) - */ - if (tp->urg_seq == tp->copied_seq) - tp->copied_seq++; /* Move the copied sequence on correctly */ - tp->urg_data = URG_NOTYET; - tp->urg_seq = ptr; - - /* Disable header prediction. */ - tp->pred_flags = 0; -} - -/* This is the 'fast' part of urgent handling. */ -static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* Check if we get a new urgent pointer - normally not. */ - if (th->urg) - tcp_check_urg(sk,th); - - /* Do we wait for any urgent data? - normally not... */ - if (tp->urg_data == URG_NOTYET) { - u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4); - - /* Is the urgent pointer pointing into this packet? */ - if (ptr < len) { - tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th); - if (!sk->dead) - sk->data_ready(sk,0); - } - } -} - -/* Clean the out_of_order queue if we can, trying to get - * the socket within its memory limits again. - * - * Return less than zero if we should start dropping frames - * until the socket owning process reads some of the data - * to stabilize the situation. - */ -static int prune_queue(struct sock *sk) -{ - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - struct sk_buff * skb; - - SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); - - net_statistics.PruneCalled++; - - /* First, purge the out_of_order queue. */ - skb = __skb_dequeue_tail(&tp->out_of_order_queue); - if(skb != NULL) { - /* Free it all. */ - do { net_statistics.OfoPruned += skb->len; - kfree_skb(skb); - skb = __skb_dequeue_tail(&tp->out_of_order_queue); - } while(skb != NULL); - - /* Reset SACK state. A conforming SACK implementation will - * do the same at a timeout based retransmit. When a connection - * is in a sad state like this, we care only about integrity - * of the connection not performance. - */ - if(tp->sack_ok) - tp->num_sacks = 0; - } - - /* If we are really being abused, tell the caller to silently - * drop receive data on the floor. It will get retransmitted - * and hopefully then we'll have sufficient space. - * - * We used to try to purge the in-order packets too, but that - * turns out to be deadly and fraught with races. Consider: - * - * 1) If we acked the data, we absolutely cannot drop the - * packet. This data would then never be retransmitted. - * 2) It is possible, with a proper sequence of events involving - * delayed acks and backlog queue handling, to have the user - * read the data before it gets acked. The previous code - * here got this wrong, and it lead to data corruption. 
- * 3) Too much state changes happen when the FIN arrives, so once - * we've seen that we can't remove any in-order data safely. - * - * The net result is that removing in-order receive data is too - * complex for anyones sanity. So we don't do it anymore. But - * if we are really having our buffer space abused we stop accepting - * new receive data. - */ - if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1)) - return 0; - - /* Massive buffer overcommit. */ - return -1; -} - -/* - * TCP receive function for the ESTABLISHED state. - * - * It is split into a fast path and a slow path. The fast path is - * disabled when: - * - A zero window was announced from us - zero window probing - * is only handled properly in the slow path. - * - Out of order segments arrived. - * - Urgent data is expected. - * - There is no buffer space left - * - Unexpected TCP flags/window values/header lengths are received - * (detected by checking the TCP header against pred_flags) - * - Data is sent in both directions. Fast path only supports pure senders - * or pure receivers (this means either the sequence number or the ack - * value must stay constant) - * - * When these conditions are not satisfied it drops into a standard - * receive procedure patterned after RFC793 to handle all cases. - * The first three cases are guaranteed by proper pred_flags setting, - * the rest is checked inline. Fast processing is turned on in - * tcp_data_queue when everything is OK. - */ -int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - struct tcphdr *th, unsigned len) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int queued; - u32 flg; - char *str1 = "pfinet tcp_rcv_established check point 1\n"; - char *str2 = "pfinet tcp_rcv_established check point 2\n"; - int stderr_fd = fileno (stderr); - - /* - * Header prediction. - * The code follows the one in the famous - * "30 instruction TCP receive" Van Jacobson mail. - * - * Van's trick is to deposit buffers into socket queue - * on a device interrupt, to call tcp_recv function - * on the receive process context and checksum and copy - * the buffer to user space. smart... - * - * Our current scheme is not silly either but we take the - * extra cost of the net_bh soft interrupt processing... - * We do checksum and copy also but from device to kernel. - */ - - /* - * RFC1323: H1. Apply PAWS check first. - */ - if (tcp_fast_parse_options(sk, th, tp)) { - if (tp->saw_tstamp) { - if (tcp_paws_discard(tp, th, len)) { - tcp_statistics.TcpInErrs++; - if (!th->rst) { - tcp_send_ack(sk); - goto discard; - } - } - tcp_replace_ts_recent(sk, tp, - TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq); - } - } - - flg = *(((u32 *)th) + 3) & ~htonl(0xFC8 << 16); - - /* pred_flags is 0xS?10 << 16 + snd_wnd - * if header_predition is to be made - * 'S' will always be tp->tcp_header_len >> 2 - * '?' will be 0 else it will be !0 - * (when there are holes in the receive - * space for instance) - * PSH flag is ignored. 
- */ - - if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - if (len <= th->doff*4) { - /* Bulk data transfer: sender */ - if (len == th->doff*4) { - tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->ack_seq, len); - kfree_skb(skb); - tcp_data_snd_check(sk); - return 0; - } else { /* Header too small */ - tcp_statistics.TcpInErrs++; - goto discard; - } - } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una && - atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) { - /* Bulk data transfer: receiver */ - __skb_pull(skb,th->doff*4); - - tcp_measure_rcv_mss(sk, skb); - - /* DO NOT notify forward progress here. - * It saves dozen of CPU instructions in fast path. --ANK - */ - __skb_queue_tail(&sk->receive_queue, skb); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - - /* FIN bit check is not done since if FIN is set in - * this frame, the pred_flags won't match up. -DaveM - */ - sk->data_ready(sk, 0); - tcp_delack_estimator(tp); - - tcp_remember_ack(tp, th, skb); - - __tcp_ack_snd_check(sk); - return 0; - } - } - - /* - * Standard slow path. - */ - - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { - /* RFC793, page 37: "In all states except SYN-SENT, all reset - * (RST) segments are validated by checking their SEQ-fields." - * And page 69: "If an incoming segment is not acceptable, - * an acknowledgment should be sent in reply (unless the RST bit - * is set, if so drop the segment and return)". - */ - if (th->rst) - goto discard; - if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n", - TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tp->rcv_wup, tp->rcv_wnd); - } - tcp_send_ack(sk); - goto discard; - } - - if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) { - SOCK_DEBUG(sk, "syn in established state\n"); - tcp_statistics.TcpInErrs++; - tcp_reset(sk); - return 1; - } - - if(th->rst) { - tcp_reset(sk); - goto discard; - } - - if(th->ack) - tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len); - - /* Process urgent data. */ - tcp_urg(sk, th, len); - - /* step 7: process the segment text */ - queued = tcp_data(skb, sk, len); - - /* This must be after tcp_data() does the skb_pull() to - * remove the header size from skb->len. - * - * Dave!!! Phrase above (and all about rcv_mss) has - * nothing to do with reality. rcv_mss must measure TOTAL - * size, including sacks, IP options etc. Hence, measure_rcv_mss - * must occure before pulling etc, otherwise it will flap - * like hell. Even putting it before tcp_data is wrong, - * it should use skb->tail - skb->nh.raw instead. - * --ANK (980805) - * - * BTW I broke it. Now all TCP options are handled equally - * in mss_clamp calculations (i.e. ignored, rfc1122), - * and mss_cache does include all of them (i.e. tstamps) - * except for sacks, to calulate effective mss faster. - * --ANK (980805) - */ - tcp_measure_rcv_mss(sk, skb); - - write (stderr_fd, str1, strlen (str1) + 1); - fflush (stderr_fd); - /* Be careful, tcp_data() may have put this into TIME_WAIT. */ - if(sk->state != TCP_CLOSE) { - tcp_data_snd_check(sk); - tcp_ack_snd_check(sk); - } - write (stderr_fd, str2, strlen (str2) + 1); - fflush (stderr_fd); - - if (!queued) { - discard: - kfree_skb(skb); - } - - return 0; -} - -/* - * Process an incoming SYN or SYN-ACK for SYN_RECV sockets represented - * as an open_request. 
- */ - -struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, - struct open_request *req) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - u32 flg; - - /* assumption: the socket is not in use. - * as we checked the user count on tcp_rcv and we're - * running from a soft interrupt. - */ - - /* Check for syn retransmission */ - flg = *(((u32 *)skb->h.th) + 3); - - flg &= __constant_htonl(0x00170000); - /* Only SYN set? */ - if (flg == __constant_htonl(0x00020000)) { - if (TCP_SKB_CB(skb)->seq == req->rcv_isn) { - /* retransmited syn. - */ - req->class->rtx_syn_ack(sk, req); - return NULL; - } else { - return sk; /* Pass new SYN to the listen socket. */ - } - } - - /* We know it's an ACK here */ - if (req->sk) { - /* socket already created but not - * yet accepted()... - */ - sk = req->sk; - } else { - /* In theory the packet could be for a cookie, but - * TIME_WAIT should guard us against this. - * XXX: Nevertheless check for cookies? - * This sequence number check is done again later, - * but we do it here to prevent syn flood attackers - * from creating big SYN_RECV sockets. - */ - if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) || - !between(TCP_SKB_CB(skb)->seq, req->rcv_isn, - req->rcv_isn+1+req->rcv_wnd)) { - req->class->send_reset(skb); - return NULL; - } - - sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); - tcp_dec_slow_timer(TCP_SLT_SYNACK); - if (sk == NULL) - return NULL; - - req->expires = 0UL; - req->sk = sk; - } - skb_orphan(skb); - skb_set_owner_r(skb, sk); - return sk; -} - -/* - * This function implements the receiving procedure of RFC 793 for - * all states except ESTABLISHED and TIME_WAIT. - * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be - * address independent. - */ - -int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - struct tcphdr *th, unsigned len) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int queued = 0; - - switch (sk->state) { - case TCP_CLOSE: - /* When state == CLOSED, hash lookup always fails. - * - * But, there is a back door, the backlog queue. - * If we have a sequence of packets in the backlog - * during __release_sock() which have a sequence such - * that: - * packet X causes entry to TCP_CLOSE state - * ... - * packet X + N has FIN bit set - * - * We report a (luckily) harmless error in this case. - * The issue is that backlog queue processing bypasses - * any hash lookups (we know which socket packets are for). - * The correct behavior here is what 2.0.x did, since - * a TCP_CLOSE socket does not exist. Drop the frame - * and send a RST back to the other end. - */ - return 1; - - case TCP_LISTEN: - /* These use the socket TOS.. - * might want to be the received TOS - */ - if(th->ack) - return 1; - - if(th->syn) { - if(tp->af_specific->conn_request(sk, skb, 0) < 0) - return 1; - - /* Now we have several options: In theory there is - * nothing else in the frame. KA9Q has an option to - * send data with the syn, BSD accepts data with the - * syn up to the [to be] advertised window and - * Solaris 2.1 gives you a protocol error. For now - * we just ignore it, that fits the spec precisely - * and avoids incompatibilities. It would be nice in - * future to drop through and process the data. - * - * Now that TTCP is starting to be used we ought to - * queue this data. - * But, this leaves one open to an easy denial of - * service attack, and SYN cookies can't defend - * against this problem. So, we drop the data - * in the interest of security over speed. 
- */ - goto discard; - } - - goto discard; - break; - - case TCP_SYN_SENT: - /* SYN sent means we have to look for a suitable ack and - * either reset for bad matches or go to connected. - * The SYN_SENT case is unusual and should - * not be in line code. [AC] - */ - if(th->ack) { - /* rfc793: - * "If the state is SYN-SENT then - * first check the ACK bit - * If the ACK bit is set - * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send - * a reset (unless the RST bit is set, if so drop - * the segment and return)" - * - * I cite this place to emphasize one essential - * detail, this check is different of one - * in established state: SND.UNA <= SEG.ACK <= SND.NXT. - * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT, - * because we have no previous data sent before SYN. - * --ANK(990513) - * - * We do not send data with SYN, so that RFC-correct - * test reduces to: - */ - if (sk->zapped || - TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) - return 1; - - /* Now ACK is acceptable. - * - * "If the RST bit is set - * If the ACK was acceptable then signal the user "error: - * connection reset", drop the segment, enter CLOSED state, - * delete TCB, and return." - */ - - if (th->rst) { - tcp_reset(sk); - goto discard; - } - - /* rfc793: - * "fifth, if neither of the SYN or RST bits is set then - * drop the segment and return." - * - * See note below! - * --ANK(990513) - */ - - if (!th->syn) - goto discard; - - /* rfc793: - * "If the SYN bit is on ... - * are acceptable then ... - * (our SYN has been ACKed), change the connection - * state to ESTABLISHED..." - * - * Do you see? SYN-less ACKs in SYN-SENT state are - * completely ignored. - * - * The bug causing stalled SYN-SENT sockets - * was here: tcp_ack advanced snd_una and canceled - * retransmit timer, so that bare ACK received - * in SYN-SENT state (even with invalid ack==ISS, - * because tcp_ack check is too weak for SYN-SENT) - * causes moving socket to invalid semi-SYN-SENT, - * semi-ESTABLISHED state and connection hangs. - * - * There exist buggy stacks, which really send - * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp) - * Actually, if this host did not try to get something - * from ftp.inr.ac.ru I'd never find this bug 8) - * - * --ANK (990514) - */ - - tp->snd_wl1 = TCP_SKB_CB(skb)->seq; - tcp_ack(sk,th, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->ack_seq, len); - - /* Ok.. it's good. Set up sequence numbers and - * move to established. - */ - tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1; - tp->rcv_wup = TCP_SKB_CB(skb)->seq+1; - - /* RFC1323: The window in SYN & SYN/ACK segments is - * never scaled. - */ - tp->snd_wnd = htons(th->window); - tp->snd_wl1 = TCP_SKB_CB(skb)->seq; - tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; - tp->fin_seq = TCP_SKB_CB(skb)->seq; - - tcp_set_state(sk, TCP_ESTABLISHED); - tcp_parse_options(sk, th, tp, 0); - - if (tp->wscale_ok == 0) { - tp->snd_wscale = tp->rcv_wscale = 0; - tp->window_clamp = min(tp->window_clamp,65535); - } - - if (tp->tstamp_ok) { - tp->tcp_header_len = - sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; - } else - tp->tcp_header_len = sizeof(struct tcphdr); - if (tp->saw_tstamp) { - tp->ts_recent = tp->rcv_tsval; - tp->ts_recent_stamp = tcp_time_stamp; - } - - /* Can't be earlier, doff would be wrong. */ - tcp_send_ack(sk); - - sk->dport = th->source; - tp->copied_seq = tp->rcv_nxt; - - if(!sk->dead) { - sk->state_change(sk); - sock_wake_async(sk->socket, 0); - } - } else { - if(th->syn && !th->rst) { - /* The previous version of the code - * checked for "connecting to self" - * here. 
that check is done now in - * tcp_connect. - */ - tcp_set_state(sk, TCP_SYN_RECV); - tcp_parse_options(sk, th, tp, 0); - if (tp->saw_tstamp) { - tp->ts_recent = tp->rcv_tsval; - tp->ts_recent_stamp = tcp_time_stamp; - } - - tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; - - /* RFC1323: The window in SYN & SYN/ACK segments is - * never scaled. - */ - tp->snd_wnd = htons(th->window); - tp->snd_wl1 = TCP_SKB_CB(skb)->seq; - - tcp_send_synack(sk); - } else - break; - } - - /* tp->tcp_header_len and tp->mss_clamp - probably changed, synchronize mss. - */ - tcp_sync_mss(sk, tp->pmtu_cookie); - tp->rcv_mss = tp->mss_cache; - - if (sk->state == TCP_SYN_RECV) - goto discard; - - goto step6; - } - - /* Parse the tcp_options present on this header. - * By this point we really only expect timestamps. - * Note that this really has to be here and not later for PAWS - * (RFC1323) to work. - */ - if (tcp_fast_parse_options(sk, th, tp)) { - /* NOTE: assumes saw_tstamp is never set if we didn't - * negotiate the option. tcp_fast_parse_options() must - * guarantee this. - */ - if (tp->saw_tstamp) { - if (tcp_paws_discard(tp, th, len)) { - tcp_statistics.TcpInErrs++; - if (!th->rst) { - tcp_send_ack(sk); - goto discard; - } - } - tcp_replace_ts_recent(sk, tp, - TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq); - } - } - - /* The silly FIN test here is necessary to see an advancing ACK in - * retransmitted FIN frames properly. Consider the following sequence: - * - * host1 --> host2 FIN XSEQ:XSEQ(0) ack YSEQ - * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ - * host1 --> host2 XSEQ:XSEQ(0) ack YSEQ+1 - * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ+1 (fails tcp_sequence test) - * - * At this point the connection will deadlock with host1 believing - * that his FIN is never ACK'd, and thus it will retransmit it's FIN - * forever. The following fix is from Taral (taral@taral.net). - */ - - /* step 1: check sequence number */ - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq) && - !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)) { - if (!th->rst) { - tcp_send_ack(sk); - } - goto discard; - } - - /* step 2: check RST bit */ - if(th->rst) { - tcp_reset(sk); - goto discard; - } - - /* step 3: check security and precedence [ignored] */ - - /* step 4: - * - * Check for a SYN, and ensure it matches the SYN we were - * first sent. We have to handle the rather unusual (but valid) - * sequence that KA9Q derived products may generate of - * - * SYN - * SYN|ACK Data - * ACK (lost) - * SYN|ACK Data + More Data - * .. we must ACK not RST... - * - * We keep syn_seq as the sequence space occupied by the - * original syn. 
- */ - - if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) { - tcp_reset(sk); - return 1; - } - - /* step 5: check the ACK field */ - if (th->ack) { - int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->ack_seq, len); - - switch(sk->state) { - case TCP_SYN_RECV: - if (acceptable) { - tcp_set_state(sk, TCP_ESTABLISHED); - sk->dport = th->source; - tp->copied_seq = tp->rcv_nxt; - - if(!sk->dead) - sk->state_change(sk); - - tp->snd_una = TCP_SKB_CB(skb)->ack_seq; - tp->snd_wnd = htons(th->window) << tp->snd_wscale; - tp->snd_wl1 = TCP_SKB_CB(skb)->seq; - tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; - - } else { - SOCK_DEBUG(sk, "bad ack\n"); - return 1; - } - break; - - case TCP_FIN_WAIT1: - if (tp->snd_una == tp->write_seq) { - sk->shutdown |= SEND_SHUTDOWN; - tcp_set_state(sk, TCP_FIN_WAIT2); - if (!sk->dead) - sk->state_change(sk); - else - tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); - } - break; - - case TCP_CLOSING: - if (tp->snd_una == tp->write_seq) { - tcp_time_wait(sk); - goto discard; - } - break; - - case TCP_LAST_ACK: - if (tp->snd_una == tp->write_seq) { - sk->shutdown = SHUTDOWN_MASK; - tcp_set_state(sk,TCP_CLOSE); - if (!sk->dead) - sk->state_change(sk); - goto discard; - } - break; - } - } else - goto discard; - -step6: - /* step 6: check the URG bit */ - tcp_urg(sk, th, len); - - /* step 7: process the segment text */ - switch (sk->state) { - case TCP_CLOSE_WAIT: - case TCP_CLOSING: - if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq)) - break; - - case TCP_FIN_WAIT1: - case TCP_FIN_WAIT2: - /* RFC 793 says to queue data in these states, - * RFC 1122 says we MUST send a reset. - * BSD 4.4 also does reset. - */ - if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) { - if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { - tcp_reset(sk); - return 1; - } - } - - case TCP_ESTABLISHED: - queued = tcp_data(skb, sk, len); - - /* This must be after tcp_data() does the skb_pull() to - * remove the header size from skb->len. - */ - tcp_measure_rcv_mss(sk, skb); - break; - } - - tcp_data_snd_check(sk); - tcp_ack_snd_check(sk); - - if (!queued) { -discard: - kfree_skb(skb); - } - return 0; -} diff --git a/pfinet.old/linux-src/net/ipv4/tcp_output.c~ b/pfinet.old/linux-src/net/ipv4/tcp_output.c~ deleted file mode 100644 index df6d48f2..00000000 --- a/pfinet.old/linux-src/net/ipv4/tcp_output.c~ +++ /dev/null @@ -1,1150 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * Implementation of the Transmission Control Protocol(TCP). - * - * Version: $Id: tcp_output.c,v 1.108.2.1 1999/05/14 23:07:36 davem Exp $ - * - * Authors: Ross Biro, - * Fred N. van Kempen, - * Mark Evans, - * Corey Minyard - * Florian La Roche, - * Charles Hedrick, - * Linus Torvalds, - * Alan Cox, - * Matthew Dillon, - * Arnt Gulbrandsen, - * Jorge Cwik, - */ - -/* - * Changes: Pedro Roque : Retransmit queue handled by TCP. - * : Fragmentation on mtu decrease - * : Segment collapse on retransmit - * : AF independence - * - * Linus Torvalds : send_delayed_ack - * David S. Miller : Charge memory using the right skb - * during syn/ack processing. - * David S. Miller : Output engine completely rewritten. - * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. 
- * - */ - -#include - -extern int sysctl_tcp_timestamps; -extern int sysctl_tcp_window_scaling; -extern int sysctl_tcp_sack; - -/* People can turn this off for buggy TCP's found in printers etc. */ -int sysctl_tcp_retrans_collapse = 1; - -/* Get rid of any delayed acks, we sent one already.. */ -static __inline__ void clear_delayed_acks(struct sock * sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - tp->delayed_acks = 0; - if(tcp_in_quickack_mode(tp)) - tcp_exit_quickack_mode(tp); - tcp_clear_xmit_timer(sk, TIME_DACK); -} - -static __inline__ void update_send_head(struct sock *sk) -{ - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - - tp->send_head = tp->send_head->next; - if (tp->send_head == (struct sk_buff *) &sk->write_queue) - tp->send_head = NULL; -} - -/* This routine actually transmits TCP packets queued in by - * tcp_do_sendmsg(). This is used by both the initial - * transmission and possible later retransmissions. - * All SKB's seen here are completely headerless. It is our - * job to build the TCP header, and pass the packet down to - * IP so it can do the same plus pass the packet off to the - * device. - * - * We are working here with either a clone of the original - * SKB, or a fresh unique copy made by the retransmit engine. - */ -void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) -{ - if(skb != NULL) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - int tcp_header_size = tp->tcp_header_len; - struct tcphdr *th; - int sysctl_flags; - -#define SYSCTL_FLAG_TSTAMPS 0x1 -#define SYSCTL_FLAG_WSCALE 0x2 -#define SYSCTL_FLAG_SACK 0x4 - - sysctl_flags = 0; - if(tcb->flags & TCPCB_FLAG_SYN) { - tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; - if(sysctl_tcp_timestamps) { - tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_TSTAMPS; - } - if(sysctl_tcp_window_scaling) { - tcp_header_size += TCPOLEN_WSCALE_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_WSCALE; - } - if(sysctl_tcp_sack) { - sysctl_flags |= SYSCTL_FLAG_SACK; - if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) - tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; - } - } else if(tp->sack_ok && tp->num_sacks) { - /* A SACK is 2 pad bytes, a 2 byte header, plus - * 2 32-bit sequence numbers for each SACK block. - */ - tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + - (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); - } - th = (struct tcphdr *) skb_push(skb, tcp_header_size); - skb->h.th = th; - skb_set_owner_w(skb, sk); - - /* Build TCP header and checksum it. */ - th->source = sk->sport; - th->dest = sk->dport; - th->seq = htonl(TCP_SKB_CB(skb)->seq); - th->ack_seq = htonl(tp->rcv_nxt); - th->doff = (tcp_header_size >> 2); - th->res1 = 0; - *(((__u8 *)th) + 13) = tcb->flags; - if(!(tcb->flags & TCPCB_FLAG_SYN)) - th->window = htons(tcp_select_window(sk)); - th->check = 0; - th->urg_ptr = ntohs(tcb->urg_ptr); - if(tcb->flags & TCPCB_FLAG_SYN) { - /* RFC1323: The window in SYN & SYN/ACK segments - * is never scaled. 
- */ - th->window = htons(tp->rcv_wnd); - tcp_syn_build_options((__u32 *)(th + 1), tp->mss_clamp, - (sysctl_flags & SYSCTL_FLAG_TSTAMPS), - (sysctl_flags & SYSCTL_FLAG_SACK), - (sysctl_flags & SYSCTL_FLAG_WSCALE), - tp->rcv_wscale, - TCP_SKB_CB(skb)->when, - tp->ts_recent); - } else { - tcp_build_and_update_options((__u32 *)(th + 1), - tp, TCP_SKB_CB(skb)->when); - } - tp->af_specific->send_check(sk, th, skb->len, skb); - - clear_delayed_acks(sk); - tp->last_ack_sent = tp->rcv_nxt; - tcp_statistics.TcpOutSegs++; - tp->af_specific->queue_xmit(skb); - } -#undef SYSCTL_FLAG_TSTAMPS -#undef SYSCTL_FLAG_WSCALE -#undef SYSCTL_FLAG_SACK -} - -/* This is the main buffer sending routine. We queue the buffer - * and decide whether to queue or transmit now. - */ -void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* Advance write_seq and place onto the write_queue. */ - tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq); - __skb_queue_tail(&sk->write_queue, skb); - - if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) { - /* Send it out now. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); - if(!tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } else { - /* Queue it, remembering where we must start sending. */ - if (tp->send_head == NULL) - tp->send_head = skb; - if (!force_queue && tp->packets_out == 0 && !tp->pending) { - tp->pending = TIME_PROBE0; - tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); - } - } -} - -/* Function to create two new TCP segments. Shrinks the given segment - * to the specified size and appends a new segment with the rest of the - * packet to the list. This won't be called frequently, I hope. - * Remember, these are still headerless SKBs at this point. - */ -static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) -{ - struct sk_buff *buff; - int nsize = skb->len - len; - u16 flags; - - /* Get a new skb... force flag on. */ - buff = sock_wmalloc(sk, - (nsize + MAX_HEADER + sk->prot->max_header), - 1, GFP_ATOMIC); - if (buff == NULL) - return -1; /* We'll just try again later. */ - - /* Reserve space for headers. */ - skb_reserve(buff, MAX_HEADER + sk->prot->max_header); - - /* Correct the sequence numbers. */ - TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; - TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; - - /* PSH and FIN should only be set in the second packet. */ - flags = TCP_SKB_CB(skb)->flags; - TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); - if(flags & TCPCB_FLAG_URG) { - u16 old_urg_ptr = TCP_SKB_CB(skb)->urg_ptr; - - /* Urgent data is always a pain in the ass. */ - if(old_urg_ptr > len) { - TCP_SKB_CB(skb)->flags &= ~(TCPCB_FLAG_URG); - TCP_SKB_CB(skb)->urg_ptr = 0; - TCP_SKB_CB(buff)->urg_ptr = old_urg_ptr - len; - } else { - flags &= ~(TCPCB_FLAG_URG); - } - } - if(!(flags & TCPCB_FLAG_URG)) - TCP_SKB_CB(buff)->urg_ptr = 0; - TCP_SKB_CB(buff)->flags = flags; - TCP_SKB_CB(buff)->sacked = 0; - - /* Copy and checksum data tail into the new buffer. */ - buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize), - nsize, 0); - - /* This takes care of the FIN sequence number too. */ - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; - skb_trim(skb, len); - - /* Rechecksum original buffer. 
*/ - skb->csum = csum_partial(skb->data, skb->len, 0); - - /* Looks stupid, but our code really uses when of - * skbs, which it never sent before. --ANK - */ - TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; - - /* Link BUFF into the send queue. */ - __skb_append(skb, buff); - - return 0; -} - -/* This function synchronize snd mss to current pmtu/exthdr set. - - tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT counts - for TCP options, but includes only bare TCP header. - - tp->mss_clamp is mss negotiated at connection setup. - It is minumum of user_mss and mss received with SYN. - It also does not include TCP options. - - tp->pmtu_cookie is last pmtu, seen by this function. - - tp->mss_cache is current effective sending mss, including - all tcp options except for SACKs. It is evaluated, - taking into account current pmtu, but never exceeds - tp->mss_clamp. - - NOTE1. rfc1122 clearly states that advertised MSS - DOES NOT include either tcp or ip options. - - NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside - this function. --ANK (980731) - */ - -int tcp_sync_mss(struct sock *sk, u32 pmtu) -{ - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - int mss_now; - - /* Calculate base mss without TCP options: - It is MMS_S - sizeof(tcphdr) of rfc1122 - */ - mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); - - /* Clamp it (mss_clamp does not include tcp options) */ - if (mss_now > tp->mss_clamp) - mss_now = tp->mss_clamp; - - /* Now subtract TCP options size, not including SACKs */ - mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); - - /* Now subtract optional transport overhead */ - mss_now -= tp->ext_header_len; - - /* It we got too small (or even negative) value, - clamp it by 8 from below. Why 8 ? - Well, it could be 1 with the same success, - but if IP accepted segment of length 1, - it would love 8 even more 8) --ANK (980731) - */ - if (mss_now < 8) - mss_now = 8; - - /* And store cached results */ - tp->pmtu_cookie = pmtu; - tp->mss_cache = mss_now; - return mss_now; -} - - -/* This routine writes packets to the network. It advances the - * send_head. This happens as incoming acks open up the remote - * window for us. - */ -void tcp_write_xmit(struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - unsigned int mss_now; - - /* Account for SACKS, we may need to fragment due to this. - * It is just like the real MSS changing on us midstream. - * We also handle things correctly when the user adds some - * IP options mid-stream. Silly to do, but cover it. - */ - mss_now = tcp_current_mss(sk); - - /* If we are zapped, the bytes will have to remain here. - * In time closedown will empty the write queue and all - * will be happy. - */ - if(!sk->zapped) { - struct sk_buff *skb; - int sent_pkts = 0; - - /* Anything on the transmit queue that fits the window can - * be added providing we are: - * - * a) following SWS avoidance [and Nagle algorithm] - * b) not exceeding our congestion window. - * c) not retransmitting [Nagle] - */ - while((skb = tp->send_head) && tcp_snd_test(sk, skb)) { - if (skb->len > mss_now) { - if (tcp_fragment(sk, skb, mss_now)) - break; - } - - /* Advance the send_head. This one is going out. */ - update_send_head(sk); - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); - sent_pkts = 1; - } - - /* If we sent anything, make sure the retransmit - * timer is active. 
- */ - if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } -} - -/* This function returns the amount that we can raise the - * usable window based on the following constraints - * - * 1. The window can never be shrunk once it is offered (RFC 793) - * 2. We limit memory per socket - * - * RFC 1122: - * "the suggested [SWS] avoidance algorithm for the receiver is to keep - * RECV.NEXT + RCV.WIN fixed until: - * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" - * - * i.e. don't raise the right edge of the window until you can raise - * it at least MSS bytes. - * - * Unfortunately, the recommended algorithm breaks header prediction, - * since header prediction assumes th->window stays fixed. - * - * Strictly speaking, keeping th->window fixed violates the receiver - * side SWS prevention criteria. The problem is that under this rule - * a stream of single byte packets will cause the right side of the - * window to always advance by a single byte. - * - * Of course, if the sender implements sender side SWS prevention - * then this will not be a problem. - * - * BSD seems to make the following compromise: - * - * If the free space is less than the 1/4 of the maximum - * space available and the free space is less than 1/2 mss, - * then set the window to 0. - * Otherwise, just prevent the window from shrinking - * and from being larger than the largest representable value. - * - * This prevents incremental opening of the window in the regime - * where TCP is limited by the speed of the reader side taking - * data out of the TCP receive queue. It does nothing about - * those cases where the window is constrained on the sender side - * because the pipeline is full. - * - * BSD also seems to "accidentally" limit itself to windows that are a - * multiple of MSS, at least until the free space gets quite small. - * This would appear to be a side effect of the mbuf implementation. - * Combining these two algorithms results in the observed behavior - * of having a fixed window size at almost all times. - * - * Below we obtain similar behavior by forcing the offered window to - * a multiple of the mss when it is feasible to do so. - * - * Note, we don't "adjust" for TIMESTAMP or SACK option bytes. - */ -u32 __tcp_select_window(struct sock *sk) -{ - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - unsigned int mss = tp->mss_cache; - int free_space; - u32 window; - - /* Sometimes free_space can be < 0. */ - free_space = (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / 2; - if (tp->window_clamp) { - if (free_space > ((int) tp->window_clamp)) - free_space = tp->window_clamp; - mss = min(tp->window_clamp, mss); - } else { - printk("tcp_select_window: tp->window_clamp == 0.\n"); - } - - if (mss < 1) { - mss = 1; - printk("tcp_select_window: sk->mss fell to 0.\n"); - } - - if ((free_space < (sk->rcvbuf/4)) && (free_space < ((int) (mss/2)))) { - window = 0; - tp->pred_flags = 0; - } else { - /* Get the largest window that is a nice multiple of mss. - * Window clamp already applied above. - * If our current window offering is within 1 mss of the - * free space we just keep it. This prevents the divide - * and multiply from happening most of the time. - * We also don't do any window rounding when the free space - * is too small. 
- */ - window = tp->rcv_wnd; - if ((((int) window) <= (free_space - ((int) mss))) || - (((int) window) > free_space)) - window = (((unsigned int) free_space)/mss)*mss; - } - return window; -} - -/* Attempt to collapse two adjacent SKB's during retransmission. */ -static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now) -{ - struct sk_buff *next_skb = skb->next; - - /* The first test we must make is that neither of these two - * SKB's are still referenced by someone else. - */ - if(!skb_cloned(skb) && !skb_cloned(next_skb)) { - int skb_size = skb->len, next_skb_size = next_skb->len; - u16 flags = TCP_SKB_CB(skb)->flags; - - /* Punt if the first SKB has URG set. */ - if(flags & TCPCB_FLAG_URG) - return; - - /* Also punt if next skb has been SACK'd. */ - if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) - return; - - /* Punt if not enough space exists in the first SKB for - * the data in the second, or the total combined payload - * would exceed the MSS. - */ - if ((next_skb_size > skb_tailroom(skb)) || - ((skb_size + next_skb_size) > mss_now)) - return; - - /* Ok. We will be able to collapse the packet. */ - __skb_unlink(next_skb, next_skb->list); - - if(skb->len % 4) { - /* Must copy and rechecksum all data. */ - memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); - skb->csum = csum_partial(skb->data, skb->len, 0); - } else { - /* Optimize, actually we could also combine next_skb->csum - * to skb->csum using a single add w/carry operation too. - */ - skb->csum = csum_partial_copy(next_skb->data, - skb_put(skb, next_skb_size), - next_skb_size, skb->csum); - } - - /* Update sequence range on original skb. */ - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; - - /* Merge over control information. */ - flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ - if(flags & TCPCB_FLAG_URG) { - u16 urgptr = TCP_SKB_CB(next_skb)->urg_ptr; - TCP_SKB_CB(skb)->urg_ptr = urgptr + skb_size; - } - TCP_SKB_CB(skb)->flags = flags; - - /* All done, get rid of second SKB and account for it so - * packet counting does not break. - */ - kfree_skb(next_skb); - sk->tp_pinfo.af_tcp.packets_out--; - } -} - -/* Do a simple retransmit without using the backoff mechanisms in - * tcp_timer. This is used for path mtu discovery. - * The socket is already locked here. - */ -void tcp_simple_retransmit(struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff *skb, *old_next_skb; - unsigned int mss = tcp_current_mss(sk); - - /* Don't muck with the congestion window here. */ - tp->dup_acks = 0; - tp->high_seq = tp->snd_nxt; - tp->retrans_head = NULL; - - /* Input control flow will see that this was retransmitted - * and not use it for RTT calculation in the absence of - * the timestamp option. - */ - for (old_next_skb = skb = skb_peek(&sk->write_queue); - ((skb != tp->send_head) && - (skb != (struct sk_buff *)&sk->write_queue)); - skb = skb->next) { - int resend_skb = 0; - - /* Our goal is to push out the packets which we - * sent already, but are being chopped up now to - * account for the PMTU information we have. - * - * As we resend the queue, packets are fragmented - * into two pieces, and when we try to send the - * second piece it may be collapsed together with - * a subsequent packet, and so on. 
-DaveM - */ - if (old_next_skb != skb || skb->len > mss) - resend_skb = 1; - old_next_skb = skb->next; - if (resend_skb != 0) - tcp_retransmit_skb(sk, skb); - } -} - -static __inline__ void update_retrans_head(struct sock *sk) -{ - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - - tp->retrans_head = tp->retrans_head->next; - if((tp->retrans_head == tp->send_head) || - (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) { - tp->retrans_head = NULL; - tp->rexmt_done = 1; - } -} - -/* This retransmits one SKB. Policy decisions and retransmit queue - * state updates are done by the caller. Returns non-zero if an - * error occurred which prevented the send. - */ -int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - unsigned int cur_mss = tcp_current_mss(sk); - - if(skb->len > cur_mss) { - if(tcp_fragment(sk, skb, cur_mss)) - return 1; /* We'll try again later. */ - - /* New SKB created, account for it. */ - tp->packets_out++; - } - - /* Collapse two adjacent packets if worthwhile and we can. */ - if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && - (skb->len < (cur_mss >> 1)) && - (skb->next != tp->send_head) && - (skb->next != (struct sk_buff *)&sk->write_queue) && - (sysctl_tcp_retrans_collapse != 0)) - tcp_retrans_try_collapse(sk, skb, cur_mss); - - if(tp->af_specific->rebuild_header(sk)) - return 1; /* Routing failure or similar. */ - - /* Some Solaris stacks overoptimize and ignore the FIN on a - * retransmit when old data is attached. So strip it off - * since it is cheap to do so and saves bytes on the network. - */ - if(skb->len > 0 && - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && - tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { - TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; - skb_trim(skb, 0); - skb->csum = 0; - } - - /* Ok, we're gonna send it out, update state. */ - TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS; - tp->retrans_out++; - - /* Make a copy, if the first transmission SKB clone we made - * is still in somebody's hands, else make a clone. - */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; - if(skb_cloned(skb)) - skb = skb_copy(skb, GFP_ATOMIC); - else - skb = skb_clone(skb, GFP_ATOMIC); - - tcp_transmit_skb(sk, skb); - - /* Update global TCP statistics and return success. */ - sk->prot->retransmits++; - tcp_statistics.TcpRetransSegs++; - - return 0; -} - -/* This gets called after a retransmit timeout, and the initially - * retransmitted data is acknowledged. It tries to continue - * resending the rest of the retransmit queue, until either - * we've sent it all or the congestion window limit is reached. - * If doing SACK, the first ACK which comes back for a timeout - * based retransmit packet might feed us FACK information again. - * If so, we use it to avoid unnecessarily retransmissions. - */ -void tcp_xmit_retransmit_queue(struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff *skb; - - if (tp->retrans_head == NULL && - tp->rexmt_done == 0) - tp->retrans_head = skb_peek(&sk->write_queue); - if (tp->retrans_head == tp->send_head) - tp->retrans_head = NULL; - - /* Each time, advance the retrans_head if we got - * a packet out or we skipped one because it was - * SACK'd. -DaveM - */ - while ((skb = tp->retrans_head) != NULL) { - /* If it has been ack'd by a SACK block, we don't - * retransmit it. - */ - if(!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { - /* Send it out, punt if error occurred. 
*/ - if(tcp_retransmit_skb(sk, skb)) - break; - - update_retrans_head(sk); - - /* Stop retransmitting if we've hit the congestion - * window limit. - */ - if (tp->retrans_out >= tp->snd_cwnd) - break; - } else { - update_retrans_head(sk); - } - } -} - -/* Using FACK information, retransmit all missing frames at the receiver - * up to the forward most SACK'd packet (tp->fackets_out) if the packet - * has not been retransmitted already. - */ -void tcp_fack_retransmit(struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff *skb = skb_peek(&sk->write_queue); - int packet_cnt = 0; - - while((skb != NULL) && - (skb != tp->send_head) && - (skb != (struct sk_buff *)&sk->write_queue)) { - __u8 sacked = TCP_SKB_CB(skb)->sacked; - - if(sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS)) - goto next_packet; - - /* Ok, retransmit it. */ - if(tcp_retransmit_skb(sk, skb)) - break; - - if(tcp_packets_in_flight(tp) >= tp->snd_cwnd) - break; -next_packet: - packet_cnt++; - if(packet_cnt >= tp->fackets_out) - break; - skb = skb->next; - } -} - -/* Send a fin. The caller locks the socket for us. This cannot be - * allowed to fail queueing a FIN frame under any circumstances. - */ -void tcp_send_fin(struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff *skb = skb_peek_tail(&sk->write_queue); - unsigned int mss_now; - - /* Optimization, tack on the FIN if we have a queue of - * unsent frames. But be careful about outgoing SACKS - * and IP options. - */ - mss_now = tcp_current_mss(sk); - - if((tp->send_head != NULL) && (skb->len < mss_now)) { - /* tcp_write_xmit() takes care of the rest. */ - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; - TCP_SKB_CB(skb)->end_seq++; - tp->write_seq++; - - /* Special case to avoid Nagle bogosity. If this - * segment is the last segment, and it was queued - * due to Nagle/SWS-avoidance, send it out now. - */ - if(tp->send_head == skb && - !sk->nonagle && - skb->len < (tp->mss_cache >> 1) && - tp->packets_out && - !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) { - update_send_head(sk); - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); - if(!tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } - } else { - /* Socket is locked, keep trying until memory is available. */ - do { - skb = sock_wmalloc(sk, - (MAX_HEADER + - sk->prot->max_header), - 1, GFP_KERNEL); - } while (skb == NULL); - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); - skb->csum = 0; - TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); - TCP_SKB_CB(skb)->sacked = 0; - TCP_SKB_CB(skb)->urg_ptr = 0; - - /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */ - TCP_SKB_CB(skb)->seq = tp->write_seq; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; - tcp_send_skb(sk, skb, 0); - } -} - -/* We get here when a process closes a file descriptor (either due to - * an explicit close() or as a byproduct of exit()'ing) and there - * was unread data in the receive queue. This behavior is recommended - * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM - */ -void tcp_send_active_reset(struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff *skb; - - /* NOTE: No TCP options attached and we never retransmit this. 
*/ - skb = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_KERNEL); - if (!skb) - return; - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); - skb->csum = 0; - TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); - TCP_SKB_CB(skb)->sacked = 0; - TCP_SKB_CB(skb)->urg_ptr = 0; - - /* Send it off. */ - TCP_SKB_CB(skb)->seq = tp->write_seq; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_transmit_skb(sk, skb); -} - -/* WARNING: This routine must only be called when we have already sent - * a SYN packet that crossed the incoming SYN that caused this routine - * to get called. If this assumption fails then the initial rcv_wnd - * and rcv_wscale values will not be correct. - */ -int tcp_send_synack(struct sock *sk) -{ - struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff* skb; - - skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), - 1, GFP_ATOMIC); - if (skb == NULL) - return -ENOMEM; - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); - skb->csum = 0; - TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN); - TCP_SKB_CB(skb)->sacked = 0; - TCP_SKB_CB(skb)->urg_ptr = 0; - - /* SYN eats a sequence byte. */ - TCP_SKB_CB(skb)->seq = tp->snd_una; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; - __skb_queue_tail(&sk->write_queue, skb); - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); - return 0; -} - -/* - * Prepare a SYN-ACK. - */ -struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, - struct open_request *req, int mss) -{ - struct tcphdr *th; - int tcp_header_size; - struct sk_buff *skb; - - skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC); - if (skb == NULL) - return NULL; - - /* Reserve space for headers. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); - - skb->dst = dst_clone(dst); - - /* Don't offer more than they did. - * This way we don't have to memorize who said what. - * FIXME: maybe this should be changed for better performance - * with syncookies. - */ - req->mss = min(mss, req->mss); - if (req->mss < 8) { - printk(KERN_DEBUG "initial req->mss below 8\n"); - req->mss = 8; - } - - tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + - (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + - /* SACK_PERM is in the place of NOP NOP of TS */ - ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0)); - skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size); - - memset(th, 0, sizeof(struct tcphdr)); - th->syn = 1; - th->ack = 1; - th->source = sk->sport; - th->dest = req->rmt_port; - TCP_SKB_CB(skb)->seq = req->snt_isn; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; - th->seq = htonl(TCP_SKB_CB(skb)->seq); - th->ack_seq = htonl(req->rcv_isn + 1); - if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ - __u8 rcv_wscale; - /* Set this up on the first call only */ - req->window_clamp = skb->dst->window; - tcp_select_initial_window(sock_rspace(sk)/2,req->mss, - &req->rcv_wnd, - &req->window_clamp, - req->wscale_ok, - &rcv_wscale); - req->rcv_wscale = rcv_wscale; - } - - /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. 
*/ - th->window = htons(req->rcv_wnd); - - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_syn_build_options((__u32 *)(th + 1), req->mss, req->tstamp_ok, - req->sack_ok, req->wscale_ok, req->rcv_wscale, - TCP_SKB_CB(skb)->when, - req->ts_recent); - - skb->csum = 0; - th->doff = (tcp_header_size >> 2); - tcp_statistics.TcpOutSegs++; - return skb; -} - -void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu) -{ - struct dst_entry *dst = sk->dst_cache; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* Reserve space for headers. */ - skb_reserve(buff, MAX_HEADER + sk->prot->max_header); - - tp->snd_wnd = 0; - tp->snd_wl1 = 0; - tp->snd_wl2 = tp->write_seq; - tp->snd_una = tp->write_seq; - tp->rcv_nxt = 0; - - sk->err = 0; - - /* We'll fix this up when we get a response from the other end. - * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. - */ - tp->tcp_header_len = sizeof(struct tcphdr) + - (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); - - /* If user gave his TCP_MAXSEG, record it to clamp */ - if (tp->user_mss) - tp->mss_clamp = tp->user_mss; - tcp_sync_mss(sk, mtu); - - /* Now unpleasant action: if initial pmtu is too low - set lower clamp. I am not sure that it is good. - To be more exact, I do not think that clamping at value, which - is apparently transient and may improve in future is good idea. - It would be better to wait until peer will returns its MSS - (probably 65535 too) and now advertise something sort of 65535 - or at least first hop device mtu. Is it clear, what I mean? - We should tell peer what maximal mss we expect to RECEIVE, - it has nothing to do with pmtu. - I am afraid someone will be confused by such huge value. - --ANK (980731) - */ - if (tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr) < tp->mss_clamp ) - tp->mss_clamp = tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr); - - TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; - TCP_SKB_CB(buff)->sacked = 0; - TCP_SKB_CB(buff)->urg_ptr = 0; - buff->csum = 0; - TCP_SKB_CB(buff)->seq = tp->write_seq++; - TCP_SKB_CB(buff)->end_seq = tp->write_seq; - tp->snd_nxt = TCP_SKB_CB(buff)->end_seq; - - tp->window_clamp = dst->window; - tcp_select_initial_window(sock_rspace(sk)/2,tp->mss_clamp, - &tp->rcv_wnd, - &tp->window_clamp, - sysctl_tcp_window_scaling, - &tp->rcv_wscale); - /* Ok, now lock the socket before we make it visible to - * the incoming packet engine. - */ - lock_sock(sk); - - /* Socket identity change complete, no longer - * in TCP_CLOSE, so enter ourselves into the - * hash tables. - */ - tcp_set_state(sk,TCP_SYN_SENT); - sk->prot->hash(sk); - - tp->rto = dst->rtt; - tcp_init_xmit_timers(sk); - tp->retransmits = 0; - tp->fackets_out = 0; - tp->retrans_out = 0; - - /* Send it off. */ - __skb_queue_tail(&sk->write_queue, buff); - TCP_SKB_CB(buff)->when = tcp_time_stamp; - tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); - tcp_statistics.TcpActiveOpens++; - - /* Timer for repeating the SYN until an answer. */ - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - - /* Now, it is safe to release the socket. */ - release_sock(sk); -} - -/* Send out a delayed ack, the caller does the policy checking - * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() - * for details. 
- */ -void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout) -{ - unsigned long timeout; - - /* Stay within the limit we were given */ - timeout = tp->ato; - if (timeout > max_timeout) - timeout = max_timeout; - timeout += jiffies; - - /* Use new timeout only if there wasn't a older one earlier. */ - if (!tp->delack_timer.prev) { - tp->delack_timer.expires = timeout; - add_timer(&tp->delack_timer); - } else { - if (time_before(timeout, tp->delack_timer.expires)) - mod_timer(&tp->delack_timer, timeout); - } -} - -/* This routine sends an ack and also updates the window. */ -void tcp_send_ack(struct sock *sk) -{ - char *str1 = "pfinet tcp_send_ack check point 1\n"; - char *str2 = "pfinet tcp_send_ack check point 2\n"; - int stderr_fd = fileno (stderr); - /* If we have been reset, we may not send again. */ - if(!sk->zapped) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff *buff; - - /* We are not putting this on the write queue, so - * tcp_transmit_skb() will set the ownership to this - * sock. - */ - buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC); - if (buff == NULL) { - /* Force it to send an ack. We don't have to do this - * (ACK is unreliable) but it's much better use of - * bandwidth on slow links to send a spare ack than - * resend packets. - * - * This is the one possible way that we can delay an - * ACK and have tp->ato indicate that we are in - * quick ack mode, so clear it. - */ - if(tcp_in_quickack_mode(tp)) - tcp_exit_quickack_mode(tp); - tcp_send_delayed_ack(tp, HZ/2); - return; - } - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(buff, MAX_HEADER + sk->prot->max_header); - buff->csum = 0; - TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; - TCP_SKB_CB(buff)->sacked = 0; - TCP_SKB_CB(buff)->urg_ptr = 0; - - /* Send it off, this clears delayed acks for us. */ - TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt; - TCP_SKB_CB(buff)->when = tcp_time_stamp; - write (stderr_fd, str1, strlen (str1) + 1); - fflush (stderr); - tcp_transmit_skb(sk, buff); - write (stderr_fd, str2, strlen (str2) + 1); - fflush (stderr); - } -} - -/* This routine sends a packet with an out of date sequence - * number. It assumes the other end will try to ack it. - */ -void tcp_write_wakeup(struct sock *sk) -{ - /* After a valid reset we can send no more. */ - if (!sk->zapped) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff *skb; - - /* Write data can still be transmitted/retransmitted in the - * following states. If any other state is encountered, return. - * [listen/close will never occur here anyway] - */ - if ((1 << sk->state) & - ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1| - TCPF_LAST_ACK|TCPF_CLOSING)) - return; - - if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) && - ((skb = tp->send_head) != NULL)) { - unsigned long win_size; - - /* We are probing the opening of a window - * but the window size is != 0 - * must have been a result SWS avoidance ( sender ) - */ - win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); - if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { - if (tcp_fragment(sk, skb, win_size)) - return; /* Let a retransmit get it. 
*/ - } - update_send_head(sk); - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); - if (!tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } else { - /* We don't queue it, tcp_transmit_skb() sets ownership. */ - skb = alloc_skb(MAX_HEADER + sk->prot->max_header, - GFP_ATOMIC); - if (skb == NULL) - return; - - /* Reserve space for headers and set control bits. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); - skb->csum = 0; - TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; - TCP_SKB_CB(skb)->sacked = 0; - TCP_SKB_CB(skb)->urg_ptr = 0; - - /* Use a previous sequence. This should cause the other - * end to send an ack. Don't queue or clone SKB, just - * send it. - */ - TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_transmit_skb(sk, skb); - } - } -} - -/* A window probe timeout has occurred. If window is not closed send - * a partial packet else a zero probe. - */ -void tcp_send_probe0(struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - tcp_write_wakeup(sk); - tp->pending = TIME_PROBE0; - tp->backoff++; - tp->probes_out++; - tcp_reset_xmit_timer (sk, TIME_PROBE0, - min(tp->rto << tp->backoff, 120*HZ)); -} -- cgit v1.2.3