Diffstat (limited to 'pfinet/linux-src/net')
71 files changed, 48290 insertions, 0 deletions
diff --git a/pfinet/linux-src/net/core/Makefile b/pfinet/linux-src/net/core/Makefile new file mode 100644 index 00000000..5df65cd2 --- /dev/null +++ b/pfinet/linux-src/net/core/Makefile @@ -0,0 +1,41 @@ +# +# Makefile for the Linux networking core. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +O_TARGET := core.o + +O_OBJS := sock.o skbuff.o iovec.o datagram.o scm.o + +ifeq ($(CONFIG_SYSCTL),y) +ifeq ($(CONFIG_NET),y) +O_OBJS += sysctl_net_core.o +endif +endif + +ifdef CONFIG_FILTER +O_OBJS += filter.o +endif + +ifdef CONFIG_NET + +O_OBJS += dev.o dev_mcast.o dst.o neighbour.o rtnetlink.o utils.o + +ifdef CONFIG_FIREWALL +OX_OBJS += firewall.o +endif + +endif + +ifdef CONFIG_NET_PROFILE +OX_OBJS += profile.o +endif + +include $(TOPDIR)/Rules.make + +tar: + tar -cvf /dev/f1 . diff --git a/pfinet/linux-src/net/core/datagram.c b/pfinet/linux-src/net/core/datagram.c new file mode 100644 index 00000000..9bb68fa4 --- /dev/null +++ b/pfinet/linux-src/net/core/datagram.c @@ -0,0 +1,249 @@ +/* + * SUCS NET3: + * + * Generic datagram handling routines. These are generic for all protocols. Possibly a generic IP version on top + * of these would make sense. Not tonight however 8-). + * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and NetROM layer all have identical poll code and mostly + * identical recvmsg() code. So we share it here. The poll was shared before but buried in udp.c so I moved it. + * + * Authors: Alan Cox <alan@redhat.com>. (datagram_poll() from old udp.c code) + * + * Fixes: + * Alan Cox : NULL return from skb_peek_copy() understood + * Alan Cox : Rewrote skb_read_datagram to avoid the skb_peek_copy stuff. + * Alan Cox : Added support for SOCK_SEQPACKET. IPX can no longer use the SO_TYPE hack but + * AX.25 now works right, and SPX is feasible. + * Alan Cox : Fixed write poll of non IP protocol crash. + * Florian La Roche: Changed for my new skbuff handling. + * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET. + * Linus Torvalds : BSD semantic fixes. + * Alan Cox : Datagram iovec handling + * Darryl Miles : Fixed non-blocking SOCK_STREAM. + * Alan Cox : POSIXisms + * + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/poll.h> + +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/skbuff.h> +#include <net/sock.h> + + +/* + * Wait for a packet.. + * + * Interrupts off so that no packet arrives before we begin sleeping. + * Otherwise we might miss our wake up + */ + +static inline void wait_for_packet(struct sock * sk) +{ + struct wait_queue wait = { current, NULL }; + + add_wait_queue(sk->sleep, &wait); + current->state = TASK_INTERRUPTIBLE; + + if (skb_peek(&sk->receive_queue) == NULL) + schedule(); + + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); +} + +/* + * Is a socket 'connection oriented' ? 
+ */ + +static inline int connection_based(struct sock *sk) +{ + return (sk->type==SOCK_SEQPACKET || sk->type==SOCK_STREAM); +} + +/* + * Get a datagram skbuff, understands the peeking, nonblocking wakeups and possible + * races. This replaces identical code in packet,raw and udp, as well as the IPX + * AX.25 and Appletalk. It also finally fixes the long standing peek and read + * race for datagram sockets. If you alter this routine remember it must be + * re-entrant. + * + * This function will lock the socket if a skb is returned, so the caller + * needs to unlock the socket in that case (usually by calling skb_free_datagram) + * + * * It does not lock socket since today. This function is + * * free of race conditions. This measure should/can improve + * * significantly datagram socket latencies at high loads, + * * when data copying to user space takes lots of time. + * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet + * * 8) Great win.) + * * --ANK (980729) + * + * The order of the tests when we find no data waiting are specified + * quite explicitly by POSIX 1003.1g, don't change them without having + * the standard around please. + */ + +struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err) +{ + int error; + struct sk_buff *skb; + + /* Caller is allowed not to check sk->err before skb_recv_datagram() */ + error = sock_error(sk); + if (error) + goto no_packet; + +restart: + while(skb_queue_empty(&sk->receive_queue)) /* No data */ + { + /* Socket errors? */ + error = sock_error(sk); + if (error) + goto no_packet; + + /* Socket shut down? */ + if (sk->shutdown & RCV_SHUTDOWN) + goto no_packet; + + /* Sequenced packets can come disconnected. If so we report the problem */ + error = -ENOTCONN; + if(connection_based(sk) && sk->state!=TCP_ESTABLISHED) + goto no_packet; + + /* handle signals */ + error = -ERESTARTSYS; + if (signal_pending(current)) + goto no_packet; + + /* User doesn't want to wait */ + error = -EAGAIN; + if (noblock) + goto no_packet; + + wait_for_packet(sk); + } + + /* Again only user level code calls this function, so nothing interrupt level + will suddenly eat the receive_queue */ + if (flags & MSG_PEEK) + { + unsigned long cpu_flags; + + /* It is the only POTENTIAL race condition + in this function. skb may be stolen by + another receiver after peek, but before + incrementing use count, provided kernel + is reentearble (it is not) or this function + is called by interrupts. + + Protect it with global skb spinlock, + though for now even this is overkill. + --ANK (980728) + */ + spin_lock_irqsave(&skb_queue_lock, cpu_flags); + skb = skb_peek(&sk->receive_queue); + if(skb!=NULL) + atomic_inc(&skb->users); + spin_unlock_irqrestore(&skb_queue_lock, cpu_flags); + } else + skb = skb_dequeue(&sk->receive_queue); + + if (!skb) /* Avoid race if someone beats us to the data */ + goto restart; + return skb; + +no_packet: + *err = error; + return NULL; +} + +void skb_free_datagram(struct sock * sk, struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/* + * Copy a datagram to a linear buffer. + */ + +int skb_copy_datagram(struct sk_buff *skb, int offset, char *to, int size) +{ + int err = -EFAULT; + + if (!copy_to_user(to, skb->h.raw + offset, size)) + err = 0; + return err; +} + + +/* + * Copy a datagram to an iovec. + * Note: the iovec is modified during the copy. 
+ */ + +int skb_copy_datagram_iovec(struct sk_buff *skb, int offset, struct iovec *to, + int size) +{ + return memcpy_toiovec(to, skb->h.raw + offset, size); +} + +/* + * Datagram poll: Again totally generic. This also handles + * sequenced packet sockets providing the socket receive queue + * is only ever holding data ready to receive. + * + * Note: when you _don't_ use this routine for this protocol, + * and you use a different write policy from sock_writeable() + * then please supply your own write_space callback. + */ + +unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask; + + poll_wait(file, sk->sleep, wait); + mask = 0; + + /* exceptional events? */ + if (sk->err || !skb_queue_empty(&sk->error_queue)) + mask |= POLLERR; + if (sk->shutdown & RCV_SHUTDOWN) + mask |= POLLHUP; + + /* readable? */ + if (!skb_queue_empty(&sk->receive_queue)) + mask |= POLLIN | POLLRDNORM; + + /* Connection-based need to check for termination and startup */ + if (connection_based(sk)) { + if (sk->state==TCP_CLOSE) + mask |= POLLHUP; + /* connection hasn't started yet? */ + if (sk->state == TCP_SYN_SENT) + return mask; + } + + /* writable? */ + if (sock_writeable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + else + sk->socket->flags |= SO_NOSPACE; + + return mask; +} diff --git a/pfinet/linux-src/net/core/dev.c b/pfinet/linux-src/net/core/dev.c new file mode 100644 index 00000000..cc9584a1 --- /dev/null +++ b/pfinet/linux-src/net/core/dev.c @@ -0,0 +1,2026 @@ +/* + * NET3 Protocol independent device support routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Derived from the non IP parts of dev.c 1.0.19 + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * + * Additional Authors: + * Florian la Roche <rzsfl@rz.uni-sb.de> + * Alan Cox <gw4pts@gw4pts.ampr.org> + * David Hinds <dhinds@allegro.stanford.edu> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * Adam Sulmicki <adam@cfar.umd.edu> + * + * Changes: + * Marcelo Tosatti <marcelo@conectiva.com.br> : dont accept mtu 0 or < + * Alan Cox : device private ioctl copies fields back. + * Alan Cox : Transmit queue code does relevant stunts to + * keep the queue safe. + * Alan Cox : Fixed double lock. + * Alan Cox : Fixed promisc NULL pointer trap + * ???????? : Support the full private ioctl range + * Alan Cox : Moved ioctl permission check into drivers + * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI + * Alan Cox : 100 backlog just doesn't cut it when + * you start doing multicast video 8) + * Alan Cox : Rewrote net_bh and list manager. + * Alan Cox : Fix ETH_P_ALL echoback lengths. + * Alan Cox : Took out transmit every packet pass + * Saved a few bytes in the ioctl handler + * Alan Cox : Network driver sets packet type before calling netif_rx. Saves + * a function call a packet. + * Alan Cox : Hashed net_bh() + * Richard Kooijman: Timestamp fixes. + * Alan Cox : Wrong field in SIOCGIFDSTADDR + * Alan Cox : Device lock protection. + * Alan Cox : Fixed nasty side effect of device close changes. + * Rudi Cilibrasi : Pass the right thing to set_mac_address() + * Dave Miller : 32bit quantity for the device lock to make it work out + * on a Sparc. 
+ * Bjorn Ekwall : Added KERNELD hack. + * Alan Cox : Cleaned up the backlog initialise. + * Craig Metz : SIOCGIFCONF fix if space for under + * 1 device. + * Thomas Bogendoerfer : Return ENODEV for dev_open, if there + * is no device open function. + * Andi Kleen : Fix error reporting for SIOCGIFCONF + * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF + * Cyrus Durgin : Cleaned for KMOD + * Adam Sulmicki : Bug Fix : Network Device Unload + * A network device unload needs to purge + * the backlog queue. + * Paul Rusty Russel : SIOCSIFNAME + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <linux/rtnetlink.h> +#include <net/slhc.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <net/br.h> +#include <net/dst.h> +#include <net/pkt_sched.h> +#include <net/profile.h> +#include <linux/init.h> +#include <linux/kmod.h> +#ifdef CONFIG_NET_RADIO +#include <linux/wireless.h> +#endif /* CONFIG_NET_RADIO */ +#ifdef CONFIG_PLIP +extern int plip_init(void); +#endif + +NET_PROFILE_DEFINE(dev_queue_xmit) +NET_PROFILE_DEFINE(net_bh) +NET_PROFILE_DEFINE(net_bh_skb) + + +const char *if_port_text[] = { + "unknown", + "BNC", + "10baseT", + "AUI", + "100baseT", + "100baseTX", + "100baseFX" +}; + +/* + * The list of packet types we will receive (as opposed to discard) + * and the routines to invoke. + * + * Why 16. Because with 16 the only overlap we get on a hash of the + * low nibble of the protocol value is RARP/SNAP/X.25. + * + * 0800 IP + * 0001 802.3 + * 0002 AX.25 + * 0004 802.2 + * 8035 RARP + * 0005 SNAP + * 0805 X.25 + * 0806 ARP + * 8137 IPX + * 0009 Localtalk + * 86DD IPv6 + */ + +struct packet_type *ptype_base[16]; /* 16 way hashed list */ +struct packet_type *ptype_all = NULL; /* Taps */ + +/* + * Device list lock. Setting it provides that interface + * will not disappear unexpectedly while kernel sleeps. + */ + +atomic_t dev_lockct = ATOMIC_INIT(0); + +/* + * Our notifier list + */ + +static struct notifier_block *netdev_chain=NULL; + +/* + * Device drivers call our routines to queue packets here. We empty the + * queue in the bottom half handler. + */ + +static struct sk_buff_head backlog; + +#ifdef CONFIG_NET_FASTROUTE +int netdev_fastroute; +int netdev_fastroute_obstacles; +struct net_fastroute_stats dev_fastroute_stat; +#endif + +static void dev_clear_backlog(struct device *dev); + + +/****************************************************************************************** + + Protocol management and registration routines + +*******************************************************************************************/ + +/* + * For efficiency + */ + +int netdev_nit=0; + +/* + * Add a protocol ID to the list. Now that the input handler is + * smarter we can dispense with all the messy stuff that used to be + * here. + * + * BEWARE!!! Protocol handlers, mangling input packets, + * MUST BE last in hash buckets and checking protocol handlers + * MUST start from promiscous ptype_all chain in net_bh. + * It is true now, do not change it. 
+ * Explantion follows: if protocol handler, mangling packet, will + * be the first on list, it is not able to sense, that packet + * is cloned and should be copied-on-write, so that it will + * change it and subsequent readers will get broken packet. + * --ANK (980803) + */ + +void dev_add_pack(struct packet_type *pt) +{ + int hash; +#ifdef CONFIG_NET_FASTROUTE + /* Hack to detect packet socket */ + if (pt->data) { + netdev_fastroute_obstacles++; + dev_clear_fastroute(pt->dev); + } +#endif + if(pt->type==htons(ETH_P_ALL)) + { + netdev_nit++; + pt->next=ptype_all; + ptype_all=pt; + } + else + { + hash=ntohs(pt->type)&15; + pt->next = ptype_base[hash]; + ptype_base[hash] = pt; + } +} + + +/* + * Remove a protocol ID from the list. + */ + +void dev_remove_pack(struct packet_type *pt) +{ + struct packet_type **pt1; + if(pt->type==htons(ETH_P_ALL)) + { + netdev_nit--; + pt1=&ptype_all; + } + else + pt1=&ptype_base[ntohs(pt->type)&15]; + for(; (*pt1)!=NULL; pt1=&((*pt1)->next)) + { + if(pt==(*pt1)) + { + *pt1=pt->next; + synchronize_bh(); +#ifdef CONFIG_NET_FASTROUTE + if (pt->data) + netdev_fastroute_obstacles--; +#endif + return; + } + } + printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); +} + +/***************************************************************************************** + + Device Interface Subroutines + +******************************************************************************************/ + +/* + * Find an interface by name. + */ + +struct device *dev_get(const char *name) +{ + struct device *dev; + + for (dev = dev_base; dev != NULL; dev = dev->next) + { + if (strcmp(dev->name, name) == 0) + return(dev); + } + return NULL; +} + +struct device * dev_get_by_index(int ifindex) +{ + struct device *dev; + + for (dev = dev_base; dev != NULL; dev = dev->next) + { + if (dev->ifindex == ifindex) + return(dev); + } + return NULL; +} + +struct device *dev_getbyhwaddr(unsigned short type, char *ha) +{ + struct device *dev; + + for (dev = dev_base; dev != NULL; dev = dev->next) + { + if (dev->type == type && + memcmp(dev->dev_addr, ha, dev->addr_len) == 0) + return(dev); + } + return(NULL); +} + +/* + * Passed a format string - eg "lt%d" it will try and find a suitable + * id. Not efficient for many devices, not called a lot.. + */ + +int dev_alloc_name(struct device *dev, const char *name) +{ + int i; + /* + * If you need over 100 please also fix the algorithm... + */ + for(i=0;i<100;i++) + { + sprintf(dev->name,name,i); + if(dev_get(dev->name)==NULL) + return i; + } + return -ENFILE; /* Over 100 of the things .. bail out! */ +} + +struct device *dev_alloc(const char *name, int *err) +{ + struct device *dev=kmalloc(sizeof(struct device)+16, GFP_KERNEL); + if(dev==NULL) + { + *err=-ENOBUFS; + return NULL; + } + dev->name=(char *)(dev+1); /* Name string space */ + *err=dev_alloc_name(dev,name); + if(*err<0) + { + kfree(dev); + return NULL; + } + return dev; +} + +void netdev_state_change(struct device *dev) +{ + if (dev->flags&IFF_UP) + notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); +} + + +/* + * Find and possibly load an interface. + */ + +#ifdef CONFIG_KMOD + +void dev_load(const char *name) +{ + if(!dev_get(name) && capable(CAP_SYS_MODULE)) + request_module(name); +} + +#else + +extern inline void dev_load(const char *unused){;} + +#endif + +static int default_rebuild_header(struct sk_buff *skb) +{ + printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", skb->dev ? 
skb->dev->name : "NULL!!!"); + kfree_skb(skb); + return 1; +} + +/* + * Prepare an interface for use. + */ + +int dev_open(struct device *dev) +{ + int ret = 0; + + /* + * Is it already up? + */ + + if (dev->flags&IFF_UP) + return 0; + + /* + * Call device private open method + */ + + if (dev->open) + ret = dev->open(dev); + + /* + * If it went open OK then: + */ + + if (ret == 0) + { + /* + * nil rebuild_header routine, + * that should be never called and used as just bug trap. + */ + + if (dev->rebuild_header == NULL) + dev->rebuild_header = default_rebuild_header; + + /* + * Set the flags. + */ + dev->flags |= (IFF_UP | IFF_RUNNING); + + /* + * Initialize multicasting status + */ + dev_mc_upload(dev); + + /* + * Wakeup transmit queue engine + */ + dev_activate(dev); + + /* + * ... and announce new interface. + */ + notifier_call_chain(&netdev_chain, NETDEV_UP, dev); + + } + return(ret); +} + +#ifdef CONFIG_NET_FASTROUTE + +static __inline__ void dev_do_clear_fastroute(struct device *dev) +{ + if (dev->accept_fastpath) { + int i; + + for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++) + dst_release_irqwait(xchg(dev->fastpath+i, NULL)); + } +} + +void dev_clear_fastroute(struct device *dev) +{ + if (dev) { + dev_do_clear_fastroute(dev); + } else { + for (dev = dev_base; dev; dev = dev->next) + dev_do_clear_fastroute(dev); + } +} +#endif + +/* + * Completely shutdown an interface. + */ + +int dev_close(struct device *dev) +{ + if (!(dev->flags&IFF_UP)) + return 0; + + dev_deactivate(dev); + + dev_lock_wait(); + + /* + * Call the device specific close. This cannot fail. + * Only if device is UP + */ + + if (dev->stop) + dev->stop(dev); + + if (dev->start) + printk("dev_close: bug %s still running\n", dev->name); + + /* + * Device is now down. + */ + dev_clear_backlog(dev); + + dev->flags&=~(IFF_UP|IFF_RUNNING); +#ifdef CONFIG_NET_FASTROUTE + dev_clear_fastroute(dev); +#endif + + /* + * Tell people we are going down + */ + notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); + + return(0); +} + + +/* + * Device change register/unregister. These are not inline or static + * as we export them to the world. + */ + +int register_netdevice_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&netdev_chain, nb); +} + +int unregister_netdevice_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&netdev_chain,nb); +} + +/* + * Support routine. Sends outgoing frames to any network + * taps currently in use. + */ + +void dev_queue_xmit_nit(struct sk_buff *skb, struct device *dev) +{ + struct packet_type *ptype; + get_fast_time(&skb->stamp); + + for (ptype = ptype_all; ptype!=NULL; ptype = ptype->next) + { + /* Never send packets back to the socket + * they originated from - MvS (miquels@drinkel.ow.org) + */ + if ((ptype->dev == dev || !ptype->dev) && + ((struct sock *)ptype->data != skb->sk)) + { + struct sk_buff *skb2; + if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) + break; + + /* Code, following below is wrong. + + The only reason, why it does work is that + ONLY packet sockets receive outgoing + packets. If such a packet will be (occasionally) + received by normal packet handler, which expects + that mac header is pulled... + */ + + /* More sensible variant. skb->nh should be correctly + set by sender, so that the second statement is + just protection against buggy protocols. 
+ */ + skb2->mac.raw = skb2->data; + + if (skb2->nh.raw < skb2->data || skb2->nh.raw >= skb2->tail) { + if (net_ratelimit()) + printk(KERN_DEBUG "protocol %04x is buggy, dev %s\n", skb2->protocol, dev->name); + skb2->nh.raw = skb2->data; + if (dev->hard_header) + skb2->nh.raw += dev->hard_header_len; + } + + skb2->h.raw = skb2->nh.raw; + skb2->pkt_type = PACKET_OUTGOING; + ptype->func(skb2, skb->dev, ptype); + } + } +} + +/* + * Fast path for loopback frames. + */ + +void dev_loopback_xmit(struct sk_buff *skb) +{ + struct sk_buff *newskb=skb_clone(skb, GFP_ATOMIC); + if (newskb==NULL) + return; + + newskb->mac.raw = newskb->data; + skb_pull(newskb, newskb->nh.raw - newskb->data); + newskb->pkt_type = PACKET_LOOPBACK; + newskb->ip_summed = CHECKSUM_UNNECESSARY; + if (newskb->dst==NULL) + printk(KERN_DEBUG "BUG: packet without dst looped back 1\n"); + netif_rx(newskb); +} + +int dev_queue_xmit(struct sk_buff *skb) +{ + struct device *dev = skb->dev; + struct Qdisc *q; + +#ifdef CONFIG_NET_PROFILE + start_bh_atomic(); + NET_PROFILE_ENTER(dev_queue_xmit); +#endif + + start_bh_atomic(); + q = dev->qdisc; + if (q->enqueue) { + q->enqueue(skb, q); + qdisc_wakeup(dev); + end_bh_atomic(); + +#ifdef CONFIG_NET_PROFILE + NET_PROFILE_LEAVE(dev_queue_xmit); + end_bh_atomic(); +#endif + + return 0; + } + + /* The device has no queue. Common case for software devices: + loopback, all the sorts of tunnels... + + Really, it is unlikely that bh protection is necessary here: + virtual devices do not generate EOI events. + However, it is possible, that they rely on bh protection + made by us here. + */ + if (dev->flags&IFF_UP) { + if (netdev_nit) + dev_queue_xmit_nit(skb,dev); + if (dev->hard_start_xmit(skb, dev) == 0) { + end_bh_atomic(); + +#ifdef CONFIG_NET_PROFILE + NET_PROFILE_LEAVE(dev_queue_xmit); + end_bh_atomic(); +#endif + + return 0; + } + if (net_ratelimit()) + printk(KERN_DEBUG "Virtual device %s asks to queue packet!\n", dev->name); + } + end_bh_atomic(); + + kfree_skb(skb); + +#ifdef CONFIG_NET_PROFILE + NET_PROFILE_LEAVE(dev_queue_xmit); + end_bh_atomic(); +#endif + + return 0; +} + + +/*======================================================================= + Receiver rotutines + =======================================================================*/ + +int netdev_dropping = 0; +int netdev_max_backlog = 300; +atomic_t netdev_rx_dropped; +#ifdef CONFIG_CPU_IS_SLOW +int net_cpu_congestion; +#endif + +#ifdef CONFIG_NET_HW_FLOWCONTROL +int netdev_throttle_events; +static unsigned long netdev_fc_mask = 1; +unsigned long netdev_fc_xoff = 0; + +static struct +{ + void (*stimul)(struct device *); + struct device *dev; +} netdev_fc_slots[32]; + +int netdev_register_fc(struct device *dev, void (*stimul)(struct device *dev)) +{ + int bit = 0; + unsigned long flags; + + save_flags(flags); + cli(); + if (netdev_fc_mask != ~0UL) { + bit = ffz(netdev_fc_mask); + netdev_fc_slots[bit].stimul = stimul; + netdev_fc_slots[bit].dev = dev; + set_bit(bit, &netdev_fc_mask); + clear_bit(bit, &netdev_fc_xoff); + } + restore_flags(flags); + return bit; +} + +void netdev_unregister_fc(int bit) +{ + unsigned long flags; + + save_flags(flags); + cli(); + if (bit > 0) { + netdev_fc_slots[bit].stimul = NULL; + netdev_fc_slots[bit].dev = NULL; + clear_bit(bit, &netdev_fc_mask); + clear_bit(bit, &netdev_fc_xoff); + } + restore_flags(flags); +} + +static void netdev_wakeup(void) +{ + unsigned long xoff; + + cli(); + xoff = netdev_fc_xoff; + netdev_fc_xoff = 0; + netdev_dropping = 0; + netdev_throttle_events++; + while (xoff) 
{ + int i = ffz(~xoff); + xoff &= ~(1<<i); + netdev_fc_slots[i].stimul(netdev_fc_slots[i].dev); + } + sti(); +} +#endif + +static void dev_clear_backlog(struct device *dev) +{ + struct sk_buff *prev, *curr; + + /* + * + * Let now clear backlog queue. -AS + * + * We are competing here both with netif_rx() and net_bh(). + * We don't want either of those to mess with skb ptrs + * while we work on them, thus cli()/sti(). + * + * It looks better to use net_bh trick, at least + * to be sure, that we keep interrupt latency really low. --ANK (980727) + */ + + if (backlog.qlen) { + start_bh_atomic(); + curr = backlog.next; + while ( curr != (struct sk_buff *)(&backlog) ) { + unsigned long flags; + curr=curr->next; + if ( curr->prev->dev == dev ) { + prev = curr->prev; + spin_lock_irqsave(&skb_queue_lock, flags); + __skb_unlink(prev, &backlog); + spin_unlock_irqrestore(&skb_queue_lock, flags); + kfree_skb(prev); + } + } + end_bh_atomic(); +#ifdef CONFIG_NET_HW_FLOWCONTROL + if (netdev_dropping) + netdev_wakeup(); +#else + netdev_dropping = 0; +#endif + } +} + +/* + * Receive a packet from a device driver and queue it for the upper + * (protocol) levels. It always succeeds. + */ + +void netif_rx(struct sk_buff *skb) +{ +#ifndef CONFIG_CPU_IS_SLOW + if(skb->stamp.tv_sec==0) + get_fast_time(&skb->stamp); +#else + skb->stamp = xtime; +#endif + + /* The code is rearranged so that the path is the most + short when CPU is congested, but is still operating. + */ + + if (backlog.qlen <= netdev_max_backlog) { + if (backlog.qlen) { + if (netdev_dropping == 0) { + skb_queue_tail(&backlog,skb); + mark_bh(NET_BH); + return; + } + atomic_inc(&netdev_rx_dropped); + kfree_skb(skb); + return; + } +#ifdef CONFIG_NET_HW_FLOWCONTROL + if (netdev_dropping) + netdev_wakeup(); +#else + netdev_dropping = 0; +#endif + skb_queue_tail(&backlog,skb); + mark_bh(NET_BH); + return; + } + netdev_dropping = 1; + atomic_inc(&netdev_rx_dropped); + kfree_skb(skb); +} + +#ifdef CONFIG_BRIDGE +static inline void handle_bridge(struct sk_buff *skb, unsigned short type) +{ + if (br_stats.flags & BR_UP && br_protocol_ok(ntohs(type))) + { + /* + * We pass the bridge a complete frame. This means + * recovering the MAC header first. + */ + + int offset; + + skb=skb_clone(skb, GFP_ATOMIC); + if(skb==NULL) + return; + + offset=skb->data-skb->mac.raw; + skb_push(skb,offset); /* Put header back on for bridge */ + + if(br_receive_frame(skb)) + return; + kfree_skb(skb); + } + return; +} +#endif + + +/* + * When we are called the queue is ready to grab, the interrupts are + * on and hardware can interrupt and queue to the receive queue as we + * run with no problems. + * This is run as a bottom half after an interrupt handler that does + * mark_bh(NET_BH); + */ + +void net_bh(void) +{ + struct packet_type *ptype; + struct packet_type *pt_prev; + unsigned short type; + unsigned long start_time = jiffies; +#ifdef CONFIG_CPU_IS_SLOW + static unsigned long start_busy = 0; + static unsigned long ave_busy = 0; + + if (start_busy == 0) + start_busy = start_time; + net_cpu_congestion = ave_busy>>8; +#endif + + NET_PROFILE_ENTER(net_bh); + /* + * Can we send anything now? We want to clear the + * decks for any more sends that get done as we + * process the input. This also minimises the + * latency on a transmit interrupt bh. + */ + + if (qdisc_head.forw != &qdisc_head) + qdisc_run_queues(); + + /* + * Any data left to process. 
This may occur because a + * mark_bh() is done after we empty the queue including + * that from the device which does a mark_bh() just after + */ + + /* + * While the queue is not empty.. + * + * Note that the queue never shrinks due to + * an interrupt, so we can do this test without + * disabling interrupts. + */ + + while (!skb_queue_empty(&backlog)) + { + struct sk_buff * skb; + + /* Give chance to other bottom halves to run */ + if (jiffies - start_time > 1) + goto net_bh_break; + + /* + * We have a packet. Therefore the queue has shrunk + */ + skb = skb_dequeue(&backlog); + +#ifdef CONFIG_CPU_IS_SLOW + if (ave_busy > 128*16) { + kfree_skb(skb); + while ((skb = skb_dequeue(&backlog)) != NULL) + kfree_skb(skb); + break; + } +#endif + + +#if 0 + NET_PROFILE_SKB_PASSED(skb, net_bh_skb); +#endif +#ifdef CONFIG_NET_FASTROUTE + if (skb->pkt_type == PACKET_FASTROUTE) { + dev_queue_xmit(skb); + continue; + } +#endif + + /* + * Bump the pointer to the next structure. + * + * On entry to the protocol layer. skb->data and + * skb->nh.raw point to the MAC and encapsulated data + */ + + /* XXX until we figure out every place to modify.. */ + skb->h.raw = skb->nh.raw = skb->data; + + if (skb->mac.raw < skb->head || skb->mac.raw > skb->data) { + printk(KERN_CRIT "%s: wrong mac.raw ptr, proto=%04x\n", skb->dev->name, skb->protocol); + kfree_skb(skb); + continue; + } + + /* + * Fetch the packet protocol ID. + */ + + type = skb->protocol; + +#ifdef CONFIG_BRIDGE + /* + * If we are bridging then pass the frame up to the + * bridging code (if this protocol is to be bridged). + * If it is bridged then move on + */ + handle_bridge(skb, type); +#endif + + /* + * We got a packet ID. Now loop over the "known protocols" + * list. There are two lists. The ptype_all list of taps (normally empty) + * and the main protocol list which is hashed perfectly for normal protocols. + */ + + pt_prev = NULL; + for (ptype = ptype_all; ptype!=NULL; ptype=ptype->next) + { + if (!ptype->dev || ptype->dev == skb->dev) { + if(pt_prev) + { + struct sk_buff *skb2=skb_clone(skb, GFP_ATOMIC); + if(skb2) + pt_prev->func(skb2,skb->dev, pt_prev); + } + pt_prev=ptype; + } + } + + for (ptype = ptype_base[ntohs(type)&15]; ptype != NULL; ptype = ptype->next) + { + if (ptype->type == type && (!ptype->dev || ptype->dev==skb->dev)) + { + /* + * We already have a match queued. Deliver + * to it and then remember the new match + */ + if(pt_prev) + { + struct sk_buff *skb2; + + skb2=skb_clone(skb, GFP_ATOMIC); + + /* + * Kick the protocol handler. This should be fast + * and efficient code. + */ + + if(skb2) + pt_prev->func(skb2, skb->dev, pt_prev); + } + /* Remember the current last to do */ + pt_prev=ptype; + } + } /* End of protocol list loop */ + + /* + * Is there a last item to send to ? + */ + + if(pt_prev) + pt_prev->func(skb, skb->dev, pt_prev); + /* + * Has an unknown packet has been received ? + */ + + else { + kfree_skb(skb); + } + } /* End of queue loop */ + + /* + * We have emptied the queue + */ + + /* + * One last output flush. 
+ */ + + if (qdisc_head.forw != &qdisc_head) + qdisc_run_queues(); + +#ifdef CONFIG_CPU_IS_SLOW + if (1) { + unsigned long start_idle = jiffies; + ave_busy += ((start_idle - start_busy)<<3) - (ave_busy>>4); + start_busy = 0; + } +#endif +#ifdef CONFIG_NET_HW_FLOWCONTROL + if (netdev_dropping) + netdev_wakeup(); +#else + netdev_dropping = 0; +#endif + NET_PROFILE_LEAVE(net_bh); + return; + +net_bh_break: + mark_bh(NET_BH); + NET_PROFILE_LEAVE(net_bh); + return; +} + +/* Protocol dependent address dumping routines */ + +static gifconf_func_t * gifconf_list [NPROTO]; + +int register_gifconf(unsigned int family, gifconf_func_t * gifconf) +{ + if (family>=NPROTO) + return -EINVAL; + gifconf_list[family] = gifconf; + return 0; +} + + +/* + * Map an interface index to its name (SIOCGIFNAME) + */ + +/* + * This call is useful, but I'd remove it too. + * + * The reason is purely aestetical, it is the only call + * from SIOC* family using struct ifreq in reversed manner. + * Besides that, it is pretty silly to put "drawing" facility + * to kernel, it is useful only to print ifindices + * in readable form, is not it? --ANK + * + * We need this ioctl for efficient implementation of the + * if_indextoname() function required by the IPv6 API. Without + * it, we would have to search all the interfaces to find a + * match. --pb + */ + +static int dev_ifname(struct ifreq *arg) +{ + struct device *dev; + struct ifreq ifr; + int err; + + /* + * Fetch the caller's info block. + */ + + err = copy_from_user(&ifr, arg, sizeof(struct ifreq)); + if (err) + return -EFAULT; + + dev = dev_get_by_index(ifr.ifr_ifindex); + if (!dev) + return -ENODEV; + + strcpy(ifr.ifr_name, dev->name); + + err = copy_to_user(arg, &ifr, sizeof(struct ifreq)); + return (err)?-EFAULT:0; +} + +/* + * Perform a SIOCGIFCONF call. This structure will change + * size eventually, and there is nothing I can do about it. + * Thus we will need a 'compatibility mode'. + */ + +static int dev_ifconf(char *arg) +{ + struct ifconf ifc; + struct device *dev; + char *pos; + int len; + int total; + int i; + + /* + * Fetch the caller's info block. + */ + + if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) + return -EFAULT; + + pos = ifc.ifc_buf; + len = ifc.ifc_len; + + /* + * Loop over the interfaces, and write an info block for each. + */ + + total = 0; + for (dev = dev_base; dev != NULL; dev = dev->next) { + for (i=0; i<NPROTO; i++) { + if (gifconf_list[i]) { + int done; + if (pos==NULL) { + done = gifconf_list[i](dev, NULL, 0); + } else { + done = gifconf_list[i](dev, pos+total, len-total); + } + if (done<0) + return -EFAULT; + total += done; + } + } + } + + /* + * All done. Write the updated control block back to the caller. + */ + ifc.ifc_len = total; + + if (copy_to_user(arg, &ifc, sizeof(struct ifconf))) + return -EFAULT; + + /* + * Both BSD and Solaris return 0 here, so we do too. + */ + return 0; +} + +/* + * This is invoked by the /proc filesystem handler to display a device + * in detail. + */ + +#ifdef CONFIG_PROC_FS +static int sprintf_stats(char *buffer, struct device *dev) +{ + struct net_device_stats *stats = (dev->get_stats ? 
dev->get_stats(dev): NULL); + int size; + + if (stats) + size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu %8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", + dev->name, + stats->rx_bytes, + stats->rx_packets, stats->rx_errors, + stats->rx_dropped + stats->rx_missed_errors, + stats->rx_fifo_errors, + stats->rx_length_errors + stats->rx_over_errors + + stats->rx_crc_errors + stats->rx_frame_errors, + stats->rx_compressed, stats->multicast, + stats->tx_bytes, + stats->tx_packets, stats->tx_errors, stats->tx_dropped, + stats->tx_fifo_errors, stats->collisions, + stats->tx_carrier_errors + stats->tx_aborted_errors + + stats->tx_window_errors + stats->tx_heartbeat_errors, + stats->tx_compressed); + else + size = sprintf(buffer, "%6s: No statistics available.\n", dev->name); + + return size; +} + +/* + * Called from the PROCfs module. This now uses the new arbitrary sized /proc/net interface + * to create /proc/net/dev + */ + +int dev_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + int len=0; + off_t begin=0; + off_t pos=0; + int size; + + struct device *dev; + + + size = sprintf(buffer, + "Inter-| Receive | Transmit\n" + " face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n"); + + pos+=size; + len+=size; + + + for (dev = dev_base; dev != NULL; dev = dev->next) + { + size = sprintf_stats(buffer+len, dev); + len+=size; + pos=begin+len; + + if(pos<offset) + { + len=0; + begin=pos; + } + if(pos>offset+length) + break; + } + + *start=buffer+(offset-begin); /* Start of wanted data */ + len-=(offset-begin); /* Start slop */ + if(len>length) + len=length; /* Ending slop */ + return len; +} + +static int dev_proc_stats(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + int len; + + len = sprintf(buffer, "%08x %08x %08x %08x %08x\n", + atomic_read(&netdev_rx_dropped), +#ifdef CONFIG_NET_HW_FLOWCONTROL + netdev_throttle_events, +#else + 0, +#endif +#ifdef CONFIG_NET_FASTROUTE + dev_fastroute_stat.hits, + dev_fastroute_stat.succeed, + dev_fastroute_stat.deferred +#else + 0, 0, 0 +#endif + ); + + len -= offset; + + if (len > length) + len = length; + if(len < 0) + len = 0; + + *start = buffer + offset; + *eof = 1; + + return len; +} + +#endif /* CONFIG_PROC_FS */ + + +#ifdef CONFIG_NET_RADIO +#ifdef CONFIG_PROC_FS + +/* + * Print one entry of /proc/net/wireless + * This is a clone of /proc/net/dev (just above) + */ +static int sprintf_wireless_stats(char *buffer, struct device *dev) +{ + /* Get stats from the driver */ + struct iw_statistics *stats = (dev->get_wireless_stats ? + dev->get_wireless_stats(dev) : + (struct iw_statistics *) NULL); + int size; + + if(stats != (struct iw_statistics *) NULL) + size = sprintf(buffer, + "%6s: %02x %3d%c %3d%c %3d%c %5d %5d %5d\n", + dev->name, + stats->status, + stats->qual.qual, + stats->qual.updated & 1 ? '.' : ' ', + stats->qual.level, + stats->qual.updated & 2 ? '.' : ' ', + stats->qual.noise, + stats->qual.updated & 3 ? '.' 
: ' ', + stats->discard.nwid, + stats->discard.code, + stats->discard.misc); + else + size = 0; + + return size; +} + +/* + * Print info for /proc/net/wireless (print all entries) + * This is a clone of /proc/net/dev (just above) + */ +int dev_get_wireless_info(char * buffer, char **start, off_t offset, + int length, int dummy) +{ + int len = 0; + off_t begin = 0; + off_t pos = 0; + int size; + + struct device * dev; + + size = sprintf(buffer, + "Inter-|sta| Quality | Discarded packets\n" + " face |tus|link level noise| nwid crypt misc\n"); + + pos+=size; + len+=size; + + for(dev = dev_base; dev != NULL; dev = dev->next) + { + size = sprintf_wireless_stats(buffer+len, dev); + len+=size; + pos=begin+len; + + if(pos < offset) + { + len=0; + begin=pos; + } + if(pos > offset + length) + break; + } + + *start = buffer + (offset - begin); /* Start of wanted data */ + len -= (offset - begin); /* Start slop */ + if(len > length) + len = length; /* Ending slop */ + + return len; +} +#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_NET_RADIO */ + +void dev_set_promiscuity(struct device *dev, int inc) +{ + unsigned short old_flags = dev->flags; + + dev->flags |= IFF_PROMISC; + if ((dev->promiscuity += inc) == 0) + dev->flags &= ~IFF_PROMISC; + if (dev->flags^old_flags) { +#ifdef CONFIG_NET_FASTROUTE + if (dev->flags&IFF_PROMISC) { + netdev_fastroute_obstacles++; + dev_clear_fastroute(dev); + } else + netdev_fastroute_obstacles--; +#endif + dev_mc_upload(dev); + printk(KERN_INFO "device %s %s promiscuous mode\n", + dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "left"); + } +} + +void dev_set_allmulti(struct device *dev, int inc) +{ + unsigned short old_flags = dev->flags; + + dev->flags |= IFF_ALLMULTI; + if ((dev->allmulti += inc) == 0) + dev->flags &= ~IFF_ALLMULTI; + if (dev->flags^old_flags) + dev_mc_upload(dev); +} + +int dev_change_flags(struct device *dev, unsigned flags) +{ + int ret; + int old_flags = dev->flags; + + /* + * Set the flags on our device. + */ + + dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_RUNNING|IFF_NOARP| + IFF_SLAVE|IFF_MASTER|IFF_DYNAMIC| + IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) | + (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI)); + + /* + * Load in the correct multicast list now the flags have changed. + */ + + dev_mc_upload(dev); + + /* + * Have we downed the interface. We handle IFF_UP ourselves + * according to user attempts to set it, rather than blindly + * setting it. + */ + + ret = 0; + if ((old_flags^flags)&IFF_UP) /* Bit is different ? */ + { + ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); + + if (ret == 0) + dev_mc_upload(dev); + } + + if (dev->flags&IFF_UP && + ((old_flags^dev->flags)&~(IFF_UP|IFF_RUNNING|IFF_PROMISC|IFF_ALLMULTI|IFF_VOLATILE))) + notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); + + if ((flags^dev->gflags)&IFF_PROMISC) { + int inc = (flags&IFF_PROMISC) ? +1 : -1; + dev->gflags ^= IFF_PROMISC; + dev_set_promiscuity(dev, inc); + } + + /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI + is important. Some (broken) drivers set IFF_PROMISC, when + IFF_ALLMULTI is requested not asking us and not reporting. + */ + if ((flags^dev->gflags)&IFF_ALLMULTI) { + int inc = (flags&IFF_ALLMULTI) ? +1 : -1; + dev->gflags ^= IFF_ALLMULTI; + dev_set_allmulti(dev, inc); + } + + return ret; +} + +/* + * Perform the SIOCxIFxxx calls. 
+ */ + +static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) +{ + struct device *dev; + int err; + + if ((dev = dev_get(ifr->ifr_name)) == NULL) + return -ENODEV; + + switch(cmd) + { + case SIOCGIFFLAGS: /* Get interface flags */ + ifr->ifr_flags = (dev->flags&~(IFF_PROMISC|IFF_ALLMULTI)) + |(dev->gflags&(IFF_PROMISC|IFF_ALLMULTI)); + return 0; + + case SIOCSIFFLAGS: /* Set interface flags */ + return dev_change_flags(dev, ifr->ifr_flags); + + case SIOCGIFMETRIC: /* Get the metric on the interface (currently unused) */ + ifr->ifr_metric = 0; + return 0; + + case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ + return -EOPNOTSUPP; + + case SIOCGIFMTU: /* Get the MTU of a device */ + ifr->ifr_mtu = dev->mtu; + return 0; + + case SIOCSIFMTU: /* Set the MTU of a device */ + if (ifr->ifr_mtu == dev->mtu) + return 0; + + /* + * MTU must be positive. + */ + + if (ifr->ifr_mtu<=0) + return -EINVAL; + + if (dev->change_mtu) + err = dev->change_mtu(dev, ifr->ifr_mtu); + else { + dev->mtu = ifr->ifr_mtu; + err = 0; + } + if (!err && dev->flags&IFF_UP) + notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev); + return err; + + case SIOCGIFHWADDR: + memcpy(ifr->ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN); + ifr->ifr_hwaddr.sa_family=dev->type; + return 0; + + case SIOCSIFHWADDR: + if(dev->set_mac_address==NULL) + return -EOPNOTSUPP; + if(ifr->ifr_hwaddr.sa_family!=dev->type) + return -EINVAL; + err=dev->set_mac_address(dev,&ifr->ifr_hwaddr); + if (!err) + notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); + return err; + + case SIOCSIFHWBROADCAST: + if(ifr->ifr_hwaddr.sa_family!=dev->type) + return -EINVAL; + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, MAX_ADDR_LEN); + notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); + return 0; + + case SIOCGIFMAP: + ifr->ifr_map.mem_start=dev->mem_start; + ifr->ifr_map.mem_end=dev->mem_end; + ifr->ifr_map.base_addr=dev->base_addr; + ifr->ifr_map.irq=dev->irq; + ifr->ifr_map.dma=dev->dma; + ifr->ifr_map.port=dev->if_port; + return 0; + + case SIOCSIFMAP: + if (dev->set_config) + return dev->set_config(dev,&ifr->ifr_map); + return -EOPNOTSUPP; + + case SIOCADDMULTI: + if(dev->set_multicast_list==NULL || + ifr->ifr_hwaddr.sa_family!=AF_UNSPEC) + return -EINVAL; + dev_mc_add(dev,ifr->ifr_hwaddr.sa_data, dev->addr_len, 1); + return 0; + + case SIOCDELMULTI: + if(dev->set_multicast_list==NULL || + ifr->ifr_hwaddr.sa_family!=AF_UNSPEC) + return -EINVAL; + dev_mc_delete(dev,ifr->ifr_hwaddr.sa_data,dev->addr_len, 1); + return 0; + + case SIOCGIFINDEX: + ifr->ifr_ifindex = dev->ifindex; + return 0; + + case SIOCGIFTXQLEN: + ifr->ifr_qlen = dev->tx_queue_len; + return 0; + + case SIOCSIFTXQLEN: + if(ifr->ifr_qlen<0) + return -EINVAL; + dev->tx_queue_len = ifr->ifr_qlen; + return 0; + + case SIOCSIFNAME: + if (dev->flags&IFF_UP) + return -EBUSY; + if (dev_get(ifr->ifr_newname)) + return -EEXIST; + memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ); + dev->name[IFNAMSIZ-1] = 0; + notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); + return 0; + + /* + * Unknown or private ioctl + */ + + default: + if(cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) { + if (dev->do_ioctl) + return dev->do_ioctl(dev, ifr, cmd); + return -EOPNOTSUPP; + } + +#ifdef CONFIG_NET_RADIO + if(cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + if (dev->do_ioctl) + return dev->do_ioctl(dev, ifr, cmd); + return -EOPNOTSUPP; + } +#endif /* CONFIG_NET_RADIO */ + + } + return -EINVAL; +} + + +/* + * This function handles all "interface"-type I/O 
control requests. The actual + * 'doing' part of this is dev_ifsioc above. + */ + +int dev_ioctl(unsigned int cmd, void *arg) +{ + struct ifreq ifr; + int ret; + char *colon; + + /* One special case: SIOCGIFCONF takes ifconf argument + and requires shared lock, because it sleeps writing + to user space. + */ + + if (cmd == SIOCGIFCONF) { + rtnl_shlock(); + ret = dev_ifconf((char *) arg); + rtnl_shunlock(); + return ret; + } + if (cmd == SIOCGIFNAME) { + return dev_ifname((struct ifreq *)arg); + } + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ-1] = 0; + + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + + /* + * See which interface the caller is talking about. + */ + + switch(cmd) + { + /* + * These ioctl calls: + * - can be done by all. + * - atomic and do not require locking. + * - return a value + */ + + case SIOCGIFFLAGS: + case SIOCGIFMETRIC: + case SIOCGIFMTU: + case SIOCGIFHWADDR: + case SIOCGIFSLAVE: + case SIOCGIFMAP: + case SIOCGIFINDEX: + case SIOCGIFTXQLEN: + dev_load(ifr.ifr_name); + ret = dev_ifsioc(&ifr, cmd); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - do not return a value + */ + + case SIOCSIFFLAGS: + case SIOCSIFMETRIC: + case SIOCSIFMTU: + case SIOCSIFMAP: + case SIOCSIFHWADDR: + case SIOCSIFSLAVE: + case SIOCADDMULTI: + case SIOCDELMULTI: + case SIOCSIFHWBROADCAST: + case SIOCSIFTXQLEN: + case SIOCSIFNAME: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + dev_load(ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(&ifr, cmd); + rtnl_unlock(); + return ret; + + case SIOCGIFMEM: + /* Get the per device memory space. We can add this but currently + do not support it */ + case SIOCSIFMEM: + /* Set the per device memory buffer space. Not applicable in our case */ + case SIOCSIFLINK: + return -EINVAL; + + /* + * Unknown or private ioctl. + */ + + default: + if (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) { + dev_load(ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(&ifr, cmd); + rtnl_unlock(); + if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return ret; + } +#ifdef CONFIG_NET_RADIO + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + dev_load(ifr.ifr_name); + if (IW_IS_SET(cmd)) { + if (!suser()) + return -EPERM; + rtnl_lock(); + } + ret = dev_ifsioc(&ifr, cmd); + if (IW_IS_SET(cmd)) + rtnl_unlock(); + if (!ret && IW_IS_GET(cmd) && + copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return ret; + } +#endif /* CONFIG_NET_RADIO */ + return -EINVAL; + } +} + +int dev_new_index(void) +{ + static int ifindex; + for (;;) { + if (++ifindex <= 0) + ifindex=1; + if (dev_get_by_index(ifindex) == NULL) + return ifindex; + } +} + +static int dev_boot_phase = 1; + + +int register_netdevice(struct device *dev) +{ + struct device *d, **dp; + + if (dev_boot_phase) { + /* This is NOT bug, but I am not sure, that all the + devices, initialized before netdev module is started + are sane. + + Now they are chained to device boot list + and probed later. If a module is initialized + before netdev, but assumes that dev->init + is really called by register_netdev(), it will fail. + + So that this message should be printed for a while. 
+ */ + printk(KERN_INFO "early initialization of device %s is deferred\n", dev->name); + + /* Check for existence, and append to tail of chain */ + for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) { + if (d == dev || strcmp(d->name, dev->name) == 0) + return -EEXIST; + } + dev->next = NULL; + *dp = dev; + return 0; + } + + dev->iflink = -1; + + /* Init, if this function is available */ + if (dev->init && dev->init(dev) != 0) + return -EIO; + + /* Check for existence, and append to tail of chain */ + for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) { + if (d == dev || strcmp(d->name, dev->name) == 0) + return -EEXIST; + } + dev->next = NULL; + dev_init_scheduler(dev); + dev->ifindex = dev_new_index(); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + *dp = dev; + + /* Notify protocols, that a new device appeared. */ + notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); + + return 0; +} + +int unregister_netdevice(struct device *dev) +{ + struct device *d, **dp; + + if (dev_boot_phase == 0) { + /* If device is running, close it. + It is very bad idea, really we should + complain loudly here, but random hackery + in linux/drivers/net likes it. + */ + if (dev->flags & IFF_UP) + dev_close(dev); + +#ifdef CONFIG_NET_FASTROUTE + dev_clear_fastroute(dev); +#endif + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + */ + notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); + + /* + * Flush the multicast chain + */ + dev_mc_discard(dev); + + /* To avoid pointers looking to nowhere, + we wait for end of critical section */ + dev_lock_wait(); + } + + /* And unlink it from device chain. */ + for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) { + if (d == dev) { + *dp = d->next; + synchronize_bh(); + d->next = NULL; + + if (dev->destructor) + dev->destructor(dev); + return 0; + } + } + return -ENODEV; +} + + +/* + * Initialize the DEV module. At boot time this walks the device list and + * unhooks any devices that fail to initialise (normally hardware not + * present) and leaves us with a valid list of present and active devices. + * + */ +extern int lance_init(void); +extern int bpq_init(void); +extern int scc_init(void); +extern void sdla_setup(void); +extern void dlci_setup(void); +extern int dmascc_init(void); +extern int sm_init(void); + +extern int baycom_ser_fdx_init(void); +extern int baycom_ser_hdx_init(void); +extern int baycom_par_init(void); + +extern int lapbeth_init(void); +extern void arcnet_init(void); +extern void ip_auto_config(void); +#ifdef CONFIG_8xx +extern int cpm_enet_init(void); +#endif /* CONFIG_8xx */ + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_dev = { + PROC_NET_DEV, 3, "dev", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + dev_get_info +}; +#endif + +#ifdef CONFIG_NET_RADIO +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_wireless = { + PROC_NET_WIRELESS, 8, "wireless", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + dev_get_wireless_info +}; +#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_NET_RADIO */ + +__initfunc(int net_dev_init(void)) +{ + struct device *dev, **dp; + +#ifdef CONFIG_NET_SCHED + pktsched_init(); +#endif + + /* + * Initialise the packet receive queue. + */ + + skb_queue_head_init(&backlog); + + /* + * The bridge has to be up before the devices + */ + +#ifdef CONFIG_BRIDGE + br_init(); +#endif + + /* + * This is Very Ugly(tm). 
+ * + * Some devices want to be initialized early.. + */ + +#if defined(CONFIG_SCC) + scc_init(); +#endif +#if defined(CONFIG_DMASCC) + dmascc_init(); +#endif +#if defined(CONFIG_BPQETHER) + bpq_init(); +#endif +#if defined(CONFIG_DLCI) + dlci_setup(); +#endif +#if defined(CONFIG_SDLA) + sdla_setup(); +#endif +#if defined(CONFIG_BAYCOM_PAR) + baycom_par_init(); +#endif +#if defined(CONFIG_BAYCOM_SER_FDX) + baycom_ser_fdx_init(); +#endif +#if defined(CONFIG_BAYCOM_SER_HDX) + baycom_ser_hdx_init(); +#endif +#if defined(CONFIG_SOUNDMODEM) + sm_init(); +#endif +#if defined(CONFIG_LAPBETHER) + lapbeth_init(); +#endif +#if defined(CONFIG_PLIP) + plip_init(); +#endif +#if defined(CONFIG_ARCNET) + arcnet_init(); +#endif +#if defined(CONFIG_8xx) + cpm_enet_init(); +#endif + /* + * SLHC if present needs attaching so other people see it + * even if not opened. + */ + +#ifdef CONFIG_INET +#if (defined(CONFIG_SLIP) && defined(CONFIG_SLIP_COMPRESSED)) \ + || defined(CONFIG_PPP) \ + || (defined(CONFIG_ISDN) && defined(CONFIG_ISDN_PPP)) + slhc_install(); +#endif +#endif + +#ifdef CONFIG_NET_PROFILE + net_profile_init(); + NET_PROFILE_REGISTER(dev_queue_xmit); + NET_PROFILE_REGISTER(net_bh); +#if 0 + NET_PROFILE_REGISTER(net_bh_skb); +#endif +#endif + /* + * Add the devices. + * If the call to dev->init fails, the dev is removed + * from the chain disconnecting the device until the + * next reboot. + */ + + dp = &dev_base; + while ((dev = *dp) != NULL) + { + dev->iflink = -1; + if (dev->init && dev->init(dev)) + { + /* + * It failed to come up. Unhook it. + */ + *dp = dev->next; + synchronize_bh(); + } + else + { + dp = &dev->next; + dev->ifindex = dev_new_index(); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + dev_init_scheduler(dev); + } + } + +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_dev); + { + struct proc_dir_entry *ent = create_proc_entry("net/dev_stat", 0, 0); + ent->read_proc = dev_proc_stats; + } +#endif + +#ifdef CONFIG_NET_RADIO +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_wireless); +#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_NET_RADIO */ + + init_bh(NET_BH, net_bh); + + dev_boot_phase = 0; + + dev_mcast_init(); + +#ifdef CONFIG_IP_PNP + ip_auto_config(); +#endif + + return 0; +} diff --git a/pfinet/linux-src/net/core/dev_mcast.c b/pfinet/linux-src/net/core/dev_mcast.c new file mode 100644 index 00000000..bce3f4a4 --- /dev/null +++ b/pfinet/linux-src/net/core/dev_mcast.c @@ -0,0 +1,252 @@ +/* + * Linux NET3: Multicast List maintenance. + * + * Authors: + * Tim Kordas <tjk@nostromo.eeap.cwru.edu> + * Richard Underwood <richard@wuzz.demon.co.uk> + * + * Stir fried together from the IP multicast and CAP patches above + * Alan Cox <Alan.Cox@linux.org> + * + * Fixes: + * Alan Cox : Update the device on a real delete + * rather than any time but... + * Alan Cox : IFF_ALLMULTI support. + * Alan Cox : New format set_multicast_list() calls. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/config.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/arp.h> + + +/* + * Device multicast list maintenance. + * + * This is used both by IP and by the user level maintenance functions. + * Unlike BSD we maintain a usage count on a given multicast address so + * that a casual user application can add/delete multicasts used by + * protocols without doing damage to the protocols when it deletes the + * entries. It also helps IP as it tracks overlapping maps. + * + * Device mc lists are changed by bh at least if IPv6 is enabled, + * so that it must be bh protected. + */ + +/* + * Update the multicast list into the physical NIC controller. + */ + +void dev_mc_upload(struct device *dev) +{ + /* Don't do anything till we up the interface + [dev_open will call this function so the list will + stay sane] */ + + if(!(dev->flags&IFF_UP)) + return; + + /* + * Devices with no set multicast don't get set + */ + + if(dev->set_multicast_list==NULL) + return; + + start_bh_atomic(); + dev->set_multicast_list(dev); + end_bh_atomic(); +} + +/* + * Delete a device level multicast + */ + +int dev_mc_delete(struct device *dev, void *addr, int alen, int glbl) +{ + int err = 0; + struct dev_mc_list *dmi, **dmip; + + start_bh_atomic(); + for (dmip=&dev->mc_list; (dmi=*dmip)!=NULL; dmip=&dmi->next) { + /* + * Find the entry we want to delete. The device could + * have variable length entries so check these too. + */ + if (memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && alen==dmi->dmi_addrlen) { + if (glbl) { + int old_glbl = dmi->dmi_gusers; + dmi->dmi_gusers = 0; + if (old_glbl == 0) + break; + } + if(--dmi->dmi_users) + goto done; + + /* + * Last user. So delete the entry. + */ + *dmip = dmi->next; + dev->mc_count--; + kfree_s(dmi,sizeof(*dmi)); + /* + * We have altered the list, so the card + * loaded filter is now wrong. Fix it + */ + end_bh_atomic(); + dev_mc_upload(dev); + return 0; + } + } + err = -ENOENT; +done: + end_bh_atomic(); + return err; +} + +/* + * Add a device level multicast + */ + +int dev_mc_add(struct device *dev, void *addr, int alen, int glbl) +{ + int err = 0; + struct dev_mc_list *dmi, *dmi1; + + dmi1 = (struct dev_mc_list *)kmalloc(sizeof(*dmi), gfp_any()); + + start_bh_atomic(); + for(dmi=dev->mc_list; dmi!=NULL; dmi=dmi->next) { + if (memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && dmi->dmi_addrlen==alen) { + if (glbl) { + int old_glbl = dmi->dmi_gusers; + dmi->dmi_gusers = 1; + if (old_glbl) + goto done; + } + dmi->dmi_users++; + goto done; + } + } + + if ((dmi=dmi1)==NULL) + return -ENOMEM; + memcpy(dmi->dmi_addr, addr, alen); + dmi->dmi_addrlen=alen; + dmi->next=dev->mc_list; + dmi->dmi_users=1; + dmi->dmi_gusers=glbl ? 
1 : 0; + dev->mc_list=dmi; + dev->mc_count++; + end_bh_atomic(); + dev_mc_upload(dev); + return 0; + +done: + end_bh_atomic(); + if (dmi1) + kfree(dmi1); + return err; +} + +/* + * Discard multicast list when a device is downed + */ + +void dev_mc_discard(struct device *dev) +{ + start_bh_atomic(); + while (dev->mc_list!=NULL) { + struct dev_mc_list *tmp=dev->mc_list; + dev->mc_list=tmp->next; + if (tmp->dmi_users > tmp->dmi_gusers) + printk("dev_mc_discard: multicast leakage! dmi_users=%d\n", tmp->dmi_users); + kfree_s(tmp,sizeof(*tmp)); + } + dev->mc_count=0; + end_bh_atomic(); +} + +#ifdef CONFIG_PROC_FS +static int dev_mc_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos=0, begin=0; + struct dev_mc_list *m; + int len=0; + struct device *dev; + + start_bh_atomic(); + + for (dev = dev_base; dev; dev = dev->next) { + for (m = dev->mc_list; m; m = m->next) { + int i; + + len += sprintf(buffer+len,"%-4d %-15s %-5d %-5d ", dev->ifindex, dev->name, + m->dmi_users, m->dmi_gusers); + + for (i=0; i<m->dmi_addrlen; i++) + len += sprintf(buffer+len, "%02x", m->dmi_addr[i]); + + len+=sprintf(buffer+len, "\n"); + + pos=begin+len; + if (pos < offset) { + len=0; + begin=pos; + } + if (pos > offset+length) + goto done; + } + } + *eof = 1; + +done: + end_bh_atomic(); + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + if(len<0) + len=0; + return len; +} +#endif + +__initfunc(void dev_mcast_init(void)) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *ent; + + ent = create_proc_entry("net/dev_mcast", 0, 0); + ent->read_proc = dev_mc_read_proc; +#endif +} + diff --git a/pfinet/linux-src/net/core/dst.c b/pfinet/linux-src/net/core/dst.c new file mode 100644 index 00000000..9007dde6 --- /dev/null +++ b/pfinet/linux-src/net/core/dst.c @@ -0,0 +1,145 @@ +/* + * net/dst.c Protocol independent destination cache. 
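The list maintenance above is reference counted: adding an address that is already present only bumps dmi_users, and the hardware filter is only reloaded when an entry really appears or disappears. A hedged sketch of a hypothetical in-kernel caller, using only the dev_mc_add()/dev_mc_delete() signatures defined above (the multicast address and helper names are illustrative):

/* Hypothetical caller joining and leaving an Ethernet multicast group.
 * A second add of the same address only bumps dmi_users; the NIC filter
 * is reprogrammed (dev_mc_upload) only when the list really changes.
 */
static unsigned char all_hosts[ETH_ALEN] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };

static void example_join(struct device *dev)
{
	dev_mc_add(dev, all_hosts, ETH_ALEN, 0);	/* new entry, dmi_users == 1 */
	dev_mc_add(dev, all_hosts, ETH_ALEN, 0);	/* same entry, dmi_users == 2 */
}

static void example_leave(struct device *dev)
{
	dev_mc_delete(dev, all_hosts, ETH_ALEN, 0);	/* dmi_users drops to 1, entry kept */
	dev_mc_delete(dev, all_hosts, ETH_ALEN, 0);	/* last user: entry freed, filter reloaded */
}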
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <asm/segment.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> + +#include <net/dst.h> + +struct dst_entry * dst_garbage_list; +atomic_t dst_total = ATOMIC_INIT(0); + +static unsigned long dst_gc_timer_expires; +static unsigned long dst_gc_timer_inc = DST_GC_MAX; +static void dst_run_gc(unsigned long); + +static struct timer_list dst_gc_timer = + { NULL, NULL, DST_GC_MIN, 0L, dst_run_gc }; + +#if RT_CACHE_DEBUG >= 2 +atomic_t hh_count; +#endif + +static void dst_run_gc(unsigned long dummy) +{ + int delayed = 0; + struct dst_entry * dst, **dstp; + + del_timer(&dst_gc_timer); + dstp = &dst_garbage_list; + while ((dst = *dstp) != NULL) { + if (atomic_read(&dst->use)) { + dstp = &dst->next; + delayed++; + continue; + } + *dstp = dst->next; + dst_destroy(dst); + } + if (!dst_garbage_list) { + dst_gc_timer_inc = DST_GC_MAX; + return; + } + if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX) + dst_gc_timer_expires = DST_GC_MAX; + dst_gc_timer_inc += DST_GC_INC; + dst_gc_timer.expires = jiffies + dst_gc_timer_expires; +#if RT_CACHE_DEBUG >= 2 + printk("dst_total: %d/%d %ld\n", + atomic_read(&dst_total), delayed, dst_gc_timer_expires); +#endif + add_timer(&dst_gc_timer); +} + +static int dst_discard(struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +static int dst_blackhole(struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +void * dst_alloc(int size, struct dst_ops * ops) +{ + struct dst_entry * dst; + + if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { + if (ops->gc()) + return NULL; + } + dst = kmalloc(size, GFP_ATOMIC); + if (!dst) + return NULL; + memset(dst, 0, size); + dst->ops = ops; + atomic_set(&dst->refcnt, 0); + dst->lastuse = jiffies; + dst->input = dst_discard; + dst->output = dst_blackhole; + atomic_inc(&dst_total); + atomic_inc(&ops->entries); + return dst; +} + +void __dst_free(struct dst_entry * dst) +{ + start_bh_atomic(); + /* The first case (dev==NULL) is required, when + protocol module is unloaded. 
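The collection pass above backs off when it keeps finding entries that are still referenced, and __dst_free() snaps the timer back to its minimum whenever fresh garbage is queued. A standalone sketch of that arithmetic; GC_MIN/GC_INC/GC_MAX are placeholders, not the DST_GC_* values from <net/dst.h>:

/* Standalone model of the garbage-collection back-off in dst_run_gc():
 * every pass that still finds referenced entries lengthens the next
 * interval, and __dst_free() restarts from the minimum when new garbage
 * arrives.
 */
#include <stdio.h>

#define GC_MIN   1
#define GC_INC   5
#define GC_MAX 120

int main(void)
{
	unsigned long expires, inc;
	int pass;

	/* __dst_free() queues the first entry: restart from the minimum. */
	inc = GC_INC;
	expires = GC_MIN;

	for (pass = 1; pass <= 6; pass++) {
		printf("pass %d: next gc in %lu ticks\n", pass, expires);
		/* dst_run_gc() with work left over: grow the interval. */
		if ((expires += inc) > GC_MAX)
			expires = GC_MAX;
		inc += GC_INC;
	}
	return 0;
}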
+ */ + if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { + dst->input = dst_discard; + dst->output = dst_blackhole; + dst->dev = &loopback_dev; + } + dst->obsolete = 2; + dst->next = dst_garbage_list; + dst_garbage_list = dst; + if (dst_gc_timer_inc > DST_GC_INC) { + del_timer(&dst_gc_timer); + dst_gc_timer_inc = DST_GC_INC; + dst_gc_timer_expires = DST_GC_MIN; + dst_gc_timer.expires = jiffies + dst_gc_timer_expires; + add_timer(&dst_gc_timer); + } + end_bh_atomic(); +} + +void dst_destroy(struct dst_entry * dst) +{ + struct neighbour *neigh = dst->neighbour; + struct hh_cache *hh = dst->hh; + + dst->hh = NULL; + if (hh && atomic_dec_and_test(&hh->hh_refcnt)) + kfree(hh); + + if (neigh) { + dst->neighbour = NULL; + neigh_release(neigh); + } + + atomic_dec(&dst->ops->entries); + + if (dst->ops->destroy) + dst->ops->destroy(dst); + atomic_dec(&dst_total); + kfree(dst); +} diff --git a/pfinet/linux-src/net/core/filter.c b/pfinet/linux-src/net/core/filter.c new file mode 100644 index 00000000..8e1ffb62 --- /dev/null +++ b/pfinet/linux-src/net/core/filter.c @@ -0,0 +1,454 @@ +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Author: + * Jay Schulist <Jay.Schulist@spacs.k12.wi.us> + * + * Based on the design of: + * - The Berkeley Packet Filter + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Andi Kleen - Fix a few bad bugs and races. + */ + +#include <linux/config.h> +#if defined(CONFIG_FILTER) + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/fcntl.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_packet.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/filter.h> + +/* No hurry in this branch */ + +static u8 *load_pointer(struct sk_buff *skb, int k) +{ + u8 *ptr = NULL; + + if (k>=SKF_NET_OFF) + ptr = skb->nh.raw + k - SKF_NET_OFF; + else if (k>=SKF_LL_OFF) + ptr = skb->mac.raw + k - SKF_LL_OFF; + + if (ptr<skb->head && ptr < skb->tail) + return ptr; + return NULL; +} + +/* + * Decode and apply filter instructions to the skb->data. + * Return length to keep, 0 for none. skb is the data we are + * filtering, filter is the array of filter instructions, and + * len is the number of filter blocks in the array. + */ + +int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) +{ + unsigned char *data = skb->data; + /* len is UNSIGNED. Byte wide insns relies only on implicit + type casts to prevent reading arbitrary memory locations. + */ + unsigned int len = skb->len; + struct sock_filter *fentry; /* We walk down these */ + u32 A = 0; /* Accumulator */ + u32 X = 0; /* Index Register */ + u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ + int k; + int pc; + + /* + * Process array of filter instructions. 
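For orientation, this is the shape of program the interpreter below executes: a classic four-instruction filter that keeps IPv4 frames and drops everything else. BPF_STMT/BPF_JUMP are the standard helpers from <linux/filter.h>; the offset 12 assumes the filter sees an Ethernet header at the start of the data:

#include <linux/filter.h>

static struct sock_filter ipv4_only[] = {
	/* A = half word at byte 12 (the EtherType field)  */
	BPF_STMT(BPF_LD | BPF_H | BPF_ABS, 12),
	/* if (A == ETH_P_IP) fall through, else skip one  */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x0800, 0, 1),
	/* accept: keep up to 64K of the packet            */
	BPF_STMT(BPF_RET | BPF_K, 0xffff),
	/* reject: keep nothing                            */
	BPF_STMT(BPF_RET | BPF_K, 0),
};

sk_chk_filter() further down accepts a program like this: every jump is forward and the final instruction is a BPF_RET.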
+ */ + + for(pc = 0; pc < flen; pc++) + { + fentry = &filter[pc]; + + switch(fentry->code) + { + case BPF_ALU|BPF_ADD|BPF_X: + A += X; + continue; + + case BPF_ALU|BPF_ADD|BPF_K: + A += fentry->k; + continue; + + case BPF_ALU|BPF_SUB|BPF_X: + A -= X; + continue; + + case BPF_ALU|BPF_SUB|BPF_K: + A -= fentry->k; + continue; + + case BPF_ALU|BPF_MUL|BPF_X: + A *= X; + continue; + + case BPF_ALU|BPF_MUL|BPF_K: + A *= fentry->k; + continue; + + case BPF_ALU|BPF_DIV|BPF_X: + if(X == 0) + return (0); + A /= X; + continue; + + case BPF_ALU|BPF_DIV|BPF_K: + if(fentry->k == 0) + return (0); + A /= fentry->k; + continue; + + case BPF_ALU|BPF_AND|BPF_X: + A &= X; + continue; + + case BPF_ALU|BPF_AND|BPF_K: + A &= fentry->k; + continue; + + case BPF_ALU|BPF_OR|BPF_X: + A |= X; + continue; + + case BPF_ALU|BPF_OR|BPF_K: + A |= fentry->k; + continue; + + case BPF_ALU|BPF_LSH|BPF_X: + A <<= X; + continue; + + case BPF_ALU|BPF_LSH|BPF_K: + A <<= fentry->k; + continue; + + case BPF_ALU|BPF_RSH|BPF_X: + A >>= X; + continue; + + case BPF_ALU|BPF_RSH|BPF_K: + A >>= fentry->k; + continue; + + case BPF_ALU|BPF_NEG: + A = -A; + continue; + + case BPF_JMP|BPF_JA: + pc += fentry->k; + continue; + + case BPF_JMP|BPF_JGT|BPF_K: + pc += (A > fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_K: + pc += (A >= fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_K: + pc += (A == fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_K: + pc += (A & fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JGT|BPF_X: + pc += (A > X) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_X: + pc += (A >= X) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_X: + pc += (A == X) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_X: + pc += (A & X) ? 
fentry->jt : fentry->jf; + continue; + + case BPF_LD|BPF_W|BPF_ABS: + k = fentry->k; +load_w: + if(k+sizeof(u32) <= len) { + A = ntohl(*(u32*)&data[k]); + continue; + } + if (k<0) { + u8 *ptr; + + if (k>=SKF_AD_OFF) + break; + if ((ptr = load_pointer(skb, k)) != NULL) { + A = ntohl(*(u32*)ptr); + continue; + } + } + return 0; + + case BPF_LD|BPF_H|BPF_ABS: + k = fentry->k; +load_h: + if(k + sizeof(u16) <= len) { + A = ntohs(*(u16*)&data[k]); + continue; + } + if (k<0) { + u8 *ptr; + + if (k>=SKF_AD_OFF) + break; + if ((ptr = load_pointer(skb, k)) != NULL) { + A = ntohs(*(u16*)ptr); + continue; + } + } + return 0; + + case BPF_LD|BPF_B|BPF_ABS: + k = fentry->k; +load_b: + if(k < len) { + A = data[k]; + continue; + } + if (k<0) { + u8 *ptr; + + if (k>=SKF_AD_OFF) + break; + if ((ptr = load_pointer(skb, k)) != NULL) { + A = *ptr; + continue; + } + } + + case BPF_LD|BPF_W|BPF_LEN: + A = len; + continue; + + case BPF_LDX|BPF_W|BPF_LEN: + X = len; + continue; + + case BPF_LD|BPF_W|BPF_IND: + k = X + fentry->k; + goto load_w; + + case BPF_LD|BPF_H|BPF_IND: + k = X + fentry->k; + goto load_h; + + case BPF_LD|BPF_B|BPF_IND: + k = X + fentry->k; + goto load_b; + + case BPF_LDX|BPF_B|BPF_MSH: + k = fentry->k; + if(k >= len) + return (0); + X = (data[k] & 0xf) << 2; + continue; + + case BPF_LD|BPF_IMM: + A = fentry->k; + continue; + + case BPF_LDX|BPF_IMM: + X = fentry->k; + continue; + + case BPF_LD|BPF_MEM: + A = mem[fentry->k]; + continue; + + case BPF_LDX|BPF_MEM: + X = mem[fentry->k]; + continue; + + case BPF_MISC|BPF_TAX: + X = A; + continue; + + case BPF_MISC|BPF_TXA: + A = X; + continue; + + case BPF_RET|BPF_K: + return ((unsigned int)fentry->k); + + case BPF_RET|BPF_A: + return ((unsigned int)A); + + case BPF_ST: + mem[fentry->k] = A; + continue; + + case BPF_STX: + mem[fentry->k] = X; + continue; + + default: + /* Invalid instruction counts as RET */ + return (0); + } + + /* Handle ancillary data, which are impossible + (or very difficult) to get parsing packet contents. + */ + switch (k-SKF_AD_OFF) { + case SKF_AD_PROTOCOL: + A = htons(skb->protocol); + continue; + case SKF_AD_PKTTYPE: + A = skb->pkt_type; + continue; + case SKF_AD_IFINDEX: + A = skb->dev->ifindex; + continue; + default: + return 0; + } + } + + return (0); +} + +/* + * Check the user's filter code. If we let some ugly + * filter code slip through kaboom! + */ + +int sk_chk_filter(struct sock_filter *filter, int flen) +{ + struct sock_filter *ftest; + int pc; + + /* + * Check the filter code now. + */ + for(pc = 0; pc < flen; pc++) + { + /* + * All jumps are forward as they are not signed + */ + + ftest = &filter[pc]; + if(BPF_CLASS(ftest->code) == BPF_JMP) + { + /* + * But they mustn't jump off the end. + */ + if(BPF_OP(ftest->code) == BPF_JA) + { + /* Note, the large ftest->k might cause + loops. Compare this with conditional + jumps below, where offsets are limited. --ANK (981016) + */ + if (ftest->k >= (unsigned)(flen-pc-1)) + return (-EINVAL); + } + else + { + /* + * For conditionals both must be safe + */ + if(pc + ftest->jt +1 >= flen || pc + ftest->jf +1 >= flen) + return (-EINVAL); + } + } + + /* + * Check that memory operations use valid addresses. + */ + + if (ftest->k >= BPF_MEMWORDS) + { + /* + * But it might not be a memory operation... + */ + switch (ftest->code) { + case BPF_ST: + case BPF_STX: + case BPF_LD|BPF_MEM: + case BPF_LDX|BPF_MEM: + return -EINVAL; + } + } + } + + /* + * The program must end with a return. 
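Two small programs the checks above reject, for illustration (same helper macros as before):

/* Rejected: a conditional jump that can land past the end of the program. */
static struct sock_filter bad_jump[] = {
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 5),	/* jf = 5, but only 2 insns */
	BPF_STMT(BPF_RET | BPF_K, 0),
};

/* Rejected: does not end with a BPF_RET instruction. */
static struct sock_filter no_ret[] = {
	BPF_STMT(BPF_LD | BPF_W | BPF_LEN, 0),
	BPF_STMT(BPF_MISC | BPF_TAX, 0),
};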
We don't care where they + * jumped within the script (its always forwards) but in the + * end they _will_ hit this. + */ + + return (BPF_CLASS(filter[flen - 1].code) == BPF_RET)?0:-EINVAL; +} + +/* + * Attach the user's filter code. We first run some sanity checks on + * it to make sure it does not explode on us later. + */ + +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + struct sk_filter *fp; + unsigned int fsize = sizeof(struct sock_filter) * fprog->len; + int err; + + /* Make sure new filter is there and in the right amounts. */ + if (fprog->filter == NULL || fprog->len > BPF_MAXINSNS) + return (-EINVAL); + + fp = (struct sk_filter *)sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); + if(fp == NULL) + return (-ENOMEM); + + if (copy_from_user(fp->insns, fprog->filter, fsize)) { + sock_kfree_s(sk, fp, fsize+sizeof(*fp)); + return -EFAULT; + } + + atomic_set(&fp->refcnt, 1); + fp->len = fprog->len; + + if ((err = sk_chk_filter(fp->insns, fp->len))==0) { + struct sk_filter *old_fp = sk->filter; + sk->filter = fp; + synchronize_bh(); + fp = old_fp; + } + + if (fp) + sk_filter_release(sk, fp); + + return (err); +} +#endif /* CONFIG_FILTER */ diff --git a/pfinet/linux-src/net/core/firewall.c b/pfinet/linux-src/net/core/firewall.c new file mode 100644 index 00000000..fc7b1a51 --- /dev/null +++ b/pfinet/linux-src/net/core/firewall.c @@ -0,0 +1,160 @@ +/* + * Generic loadable firewalls. At the moment only IP will actually + * use these, but people can add the others as they are needed. + * + * Authors: Dave Bonn (for IP) + * much hacked by: Alan Cox + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/firewall.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <asm/semaphore.h> + +struct semaphore firewall_sem = MUTEX; +static int firewall_policy[NPROTO]; +static struct firewall_ops *firewall_chain[NPROTO]; + +/* + * Register a firewall + */ + +int register_firewall(int pf, struct firewall_ops *fw) +{ + struct firewall_ops **p; + + if(pf<0||pf>=NPROTO) + return -EINVAL; + + /* + * Don't allow two people to adjust at once. + */ + + down(&firewall_sem); + + p=&firewall_chain[pf]; + + while(*p) + { + if(fw->fw_priority > (*p)->fw_priority) + break; + p=&((*p)->next); + } + + /* + * We need to use a memory barrier to make sure that this + * works correctly even in SMP with weakly ordered writes. + * + * This is atomic wrt interrupts (and generally walking the + * chain), but not wrt itself (so you can't call this from + * an interrupt. Not that you'd want to). + */ + + fw->next=*p; + mb(); + *p = fw; + + /* + * And release the sleep lock + */ + + up(&firewall_sem); + return 0; +} + +/* + * Unregister a firewall + */ + +int unregister_firewall(int pf, struct firewall_ops *fw) +{ + struct firewall_ops **nl; + + if(pf<0||pf>=NPROTO) + return -EINVAL; + + /* + * Don't allow two people to adjust at once. 
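sk_attach_filter() is normally reached from setsockopt(SO_ATTACH_FILTER). A minimal user-space sketch of the attach, with error handling omitted:

/* User-space side: build a sock_fprog and hand it to the socket.
 * SO_ATTACH_FILTER ends up in sk_attach_filter() above, which copies
 * the instructions, runs sk_chk_filter() and swaps sk->filter.
 */
#include <sys/socket.h>
#include <linux/filter.h>

static struct sock_filter accept_all[] = {
	BPF_STMT(BPF_RET | BPF_K, 0xffff),	/* keep up to 64K of every packet */
};

int attach_accept_all(int fd)
{
	struct sock_fprog prog = { 1, accept_all };	/* len, filter */

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
}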
+ */ + + down(&firewall_sem); + + nl=&firewall_chain[pf]; + + while(*nl!=NULL) + { + if(*nl==fw) + { + struct firewall_ops *f=fw->next; + *nl = f; + up(&firewall_sem); + synchronize_bh(); + return 0; + } + nl=&((*nl)->next); + } + up(&firewall_sem); + return -ENOENT; +} + +int call_fw_firewall(int pf, struct device *dev, void *phdr, void *arg, struct sk_buff **skb) +{ + struct firewall_ops *fw=firewall_chain[pf]; + + while(fw!=NULL) + { + int rc=fw->fw_forward(fw,pf,dev,phdr,arg,skb); + if(rc!=FW_SKIP) + return rc; + fw=fw->next; + } + return firewall_policy[pf]; +} + +/* + * Actual invocation of the chains + */ + +int call_in_firewall(int pf, struct device *dev, void *phdr, void *arg, struct sk_buff **skb) +{ + struct firewall_ops *fw=firewall_chain[pf]; + + while(fw!=NULL) + { + int rc=fw->fw_input(fw,pf,dev,phdr,arg,skb); + if(rc!=FW_SKIP) + return rc; + fw=fw->next; + } + return firewall_policy[pf]; +} + +int call_out_firewall(int pf, struct device *dev, void *phdr, void *arg, struct sk_buff **skb) +{ + struct firewall_ops *fw=firewall_chain[pf]; + + while(fw!=NULL) + { + int rc=fw->fw_output(fw,pf,dev,phdr,arg,skb); + if(rc!=FW_SKIP) + return rc; + fw=fw->next; + } + /* alan, is this right? */ + return firewall_policy[pf]; +} + +EXPORT_SYMBOL(register_firewall); +EXPORT_SYMBOL(unregister_firewall); +EXPORT_SYMBOL(call_in_firewall); +EXPORT_SYMBOL(call_out_firewall); +EXPORT_SYMBOL(call_fw_firewall); + +__initfunc(void fwchain_init(void)) +{ + int i; + for(i=0;i<NPROTO;i++) + firewall_policy[i]=FW_ACCEPT; +} diff --git a/pfinet/linux-src/net/core/iovec.c b/pfinet/linux-src/net/core/iovec.c new file mode 100644 index 00000000..c20f8530 --- /dev/null +++ b/pfinet/linux-src/net/core/iovec.c @@ -0,0 +1,278 @@ +/* + * iovec manipulation routines. + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Andrew Lunn : Errors in iovec copying. + * Pedro Roque : Added memcpy_fromiovecend and + * csum_..._fromiovecend. + * Andi Kleen : fixed error handling for 2.1 + * Alexey Kuznetsov: 2.1 optimisations + * Andi Kleen : Fix csum*fromiovecend for IPv6. + */ + + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/malloc.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <asm/uaccess.h> +#include <asm/byteorder.h> +#include <net/checksum.h> + +/* + * Verify iovec. The caller must ensure that the iovec is big enough + * to hold the message iovec. + * + * Save time not doing verify_area. copy_*_user will make this work + * in any case. + */ + +int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) +{ + int size, err, ct; + + if(m->msg_namelen) + { + if(mode==VERIFY_READ) + { + err=move_addr_to_kernel(m->msg_name, m->msg_namelen, address); + if(err<0) + goto out; + } + + m->msg_name = address; + } else + m->msg_name = NULL; + + err = -EFAULT; + size = m->msg_iovlen * sizeof(struct iovec); + if (copy_from_user(iov, m->msg_iov, size)) + goto out; + m->msg_iov=iov; + + for (err = 0, ct = 0; ct < m->msg_iovlen; ct++) { + err += iov[ct].iov_len; + /* Goal is not to verify user data, but to prevent returning + negative value, which is interpreted as errno. + Overflow is still possible, but it is harmless. 
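The return convention is easy to miss: on success verify_iovec() returns the total byte count described by the vector, and the loop above only guards against that signed sum wrapping negative, which a caller would misread as an errno. A standalone model of the accounting:

/* Standalone model of the length accounting in verify_iovec(): sum the
 * segment lengths into a signed int and fail with EMSGSIZE if the sum
 * wraps negative, as the loop above does.
 */
#include <stdio.h>
#include <sys/uio.h>
#include <errno.h>

static int total_len(const struct iovec *iov, int iovlen)
{
	int len = 0, i;

	for (i = 0; i < iovlen; i++) {
		len += iov[i].iov_len;
		if (len < 0)
			return -EMSGSIZE;	/* overflowed into the errno range */
	}
	return len;
}

int main(void)
{
	char a[3], b[5];
	struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };

	printf("%d bytes\n", total_len(iov, 2));	/* prints "8 bytes" */
	return 0;
}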
+ */ + if (err < 0) + return -EMSGSIZE; + } +out: + return err; +} + +/* + * Copy kernel to iovec. Returns -EFAULT on error. + * + * Note: this modifies the original iovec. + */ + +int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) +{ + int err = -EFAULT; + + while(len>0) + { + if(iov->iov_len) + { + int copy = min(iov->iov_len, len); + if (copy_to_user(iov->iov_base, kdata, copy)) + goto out; + kdata+=copy; + len-=copy; + iov->iov_len-=copy; + iov->iov_base+=copy; + } + iov++; + } + err = 0; +out: + return err; +} + +/* + * In kernel copy to iovec. Returns -EFAULT on error. + * + * Note: this modifies the original iovec. + */ + +void memcpy_tokerneliovec(struct iovec *iov, unsigned char *kdata, int len) +{ + while(len>0) + { + if(iov->iov_len) + { + int copy = min(iov->iov_len, len); + memcpy(iov->iov_base, kdata, copy); + kdata+=copy; + len-=copy; + iov->iov_len-=copy; + iov->iov_base+=copy; + } + iov++; + } +} + + +/* + * Copy iovec to kernel. Returns -EFAULT on error. + * + * Note: this modifies the original iovec. + */ + +int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) +{ + int err = -EFAULT; + + while(len>0) + { + if(iov->iov_len) + { + int copy = min(len, iov->iov_len); + if (copy_from_user(kdata, iov->iov_base, copy)) + goto out; + len-=copy; + kdata+=copy; + iov->iov_base+=copy; + iov->iov_len-=copy; + } + iov++; + } + err = 0; +out: + return err; +} + + +/* + * For use with ip_build_xmit + */ + +int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, + int len) +{ + int err = -EFAULT; + + /* Skip over the finished iovecs */ + while(offset >= iov->iov_len) + { + offset -= iov->iov_len; + iov++; + } + + while (len > 0) + { + u8 *base = iov->iov_base + offset; + int copy = min(len, iov->iov_len - offset); + + offset = 0; + if (copy_from_user(kdata, base, copy)) + goto out; + len -= copy; + kdata += copy; + iov++; + } + err = 0; +out: + return err; +} + +/* + * And now for the all-in-one: copy and checksum from a user iovec + * directly to a datagram + * Calls to csum_partial but the last must be in 32 bit chunks + * + * ip_build_xmit must ensure that when fragmenting only the last + * call to this function will be unaligned also. + */ + +int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, + int offset, unsigned int len, int *csump) +{ + int csum = *csump; + int partial_cnt = 0, err = 0; + + /* Skip over the finished iovecs */ + while (offset >= iov->iov_len) + { + offset -= iov->iov_len; + iov++; + } + + while (len > 0) + { + u8 *base = iov->iov_base + offset; + unsigned int copy = min(len, iov->iov_len - offset); + + offset = 0; + /* There is a remnant from previous iov. */ + if (partial_cnt) + { + int par_len = 4 - partial_cnt; + + /* iov component is too short ... 
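A standalone model of the walk memcpy_fromiovecend() performs: skip the segments wholly covered by the starting offset, then copy from each remaining segment in turn. Plain memcpy stands in for copy_from_user, and the function name is made up for the sketch:

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

/* Copy len bytes out of an iovec, starting offset bytes in.  Unlike
 * memcpy_fromiovec(), this variant does not consume the iovec. */
static void copy_from_iovec_end(unsigned char *kdata, const struct iovec *iov,
				int offset, int len)
{
	while (offset >= (int)iov->iov_len) {	/* skip finished segments */
		offset -= iov->iov_len;
		iov++;
	}
	while (len > 0) {
		unsigned char *base = (unsigned char *)iov->iov_base + offset;
		int copy = iov->iov_len - offset;

		if (copy > len)
			copy = len;
		memcpy(kdata, base, copy);	/* kernel code uses copy_from_user() */
		offset = 0;
		len -= copy;
		kdata += copy;
		iov++;
	}
}

int main(void)
{
	struct iovec iov[2] = { { "hello ", 6 }, { "world", 5 } };
	unsigned char out[9];

	copy_from_iovec_end(out, iov, 2, 8);	/* gathers "llo worl" */
	printf("%.8s\n", out);
	return 0;
}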
*/ + if (par_len > copy) { + if (copy_from_user(kdata, base, copy)) + goto out_fault; + kdata += copy; + base += copy; + partial_cnt += copy; + len -= copy; + iov++; + if (len) + continue; + *csump = csum_partial(kdata - partial_cnt, + partial_cnt, csum); + goto out; + } + if (copy_from_user(kdata, base, par_len)) + goto out_fault; + csum = csum_partial(kdata - partial_cnt, 4, csum); + kdata += par_len; + base += par_len; + copy -= par_len; + len -= par_len; + partial_cnt = 0; + } + + if (len > copy) + { + partial_cnt = copy % 4; + if (partial_cnt) + { + copy -= partial_cnt; + if (copy_from_user(kdata + copy, base + copy, + partial_cnt)) + goto out_fault; + } + } + + if (copy) { + csum = csum_and_copy_from_user(base, kdata, copy, + csum, &err); + if (err) + goto out; + } + len -= copy + partial_cnt; + kdata += copy + partial_cnt; + iov++; + } + *csump = csum; +out: + return err; + +out_fault: + err = -EFAULT; + goto out; +} diff --git a/pfinet/linux-src/net/core/neighbour.c b/pfinet/linux-src/net/core/neighbour.c new file mode 100644 index 00000000..6afbfdcc --- /dev/null +++ b/pfinet/linux-src/net/core/neighbour.c @@ -0,0 +1,1394 @@ +/* + * Generic address resolution entity + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Vitaly E. Lavrov releasing NULL neighbor in neigh_add. + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/socket.h> +#include <linux/sched.h> +#include <linux/netdevice.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif +#include <net/neighbour.h> +#include <net/dst.h> +#include <net/sock.h> +#include <linux/rtnetlink.h> + +/* + NOTE. The most unpleasent question is serialization of + accesses to resolved addresses. The problem is that addresses + are modified by bh, but they are referenced from normal + kernel thread. Before today no locking was made. + My reasoning was that corrupted address token will be copied + to packet with cosmologically small probability + (it is even difficult to estimate such small number) + and it is very silly to waste cycles in fast path to lock them. + + But now I changed my mind, but not because previous statement + is wrong. Actually, neigh->ha MAY BE not opaque byte array, + but reference to some private data. In this case even neglibible + corruption probability becomes bug. + + - hh cache is protected by rwlock. It assumes that + hh cache update procedure is short and fast, and that + read_lock is cheaper than start_bh_atomic(). + - ha tokens, saved in neighbour entries, are protected + by bh_atomic(). + - no protection is made in /proc reading. It is OK, because + /proc is broken by design in any case, and + corrupted output is normal behaviour there. + + --ANK (981025) + */ + +#define NEIGH_DEBUG 1 + +#define NEIGH_PRINTK(x...) printk(x) +#define NEIGH_NOPRINTK(x...) 
do { ; } while(0) +#define NEIGH_PRINTK0 NEIGH_PRINTK +#define NEIGH_PRINTK1 NEIGH_NOPRINTK +#define NEIGH_PRINTK2 NEIGH_NOPRINTK + +#if NEIGH_DEBUG >= 1 +#undef NEIGH_PRINTK1 +#define NEIGH_PRINTK1 NEIGH_PRINTK +#endif +#if NEIGH_DEBUG >= 2 +#undef NEIGH_PRINTK2 +#define NEIGH_PRINTK2 NEIGH_PRINTK +#endif + +static void neigh_timer_handler(unsigned long arg); +#ifdef CONFIG_ARPD +static void neigh_app_notify(struct neighbour *n); +#endif +static int pneigh_ifdown(struct neigh_table *tbl, struct device *dev); + +static int neigh_glbl_allocs; +static struct neigh_table *neigh_tables; + +static int neigh_blackhole(struct sk_buff *skb) +{ + kfree_skb(skb); + return -ENETDOWN; +} + +/* + * It is random distribution in the interval (1/2)*base...(3/2)*base. + * It corresponds to default IPv6 settings and is not overridable, + * because it is really reasonbale choice. + */ + +unsigned long neigh_rand_reach_time(unsigned long base) +{ + return (net_random() % base) + (base>>1); +} + + +static int neigh_forced_gc(struct neigh_table *tbl) +{ + int shrunk = 0; + int i; + + if (atomic_read(&tbl->lock)) + return 0; + + for (i=0; i<=NEIGH_HASHMASK; i++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + /* Neighbour record may be discarded if: + - nobody refers to it. + - it is not premanent + - (NEW and probably wrong) + INCOMPLETE entries are kept at least for + n->parms->retrans_time, otherwise we could + flood network with resolution requests. + It is not clear, what is better table overflow + or flooding. + */ + if (atomic_read(&n->refcnt) == 0 && + !(n->nud_state&NUD_PERMANENT) && + (n->nud_state != NUD_INCOMPLETE || + jiffies - n->used > n->parms->retrans_time)) { + *np = n->next; + n->tbl = NULL; + tbl->entries--; + shrunk = 1; + neigh_destroy(n); + continue; + } + np = &n->next; + } + } + + tbl->last_flush = jiffies; + return shrunk; +} + +int neigh_ifdown(struct neigh_table *tbl, struct device *dev) +{ + int i; + + if (atomic_read(&tbl->lock)) { + NEIGH_PRINTK1("neigh_ifdown: impossible event 1763\n"); + return -EBUSY; + } + + start_bh_atomic(); + for (i=0; i<=NEIGH_HASHMASK; i++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + if (dev && n->dev != dev) { + np = &n->next; + continue; + } + *np = n->next; + n->tbl = NULL; + tbl->entries--; + if (atomic_read(&n->refcnt)) { + /* The most unpleasant situation. + We must destroy neighbour entry, + but someone still uses it. + + The destroy will be delayed until + the last user releases us, but + we must kill timers etc. and move + it to safe state. 
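A standalone model of neigh_rand_reach_time(): the result is uniform in [base/2, 3*base/2), so a 30 tick base gives an effective ReachableTime between 15 and 44 ticks. rand() stands in for the kernel's net_random():

#include <stdio.h>
#include <stdlib.h>

static unsigned long reach_time(unsigned long base)
{
	return ((unsigned long)rand() % base) + (base >> 1);
}

int main(void)
{
	int i;

	for (i = 0; i < 5; i++)
		printf("%lu\n", reach_time(30));	/* always in [15, 44] */
	return 0;
}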
+ */ + if (n->nud_state & NUD_IN_TIMER) + del_timer(&n->timer); + n->parms = &tbl->parms; + skb_queue_purge(&n->arp_queue); + n->output = neigh_blackhole; + if (n->nud_state&NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + NEIGH_PRINTK2("neigh %p is stray.\n", n); + } else + neigh_destroy(n); + } + } + + del_timer(&tbl->proxy_timer); + skb_queue_purge(&tbl->proxy_queue); + pneigh_ifdown(tbl, dev); + end_bh_atomic(); + return 0; +} + +static struct neighbour *neigh_alloc(struct neigh_table *tbl, int creat) +{ + struct neighbour *n; + unsigned long now = jiffies; + + if (tbl->entries > tbl->gc_thresh1) { + if (creat < 0) + return NULL; + if (tbl->entries > tbl->gc_thresh3 || + (tbl->entries > tbl->gc_thresh2 && + now - tbl->last_flush > 5*HZ)) { + if (neigh_forced_gc(tbl) == 0 && + tbl->entries > tbl->gc_thresh3) + return NULL; + } + } + + n = kmalloc(tbl->entry_size, GFP_ATOMIC); + if (n == NULL) + return NULL; + + memset(n, 0, tbl->entry_size); + + skb_queue_head_init(&n->arp_queue); + n->updated = n->used = now; + n->nud_state = NUD_NONE; + n->output = neigh_blackhole; + n->parms = &tbl->parms; + init_timer(&n->timer); + n->timer.function = neigh_timer_handler; + n->timer.data = (unsigned long)n; + tbl->stats.allocs++; + neigh_glbl_allocs++; + return n; +} + + +struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey, + struct device *dev, int creat) +{ + struct neighbour *n; + u32 hash_val; + int key_len = tbl->key_len; + + hash_val = *(u32*)(pkey + key_len - 4); + hash_val ^= (hash_val>>16); + hash_val ^= hash_val>>8; + hash_val ^= hash_val>>3; + hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK; + + for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { + if (dev == n->dev && + memcmp(n->primary_key, pkey, key_len) == 0) { + atomic_inc(&n->refcnt); + return n; + } + } + if (!creat) + return NULL; + + n = neigh_alloc(tbl, creat); + if (n == NULL) + return NULL; + + memcpy(n->primary_key, pkey, key_len); + n->dev = dev; + + /* Protocol specific setup. */ + if (tbl->constructor && tbl->constructor(n) < 0) { + neigh_destroy(n); + return NULL; + } + + /* Device specific setup. 
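A standalone model of the bucket selection in __neigh_lookup(): the last four bytes of the protocol key (for IPv4, the address itself) are folded down and mixed with the interface index. HASHMASK is a placeholder for NEIGH_HASHMASK, whose real value lives in <net/neighbour.h>:

#include <stdio.h>
#include <string.h>

#define HASHMASK 0x1f	/* placeholder for NEIGH_HASHMASK */

static unsigned int neigh_bucket(const unsigned char *key, int key_len, int ifindex)
{
	unsigned int h;

	memcpy(&h, key + key_len - 4, 4);	/* last 32 bits of the key */
	h ^= h >> 16;
	h ^= h >> 8;
	h ^= h >> 3;
	return (h ^ (unsigned int)ifindex) & HASHMASK;
}

int main(void)
{
	unsigned char ip[4] = { 192, 168, 1, 1 };

	printf("bucket %u\n", neigh_bucket(ip, sizeof(ip), 2));
	return 0;
}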
*/ + if (n->parms && n->parms->neigh_setup && n->parms->neigh_setup(n) < 0) { + neigh_destroy(n); + return NULL; + } + + n->confirmed = jiffies - (n->parms->base_reachable_time<<1); + atomic_set(&n->refcnt, 1); + tbl->entries++; + n->next = tbl->hash_buckets[hash_val]; + tbl->hash_buckets[hash_val] = n; + n->tbl = tbl; + NEIGH_PRINTK2("neigh %p is created.\n", n); + return n; +} + +struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, + struct device *dev, int creat) +{ + struct pneigh_entry *n; + u32 hash_val; + int key_len = tbl->key_len; + + hash_val = *(u32*)(pkey + key_len - 4); + hash_val ^= (hash_val>>16); + hash_val ^= hash_val>>8; + hash_val ^= hash_val>>4; + hash_val &= PNEIGH_HASHMASK; + + for (n = tbl->phash_buckets[hash_val]; n; n = n->next) { + if (memcmp(n->key, pkey, key_len) == 0 && + (n->dev == dev || !n->dev)) + return n; + } + if (!creat) + return NULL; + + n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL); + if (n == NULL) + return NULL; + + memcpy(n->key, pkey, key_len); + n->dev = dev; + + if (tbl->pconstructor && tbl->pconstructor(n)) { + kfree(n); + return NULL; + } + + n->next = tbl->phash_buckets[hash_val]; + tbl->phash_buckets[hash_val] = n; + return n; +} + + +int pneigh_delete(struct neigh_table *tbl, const void *pkey, struct device *dev) +{ + struct pneigh_entry *n, **np; + u32 hash_val; + int key_len = tbl->key_len; + + hash_val = *(u32*)(pkey + key_len - 4); + hash_val ^= (hash_val>>16); + hash_val ^= hash_val>>8; + hash_val ^= hash_val>>4; + hash_val &= PNEIGH_HASHMASK; + + for (np = &tbl->phash_buckets[hash_val]; (n=*np) != NULL; np = &n->next) { + if (memcmp(n->key, pkey, key_len) == 0 && n->dev == dev) { + *np = n->next; + synchronize_bh(); + if (tbl->pdestructor) + tbl->pdestructor(n); + kfree(n); + return 0; + } + } + return -ENOENT; +} + +static int pneigh_ifdown(struct neigh_table *tbl, struct device *dev) +{ + struct pneigh_entry *n, **np; + u32 h; + + for (h=0; h<=PNEIGH_HASHMASK; h++) { + np = &tbl->phash_buckets[h]; + while ((n=*np) != NULL) { + if (n->dev == dev || dev == NULL) { + *np = n->next; + synchronize_bh(); + if (tbl->pdestructor) + tbl->pdestructor(n); + kfree(n); + continue; + } + np = &n->next; + } + } + return -ENOENT; +} + + +/* + * neighbour must already be out of the table; + * + */ +void neigh_destroy(struct neighbour *neigh) +{ + struct hh_cache *hh; + + if (neigh->tbl || atomic_read(&neigh->refcnt)) { + NEIGH_PRINTK1("neigh_destroy: neighbour is use tbl=%p, ref=%d: " + "called from %p\n", neigh->tbl, atomic_read(&neigh->refcnt), __builtin_return_address(0)); + return; + } + + if (neigh->nud_state&NUD_IN_TIMER) + del_timer(&neigh->timer); + + while ((hh = neigh->hh) != NULL) { + neigh->hh = hh->hh_next; + hh->hh_next = NULL; + hh->hh_output = neigh_blackhole; + if (atomic_dec_and_test(&hh->hh_refcnt)) + kfree(hh); + } + + if (neigh->ops && neigh->ops->destructor) + (neigh->ops->destructor)(neigh); + + skb_queue_purge(&neigh->arp_queue); + + NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); + + neigh_glbl_allocs--; + kfree(neigh); +} + +/* Neighbour state is suspicious; + disable fast path. + */ +static void neigh_suspect(struct neighbour *neigh) +{ + struct hh_cache *hh; + + NEIGH_PRINTK2("neigh %p is suspecteded.\n", neigh); + + neigh->output = neigh->ops->output; + + for (hh = neigh->hh; hh; hh = hh->hh_next) + hh->hh_output = neigh->ops->output; +} + +/* Neighbour state is OK; + enable fast path. 
+ */ +static void neigh_connect(struct neighbour *neigh) +{ + struct hh_cache *hh; + + NEIGH_PRINTK2("neigh %p is connected.\n", neigh); + + neigh->output = neigh->ops->connected_output; + + for (hh = neigh->hh; hh; hh = hh->hh_next) + hh->hh_output = neigh->ops->hh_output; +} + +/* + Transitions NUD_STALE <-> NUD_REACHABLE do not occur + when fast path is built: we have no timers assotiated with + these states, we do not have time to check state when sending. + neigh_periodic_timer check periodically neigh->confirmed + time and moves NUD_REACHABLE -> NUD_STALE. + + If a routine wants to know TRUE entry state, it calls + neigh_sync before checking state. + */ + +static void neigh_sync(struct neighbour *n) +{ + unsigned long now = jiffies; + u8 state = n->nud_state; + + if (state&(NUD_NOARP|NUD_PERMANENT)) + return; + if (state&NUD_REACHABLE) { + if (now - n->confirmed > n->parms->reachable_time) { + n->nud_state = NUD_STALE; + neigh_suspect(n); + } + } else if (state&NUD_VALID) { + if (now - n->confirmed < n->parms->reachable_time) { + if (state&NUD_IN_TIMER) + del_timer(&n->timer); + n->nud_state = NUD_REACHABLE; + neigh_connect(n); + } + } +} + +static void neigh_periodic_timer(unsigned long arg) +{ + struct neigh_table *tbl = (struct neigh_table*)arg; + unsigned long now = jiffies; + int i; + + if (atomic_read(&tbl->lock)) { + tbl->gc_timer.expires = now + 1*HZ; + add_timer(&tbl->gc_timer); + return; + } + + /* + * periodicly recompute ReachableTime from random function + */ + + if (now - tbl->last_rand > 300*HZ) { + struct neigh_parms *p; + tbl->last_rand = now; + for (p=&tbl->parms; p; p = p->next) + p->reachable_time = neigh_rand_reach_time(p->base_reachable_time); + } + + for (i=0; i <= NEIGH_HASHMASK; i++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + unsigned state = n->nud_state; + + if (state&(NUD_PERMANENT|NUD_IN_TIMER)) + goto next_elt; + + if ((long)(n->used - n->confirmed) < 0) + n->used = n->confirmed; + + if (atomic_read(&n->refcnt) == 0 && + (state == NUD_FAILED || now - n->used > n->parms->gc_staletime)) { + *np = n->next; + n->tbl = NULL; + n->next = NULL; + tbl->entries--; + neigh_destroy(n); + continue; + } + + if (n->nud_state&NUD_REACHABLE && + now - n->confirmed > n->parms->reachable_time) { + n->nud_state = NUD_STALE; + neigh_suspect(n); + } + +next_elt: + np = &n->next; + } + } + + tbl->gc_timer.expires = now + tbl->gc_interval; + add_timer(&tbl->gc_timer); +} + +static __inline__ int neigh_max_probes(struct neighbour *n) +{ + struct neigh_parms *p = n->parms; + return p->ucast_probes + p->app_probes + p->mcast_probes; +} + + +/* Called when a timer expires for a neighbour entry. */ + +static void neigh_timer_handler(unsigned long arg) +{ + unsigned long now = jiffies; + struct neighbour *neigh = (struct neighbour*)arg; + unsigned state = neigh->nud_state; + + if (!(state&NUD_IN_TIMER)) { + NEIGH_PRINTK1("neigh: timer & !nud_in_timer\n"); + return; + } + + if ((state&NUD_VALID) && + now - neigh->confirmed < neigh->parms->reachable_time) { + neigh->nud_state = NUD_REACHABLE; + NEIGH_PRINTK2("neigh %p is still alive.\n", neigh); + neigh_connect(neigh); + return; + } + if (state == NUD_DELAY) { + NEIGH_PRINTK2("neigh %p is probed.\n", neigh); + neigh->nud_state = NUD_PROBE; + neigh->probes = 0; + } + + if (neigh->probes >= neigh_max_probes(neigh)) { + struct sk_buff *skb; + + neigh->nud_state = NUD_FAILED; + neigh->tbl->stats.res_failed++; + NEIGH_PRINTK2("neigh %p is failed.\n", neigh); + + /* It is very thin place. 
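A reduced, user-space model of the neigh_sync() decision above: whether an entry stays REACHABLE depends only on how long ago it was last confirmed relative to reachable_time. The state names mirror the kernel's, but the values here are arbitrary:

#include <stdio.h>

enum { NUD_STALE, NUD_REACHABLE };

static int sync_state(int state, unsigned long now,
		      unsigned long confirmed, unsigned long reachable_time)
{
	if (state == NUD_REACHABLE && now - confirmed > reachable_time)
		return NUD_STALE;	/* confirmation too old: suspect the entry */
	if (state == NUD_STALE && now - confirmed < reachable_time)
		return NUD_REACHABLE;	/* fresh confirmation: reconnect fast path */
	return state;
}

int main(void)
{
	printf("%d\n", sync_state(NUD_REACHABLE, 1000, 100, 300));	/* -> NUD_STALE */
	printf("%d\n", sync_state(NUD_STALE,     1000, 900, 300));	/* -> NUD_REACHABLE */
	return 0;
}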
report_unreachable is very complicated + routine. Particularly, it can hit the same neighbour entry! + + So that, we try to be accurate and avoid dead loop. --ANK + */ + while(neigh->nud_state==NUD_FAILED && (skb=__skb_dequeue(&neigh->arp_queue)) != NULL) + neigh->ops->error_report(neigh, skb); + skb_queue_purge(&neigh->arp_queue); + return; + } + + neigh->timer.expires = now + neigh->parms->retrans_time; + add_timer(&neigh->timer); + + neigh->ops->solicit(neigh, skb_peek(&neigh->arp_queue)); + neigh->probes++; +} + +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ + start_bh_atomic(); + if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) { + if (!(neigh->nud_state&(NUD_STALE|NUD_INCOMPLETE))) { + if (neigh->tbl == NULL) { + NEIGH_PRINTK2("neigh %p used after death.\n", neigh); + if (skb) + kfree_skb(skb); + end_bh_atomic(); + return 1; + } + if (neigh->parms->mcast_probes + neigh->parms->app_probes) { + neigh->probes = neigh->parms->ucast_probes; + neigh->nud_state = NUD_INCOMPLETE; + neigh->timer.expires = jiffies + neigh->parms->retrans_time; + add_timer(&neigh->timer); + + neigh->ops->solicit(neigh, skb); + neigh->probes++; + } else { + neigh->nud_state = NUD_FAILED; + if (skb) + kfree_skb(skb); + end_bh_atomic(); + return 1; + } + } + if (neigh->nud_state == NUD_INCOMPLETE) { + if (skb) { + if (skb_queue_len(&neigh->arp_queue) >= neigh->parms->queue_len) { + struct sk_buff *buff; + buff = neigh->arp_queue.prev; + __skb_unlink(buff, &neigh->arp_queue); + kfree_skb(buff); + } + __skb_queue_head(&neigh->arp_queue, skb); + } + end_bh_atomic(); + return 1; + } + if (neigh->nud_state == NUD_STALE) { + NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh->nud_state = NUD_DELAY; + neigh->timer.expires = jiffies + neigh->parms->delay_probe_time; + add_timer(&neigh->timer); + } + } + end_bh_atomic(); + return 0; +} + +static __inline__ void neigh_update_hhs(struct neighbour *neigh) +{ + struct hh_cache *hh; + void (*update)(struct hh_cache*, struct device*, unsigned char*) = + neigh->dev->header_cache_update; + + if (update) { + for (hh=neigh->hh; hh; hh=hh->hh_next) { + write_lock_irq(&hh->hh_lock); + update(hh, neigh->dev, neigh->ha); + write_unlock_irq(&hh->hh_lock); + } + } +} + + + +/* Generic update routine. + -- lladdr is new lladdr or NULL, if it is not supplied. + -- new is new state. + -- override==1 allows to override existing lladdr, if it is different. + -- arp==0 means that the change is administrative. + */ + +int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int arp) +{ + u8 old = neigh->nud_state; + struct device *dev = neigh->dev; + + if (arp && (old&(NUD_NOARP|NUD_PERMANENT))) + return -EPERM; + + if (!(new&NUD_VALID)) { + if (old&NUD_IN_TIMER) + del_timer(&neigh->timer); + if (old&NUD_CONNECTED) + neigh_suspect(neigh); + neigh->nud_state = new; + return 0; + } + + /* Compare new lladdr with cached one */ + if (dev->addr_len == 0) { + /* First case: device needs no address. */ + lladdr = neigh->ha; + } else if (lladdr) { + /* The second case: if something is already cached + and a new address is proposed: + - compare new & old + - if they are different, check override flag + */ + if (old&NUD_VALID) { + if (memcmp(lladdr, neigh->ha, dev->addr_len) == 0) + lladdr = neigh->ha; + else if (!override) + return -EPERM; + } + } else { + /* No address is supplied; if we know something, + use it, otherwise discard the request. 
+ */ + if (!(old&NUD_VALID)) + return -EINVAL; + lladdr = neigh->ha; + } + + neigh_sync(neigh); + old = neigh->nud_state; + if (new&NUD_CONNECTED) + neigh->confirmed = jiffies; + neigh->updated = jiffies; + + /* If entry was valid and address is not changed, + do not change entry state, if new one is STALE. + */ + if (old&NUD_VALID) { + if (lladdr == neigh->ha) + if (new == old || (new == NUD_STALE && (old&NUD_CONNECTED))) + return 0; + } + if (old&NUD_IN_TIMER) + del_timer(&neigh->timer); + neigh->nud_state = new; + if (lladdr != neigh->ha) { + memcpy(&neigh->ha, lladdr, dev->addr_len); + neigh_update_hhs(neigh); + neigh->confirmed = jiffies - (neigh->parms->base_reachable_time<<1); +#ifdef CONFIG_ARPD + if (neigh->parms->app_probes) + neigh_app_notify(neigh); +#endif + } + if (new == old) + return 0; + if (new&NUD_CONNECTED) + neigh_connect(neigh); + else + neigh_suspect(neigh); + if (!(old&NUD_VALID)) { + struct sk_buff *skb; + + /* Again: avoid dead loop if something went wrong */ + + while (neigh->nud_state&NUD_VALID && + (skb=__skb_dequeue(&neigh->arp_queue)) != NULL) { + struct neighbour *n1 = neigh; + /* On shaper/eql skb->dst->neighbour != neigh :( */ + if (skb->dst && skb->dst->neighbour) + n1 = skb->dst->neighbour; + n1->output(skb); + } + skb_queue_purge(&neigh->arp_queue); + } + return 0; +} + +struct neighbour * neigh_event_ns(struct neigh_table *tbl, + u8 *lladdr, void *saddr, + struct device *dev) +{ + struct neighbour *neigh; + + neigh = __neigh_lookup(tbl, saddr, dev, lladdr || !dev->addr_len); + if (neigh) + neigh_update(neigh, lladdr, NUD_STALE, 1, 1); + return neigh; +} + +static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, u16 protocol) +{ + struct hh_cache *hh = NULL; + struct device *dev = dst->dev; + + for (hh=n->hh; hh; hh = hh->hh_next) + if (hh->hh_type == protocol) + break; + + if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) { + memset(hh, 0, sizeof(struct hh_cache)); + hh->hh_type = protocol; + atomic_set(&hh->hh_refcnt, 0); + hh->hh_next = NULL; + if (dev->hard_header_cache(n, hh)) { + kfree(hh); + hh = NULL; + } else { + atomic_inc(&hh->hh_refcnt); + hh->hh_next = n->hh; + n->hh = hh; + if (n->nud_state&NUD_CONNECTED) + hh->hh_output = n->ops->hh_output; + else + hh->hh_output = n->ops->output; + } + } + if (hh) { + atomic_inc(&hh->hh_refcnt); + dst->hh = hh; + } +} + +/* This function can be used in contexts, where only old dev_queue_xmit + worked, f.e. if you want to override normal output path (eql, shaper), + but resoltution is not made yet. + */ + +int neigh_compat_output(struct sk_buff *skb) +{ + struct device *dev = skb->dev; + + __skb_pull(skb, skb->nh.raw - skb->data); + + if (dev->hard_header && + dev->hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, skb->len) < 0 && + dev->rebuild_header(skb)) + return 0; + + return dev_queue_xmit(skb); +} + +/* Slow and careful. 
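A hedged sketch of how a resolution protocol (ARP-like) might feed the table through the two entry points just defined; the function and variable names are illustrative, not taken from this diff:

/* Hypothetical protocol-side caller: a received reply both refreshes the
 * sender's entry (neigh_event_ns creates it as STALE if needed) and, when
 * it answers our own probe, is promoted to REACHABLE via neigh_update()
 * with override=1, arp=1.
 */
static void example_receive_reply(struct neigh_table *tbl, struct device *dev,
				  u32 sender_ip, u8 *sender_mac)
{
	struct neighbour *n;

	n = neigh_event_ns(tbl, sender_mac, &sender_ip, dev);
	if (n == NULL)
		return;
	neigh_update(n, sender_mac, NUD_REACHABLE, 1, 1);
	neigh_release(n);
}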
*/ + +int neigh_resolve_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct neighbour *neigh; + + if (!dst || !(neigh = dst->neighbour)) + goto discard; + + __skb_pull(skb, skb->nh.raw - skb->data); + + if (neigh_event_send(neigh, skb) == 0) { + int err; + struct device *dev = neigh->dev; + if (dev->hard_header_cache && dst->hh == NULL) { + start_bh_atomic(); + if (dst->hh == NULL) + neigh_hh_init(neigh, dst, dst->ops->protocol); + err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); + end_bh_atomic(); + } else { + start_bh_atomic(); + err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); + end_bh_atomic(); + } + if (err >= 0) + return neigh->ops->queue_xmit(skb); + kfree_skb(skb); + return -EINVAL; + } + return 0; + +discard: + NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", dst, dst ? dst->neighbour : NULL); + kfree_skb(skb); + return -EINVAL; +} + +/* As fast as possible without hh cache */ + +int neigh_connected_output(struct sk_buff *skb) +{ + int err; + struct dst_entry *dst = skb->dst; + struct neighbour *neigh = dst->neighbour; + struct device *dev = neigh->dev; + + __skb_pull(skb, skb->nh.raw - skb->data); + + start_bh_atomic(); + err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); + end_bh_atomic(); + if (err >= 0) + return neigh->ops->queue_xmit(skb); + kfree_skb(skb); + return -EINVAL; +} + +static void neigh_proxy_process(unsigned long arg) +{ + struct neigh_table *tbl = (struct neigh_table *)arg; + long sched_next = 0; + unsigned long now = jiffies; + struct sk_buff *skb = tbl->proxy_queue.next; + + while (skb != (struct sk_buff*)&tbl->proxy_queue) { + struct sk_buff *back = skb; + long tdif = back->stamp.tv_usec - now; + + skb = skb->next; + if (tdif <= 0) { + __skb_unlink(back, &tbl->proxy_queue); + if (tbl->proxy_redo) + tbl->proxy_redo(back); + else + kfree_skb(back); + } else if (!sched_next || tdif < sched_next) + sched_next = tdif; + } + del_timer(&tbl->proxy_timer); + if (sched_next) { + tbl->proxy_timer.expires = jiffies + sched_next; + add_timer(&tbl->proxy_timer); + } +} + +void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, + struct sk_buff *skb) +{ + unsigned long now = jiffies; + long sched_next = net_random()%p->proxy_delay; + + if (tbl->proxy_queue.qlen > p->proxy_qlen) { + kfree_skb(skb); + return; + } + skb->stamp.tv_sec = 0; + skb->stamp.tv_usec = now + sched_next; + if (del_timer(&tbl->proxy_timer)) { + long tval = tbl->proxy_timer.expires - now; + if (tval < sched_next) + sched_next = tval; + } + tbl->proxy_timer.expires = now + sched_next; + dst_release(skb->dst); + skb->dst = NULL; + __skb_queue_tail(&tbl->proxy_queue, skb); + add_timer(&tbl->proxy_timer); +} + + +struct neigh_parms *neigh_parms_alloc(struct device *dev, struct neigh_table *tbl) +{ + struct neigh_parms *p; + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p) { + memcpy(p, &tbl->parms, sizeof(*p)); + p->tbl = tbl; + p->reachable_time = neigh_rand_reach_time(p->base_reachable_time); + if (dev && dev->neigh_setup) { + if (dev->neigh_setup(dev, p)) { + kfree(p); + return NULL; + } + } + p->next = tbl->parms.next; + tbl->parms.next = p; + } + return p; +} + +void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) +{ + struct neigh_parms **p; + + if (parms == NULL || parms == &tbl->parms) + return; + for (p = &tbl->parms.next; *p; p = &(*p)->next) { + if (*p == parms) { + *p = parms->next; + synchronize_bh(); +#ifdef CONFIG_SYSCTL + 
neigh_sysctl_unregister(parms); +#endif + kfree(parms); + return; + } + } + NEIGH_PRINTK1("neigh_release_parms: not found\n"); +} + + +void neigh_table_init(struct neigh_table *tbl) +{ + unsigned long now = jiffies; + + tbl->parms.reachable_time = neigh_rand_reach_time(tbl->parms.base_reachable_time); + + init_timer(&tbl->gc_timer); + tbl->gc_timer.data = (unsigned long)tbl; + tbl->gc_timer.function = neigh_periodic_timer; + tbl->gc_timer.expires = now + tbl->gc_interval + tbl->parms.reachable_time; + add_timer(&tbl->gc_timer); + + init_timer(&tbl->proxy_timer); + tbl->proxy_timer.data = (unsigned long)tbl; + tbl->proxy_timer.function = neigh_proxy_process; + skb_queue_head_init(&tbl->proxy_queue); + + tbl->last_flush = now; + tbl->last_rand = now + tbl->parms.reachable_time*20; + tbl->next = neigh_tables; + neigh_tables = tbl; +} + +int neigh_table_clear(struct neigh_table *tbl) +{ + struct neigh_table **tp; + + start_bh_atomic(); + del_timer(&tbl->gc_timer); + del_timer(&tbl->proxy_timer); + skb_queue_purge(&tbl->proxy_queue); + neigh_ifdown(tbl, NULL); + end_bh_atomic(); + if (tbl->entries) + printk(KERN_CRIT "neighbour leakage\n"); + for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { + if (*tp == tbl) { + *tp = tbl->next; + synchronize_bh(); + break; + } + } +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&tbl->parms); +#endif + return 0; +} + +#ifdef CONFIG_RTNETLINK + + +int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct ndmsg *ndm = NLMSG_DATA(nlh); + struct rtattr **nda = arg; + struct neigh_table *tbl; + struct device *dev = NULL; + + if (ndm->ndm_ifindex) { + if ((dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + return -ENODEV; + } + + for (tbl=neigh_tables; tbl; tbl = tbl->next) { + int err = 0; + struct neighbour *n; + + if (tbl->family != ndm->ndm_family) + continue; + + if (nda[NDA_DST-1] == NULL || + nda[NDA_DST-1]->rta_len != RTA_LENGTH(tbl->key_len)) + return -EINVAL; + + if (ndm->ndm_flags&NTF_PROXY) + return pneigh_delete(tbl, RTA_DATA(nda[NDA_DST-1]), dev); + + if (dev == NULL) + return -EINVAL; + + start_bh_atomic(); + n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 0); + if (n) { + err = neigh_update(n, NULL, NUD_FAILED, 1, 0); + neigh_release(n); + } + end_bh_atomic(); + return err; + } + + return -EADDRNOTAVAIL; +} + +int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct ndmsg *ndm = NLMSG_DATA(nlh); + struct rtattr **nda = arg; + struct neigh_table *tbl; + struct device *dev = NULL; + + if (ndm->ndm_ifindex) { + if ((dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + return -ENODEV; + } + + for (tbl=neigh_tables; tbl; tbl = tbl->next) { + int err = 0; + struct neighbour *n; + + if (tbl->family != ndm->ndm_family) + continue; + if (nda[NDA_DST-1] == NULL || + nda[NDA_DST-1]->rta_len != RTA_LENGTH(tbl->key_len)) + return -EINVAL; + if (ndm->ndm_flags&NTF_PROXY) { + if (pneigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 1)) + return 0; + return -ENOBUFS; + } + if (dev == NULL) + return -EINVAL; + if (nda[NDA_LLADDR-1] != NULL && + nda[NDA_LLADDR-1]->rta_len != RTA_LENGTH(dev->addr_len)) + return -EINVAL; + start_bh_atomic(); + n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 0); + if (n) { + if (nlh->nlmsg_flags&NLM_F_EXCL) + err = -EEXIST; + } else if (!(nlh->nlmsg_flags&NLM_F_CREATE)) + err = -ENOENT; + else { + n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 1); + if (n == NULL) + err = -ENOBUFS; + } + if (err == 0) { + err = neigh_update(n, nda[NDA_LLADDR-1] ? 
RTA_DATA(nda[NDA_LLADDR-1]) : NULL, + ndm->ndm_state, + nlh->nlmsg_flags&NLM_F_REPLACE, 0); + } + if (n) + neigh_release(n); + end_bh_atomic(); + return err; + } + + return -EADDRNOTAVAIL; +} + + +static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n, + u32 pid, u32 seq, int event) +{ + unsigned long now = jiffies; + struct ndmsg *ndm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct nda_cacheinfo ci; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ndm)); + ndm = NLMSG_DATA(nlh); + ndm->ndm_family = n->ops->family; + ndm->ndm_flags = n->flags; + ndm->ndm_type = n->type; + ndm->ndm_state = n->nud_state; + ndm->ndm_ifindex = n->dev->ifindex; + RTA_PUT(skb, NDA_DST, n->tbl->key_len, n->primary_key); + if (n->nud_state&NUD_VALID) + RTA_PUT(skb, NDA_LLADDR, n->dev->addr_len, n->ha); + ci.ndm_used = now - n->used; + ci.ndm_confirmed = now - n->confirmed; + ci.ndm_updated = now - n->updated; + ci.ndm_refcnt = atomic_read(&n->refcnt); + RTA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + + +static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) +{ + struct neighbour *n; + int h, s_h; + int idx, s_idx; + + s_h = cb->args[1]; + s_idx = idx = cb->args[2]; + for (h=0; h <= NEIGH_HASHMASK; h++) { + if (h < s_h) continue; + if (h > s_h) + s_idx = 0; + start_bh_atomic(); + for (n = tbl->hash_buckets[h], idx = 0; n; + n = n->next, idx++) { + if (idx < s_idx) + continue; + if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWNEIGH) <= 0) { + end_bh_atomic(); + cb->args[1] = h; + cb->args[2] = idx; + return -1; + } + } + end_bh_atomic(); + } + + cb->args[1] = h; + cb->args[2] = idx; + return skb->len; +} + +int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct neigh_table *tbl; + int family = ((struct rtgenmsg*)NLMSG_DATA(cb->nlh))->rtgen_family; + + s_t = cb->args[0]; + + for (tbl=neigh_tables, t=0; tbl; tbl = tbl->next, t++) { + if (t < s_t) continue; + if (family && tbl->family != family) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + if (neigh_dump_table(tbl, skb, cb) < 0) + break; + } + + cb->args[0] = t; + + return skb->len; +} + +#ifdef CONFIG_ARPD +void neigh_app_ns(struct neighbour *n) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + int size = NLMSG_SPACE(sizeof(struct ndmsg)+256); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + return; + + if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH) < 0) { + kfree_skb(skb); + return; + } + nlh = (struct nlmsghdr*)skb->data; + nlh->nlmsg_flags = NLM_F_REQUEST; + NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); +} + +static void neigh_app_notify(struct neighbour *n) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + int size = NLMSG_SPACE(sizeof(struct ndmsg)+256); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + return; + + if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH) < 0) { + kfree_skb(skb); + return; + } + nlh = (struct nlmsghdr*)skb->data; + NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); +} + + + +#endif + + +#endif + +#ifdef CONFIG_SYSCTL + +struct neigh_sysctl_table +{ + struct ctl_table_header *sysctl_header; + ctl_table neigh_vars[17]; + ctl_table neigh_dev[2]; + ctl_table neigh_neigh_dir[2]; + 
ctl_table neigh_proto_dir[2]; + ctl_table neigh_root_dir[2]; +} neigh_sysctl_template = { + NULL, + {{NET_NEIGH_MCAST_SOLICIT, "mcast_solicit", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_UCAST_SOLICIT, "ucast_solicit", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_APP_SOLICIT, "app_solicit", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_RETRANS_TIME, "retrans_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_REACHABLE_TIME, "base_reachable_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_GC_STALE_TIME, "gc_stale_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_UNRES_QLEN, "unres_qlen", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_PROXY_QLEN, "proxy_qlen", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_ANYCAST_DELAY, "anycast_delay", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_PROXY_DELAY, "proxy_delay", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_LOCKTIME, "locktime", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_GC_INTERVAL, "gc_interval", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_GC_THRESH1, "gc_thresh1", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_GC_THRESH2, "gc_thresh2", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_GC_THRESH3, "gc_thresh3", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {0}}, + + {{NET_PROTO_CONF_DEFAULT, "default", NULL, 0, 0555, NULL},{0}}, + {{0, "neigh", NULL, 0, 0555, NULL},{0}}, + {{0, NULL, NULL, 0, 0555, NULL},{0}}, + {{CTL_NET, "net", NULL, 0, 0555, NULL},{0}} +}; + +int neigh_sysctl_register(struct device *dev, struct neigh_parms *p, + int p_id, int pdev_id, char *p_name) +{ + struct neigh_sysctl_table *t; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (t == NULL) + return -ENOBUFS; + memcpy(t, &neigh_sysctl_template, sizeof(*t)); + t->neigh_vars[0].data = &p->mcast_probes; + t->neigh_vars[1].data = &p->ucast_probes; + t->neigh_vars[2].data = &p->app_probes; + t->neigh_vars[3].data = &p->retrans_time; + t->neigh_vars[4].data = &p->base_reachable_time; + t->neigh_vars[5].data = &p->delay_probe_time; + t->neigh_vars[6].data = &p->gc_staletime; + t->neigh_vars[7].data = &p->queue_len; + t->neigh_vars[8].data = &p->proxy_qlen; + t->neigh_vars[9].data = &p->anycast_delay; + t->neigh_vars[10].data = &p->proxy_delay; + t->neigh_vars[11].data = &p->locktime; + if (dev) { + t->neigh_dev[0].procname = dev->name; + t->neigh_dev[0].ctl_name = dev->ifindex; + memset(&t->neigh_vars[12], 0, sizeof(ctl_table)); + } else { + t->neigh_vars[12].data = (int*)(p+1); + t->neigh_vars[13].data = (int*)(p+1) + 1; + t->neigh_vars[14].data = (int*)(p+1) + 2; + t->neigh_vars[15].data = (int*)(p+1) + 3; + } + t->neigh_neigh_dir[0].ctl_name = pdev_id; + + t->neigh_proto_dir[0].procname = p_name; + t->neigh_proto_dir[0].ctl_name = p_id; + + t->neigh_dev[0].child = t->neigh_vars; + t->neigh_neigh_dir[0].child = t->neigh_dev; + t->neigh_proto_dir[0].child = t->neigh_neigh_dir; + t->neigh_root_dir[0].child = t->neigh_proto_dir; + + t->sysctl_header = register_sysctl_table(t->neigh_root_dir, 0); + if (t->sysctl_header == NULL) { + kfree(t); + return -ENOBUFS; + } + p->sysctl_table = t; + return 0; +} + +void neigh_sysctl_unregister(struct neigh_parms *p) +{ + if 
(p->sysctl_table) { + struct neigh_sysctl_table *t = p->sysctl_table; + p->sysctl_table = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t); + } +} + +#endif /* CONFIG_SYSCTL */ diff --git a/pfinet/linux-src/net/core/profile.c b/pfinet/linux-src/net/core/profile.c new file mode 100644 index 00000000..fc7464b7 --- /dev/null +++ b/pfinet/linux-src/net/core/profile.c @@ -0,0 +1,305 @@ +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/ip.h> +#include <linux/inet.h> +#include <net/checksum.h> + +#include <asm/processor.h> +#include <asm/uaccess.h> +#include <asm/system.h> + +#include <net/profile.h> + +#ifdef CONFIG_NET_PROFILE + +atomic_t net_profile_active; +struct timeval net_profile_adjust; + +NET_PROFILE_DEFINE(total); + +struct net_profile_slot *net_profile_chain = &net_prof_total; + +#ifdef __alpha__ +__u32 alpha_lo; +long alpha_hi; + +static void alpha_tick(unsigned long); + +static struct timer_list alpha_timer = + { NULL, NULL, 0, 0L, alpha_tick }; + +void alpha_tick(unsigned long dummy) +{ + struct timeval dummy_stamp; + net_profile_stamp(&dummy_stamp); + alpha_timer.expires = jiffies + 4*HZ; + add_timer(&alpha_timer); +} + +#endif + +void net_profile_irq_adjust(struct timeval *entered, struct timeval* leaved) +{ + struct net_profile_slot *s; + + net_profile_sub(entered, leaved); + for (s = net_profile_chain; s; s = s->next) { + if (s->active) + net_profile_add(leaved, &s->irq); + } +} + + +#ifdef CONFIG_PROC_FS +static int profile_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos=0; + off_t begin=0; + int len=0; + struct net_profile_slot *s; + + len+= sprintf(buffer, "Slot Hits Hi Lo OnIrqHi OnIrqLo Ufl\n"); + + if (offset == 0) { + cli(); + net_prof_total.active = 1; + atomic_inc(&net_profile_active); + NET_PROFILE_LEAVE(total); + sti(); + } + for (s = net_profile_chain; s; s = s->next) { + struct net_profile_slot tmp; + + cli(); + tmp = *s; + + /* Wrong, but pretty close to truth */ + + s->accumulator.tv_sec = 0; + s->accumulator.tv_usec = 0; + s->irq.tv_sec = 0; + s->irq.tv_usec = 0; + s->hits = 0; + s->underflow = 0; + /* Repair active count, it is possible, only if code has a bug */ + if (s->active) { + s->active = 0; + atomic_dec(&net_profile_active); + } + sti(); + + net_profile_sub(&tmp.irq, &tmp.accumulator); + + len += sprintf(buffer+len,"%-15s %-10d %-10ld %-10lu %-10lu %-10lu %d/%d", + tmp.id, + tmp.hits, + tmp.accumulator.tv_sec, + tmp.accumulator.tv_usec, + tmp.irq.tv_sec, + tmp.irq.tv_usec, + tmp.underflow, tmp.active); + + buffer[len++]='\n'; + + pos=begin+len; + if(pos<offset) { + len=0; + begin=pos; + } + if(pos>offset+length) + goto done; + } + *eof = 1; + +done: + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + if (len < 0) { + len = 0; + printk(KERN_CRIT "Yep, guys... 
our template for proc_*_read is crappy :-)\n"); + } + if (offset == 0) { + cli(); + net_prof_total.active = 0; + net_prof_total.hits = 0; + net_profile_stamp(&net_prof_total.entered); + sti(); + } + return len; +} +#endif + +struct iphdr whitehole_iph; +int whitehole_count; + +static int whitehole_xmit(struct sk_buff *skb, struct device *dev) +{ + struct net_device_stats *stats; + dev_kfree_skb(skb); + stats = (struct net_device_stats *)dev->priv; + stats->tx_packets++; + stats->tx_bytes+=skb->len; + + return 0; +} + +static void whitehole_inject(unsigned long); +int whitehole_init(struct device *dev); + +static struct timer_list whitehole_timer = + { NULL, NULL, 0, 0L, whitehole_inject }; + +static struct device whitehole_dev = { + "whitehole", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, whitehole_init, }; + +static int whitehole_open(struct device *dev) +{ + whitehole_count = 100000; + whitehole_timer.expires = jiffies + 5*HZ; + add_timer(&whitehole_timer); + return 0; +} + +static int whitehole_close(struct device *dev) +{ + del_timer(&whitehole_timer); + return 0; +} + +static void whitehole_inject(unsigned long dummy) +{ + struct net_device_stats *stats = (struct net_device_stats *)whitehole_dev.priv; + extern int netdev_dropping; + + do { + struct iphdr *iph; + struct sk_buff *skb = alloc_skb(128, GFP_ATOMIC); + if (!skb) + break; + skb_reserve(skb, 32); + iph = (struct iphdr*)skb_put(skb, sizeof(*iph)); + skb->mac.raw = ((u8*)iph) - 14; + memcpy(iph, &whitehole_iph, sizeof(*iph)); + skb->protocol = __constant_htons(ETH_P_IP); + skb->dev = &whitehole_dev; + skb->pkt_type = PACKET_HOST; + stats->rx_packets++; + stats->rx_bytes += skb->len; + netif_rx(skb); + whitehole_count--; + } while (netdev_dropping == 0 && whitehole_count>0); + if (whitehole_count > 0) { + whitehole_timer.expires = jiffies + 1; + add_timer(&whitehole_timer); + } +} + +static struct net_device_stats *whitehole_get_stats(struct device *dev) +{ + struct net_device_stats *stats = (struct net_device_stats *) dev->priv; + return stats; +} + +__initfunc(int whitehole_init(struct device *dev)) +{ + dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); + if (dev->priv == NULL) + return -ENOBUFS; + memset(dev->priv, 0, sizeof(struct net_device_stats)); + dev->get_stats = whitehole_get_stats; + dev->hard_start_xmit = whitehole_xmit; + dev->open = whitehole_open; + dev->stop = whitehole_close; + ether_setup(dev); + dev->tx_queue_len = 0; + dev->flags |= IFF_NOARP; + dev->flags &= ~(IFF_BROADCAST|IFF_MULTICAST); + dev->iflink = 0; + whitehole_iph.ihl = 5; + whitehole_iph.version = 4; + whitehole_iph.ttl = 2; + whitehole_iph.saddr = in_aton("193.233.7.21"); + whitehole_iph.daddr = in_aton("193.233.7.10"); + whitehole_iph.tot_len = htons(20); + whitehole_iph.check = ip_compute_csum((void *)&whitehole_iph, 20); + return 0; +} + +int net_profile_register(struct net_profile_slot *slot) +{ + cli(); + slot->next = net_profile_chain; + net_profile_chain = slot; + sti(); + return 0; +} + +int net_profile_unregister(struct net_profile_slot *slot) +{ + struct net_profile_slot **sp, *s; + + for (sp = &net_profile_chain; (s = *sp) != NULL; sp = &s->next) { + if (s == slot) { + cli(); + *sp = s->next; + sti(); + return 0; + } + } + return -ESRCH; +} + + +__initfunc(int net_profile_init(void)) +{ + int i; + +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *ent; + + ent = create_proc_entry("net/profile", 0, 0); + ent->read_proc = profile_read_proc; +#endif + + register_netdevice(&whitehole_dev); + + printk("Evaluating net profiler 
cost ..."); +#if CPU == 586 || CPU == 686 + if (!(boot_cpu_data.x86_capability & X86_FEATURE_TSC)) { + printk(KERN_ERR "Sorry, your CPU does not support TSC. Net profiler disabled.\n"); + return -1; + } +#endif + start_bh_atomic(); +#ifdef __alpha__ + alpha_tick(0); +#endif + for (i=0; i<1024; i++) { + NET_PROFILE_ENTER(total); + NET_PROFILE_LEAVE(total); + } + if (net_prof_total.accumulator.tv_sec) { + printk(" too high!\n"); + } else { + net_profile_adjust.tv_usec = net_prof_total.accumulator.tv_usec>>10; + printk("%ld units\n", net_profile_adjust.tv_usec); + } + net_prof_total.hits = 0; + net_profile_stamp(&net_prof_total.entered); + end_bh_atomic(); + return 0; +} + +#endif diff --git a/pfinet/linux-src/net/core/rtnetlink.c b/pfinet/linux-src/net/core/rtnetlink.c new file mode 100644 index 00000000..7f89e54a --- /dev/null +++ b/pfinet/linux-src/net/core/rtnetlink.c @@ -0,0 +1,512 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Routing netlink socket interface: protocol independent part. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Vitaly E. Lavrov RTA_OK arithmetics was wrong. + * Alexey Zhuravlev ifi_change does something useful + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/capability.h> +#include <linux/skbuff.h> +#include <linux/init.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/string.h> + +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/arp.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +atomic_t rtnl_rlockct; +struct wait_queue *rtnl_wait; + + +void rtnl_lock() +{ + rtnl_shlock(); + rtnl_exlock(); +} + +void rtnl_unlock() +{ + rtnl_exunlock(); + rtnl_shunlock(); +} + +int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len) +{ + memset(tb, 0, sizeof(struct rtattr*)*maxattr); + + while (RTA_OK(rta, len)) { + unsigned flavor = rta->rta_type; + if (flavor && flavor <= maxattr) + tb[flavor-1] = rta; + rta = RTA_NEXT(rta, len); + } + return 0; +} + +#ifdef CONFIG_RTNETLINK +struct sock *rtnl; + +unsigned long rtnl_wlockct; + +struct rtnetlink_link * rtnetlink_links[NPROTO]; + +#define _S 1 /* superuser privileges required */ +#define _X 2 /* exclusive access to tables required */ +#define _G 4 /* GET request */ + +static const int rtm_min[(RTM_MAX+1-RTM_BASE)/4] = +{ + NLMSG_LENGTH(sizeof(struct ifinfomsg)), + NLMSG_LENGTH(sizeof(struct ifaddrmsg)), + NLMSG_LENGTH(sizeof(struct rtmsg)), + NLMSG_LENGTH(sizeof(struct ndmsg)), + NLMSG_LENGTH(sizeof(struct rtmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)) +}; + 
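(Editor's sketch, not part of the diff.) The rtm_min table above and the rta_max table below are indexed per message-type group: rtnetlink types are allocated in blocks of four starting at RTM_BASE, and rtnetlink_rcv_msg() further down derives both the table row and the request kind from nlmsg_type with a shift and a mask. A minimal illustration of that lookup, reusing the names from the code that follows (the simplified error return and the comment values are illustrative only):

    int type   = nlh->nlmsg_type - RTM_BASE;  /* link, address, route, ... message */
    int sz_idx = type >> 2;                   /* row into rtm_min[] / rta_max[]     */
    int kind   = type & 3;                    /* 0 = NEW, 1 = DEL, 2 = GET (dump)   */

    if (nlh->nlmsg_len < rtm_min[sz_idx])     /* fixed header too short for this class */
            return -EINVAL;
    /* attribute types parsed after the fixed header are bounded by rta_max[sz_idx] */
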
+static const int rta_max[(RTM_MAX+1-RTM_BASE)/4] = +{ + IFLA_MAX, + IFA_MAX, + RTA_MAX, + NDA_MAX, + RTA_MAX, + TCA_MAX, + TCA_MAX, + TCA_MAX +}; + +void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) +{ + struct rtattr *rta; + int size = RTA_LENGTH(attrlen); + + rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); + rta->rta_type = attrtype; + rta->rta_len = size; + memcpy(RTA_DATA(rta), data, attrlen); +} + +int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) +{ + int err = 0; + + NETLINK_CB(skb).dst_groups = group; + if (echo) + atomic_inc(&skb->users); + netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); + if (echo) + err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); + return err; +} + +static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, + int type, u32 pid, u32 seq, u32 change) +{ + struct ifinfomsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + r = NLMSG_DATA(nlh); + r->ifi_family = AF_UNSPEC; + r->ifi_type = dev->type; + r->ifi_index = dev->ifindex; + r->ifi_flags = dev->flags; + r->ifi_change = change; + + RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name); + if (dev->addr_len) { + RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + RTA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast); + } + if (1) { + unsigned mtu = dev->mtu; + RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu); + } + if (dev->ifindex != dev->iflink) + RTA_PUT(skb, IFLA_LINK, sizeof(int), &dev->iflink); + if (dev->qdisc_sleeping) + RTA_PUT(skb, IFLA_QDISC, + strlen(dev->qdisc_sleeping->ops->id) + 1, + dev->qdisc_sleeping->ops->id); + if (dev->get_stats) { + struct net_device_stats *stats = dev->get_stats(dev); + if (stats) + RTA_PUT(skb, IFLA_STATS, sizeof(*stats), stats); + } + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->args[0]; + struct device *dev; + + for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0) <= 0) + break; + } + cb->args[0] = idx; + + return skb->len; +} + +int rtnetlink_dump_all(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->family; + + if (s_idx == 0) + s_idx = 1; + for (idx=1; idx<NPROTO; idx++) { + int type = cb->nlh->nlmsg_type-RTM_BASE; + if (idx < s_idx || idx == PF_PACKET) + continue; + if (rtnetlink_links[idx] == NULL || + rtnetlink_links[idx][type].dumpit == NULL) + continue; + if (idx > s_idx) + memset(&cb->args[0], 0, sizeof(cb->args)); + if (rtnetlink_links[idx][type].dumpit(skb, cb) == 0) + continue; + if (skb_tailroom(skb) < 256) + break; + } + cb->family = idx; + + return skb->len; +} + +void rtmsg_ifinfo(int type, struct device *dev) +{ + struct sk_buff *skb; + int size = NLMSG_GOODSIZE; + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return; + + if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0, ~0U) < 0) { + kfree_skb(skb); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_LINK; + netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_KERNEL); +} + +static int rtnetlink_done(struct netlink_callback *cb) +{ + if (cap_raised(NETLINK_CB(cb->skb).eff_cap, CAP_NET_ADMIN) && cb->nlh->nlmsg_flags&NLM_F_ATOMIC) + 
rtnl_shunlock(); + return 0; +} + +/* Process one rtnetlink message. */ + +extern __inline__ int +rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) +{ + struct rtnetlink_link *link; + struct rtnetlink_link *link_tab; + struct rtattr *rta[RTATTR_MAX]; + + int exclusive = 0; + int sz_idx, kind; + int min_len; + int family; + int type; + int err; + + /* Only requests are handled by kernel now */ + if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) + return 0; + + type = nlh->nlmsg_type; + + /* A control message: ignore them */ + if (type < RTM_BASE) + return 0; + + /* Unknown message: reply with EINVAL */ + if (type > RTM_MAX) + goto err_inval; + + type -= RTM_BASE; + + /* All the messages must have at least 1 byte length */ + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) + return 0; + + family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; + if (family > NPROTO) { + *errp = -EAFNOSUPPORT; + return -1; + } + + link_tab = rtnetlink_links[family]; + if (link_tab == NULL) + link_tab = rtnetlink_links[PF_UNSPEC]; + link = &link_tab[type]; + + sz_idx = type>>2; + kind = type&3; + + if (kind != 2 && !cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) { + *errp = -EPERM; + return -1; + } + + if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { + u32 rlen; + + if (link->dumpit == NULL) + link = &(rtnetlink_links[PF_UNSPEC][type]); + + if (link->dumpit == NULL) + goto err_inval; + + /* Super-user locks all the tables to get atomic snapshot */ + if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN) + && nlh->nlmsg_flags&NLM_F_ATOMIC) + atomic_inc(&rtnl_rlockct); + if ((*errp = netlink_dump_start(rtnl, skb, nlh, + link->dumpit, + rtnetlink_done)) != 0) { + if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN) && nlh->nlmsg_flags&NLM_F_ATOMIC) + atomic_dec(&rtnl_rlockct); + return -1; + } + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return -1; + } + + if (kind != 2) { + if (rtnl_exlock_nowait()) { + *errp = 0; + return -1; + } + exclusive = 1; + } + + memset(&rta, 0, sizeof(rta)); + + min_len = rtm_min[sz_idx]; + if (nlh->nlmsg_len < min_len) + goto err_inval; + + if (nlh->nlmsg_len > min_len) { + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > rta_max[sz_idx]) + goto err_inval; + rta[flavor-1] = attr; + } + attr = RTA_NEXT(attr, attrlen); + } + } + + if (link->doit == NULL) + link = &(rtnetlink_links[PF_UNSPEC][type]); + if (link->doit == NULL) + goto err_inval; + err = link->doit(skb, nlh, (void *)&rta); + + if (exclusive) + rtnl_exunlock(); + *errp = err; + return err; + +err_inval: + if (exclusive) + rtnl_exunlock(); + *errp = -EINVAL; + return -1; +} + +/* + * Process one packet of messages. + * Malformed skbs with wrong lengths of messages are discarded silently. + */ + +extern __inline__ int rtnetlink_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr * nlh; + + while (skb->len >= NLMSG_SPACE(0)) { + u32 rlen; + + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return 0; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + if (rtnetlink_rcv_msg(skb, nlh, &err)) { + /* Not error, but we must interrupt processing here: + * Note, that in this case we do not pull message + * from skb, it will be processed later. 
+ */ + if (err == 0) + return -1; + netlink_ack(skb, nlh, err); + } else if (nlh->nlmsg_flags&NLM_F_ACK) + netlink_ack(skb, nlh, 0); + skb_pull(skb, rlen); + } + + return 0; +} + +/* + * rtnetlink input queue processing routine: + * - try to acquire shared lock. If it is failed, defer processing. + * - feed skbs to rtnetlink_rcv_skb, until it refuse a message, + * that will occur, when a dump started and/or acquisition of + * exclusive lock failed. + */ + +static void rtnetlink_rcv(struct sock *sk, int len) +{ + struct sk_buff *skb; + + if (rtnl_shlock_nowait()) + return; + + while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { + if (rtnetlink_rcv_skb(skb)) { + if (skb->len) + skb_queue_head(&sk->receive_queue, skb); + else + kfree_skb(skb); + break; + } + kfree_skb(skb); + } + + rtnl_shunlock(); +} + +static struct rtnetlink_link link_rtnetlink_table[RTM_MAX-RTM_BASE+1] = +{ + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, rtnetlink_dump_ifinfo, }, + { NULL, NULL, }, + + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, rtnetlink_dump_all, }, + { NULL, NULL, }, + + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, rtnetlink_dump_all, }, + { NULL, NULL, }, + + { neigh_add, NULL, }, + { neigh_delete, NULL, }, + { NULL, neigh_dump_info, }, + { NULL, NULL, }, + + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, +}; + + +static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev = ptr; + switch (event) { + case NETDEV_UNREGISTER: + rtmsg_ifinfo(RTM_DELLINK, dev); + break; + default: + rtmsg_ifinfo(RTM_NEWLINK, dev); + break; + } + return NOTIFY_DONE; +} + +struct notifier_block rtnetlink_dev_notifier = { + rtnetlink_event, + NULL, + 0 +}; + + +__initfunc(void rtnetlink_init(void)) +{ +#ifdef RTNL_DEBUG + printk("Initializing RT netlink socket\n"); +#endif + rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv); + if (rtnl == NULL) + panic("rtnetlink_init: cannot initialize rtnetlink\n"); + register_netdevice_notifier(&rtnetlink_dev_notifier); + rtnetlink_links[PF_UNSPEC] = link_rtnetlink_table; + rtnetlink_links[PF_PACKET] = link_rtnetlink_table; +} + + + +#endif diff --git a/pfinet/linux-src/net/core/scm.c b/pfinet/linux-src/net/core/scm.c new file mode 100644 index 00000000..cdb5f3d0 --- /dev/null +++ b/pfinet/linux-src/net/core/scm.c @@ -0,0 +1,280 @@ +/* scm.c - Socket level control messages processing. + * + * Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * Alignment and value checking mods by Craig Metz + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/signal.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/file.h> +#include <linux/fcntl.h> +#include <linux/net.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include <linux/inet.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/rarp.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/scm.h> + + +/* + * Only allow a user to send credentials, that they could set with + * setu(g)id. 
+ */ + +static __inline__ int scm_check_creds(struct ucred *creds) +{ + if ((creds->pid == current->pid || capable(CAP_SYS_ADMIN)) && + ((creds->uid == current->uid || creds->uid == current->euid || + creds->uid == current->suid) || capable(CAP_SETUID)) && + ((creds->gid == current->gid || creds->gid == current->egid || + creds->gid == current->sgid) || capable(CAP_SETGID))) { + return 0; + } + return -EPERM; +} + +static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) +{ + int *fdp = (int*)CMSG_DATA(cmsg); + struct scm_fp_list *fpl = *fplp; + struct file **fpp; + int i, num; + + num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); + + if (num <= 0) + return 0; + + if (num > SCM_MAX_FD) + return -EINVAL; + + if (!fpl) + { + fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + *fplp = fpl; + fpl->count = 0; + } + fpp = &fpl->fp[fpl->count]; + + if (fpl->count + num > SCM_MAX_FD) + return -EINVAL; + + /* + * Verify the descriptors and increment the usage count. + */ + + for (i=0; i< num; i++) + { + int fd = fdp[i]; + struct file *file; + + if (fd < 0 || !(file = fget(fd))) + return -EBADF; + *fpp++ = file; + fpl->count++; + } + return num; +} + +void __scm_destroy(struct scm_cookie *scm) +{ + struct scm_fp_list *fpl = scm->fp; + int i; + + if (fpl) { + scm->fp = NULL; + for (i=fpl->count-1; i>=0; i--) + fput(fpl->fp[i]); + kfree(fpl); + } +} + +int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) +{ + struct cmsghdr *cmsg; + int err; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) + { + err = -EINVAL; + + /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ + /* The first check was omitted in <= 2.2.5. The reasoning was + that parser checks cmsg_len in any case, so that + additional check would be work duplication. + But if cmsg_level is not SOL_SOCKET, we do not check + for too short ancillary data object at all! Oops. + OK, let's add it... + */ + if (cmsg->cmsg_len < sizeof(struct cmsghdr) || + (unsigned long)(((char*)cmsg - (char*)msg->msg_control) + + cmsg->cmsg_len) > msg->msg_controllen) + goto error; + + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + + switch (cmsg->cmsg_type) + { + case SCM_RIGHTS: + err=scm_fp_copy(cmsg, &p->fp); + if (err<0) + goto error; + break; + case SCM_CREDENTIALS: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred))) + goto error; + memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred)); + err = scm_check_creds(&p->creds); + if (err) + goto error; + break; + default: + goto error; + } + } + + if (p->fp && !p->fp->count) + { + kfree(p->fp); + p->fp = NULL; + } + + err = -EINVAL; + if (msg->msg_flags & MSG_CTLFLAGS) + goto error; + + return 0; + +error: + scm_destroy(p); + return err; +} + +int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) +{ + struct cmsghdr *cm = (struct cmsghdr*)msg->msg_control; + struct cmsghdr cmhdr; + int cmlen = CMSG_LEN(len); + int err; + + if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { + msg->msg_flags |= MSG_CTRUNC; + return 0; /* XXX: return error? check spec. 
*/ + } + if (msg->msg_controllen < cmlen) { + msg->msg_flags |= MSG_CTRUNC; + cmlen = msg->msg_controllen; + } + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + + err = -EFAULT; + if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) + goto out; + if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) + goto out; + cmlen = CMSG_SPACE(len); + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + err = 0; +out: + return err; +} + +void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) +{ + struct cmsghdr *cm = (struct cmsghdr*)msg->msg_control; + + int fdmax = (msg->msg_controllen - sizeof(struct cmsghdr))/sizeof(int); + int fdnum = scm->fp->count; + struct file **fp = scm->fp->fp; + int *cmfptr; + int err = 0, i; + + if (fdnum < fdmax) + fdmax = fdnum; + + for (i=0, cmfptr=(int*)CMSG_DATA(cm); i<fdmax; i++, cmfptr++) + { + int new_fd; + err = get_unused_fd(); + if (err < 0) + break; + new_fd = err; + err = put_user(new_fd, cmfptr); + if (err) { + put_unused_fd(new_fd); + break; + } + /* Bump the usage count and install the file. */ + fp[i]->f_count++; + current->files->fd[new_fd] = fp[i]; + } + + if (i > 0) + { + int cmlen = CMSG_LEN(i*sizeof(int)); + if (!err) + err = put_user(SOL_SOCKET, &cm->cmsg_level); + if (!err) + err = put_user(SCM_RIGHTS, &cm->cmsg_type); + if (!err) + err = put_user(cmlen, &cm->cmsg_len); + if (!err) { + cmlen = CMSG_SPACE(i*sizeof(int)); + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + } + } + if (i < fdnum) + msg->msg_flags |= MSG_CTRUNC; + + /* + * All of the files that fit in the message have had their + * usage counts incremented, so we just free the list. + */ + __scm_destroy(scm); +} + +struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) +{ + struct scm_fp_list *new_fpl; + int i; + + if (!fpl) + return NULL; + + new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); + if (new_fpl) { + memcpy(new_fpl, fpl, sizeof(*fpl)); + + for (i=fpl->count-1; i>=0; i--) + fpl->fp[i]->f_count++; + } + return new_fpl; +} diff --git a/pfinet/linux-src/net/core/skbuff.c b/pfinet/linux-src/net/core/skbuff.c new file mode 100644 index 00000000..b7636437 --- /dev/null +++ b/pfinet/linux-src/net/core/skbuff.c @@ -0,0 +1,385 @@ +/* + * Routines having to do with the 'struct sk_buff' memory handlers. + * + * Authors: Alan Cox <iiitac@pyr.swan.ac.uk> + * Florian La Roche <rzsfl@rz.uni-sb.de> + * + * Version: $Id: skbuff.c,v 1.55 1999/02/23 08:12:27 davem Exp $ + * + * Fixes: + * Alan Cox : Fixed the worst of the load balancer bugs. + * Dave Platt : Interrupt stacking fix. + * Richard Kooijman : Timestamp fixes. + * Alan Cox : Changed buffer format. + * Alan Cox : destructor hook for AF_UNIX etc. + * Linus Torvalds : Better skb_clone. + * Alan Cox : Added skb_copy. + * Alan Cox : Added all the changed routines Linus + * only put in the headers + * Ray VanTassle : Fixed --skb->lock in free + * Alan Cox : skb_copy copy arp field + * Andi Kleen : slabified it. + * + * NOTE: + * The __skb_ routines should be called with interrupts + * disabled, or you better be *real* sure that the operation is atomic + * with respect to whatever list is being frobbed (e.g. via lock_sock() + * or via disabling bottom half handlers, etc). + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +/* + * The functions in this file will not compile correctly with gcc 2.4.x + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/malloc.h> +#include <linux/netdevice.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/init.h> + +#include <net/ip.h> +#include <net/protocol.h> +#include <net/dst.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/sock.h> + +#include <asm/uaccess.h> +#include <asm/system.h> + +/* + * Skb list spinlock + */ +spinlock_t skb_queue_lock = SPIN_LOCK_UNLOCKED; + +/* + * Resource tracking variables + */ + +static atomic_t net_skbcount = ATOMIC_INIT(0); +static atomic_t net_allocs = ATOMIC_INIT(0); +static atomic_t net_fails = ATOMIC_INIT(0); + +extern atomic_t ip_frag_mem; + +static kmem_cache_t *skbuff_head_cache; + +/* + * Keep out-of-line to prevent kernel bloat. + * __builtin_return_address is not used because it is not always + * reliable. + */ + +void skb_over_panic(struct sk_buff *skb, int sz, void *here) +{ + panic("skput:over: %p:%d put:%d dev:%s", + here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>"); +} + +void skb_under_panic(struct sk_buff *skb, int sz, void *here) +{ + panic("skput:under: %p:%d put:%d dev:%s", + here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>"); +} + +void show_net_buffers(void) +{ + printk("Networking buffers in use : %u\n", + atomic_read(&net_skbcount)); + printk("Total network buffer allocations : %u\n", + atomic_read(&net_allocs)); + printk("Total failed network buffer allocs : %u\n", + atomic_read(&net_fails)); +#ifdef CONFIG_INET + printk("IP fragment buffer size : %u\n", + atomic_read(&ip_frag_mem)); +#endif +} + +/* Allocate a new skbuff. We do this ourselves so we can fill in a few + * 'private' fields and also do memory statistics to find all the + * [BEEP] leaks. + * + */ + +struct sk_buff *alloc_skb(unsigned int size,int gfp_mask) +{ + struct sk_buff *skb; + u8 *data; + + if (in_interrupt() && (gfp_mask & __GFP_WAIT)) { + static int count = 0; + if (++count < 5) { + printk(KERN_ERR "alloc_skb called nonatomically " + "from interrupt %p\n", __builtin_return_address(0)); + } + gfp_mask &= ~__GFP_WAIT; + } + + /* Get the HEAD */ + skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + if (skb == NULL) + goto nohead; + + /* Get the DATA. Size must match skb_add_mtu(). */ + size = ((size + 15) & ~15); + data = kmalloc(size + sizeof(atomic_t), gfp_mask); + if (data == NULL) + goto nodata; + + /* Note that this counter is useless now - you can just look in the + * skbuff_head entry in /proc/slabinfo. We keep it only for emergency + * cases. + */ + atomic_inc(&net_allocs); + + skb->truesize = size; + + atomic_inc(&net_skbcount); + + /* Load the data pointers. */ + skb->head = data; + skb->data = data; + skb->tail = data; + skb->end = data + size; + + /* Set up other state */ + skb->len = 0; + skb->is_clone = 0; + skb->cloned = 0; + + atomic_set(&skb->users, 1); + atomic_set(skb_datarefp(skb), 1); + return skb; + +nodata: + kmem_cache_free(skbuff_head_cache, skb); +nohead: + atomic_inc(&net_fails); + return NULL; +} + + +/* + * Slab constructor for a skb head. 
+ */ +static inline void skb_headerinit(void *p, kmem_cache_t *cache, + unsigned long flags) +{ + struct sk_buff *skb = p; + + skb->destructor = NULL; + skb->pkt_type = PACKET_HOST; /* Default type */ + skb->pkt_bridged = 0; /* Not bridged */ + skb->prev = skb->next = NULL; + skb->list = NULL; + skb->sk = NULL; + skb->stamp.tv_sec=0; /* No idea about time */ + skb->ip_summed = 0; + skb->security = 0; /* By default packets are insecure */ + skb->dst = NULL; +#ifdef CONFIG_IP_FIREWALL + skb->fwmark = 0; +#endif + memset(skb->cb, 0, sizeof(skb->cb)); + skb->priority = 0; +} + +/* + * Free an skbuff by memory without cleaning the state. + */ +void kfree_skbmem(struct sk_buff *skb) +{ + if (!skb->cloned || atomic_dec_and_test(skb_datarefp(skb))) + kfree(skb->head); + + kmem_cache_free(skbuff_head_cache, skb); + atomic_dec(&net_skbcount); +} + +/* + * Free an sk_buff. Release anything attached to the buffer. Clean the state. + */ + +void __kfree_skb(struct sk_buff *skb) +{ + if (skb->list) + printk(KERN_WARNING "Warning: kfree_skb passed an skb still " + "on a list (from %p).\n", __builtin_return_address(0)); + + dst_release(skb->dst); + if(skb->destructor) + skb->destructor(skb); + skb_headerinit(skb, NULL, 0); /* clean state */ + kfree_skbmem(skb); +} + +/* + * Duplicate an sk_buff. The new one is not owned by a socket. + */ + +struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) +{ + struct sk_buff *n; + + n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + if (!n) + return NULL; + + memcpy(n, skb, sizeof(*n)); + atomic_inc(skb_datarefp(skb)); + skb->cloned = 1; + + atomic_inc(&net_allocs); + atomic_inc(&net_skbcount); + dst_clone(n->dst); + n->cloned = 1; + n->next = n->prev = NULL; + n->list = NULL; + n->sk = NULL; + n->is_clone = 1; + atomic_set(&n->users, 1); + n->destructor = NULL; + return n; +} + +/* + * This is slower, and copies the whole data area + */ + +struct sk_buff *skb_copy(struct sk_buff *skb, int gfp_mask) +{ + struct sk_buff *n; + unsigned long offset; + + /* + * Allocate the copy buffer + */ + + n=alloc_skb(skb->end - skb->head, gfp_mask); + if(n==NULL) + return NULL; + + /* + * Shift between the two data areas in bytes + */ + + offset=n->head-skb->head; + + /* Set the data pointer */ + skb_reserve(n,skb->data-skb->head); + /* Set the tail pointer and length */ + skb_put(n,skb->len); + /* Copy the bytes */ + memcpy(n->head,skb->head,skb->end-skb->head); + n->csum = skb->csum; + n->list=NULL; + n->sk=NULL; + n->dev=skb->dev; + n->priority=skb->priority; + n->protocol=skb->protocol; + n->dst=dst_clone(skb->dst); + n->h.raw=skb->h.raw+offset; + n->nh.raw=skb->nh.raw+offset; + n->mac.raw=skb->mac.raw+offset; + memcpy(n->cb, skb->cb, sizeof(skb->cb)); + n->used=skb->used; + n->is_clone=0; + atomic_set(&n->users, 1); + n->pkt_type=skb->pkt_type; + n->stamp=skb->stamp; + n->destructor = NULL; + n->security=skb->security; +#ifdef CONFIG_IP_FIREWALL + n->fwmark = skb->fwmark; +#endif + return n; +} + +struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, int newheadroom) +{ + struct sk_buff *n; + unsigned long offset; + int headroom = skb_headroom(skb); + + /* + * Allocate the copy buffer + */ + + n=alloc_skb(skb->truesize+newheadroom-headroom, GFP_ATOMIC); + if(n==NULL) + return NULL; + + skb_reserve(n,newheadroom); + + /* + * Shift between the two data areas in bytes + */ + + offset=n->data-skb->data; + + /* Set the tail pointer and length */ + skb_put(n,skb->len); + /* Copy the bytes */ + memcpy(n->data,skb->data,skb->len); + n->list=NULL; + n->sk=NULL; + 
n->priority=skb->priority; + n->protocol=skb->protocol; + n->dev=skb->dev; + n->dst=dst_clone(skb->dst); + n->h.raw=skb->h.raw+offset; + n->nh.raw=skb->nh.raw+offset; + n->mac.raw=skb->mac.raw+offset; + memcpy(n->cb, skb->cb, sizeof(skb->cb)); + n->used=skb->used; + n->is_clone=0; + atomic_set(&n->users, 1); + n->pkt_type=skb->pkt_type; + n->stamp=skb->stamp; + n->destructor = NULL; + n->security=skb->security; +#ifdef CONFIG_IP_FIREWALL + n->fwmark = skb->fwmark; +#endif + + return n; +} + +#if 0 +/* + * Tune the memory allocator for a new MTU size. + */ +void skb_add_mtu(int mtu) +{ + /* Must match allocation in alloc_skb */ + mtu = ((mtu + 15) & ~15) + sizeof(atomic_t); + + kmem_add_cache_size(mtu); +} +#endif + +void __init skb_init(void) +{ + skbuff_head_cache = kmem_cache_create("skbuff_head_cache", + sizeof(struct sk_buff), + 0, + SLAB_HWCACHE_ALIGN, + skb_headerinit, NULL); + if (!skbuff_head_cache) + panic("cannot create skbuff cache"); +} diff --git a/pfinet/linux-src/net/core/sock.c b/pfinet/linux-src/net/core/sock.c new file mode 100644 index 00000000..e0eb41a0 --- /dev/null +++ b/pfinet/linux-src/net/core/sock.c @@ -0,0 +1,1051 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic socket support routines. Memory allocators, socket lock/release + * handler for protocols to use and generic option handler. + * + * + * Version: $Id: sock.c,v 1.80 1999/05/08 03:04:34 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Florian La Roche, <flla@stud.uni-sb.de> + * Alan Cox, <A.Cox@swansea.ac.uk> + * + * Fixes: + * Alan Cox : Numerous verify_area() problems + * Alan Cox : Connecting on a connecting socket + * now returns an error for tcp. + * Alan Cox : sock->protocol is set correctly. + * and is not sometimes left as 0. + * Alan Cox : connect handles icmp errors on a + * connect properly. Unfortunately there + * is a restart syscall nasty there. I + * can't match BSD without hacking the C + * library. Ideas urgently sought! + * Alan Cox : Disallow bind() to addresses that are + * not ours - especially broadcast ones!! + * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) + * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, + * instead they leave that for the DESTROY timer. + * Alan Cox : Clean up error flag in accept + * Alan Cox : TCP ack handling is buggy, the DESTROY timer + * was buggy. Put a remove_sock() in the handler + * for memory when we hit 0. Also altered the timer + * code. The ACK stuff can wait and needs major + * TCP layer surgery. + * Alan Cox : Fixed TCP ack bug, removed remove sock + * and fixed timer/inet_bh race. + * Alan Cox : Added zapped flag for TCP + * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code + * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb + * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources + * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. + * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... + * Rick Sladkey : Relaxed UDP rules for matching packets. + * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support + * Pauline Middelink : identd support + * Alan Cox : Fixed connect() taking signals I think. 
+ * Alan Cox : SO_LINGER supported + * Alan Cox : Error reporting fixes + * Anonymous : inet_create tidied up (sk->reuse setting) + * Alan Cox : inet sockets don't set sk->type! + * Alan Cox : Split socket option code + * Alan Cox : Callbacks + * Alan Cox : Nagle flag for Charles & Johannes stuff + * Alex : Removed restriction on inet fioctl + * Alan Cox : Splitting INET from NET core + * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() + * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code + * Alan Cox : Split IP from generic code + * Alan Cox : New kfree_skbmem() + * Alan Cox : Make SO_DEBUG superuser only. + * Alan Cox : Allow anyone to clear SO_DEBUG + * (compatibility fix) + * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. + * Alan Cox : Allocator for a socket is settable. + * Alan Cox : SO_ERROR includes soft errors. + * Alan Cox : Allow NULL arguments on some SO_ opts + * Alan Cox : Generic socket allocation to make hooks + * easier (suggested by Craig Metz). + * Michael Pall : SO_ERROR returns positive errno again + * Steve Whitehouse: Added default destructor to free + * protocol private data. + * Steve Whitehouse: Added various other default routines + * common to several socket families. + * Chris Evans : Call suser() check last on F_SETOWN + * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. + * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() + * Andi Kleen : Fix write_space callback + * + * To Fix: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/poll.h> +#include <linux/init.h> + +#include <asm/uaccess.h> +#include <asm/system.h> + +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/arp.h> +#include <net/rarp.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/raw.h> +#include <net/icmp.h> +#include <linux/ipsec.h> + +#ifdef CONFIG_FILTER +#include <linux/filter.h> +#endif + +#define min(a,b) ((a)<(b)?(a):(b)) + +/* Run time adjustable parameters. */ +__u32 sysctl_wmem_max = SK_WMEM_MAX; +__u32 sysctl_rmem_max = SK_RMEM_MAX; +__u32 sysctl_wmem_default = SK_WMEM_MAX; +__u32 sysctl_rmem_default = SK_RMEM_MAX; + +/* Maximal space eaten by iovec or ancilliary data plus some space */ +int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512); + +/* + * This is meant for all protocols to use and covers goings on + * at the socket level. Everything here is generic. + */ + +int sock_setsockopt(struct socket *sock, int level, int optname, + char *optval, int optlen) +{ + struct sock *sk=sock->sk; +#ifdef CONFIG_FILTER + struct sk_filter *filter; +#endif + int val; + int valbool; + int err; + struct linger ling; + int ret = 0; + + /* + * Options without arguments + */ + +#ifdef SO_DONTLINGER /* Compatibility item... 
*/ + switch(optname) + { + case SO_DONTLINGER: + sk->linger=0; + return 0; + } +#endif + + if(optlen<sizeof(int)) + return(-EINVAL); + + err = get_user(val, (int *)optval); + if (err) + return err; + + valbool = val?1:0; + + switch(optname) + { + case SO_DEBUG: + if(val && !capable(CAP_NET_ADMIN)) + { + ret = -EACCES; + } + else + sk->debug=valbool; + break; + case SO_REUSEADDR: + sk->reuse = valbool; + break; + case SO_TYPE: + case SO_ERROR: + ret = -ENOPROTOOPT; + break; + case SO_DONTROUTE: + sk->localroute=valbool; + break; + case SO_BROADCAST: + sk->broadcast=valbool; + break; + case SO_SNDBUF: + /* Don't error on this BSD doesn't and if you think + about it this is right. Otherwise apps have to + play 'guess the biggest size' games. RCVBUF/SNDBUF + are treated in BSD as hints */ + + if (val > sysctl_wmem_max) + val = sysctl_wmem_max; + + sk->sndbuf = max(val*2,2048); + + /* + * Wake up sending tasks if we + * upped the value. + */ + sk->write_space(sk); + break; + + case SO_RCVBUF: + /* Don't error on this BSD doesn't and if you think + about it this is right. Otherwise apps have to + play 'guess the biggest size' games. RCVBUF/SNDBUF + are treated in BSD as hints */ + + if (val > sysctl_rmem_max) + val = sysctl_rmem_max; + + /* FIXME: is this lower bound the right one? */ + sk->rcvbuf = max(val*2,256); + break; + + case SO_KEEPALIVE: +#ifdef CONFIG_INET + if (sk->protocol == IPPROTO_TCP) + { + tcp_set_keepalive(sk, valbool); + } +#endif + sk->keepopen = valbool; + break; + + case SO_OOBINLINE: + sk->urginline = valbool; + break; + + case SO_NO_CHECK: + sk->no_check = valbool; + break; + + case SO_PRIORITY: + if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) + sk->priority = val; + else + return(-EPERM); + break; + + case SO_LINGER: + if(optlen<sizeof(ling)) + return -EINVAL; /* 1003.1g */ + err = copy_from_user(&ling,optval,sizeof(ling)); + if (err) + { + ret = -EFAULT; + break; + } + if(ling.l_onoff==0) + sk->linger=0; + else + { + sk->lingertime=ling.l_linger; + sk->linger=1; + } + break; + + case SO_BSDCOMPAT: + sk->bsdism = valbool; + break; + + case SO_PASSCRED: + sock->passcred = valbool; + break; + + +#ifdef CONFIG_NETDEVICES + case SO_BINDTODEVICE: + { + char devname[IFNAMSIZ]; + + /* Sorry... */ + if (!capable(CAP_NET_RAW)) + return -EPERM; + + /* Bind this socket to a particular device like "eth0", + * as specified in the passed interface name. If the + * name is "" or the option length is zero the socket + * is not bound. + */ + + if (!valbool) { + sk->bound_dev_if = 0; + } else { + if (optlen > IFNAMSIZ) + optlen = IFNAMSIZ; + if (copy_from_user(devname, optval, optlen)) + return -EFAULT; + + /* Remove any cached route for this socket. 
*/ + lock_sock(sk); + dst_release(xchg(&sk->dst_cache, NULL)); + release_sock(sk); + + if (devname[0] == '\0') { + sk->bound_dev_if = 0; + } else { + struct device *dev = dev_get(devname); + if (!dev) + return -EINVAL; + sk->bound_dev_if = dev->ifindex; + } + return 0; + } + } +#endif + + +#ifdef CONFIG_FILTER + case SO_ATTACH_FILTER: + ret = -EINVAL; + if (optlen == sizeof(struct sock_fprog)) { + struct sock_fprog fprog; + + ret = -EFAULT; + if (copy_from_user(&fprog, optval, sizeof(fprog))) + break; + + ret = sk_attach_filter(&fprog, sk); + } + break; + + case SO_DETACH_FILTER: + filter = sk->filter; + if(filter) { + sk->filter = NULL; + synchronize_bh(); + sk_filter_release(sk, filter); + return 0; + } + return -ENOENT; +#endif + /* We implement the SO_SNDLOWAT etc to + not be settable (1003.1g 5.3) */ + default: + return(-ENOPROTOOPT); + } + return ret; +} + + +int sock_getsockopt(struct socket *sock, int level, int optname, + char *optval, int *optlen) +{ + struct sock *sk = sock->sk; + + union + { + int val; + struct linger ling; + struct timeval tm; + } v; + + int lv=sizeof(int),len; + + if(get_user(len,optlen)) + return -EFAULT; + + switch(optname) + { + case SO_DEBUG: + v.val = sk->debug; + break; + + case SO_DONTROUTE: + v.val = sk->localroute; + break; + + case SO_BROADCAST: + v.val= sk->broadcast; + break; + + case SO_SNDBUF: + v.val=sk->sndbuf; + break; + + case SO_RCVBUF: + v.val =sk->rcvbuf; + break; + + case SO_REUSEADDR: + v.val = sk->reuse; + break; + + case SO_KEEPALIVE: + v.val = sk->keepopen; + break; + + case SO_TYPE: + v.val = sk->type; + break; + + case SO_ERROR: + v.val = -sock_error(sk); + if(v.val==0) + v.val=xchg(&sk->err_soft,0); + break; + + case SO_OOBINLINE: + v.val = sk->urginline; + break; + + case SO_NO_CHECK: + v.val = sk->no_check; + break; + + case SO_PRIORITY: + v.val = sk->priority; + break; + + case SO_LINGER: + lv=sizeof(v.ling); + v.ling.l_onoff=sk->linger; + v.ling.l_linger=sk->lingertime; + break; + + case SO_BSDCOMPAT: + v.val = sk->bsdism; + break; + + case SO_RCVTIMEO: + case SO_SNDTIMEO: + lv=sizeof(struct timeval); + v.tm.tv_sec=0; + v.tm.tv_usec=0; + break; + + case SO_RCVLOWAT: + case SO_SNDLOWAT: + v.val=1; + break; + + case SO_PASSCRED: + v.val = sock->passcred; + break; + + case SO_PEERCRED: + lv=sizeof(sk->peercred); + len=min(len, lv); + if(copy_to_user((void*)optval, &sk->peercred, len)) + return -EFAULT; + goto lenout; + + default: + return(-ENOPROTOOPT); + } + len=min(len,lv); + if(copy_to_user(optval,&v,len)) + return -EFAULT; +lenout: + if(put_user(len, optlen)) + return -EFAULT; + return 0; +} + +static kmem_cache_t *sk_cachep; + +/* + * All socket objects are allocated here. This is for future + * usage. 
+ */ + +struct sock *sk_alloc(int family, int priority, int zero_it) +{ + struct sock *sk = kmem_cache_alloc(sk_cachep, priority); + + if(sk) { + if (zero_it) + memset(sk, 0, sizeof(struct sock)); + sk->family = family; + } + + return sk; +} + +void sk_free(struct sock *sk) +{ +#ifdef CONFIG_FILTER + struct sk_filter *filter; +#endif + if (sk->destruct) + sk->destruct(sk); + +#ifdef CONFIG_FILTER + filter = sk->filter; + if (filter) { + sk_filter_release(sk, filter); + sk->filter = NULL; + } +#endif + + if (atomic_read(&sk->omem_alloc)) + printk(KERN_DEBUG "sk_free: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc)); + + kmem_cache_free(sk_cachep, sk); +} + +void __init sk_init(void) +{ + sk_cachep = kmem_cache_create("sock", sizeof(struct sock), 0, + SLAB_HWCACHE_ALIGN, 0, 0); + +} + +/* + * Simple resource managers for sockets. + */ + + +/* + * Write buffer destructor automatically called from kfree_skb. + */ +void sock_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + /* In case it might be waiting for more memory. */ + atomic_sub(skb->truesize, &sk->wmem_alloc); + sk->write_space(sk); +} + +/* + * Read buffer destructor automatically called from kfree_skb. + */ +void sock_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->rmem_alloc); +} + + +/* + * Allocate a skb from the socket's send buffer. + */ +struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority) +{ + if (force || atomic_read(&sk->wmem_alloc) < sk->sndbuf) { + struct sk_buff * skb = alloc_skb(size, priority); + if (skb) { + atomic_add(skb->truesize, &sk->wmem_alloc); + skb->destructor = sock_wfree; + skb->sk = sk; + return skb; + } + } + return NULL; +} + +/* + * Allocate a skb from the socket's receive buffer. + */ +struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority) +{ + if (force || atomic_read(&sk->rmem_alloc) < sk->rcvbuf) { + struct sk_buff *skb = alloc_skb(size, priority); + if (skb) { + atomic_add(skb->truesize, &sk->rmem_alloc); + skb->destructor = sock_rfree; + skb->sk = sk; + return skb; + } + } + return NULL; +} + +/* + * Allocate a memory block from the socket's option memory buffer. + */ +void *sock_kmalloc(struct sock *sk, int size, int priority) +{ + if (atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) { + void *mem; + /* First do the add, to avoid the race if kmalloc + * might sleep. + */ + atomic_add(size, &sk->omem_alloc); + mem = kmalloc(size, priority); + if (mem) + return mem; + atomic_sub(size, &sk->omem_alloc); + } + return NULL; +} + +/* + * Free an option memory block. + */ +void sock_kfree_s(struct sock *sk, void *mem, int size) +{ + kfree_s(mem, size); + atomic_sub(size, &sk->omem_alloc); +} + +/* FIXME: this is insane. We are trying suppose to be controlling how + * how much space we have for data bytes, not packet headers. + * This really points out that we need a better system for doing the + * receive buffer. -- erics + * WARNING: This is currently ONLY used in tcp. If you need it else where + * this will probably not be what you want. Possibly these two routines + * should move over to the ipv4 directory. + */ +unsigned long sock_rspace(struct sock *sk) +{ + int amt = 0; + + if (sk != NULL) { + /* This used to have some bizarre complications that + * to attempt to reserve some amount of space. 
This doesn't + * make sense, since the number returned here does not + * actually reflect allocated space, but rather the amount + * of space we committed to. We gamble that we won't + * run out of memory, and returning a smaller number does + * not change the gamble. If we lose the gamble tcp still + * works, it may just slow down for retransmissions. + */ + amt = sk->rcvbuf - atomic_read(&sk->rmem_alloc); + if (amt < 0) + amt = 0; + } + return amt; +} + + +/* It is almost wait_for_tcp_memory minus release_sock/lock_sock. + I think, these locks should be removed for datagram sockets. + */ +static void sock_wait_for_wmem(struct sock * sk) +{ + struct wait_queue wait = { current, NULL }; + + sk->socket->flags &= ~SO_NOSPACE; + add_wait_queue(sk->sleep, &wait); + for (;;) { + if (signal_pending(current)) + break; + current->state = TASK_INTERRUPTIBLE; + if (atomic_read(&sk->wmem_alloc) < sk->sndbuf) + break; + if (sk->shutdown & SEND_SHUTDOWN) + break; + if (sk->err) + break; + schedule(); + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); +} + + +/* + * Generic send/receive buffer handlers + */ + +struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, + unsigned long fallback, int noblock, int *errcode) +{ + int err; + struct sk_buff *skb; + + while (1) { + unsigned long try_size = size; + + err = sock_error(sk); + if (err != 0) + goto failure; + + /* + * We should send SIGPIPE in these cases according to + * 1003.1g draft 6.4. If we (the user) did a shutdown() + * call however we should not. + * + * Note: This routine isnt just used for datagrams and + * anyway some datagram protocols have a notion of + * close down. + */ + + err = -EPIPE; + if (sk->shutdown&SEND_SHUTDOWN) + goto failure; + + if (fallback) { + /* The buffer get won't block, or use the atomic queue. + * It does produce annoying no free page messages still. + */ + skb = sock_wmalloc(sk, size, 0, GFP_BUFFER); + if (skb) + break; + try_size = fallback; + } + skb = sock_wmalloc(sk, try_size, 0, sk->allocation); + if (skb) + break; + + /* + * This means we have too many buffers for this socket already. + */ + + sk->socket->flags |= SO_NOSPACE; + err = -EAGAIN; + if (noblock) + goto failure; + err = -ERESTARTSYS; + if (signal_pending(current)) + goto failure; + sock_wait_for_wmem(sk); + } + + return skb; + +failure: + *errcode = err; + return NULL; +} + + +void __release_sock(struct sock *sk) +{ +#ifdef CONFIG_INET + if (!sk->prot || !sk->backlog_rcv) + return; + + /* See if we have any packets built up. */ + start_bh_atomic(); + while (!skb_queue_empty(&sk->back_log)) { + struct sk_buff * skb = sk->back_log.next; + __skb_unlink(skb, &sk->back_log); + sk->backlog_rcv(sk, skb); + } + end_bh_atomic(); +#endif +} + + +/* + * Generic socket manager library. Most simpler socket families + * use this to manage their socket lists. At some point we should + * hash these. By making this generic we get the lot hashed for free. + */ + +void sklist_remove_socket(struct sock **list, struct sock *sk) +{ + struct sock *s; + + start_bh_atomic(); + + s= *list; + if(s==sk) + { + *list = s->next; + end_bh_atomic(); + return; + } + while(s && s->next) + { + if(s->next==sk) + { + s->next=sk->next; + break; + } + s=s->next; + } + end_bh_atomic(); +} + +void sklist_insert_socket(struct sock **list, struct sock *sk) +{ + start_bh_atomic(); + sk->next= *list; + *list=sk; + end_bh_atomic(); +} + +/* + * This is only called from user mode. 
Thus it protects itself against + * interrupt users but doesn't worry about being called during work. + * Once it is removed from the queue no interrupt or bottom half will + * touch it and we are (fairly 8-) ) safe. + */ + +void sklist_destroy_socket(struct sock **list, struct sock *sk); + +/* + * Handler for deferred kills. + */ + +static void sklist_destroy_timer(unsigned long data) +{ + struct sock *sk=(struct sock *)data; + sklist_destroy_socket(NULL,sk); +} + +/* + * Destroy a socket. We pass NULL for a list if we know the + * socket is not on a list. + */ + +void sklist_destroy_socket(struct sock **list,struct sock *sk) +{ + struct sk_buff *skb; + if(list) + sklist_remove_socket(list, sk); + + while((skb=skb_dequeue(&sk->receive_queue))!=NULL) + { + kfree_skb(skb); + } + + if(atomic_read(&sk->wmem_alloc) == 0 && + atomic_read(&sk->rmem_alloc) == 0 && + sk->dead) + { + sk_free(sk); + } + else + { + /* + * Someone is using our buffers still.. defer + */ + init_timer(&sk->timer); + sk->timer.expires=jiffies+SOCK_DESTROY_TIME; + sk->timer.function=sklist_destroy_timer; + sk->timer.data = (unsigned long)sk; + add_timer(&sk->timer); + } +} + +/* + * Set of default routines for initialising struct proto_ops when + * the protocol does not support a particular function. In certain + * cases where it makes no sense for a protocol to have a "do nothing" + * function, some default processing is provided. + */ + +int sock_no_dup(struct socket *newsock, struct socket *oldsock) +{ + struct sock *sk = oldsock->sk; + + return net_families[sk->family]->create(newsock, sk->protocol); +} + +int sock_no_release(struct socket *sock, struct socket *peersock) +{ + return 0; +} + +int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) +{ + return -EOPNOTSUPP; +} + +int sock_no_connect(struct socket *sock, struct sockaddr *saddr, + int len, int flags) +{ + return -EOPNOTSUPP; +} + +int sock_no_socketpair(struct socket *sock1, struct socket *sock2) +{ + return -EOPNOTSUPP; +} + +int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) +{ + return -EOPNOTSUPP; +} + +int sock_no_getname(struct socket *sock, struct sockaddr *saddr, + int *len, int peer) +{ + return -EOPNOTSUPP; +} + +unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt) +{ + return 0; +} + +int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return -EOPNOTSUPP; +} + +int sock_no_listen(struct socket *sock, int backlog) +{ + return -EOPNOTSUPP; +} + +int sock_no_shutdown(struct socket *sock, int how) +{ + return -EOPNOTSUPP; +} + +int sock_no_setsockopt(struct socket *sock, int level, int optname, + char *optval, int optlen) +{ + return -EOPNOTSUPP; +} + +int sock_no_getsockopt(struct socket *sock, int level, int optname, + char *optval, int *optlen) +{ + return -EOPNOTSUPP; +} + +/* + * Note: if you add something that sleeps here then change sock_fcntl() + * to do proper fd locking. + */ +int sock_no_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + + switch(cmd) + { + case F_SETOWN: + /* + * This is a little restrictive, but it's the only + * way to make sure that you can't send a sigurg to + * another process. 
+ */ + if (current->pgrp != -arg && + current->pid != arg && + !capable(CAP_KILL)) return(-EPERM); + sk->proc = arg; + return(0); + case F_GETOWN: + return(sk->proc); + default: + return(-EINVAL); + } +} + +int sock_no_sendmsg(struct socket *sock, struct msghdr *m, int flags, + struct scm_cookie *scm) +{ + return -EOPNOTSUPP; +} + +int sock_no_recvmsg(struct socket *sock, struct msghdr *m, int flags, + struct scm_cookie *scm) +{ + return -EOPNOTSUPP; +} + + + +/* + * Default Socket Callbacks + */ + +void sock_def_wakeup(struct sock *sk) +{ + if(!sk->dead) + wake_up_interruptible(sk->sleep); +} + +void sock_def_error_report(struct sock *sk) +{ + if (!sk->dead) { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket,0); + } +} + +void sock_def_readable(struct sock *sk, int len) +{ + if(!sk->dead) { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket,1); + } +} + +void sock_def_write_space(struct sock *sk) +{ + /* Do not wake up a writer until he can make "significant" + * progress. --DaveM + */ + if(!sk->dead && + ((atomic_read(&sk->wmem_alloc) << 1) <= sk->sndbuf)) { + wake_up_interruptible(sk->sleep); + + /* Should agree with poll, otherwise some programs break */ + if (sock_writeable(sk)) + sock_wake_async(sk->socket, 2); + } +} + +void sock_def_destruct(struct sock *sk) +{ + if (sk->protinfo.destruct_hook) + kfree(sk->protinfo.destruct_hook); +} + +void sock_init_data(struct socket *sock, struct sock *sk) +{ + skb_queue_head_init(&sk->receive_queue); + skb_queue_head_init(&sk->write_queue); + skb_queue_head_init(&sk->back_log); + skb_queue_head_init(&sk->error_queue); + + init_timer(&sk->timer); + + sk->allocation = GFP_KERNEL; + sk->rcvbuf = sysctl_rmem_default; + sk->sndbuf = sysctl_wmem_default; + sk->state = TCP_CLOSE; + sk->zapped = 1; + sk->socket = sock; + + if(sock) + { + sk->type = sock->type; + sk->sleep = &sock->wait; + sock->sk = sk; + } + + sk->state_change = sock_def_wakeup; + sk->data_ready = sock_def_readable; + sk->write_space = sock_def_write_space; + sk->error_report = sock_def_error_report; + sk->destruct = sock_def_destruct; + + sk->peercred.pid = 0; + sk->peercred.uid = -1; + sk->peercred.gid = -1; + +} diff --git a/pfinet/linux-src/net/core/sysctl_net_core.c b/pfinet/linux-src/net/core/sysctl_net_core.c new file mode 100644 index 00000000..446ca145 --- /dev/null +++ b/pfinet/linux-src/net/core/sysctl_net_core.c @@ -0,0 +1,61 @@ +/* -*- linux-c -*- + * sysctl_net_core.c: sysctl interface to net core subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/core directory entry (empty =) ). 
[MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <linux/config.h> + +#ifdef CONFIG_SYSCTL + +extern int netdev_max_backlog; +extern int netdev_fastroute; +extern int net_msg_cost; +extern int net_msg_burst; + +extern __u32 sysctl_wmem_max; +extern __u32 sysctl_rmem_max; +extern __u32 sysctl_wmem_default; +extern __u32 sysctl_rmem_default; + +extern int sysctl_core_destroy_delay; +extern int sysctl_optmem_max; + +ctl_table core_table[] = { +#ifdef CONFIG_NET + {NET_CORE_WMEM_MAX, "wmem_max", + &sysctl_wmem_max, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_CORE_RMEM_MAX, "rmem_max", + &sysctl_rmem_max, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_CORE_WMEM_DEFAULT, "wmem_default", + &sysctl_wmem_default, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_CORE_RMEM_DEFAULT, "rmem_default", + &sysctl_rmem_default, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_CORE_MAX_BACKLOG, "netdev_max_backlog", + &netdev_max_backlog, sizeof(int), 0644, NULL, + &proc_dointvec}, +#ifdef CONFIG_NET_FASTROUTE + {NET_CORE_FASTROUTE, "netdev_fastroute", + &netdev_fastroute, sizeof(int), 0644, NULL, + &proc_dointvec}, +#endif + {NET_CORE_MSG_COST, "message_cost", + &net_msg_cost, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_CORE_MSG_BURST, "message_burst", + &net_msg_burst, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_CORE_OPTMEM_MAX, "optmem_max", + &sysctl_optmem_max, sizeof(int), 0644, NULL, + &proc_dointvec}, +#endif /* CONFIG_NET */ + { 0 } +}; +#endif diff --git a/pfinet/linux-src/net/core/utils.c b/pfinet/linux-src/net/core/utils.c new file mode 100644 index 00000000..415926b8 --- /dev/null +++ b/pfinet/linux-src/net/core/utils.c @@ -0,0 +1,66 @@ +/* + * Generic address resultion entity + * + * Authors: + * net_random Alan Cox + * net_ratelimit Andy Kleen + * + * Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> + +static unsigned long net_rand_seed = 152L; + +unsigned long net_random(void) +{ + net_rand_seed=net_rand_seed*69069L+1; + return net_rand_seed^jiffies; +} + +void net_srandom(unsigned long entropy) +{ + net_rand_seed ^= entropy; + net_random(); +} + +int net_msg_cost = 5*HZ; +int net_msg_burst = 10*5*HZ; + +/* + * This enforces a rate limit: not more than one kernel message + * every 5secs to make a denial-of-service attack impossible. + * + * All warning printk()s should be guarded by this function. + */ +int net_ratelimit(void) +{ + static unsigned long toks = 10*5*HZ; + static unsigned long last_msg; + static int missed; + unsigned long now = jiffies; + + toks += now - xchg(&last_msg, now); + if (toks > net_msg_burst) + toks = net_msg_burst; + if (toks >= net_msg_cost) { + toks -= net_msg_cost; + if (missed) + printk(KERN_WARNING "NET: %d messages suppressed.\n", missed); + missed = 0; + return 1; + } + missed++; + return 0; +} diff --git a/pfinet/linux-src/net/ethernet/Makefile b/pfinet/linux-src/net/ethernet/Makefile new file mode 100644 index 00000000..193d6af8 --- /dev/null +++ b/pfinet/linux-src/net/ethernet/Makefile @@ -0,0 +1,33 @@ +# +# Makefile for the Linux Ethernet layer. 
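The net_ratelimit() helper above is a small token bucket: each printed message costs net_msg_cost jiffies' worth of tokens, at most net_msg_burst tokens can accumulate, and refill happens as jiffies advance, so sustained output is capped at roughly one message per five seconds. Its comment asks that every warning printk() on a packet path go through it; a minimal sketch of that caller pattern, assuming only a hypothetical dev pointer in scope (not code from this tree), looks like:

	if (net_ratelimit())
		printk(KERN_WARNING "%s: dropping malformed packet\n", dev->name);

Without the guard, a flood of bad packets could turn the warning itself into a denial of service, which is exactly what the limiter is there to prevent.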
+# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +O_TARGET := ethernet.o + +OBJS := eth.o + +ifeq ($(CONFIG_SYSCTL),y) +OBJS += sysctl_net_ether.o +endif + +ifdef CONFIG_IPX +OBJ2 := pe2.o +endif + +ifdef CONFIG_ATALK +OBJ2 := pe2.o +endif + +ifdef CONFIG_NET +O_OBJS := $(OBJS) $(OBJ2) +endif + +include $(TOPDIR)/Rules.make + +tar: + tar -cvf /dev/f1 . diff --git a/pfinet/linux-src/net/ethernet/eth.c b/pfinet/linux-src/net/ethernet/eth.c new file mode 100644 index 00000000..bce35d48 --- /dev/null +++ b/pfinet/linux-src/net/ethernet/eth.c @@ -0,0 +1,298 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Ethernet-type device handling. + * + * Version: @(#)eth.c 1.0.7 05/25/93 + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * Florian La Roche, <rzsfl@rz.uni-sb.de> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * + * Fixes: + * Mr Linux : Arp problems + * Alan Cox : Generic queue tidyup (very tiny here) + * Alan Cox : eth_header ntohs should be htons + * Alan Cox : eth_rebuild_header missing an htons and + * minor other things. + * Tegge : Arp bug fixes. + * Florian : Removed many unnecessary functions, code cleanup + * and changes for new arp and skbuff. + * Alan Cox : Redid header building to reflect new format. + * Alan Cox : ARP only when compiled with CONFIG_INET + * Greg Page : 802.2 and SNAP stuff. + * Alan Cox : MAC layer pointers/new format. + * Paul Gortmaker : eth_copy_and_sum shouldn't csum padding. + * Alan Cox : Protect against forwarding explosions with + * older network drivers and IFF_ALLMULTI. + * Christer Weinigel : Better rebuild header message. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/ip.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/config.h> +#include <linux/init.h> +#include <net/dst.h> +#include <net/arp.h> +#include <net/sock.h> +#include <net/ipv6.h> +#include <net/ip.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/checksum.h> + + +__initfunc(void eth_setup(char *str, int *ints)) +{ + struct device *d = dev_base; + + if (!str || !*str) + return; + while (d) + { + if (!strcmp(str,d->name)) + { + if (ints[0] > 0) + d->irq=ints[1]; + if (ints[0] > 1) + d->base_addr=ints[2]; + if (ints[0] > 2) + d->mem_start=ints[3]; + if (ints[0] > 3) + d->mem_end=ints[4]; + break; + } + d=d->next; + } +} + + +/* + * Create the Ethernet MAC header for an arbitrary protocol layer + * + * saddr=NULL means use device source address + * daddr=NULL means leave destination address (eg unresolved arp) + */ + +int eth_header(struct sk_buff *skb, struct device *dev, unsigned short type, + void *daddr, void *saddr, unsigned len) +{ + struct ethhdr *eth = (struct ethhdr *)skb_push(skb,ETH_HLEN); + + /* + * Set the protocol type. For a packet of type ETH_P_802_3 we put the length + * in here instead. It is up to the 802.2 layer to carry protocol information. + */ + + if(type!=ETH_P_802_3) + eth->h_proto = htons(type); + else + eth->h_proto = htons(len); + + /* + * Set the source hardware address. + */ + + if(saddr) + memcpy(eth->h_source,saddr,dev->addr_len); + else + memcpy(eth->h_source,dev->dev_addr,dev->addr_len); + + /* + * Anyway, the loopback-device should never use this function... + */ + + if (dev->flags & (IFF_LOOPBACK|IFF_NOARP)) + { + memset(eth->h_dest, 0, dev->addr_len); + return(dev->hard_header_len); + } + + if(daddr) + { + memcpy(eth->h_dest,daddr,dev->addr_len); + return dev->hard_header_len; + } + + return -dev->hard_header_len; +} + + +/* + * Rebuild the Ethernet MAC header. This is called after an ARP + * (or in future other address resolution) has completed on this + * sk_buff. We now let ARP fill in the other fields. + * + * This routine CANNOT use cached dst->neigh! + * Really, it is used only when dst->neigh is wrong. + */ + +int eth_rebuild_header(struct sk_buff *skb) +{ + struct ethhdr *eth = (struct ethhdr *)skb->data; + struct device *dev = skb->dev; + + switch (eth->h_proto) + { +#ifdef CONFIG_INET + case __constant_htons(ETH_P_IP): + return arp_find(eth->h_dest, skb); +#endif + default: + printk(KERN_DEBUG + "%s: unable to resolve type %X addresses.\n", + dev->name, (int)eth->h_proto); + + memcpy(eth->h_source, dev->dev_addr, dev->addr_len); + break; + } + + return 0; +} + + +/* + * Determine the packet's protocol ID. The rule here is that we + * assume 802.3 if the type field is short enough to be a length. + * This is normal practice and works for any 'now in use' protocol. 
+ */ + +unsigned short eth_type_trans(struct sk_buff *skb, struct device *dev) +{ + struct ethhdr *eth; + unsigned char *rawp; + + skb->mac.raw=skb->data; + skb_pull(skb,dev->hard_header_len); + eth= skb->mac.ethernet; + + if(*eth->h_dest&1) + { + if(memcmp(eth->h_dest,dev->broadcast, ETH_ALEN)==0) + skb->pkt_type=PACKET_BROADCAST; + else + skb->pkt_type=PACKET_MULTICAST; + } + + /* + * This ALLMULTI check should be redundant by 1.4 + * so don't forget to remove it. + * + * Seems, you forgot to remove it. All silly devices + * seems to set IFF_PROMISC. + */ + + else if(dev->flags&(IFF_PROMISC/*|IFF_ALLMULTI*/)) + { + if(memcmp(eth->h_dest,dev->dev_addr, ETH_ALEN)) + skb->pkt_type=PACKET_OTHERHOST; + } + + if (ntohs(eth->h_proto) >= 1536) + return eth->h_proto; + + rawp = skb->data; + + /* + * This is a magic hack to spot IPX packets. Older Novell breaks + * the protocol design and runs IPX over 802.3 without an 802.2 LLC + * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This + * won't work for fault tolerant netware but does for the rest. + */ + if (*(unsigned short *)rawp == 0xFFFF) + return htons(ETH_P_802_3); + + /* + * Real 802.2 LLC + */ + return htons(ETH_P_802_2); +} + +int eth_header_parse(struct sk_buff *skb, unsigned char *haddr) +{ + struct ethhdr *eth = skb->mac.ethernet; + memcpy(haddr, eth->h_source, ETH_ALEN); + return ETH_ALEN; +} + +int eth_header_cache(struct neighbour *neigh, struct hh_cache *hh) +{ + unsigned short type = hh->hh_type; + struct ethhdr *eth = (struct ethhdr*)(((u8*)hh->hh_data) + 2); + struct device *dev = neigh->dev; + + if (type == __constant_htons(ETH_P_802_3)) + return -1; + + eth->h_proto = type; + memcpy(eth->h_source, dev->dev_addr, dev->addr_len); + memcpy(eth->h_dest, neigh->ha, dev->addr_len); + return 0; +} + +/* + * Called by Address Resolution module to notify changes in address. + */ + +void eth_header_cache_update(struct hh_cache *hh, struct device *dev, unsigned char * haddr) +{ + memcpy(((u8*)hh->hh_data) + 2, haddr, dev->addr_len); +} + +#ifndef CONFIG_IP_ROUTER + +/* + * Copy from an ethernet device memory space to an sk_buff while checksumming if IP + */ + +void eth_copy_and_sum(struct sk_buff *dest, unsigned char *src, int length, int base) +{ + struct ethhdr *eth; + struct iphdr *iph; + int ip_length; + + eth=(struct ethhdr *)src; + if(eth->h_proto!=htons(ETH_P_IP)) + { + memcpy(dest->data,src,length); + return; + } + /* + * We have to watch for padded packets. The csum doesn't include the + * padding, and there is no point in copying the padding anyway. + * We have to use the smaller of length and ip_length because it + * can happen that ip_length > length. 
+ */ + memcpy(dest->data,src,sizeof(struct iphdr)+ETH_HLEN); /* ethernet is always >= 34 */ + length -= sizeof(struct iphdr) + ETH_HLEN; + iph=(struct iphdr*)(src+ETH_HLEN); + ip_length = ntohs(iph->tot_len) - sizeof(struct iphdr); + + /* Also watch out for bogons - min IP size is 8 (rfc-1042) */ + if ((ip_length <= length) && (ip_length > 7)) + length=ip_length; + + dest->csum=csum_partial_copy(src+sizeof(struct iphdr)+ETH_HLEN,dest->data+sizeof(struct iphdr)+ETH_HLEN,length,base); + dest->ip_summed=1; +} + +#endif /* !(CONFIG_IP_ROUTER) */ diff --git a/pfinet/linux-src/net/ethernet/pe2.c b/pfinet/linux-src/net/ethernet/pe2.c new file mode 100644 index 00000000..4915f070 --- /dev/null +++ b/pfinet/linux-src/net/ethernet/pe2.c @@ -0,0 +1,38 @@ +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/datalink.h> +#include <linux/mm.h> +#include <linux/in.h> + +static void +pEII_datalink_header(struct datalink_proto *dl, + struct sk_buff *skb, unsigned char *dest_node) +{ + struct device *dev = skb->dev; + + skb->protocol = htons (ETH_P_IPX); + if(dev->hard_header) + dev->hard_header(skb, dev, ETH_P_IPX, dest_node, NULL, skb->len); +} + +struct datalink_proto * +make_EII_client(void) +{ + struct datalink_proto *proto; + + proto = (struct datalink_proto *) kmalloc(sizeof(*proto), GFP_ATOMIC); + if (proto != NULL) { + proto->type_len = 0; + proto->header_length = 0; + proto->datalink_header = pEII_datalink_header; + proto->string_name = "EtherII"; + } + + return proto; +} + +void destroy_EII_client(struct datalink_proto *dl) +{ + if (dl) + kfree_s(dl, sizeof(struct datalink_proto)); +} diff --git a/pfinet/linux-src/net/ethernet/sysctl_net_ether.c b/pfinet/linux-src/net/ethernet/sysctl_net_ether.c new file mode 100644 index 00000000..b81a6d53 --- /dev/null +++ b/pfinet/linux-src/net/ethernet/sysctl_net_ether.c @@ -0,0 +1,13 @@ +/* -*- linux-c -*- + * sysctl_net_ether.c: sysctl interface to net Ethernet subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/ether directory entry (empty =) ). [MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> + +ctl_table ether_table[] = { + {0} +}; diff --git a/pfinet/linux-src/net/ipv4/Config.in b/pfinet/linux-src/net/ipv4/Config.in new file mode 100644 index 00000000..29786da5 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/Config.in @@ -0,0 +1,88 @@ +# +# IP configuration +# +bool 'IP: multicasting' CONFIG_IP_MULTICAST +bool 'IP: advanced router' CONFIG_IP_ADVANCED_ROUTER +if [ "$CONFIG_IP_ADVANCED_ROUTER" = "y" ]; then + define_bool CONFIG_RTNETLINK y + define_bool CONFIG_NETLINK y + bool 'IP: policy routing' CONFIG_IP_MULTIPLE_TABLES + bool 'IP: equal cost multipath' CONFIG_IP_ROUTE_MULTIPATH + bool 'IP: use TOS value as routing key' CONFIG_IP_ROUTE_TOS + bool 'IP: verbose route monitoring' CONFIG_IP_ROUTE_VERBOSE + bool 'IP: large routing tables' CONFIG_IP_ROUTE_LARGE_TABLES + if [ "$CONFIG_IP_MULTIPLE_TABLES" = "y" ]; then + bool 'IP: fast network address translation' CONFIG_IP_ROUTE_NAT + fi +fi +bool 'IP: kernel level autoconfiguration' CONFIG_IP_PNP +if [ "$CONFIG_IP_PNP" = "y" ]; then + bool ' BOOTP support' CONFIG_IP_PNP_BOOTP + bool ' RARP support' CONFIG_IP_PNP_RARP +# not yet ready.. 
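make_EII_client() above hands back a minimal struct datalink_proto whose only job is to prepend an Ethernet II header for IPX-over-DIX framing. A purely illustrative sketch of a caller follows; the skb and the six-byte dest_node are assumed to exist, and this is not taken from the IPX code itself:

	/* protocol init time; allocation uses GFP_ATOMIC and may fail */
	struct datalink_proto *p = make_EII_client();

	/* per outgoing frame: stamps skb->protocol with ETH_P_IPX and asks
	 * the device to build its MAC header for dest_node
	 */
	if (p != NULL)
		p->datalink_header(p, skb, dest_node);

	/* protocol teardown; safe to call with NULL */
	destroy_EII_client(p);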
+# bool ' ARP support' CONFIG_IP_PNP_ARP +fi +if [ "$CONFIG_FIREWALL" = "y" ]; then + bool 'IP: firewalling' CONFIG_IP_FIREWALL + if [ "$CONFIG_IP_FIREWALL" = "y" ]; then + if [ "$CONFIG_NETLINK" = "y" ]; then + bool 'IP: firewall packet netlink device' CONFIG_IP_FIREWALL_NETLINK + if [ "$CONFIG_IP_FIREWALL_NETLINK" = "y" ]; then + define_bool CONFIG_NETLINK_DEV y + fi + fi + bool 'IP: always defragment (required for masquerading)' CONFIG_IP_ALWAYS_DEFRAG + if [ "$CONFIG_IP_MULTIPLE_TABLES" = "y" ]; then + bool 'IP: use FWMARK value as routing key' CONFIG_IP_ROUTE_FWMARK + fi + fi +fi +if [ "$CONFIG_IP_FIREWALL" = "y" ]; then + if [ "$CONFIG_IP_ALWAYS_DEFRAG" != "n" ]; then + bool 'IP: transparent proxy support' CONFIG_IP_TRANSPARENT_PROXY + bool 'IP: masquerading' CONFIG_IP_MASQUERADE + if [ "$CONFIG_IP_MASQUERADE" != "n" ]; then + comment 'Protocol-specific masquerading support will be built as modules.' + bool 'IP: ICMP masquerading' CONFIG_IP_MASQUERADE_ICMP + comment 'Protocol-specific masquerading support will be built as modules.' + if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + bool 'IP: masquerading special modules support' CONFIG_IP_MASQUERADE_MOD + if [ "$CONFIG_IP_MASQUERADE_MOD" = "y" ]; then + tristate 'IP: ipautofw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPAUTOFW + tristate 'IP: ipportfw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPPORTFW + tristate 'IP: ip fwmark masq-forwarding support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_MFW + fi + fi + fi + fi +fi +bool 'IP: optimize as router not host' CONFIG_IP_ROUTER +tristate 'IP: tunneling' CONFIG_NET_IPIP +tristate 'IP: GRE tunnels over IP' CONFIG_NET_IPGRE +if [ "$CONFIG_IP_MULTICAST" = "y" ]; then + if [ "$CONFIG_NET_IPGRE" != "n" ]; then + bool 'IP: broadcast GRE over IP' CONFIG_NET_IPGRE_BROADCAST + fi + bool 'IP: multicast routing' CONFIG_IP_MROUTE + if [ "$CONFIG_IP_MROUTE" = "y" ]; then + bool 'IP: PIM-SM version 1 support' CONFIG_IP_PIMSM_V1 + bool 'IP: PIM-SM version 2 support' CONFIG_IP_PIMSM_V2 + fi +fi +bool 'IP: aliasing support' CONFIG_IP_ALIAS +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + if [ "$CONFIG_RTNETLINK" = "y" ]; then + bool 'IP: ARP daemon support (EXPERIMENTAL)' CONFIG_ARPD + fi +fi +bool 'IP: TCP syncookie support (not enabled per default)' CONFIG_SYN_COOKIES +comment '(it is safe to leave these untouched)' +#bool 'IP: PC/TCP compatibility mode' CONFIG_INET_PCTCP +tristate 'IP: Reverse ARP' CONFIG_INET_RARP +#bool 'IP: Path MTU Discovery (normally enabled)' CONFIG_PATH_MTU_DISCOVERY +#bool 'IP: Disable NAGLE algorithm (normally enabled)' CONFIG_TCP_NAGLE_OFF +bool 'IP: Allow large windows (not recommended if <16Mb of memory)' CONFIG_SKB_LARGE +#if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then +#bool 'IP: support experimental checksum copy to user for UDP' CONFIG_UDP_DELAY_CSUM +#fi + diff --git a/pfinet/linux-src/net/ipv4/Makefile b/pfinet/linux-src/net/ipv4/Makefile new file mode 100644 index 00000000..8ab280de --- /dev/null +++ b/pfinet/linux-src/net/ipv4/Makefile @@ -0,0 +1,116 @@ +# +# Makefile for the Linux TCP/IP (INET) layer. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... 
+ +O_TARGET := ipv4.o +IPV4_OBJS := utils.o route.o proc.o timer.o protocol.o \ + ip_input.o ip_fragment.o ip_forward.o ip_options.o \ + ip_output.o ip_sockglue.o \ + tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o\ + raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ + sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o +IPV4X_OBJS := + +MOD_LIST_NAME := IPV4_MODULES +M_OBJS := + +ifeq ($(CONFIG_IP_FIREWALL),y) +IPV4_OBJS += ip_fw.o +endif + +ifeq ($(CONFIG_IP_MULTIPLE_TABLES),y) +IPV4_OBJS += fib_rules.o +endif + +ifeq ($(CONFIG_IP_ROUTE_NAT),y) +IPV4_OBJS += ip_nat_dumb.o +endif + +ifeq ($(CONFIG_IP_MROUTE),y) +IPV4_OBJS += ipmr.o +endif + +ifeq ($(CONFIG_INET_RARP),y) +IPV4_OBJS += rarp.o +else + ifeq ($(CONFIG_INET_RARP),m) + M_OBJS += rarp.o + endif +endif + +ifeq ($(CONFIG_NET_IPIP),y) +IPV4X_OBJS += ipip.o +else + ifeq ($(CONFIG_NET_IPIP),m) + MX_OBJS += ipip.o + endif +endif + +ifeq ($(CONFIG_NET_IPGRE),y) +IPV4X_OBJS += ip_gre.o +else + ifeq ($(CONFIG_NET_IPGRE),m) + MX_OBJS += ip_gre.o + endif +endif + +ifeq ($(CONFIG_IP_MASQUERADE),y) +IPV4X_OBJS += ip_masq.o ip_masq_app.o + +ifeq ($(CONFIG_IP_MASQUERADE_MOD),y) + IPV4X_OBJS += ip_masq_mod.o + + ifeq ($(CONFIG_IP_MASQUERADE_IPAUTOFW),y) + IPV4_OBJS += ip_masq_autofw.o + else + ifeq ($(CONFIG_IP_MASQUERADE_IPAUTOFW),m) + M_OBJS += ip_masq_autofw.o + endif + endif + + ifeq ($(CONFIG_IP_MASQUERADE_IPPORTFW),y) + IPV4_OBJS += ip_masq_portfw.o + else + ifeq ($(CONFIG_IP_MASQUERADE_IPPORTFW),m) + M_OBJS += ip_masq_portfw.o + endif + endif + + ifeq ($(CONFIG_IP_MASQUERADE_MFW),y) + IPV4_OBJS += ip_masq_mfw.o + else + ifeq ($(CONFIG_IP_MASQUERADE_MFW),m) + M_OBJS += ip_masq_mfw.o + endif + endif + +endif + +M_OBJS += ip_masq_user.o +M_OBJS += ip_masq_ftp.o ip_masq_irc.o ip_masq_raudio.o ip_masq_quake.o +M_OBJS += ip_masq_vdolive.o ip_masq_cuseeme.o +endif + +ifeq ($(CONFIG_SYN_COOKIES),y) +IPV4_OBJS += syncookies.o +# module not supported, because it would be too messy. +endif + +ifeq ($(CONFIG_IP_PNP),y) +IPV4_OBJS += ipconfig.o +endif + +ifdef CONFIG_INET +O_OBJS := $(IPV4_OBJS) +OX_OBJS := $(IPV4X_OBJS) +endif + +include $(TOPDIR)/Rules.make + +tar: + tar -cvf /dev/f1 . diff --git a/pfinet/linux-src/net/ipv4/af_inet.c b/pfinet/linux-src/net/ipv4/af_inet.c new file mode 100644 index 00000000..e37eb6bd --- /dev/null +++ b/pfinet/linux-src/net/ipv4/af_inet.c @@ -0,0 +1,1161 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * PF_INET protocol family socket handler. + * + * Version: $Id: af_inet.c,v 1.87.2.5 1999/08/08 08:43:10 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Florian La Roche, <flla@stud.uni-sb.de> + * Alan Cox, <A.Cox@swansea.ac.uk> + * + * Changes (see also sock.c) + * + * A.N.Kuznetsov : Socket death error in accept(). + * John Richardson : Fix non blocking error in connect() + * so sockets that fail to connect + * don't return -EINPROGRESS. + * Alan Cox : Asynchronous I/O support + * Alan Cox : Keep correct socket pointer on sock structures + * when accept() ed + * Alan Cox : Semantics of SO_LINGER aren't state moved + * to close when you look carefully. With + * this fixed and the accept bug fixed + * some RPC stuff seems happier. + * Niibe Yutaka : 4.4BSD style write async I/O + * Alan Cox, + * Tony Gale : Fixed reuse semantics. 
+ * Alan Cox : bind() shouldn't abort existing but dead + * sockets. Stops FTP netin:.. I hope. + * Alan Cox : bind() works correctly for RAW sockets. Note + * that FreeBSD at least was broken in this respect + * so be careful with compatibility tests... + * Alan Cox : routing cache support + * Alan Cox : memzero the socket structure for compactness. + * Matt Day : nonblock connect error handler + * Alan Cox : Allow large numbers of pending sockets + * (eg for big web sites), but only if + * specifically application requested. + * Alan Cox : New buffering throughout IP. Used dumbly. + * Alan Cox : New buffering now used smartly. + * Alan Cox : BSD rather than common sense interpretation of + * listen. + * Germano Caronni : Assorted small races. + * Alan Cox : sendmsg/recvmsg basic support. + * Alan Cox : Only sendmsg/recvmsg now supported. + * Alan Cox : Locked down bind (see security list). + * Alan Cox : Loosened bind a little. + * Mike McLagan : ADD/DEL DLCI Ioctls + * Willy Konynenberg : Transparent proxying support. + * David S. Miller : New socket lookup architecture. + * Some other random speedups. + * Cyrus Durgin : Cleaned up file for kmod hacks. + * Andi Kleen : Fix inet_stream_connect TCP race. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/init.h> +#include <linux/poll.h> + +#include <asm/uaccess.h> +#include <asm/system.h> + +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/arp.h> +#include <net/rarp.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/raw.h> +#include <net/icmp.h> +#include <net/ipip.h> +#include <net/inet_common.h> +#include <linux/ip_fw.h> +#ifdef CONFIG_IP_MROUTE +#include <linux/mroute.h> +#endif +#ifdef CONFIG_IP_MASQUERADE +#include <net/ip_masq.h> +#endif +#ifdef CONFIG_BRIDGE +#include <net/br.h> +#endif +#ifdef CONFIG_KMOD +#include <linux/kmod.h> +#endif +#ifdef CONFIG_NET_RADIO +#include <linux/wireless.h> +#endif /* CONFIG_NET_RADIO */ + +#define min(a,b) ((a)<(b)?(a):(b)) + +struct linux_mib net_statistics; + +extern int raw_get_info(char *, char **, off_t, int, int); +extern int snmp_get_info(char *, char **, off_t, int, int); +extern int netstat_get_info(char *, char **, off_t, int, int); +extern int afinet_get_info(char *, char **, off_t, int, int); +extern int tcp_get_info(char *, char **, off_t, int, int); +extern int udp_get_info(char *, char **, off_t, int, int); +extern void ip_mc_drop_socket(struct sock *sk); + +#ifdef CONFIG_DLCI +extern int dlci_ioctl(unsigned int, void*); +#endif + +#ifdef CONFIG_DLCI_MODULE +int (*dlci_ioctl_hook)(unsigned int, void *) = NULL; +#endif + +int (*rarp_ioctl_hook)(unsigned int,void*) = NULL; + +/* + * Destroy an AF_INET socket + */ + +static __inline__ void kill_sk_queues(struct sock 
*sk) +{ + struct sk_buff *skb; + + /* First the read buffer. */ + while((skb = skb_dequeue(&sk->receive_queue)) != NULL) + kfree_skb(skb); + + /* Next, the error queue. */ + while((skb = skb_dequeue(&sk->error_queue)) != NULL) + kfree_skb(skb); + + /* Now the backlog. */ + while((skb=skb_dequeue(&sk->back_log)) != NULL) + kfree_skb(skb); +} + +static __inline__ void kill_sk_now(struct sock *sk) +{ + /* No longer exists. */ + del_from_prot_sklist(sk); + + /* Remove from protocol hash chains. */ + sk->prot->unhash(sk); + + if(sk->opt) + kfree(sk->opt); + dst_release(sk->dst_cache); + sk_free(sk); +} + +static __inline__ void kill_sk_later(struct sock *sk) +{ + /* this should never happen. */ + /* actually it can if an ack has just been sent. */ + /* + * It's more normal than that... + * It can happen because a skb is still in the device queues + * [PR] + */ + + NETDEBUG(printk(KERN_DEBUG "Socket destroy delayed (r=%d w=%d)\n", + atomic_read(&sk->rmem_alloc), + atomic_read(&sk->wmem_alloc))); + + sk->ack_backlog = 0; + release_sock(sk); + net_reset_timer(sk, TIME_DESTROY, SOCK_DESTROY_TIME); +} + +void destroy_sock(struct sock *sk) +{ + lock_sock(sk); /* just to be safe. */ + + /* Now we can no longer get new packets or once the + * timers are killed, send them. + */ + net_delete_timer(sk); + + if (sk->prot->destroy && !sk->destroy) + sk->prot->destroy(sk); + + sk->destroy = 1; + + kill_sk_queues(sk); + + /* Now if everything is gone we can free the socket + * structure, otherwise we need to keep it around until + * everything is gone. + */ + if (atomic_read(&sk->rmem_alloc) == 0 && atomic_read(&sk->wmem_alloc) == 0) + kill_sk_now(sk); + else + kill_sk_later(sk); +} + +/* + * The routines beyond this point handle the behaviour of an AF_INET + * socket object. Mostly it punts to the subprotocols of IP to do + * the work. + */ + + +/* + * Set socket options on an inet socket. + */ + +int inet_setsockopt(struct socket *sock, int level, int optname, + char *optval, int optlen) +{ + struct sock *sk=sock->sk; + if (sk->prot->setsockopt==NULL) + return(-EOPNOTSUPP); + return sk->prot->setsockopt(sk,level,optname,optval,optlen); +} + +/* + * Get a socket option on an AF_INET socket. + * + * FIX: POSIX 1003.1g is very ambiguous here. It states that + * asynchronous errors should be reported by getsockopt. We assume + * this means if you specify SO_ERROR (otherwise whats the point of it). + */ + +int inet_getsockopt(struct socket *sock, int level, int optname, + char *optval, int *optlen) +{ + struct sock *sk=sock->sk; + if (sk->prot->getsockopt==NULL) + return(-EOPNOTSUPP); + return sk->prot->getsockopt(sk,level,optname,optval,optlen); +} + +/* + * Automatically bind an unbound socket. + */ + +static int inet_autobind(struct sock *sk) +{ + /* We may need to bind the socket. */ + if (sk->num == 0) { + if (sk->prot->get_port(sk, 0) != 0) + return(-EAGAIN); + sk->sport = htons(sk->num); + sk->prot->hash(sk); + add_to_prot_sklist(sk); + } + return 0; +} + +/* + * Move a socket into listening state. + */ + +int inet_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + unsigned char old_state; + + if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) + return(-EINVAL); + + if ((unsigned) backlog == 0) /* BSDism */ + backlog = 1; + if ((unsigned) backlog > SOMAXCONN) + backlog = SOMAXCONN; + sk->max_ack_backlog = backlog; + + /* Really, if the socket is already in listen state + * we can only allow the backlog to be adjusted. 
+ */ + old_state = sk->state; + if (old_state != TCP_LISTEN) { + sk->state = TCP_LISTEN; + sk->ack_backlog = 0; + if (sk->num == 0) { + if (sk->prot->get_port(sk, 0) != 0) { + sk->state = old_state; + return -EAGAIN; + } + sk->sport = htons(sk->num); + add_to_prot_sklist(sk); + } else { + if (sk->prev) + ((struct tcp_bind_bucket*)sk->prev)->fastreuse = 0; + } + + dst_release(xchg(&sk->dst_cache, NULL)); + sk->prot->hash(sk); + sk->socket->flags |= SO_ACCEPTCON; + } + return 0; +} + +/* + * Create an inet socket. + * + * FIXME: Gcc would generate much better code if we set the parameters + * up in in-memory structure order. Gcc68K even more so + */ + +static int inet_create(struct socket *sock, int protocol) +{ + struct sock *sk; + struct proto *prot; + + /* Compatibility */ + if (sock->type == SOCK_PACKET) { + static int warned; + if (net_families[PF_PACKET]==NULL) + { +#if defined(CONFIG_KMOD) && defined(CONFIG_PACKET_MODULE) + char module_name[30]; + sprintf(module_name,"net-pf-%d", PF_PACKET); + request_module(module_name); + if (net_families[PF_PACKET] == NULL) +#endif + return -ESOCKTNOSUPPORT; + } + if (!warned++) + printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n", current->comm); + return net_families[PF_PACKET]->create(sock, protocol); + } + + sock->state = SS_UNCONNECTED; + sk = sk_alloc(PF_INET, GFP_KERNEL, 1); + if (sk == NULL) + goto do_oom; + + switch (sock->type) { + case SOCK_STREAM: + if (protocol && protocol != IPPROTO_TCP) + goto free_and_noproto; + protocol = IPPROTO_TCP; + if (ipv4_config.no_pmtu_disc) + sk->ip_pmtudisc = IP_PMTUDISC_DONT; + else + sk->ip_pmtudisc = IP_PMTUDISC_WANT; + prot = &tcp_prot; + sock->ops = &inet_stream_ops; + break; + case SOCK_SEQPACKET: + goto free_and_badtype; + case SOCK_DGRAM: + if (protocol && protocol != IPPROTO_UDP) + goto free_and_noproto; + protocol = IPPROTO_UDP; + sk->no_check = UDP_NO_CHECK; + sk->ip_pmtudisc = IP_PMTUDISC_DONT; + prot=&udp_prot; + sock->ops = &inet_dgram_ops; + break; + case SOCK_RAW: + if (!capable(CAP_NET_RAW)) + goto free_and_badperm; + if (!protocol) + goto free_and_noproto; + prot = &raw_prot; + sk->reuse = 1; + sk->ip_pmtudisc = IP_PMTUDISC_DONT; + sk->num = protocol; + sock->ops = &inet_dgram_ops; + if (protocol == IPPROTO_RAW) + sk->ip_hdrincl = 1; + break; + default: + goto free_and_badtype; + } + + sock_init_data(sock,sk); + + sk->destruct = NULL; + + sk->zapped=0; +#ifdef CONFIG_TCP_NAGLE_OFF + sk->nonagle = 1; +#endif + sk->family = PF_INET; + sk->protocol = protocol; + + sk->prot = prot; + sk->backlog_rcv = prot->backlog_rcv; + + sk->timer.data = (unsigned long)sk; + sk->timer.function = &net_timer; + + sk->ip_ttl=ip_statistics.IpDefaultTTL; + + sk->ip_mc_loop=1; + sk->ip_mc_ttl=1; + sk->ip_mc_index=0; + sk->ip_mc_list=NULL; + + if (sk->num) { + /* It assumes that any protocol which allows + * the user to assign a number at socket + * creation time automatically + * shares. + */ + sk->sport = htons(sk->num); + + /* Add to protocol hash chains. */ + sk->prot->hash(sk); + add_to_prot_sklist(sk); + } + + if (sk->prot->init) { + int err = sk->prot->init(sk); + if (err != 0) { + destroy_sock(sk); + return(err); + } + } + return(0); + +free_and_badtype: + sk_free(sk); + return -ESOCKTNOSUPPORT; + +free_and_badperm: + sk_free(sk); + return -EPERM; + +free_and_noproto: + sk_free(sk); + return -EPROTONOSUPPORT; + +do_oom: + return -ENOBUFS; +} + + +/* + * The peer socket should always be NULL (or else). 
When we call this + * function we are destroying the object and from then on nobody + * should refer to it. + */ + +int inet_release(struct socket *sock, struct socket *peersock) +{ + struct sock *sk = sock->sk; + + if (sk) { + long timeout; + + /* Begin closedown and wake up sleepers. */ + if (sock->state != SS_UNCONNECTED) + sock->state = SS_DISCONNECTING; + sk->state_change(sk); + + /* Applications forget to leave groups before exiting */ + ip_mc_drop_socket(sk); + + /* If linger is set, we don't return until the close + * is complete. Otherwise we return immediately. The + * actually closing is done the same either way. + * + * If the close is due to the process exiting, we never + * linger.. + */ + timeout = 0; + if (sk->linger && !(current->flags & PF_EXITING)) { + timeout = HZ * sk->lingertime; + if (!timeout) + timeout = MAX_SCHEDULE_TIMEOUT; + } + sock->sk = NULL; + sk->socket = NULL; + sk->prot->close(sk, timeout); + } + return(0); +} + +static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in *addr=(struct sockaddr_in *)uaddr; + struct sock *sk=sock->sk; + unsigned short snum; + int chk_addr_ret; + + /* If the socket has its own bind function then use it. (RAW) */ + if(sk->prot->bind) + return sk->prot->bind(sk, uaddr, addr_len); + + /* Check these errors (active socket, bad address length, double bind). */ + if ((sk->state != TCP_CLOSE) || + (addr_len < sizeof(struct sockaddr_in)) || + (sk->num != 0)) + return -EINVAL; + + chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + if (addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) { +#ifdef CONFIG_IP_TRANSPARENT_PROXY + /* Superuser may bind to any address to allow transparent proxying. */ + if(chk_addr_ret != RTN_UNICAST || !capable(CAP_NET_ADMIN)) +#endif + return -EADDRNOTAVAIL; /* Source address MUST be ours! */ + } + + /* We keep a pair of addresses. rcv_saddr is the one + * used by hash lookups, and saddr is used for transmit. + * + * In the BSD API these are the same except where it + * would be illegal to use them (multicast/broadcast) in + * which case the sending device address is used. + */ + sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr; + if(chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) + sk->saddr = 0; /* Use device */ + + snum = ntohs(addr->sin_port); +#ifdef CONFIG_IP_MASQUERADE + /* The kernel masquerader needs some ports. */ + if((snum >= PORT_MASQ_BEGIN) && (snum <= PORT_MASQ_END)) + return -EADDRINUSE; +#endif + if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + return(-EACCES); + + /* Make sure we are allowed to bind here. 
*/ + if (sk->prot->get_port(sk, snum) != 0) + return -EADDRINUSE; + + sk->sport = htons(sk->num); + sk->daddr = 0; + sk->dport = 0; + sk->prot->hash(sk); + add_to_prot_sklist(sk); + dst_release(sk->dst_cache); + sk->dst_cache=NULL; + return(0); +} + +int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, + int addr_len, int flags) +{ + struct sock *sk=sock->sk; + int err; + + if (inet_autobind(sk) != 0) + return(-EAGAIN); + if (sk->prot->connect == NULL) + return(-EOPNOTSUPP); + err = sk->prot->connect(sk, (struct sockaddr *)uaddr, addr_len); + if (err < 0) + return(err); + return(0); +} + +static void inet_wait_for_connect(struct sock *sk) +{ + struct wait_queue wait = { current, NULL }; + + add_wait_queue(sk->sleep, &wait); + current->state = TASK_INTERRUPTIBLE; + while (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) { + if (signal_pending(current)) + break; + if (sk->err) + break; + schedule(); + current->state = TASK_INTERRUPTIBLE; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); +} + +/* + * Connect to a remote host. There is regrettably still a little + * TCP 'magic' in here. + */ + +int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr, + int addr_len, int flags) +{ + struct sock *sk=sock->sk; + int err; + + if(sock->state != SS_UNCONNECTED && sock->state != SS_CONNECTING) { + if(sock->state == SS_CONNECTED) + return -EISCONN; + return -EINVAL; + } + + if(sock->state == SS_CONNECTING) { + /* Note: tcp_connected contains SYN_RECV, which may cause + bogus results here. -AK */ + if(tcp_connected(sk->state)) { + sock->state = SS_CONNECTED; + return 0; + } + if (sk->zapped || sk->err) + goto sock_error; + if (flags & O_NONBLOCK) + return -EALREADY; + } else { + if (sk->prot->connect == NULL) + return(-EOPNOTSUPP); + + /* We may need to bind the socket. */ + if (inet_autobind(sk) != 0) + return(-EAGAIN); + + err = sk->prot->connect(sk, uaddr, addr_len); + /* Note: there is a theoretical race here when an wake up + occurred before inet_wait_for_connect is entered. In 2.3 + the wait queue setup should be moved before the low level + connect call. -AK*/ + if (err < 0) + return(err); + sock->state = SS_CONNECTING; + } + + if (sk->state > TCP_FIN_WAIT2 && sock->state == SS_CONNECTING) + goto sock_error; + + if (sk->state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) + return (-EINPROGRESS); + + if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) { + inet_wait_for_connect(sk); + if (signal_pending(current)) + return -ERESTARTSYS; + } + + sock->state = SS_CONNECTED; + if ((sk->state != TCP_ESTABLISHED) && sk->err) + goto sock_error; + return(0); + +sock_error: + /* This is ugly but needed to fix a race in the ICMP error handler */ + if (sk->zapped && sk->state != TCP_CLOSE) { + lock_sock(sk); + tcp_set_state(sk, TCP_CLOSE); + release_sock(sk); + sk->zapped = 0; + } + sock->state = SS_UNCONNECTED; + return sock_error(sk); +} + +/* + * Accept a pending connection. The TCP layer now gives BSD semantics. + */ + +int inet_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk1 = sock->sk, *sk2; + struct sock *newsk = newsock->sk; + int err = -EINVAL; + + if (sock->state != SS_UNCONNECTED || !(sock->flags & SO_ACCEPTCON)) + goto do_err; + + err = -EOPNOTSUPP; + if (sk1->prot->accept == NULL) + goto do_err; + + if((sk2 = sk1->prot->accept(sk1,flags)) == NULL) + goto do_sk1_err; + + /* + * We've been passed an extra socket. 
+ * We need to free it up because the tcp module creates + * its own when it accepts one. + */ + sk2->sleep = newsk->sleep; + + newsock->sk = sk2; + sk2->socket = newsock; + newsk->socket = NULL; + + if (flags & O_NONBLOCK) + goto do_half_success; + + if(sk2->state == TCP_ESTABLISHED) + goto do_full_success; + if(sk2->err > 0) + goto do_connect_err; + err = -ECONNABORTED; + if (sk2->state == TCP_CLOSE) + goto do_bad_connection; +do_full_success: + destroy_sock(newsk); + newsock->state = SS_CONNECTED; + return 0; + +do_half_success: + destroy_sock(newsk); + return(0); + +do_connect_err: + err = sock_error(sk2); +do_bad_connection: + sk2->sleep = NULL; + sk2->socket = NULL; + destroy_sock(sk2); + newsock->sk = newsk; + newsk->socket = newsock; + return err; + +do_sk1_err: + err = sock_error(sk1); +do_err: + return err; +} + + +/* + * This does both peername and sockname. + */ + +static int inet_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sock *sk = sock->sk; + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + + sin->sin_family = AF_INET; + if (peer) { + if (!tcp_connected(sk->state)) + return(-ENOTCONN); + sin->sin_port = sk->dport; + sin->sin_addr.s_addr = sk->daddr; + } else { + __u32 addr = sk->rcv_saddr; + if (!addr) + addr = sk->saddr; + sin->sin_port = sk->sport; + sin->sin_addr.s_addr = addr; + } + *uaddr_len = sizeof(*sin); + return(0); +} + + + +int inet_recvmsg(struct socket *sock, struct msghdr *msg, int size, + int flags, struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + int addr_len = 0; + int err; + + if (sock->flags & SO_ACCEPTCON) + return(-EINVAL); + if (sk->prot->recvmsg == NULL) + return(-EOPNOTSUPP); + /* We may need to bind the socket. */ + if (inet_autobind(sk) != 0) + return(-EAGAIN); + err = sk->prot->recvmsg(sk, msg, size, flags&MSG_DONTWAIT, + flags&~MSG_DONTWAIT, &addr_len); + if (err >= 0) + msg->msg_namelen = addr_len; + return err; +} + + +int inet_sendmsg(struct socket *sock, struct msghdr *msg, int size, + struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + + if (sk->shutdown & SEND_SHUTDOWN) { + if (!(msg->msg_flags&MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 1); + return(-EPIPE); + } + if (sk->prot->sendmsg == NULL) + return(-EOPNOTSUPP); + if(sk->err) + return sock_error(sk); + + /* We may need to bind the socket. */ + if (inet_autobind(sk) != 0) + return -EAGAIN; + + return sk->prot->sendmsg(sk, msg, size); +} + + +int inet_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + + /* This should really check to make sure + * the socket is a TCP socket. (WHY AC...) + */ + how++; /* maps 0->1 has the advantage of making bit 1 rcvs and + 1->2 bit 2 snds. + 2->3 */ + if ((how & ~SHUTDOWN_MASK) || how==0) /* MAXINT->0 */ + return(-EINVAL); + if (!sk) + return(-ENOTCONN); + if (sock->state == SS_CONNECTING && sk->state == TCP_ESTABLISHED) + sock->state = SS_CONNECTED; + if (!tcp_connected(sk->state)) + return(-ENOTCONN); + sk->shutdown |= how; + if (sk->prot->shutdown) + sk->prot->shutdown(sk, how); + /* Wake up anyone sleeping in poll. */ + sk->state_change(sk); + return(0); +} + + +unsigned int inet_poll(struct file * file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + + if (sk->prot->poll == NULL) + return(0); + return sk->prot->poll(file, sock, wait); +} + +/* + * ioctl() calls you can issue on an INET socket. Most of these are + * device configuration and stuff and very rarely used. Some ioctls + * pass on to the socket itself. 
+ * + * NOTE: I like the idea of a module for the config stuff. ie ifconfig + * loads the devconfigure module does its configuring and unloads it. + * There's a good 20K of config code hanging around the kernel. + */ + +static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + int err; + int pid; + + switch(cmd) + { + case FIOSETOWN: + case SIOCSPGRP: + err = get_user(pid, (int *) arg); + if (err) + return err; + if (current->pid != pid && current->pgrp != -pid && + !capable(CAP_NET_ADMIN)) + return -EPERM; + sk->proc = pid; + return(0); + case FIOGETOWN: + case SIOCGPGRP: + return put_user(sk->proc, (int *)arg); + case SIOCGSTAMP: + if(sk->stamp.tv_sec==0) + return -ENOENT; + err = copy_to_user((void *)arg,&sk->stamp,sizeof(struct timeval)); + if (err) + err = -EFAULT; + return err; + case SIOCADDRT: + case SIOCDELRT: + case SIOCRTMSG: + return(ip_rt_ioctl(cmd,(void *) arg)); + case SIOCDARP: + case SIOCGARP: + case SIOCSARP: + return(arp_ioctl(cmd,(void *) arg)); + case SIOCDRARP: + case SIOCGRARP: + case SIOCSRARP: +#ifdef CONFIG_KMOD + if (rarp_ioctl_hook == NULL) + request_module("rarp"); +#endif + if (rarp_ioctl_hook != NULL) + return(rarp_ioctl_hook(cmd,(void *) arg)); + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCSIFPFLAGS: + case SIOCGIFPFLAGS: + case SIOCSIFFLAGS: + return(devinet_ioctl(cmd,(void *) arg)); + case SIOCGIFBR: + case SIOCSIFBR: +#ifdef CONFIG_BRIDGE + return(br_ioctl(cmd,(void *) arg)); +#else + return -ENOPKG; +#endif + + case SIOCADDDLCI: + case SIOCDELDLCI: +#ifdef CONFIG_DLCI + return(dlci_ioctl(cmd, (void *) arg)); +#endif + +#ifdef CONFIG_DLCI_MODULE + +#ifdef CONFIG_KMOD + if (dlci_ioctl_hook == NULL) + request_module("dlci"); +#endif + + if (dlci_ioctl_hook) + return((*dlci_ioctl_hook)(cmd, (void *) arg)); +#endif + return -ENOPKG; + + default: + if ((cmd >= SIOCDEVPRIVATE) && + (cmd <= (SIOCDEVPRIVATE + 15))) + return(dev_ioctl(cmd,(void *) arg)); + +#ifdef CONFIG_NET_RADIO + if((cmd >= SIOCIWFIRST) && (cmd <= SIOCIWLAST)) + return(dev_ioctl(cmd,(void *) arg)); +#endif + + if (sk->prot->ioctl==NULL || (err=sk->prot->ioctl(sk, cmd, arg))==-ENOIOCTLCMD) + return(dev_ioctl(cmd,(void *) arg)); + return err; + } + /*NOTREACHED*/ + return(0); +} + +struct proto_ops inet_stream_ops = { + PF_INET, + + sock_no_dup, + inet_release, + inet_bind, + inet_stream_connect, + sock_no_socketpair, + inet_accept, + inet_getname, + inet_poll, + inet_ioctl, + inet_listen, + inet_shutdown, + inet_setsockopt, + inet_getsockopt, + sock_no_fcntl, + inet_sendmsg, + inet_recvmsg +}; + +struct proto_ops inet_dgram_ops = { + PF_INET, + + sock_no_dup, + inet_release, + inet_bind, + inet_dgram_connect, + sock_no_socketpair, + sock_no_accept, + inet_getname, + datagram_poll, + inet_ioctl, + sock_no_listen, + inet_shutdown, + inet_setsockopt, + inet_getsockopt, + sock_no_fcntl, + inet_sendmsg, + inet_recvmsg +}; + +struct net_proto_family inet_family_ops = { + PF_INET, + inet_create +}; + + +#ifdef CONFIG_PROC_FS +#ifdef CONFIG_INET_RARP +static struct proc_dir_entry proc_net_rarp = { + PROC_NET_RARP, 4, "rarp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + rarp_get_info +}; +#endif /* RARP */ +static struct proc_dir_entry proc_net_raw = { + PROC_NET_RAW, 3, "raw", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + raw_get_info +}; +static struct proc_dir_entry 
proc_net_netstat = { + PROC_NET_NETSTAT, 7, "netstat", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + netstat_get_info +}; +static struct proc_dir_entry proc_net_snmp = { + PROC_NET_SNMP, 4, "snmp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + snmp_get_info +}; +static struct proc_dir_entry proc_net_sockstat = { + PROC_NET_SOCKSTAT, 8, "sockstat", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + afinet_get_info +}; +static struct proc_dir_entry proc_net_tcp = { + PROC_NET_TCP, 3, "tcp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + tcp_get_info +}; +static struct proc_dir_entry proc_net_udp = { + PROC_NET_UDP, 3, "udp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + udp_get_info +}; +#endif /* CONFIG_PROC_FS */ + +extern void tcp_init(void); +extern void tcp_v4_init(struct net_proto_family *); + + +/* + * Called by socket.c on kernel startup. + */ + +__initfunc(void inet_proto_init(struct net_proto *pro)) +{ + struct sk_buff *dummy_skb; + struct inet_protocol *p; + + printk(KERN_INFO "NET4: Linux TCP/IP 1.0 for NET4.0\n"); + + if (sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)) + { + printk(KERN_CRIT "inet_proto_init: panic\n"); + return; + } + + /* + * Tell SOCKET that we are alive... + */ + + (void) sock_register(&inet_family_ops); + + /* + * Add all the protocols. + */ + + printk(KERN_INFO "IP Protocols: "); + for(p = inet_protocol_base; p != NULL;) + { + struct inet_protocol *tmp = (struct inet_protocol *) p->next; + inet_add_protocol(p); + printk("%s%s",p->name,tmp?", ":"\n"); + p = tmp; + } + + /* + * Set the ARP module up + */ + + arp_init(); + + /* + * Set the IP module up + */ + + ip_init(); + + tcp_v4_init(&inet_family_ops); + + /* Setup TCP slab cache for open requests. */ + tcp_init(); + + + /* + * Set the ICMP layer up + */ + + icmp_init(&inet_family_ops); + + /* I wish inet_add_protocol had no constructor hook... + I had to move IPIP from net/ipv4/protocol.c :-( --ANK + */ +#ifdef CONFIG_NET_IPIP + ipip_init(); +#endif +#ifdef CONFIG_NET_IPGRE + ipgre_init(); +#endif + + /* + * Set the firewalling up + */ +#if defined(CONFIG_IP_FIREWALL) + ip_fw_init(); +#endif + +#ifdef CONFIG_IP_MASQUERADE + ip_masq_init(); +#endif + + /* + * Initialise the multicast router + */ +#if defined(CONFIG_IP_MROUTE) + ip_mr_init(); +#endif + +#ifdef CONFIG_INET_RARP + rarp_ioctl_hook = rarp_ioctl; +#endif + /* + * Create all the /proc entries. + */ + +#ifdef CONFIG_PROC_FS +#ifdef CONFIG_INET_RARP + proc_net_register(&proc_net_rarp); +#endif /* RARP */ + proc_net_register(&proc_net_raw); + proc_net_register(&proc_net_snmp); + proc_net_register(&proc_net_netstat); + proc_net_register(&proc_net_sockstat); + proc_net_register(&proc_net_tcp); + proc_net_register(&proc_net_udp); +#endif /* CONFIG_PROC_FS */ +} diff --git a/pfinet/linux-src/net/ipv4/arp.c b/pfinet/linux-src/net/ipv4/arp.c new file mode 100644 index 00000000..27d2f802 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/arp.c @@ -0,0 +1,1154 @@ +/* linux/net/inet/arp.c + * + * Version: $Id: arp.c,v 1.77.2.1 1999/06/28 10:39:23 davem Exp $ + * + * Copyright (C) 1994 by Florian La Roche + * + * This module implements the Address Resolution Protocol ARP (RFC 826), + * which is used to convert IP addresses (or in the future maybe other + * high-level addresses) into a low-level hardware address (like an Ethernet + * address). 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Alan Cox : Removed the Ethernet assumptions in + * Florian's code + * Alan Cox : Fixed some small errors in the ARP + * logic + * Alan Cox : Allow >4K in /proc + * Alan Cox : Make ARP add its own protocol entry + * Ross Martin : Rewrote arp_rcv() and arp_get_info() + * Stephen Henson : Add AX25 support to arp_get_info() + * Alan Cox : Drop data when a device is downed. + * Alan Cox : Use init_timer(). + * Alan Cox : Double lock fixes. + * Martin Seine : Move the arphdr structure + * to if_arp.h for compatibility. + * with BSD based programs. + * Andrew Tridgell : Added ARP netmask code and + * re-arranged proxy handling. + * Alan Cox : Changed to use notifiers. + * Niibe Yutaka : Reply for this device or proxies only. + * Alan Cox : Don't proxy across hardware types! + * Jonathan Naylor : Added support for NET/ROM. + * Mike Shaver : RFC1122 checks. + * Jonathan Naylor : Only lookup the hardware address for + * the correct hardware type. + * Germano Caronni : Assorted subtle races. + * Craig Schlenter : Don't modify permanent entry + * during arp_rcv. + * Russ Nelson : Tidied up a few bits. + * Alexey Kuznetsov: Major changes to caching and behaviour, + * eg intelligent arp probing and + * generation + * of host down events. + * Alan Cox : Missing unlock in device events. + * Eckes : ARP ioctl control errors. + * Alexey Kuznetsov: Arp free fix. + * Manuel Rodriguez: Gratuitous ARP. + * Jonathan Layes : Added arpd support through kerneld + * message queue (960314) + * Mike Shaver : /proc/sys/net/ipv4/arp_* support + * Mike McLagan : Routing by source + * Stuart Cheshire : Metricom and grat arp fixes + * *** FOR 2.1 clean this up *** + * Lawrence V. Stefani: (08/12/96) Added FDDI support. + * Alan Cox : Took the AP1000 nasty FDDI hack and + * folded into the mainstream FDDI code. + * Ack spit, Linus how did you allow that + * one in... + * Jes Sorensen : Make FDDI work again in 2.1.x and + * clean up the APFDDI & gen. FDDI bits. + * Alexey Kuznetsov: new arp state machine; + * now it is in net/core/neighbour.c. + */ + +/* RFC1122 Status: + 2.3.2.1 (ARP Cache Validation): + MUST provide mechanism to flush stale cache entries (OK) + SHOULD be able to configure cache timeout (OK) + MUST throttle ARP retransmits (OK) + 2.3.2.2 (ARP Packet Queue): + SHOULD save at least one packet from each "conversation" with an + unresolved IP address. 
(OK) + 950727 -- MS +*/ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/config.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/in.h> +#include <linux/mm.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/fddidevice.h> +#include <linux/if_arp.h> +#include <linux/trdevice.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/init.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +#include <net/ip.h> +#include <net/icmp.h> +#include <net/route.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/sock.h> +#include <net/arp.h> +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#include <net/ax25.h> +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) +#include <net/netrom.h> +#endif +#endif + +#include <asm/system.h> +#include <asm/uaccess.h> + +/* + * Interface to generic neighbour cache. + */ +static int arp_constructor(struct neighbour *neigh); +static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); +static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); +static void parp_redo(struct sk_buff *skb); + +static struct neigh_ops arp_generic_ops = +{ + AF_INET, + NULL, + arp_solicit, + arp_error_report, + neigh_resolve_output, + neigh_connected_output, + dev_queue_xmit, + dev_queue_xmit +}; + +static struct neigh_ops arp_hh_ops = +{ + AF_INET, + NULL, + arp_solicit, + arp_error_report, + neigh_resolve_output, + neigh_resolve_output, + dev_queue_xmit, + dev_queue_xmit +}; + +static struct neigh_ops arp_direct_ops = +{ + AF_INET, + NULL, + NULL, + NULL, + dev_queue_xmit, + dev_queue_xmit, + dev_queue_xmit, + dev_queue_xmit +}; + +struct neigh_ops arp_broken_ops = +{ + AF_INET, + NULL, + arp_solicit, + arp_error_report, + neigh_compat_output, + neigh_compat_output, + dev_queue_xmit, + dev_queue_xmit, +}; + +struct neigh_table arp_tbl = +{ + NULL, + AF_INET, + sizeof(struct neighbour) + 4, + 4, + arp_constructor, + NULL, + NULL, + parp_redo, + { NULL, NULL, &arp_tbl, 0, NULL, NULL, + 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 64, 1*HZ }, + 30*HZ, 128, 512, 1024, +}; + +int arp_mc_map(u32 addr, u8 *haddr, struct device *dev, int dir) +{ + switch (dev->type) { + case ARPHRD_ETHER: + case ARPHRD_IEEE802: + case ARPHRD_FDDI: + ip_eth_mc_map(addr, haddr); + return 0; + default: + if (dir) { + memcpy(haddr, dev->broadcast, dev->addr_len); + return 0; + } + } + return -EINVAL; +} + + + +static int arp_constructor(struct neighbour *neigh) +{ + u32 addr = *(u32*)neigh->primary_key; + struct device *dev = neigh->dev; + struct in_device *in_dev = dev->ip_ptr; + + if (in_dev == NULL) + return -EINVAL; + + neigh->type = inet_addr_type(addr); + if (in_dev->arp_parms) + neigh->parms = in_dev->arp_parms; + + if (dev->hard_header == NULL) { + neigh->nud_state = NUD_NOARP; + neigh->ops = &arp_direct_ops; + neigh->output = neigh->ops->queue_xmit; + } else { + /* Good devices (checked by reading texts, but only Ethernet is + tested) + + ARPHRD_ETHER: (ethernet, apfddi) + ARPHRD_FDDI: (fddi) + ARPHRD_IEEE802: (tr) + ARPHRD_METRICOM: (strip) + ARPHRD_ARCNET: + etc. etc. etc. + + ARPHRD_IPDDP will also work, if author repairs it. + I did not it, because this driver does not work even + in old paradigm. + */ + +#if 1 + /* So... these "amateur" devices are hopeless. 
+ The only thing, that I can say now: + It is very sad that we need to keep ugly obsolete + code to make them happy. + + They should be moved to more reasonable state, now + they use rebuild_header INSTEAD OF hard_start_xmit!!! + Besides that, they are sort of out of date + (a lot of redundant clones/copies, useless in 2.1), + I wonder why people believe that they work. + */ + switch (dev->type) { + default: + break; + case ARPHRD_ROSE: +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + case ARPHRD_AX25: +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + case ARPHRD_NETROM: +#endif + neigh->ops = &arp_broken_ops; + neigh->output = neigh->ops->output; + return 0; +#endif + } +#endif + if (neigh->type == RTN_MULTICAST) { + neigh->nud_state = NUD_NOARP; + arp_mc_map(addr, neigh->ha, dev, 1); + } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->dev_addr, dev->addr_len); + } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->broadcast, dev->addr_len); + } + if (dev->hard_header_cache) + neigh->ops = &arp_hh_ops; + else + neigh->ops = &arp_generic_ops; + if (neigh->nud_state&NUD_VALID) + neigh->output = neigh->ops->connected_output; + else + neigh->output = neigh->ops->output; + } + + return 0; +} + +static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) +{ + dst_link_failure(skb); + kfree_skb(skb); +} + +static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) +{ + u32 saddr; + u8 *dst_ha = NULL; + struct device *dev = neigh->dev; + u32 target = *(u32*)neigh->primary_key; + int probes = neigh->probes; + + if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL) + saddr = skb->nh.iph->saddr; + else + saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); + + if ((probes -= neigh->parms->ucast_probes) < 0) { + if (!(neigh->nud_state&NUD_VALID)) + printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); + dst_ha = neigh->ha; + } else if ((probes -= neigh->parms->app_probes) < 0) { +#ifdef CONFIG_ARPD + neigh_app_ns(neigh); +#endif + return; + } + + arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, + dst_ha, dev->dev_addr, NULL); +} + +/* OBSOLETE FUNCTIONS */ + +/* + * Find an arp mapping in the cache. If not found, post a request. + * + * It is very UGLY routine: it DOES NOT use skb->dst->neighbour, + * even if it exists. It is supposed that skb->dev was mangled + * by a virtual device (eql, shaper). Nobody but broken devices + * is allowed to use this function, it is scheduled to be removed. 
--ANK + */ + +static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, struct device * dev) +{ + switch (addr_hint) { + case RTN_LOCAL: + printk(KERN_DEBUG "ARP: arp called for own IP address\n"); + memcpy(haddr, dev->dev_addr, dev->addr_len); + return 1; + case RTN_MULTICAST: + arp_mc_map(paddr, haddr, dev, 1); + return 1; + case RTN_BROADCAST: + memcpy(haddr, dev->broadcast, dev->addr_len); + return 1; + } + return 0; +} + + +int arp_find(unsigned char *haddr, struct sk_buff *skb) +{ + struct device *dev = skb->dev; + u32 paddr; + struct neighbour *n; + + if (!skb->dst) { + printk(KERN_DEBUG "arp_find is called with dst==NULL\n"); + kfree_skb(skb); + return 1; + } + + paddr = ((struct rtable*)skb->dst)->rt_gateway; + + if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev)) + return 0; + + start_bh_atomic(); + n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); + + if (n) { + n->used = jiffies; + if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) { + memcpy(haddr, n->ha, dev->addr_len); + neigh_release(n); + end_bh_atomic(); + return 0; + } + neigh_release(n); + } else + kfree_skb(skb); + end_bh_atomic(); + return 1; +} + +/* END OF OBSOLETE FUNCTIONS */ + +/* + * Note: requires bh_atomic locking. + */ +int arp_bind_neighbour(struct dst_entry *dst) +{ + struct device *dev = dst->dev; + + if (dev == NULL) + return 0; + if (dst->neighbour == NULL) { + u32 nexthop = ((struct rtable*)dst)->rt_gateway; + if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT)) + nexthop = 0; + dst->neighbour = __neigh_lookup(&arp_tbl, &nexthop, dev, 1); + } + return (dst->neighbour != NULL); +} + +/* + * Interface to link layer: send routine and receive handler. + */ + +/* + * Create and send an arp packet. If (dest_hw == NULL), we create a broadcast + * message. + */ + +void arp_send(int type, int ptype, u32 dest_ip, + struct device *dev, u32 src_ip, + unsigned char *dest_hw, unsigned char *src_hw, + unsigned char *target_hw) +{ + struct sk_buff *skb; + struct arphdr *arp; + unsigned char *arp_ptr; + + /* + * No arp on this interface. + */ + + if (dev->flags&IFF_NOARP) + return; + + /* + * Allocate a buffer + */ + + skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4) + + dev->hard_header_len + 15, GFP_ATOMIC); + if (skb == NULL) + return; + + skb_reserve(skb, (dev->hard_header_len+15)&~15); + skb->nh.raw = skb->data; + arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4)); + skb->dev = dev; + skb->protocol = __constant_htons (ETH_P_ARP); + if (src_hw == NULL) + src_hw = dev->dev_addr; + if (dest_hw == NULL) + dest_hw = dev->broadcast; + + /* + * Fill the device header for the ARP frame + */ + dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len); + + /* + * Fill out the arp protocol part. + * + * The arp hardware type should match the device type, except for FDDI, + * which (according to RFC 1390) should always equal 1 (Ethernet). + */ + /* + * Exceptions everywhere. AX.25 uses the AX.25 PID value not the + * DIX code for the protocol. Make these device structure fields. 
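+ *
+ *	For reference, the payload assembled below follows the RFC 826
+ *	layout; the byte counts in parentheses are for the common
+ *	Ethernet/IPv4 case and are given for illustration only:
+ *
+ *	    struct arphdr             ar_hrd, ar_pro, ar_hln, ar_pln, ar_op
+ *	    sender hardware address   ar_hln bytes (6)
+ *	    sender IP address         ar_pln bytes (4)
+ *	    target hardware address   ar_hln bytes (6, zero-filled when unknown)
+ *	    target IP address         ar_pln bytes (4)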
+ */ + switch (dev->type) { + default: + arp->ar_hrd = htons(dev->type); + arp->ar_pro = __constant_htons(ETH_P_IP); + break; + +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + case ARPHRD_AX25: + arp->ar_hrd = __constant_htons(ARPHRD_AX25); + arp->ar_pro = __constant_htons(AX25_P_IP); + break; + +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + case ARPHRD_NETROM: + arp->ar_hrd = __constant_htons(ARPHRD_NETROM); + arp->ar_pro = __constant_htons(AX25_P_IP); + break; +#endif +#endif + +#ifdef CONFIG_FDDI + case ARPHRD_FDDI: + arp->ar_hrd = __constant_htons(ARPHRD_ETHER); + arp->ar_pro = __constant_htons(ETH_P_IP); + break; +#endif + } + + arp->ar_hln = dev->addr_len; + arp->ar_pln = 4; + arp->ar_op = htons(type); + + arp_ptr=(unsigned char *)(arp+1); + + memcpy(arp_ptr, src_hw, dev->addr_len); + arp_ptr+=dev->addr_len; + memcpy(arp_ptr, &src_ip,4); + arp_ptr+=4; + if (target_hw != NULL) + memcpy(arp_ptr, target_hw, dev->addr_len); + else + memset(arp_ptr, 0, dev->addr_len); + arp_ptr+=dev->addr_len; + memcpy(arp_ptr, &dest_ip, 4); + skb->dev = dev; + + dev_queue_xmit(skb); +} + +static void parp_redo(struct sk_buff *skb) +{ + arp_rcv(skb, skb->dev, NULL); +} + +/* + * Receive an arp request by the device layer. + */ + +int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) +{ + struct arphdr *arp = skb->nh.arph; + unsigned char *arp_ptr= (unsigned char *)(arp+1); + struct rtable *rt; + unsigned char *sha, *tha; + u32 sip, tip; + u16 dev_type = dev->type; + int addr_type; + struct in_device *in_dev = dev->ip_ptr; + struct neighbour *n; + +/* + * The hardware length of the packet should match the hardware length + * of the device. Similarly, the hardware types should match. The + * device should be ARP-able. Also, if pln is not 4, then the lookup + * is not from an IP number. We can't currently handle this, so toss + * it. + */ + if (in_dev == NULL || + arp->ar_hln != dev->addr_len || + dev->flags & IFF_NOARP || + skb->pkt_type == PACKET_OTHERHOST || + skb->pkt_type == PACKET_LOOPBACK || + arp->ar_pln != 4) + goto out; + + switch (dev_type) { + default: + if (arp->ar_pro != __constant_htons(ETH_P_IP)) + goto out; + if (htons(dev_type) != arp->ar_hrd) + goto out; + break; +#ifdef CONFIG_NET_ETHERNET + case ARPHRD_ETHER: + /* + * ETHERNET devices will accept ARP hardware types of either + * 1 (Ethernet) or 6 (IEEE 802.2). + */ + if (arp->ar_hrd != __constant_htons(ARPHRD_ETHER) && + arp->ar_hrd != __constant_htons(ARPHRD_IEEE802)) + goto out; + if (arp->ar_pro != __constant_htons(ETH_P_IP)) + goto out; + break; +#endif +#ifdef CONFIG_FDDI + case ARPHRD_FDDI: + /* + * According to RFC 1390, FDDI devices should accept ARP hardware types + * of 1 (Ethernet). However, to be more robust, we'll accept hardware + * types of either 1 (Ethernet) or 6 (IEEE 802.2). 
+ */ + if (arp->ar_hrd != __constant_htons(ARPHRD_ETHER) && + arp->ar_hrd != __constant_htons(ARPHRD_IEEE802)) + goto out; + if (arp->ar_pro != __constant_htons(ETH_P_IP)) + goto out; + break; +#endif +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + case ARPHRD_AX25: + if (arp->ar_pro != __constant_htons(AX25_P_IP)) + goto out; + if (arp->ar_hrd != __constant_htons(ARPHRD_AX25)) + goto out; + break; +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + case ARPHRD_NETROM: + if (arp->ar_pro != __constant_htons(AX25_P_IP)) + goto out; + if (arp->ar_hrd != __constant_htons(ARPHRD_NETROM)) + goto out; + break; +#endif +#endif + } + + /* Undertsand only these message types */ + + if (arp->ar_op != __constant_htons(ARPOP_REPLY) && + arp->ar_op != __constant_htons(ARPOP_REQUEST)) + goto out; + +/* + * Extract fields + */ + sha=arp_ptr; + arp_ptr += dev->addr_len; + memcpy(&sip, arp_ptr, 4); + arp_ptr += 4; + tha=arp_ptr; + arp_ptr += dev->addr_len; + memcpy(&tip, arp_ptr, 4); +/* + * Check for bad requests for 127.x.x.x and requests for multicast + * addresses. If this is one such, delete it. + */ + if (LOOPBACK(tip) || MULTICAST(tip)) + goto out; + +/* + * Process entry. The idea here is we want to send a reply if it is a + * request for us or if it is a request for someone else that we hold + * a proxy for. We want to add an entry to our cache if it is a reply + * to us or if it is a request for our address. + * (The assumption for this last is that if someone is requesting our + * address, they are probably intending to talk to us, so it saves time + * if we cache their address. Their address is also probably not in + * our cache, since ours is not in their cache.) + * + * Putting this another way, we only care about replies if they are to + * us, in which case we add them to the cache. For requests, we care + * about those for us and those for our proxies. We reply to both, + * and in the case of requests for us we add the requester to the arp + * cache. + */ + + /* Special case: IPv4 duplicate address detection packet (RFC2131) */ + if (sip == 0) { + if (arp->ar_op == __constant_htons(ARPOP_REQUEST) && + inet_addr_type(tip) == RTN_LOCAL) + arp_send(ARPOP_REPLY,ETH_P_ARP,tip,dev,tip,sha,dev->dev_addr,dev->dev_addr); + goto out; + } + + if (arp->ar_op == __constant_htons(ARPOP_REQUEST) && + ip_route_input(skb, tip, sip, 0, dev) == 0) { + + rt = (struct rtable*)skb->dst; + addr_type = rt->rt_type; + + if (addr_type == RTN_LOCAL) { + n = neigh_event_ns(&arp_tbl, sha, &sip, dev); + if (n) { + arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + neigh_release(n); + } + goto out; + } else if (IN_DEV_FORWARD(in_dev)) { + if ((rt->rt_flags&RTCF_DNAT) || + (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && + (IN_DEV_PROXY_ARP(in_dev) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) { + n = neigh_event_ns(&arp_tbl, sha, &sip, dev); + neigh_release(n); + + if (skb->stamp.tv_sec == 0 || + skb->pkt_type == PACKET_HOST || + in_dev->arp_parms->proxy_delay == 0) { + arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + } else { + pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); + return 0; + } + goto out; + } + } + } + + /* Update our ARP tables */ + + n = __neigh_lookup(&arp_tbl, &sip, dev, 0); + +#ifdef CONFIG_IP_ACCEPT_UNSOLICITED_ARP + /* Unsolicited ARP is not accepted by default. 
+ It is possible, that this option should be enabled for some + devices (strip is candidate) + */ + if (n == NULL && + arp->ar_op == __constant_htons(ARPOP_REPLY) && + inet_addr_type(sip) == RTN_UNICAST) + n = __neigh_lookup(&arp_tbl, &sip, dev, -1); +#endif + + if (n) { + int state = NUD_REACHABLE; + int override = 0; + + /* If several different ARP replies follows back-to-back, + use the FIRST one. It is possible, if several proxy + agents are active. Taking the first reply prevents + arp trashing and chooses the fastest router. + */ + if (jiffies - n->updated >= n->parms->locktime) + override = 1; + + /* Broadcast replies and request packets + do not assert neighbour reachability. + */ + if (arp->ar_op != __constant_htons(ARPOP_REPLY) || + skb->pkt_type != PACKET_HOST) + state = NUD_STALE; + neigh_update(n, sha, state, override, 1); + neigh_release(n); + } + +out: + kfree_skb(skb); + return 0; +} + + + +/* + * User level interface (ioctl, /proc) + */ + +/* + * Set (create) an ARP cache entry. + */ + +int arp_req_set(struct arpreq *r, struct device * dev) +{ + u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; + struct neighbour *neigh; + int err; + + if (r->arp_flags&ATF_PUBL) { + u32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr; + if (mask && mask != 0xFFFFFFFF) + return -EINVAL; + if (!dev && (r->arp_flags & ATF_COM)) { + dev = dev_getbyhwaddr(r->arp_ha.sa_family, r->arp_ha.sa_data); + if (!dev) + return -ENODEV; + } + if (mask) { + if (pneigh_lookup(&arp_tbl, &ip, dev, 1) == NULL) + return -ENOBUFS; + return 0; + } + if (dev == NULL) { + ipv4_devconf.proxy_arp = 1; + return 0; + } + if (dev->ip_ptr) { + ((struct in_device*)dev->ip_ptr)->cnf.proxy_arp = 1; + return 0; + } + return -ENXIO; + } + + if (r->arp_flags & ATF_PERM) + r->arp_flags |= ATF_COM; + if (dev == NULL) { + struct rtable * rt; + if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0)) != 0) + return err; + dev = rt->u.dst.dev; + ip_rt_put(rt); + if (!dev) + return -EINVAL; + } + if (r->arp_ha.sa_family != dev->type) + return -EINVAL; + + err = -ENOBUFS; + start_bh_atomic(); + neigh = __neigh_lookup(&arp_tbl, &ip, dev, 1); + if (neigh) { + unsigned state = NUD_STALE; + if (r->arp_flags & ATF_PERM) + state = NUD_PERMANENT; + err = neigh_update(neigh, (r->arp_flags&ATF_COM) ? + r->arp_ha.sa_data : NULL, state, 1, 0); + neigh_release(neigh); + } + end_bh_atomic(); + return err; +} + +static unsigned arp_state_to_flags(struct neighbour *neigh) +{ + unsigned flags = 0; + if (neigh->nud_state&NUD_PERMANENT) + flags = ATF_PERM|ATF_COM; + else if (neigh->nud_state&NUD_VALID) + flags = ATF_COM; + return flags; +} + +/* + * Get an ARP cache entry. 
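+ *
+ *	(Illustration: from user space this is reached through the SIOCGARP
+ *	ioctl on an AF_INET socket, roughly
+ *
+ *	    struct arpreq r;
+ *	    memset(&r, 0, sizeof(r));
+ *	    ((struct sockaddr_in *)&r.arp_pa)->sin_family = AF_INET;
+ *	    ((struct sockaddr_in *)&r.arp_pa)->sin_addr.s_addr = inet_addr("10.0.0.1");
+ *	    strcpy(r.arp_dev, "eth0");
+ *	    ioctl(fd, SIOCGARP, &r);      -- fills r.arp_ha on success
+ *
+ *	where the address and interface name are examples only.)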
+ */ + +static int arp_req_get(struct arpreq *r, struct device *dev) +{ + u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; + struct neighbour *neigh; + int err = -ENXIO; + + start_bh_atomic(); + neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0); + if (neigh) { + memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len); + r->arp_ha.sa_family = dev->type; + strncpy(r->arp_dev, dev->name, sizeof(r->arp_dev)); + r->arp_flags = arp_state_to_flags(neigh); + neigh_release(neigh); + err = 0; + } + end_bh_atomic(); + return err; +} + +int arp_req_delete(struct arpreq *r, struct device * dev) +{ + int err; + u32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + struct neighbour *neigh; + + if (r->arp_flags & ATF_PUBL) { + u32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr; + if (mask == 0xFFFFFFFF) + return pneigh_delete(&arp_tbl, &ip, dev); + if (mask == 0) { + if (dev == NULL) { + ipv4_devconf.proxy_arp = 0; + return 0; + } + if (dev->ip_ptr) { + ((struct in_device*)dev->ip_ptr)->cnf.proxy_arp = 0; + return 0; + } + return -ENXIO; + } + return -EINVAL; + } + + if (dev == NULL) { + struct rtable * rt; + if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0)) != 0) + return err; + dev = rt->u.dst.dev; + ip_rt_put(rt); + if (!dev) + return -EINVAL; + } + err = -ENXIO; + start_bh_atomic(); + neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0); + if (neigh) { + if (neigh->nud_state&~NUD_NOARP) + err = neigh_update(neigh, NULL, NUD_FAILED, 1, 0); + neigh_release(neigh); + } + end_bh_atomic(); + return err; +} + +/* + * Handle an ARP layer I/O control request. + */ + +int arp_ioctl(unsigned int cmd, void *arg) +{ + int err; + struct arpreq r; + struct device * dev = NULL; + + switch(cmd) { + case SIOCDARP: + case SIOCSARP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + case SIOCGARP: + err = copy_from_user(&r, arg, sizeof(struct arpreq)); + if (err) + return -EFAULT; + break; + default: + return -EINVAL; + } + + if (r.arp_pa.sa_family != AF_INET) + return -EPFNOSUPPORT; + + if (!(r.arp_flags & ATF_PUBL) && + (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB))) + return -EINVAL; + if (!(r.arp_flags & ATF_NETMASK)) + ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr=__constant_htonl(0xFFFFFFFFUL); + + rtnl_lock(); + if (r.arp_dev[0]) { + err = -ENODEV; + if ((dev = dev_get(r.arp_dev)) == NULL) + goto out; + + /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ + if (!r.arp_ha.sa_family) + r.arp_ha.sa_family = dev->type; + err = -EINVAL; + if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type) + goto out; + } else if (cmd == SIOCGARP) { + err = -ENODEV; + goto out; + } + + switch(cmd) { + case SIOCDARP: + err = arp_req_delete(&r, dev); + break; + case SIOCSARP: + err = arp_req_set(&r, dev); + break; + case SIOCGARP: + err = arp_req_get(&r, dev); + if (!err && copy_to_user(arg, &r, sizeof(r))) + err = -EFAULT; + break; + } +out: + rtnl_unlock(); + return err; +} + +/* + * Write the contents of the ARP cache to a PROCfs file. 
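+ *
+ *	For illustration, an entry in the format produced below looks like
+ *	(values made up; HW type 0x1 is ARPHRD_ETHER, flags 0x2 is ATF_COM):
+ *
+ *	    IP address       HW type     Flags       HW address            Mask     Device
+ *	    10.0.0.1         0x1         0x2         00:50:56:01:02:03     *        eth0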
+ */ +#ifdef CONFIG_PROC_FS + +#define HBUFFERLEN 30 + +int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + int len=0; + off_t pos=0; + int size; + char hbuffer[HBUFFERLEN]; + int i,j,k; + const char hexbuf[] = "0123456789ABCDEF"; + + size = sprintf(buffer,"IP address HW type Flags HW address Mask Device\n"); + + pos+=size; + len+=size; + + neigh_table_lock(&arp_tbl); + + for(i=0; i<=NEIGH_HASHMASK; i++) { + struct neighbour *n; + for (n=arp_tbl.hash_buckets[i]; n; n=n->next) { + struct device *dev = n->dev; + int hatype = dev->type; + + /* Do not confuse users "arp -a" with magic entries */ + if (!(n->nud_state&~NUD_NOARP)) + continue; + + /* I'd get great pleasure deleting + this ugly code. Let's output it in hexadecimal format. + "arp" utility will eventually repaired --ANK + */ +#if 1 /* UGLY CODE */ +/* + * Convert hardware address to XX:XX:XX:XX ... form. + */ +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) + strcpy(hbuffer,ax2asc((ax25_address *)n->ha)); + else { +#endif + for (k=0,j=0;k<HBUFFERLEN-3 && j<dev->addr_len;j++) { + hbuffer[k++]=hexbuf[(n->ha[j]>>4)&15 ]; + hbuffer[k++]=hexbuf[n->ha[j]&15 ]; + hbuffer[k++]=':'; + } + hbuffer[--k]=0; + +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + } +#endif +#else + if ((neigh->nud_state&NUD_VALID) && dev->addr_len) { + int j; + for (j=0; j < dev->addr_len; j++) + sprintf(hbuffer+2*j, "%02x", neigh->ha[j]); + } else + sprintf(hbuffer, "0"); +#endif + + size = sprintf(buffer+len, + "%-17s0x%-10x0x%-10x%s", + in_ntoa(*(u32*)n->primary_key), + hatype, + arp_state_to_flags(n), + hbuffer); + size += sprintf(buffer+len+size, + " %-17s %s\n", + "*", dev->name); + + len += size; + pos += size; + + if (pos <= offset) + len=0; + if (pos >= offset+length) + goto done; + } + } + + for (i=0; i<=PNEIGH_HASHMASK; i++) { + struct pneigh_entry *n; + for (n=arp_tbl.phash_buckets[i]; n; n=n->next) { + struct device *dev = n->dev; + int hatype = dev ? dev->type : 0; + + size = sprintf(buffer+len, + "%-17s0x%-10x0x%-10x%s", + in_ntoa(*(u32*)n->key), + hatype, + ATF_PUBL|ATF_PERM, + "00:00:00:00:00:00"); + size += sprintf(buffer+len+size, + " %-17s %s\n", + "*", dev ? dev->name : "*"); + + len += size; + pos += size; + + if (pos <= offset) + len=0; + if (pos >= offset+length) + goto done; + } + } + +done: + neigh_table_unlock(&arp_tbl); + + *start = buffer+len-(pos-offset); /* Start of wanted data */ + len = pos-offset; /* Start slop */ + if (len>length) + len = length; /* Ending slop */ + if (len<0) + len = 0; + return len; +} +#endif + +/* Note, that it is not on notifier chain. + It is necessary, that this routine was called after route cache will be + flushed. + */ +void arp_ifdown(struct device *dev) +{ + neigh_ifdown(&arp_tbl, dev); +} + + +/* + * Called once on startup. 
+ */ + +static struct packet_type arp_packet_type = +{ + __constant_htons(ETH_P_ARP), + NULL, /* All devices */ + arp_rcv, + NULL, + NULL +}; + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_arp = { + PROC_NET_ARP, 3, "arp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + arp_get_info +}; +#endif + +__initfunc(void arp_init (void)) +{ + neigh_table_init(&arp_tbl); + + dev_add_pack(&arp_packet_type); + +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_arp); +#endif +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4, NET_IPV4_NEIGH, "ipv4"); +#endif +} + + +#ifdef CONFIG_AX25_MODULE + +/* + * ax25 -> ASCII conversion + */ +char *ax2asc(ax25_address *a) +{ + static char buf[11]; + char c, *s; + int n; + + for (n = 0, s = buf; n < 6; n++) { + c = (a->ax25_call[n] >> 1) & 0x7F; + + if (c != ' ') *s++ = c; + } + + *s++ = '-'; + + if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { + *s++ = '1'; + n -= 10; + } + + *s++ = n + '0'; + *s++ = '\0'; + + if (*buf == '\0' || *buf == '-') + return "*"; + + return buf; + +} + +#endif diff --git a/pfinet/linux-src/net/ipv4/devinet.c b/pfinet/linux-src/net/ipv4/devinet.c new file mode 100644 index 00000000..a50ee3bd --- /dev/null +++ b/pfinet/linux-src/net/ipv4/devinet.c @@ -0,0 +1,1034 @@ +/* + * NET3 IP device support routines. + * + * Version: $Id: devinet.c,v 1.28.2.2 1999/08/07 10:56:18 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Derived from the IP parts of dev.c 1.0.19 + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * + * Additional Authors: + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Changes: + * Alexey Kuznetsov: pa_* fields are replaced with ifaddr lists. 
+ * Cyrus Durgin: updated for kmod + */ + +#include <linux/config.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/notifier.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif +#ifdef CONFIG_KMOD +#include <linux/kmod.h> +#endif + +#include <net/ip.h> +#include <net/route.h> +#include <net/ip_fib.h> + +struct ipv4_devconf ipv4_devconf = { 1, 1, 1, 1, 0, }; +static struct ipv4_devconf ipv4_devconf_dflt = { 1, 1, 1, 1, 1, }; + +#ifdef CONFIG_RTNETLINK +static void rtmsg_ifa(int event, struct in_ifaddr *); +#else +#define rtmsg_ifa(a,b) do { } while(0) +#endif + +static struct notifier_block *inetaddr_chain; +static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy); +#ifdef CONFIG_SYSCTL +static void devinet_sysctl_register(struct in_device *in_dev, struct ipv4_devconf *p); +static void devinet_sysctl_unregister(struct ipv4_devconf *p); +#endif + +int inet_ifa_count; +int inet_dev_count; + +static struct in_ifaddr * inet_alloc_ifa(void) +{ + struct in_ifaddr *ifa; + + ifa = kmalloc(sizeof(*ifa), GFP_KERNEL); + if (ifa) { + memset(ifa, 0, sizeof(*ifa)); + inet_ifa_count++; + } + + return ifa; +} + +static __inline__ void inet_free_ifa(struct in_ifaddr *ifa) +{ + kfree_s(ifa, sizeof(*ifa)); + inet_ifa_count--; +} + +struct in_device *inetdev_init(struct device *dev) +{ + struct in_device *in_dev; + + if (dev->mtu < 68) + return NULL; + + in_dev = kmalloc(sizeof(*in_dev), GFP_KERNEL); + if (!in_dev) + return NULL; + inet_dev_count++; + memset(in_dev, 0, sizeof(*in_dev)); + memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf)); + in_dev->cnf.sysctl = NULL; + in_dev->dev = dev; + if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL) { + kfree(in_dev); + return NULL; + } +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4, NET_IPV4_NEIGH, "ipv4"); +#endif + dev->ip_ptr = in_dev; +#ifdef CONFIG_SYSCTL + devinet_sysctl_register(in_dev, &in_dev->cnf); +#endif + if (dev->flags&IFF_UP) + ip_mc_up(in_dev); + return in_dev; +} + +static void inetdev_destroy(struct in_device *in_dev) +{ + struct in_ifaddr *ifa; + + ip_mc_destroy_dev(in_dev); + + while ((ifa = in_dev->ifa_list) != NULL) { + inet_del_ifa(in_dev, &in_dev->ifa_list, 0); + inet_free_ifa(ifa); + } + +#ifdef CONFIG_SYSCTL + devinet_sysctl_unregister(&in_dev->cnf); +#endif + in_dev->dev->ip_ptr = NULL; + synchronize_bh(); + neigh_parms_release(&arp_tbl, in_dev->arp_parms); + kfree(in_dev); +} + +struct in_ifaddr * inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b) +{ + for_primary_ifa(in_dev) { + if (inet_ifa_match(a, ifa)) { + if (!b || inet_ifa_match(b, ifa)) + return ifa; + } + } endfor_ifa(in_dev); + return NULL; +} + +static void +inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy) +{ + struct in_ifaddr *ifa1 = *ifap; + + /* 1. 
Deleting primary ifaddr forces deletion all secondaries */ + + if (!(ifa1->ifa_flags&IFA_F_SECONDARY)) { + struct in_ifaddr *ifa; + struct in_ifaddr **ifap1 = &ifa1->ifa_next; + + while ((ifa=*ifap1) != NULL) { + if (!(ifa->ifa_flags&IFA_F_SECONDARY) || + ifa1->ifa_mask != ifa->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, ifa)) { + ifap1 = &ifa->ifa_next; + continue; + } + *ifap1 = ifa->ifa_next; + synchronize_bh(); + + rtmsg_ifa(RTM_DELADDR, ifa); + notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); + inet_free_ifa(ifa); + } + } + + /* 2. Unlink it */ + + *ifap = ifa1->ifa_next; + synchronize_bh(); + + /* 3. Announce address deletion */ + + /* Send message first, then call notifier. + At first sight, FIB update triggered by notifier + will refer to already deleted ifaddr, that could confuse + netlink listeners. It is not true: look, gated sees + that route deleted and if it still thinks that ifaddr + is valid, it will try to restore deleted routes... Grr. + So that, this order is correct. + */ + rtmsg_ifa(RTM_DELADDR, ifa1); + notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); + if (destroy) { + inet_free_ifa(ifa1); + if (in_dev->ifa_list == NULL) + inetdev_destroy(in_dev); + } +} + +static int +inet_insert_ifa(struct in_device *in_dev, struct in_ifaddr *ifa) +{ + struct in_ifaddr *ifa1, **ifap, **last_primary; + + if (ifa->ifa_local == 0) { + inet_free_ifa(ifa); + return 0; + } + + ifa->ifa_flags &= ~IFA_F_SECONDARY; + last_primary = &in_dev->ifa_list; + + for (ifap=&in_dev->ifa_list; (ifa1=*ifap)!=NULL; ifap=&ifa1->ifa_next) { + if (!(ifa1->ifa_flags&IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope) + last_primary = &ifa1->ifa_next; + if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) { + if (ifa1->ifa_local == ifa->ifa_local) { + inet_free_ifa(ifa); + return -EEXIST; + } + if (ifa1->ifa_scope != ifa->ifa_scope) { + inet_free_ifa(ifa); + return -EINVAL; + } + ifa->ifa_flags |= IFA_F_SECONDARY; + } + } + + if (!(ifa->ifa_flags&IFA_F_SECONDARY)) { + net_srandom(ifa->ifa_local); + ifap = last_primary; + } + + ifa->ifa_next = *ifap; + wmb(); + *ifap = ifa; + + /* Send message first, then call notifier. 
+ Notifier will trigger FIB update, so that + listeners of netlink will know about new ifaddr */ + rtmsg_ifa(RTM_NEWADDR, ifa); + notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); + + return 0; +} + +static int +inet_set_ifa(struct device *dev, struct in_ifaddr *ifa) +{ + struct in_device *in_dev = dev->ip_ptr; + + if (in_dev == NULL) { + in_dev = inetdev_init(dev); + if (in_dev == NULL) { + inet_free_ifa(ifa); + return -ENOBUFS; + } + } + ifa->ifa_dev = in_dev; + if (LOOPBACK(ifa->ifa_local)) + ifa->ifa_scope = RT_SCOPE_HOST; + return inet_insert_ifa(in_dev, ifa); +} + +struct in_device *inetdev_by_index(int ifindex) +{ + struct device *dev; + dev = dev_get_by_index(ifindex); + if (dev) + return dev->ip_ptr; + return NULL; +} + +struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, u32 mask) +{ + for_primary_ifa(in_dev) { + if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) + return ifa; + } endfor_ifa(in_dev); + return NULL; +} + +#ifdef CONFIG_RTNETLINK + +/* rtm_{add|del} functions are not reenterable, so that + this structure can be made static + */ + +int +inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct in_device *in_dev; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in_ifaddr *ifa, **ifap; + + if ((in_dev = inetdev_by_index(ifm->ifa_index)) == NULL) + return -EADDRNOTAVAIL; + + for (ifap=&in_dev->ifa_list; (ifa=*ifap)!=NULL; ifap=&ifa->ifa_next) { + if ((rta[IFA_LOCAL-1] && memcmp(RTA_DATA(rta[IFA_LOCAL-1]), &ifa->ifa_local, 4)) || + (rta[IFA_LABEL-1] && strcmp(RTA_DATA(rta[IFA_LABEL-1]), ifa->ifa_label)) || + (rta[IFA_ADDRESS-1] && + (ifm->ifa_prefixlen != ifa->ifa_prefixlen || + !inet_ifa_match(*(u32*)RTA_DATA(rta[IFA_ADDRESS-1]), ifa)))) + continue; + inet_del_ifa(in_dev, ifap, 1); + return 0; + } + + return -EADDRNOTAVAIL; +} + +int +inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct device *dev; + struct in_device *in_dev; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in_ifaddr *ifa; + + if (ifm->ifa_prefixlen > 32 || rta[IFA_LOCAL-1] == NULL) + return -EINVAL; + + if ((dev = dev_get_by_index(ifm->ifa_index)) == NULL) + return -ENODEV; + + if ((in_dev = dev->ip_ptr) == NULL) { + in_dev = inetdev_init(dev); + if (!in_dev) + return -ENOBUFS; + } + + if ((ifa = inet_alloc_ifa()) == NULL) + return -ENOBUFS; + + if (rta[IFA_ADDRESS-1] == NULL) + rta[IFA_ADDRESS-1] = rta[IFA_LOCAL-1]; + memcpy(&ifa->ifa_local, RTA_DATA(rta[IFA_LOCAL-1]), 4); + memcpy(&ifa->ifa_address, RTA_DATA(rta[IFA_ADDRESS-1]), 4); + ifa->ifa_prefixlen = ifm->ifa_prefixlen; + ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); + if (rta[IFA_BROADCAST-1]) + memcpy(&ifa->ifa_broadcast, RTA_DATA(rta[IFA_BROADCAST-1]), 4); + if (rta[IFA_ANYCAST-1]) + memcpy(&ifa->ifa_anycast, RTA_DATA(rta[IFA_ANYCAST-1]), 4); + ifa->ifa_flags = ifm->ifa_flags; + ifa->ifa_scope = ifm->ifa_scope; + ifa->ifa_dev = in_dev; + if (rta[IFA_LABEL-1]) + memcpy(ifa->ifa_label, RTA_DATA(rta[IFA_LABEL-1]), IFNAMSIZ); + else + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + + return inet_insert_ifa(in_dev, ifa); +} + +#endif + +/* + * Determine a default network mask, based on the IP address. + */ + +static __inline__ int inet_abc_len(u32 addr) +{ + if (ZERONET(addr)) + return 0; + + addr = ntohl(addr); + if (IN_CLASSA(addr)) + return 8; + if (IN_CLASSB(addr)) + return 16; + if (IN_CLASSC(addr)) + return 24; + + /* + * Something else, probably a multicast. 
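+ *
+ *	 (Worked examples, addresses chosen for illustration only: 10.1.2.3
+ *	 is class A and yields 8, 172.16.0.1 is class B and yields 16,
+ *	 192.168.1.1 is class C and yields 24; a class D address such as
+ *	 224.0.0.1 falls through to the -1 below.)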
+ */ + + return -1; +} + + +int devinet_ioctl(unsigned int cmd, void *arg) +{ + struct ifreq ifr; + struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; + struct in_device *in_dev; + struct in_ifaddr **ifap = NULL; + struct in_ifaddr *ifa = NULL; + struct device *dev; +#ifdef CONFIG_IP_ALIAS + char *colon; +#endif + int exclusive = 0; + int ret = 0; + + /* + * Fetch the caller's info block into kernel space + */ + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + ifr.ifr_name[IFNAMSIZ-1] = 0; + +#ifdef CONFIG_IP_ALIAS + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; +#endif + +#ifdef CONFIG_KMOD + dev_load(ifr.ifr_name); +#endif + + switch(cmd) { + case SIOCGIFADDR: /* Get interface address */ + case SIOCGIFBRDADDR: /* Get the broadcast address */ + case SIOCGIFDSTADDR: /* Get the destination address */ + case SIOCGIFNETMASK: /* Get the netmask for the interface */ + /* Note that this ioctls will not sleep, + so that we do not impose a lock. + One day we will be forced to put shlock here (I mean SMP) + */ + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_INET; + break; + + case SIOCSIFFLAGS: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + rtnl_lock(); + exclusive = 1; + break; + case SIOCSIFADDR: /* Set interface address (and family) */ + case SIOCSIFBRDADDR: /* Set the broadcast address */ + case SIOCSIFDSTADDR: /* Set the destination address */ + case SIOCSIFNETMASK: /* Set the netmask for the interface */ + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + if (sin->sin_family != AF_INET) + return -EINVAL; + rtnl_lock(); + exclusive = 1; + break; + default: + return -EINVAL; + } + + + if ((dev = dev_get(ifr.ifr_name)) == NULL) { + ret = -ENODEV; + goto done; + } + +#ifdef CONFIG_IP_ALIAS + if (colon) + *colon = ':'; +#endif + + if ((in_dev=dev->ip_ptr) != NULL) { + for (ifap=&in_dev->ifa_list; (ifa=*ifap) != NULL; ifap=&ifa->ifa_next) + if (strcmp(ifr.ifr_name, ifa->ifa_label) == 0) + break; + } + + if (ifa == NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) { + ret = -EADDRNOTAVAIL; + goto done; + } + + switch(cmd) { + case SIOCGIFADDR: /* Get interface address */ + sin->sin_addr.s_addr = ifa->ifa_local; + goto rarok; + + case SIOCGIFBRDADDR: /* Get the broadcast address */ + sin->sin_addr.s_addr = ifa->ifa_broadcast; + goto rarok; + + case SIOCGIFDSTADDR: /* Get the destination address */ + sin->sin_addr.s_addr = ifa->ifa_address; + goto rarok; + + case SIOCGIFNETMASK: /* Get the netmask for the interface */ + sin->sin_addr.s_addr = ifa->ifa_mask; + goto rarok; + + case SIOCSIFFLAGS: +#ifdef CONFIG_IP_ALIAS + if (colon) { + if (ifa == NULL) { + ret = -EADDRNOTAVAIL; + break; + } + if (!(ifr.ifr_flags&IFF_UP)) + inet_del_ifa(in_dev, ifap, 1); + break; + } +#endif + ret = dev_change_flags(dev, ifr.ifr_flags); + break; + + case SIOCSIFADDR: /* Set interface address (and family) */ + if (inet_abc_len(sin->sin_addr.s_addr) < 0) { + ret = -EINVAL; + break; + } + + if (!ifa) { + if ((ifa = inet_alloc_ifa()) == NULL) { + ret = -ENOBUFS; + break; + } +#ifdef CONFIG_IP_ALIAS + if (colon) + memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); + else +#endif + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + } else { + ret = 0; + if (ifa->ifa_local == sin->sin_addr.s_addr) + break; + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_broadcast = 0; + ifa->ifa_anycast = 0; + } + + ifa->ifa_address = + ifa->ifa_local = sin->sin_addr.s_addr; + + if (!(dev->flags&IFF_POINTOPOINT)) { + ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address); + ifa->ifa_mask = 
inet_make_mask(ifa->ifa_prefixlen); + if ((dev->flags&IFF_BROADCAST) && ifa->ifa_prefixlen < 31) + ifa->ifa_broadcast = ifa->ifa_address|~ifa->ifa_mask; + } else { + ifa->ifa_prefixlen = 32; + ifa->ifa_mask = inet_make_mask(32); + } + ret = inet_set_ifa(dev, ifa); + break; + + case SIOCSIFBRDADDR: /* Set the broadcast address */ + if (ifa->ifa_broadcast != sin->sin_addr.s_addr) { + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_broadcast = sin->sin_addr.s_addr; + inet_insert_ifa(in_dev, ifa); + } + break; + + case SIOCSIFDSTADDR: /* Set the destination address */ + if (ifa->ifa_address != sin->sin_addr.s_addr) { + if (inet_abc_len(sin->sin_addr.s_addr) < 0) { + ret = -EINVAL; + break; + } + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_address = sin->sin_addr.s_addr; + inet_insert_ifa(in_dev, ifa); + } + break; + + case SIOCSIFNETMASK: /* Set the netmask for the interface */ + + /* + * The mask we set must be legal. + */ + if (bad_mask(sin->sin_addr.s_addr, 0)) { + ret = -EINVAL; + break; + } + + if (ifa->ifa_mask != sin->sin_addr.s_addr) { + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_mask = sin->sin_addr.s_addr; + ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask); + inet_set_ifa(dev, ifa); + } + break; + } +done: + if (exclusive) + rtnl_unlock(); + return ret; + +rarok: + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return 0; +} + +static int +inet_gifconf(struct device *dev, char *buf, int len) +{ + struct in_device *in_dev = dev->ip_ptr; + struct in_ifaddr *ifa; + struct ifreq ifr; + int done=0; + + if (in_dev==NULL || (ifa=in_dev->ifa_list)==NULL) + return 0; + + for ( ; ifa; ifa = ifa->ifa_next) { + if (!buf) { + done += sizeof(ifr); + continue; + } + if (len < (int) sizeof(ifr)) + return done; + memset(&ifr, 0, sizeof(struct ifreq)); + if (ifa->ifa_label) + strcpy(ifr.ifr_name, ifa->ifa_label); + else + strcpy(ifr.ifr_name, dev->name); + + (*(struct sockaddr_in *) &ifr.ifr_addr).sin_family = AF_INET; + (*(struct sockaddr_in *) &ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local; + + if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) + return -EFAULT; + buf += sizeof(struct ifreq); + len -= sizeof(struct ifreq); + done += sizeof(struct ifreq); + } + return done; +} + +u32 inet_select_addr(struct device *dev, u32 dst, int scope) +{ + u32 addr = 0; + struct in_device *in_dev = dev->ip_ptr; + + if (in_dev == NULL) + return 0; + + for_primary_ifa(in_dev) { + if (ifa->ifa_scope > scope) + continue; + if (!dst || inet_ifa_match(dst, ifa)) + return ifa->ifa_local; + if (!addr) + addr = ifa->ifa_local; + } endfor_ifa(in_dev); + + if (addr || scope >= RT_SCOPE_LINK) + return addr; + + /* Not loopback addresses on loopback should be preferred + in this case. It is importnat that lo is the first interface + in dev_base list. 
+ */ + for (dev=dev_base; dev; dev=dev->next) { + if ((in_dev=dev->ip_ptr) == NULL) + continue; + + for_primary_ifa(in_dev) { + if (ifa->ifa_scope <= scope) + return ifa->ifa_local; + } endfor_ifa(in_dev); + } + + return 0; +} + +/* + * Device notifier + */ + +int register_inetaddr_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&inetaddr_chain, nb); +} + +int unregister_inetaddr_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&inetaddr_chain,nb); +} + +static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev = ptr; + struct in_device *in_dev = dev->ip_ptr; + + if (in_dev == NULL) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + if (in_dev) + printk(KERN_DEBUG "inetdev_event: bug\n"); + dev->ip_ptr = NULL; + break; + case NETDEV_UP: + if (dev == &loopback_dev) { + struct in_ifaddr *ifa; + if ((ifa = inet_alloc_ifa()) != NULL) { + ifa->ifa_local = + ifa->ifa_address = htonl(INADDR_LOOPBACK); + ifa->ifa_prefixlen = 8; + ifa->ifa_mask = inet_make_mask(8); + ifa->ifa_dev = in_dev; + ifa->ifa_scope = RT_SCOPE_HOST; + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + inet_insert_ifa(in_dev, ifa); + } + } + ip_mc_up(in_dev); + break; + case NETDEV_DOWN: + ip_mc_down(in_dev); + break; + case NETDEV_CHANGEMTU: + if (dev->mtu >= 68) + break; + /* MTU falled under minimal IP mtu. Disable IP. */ + case NETDEV_UNREGISTER: + inetdev_destroy(in_dev); + break; + case NETDEV_CHANGENAME: + if (in_dev->ifa_list) { + struct in_ifaddr *ifa; + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + /* Do not notify about label change, this event is + not interesting to applications using netlink. + */ + } + break; + } + + return NOTIFY_DONE; +} + +struct notifier_block ip_netdev_notifier={ + inetdev_event, + NULL, + 0 +}; + +#ifdef CONFIG_RTNETLINK + +static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, + u32 pid, u32 seq, int event) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); + ifm = NLMSG_DATA(nlh); + ifm->ifa_family = AF_INET; + ifm->ifa_prefixlen = ifa->ifa_prefixlen; + ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT; + ifm->ifa_scope = ifa->ifa_scope; + ifm->ifa_index = ifa->ifa_dev->dev->ifindex; + if (ifa->ifa_address) + RTA_PUT(skb, IFA_ADDRESS, 4, &ifa->ifa_address); + if (ifa->ifa_local) + RTA_PUT(skb, IFA_LOCAL, 4, &ifa->ifa_local); + if (ifa->ifa_broadcast) + RTA_PUT(skb, IFA_BROADCAST, 4, &ifa->ifa_broadcast); + if (ifa->ifa_anycast) + RTA_PUT(skb, IFA_ANYCAST, 4, &ifa->ifa_anycast); + if (ifa->ifa_label[0]) + RTA_PUT(skb, IFA_LABEL, IFNAMSIZ, &ifa->ifa_label); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx, ip_idx; + int s_idx, s_ip_idx; + struct device *dev; + struct in_device *in_dev; + struct in_ifaddr *ifa; + + s_idx = cb->args[0]; + s_ip_idx = ip_idx = cb->args[1]; + for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (idx > s_idx) + s_ip_idx = 0; + if ((in_dev = dev->ip_ptr) == NULL) + continue; + for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; + ifa = ifa->ifa_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, 
RTM_NEWADDR) <= 0) + goto done; + } + } +done: + cb->args[0] = idx; + cb->args[1] = ip_idx; + + return skb->len; +} + +static void rtmsg_ifa(int event, struct in_ifaddr * ifa) +{ + struct sk_buff *skb; + int size = NLMSG_SPACE(sizeof(struct ifaddrmsg)+128); + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) { + netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS); + return; + } + if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { + kfree_skb(skb); + netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR; + netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL); +} + + +static struct rtnetlink_link inet_rtnetlink_table[RTM_MAX-RTM_BASE+1] = +{ + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + + { inet_rtm_newaddr, NULL, }, + { inet_rtm_deladdr, NULL, }, + { NULL, inet_dump_ifaddr, }, + { NULL, NULL, }, + + { inet_rtm_newroute, NULL, }, + { inet_rtm_delroute, NULL, }, + { inet_rtm_getroute, inet_dump_fib, }, + { NULL, NULL, }, + + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + +#ifdef CONFIG_IP_MULTIPLE_TABLES + { inet_rtm_newrule, NULL, }, + { inet_rtm_delrule, NULL, }, + { NULL, inet_dump_rules, }, + { NULL, NULL, }, +#else + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, +#endif +}; + +#endif /* CONFIG_RTNETLINK */ + + +#ifdef CONFIG_SYSCTL + +void inet_forward_change() +{ + struct device *dev; + int on = ipv4_devconf.forwarding; + + ipv4_devconf.accept_redirects = !on; + ipv4_devconf_dflt.forwarding = on; + + for (dev = dev_base; dev; dev = dev->next) { + struct in_device *in_dev = dev->ip_ptr; + if (in_dev) + in_dev->cnf.forwarding = on; + } + + rt_cache_flush(0); + + ip_statistics.IpForwarding = on ? 
1 : 2; +} + +static +int devinet_sysctl_forward(ctl_table *ctl, int write, struct file * filp, + void *buffer, size_t *lenp) +{ + int *valp = ctl->data; + int val = *valp; + int ret; + + ret = proc_dointvec(ctl, write, filp, buffer, lenp); + + if (write && *valp != val) { + if (valp == &ipv4_devconf.forwarding) + inet_forward_change(); + else if (valp != &ipv4_devconf_dflt.forwarding) + rt_cache_flush(0); + } + + return ret; +} + +static struct devinet_sysctl_table +{ + struct ctl_table_header *sysctl_header; + ctl_table devinet_vars[12]; + ctl_table devinet_dev[2]; + ctl_table devinet_conf_dir[2]; + ctl_table devinet_proto_dir[2]; + ctl_table devinet_root_dir[2]; +} devinet_sysctl = { + NULL, + {{NET_IPV4_CONF_FORWARDING, "forwarding", + &ipv4_devconf.forwarding, sizeof(int), 0644, NULL, + &devinet_sysctl_forward}, + {NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding", + &ipv4_devconf.mc_forwarding, sizeof(int), 0444, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects", + &ipv4_devconf.accept_redirects, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects", + &ipv4_devconf.secure_redirects, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_SHARED_MEDIA, "shared_media", + &ipv4_devconf.shared_media, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_RP_FILTER, "rp_filter", + &ipv4_devconf.rp_filter, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects", + &ipv4_devconf.send_redirects, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route", + &ipv4_devconf.accept_source_route, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_PROXY_ARP, "proxy_arp", + &ipv4_devconf.proxy_arp, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay", + &ipv4_devconf.bootp_relay, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_CONF_LOG_MARTIANS, "log_martians", + &ipv4_devconf.log_martians, sizeof(int), 0644, NULL, + &proc_dointvec}, + {0}}, + + {{NET_PROTO_CONF_ALL, "all", NULL, 0, 0555, devinet_sysctl.devinet_vars},{0}}, + {{NET_IPV4_CONF, "conf", NULL, 0, 0555, devinet_sysctl.devinet_dev},{0}}, + {{NET_IPV4, "ipv4", NULL, 0, 0555, devinet_sysctl.devinet_conf_dir},{0}}, + {{CTL_NET, "net", NULL, 0, 0555, devinet_sysctl.devinet_proto_dir},{0}} +}; + +static void devinet_sysctl_register(struct in_device *in_dev, struct ipv4_devconf *p) +{ + int i; + struct device *dev = in_dev ? 
in_dev->dev : NULL; + struct devinet_sysctl_table *t; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (t == NULL) + return; + memcpy(t, &devinet_sysctl, sizeof(*t)); + for (i=0; i<sizeof(t->devinet_vars)/sizeof(t->devinet_vars[0])-1; i++) { + t->devinet_vars[i].data += (char*)p - (char*)&ipv4_devconf; + t->devinet_vars[i].de = NULL; + } + if (dev) { + t->devinet_dev[0].procname = dev->name; + t->devinet_dev[0].ctl_name = dev->ifindex; + } else { + t->devinet_dev[0].procname = "default"; + t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; + } + t->devinet_dev[0].child = t->devinet_vars; + t->devinet_dev[0].de = NULL; + t->devinet_conf_dir[0].child = t->devinet_dev; + t->devinet_conf_dir[0].de = NULL; + t->devinet_proto_dir[0].child = t->devinet_conf_dir; + t->devinet_proto_dir[0].de = NULL; + t->devinet_root_dir[0].child = t->devinet_proto_dir; + t->devinet_root_dir[0].de = NULL; + + t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0); + if (t->sysctl_header == NULL) + kfree(t); + else + p->sysctl = t; +} + +static void devinet_sysctl_unregister(struct ipv4_devconf *p) +{ + if (p->sysctl) { + struct devinet_sysctl_table *t = p->sysctl; + p->sysctl = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t); + } +} +#endif + +__initfunc(void devinet_init(void)) +{ + register_gifconf(PF_INET, inet_gifconf); + register_netdevice_notifier(&ip_netdev_notifier); +#ifdef CONFIG_RTNETLINK + rtnetlink_links[PF_INET] = inet_rtnetlink_table; +#endif +#ifdef CONFIG_SYSCTL + devinet_sysctl.sysctl_header = + register_sysctl_table(devinet_sysctl.devinet_root_dir, 0); + devinet_sysctl_register(NULL, &ipv4_devconf_dflt); +#endif +} diff --git a/pfinet/linux-src/net/ipv4/fib_frontend.c b/pfinet/linux-src/net/ipv4/fib_frontend.c new file mode 100644 index 00000000..a1747048 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/fib_frontend.c @@ -0,0 +1,628 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 Forwarding Information Base: FIB frontend. + * + * Version: $Id: fib_frontend.c,v 1.15 1999/03/21 05:22:31 davem Exp $ + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/proc_fs.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/init.h> + +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/sock.h> +#include <net/icmp.h> +#include <net/arp.h> +#include <net/ip_fib.h> + +#define FFprint(a...) 
printk(KERN_DEBUG a) + +#ifndef CONFIG_IP_MULTIPLE_TABLES + +#define RT_TABLE_MIN RT_TABLE_MAIN + +struct fib_table *local_table; +struct fib_table *main_table; + +#else + +#define RT_TABLE_MIN 1 + +struct fib_table *fib_tables[RT_TABLE_MAX+1]; + +struct fib_table *__fib_new_table(int id) +{ + struct fib_table *tb; + + tb = fib_hash_init(id); + if (!tb) + return NULL; + fib_tables[id] = tb; + return tb; +} + + +#endif /* CONFIG_IP_MULTIPLE_TABLES */ + + +void fib_flush(void) +{ + int flushed = 0; +#ifdef CONFIG_IP_MULTIPLE_TABLES + struct fib_table *tb; + int id; + + for (id = RT_TABLE_MAX; id>0; id--) { + if ((tb = fib_get_table(id))==NULL) + continue; + flushed += tb->tb_flush(tb); + } +#else /* CONFIG_IP_MULTIPLE_TABLES */ + flushed += main_table->tb_flush(main_table); + flushed += local_table->tb_flush(local_table); +#endif /* CONFIG_IP_MULTIPLE_TABLES */ + + if (flushed) + rt_cache_flush(-1); +} + + +#ifdef CONFIG_PROC_FS + +/* + * Called from the PROCfs module. This outputs /proc/net/route. + * + * It always works in backward compatibility mode. + * The format of the file is not supposed to be changed. + */ + +static int +fib_get_procinfo(char *buffer, char **start, off_t offset, int length, int dummy) +{ + int first = offset/128; + char *ptr = buffer; + int count = (length+127)/128; + int len; + + *start = buffer + offset%128; + + if (--first < 0) { + sprintf(buffer, "%-127s\n", "Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT"); + --count; + ptr += 128; + first = 0; + } + + /* rtnl_shlock(); -- it is pointless at the moment --ANK */ + if (main_table && count > 0) { + int n = main_table->tb_get_info(main_table, ptr, first, count); + count -= n; + ptr += n*128; + } + /* rtnl_shunlock(); */ + len = ptr - *start; + if (len >= length) + return length; + if (len >= 0) + return len; + return 0; +} + +#endif /* CONFIG_PROC_FS */ + +/* + * Find the first device with a given source address. + */ + +struct device * ip_dev_find(u32 addr) +{ + struct rt_key key; + struct fib_result res; + + memset(&key, 0, sizeof(key)); + key.dst = addr; + + if (!local_table || local_table->tb_lookup(local_table, &key, &res) + || res.type != RTN_LOCAL) + return NULL; + + return FIB_RES_DEV(res); +} + +unsigned inet_addr_type(u32 addr) +{ + struct rt_key key; + struct fib_result res; + + if (ZERONET(addr) || BADCLASS(addr)) + return RTN_BROADCAST; + if (MULTICAST(addr)) + return RTN_MULTICAST; + + memset(&key, 0, sizeof(key)); + key.dst = addr; + + if (local_table) { + if (local_table->tb_lookup(local_table, &key, &res) == 0) + return res.type; + return RTN_UNICAST; + } + return RTN_BROADCAST; +} + +/* Given (packet source, input interface) and optional (dst, oif, tos): + - (main) check, that source is valid i.e. not broadcast or our local + address. + - figure out what "logical" interface this packet arrived + and calculate "specific destination" address. + - check, that packet arrived from expected physical interface. 
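+
+   (Illustration of the last check: with rp_filter enabled on the
+   receiving interface, a packet claiming source 10.1.1.5 that arrives
+   on eth1 is rejected when the FIB would route traffic to 10.1.1.5
+   out through eth0; the addresses and interface names are examples.)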
+ */ + +int fib_validate_source(u32 src, u32 dst, u8 tos, int oif, + struct device *dev, u32 *spec_dst, u32 *itag) +{ + struct in_device *in_dev = dev->ip_ptr; + struct rt_key key; + struct fib_result res; + + key.dst = src; + key.src = dst; + key.tos = tos; + key.oif = 0; + key.iif = oif; + key.scope = RT_SCOPE_UNIVERSE; + + if (in_dev == NULL) + return -EINVAL; + if (fib_lookup(&key, &res)) + goto last_resort; + if (res.type != RTN_UNICAST) + return -EINVAL; + *spec_dst = FIB_RES_PREFSRC(res); + if (itag) + fib_combine_itag(itag, &res); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) +#else + if (FIB_RES_DEV(res) == dev) +#endif + return FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + + if (in_dev->ifa_list == NULL) + goto last_resort; + if (IN_DEV_RPFILTER(in_dev)) + return -EINVAL; + key.oif = dev->ifindex; + if (fib_lookup(&key, &res) == 0 && res.type == RTN_UNICAST) { + *spec_dst = FIB_RES_PREFSRC(res); + return FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + } + return 0; + +last_resort: + if (IN_DEV_RPFILTER(in_dev)) + return -EINVAL; + *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + *itag = 0; + return 0; +} + +#ifndef CONFIG_IP_NOSIOCRT + +/* + * Handle IP routing ioctl calls. These are used to manipulate the routing tables + */ + +int ip_rt_ioctl(unsigned int cmd, void *arg) +{ + int err; + struct kern_rta rta; + struct rtentry r; + struct { + struct nlmsghdr nlh; + struct rtmsg rtm; + } req; + + switch (cmd) { + case SIOCADDRT: /* Add a route */ + case SIOCDELRT: /* Delete a route */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (copy_from_user(&r, arg, sizeof(struct rtentry))) + return -EFAULT; + rtnl_lock(); + err = fib_convert_rtentry(cmd, &req.nlh, &req.rtm, &rta, &r); + if (err == 0) { + if (cmd == SIOCDELRT) { + struct fib_table *tb = fib_get_table(req.rtm.rtm_table); + err = -ESRCH; + if (tb) + err = tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL); + } else { + struct fib_table *tb = fib_new_table(req.rtm.rtm_table); + err = -ENOBUFS; + if (tb) + err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL); + } + if (rta.rta_mx) + kfree(rta.rta_mx); + } + rtnl_unlock(); + return err; + } + return -EINVAL; +} + +#else + +int ip_rt_ioctl(unsigned int cmd, void *arg) +{ + return -EINVAL; +} + +#endif + +#ifdef CONFIG_RTNETLINK + +static int inet_check_attr(struct rtmsg *r, struct rtattr **rta) +{ + int i; + + for (i=1; i<=RTA_MAX; i++) { + struct rtattr *attr = rta[i-1]; + if (attr) { + if (RTA_PAYLOAD(attr) < 4) + return -EINVAL; + if (i != RTA_MULTIPATH && i != RTA_METRICS) + rta[i-1] = (struct rtattr*)RTA_DATA(attr); + } + } + return 0; +} + +int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct fib_table * tb; + struct rtattr **rta = arg; + struct rtmsg *r = NLMSG_DATA(nlh); + + if (inet_check_attr(r, rta)) + return -EINVAL; + + tb = fib_get_table(r->rtm_table); + if (tb) + return tb->tb_delete(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb)); + return -ESRCH; +} + +int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct fib_table * tb; + struct rtattr **rta = arg; + struct rtmsg *r = NLMSG_DATA(nlh); + + if (inet_check_attr(r, rta)) + return -EINVAL; + + tb = fib_new_table(r->rtm_table); + if (tb) + return tb->tb_insert(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb)); + return -ENOBUFS; +} + +int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct fib_table *tb; + + if (NLMSG_PAYLOAD(cb->nlh, 0) >= 
sizeof(struct rtmsg) && + ((struct rtmsg*)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED) + return ip_rt_dump(skb, cb); + + s_t = cb->args[0]; + if (s_t == 0) + s_t = cb->args[0] = RT_TABLE_MIN; + + for (t=s_t; t<=RT_TABLE_MAX; t++) { + if (t < s_t) continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + if ((tb = fib_get_table(t))==NULL) + continue; + if (tb->tb_dump(tb, skb, cb) < 0) + break; + } + + cb->args[0] = t; + + return skb->len; +} + +#endif + +/* Prepare and feed intra-kernel routing request. + Really, it should be netlink message, but :-( netlink + can be not configured, so that we feed it directly + to fib engine. It is legal, because all events occur + only when netlink is already locked. + */ + +static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr *ifa) +{ + struct fib_table * tb; + struct { + struct nlmsghdr nlh; + struct rtmsg rtm; + } req; + struct kern_rta rta; + + memset(&req.rtm, 0, sizeof(req.rtm)); + memset(&rta, 0, sizeof(rta)); + + if (type == RTN_UNICAST) + tb = fib_new_table(RT_TABLE_MAIN); + else + tb = fib_new_table(RT_TABLE_LOCAL); + + if (tb == NULL) + return; + + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = cmd; + req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND; + req.nlh.nlmsg_pid = 0; + req.nlh.nlmsg_seq = 0; + + req.rtm.rtm_dst_len = dst_len; + req.rtm.rtm_table = tb->tb_id; + req.rtm.rtm_protocol = RTPROT_KERNEL; + req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST); + req.rtm.rtm_type = type; + + rta.rta_dst = &dst; + rta.rta_prefsrc = &ifa->ifa_local; + rta.rta_oif = &ifa->ifa_dev->dev->ifindex; + + if (cmd == RTM_NEWROUTE) + tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL); + else + tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL); +} + +static void fib_add_ifaddr(struct in_ifaddr *ifa) +{ + struct in_device *in_dev = ifa->ifa_dev; + struct device *dev = in_dev->dev; + struct in_ifaddr *prim = ifa; + u32 mask = ifa->ifa_mask; + u32 addr = ifa->ifa_local; + u32 prefix = ifa->ifa_address&mask; + + if (ifa->ifa_flags&IFA_F_SECONDARY) { + prim = inet_ifa_byprefix(in_dev, prefix, mask); + if (prim == NULL) { + printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n"); + return; + } + } + + fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); + + if (!(dev->flags&IFF_UP)) + return; + + /* Add broadcast address, if it is explicitly assigned. */ + if (ifa->ifa_broadcast && ifa->ifa_broadcast != 0xFFFFFFFF) + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); + + if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && + (prefix != addr || ifa->ifa_prefixlen < 32)) { + fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : + RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); + + /* Add network specific broadcasts, when it takes a sense */ + if (ifa->ifa_prefixlen < 31) { + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim); + } + } +} + +static void fib_del_ifaddr(struct in_ifaddr *ifa) +{ + struct in_device *in_dev = ifa->ifa_dev; + struct device *dev = in_dev->dev; + struct in_ifaddr *ifa1; + struct in_ifaddr *prim = ifa; + u32 brd = ifa->ifa_address|~ifa->ifa_mask; + u32 any = ifa->ifa_address&ifa->ifa_mask; +#define LOCAL_OK 1 +#define BRD_OK 2 +#define BRD0_OK 4 +#define BRD1_OK 8 + unsigned ok = 0; + + if (!(ifa->ifa_flags&IFA_F_SECONDARY)) + fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? 
RTN_LOCAL : + RTN_UNICAST, any, ifa->ifa_prefixlen, prim); + else { + prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); + if (prim == NULL) { + printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n"); + return; + } + } + + /* Deletion is more complicated than add. + We should take care of not to delete too much :-) + + Scan address list to be sure that addresses are really gone. + */ + + for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { + if (ifa->ifa_local == ifa1->ifa_local) + ok |= LOCAL_OK; + if (ifa->ifa_broadcast == ifa1->ifa_broadcast) + ok |= BRD_OK; + if (brd == ifa1->ifa_broadcast) + ok |= BRD1_OK; + if (any == ifa1->ifa_broadcast) + ok |= BRD0_OK; + } + + if (!(ok&BRD_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); + if (!(ok&BRD1_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); + if (!(ok&BRD0_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + if (!(ok&LOCAL_OK)) { + fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); + + /* Check, that this local address finally disappeared. */ + if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) { + /* And the last, but not the least thing. + We must flush stray FIB entries. + + First of all, we scan fib_info list searching + for stray nexthop entries, then ignite fib_flush. + */ + if (fib_sync_down(ifa->ifa_local, NULL, 0)) + fib_flush(); + } + } +#undef LOCAL_OK +#undef BRD_OK +#undef BRD0_OK +#undef BRD1_OK +} + +static void fib_disable_ip(struct device *dev, int force) +{ + if (fib_sync_down(0, dev, force)) + fib_flush(); + rt_cache_flush(0); + arp_ifdown(dev); +} + +static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr*)ptr; + + switch (event) { + case NETDEV_UP: + fib_add_ifaddr(ifa); + rt_cache_flush(-1); + break; + case NETDEV_DOWN: + if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) { + /* Last address was deleted from this interface. + Disable IP. 
+ */ + fib_disable_ip(ifa->ifa_dev->dev, 1); + } else { + fib_del_ifaddr(ifa); + rt_cache_flush(-1); + } + break; + } + return NOTIFY_DONE; +} + +static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev = ptr; + struct in_device *in_dev = dev->ip_ptr; + + if (!in_dev) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + for_ifa(in_dev) { + fib_add_ifaddr(ifa); + } endfor_ifa(in_dev); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + fib_sync_up(dev); +#endif + rt_cache_flush(-1); + break; + case NETDEV_DOWN: + fib_disable_ip(dev, 0); + break; + case NETDEV_UNREGISTER: + fib_disable_ip(dev, 1); + break; + case NETDEV_CHANGEMTU: + case NETDEV_CHANGE: + rt_cache_flush(0); + break; + } + return NOTIFY_DONE; +} + +struct notifier_block fib_inetaddr_notifier = { + fib_inetaddr_event, + NULL, + 0 +}; + +struct notifier_block fib_netdev_notifier = { + fib_netdev_event, + NULL, + 0 +}; + +__initfunc(void ip_fib_init(void)) +{ +#ifdef CONFIG_PROC_FS + proc_net_register(&(struct proc_dir_entry) { + PROC_NET_ROUTE, 5, "route", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + fib_get_procinfo + }); +#endif /* CONFIG_PROC_FS */ + +#ifndef CONFIG_IP_MULTIPLE_TABLES + local_table = fib_hash_init(RT_TABLE_LOCAL); + main_table = fib_hash_init(RT_TABLE_MAIN); +#else + fib_rules_init(); +#endif + + register_netdevice_notifier(&fib_netdev_notifier); + register_inetaddr_notifier(&fib_inetaddr_notifier); +} + diff --git a/pfinet/linux-src/net/ipv4/fib_hash.c b/pfinet/linux-src/net/ipv4/fib_hash.c new file mode 100644 index 00000000..d9e029ce --- /dev/null +++ b/pfinet/linux-src/net/ipv4/fib_hash.c @@ -0,0 +1,885 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 FIB: lookup engine and maintenance routines. + * + * Version: $Id: fib_hash.c,v 1.8 1999/03/25 10:04:17 davem Exp $ + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/proc_fs.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/init.h> + +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/sock.h> +#include <net/ip_fib.h> + +#define FTprint(a...) +/* + printk(KERN_DEBUG a) + */ + +/* + These bizarre types are just to force strict type checking. + When I reversed order of bytes and changed to natural mask lengths, + I forgot to make fixes in several places. Now I am lazy to return + it back. 
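+   (A sketch of the intent, as read from the code below: both are one-member
+   structs; fn_key_t carries a destination prefix, already masked to its zone
+   and kept in network byte order, while fn_hash_idx_t carries a hash bucket
+   index. Wrapping them this way makes the compiler reject accidental mixing
+   of the two.)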
+ */ + +typedef struct { + u32 datum; +} fn_key_t; + +typedef struct { + u32 datum; +} fn_hash_idx_t; + +struct fib_node +{ + struct fib_node *fn_next; + struct fib_info *fn_info; +#define FIB_INFO(f) ((f)->fn_info) + fn_key_t fn_key; + u8 fn_tos; + u8 fn_type; + u8 fn_scope; + u8 fn_state; +}; + +#define FN_S_ZOMBIE 1 +#define FN_S_ACCESSED 2 + +static int fib_hash_zombies; + +struct fn_zone +{ + struct fn_zone *fz_next; /* Next not empty zone */ + struct fib_node **fz_hash; /* Hash table pointer */ + int fz_nent; /* Number of entries */ + + int fz_divisor; /* Hash divisor */ + u32 fz_hashmask; /* (1<<fz_divisor) - 1 */ +#define FZ_HASHMASK(fz) ((fz)->fz_hashmask) + + int fz_order; /* Zone order */ + u32 fz_mask; +#define FZ_MASK(fz) ((fz)->fz_mask) +}; + +/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask + can be cheaper than memory lookup, so that FZ_* macros are used. + */ + +struct fn_hash +{ + struct fn_zone *fn_zones[33]; + struct fn_zone *fn_zone_list; +}; + +static __inline__ fn_hash_idx_t fn_hash(fn_key_t key, struct fn_zone *fz) +{ + u32 h = ntohl(key.datum)>>(32 - fz->fz_order); + h ^= (h>>20); + h ^= (h>>10); + h ^= (h>>5); + h &= FZ_HASHMASK(fz); + return *(fn_hash_idx_t*)&h; +} + +#define fz_key_0(key) ((key).datum = 0) +#define fz_prefix(key,fz) ((key).datum) + +static __inline__ fn_key_t fz_key(u32 dst, struct fn_zone *fz) +{ + fn_key_t k; + k.datum = dst & FZ_MASK(fz); + return k; +} + +static __inline__ struct fib_node ** fz_chain_p(fn_key_t key, struct fn_zone *fz) +{ + return &fz->fz_hash[fn_hash(key, fz).datum]; +} + +static __inline__ struct fib_node * fz_chain(fn_key_t key, struct fn_zone *fz) +{ + return fz->fz_hash[fn_hash(key, fz).datum]; +} + +extern __inline__ int fn_key_eq(fn_key_t a, fn_key_t b) +{ + return a.datum == b.datum; +} + +extern __inline__ int fn_key_leq(fn_key_t a, fn_key_t b) +{ + return a.datum <= b.datum; +} + +#define FZ_MAX_DIVISOR 1024 + +#ifdef CONFIG_IP_ROUTE_LARGE_TABLES + +static __inline__ void fn_rebuild_zone(struct fn_zone *fz, + struct fib_node **old_ht, + int old_divisor) +{ + int i; + struct fib_node *f, **fp, *next; + + for (i=0; i<old_divisor; i++) { + for (f=old_ht[i]; f; f=next) { + next = f->fn_next; + for (fp = fz_chain_p(f->fn_key, fz); + *fp && fn_key_leq((*fp)->fn_key, f->fn_key); + fp = &(*fp)->fn_next) + /* NONE */; + f->fn_next = *fp; + *fp = f; + } + } +} + +static void fn_rehash_zone(struct fn_zone *fz) +{ + struct fib_node **ht, **old_ht; + int old_divisor, new_divisor; + u32 new_hashmask; + + old_divisor = fz->fz_divisor; + + switch (old_divisor) { + case 16: + new_divisor = 256; + new_hashmask = 0xFF; + break; + case 256: + new_divisor = 1024; + new_hashmask = 0x3FF; + break; + default: + printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor); + return; + } +#if RT_CACHE_DEBUG >= 2 + printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor); +#endif + + ht = kmalloc(new_divisor*sizeof(struct fib_node*), GFP_KERNEL); + + if (ht) { + memset(ht, 0, new_divisor*sizeof(struct fib_node*)); + start_bh_atomic(); + old_ht = fz->fz_hash; + fz->fz_hash = ht; + fz->fz_hashmask = new_hashmask; + fz->fz_divisor = new_divisor; + fn_rebuild_zone(fz, old_ht, old_divisor); + end_bh_atomic(); + kfree(old_ht); + } +} +#endif /* CONFIG_IP_ROUTE_LARGE_TABLES */ + +static void fn_free_node(struct fib_node * f) +{ + fib_release_info(FIB_INFO(f)); + kfree_s(f, sizeof(struct fib_node)); +} + + +static struct fn_zone * +fn_new_zone(struct fn_hash *table, int z) +{ + int i; + struct fn_zone *fz 
= kmalloc(sizeof(struct fn_zone), GFP_KERNEL); + if (!fz) + return NULL; + + memset(fz, 0, sizeof(struct fn_zone)); + if (z) { + fz->fz_divisor = 16; + fz->fz_hashmask = 0xF; + } else { + fz->fz_divisor = 1; + fz->fz_hashmask = 0; + } + fz->fz_hash = kmalloc(fz->fz_divisor*sizeof(struct fib_node*), GFP_KERNEL); + if (!fz->fz_hash) { + kfree(fz); + return NULL; + } + memset(fz->fz_hash, 0, fz->fz_divisor*sizeof(struct fib_node*)); + fz->fz_order = z; + fz->fz_mask = inet_make_mask(z); + + /* Find the first not empty zone with more specific mask */ + for (i=z+1; i<=32; i++) + if (table->fn_zones[i]) + break; + if (i>32) { + /* No more specific masks, we are the first. */ + fz->fz_next = table->fn_zone_list; + table->fn_zone_list = fz; + } else { + fz->fz_next = table->fn_zones[i]->fz_next; + table->fn_zones[i]->fz_next = fz; + } + table->fn_zones[z] = fz; + return fz; +} + +static int +fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result *res) +{ + int err; + struct fn_zone *fz; + struct fn_hash *t = (struct fn_hash*)tb->tb_data; + + for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { + struct fib_node *f; + fn_key_t k = fz_key(key->dst, fz); + + for (f = fz_chain(k, fz); f; f = f->fn_next) { + if (!fn_key_eq(k, f->fn_key)) { + if (fn_key_leq(k, f->fn_key)) + break; + else + continue; + } +#ifdef CONFIG_IP_ROUTE_TOS + if (f->fn_tos && f->fn_tos != key->tos) + continue; +#endif + f->fn_state |= FN_S_ACCESSED; + + if (f->fn_state&FN_S_ZOMBIE) + continue; + if (f->fn_scope < key->scope) + continue; + + err = fib_semantic_match(f->fn_type, FIB_INFO(f), key, res); + if (err == 0) { + res->type = f->fn_type; + res->scope = f->fn_scope; + res->prefixlen = fz->fz_order; + res->prefix = &fz_prefix(f->fn_key, fz); + return 0; + } + if (err < 0) + return err; + } + } + return 1; +} + +static int fn_hash_last_dflt=-1; + +static int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, int *last_idx) +{ + struct neighbour *n; + int state = NUD_NONE; + + n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev); + if (n) { + state = n->nud_state; + neigh_release(n); + } + if (state==NUD_REACHABLE) + return 0; + if ((state&NUD_VALID) && order != fn_hash_last_dflt) + return 0; + if ((state&NUD_VALID) || + (*last_idx<0 && order > fn_hash_last_dflt)) { + *last_resort = fi; + *last_idx = order; + } + return 1; +} + +static void +fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fib_result *res) +{ + int order, last_idx; + struct fib_node *f; + struct fib_info *fi = NULL; + struct fib_info *last_resort; + struct fn_hash *t = (struct fn_hash*)tb->tb_data; + struct fn_zone *fz = t->fn_zones[0]; + + if (fz == NULL) + return; + + last_idx = -1; + last_resort = NULL; + order = -1; + + for (f = fz->fz_hash[0]; f; f = f->fn_next) { + struct fib_info *next_fi = FIB_INFO(f); + + if ((f->fn_state&FN_S_ZOMBIE) || + f->fn_scope != res->scope || + f->fn_type != RTN_UNICAST) + continue; + + if (next_fi->fib_priority > res->fi->fib_priority) + break; + if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) + continue; + f->fn_state |= FN_S_ACCESSED; + + if (fi == NULL) { + if (next_fi != res->fi) + break; + } else if (!fib_detect_death(fi, order, &last_resort, &last_idx)) { + res->fi = fi; + fn_hash_last_dflt = order; + return; + } + fi = next_fi; + order++; + } + + if (order<=0 || fi==NULL) { + fn_hash_last_dflt = -1; + return; + } + + if (!fib_detect_death(fi, order, &last_resort, &last_idx)) { + res->fi = fi; + 
fn_hash_last_dflt = order; + return; + } + + if (last_idx >= 0) + res->fi = last_resort; + fn_hash_last_dflt = last_idx; +} + +#define FIB_SCAN(f, fp) \ +for ( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next) + +#define FIB_SCAN_KEY(f, fp, key) \ +for ( ; ((f) = *(fp)) != NULL && fn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_next) + +#ifndef CONFIG_IP_ROUTE_TOS +#define FIB_SCAN_TOS(f, fp, key, tos) FIB_SCAN_KEY(f, fp, key) +#else +#define FIB_SCAN_TOS(f, fp, key, tos) \ +for ( ; ((f) = *(fp)) != NULL && fn_key_eq((f)->fn_key, (key)) && \ + (f)->fn_tos == (tos) ; (fp) = &(f)->fn_next) +#endif + + +#ifdef CONFIG_RTNETLINK +static void rtmsg_fib(int, struct fib_node*, int, int, + struct nlmsghdr *n, + struct netlink_skb_parms *); +#else +#define rtmsg_fib(a, b, c, d, e, f) +#endif + + +static int +fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, + struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct fn_hash *table = (struct fn_hash*)tb->tb_data; + struct fib_node *new_f, *f, **fp, **del_fp; + struct fn_zone *fz; + struct fib_info *fi; + + int z = r->rtm_dst_len; + int type = r->rtm_type; +#ifdef CONFIG_IP_ROUTE_TOS + u8 tos = r->rtm_tos; +#endif + fn_key_t key; + int err; + +FTprint("tb(%d)_insert: %d %08x/%d %d %08x\n", tb->tb_id, r->rtm_type, rta->rta_dst ? +*(u32*)rta->rta_dst : 0, z, rta->rta_oif ? *rta->rta_oif : -1, +rta->rta_prefsrc ? *(u32*)rta->rta_prefsrc : 0); + if (z > 32) + return -EINVAL; + fz = table->fn_zones[z]; + if (!fz && !(fz = fn_new_zone(table, z))) + return -ENOBUFS; + + fz_key_0(key); + if (rta->rta_dst) { + u32 dst; + memcpy(&dst, rta->rta_dst, 4); + if (dst & ~FZ_MASK(fz)) + return -EINVAL; + key = fz_key(dst, fz); + } + + if ((fi = fib_create_info(r, rta, n, &err)) == NULL) + return err; + +#ifdef CONFIG_IP_ROUTE_LARGE_TABLES + if (fz->fz_nent > (fz->fz_divisor<<2) && + fz->fz_divisor < FZ_MAX_DIVISOR && + (z==32 || (1<<z) > fz->fz_divisor)) + fn_rehash_zone(fz); +#endif + + fp = fz_chain_p(key, fz); + + /* + * Scan list to find the first route with the same destination + */ + FIB_SCAN(f, fp) { + if (fn_key_leq(key,f->fn_key)) + break; + } + +#ifdef CONFIG_IP_ROUTE_TOS + /* + * Find route with the same destination and tos. + */ + FIB_SCAN_KEY(f, fp, key) { + if (f->fn_tos <= tos) + break; + } +#endif + + del_fp = NULL; + + if (f && (f->fn_state&FN_S_ZOMBIE) && +#ifdef CONFIG_IP_ROUTE_TOS + f->fn_tos == tos && +#endif + fn_key_eq(f->fn_key, key)) { + del_fp = fp; + fp = &f->fn_next; + f = *fp; + goto create; + } + + FIB_SCAN_TOS(f, fp, key, tos) { + if (fi->fib_priority <= FIB_INFO(f)->fib_priority) + break; + } + + /* Now f==*fp points to the first node with the same + keys [prefix,tos,priority], if such key already + exists or to the node, before which we will insert new one. 
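+	   (In other words, the chain is kept ordered: keys ascend, tos values
+	   descend within a key, and priorities ascend within a (key,tos) pair,
+	   which is what the three FIB_SCAN loops above rely on.)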
+ */ + + if (f && +#ifdef CONFIG_IP_ROUTE_TOS + f->fn_tos == tos && +#endif + fn_key_eq(f->fn_key, key) && + fi->fib_priority == FIB_INFO(f)->fib_priority) { + struct fib_node **ins_fp; + + err = -EEXIST; + if (n->nlmsg_flags&NLM_F_EXCL) + goto out; + + if (n->nlmsg_flags&NLM_F_REPLACE) { + del_fp = fp; + fp = &f->fn_next; + f = *fp; + goto replace; + } + + ins_fp = fp; + err = -EEXIST; + + FIB_SCAN_TOS(f, fp, key, tos) { + if (fi->fib_priority != FIB_INFO(f)->fib_priority) + break; + if (f->fn_type == type && f->fn_scope == r->rtm_scope + && FIB_INFO(f) == fi) + goto out; + } + + if (!(n->nlmsg_flags&NLM_F_APPEND)) { + fp = ins_fp; + f = *fp; + } + } + +create: + err = -ENOENT; + if (!(n->nlmsg_flags&NLM_F_CREATE)) + goto out; + +replace: + err = -ENOBUFS; + new_f = (struct fib_node *) kmalloc(sizeof(struct fib_node), GFP_KERNEL); + if (new_f == NULL) + goto out; + + memset(new_f, 0, sizeof(struct fib_node)); + + new_f->fn_key = key; +#ifdef CONFIG_IP_ROUTE_TOS + new_f->fn_tos = tos; +#endif + new_f->fn_type = type; + new_f->fn_scope = r->rtm_scope; + FIB_INFO(new_f) = fi; + + /* + * Insert new entry to the list. + */ + + new_f->fn_next = f; + *fp = new_f; + fz->fz_nent++; + + if (del_fp) { + f = *del_fp; + /* Unlink replaced node */ + *del_fp = f->fn_next; + synchronize_bh(); + + if (!(f->fn_state&FN_S_ZOMBIE)) + rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); + if (f->fn_state&FN_S_ACCESSED) + rt_cache_flush(-1); + fn_free_node(f); + fz->fz_nent--; + } else { + rt_cache_flush(-1); + } + rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->tb_id, n, req); + return 0; + +out: + fib_release_info(fi); + return err; +} + + +static int +fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, + struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct fn_hash *table = (struct fn_hash*)tb->tb_data; + struct fib_node **fp, **del_fp, *f; + int z = r->rtm_dst_len; + struct fn_zone *fz; + fn_key_t key; + int matched; +#ifdef CONFIG_IP_ROUTE_TOS + u8 tos = r->rtm_tos; +#endif + +FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ? + *(u32*)rta->rta_dst : 0, z, rta->rta_oif ? 
*rta->rta_oif : -1); + if (z > 32) + return -EINVAL; + if ((fz = table->fn_zones[z]) == NULL) + return -ESRCH; + + fz_key_0(key); + if (rta->rta_dst) { + u32 dst; + memcpy(&dst, rta->rta_dst, 4); + if (dst & ~FZ_MASK(fz)) + return -EINVAL; + key = fz_key(dst, fz); + } + + fp = fz_chain_p(key, fz); + + FIB_SCAN(f, fp) { + if (fn_key_eq(f->fn_key, key)) + break; + if (fn_key_leq(key, f->fn_key)) + return -ESRCH; + } +#ifdef CONFIG_IP_ROUTE_TOS + FIB_SCAN_KEY(f, fp, key) { + if (f->fn_tos == tos) + break; + } +#endif + + matched = 0; + del_fp = NULL; + FIB_SCAN_TOS(f, fp, key, tos) { + struct fib_info * fi = FIB_INFO(f); + + if (f->fn_state&FN_S_ZOMBIE) + return -ESRCH; + + matched++; + + if (del_fp == NULL && + (!r->rtm_type || f->fn_type == r->rtm_type) && + (r->rtm_scope == RT_SCOPE_NOWHERE || f->fn_scope == r->rtm_scope) && + (!r->rtm_protocol || fi->fib_protocol == r->rtm_protocol) && + fib_nh_match(r, n, rta, fi) == 0) + del_fp = fp; + } + + if (del_fp) { + f = *del_fp; + rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); + + if (matched != 1) { + *del_fp = f->fn_next; + synchronize_bh(); + + if (f->fn_state&FN_S_ACCESSED) + rt_cache_flush(-1); + fn_free_node(f); + fz->fz_nent--; + } else { + f->fn_state |= FN_S_ZOMBIE; + if (f->fn_state&FN_S_ACCESSED) { + f->fn_state &= ~FN_S_ACCESSED; + rt_cache_flush(-1); + } + if (++fib_hash_zombies > 128) + fib_flush(); + } + + return 0; + } + return -ESRCH; +} + +extern __inline__ int +fn_flush_list(struct fib_node ** fp, int z, struct fn_hash *table) +{ + int found = 0; + struct fib_node *f; + + while ((f = *fp) != NULL) { + struct fib_info *fi = FIB_INFO(f); + + if (fi && ((f->fn_state&FN_S_ZOMBIE) || (fi->fib_flags&RTNH_F_DEAD))) { + *fp = f->fn_next; + synchronize_bh(); + + fn_free_node(f); + found++; + continue; + } + fp = &f->fn_next; + } + return found; +} + +static int fn_hash_flush(struct fib_table *tb) +{ + struct fn_hash *table = (struct fn_hash*)tb->tb_data; + struct fn_zone *fz; + int found = 0; + + fib_hash_zombies = 0; + for (fz = table->fn_zone_list; fz; fz = fz->fz_next) { + int i; + int tmp = 0; + for (i=fz->fz_divisor-1; i>=0; i--) + tmp += fn_flush_list(&fz->fz_hash[i], fz->fz_order, table); + fz->fz_nent -= tmp; + found += tmp; + } + return found; +} + + +#ifdef CONFIG_PROC_FS + +static int fn_hash_get_info(struct fib_table *tb, char *buffer, int first, int count) +{ + struct fn_hash *table = (struct fn_hash*)tb->tb_data; + struct fn_zone *fz; + int pos = 0; + int n = 0; + + for (fz=table->fn_zone_list; fz; fz = fz->fz_next) { + int i; + struct fib_node *f; + int maxslot = fz->fz_divisor; + struct fib_node **fp = fz->fz_hash; + + if (fz->fz_nent == 0) + continue; + + if (pos + fz->fz_nent <= first) { + pos += fz->fz_nent; + continue; + } + + for (i=0; i < maxslot; i++, fp++) { + for (f = *fp; f; f = f->fn_next) { + if (++pos <= first) + continue; + fib_node_get_info(f->fn_type, + f->fn_state&FN_S_ZOMBIE, + FIB_INFO(f), + fz_prefix(f->fn_key, fz), + FZ_MASK(fz), buffer); + buffer += 128; + if (++n >= count) + return n; + } + } + } + return n; +} +#endif + + +#ifdef CONFIG_RTNETLINK + +extern __inline__ int +fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, + struct fib_table *tb, + struct fn_zone *fz, + struct fib_node *f) +{ + int i, s_i; + + s_i = cb->args[3]; + for (i=0; f; i++, f=f->fn_next) { + if (i < s_i) continue; + if (f->fn_state&FN_S_ZOMBIE) continue; + if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + RTM_NEWROUTE, + tb->tb_id, (f->fn_state&FN_S_ZOMBIE) ? 
0 : f->fn_type, f->fn_scope, + &f->fn_key, fz->fz_order, f->fn_tos, + f->fn_info) < 0) { + cb->args[3] = i; + return -1; + } + } + cb->args[3] = i; + return skb->len; +} + +extern __inline__ int +fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, + struct fib_table *tb, + struct fn_zone *fz) +{ + int h, s_h; + + s_h = cb->args[2]; + for (h=0; h < fz->fz_divisor; h++) { + if (h < s_h) continue; + if (h > s_h) + memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0])); + if (fz->fz_hash == NULL || fz->fz_hash[h] == NULL) + continue; + if (fn_hash_dump_bucket(skb, cb, tb, fz, fz->fz_hash[h]) < 0) { + cb->args[2] = h; + return -1; + } + } + cb->args[2] = h; + return skb->len; +} + +static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) +{ + int m, s_m; + struct fn_zone *fz; + struct fn_hash *table = (struct fn_hash*)tb->tb_data; + + s_m = cb->args[1]; + for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) { + if (m < s_m) continue; + if (m > s_m) + memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(cb->args[0])); + if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { + cb->args[1] = m; + return -1; + } + } + cb->args[1] = m; + return skb->len; +} + +static void rtmsg_fib(int event, struct fib_node* f, int z, int tb_id, + struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct sk_buff *skb; + u32 pid = req ? req->pid : 0; + int size = NLMSG_SPACE(sizeof(struct rtmsg)+256); + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return; + + if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id, + f->fn_type, f->fn_scope, &f->fn_key, z, f->fn_tos, + FIB_INFO(f)) < 0) { + kfree_skb(skb); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE; + if (n->nlmsg_flags&NLM_F_ECHO) + atomic_inc(&skb->users); + netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL); + if (n->nlmsg_flags&NLM_F_ECHO) + netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); +} + +#endif /* CONFIG_RTNETLINK */ + +#ifdef CONFIG_IP_MULTIPLE_TABLES +struct fib_table * fib_hash_init(int id) +#else +__initfunc(struct fib_table * fib_hash_init(int id)) +#endif +{ + struct fib_table *tb; + tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), GFP_KERNEL); + if (tb == NULL) + return NULL; + tb->tb_id = id; + tb->tb_lookup = fn_hash_lookup; + tb->tb_insert = fn_hash_insert; + tb->tb_delete = fn_hash_delete; + tb->tb_flush = fn_hash_flush; + tb->tb_select_default = fn_hash_select_default; +#ifdef CONFIG_RTNETLINK + tb->tb_dump = fn_hash_dump; +#endif +#ifdef CONFIG_PROC_FS + tb->tb_get_info = fn_hash_get_info; +#endif + memset(tb->tb_data, 0, sizeof(struct fn_hash)); + return tb; +} diff --git a/pfinet/linux-src/net/ipv4/fib_rules.c b/pfinet/linux-src/net/ipv4/fib_rules.c new file mode 100644 index 00000000..868c44c3 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/fib_rules.c @@ -0,0 +1,419 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 Forwarding Information Base: policy rules. + * + * Version: $Id: fib_rules.c,v 1.9 1999/03/25 10:04:23 davem Exp $ + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Fixes: + * Rani Assaf : local_rule cannot be deleted + * Marc Boucher : routing by fwmark + */ + +#include <linux/config.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/proc_fs.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/init.h> + +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/sock.h> +#include <net/ip_fib.h> + +#define FRprintk(a...) + +struct fib_rule +{ + struct fib_rule *r_next; + u32 r_preference; + unsigned char r_table; + unsigned char r_action; + unsigned char r_dst_len; + unsigned char r_src_len; + u32 r_src; + u32 r_srcmask; + u32 r_dst; + u32 r_dstmask; + u32 r_srcmap; + u8 r_flags; + u8 r_tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + u32 r_fwmark; +#endif + int r_ifindex; +#ifdef CONFIG_NET_CLS_ROUTE + __u32 r_tclassid; +#endif + char r_ifname[IFNAMSIZ]; +}; + +static struct fib_rule default_rule = { NULL, 0x7FFF, RT_TABLE_DEFAULT, RTN_UNICAST, }; +static struct fib_rule main_rule = { &default_rule, 0x7FFE, RT_TABLE_MAIN, RTN_UNICAST, }; +static struct fib_rule local_rule = { &main_rule, 0, RT_TABLE_LOCAL, RTN_UNICAST, }; + +static struct fib_rule *fib_rules = &local_rule; + +int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtattr **rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct fib_rule *r, **rp; + + for (rp=&fib_rules; (r=*rp) != NULL; rp=&r->r_next) { + if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 4) == 0) && + rtm->rtm_src_len == r->r_src_len && + rtm->rtm_dst_len == r->r_dst_len && + (!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 4) == 0) && + rtm->rtm_tos == r->r_tos && +#ifdef CONFIG_IP_ROUTE_FWMARK + (!rta[RTA_PROTOINFO-1] || memcmp(RTA_DATA(rta[RTA_PROTOINFO-1]), &r->r_fwmark, 4) == 0) && +#endif + (!rtm->rtm_type || rtm->rtm_type == r->r_action) && + (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) && + (!rta[RTA_IIF-1] || strcmp(RTA_DATA(rta[RTA_IIF-1]), r->r_ifname) == 0) && + (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) { + if (r == &local_rule) + return -EPERM; + + *rp = r->r_next; + synchronize_bh(); + + if (r != &default_rule && r != &main_rule) + kfree(r); + return 0; + } + } + return -ESRCH; +} + +/* Allocate new unique table id */ + +static struct fib_table *fib_empty_table(void) +{ + int id; + + for (id = 1; id <= RT_TABLE_MAX; id++) + if (fib_tables[id] == NULL) + return __fib_new_table(id); + return NULL; +} + + +int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtattr **rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct fib_rule *r, *new_r, **rp; + unsigned char table_id; + + if (rtm->rtm_src_len > 32 || rtm->rtm_dst_len > 32 || + (rtm->rtm_tos & ~IPTOS_TOS_MASK)) + return -EINVAL; + + if (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) > IFNAMSIZ) + return -EINVAL; + + table_id = rtm->rtm_table; + if (table_id == RT_TABLE_UNSPEC) { + struct fib_table *table; + if (rtm->rtm_type == RTN_UNICAST || rtm->rtm_type == RTN_NAT) { + if ((table = fib_empty_table()) == NULL) + return -ENOBUFS; + table_id = table->tb_id; + } + } + + 
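+	/* Build the new rule from the rtnetlink attributes: the selectors
+	   (source/destination prefixes, tos, optional fwmark and input
+	   interface), the action, the preference and the table chosen above.
+	   For illustration only: a request of this shape is what a hypothetical
+	   "ip rule add from 10.0.0.0/8 table 100 pref 1000" would encode as an
+	   RTM_NEWRULE message. */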
new_r = kmalloc(sizeof(*new_r), GFP_KERNEL); + if (!new_r) + return -ENOMEM; + memset(new_r, 0, sizeof(*new_r)); + if (rta[RTA_SRC-1]) + memcpy(&new_r->r_src, RTA_DATA(rta[RTA_SRC-1]), 4); + if (rta[RTA_DST-1]) + memcpy(&new_r->r_dst, RTA_DATA(rta[RTA_DST-1]), 4); + if (rta[RTA_GATEWAY-1]) + memcpy(&new_r->r_srcmap, RTA_DATA(rta[RTA_GATEWAY-1]), 4); + new_r->r_src_len = rtm->rtm_src_len; + new_r->r_dst_len = rtm->rtm_dst_len; + new_r->r_srcmask = inet_make_mask(rtm->rtm_src_len); + new_r->r_dstmask = inet_make_mask(rtm->rtm_dst_len); + new_r->r_tos = rtm->rtm_tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + if (rta[RTA_PROTOINFO-1]) + memcpy(&new_r->r_fwmark, RTA_DATA(rta[RTA_PROTOINFO-1]), 4); +#endif + new_r->r_action = rtm->rtm_type; + new_r->r_flags = rtm->rtm_flags; + if (rta[RTA_PRIORITY-1]) + memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4); + new_r->r_table = table_id; + if (rta[RTA_IIF-1]) { + struct device *dev; + memcpy(new_r->r_ifname, RTA_DATA(rta[RTA_IIF-1]), IFNAMSIZ); + new_r->r_ifname[IFNAMSIZ-1] = 0; + new_r->r_ifindex = -1; + dev = dev_get(new_r->r_ifname); + if (dev) + new_r->r_ifindex = dev->ifindex; + } +#ifdef CONFIG_NET_CLS_ROUTE + if (rta[RTA_FLOW-1]) + memcpy(&new_r->r_tclassid, RTA_DATA(rta[RTA_FLOW-1]), 4); +#endif + + rp = &fib_rules; + if (!new_r->r_preference) { + r = fib_rules; + if (r && (r = r->r_next) != NULL) { + rp = &fib_rules->r_next; + if (r->r_preference) + new_r->r_preference = r->r_preference - 1; + } + } + + while ( (r = *rp) != NULL ) { + if (r->r_preference > new_r->r_preference) + break; + rp = &r->r_next; + } + + new_r->r_next = r; + *rp = new_r; + return 0; +} + +u32 fib_rules_map_destination(u32 daddr, struct fib_result *res) +{ + u32 mask = inet_make_mask(res->prefixlen); + return (daddr&~mask)|res->fi->fib_nh->nh_gw; +} + +u32 fib_rules_policy(u32 saddr, struct fib_result *res, unsigned *flags) +{ + struct fib_rule *r = res->r; + + if (r->r_action == RTN_NAT) { + int addrtype = inet_addr_type(r->r_srcmap); + + if (addrtype == RTN_NAT) { + /* Packet is from translated source; remember it */ + saddr = (saddr&~r->r_srcmask)|r->r_srcmap; + *flags |= RTCF_SNAT; + } else if (addrtype == RTN_LOCAL || r->r_srcmap == 0) { + /* Packet is from masqueraded source; remember it */ + saddr = r->r_srcmap; + *flags |= RTCF_MASQ; + } + } + return saddr; +} + +#ifdef CONFIG_NET_CLS_ROUTE +u32 fib_rules_tclass(struct fib_result *res) +{ + if (res->r) + return res->r->r_tclassid; + return 0; +} +#endif + + +static void fib_rules_detach(struct device *dev) +{ + struct fib_rule *r; + + for (r=fib_rules; r; r=r->r_next) { + if (r->r_ifindex == dev->ifindex) + r->r_ifindex = -1; + } +} + +static void fib_rules_attach(struct device *dev) +{ + struct fib_rule *r; + + for (r=fib_rules; r; r=r->r_next) { + if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) + r->r_ifindex = dev->ifindex; + } +} + +int fib_lookup(const struct rt_key *key, struct fib_result *res) +{ + int err; + struct fib_rule *r, *policy; + struct fib_table *tb; + + u32 daddr = key->dst; + u32 saddr = key->src; + +FRprintk("Lookup: %08x <- %08x ", key->dst, key->src); + for (r = fib_rules; r; r=r->r_next) { + if (((saddr^r->r_src) & r->r_srcmask) || + ((daddr^r->r_dst) & r->r_dstmask) || +#ifdef CONFIG_IP_ROUTE_TOS + (r->r_tos && r->r_tos != key->tos) || +#endif +#ifdef CONFIG_IP_ROUTE_FWMARK + (r->r_fwmark && r->r_fwmark != key->fwmark) || +#endif + (r->r_ifindex && r->r_ifindex != key->iif)) + continue; + +FRprintk("tb %d r %d ", r->r_table, r->r_action); + switch (r->r_action) { + 
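+		/* Translate the matching rule's action into a lookup verdict:
+		   unicast and NAT rules proceed to the table lookup below, while
+		   the remaining actions terminate the lookup with the
+		   corresponding error. */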
case RTN_UNICAST: + case RTN_NAT: + policy = r; + break; + case RTN_UNREACHABLE: + return -ENETUNREACH; + default: + case RTN_BLACKHOLE: + return -EINVAL; + case RTN_PROHIBIT: + return -EACCES; + } + + if ((tb = fib_get_table(r->r_table)) == NULL) + continue; + err = tb->tb_lookup(tb, key, res); + if (err == 0) { +FRprintk("ok\n"); + res->r = policy; + return 0; + } + if (err < 0 && err != -EAGAIN) + return err; + } +FRprintk("FAILURE\n"); + return -ENETUNREACH; +} + +void fib_select_default(const struct rt_key *key, struct fib_result *res) +{ + if (res->r && res->r->r_action == RTN_UNICAST && + FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) { + struct fib_table *tb; + if ((tb = fib_get_table(res->r->r_table)) != NULL) + tb->tb_select_default(tb, key, res); + } +} + +static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev = ptr; + + if (event == NETDEV_UNREGISTER) + fib_rules_detach(dev); + else if (event == NETDEV_REGISTER) + fib_rules_attach(dev); + return NOTIFY_DONE; +} + + +struct notifier_block fib_rules_notifier = { + fib_rules_event, + NULL, + 0 +}; + +#ifdef CONFIG_RTNETLINK + +extern __inline__ int inet_fill_rule(struct sk_buff *skb, + struct fib_rule *r, + struct netlink_callback *cb) +{ + struct rtmsg *rtm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + rtm->rtm_family = AF_INET; + rtm->rtm_dst_len = r->r_dst_len; + rtm->rtm_src_len = r->r_src_len; + rtm->rtm_tos = r->r_tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + if (r->r_fwmark) + RTA_PUT(skb, RTA_PROTOINFO, 4, &r->r_fwmark); +#endif + rtm->rtm_table = r->r_table; + rtm->rtm_protocol = 0; + rtm->rtm_scope = 0; + rtm->rtm_type = r->r_action; + rtm->rtm_flags = r->r_flags; + + if (r->r_dst_len) + RTA_PUT(skb, RTA_DST, 4, &r->r_dst); + if (r->r_src_len) + RTA_PUT(skb, RTA_SRC, 4, &r->r_src); + if (r->r_ifname[0]) + RTA_PUT(skb, RTA_IIF, IFNAMSIZ, &r->r_ifname); + if (r->r_preference) + RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference); + if (r->r_srcmap) + RTA_PUT(skb, RTA_GATEWAY, 4, &r->r_srcmap); +#ifdef CONFIG_NET_CLS_ROUTE + if (r->r_tclassid) + RTA_PUT(skb, RTA_FLOW, 4, &r->r_tclassid); +#endif + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_put(skb, b - skb->tail); + return -1; +} + +int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->args[0]; + struct fib_rule *r; + + for (r=fib_rules, idx=0; r; r = r->r_next, idx++) { + if (idx < s_idx) + continue; + if (inet_fill_rule(skb, r, cb) < 0) + break; + } + cb->args[0] = idx; + + return skb->len; +} + +#endif /* CONFIG_RTNETLINK */ + +__initfunc(void fib_rules_init(void)) +{ + register_netdevice_notifier(&fib_rules_notifier); +} diff --git a/pfinet/linux-src/net/ipv4/fib_semantics.c b/pfinet/linux-src/net/ipv4/fib_semantics.c new file mode 100644 index 00000000..b78f7eba --- /dev/null +++ b/pfinet/linux-src/net/ipv4/fib_semantics.c @@ -0,0 +1,991 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 Forwarding Information Base: semantics. 
+ * + * Version: $Id: fib_semantics.c,v 1.13 1999/03/21 05:22:34 davem Exp $ + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/proc_fs.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/init.h> + +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/sock.h> +#include <net/ip_fib.h> + +#define FSprintk(a...) + +static struct fib_info *fib_info_list; + +#define for_fib_info() { struct fib_info *fi; \ + for (fi = fib_info_list; fi; fi = fi->fib_next) + +#define endfor_fib_info() } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ +for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) + +#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \ +for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) + +#else /* CONFIG_IP_ROUTE_MULTIPATH */ + +/* Hope, that gcc will optimize it to get rid of dummy loop */ + +#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \ +for (nhsel=0; nhsel < 1; nhsel++) + +#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \ +for (nhsel=0; nhsel < 1; nhsel++) + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ + +#define endfor_nexthops(fi) } + + +static struct +{ + int error; + u8 scope; +} fib_props[RTA_MAX+1] = { + { 0, RT_SCOPE_NOWHERE}, /* RTN_UNSPEC */ + { 0, RT_SCOPE_UNIVERSE}, /* RTN_UNICAST */ + { 0, RT_SCOPE_HOST}, /* RTN_LOCAL */ + { 0, RT_SCOPE_LINK}, /* RTN_BROADCAST */ + { 0, RT_SCOPE_LINK}, /* RTN_ANYCAST */ + { 0, RT_SCOPE_UNIVERSE}, /* RTN_MULTICAST */ + { -EINVAL, RT_SCOPE_UNIVERSE}, /* RTN_BLACKHOLE */ + { -EHOSTUNREACH, RT_SCOPE_UNIVERSE},/* RTN_UNREACHABLE */ + { -EACCES, RT_SCOPE_UNIVERSE}, /* RTN_PROHIBIT */ + { -EAGAIN, RT_SCOPE_UNIVERSE}, /* RTN_THROW */ +#ifdef CONFIG_IP_ROUTE_NAT + { 0, RT_SCOPE_HOST}, /* RTN_NAT */ +#else + { -EINVAL, RT_SCOPE_NOWHERE}, /* RTN_NAT */ +#endif + { -EINVAL, RT_SCOPE_NOWHERE} /* RTN_XRESOLVE */ +}; + +/* Release a nexthop info record */ + +void fib_release_info(struct fib_info *fi) +{ + if (fi && !--fi->fib_refcnt) { + if (fi->fib_next) + fi->fib_next->fib_prev = fi->fib_prev; + if (fi->fib_prev) + fi->fib_prev->fib_next = fi->fib_next; + if (fi == fib_info_list) + fib_info_list = fi->fib_next; + kfree(fi); + } +} + +extern __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) +{ + const struct fib_nh *onh = ofi->fib_nh; + + for_nexthops(fi) { + if (nh->nh_oif != onh->nh_oif || + nh->nh_gw != onh->nh_gw || + nh->nh_scope != onh->nh_scope || +#ifdef CONFIG_IP_ROUTE_MULTIPATH + nh->nh_weight != onh->nh_weight || +#endif +#ifdef CONFIG_NET_CLS_ROUTE + nh->nh_tclassid != onh->nh_tclassid || +#endif + ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) + 
return -1; + onh++; + } endfor_nexthops(fi); + return 0; +} + +extern __inline__ struct fib_info * fib_find_info(const struct fib_info *nfi) +{ + for_fib_info() { + if (fi->fib_nhs != nfi->fib_nhs) + continue; + if (nfi->fib_protocol == fi->fib_protocol && + nfi->fib_prefsrc == fi->fib_prefsrc && + nfi->fib_priority == fi->fib_priority && + nfi->fib_mtu == fi->fib_mtu && + nfi->fib_rtt == fi->fib_rtt && + nfi->fib_window == fi->fib_window && + ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && + (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) + return fi; + } endfor_fib_info(); + return NULL; +} + +/* Check, that the gateway is already configured. + Used only by redirect accept routine. + */ + +int ip_fib_check_default(u32 gw, struct device *dev) +{ + for_fib_info() { + if (fi->fib_flags & RTNH_F_DEAD) + continue; + for_nexthops(fi) { + if (nh->nh_dev == dev && nh->nh_gw == gw && + !(nh->nh_flags&RTNH_F_DEAD)) + return 0; + } endfor_nexthops(fi); + } endfor_fib_info(); + return -1; +} + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type) +{ + while (RTA_OK(attr,attrlen)) { + if (attr->rta_type == type) + return *(u32*)RTA_DATA(attr); + attr = RTA_NEXT(attr, attrlen); + } + return 0; +} + +static int +fib_count_nexthops(struct rtattr *rta) +{ + int nhs = 0; + struct rtnexthop *nhp = RTA_DATA(rta); + int nhlen = RTA_PAYLOAD(rta); + + while (nhlen >= (int)sizeof(struct rtnexthop)) { + if ((nhlen -= nhp->rtnh_len) < 0) + return 0; + nhs++; + nhp = RTNH_NEXT(nhp); + }; + return nhs; +} + +static int +fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r) +{ + struct rtnexthop *nhp = RTA_DATA(rta); + int nhlen = RTA_PAYLOAD(rta); + + change_nexthops(fi) { + int attrlen = nhlen - sizeof(struct rtnexthop); + if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) + return -EINVAL; + nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags; + nh->nh_oif = nhp->rtnh_ifindex; + nh->nh_weight = nhp->rtnh_hops + 1; + if (attrlen) { + nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); +#ifdef CONFIG_NET_CLS_ROUTE + nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW); +#endif + } + nhp = RTNH_NEXT(nhp); + } endfor_nexthops(fi); + return 0; +} + +#endif + +int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta, + struct fib_info *fi) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH + struct rtnexthop *nhp; + int nhlen; +#endif + + if (rta->rta_priority && + *rta->rta_priority != fi->fib_priority) + return 1; + + if (rta->rta_oif || rta->rta_gw) { + if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) && + (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0)) + return 0; + return 1; + } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (rta->rta_mp == NULL) + return 0; + nhp = RTA_DATA(rta->rta_mp); + nhlen = RTA_PAYLOAD(rta->rta_mp); + + for_nexthops(fi) { + int attrlen = nhlen - sizeof(struct rtnexthop); + u32 gw; + + if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) + return -EINVAL; + if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif) + return 1; + if (attrlen) { + gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); + if (gw && gw != nh->nh_gw) + return 1; +#ifdef CONFIG_NET_CLS_ROUTE + gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW); + if (gw && gw != nh->nh_tclassid) + return 1; +#endif + } + nhp = RTNH_NEXT(nhp); + } endfor_nexthops(fi); +#endif + return 0; +} + + +/* + Picture + ------- + + Semantics of nexthop is very messy by historical reasons. 
+ We have to take into account, that: + a) gateway can be actually local interface address, + so that gatewayed route is direct. + b) gateway must be on-link address, possibly + described not by an ifaddr, but also by a direct route. + c) If both gateway and interface are specified, they should not + contradict. + d) If we use tunnel routes, gateway could be not on-link. + + Attempt to reconcile all of these (alas, self-contradictory) conditions + results in pretty ugly and hairy code with obscure logic. + + I choosed to generalized it instead, so that the size + of code does not increase practically, but it becomes + much more general. + Every prefix is assigned a "scope" value: "host" is local address, + "link" is direct route, + [ ... "site" ... "interior" ... ] + and "universe" is true gateway route with global meaning. + + Every prefix refers to a set of "nexthop"s (gw, oif), + where gw must have narrower scope. This recursion stops + when gw has LOCAL scope or if "nexthop" is declared ONLINK, + which means that gw is forced to be on link. + + Code is still hairy, but now it is apparently logically + consistent and very flexible. F.e. as by-product it allows + to co-exists in peace independent exterior and interior + routing processes. + + Normally it looks as following. + + {universe prefix} -> (gw, oif) [scope link] + | + |-> {link prefix} -> (gw, oif) [scope local] + | + |-> {local prefix} (terminal node) + */ + +static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh) +{ + int err; + + if (nh->nh_gw) { + struct rt_key key; + struct fib_result res; + +#ifdef CONFIG_IP_ROUTE_PERVASIVE + if (nh->nh_flags&RTNH_F_PERVASIVE) + return 0; +#endif + if (nh->nh_flags&RTNH_F_ONLINK) { + struct device *dev; + + if (r->rtm_scope >= RT_SCOPE_LINK) + return -EINVAL; + if (inet_addr_type(nh->nh_gw) != RTN_UNICAST) + return -EINVAL; + if ((dev = dev_get_by_index(nh->nh_oif)) == NULL) + return -ENODEV; + if (!(dev->flags&IFF_UP)) + return -ENETDOWN; + nh->nh_dev = dev; + nh->nh_scope = RT_SCOPE_LINK; + return 0; + } + memset(&key, 0, sizeof(key)); + key.dst = nh->nh_gw; + key.oif = nh->nh_oif; + key.scope = r->rtm_scope + 1; + + /* It is not necessary, but requires a bit of thinking */ + if (key.scope < RT_SCOPE_LINK) + key.scope = RT_SCOPE_LINK; + + if ((err = fib_lookup(&key, &res)) != 0) + return err; + nh->nh_scope = res.scope; + nh->nh_oif = FIB_RES_OIF(res); + nh->nh_dev = FIB_RES_DEV(res); + } else { + struct in_device *in_dev; + + if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) + return -EINVAL; + + in_dev = inetdev_by_index(nh->nh_oif); + if (in_dev == NULL) + return -ENODEV; + if (!(in_dev->dev->flags&IFF_UP)) + return -ENETDOWN; + nh->nh_dev = in_dev->dev; + nh->nh_scope = RT_SCOPE_HOST; + } + return 0; +} + +struct fib_info * +fib_create_info(const struct rtmsg *r, struct kern_rta *rta, + const struct nlmsghdr *nlh, int *errp) +{ + int err; + struct fib_info *fi = NULL; + struct fib_info *ofi; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + int nhs = 1; +#else + const int nhs = 1; +#endif + + /* Fast check to catch the most weird cases */ + if (fib_props[r->rtm_type].scope > r->rtm_scope) + goto err_inval; + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (rta->rta_mp) { + nhs = fib_count_nexthops(rta->rta_mp); + if (nhs == 0) + goto err_inval; + } +#endif + + fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); + err = -ENOBUFS; + if (fi == NULL) + goto failure; + memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh)); + + fi->fib_protocol = r->rtm_protocol; + 
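+	/* The remaining generic attributes (flags, priority, metrics, preferred
+	   source) are copied next; the nexthop array itself is filled in further
+	   down, from RTA_MULTIPATH when present or from the single
+	   rta_oif/rta_gw pair otherwise. */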
fi->fib_nhs = nhs; + fi->fib_flags = r->rtm_flags; + if (rta->rta_priority) + fi->fib_priority = *rta->rta_priority; + if (rta->rta_mx) { + int attrlen = RTA_PAYLOAD(rta->rta_mx); + struct rtattr *attr = RTA_DATA(rta->rta_mx); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > FIB_MAX_METRICS) + goto err_inval; + fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr); + } + attr = RTA_NEXT(attr, attrlen); + } + } + if (rta->rta_prefsrc) + memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4); + + if (rta->rta_mp) { +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0) + goto failure; + if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif) + goto err_inval; + if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4)) + goto err_inval; +#ifdef CONFIG_NET_CLS_ROUTE + if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4)) + goto err_inval; +#endif +#else + goto err_inval; +#endif + } else { + struct fib_nh *nh = fi->fib_nh; + if (rta->rta_oif) + nh->nh_oif = *rta->rta_oif; + if (rta->rta_gw) + memcpy(&nh->nh_gw, rta->rta_gw, 4); +#ifdef CONFIG_NET_CLS_ROUTE + if (rta->rta_flow) + memcpy(&nh->nh_tclassid, rta->rta_flow, 4); +#endif + nh->nh_flags = r->rtm_flags; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + nh->nh_weight = 1; +#endif + } + +#ifdef CONFIG_IP_ROUTE_NAT + if (r->rtm_type == RTN_NAT) { + if (rta->rta_gw == NULL || nhs != 1 || rta->rta_oif) + goto err_inval; + memcpy(&fi->fib_nh->nh_gw, rta->rta_gw, 4); + goto link_it; + } +#endif + + if (fib_props[r->rtm_type].error) { + if (rta->rta_gw || rta->rta_oif || rta->rta_mp) + goto err_inval; + goto link_it; + } + + if (r->rtm_scope > RT_SCOPE_HOST) + goto err_inval; + + if (r->rtm_scope == RT_SCOPE_HOST) { + struct fib_nh *nh = fi->fib_nh; + + /* Local address is added. 
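+		   Such a route must carry exactly one nexthop and no gateway;
+		   the nexthop device is taken directly from the interface index.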
*/ + if (nhs != 1 || nh->nh_gw) + goto err_inval; + nh->nh_scope = RT_SCOPE_NOWHERE; + nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif); + err = -ENODEV; + if (nh->nh_dev == NULL) + goto failure; + } else { + change_nexthops(fi) { + if ((err = fib_check_nh(r, fi, nh)) != 0) + goto failure; + } endfor_nexthops(fi) + } + + if (fi->fib_prefsrc) { + if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL || + memcmp(&fi->fib_prefsrc, rta->rta_dst, 4)) + if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL) + goto err_inval; + } + +link_it: + if ((ofi = fib_find_info(fi)) != NULL) { + kfree(fi); + ofi->fib_refcnt++; + return ofi; + } + + fi->fib_refcnt++; + fi->fib_next = fib_info_list; + fi->fib_prev = NULL; + if (fib_info_list) + fib_info_list->fib_prev = fi; + fib_info_list = fi; + return fi; + +err_inval: + err = -EINVAL; + +failure: + *errp = err; + if (fi) + kfree(fi); + return NULL; +} + +int +fib_semantic_match(int type, struct fib_info *fi, const struct rt_key *key, struct fib_result *res) +{ + int err = fib_props[type].error; + + if (err == 0) { + if (fi->fib_flags&RTNH_F_DEAD) + return 1; + + res->fi = fi; + + switch (type) { +#ifdef CONFIG_IP_ROUTE_NAT + case RTN_NAT: + FIB_RES_RESET(*res); + return 0; +#endif + case RTN_UNICAST: + case RTN_LOCAL: + case RTN_BROADCAST: + case RTN_ANYCAST: + case RTN_MULTICAST: + for_nexthops(fi) { + if (nh->nh_flags&RTNH_F_DEAD) + continue; + if (!key->oif || key->oif == nh->nh_oif) + break; + } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (nhsel < fi->fib_nhs) { + res->nh_sel = nhsel; + return 0; + } +#else + if (nhsel < 1) + return 0; +#endif + endfor_nexthops(fi); + return 1; + default: + printk(KERN_DEBUG "impossible 102\n"); + return -EINVAL; + } + } + return err; +} + +/* Find appropriate source address to this destination */ + +u32 __fib_res_prefsrc(struct fib_result *res) +{ + return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope); +} + +#ifdef CONFIG_RTNETLINK + +int +fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos, + struct fib_info *fi) +{ + struct rtmsg *rtm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + rtm->rtm_family = AF_INET; + rtm->rtm_dst_len = dst_len; + rtm->rtm_src_len = 0; + rtm->rtm_tos = tos; + rtm->rtm_table = tb_id; + rtm->rtm_type = type; + rtm->rtm_flags = fi->fib_flags; + rtm->rtm_scope = scope; + if (rtm->rtm_dst_len) + RTA_PUT(skb, RTA_DST, 4, dst); + rtm->rtm_protocol = fi->fib_protocol; + if (fi->fib_priority) + RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority); +#ifdef CONFIG_NET_CLS_ROUTE + if (fi->fib_nh[0].nh_tclassid) + RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid); +#endif + if (fi->fib_mtu || fi->fib_window || fi->fib_rtt) { + int i; + struct rtattr *mx = (struct rtattr *)skb->tail; + RTA_PUT(skb, RTA_METRICS, 0, NULL); + for (i=0; i<FIB_MAX_METRICS; i++) { + if (fi->fib_metrics[i]) + RTA_PUT(skb, i+1, sizeof(unsigned), fi->fib_metrics + i); + } + mx->rta_len = skb->tail - (u8*)mx; + } + if (fi->fib_prefsrc) + RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc); + if (fi->fib_nhs == 1) { + if (fi->fib_nh->nh_gw) + RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw); + if (fi->fib_nh->nh_oif) + RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif); + } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (fi->fib_nhs > 1) { + struct rtnexthop *nhp; + struct rtattr *mp_head; + if (skb_tailroom(skb) <= RTA_SPACE(0)) + goto rtattr_failure; + mp_head = 
(struct rtattr*)skb_put(skb, RTA_SPACE(0)); + + for_nexthops(fi) { + if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) + goto rtattr_failure; + nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + nhp->rtnh_flags = nh->nh_flags & 0xFF; + nhp->rtnh_hops = nh->nh_weight-1; + nhp->rtnh_ifindex = nh->nh_oif; + if (nh->nh_gw) + RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw); + nhp->rtnh_len = skb->tail - (unsigned char*)nhp; + } endfor_nexthops(fi); + mp_head->rta_type = RTA_MULTIPATH; + mp_head->rta_len = skb->tail - (u8*)mp_head; + } +#endif + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#endif /* CONFIG_RTNETLINK */ + +#ifndef CONFIG_IP_NOSIOCRT + +int +fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, + struct kern_rta *rta, struct rtentry *r) +{ + int plen; + u32 *ptr; + + memset(rtm, 0, sizeof(*rtm)); + memset(rta, 0, sizeof(*rta)); + + if (r->rt_dst.sa_family != AF_INET) + return -EAFNOSUPPORT; + + /* Check mask for validity: + a) it must be contiguous. + b) destination must have all host bits clear. + c) if application forgot to set correct family (AF_INET), + reject request unless it is absolutely clear i.e. + both family and mask are zero. + */ + plen = 32; + ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr; + if (!(r->rt_flags&RTF_HOST)) { + u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr; + if (r->rt_genmask.sa_family != AF_INET) { + if (mask || r->rt_genmask.sa_family) + return -EAFNOSUPPORT; + } + if (bad_mask(mask, *ptr)) + return -EINVAL; + plen = inet_mask_len(mask); + } + + nl->nlmsg_flags = NLM_F_REQUEST; + nl->nlmsg_pid = 0; + nl->nlmsg_seq = 0; + nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm)); + if (cmd == SIOCDELRT) { + nl->nlmsg_type = RTM_DELROUTE; + nl->nlmsg_flags = 0; + } else { + nl->nlmsg_type = RTM_NEWROUTE; + nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE; + rtm->rtm_protocol = RTPROT_BOOT; + } + + rtm->rtm_dst_len = plen; + rta->rta_dst = ptr; + + if (r->rt_metric) { + *(u32*)&r->rt_pad3 = r->rt_metric - 1; + rta->rta_priority = (u32*)&r->rt_pad3; + } + if (r->rt_flags&RTF_REJECT) { + rtm->rtm_scope = RT_SCOPE_HOST; + rtm->rtm_type = RTN_UNREACHABLE; + return 0; + } + rtm->rtm_scope = RT_SCOPE_NOWHERE; + rtm->rtm_type = RTN_UNICAST; + + if (r->rt_dev) { +#ifdef CONFIG_IP_ALIAS + char *colon; +#endif + struct device *dev; + char devname[IFNAMSIZ]; + + if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1)) + return -EFAULT; + devname[IFNAMSIZ-1] = 0; +#ifdef CONFIG_IP_ALIAS + colon = strchr(devname, ':'); + if (colon) + *colon = 0; +#endif + dev = dev_get(devname); + if (!dev) + return -ENODEV; + rta->rta_oif = &dev->ifindex; +#ifdef CONFIG_IP_ALIAS + if (colon) { + struct in_ifaddr *ifa; + struct in_device *in_dev = dev->ip_ptr; + if (!in_dev) + return -ENODEV; + *colon = ':'; + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) + if (strcmp(ifa->ifa_label, devname) == 0) + break; + if (ifa == NULL) + return -ENODEV; + rta->rta_prefsrc = &ifa->ifa_local; + } +#endif + } + + ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr; + if (r->rt_gateway.sa_family == AF_INET && *ptr) { + rta->rta_gw = ptr; + if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST) + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + } + + if (cmd == SIOCDELRT) + return 0; + + if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL) + return -EINVAL; + + if (rtm->rtm_scope == RT_SCOPE_NOWHERE) + rtm->rtm_scope = RT_SCOPE_LINK; + + if 
(r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) { + struct rtattr *rec; + struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL); + if (mx == NULL) + return -ENOMEM; + rta->rta_mx = mx; + mx->rta_type = RTA_METRICS; + mx->rta_len = RTA_LENGTH(0); + if (r->rt_flags&RTF_MTU) { + rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len)); + rec->rta_type = RTAX_MTU; + rec->rta_len = RTA_LENGTH(4); + mx->rta_len += RTA_LENGTH(4); + *(u32*)RTA_DATA(rec) = r->rt_mtu; + } + if (r->rt_flags&RTF_WINDOW) { + rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len)); + rec->rta_type = RTAX_WINDOW; + rec->rta_len = RTA_LENGTH(4); + mx->rta_len += RTA_LENGTH(4); + *(u32*)RTA_DATA(rec) = r->rt_window; + } + if (r->rt_flags&RTF_IRTT) { + rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len)); + rec->rta_type = RTAX_RTT; + rec->rta_len = RTA_LENGTH(4); + mx->rta_len += RTA_LENGTH(4); + *(u32*)RTA_DATA(rec) = r->rt_irtt; + } + } + return 0; +} + +#endif + +/* + Update FIB if: + - local address disappeared -> we must delete all the entries + referring to it. + - device went down -> we must shutdown all nexthops going via it. + */ + +int fib_sync_down(u32 local, struct device *dev, int force) +{ + int ret = 0; + int scope = RT_SCOPE_NOWHERE; + + if (force) + scope = -1; + + for_fib_info() { + if (local && fi->fib_prefsrc == local) { + fi->fib_flags |= RTNH_F_DEAD; + ret++; + } else if (dev && fi->fib_nhs) { + int dead = 0; + + change_nexthops(fi) { + if (nh->nh_flags&RTNH_F_DEAD) + dead++; + else if (nh->nh_dev == dev && + nh->nh_scope != scope) { + nh->nh_flags |= RTNH_F_DEAD; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + fi->fib_power -= nh->nh_power; + nh->nh_power = 0; +#endif + dead++; + } + } endfor_nexthops(fi) + if (dead == fi->fib_nhs) { + fi->fib_flags |= RTNH_F_DEAD; + ret++; + } + } + } endfor_fib_info(); + return ret; +} + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +/* + Dead device goes up. We wake up dead nexthops. + It takes sense only on multipath routes. + */ + +int fib_sync_up(struct device *dev) +{ + int ret = 0; + + if (!(dev->flags&IFF_UP)) + return 0; + + for_fib_info() { + int alive = 0; + + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) { + alive++; + continue; + } + if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) + continue; + if (nh->nh_dev != dev || dev->ip_ptr == NULL) + continue; + alive++; + nh->nh_power = 0; + nh->nh_flags &= ~RTNH_F_DEAD; + } endfor_nexthops(fi) + + if (alive == fi->fib_nhs) { + fi->fib_flags &= ~RTNH_F_DEAD; + ret++; + } + } endfor_fib_info(); + return ret; +} + +/* + The algorithm is suboptimal, but it provides really + fair weighted route distribution. + */ + +void fib_select_multipath(const struct rt_key *key, struct fib_result *res) +{ + struct fib_info *fi = res->fi; + int w; + + if (fi->fib_power <= 0) { + int power = 0; + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) { + power += nh->nh_weight; + nh->nh_power = nh->nh_weight; + } + } endfor_nexthops(fi); + fi->fib_power = power; +#if 1 + if (power <= 0) { + printk(KERN_CRIT "impossible 777\n"); + return; + } +#endif + } + + + /* w should be random number [0..fi->fib_power-1], + it is pretty bad approximation. 
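+	   Each live nexthop holds nh_power credits, refilled to its configured
+	   weight whenever fib_power reaches zero; the selected nexthop gives up
+	   one credit, so across fib_power consecutive selections every nexthop
+	   is picked roughly in proportion to its weight.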
+ */ + + w = jiffies % fi->fib_power; + + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { + if ((w -= nh->nh_power) <= 0) { + nh->nh_power--; + fi->fib_power--; + res->nh_sel = nhsel; + return; + } + } + } endfor_nexthops(fi); + +#if 1 + printk(KERN_CRIT "impossible 888\n"); +#endif + return; +} +#endif + + +#ifdef CONFIG_PROC_FS + +static unsigned fib_flag_trans(int type, int dead, u32 mask, struct fib_info *fi) +{ + static unsigned type2flags[RTN_MAX+1] = { + 0, 0, 0, 0, 0, 0, 0, RTF_REJECT, RTF_REJECT, 0, 0, 0 + }; + unsigned flags = type2flags[type]; + + if (fi && fi->fib_nh->nh_gw) + flags |= RTF_GATEWAY; + if (mask == 0xFFFFFFFF) + flags |= RTF_HOST; + if (!dead) + flags |= RTF_UP; + return flags; +} + +void fib_node_get_info(int type, int dead, struct fib_info *fi, u32 prefix, u32 mask, char *buffer) +{ + int len; + unsigned flags = fib_flag_trans(type, dead, mask, fi); + + if (fi) { + len = sprintf(buffer, "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", + fi->fib_dev ? fi->fib_dev->name : "*", prefix, + fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority, + mask, fi->fib_mtu, fi->fib_window, fi->fib_rtt); + } else { + len = sprintf(buffer, "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", + prefix, 0, + flags, 0, 0, 0, + mask, 0, 0, 0); + } + memset(buffer+len, ' ', 127-len); + buffer[127] = '\n'; +} + +#endif diff --git a/pfinet/linux-src/net/ipv4/icmp.c b/pfinet/linux-src/net/ipv4/icmp.c new file mode 100644 index 00000000..34b48a93 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/icmp.c @@ -0,0 +1,1155 @@ +/* + * NET3: Implementation of the ICMP protocol layer. + * + * Alan Cox, <alan@redhat.com> + * + * Version: $Id: icmp.c,v 1.52.2.2 1999/06/20 21:27:39 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Some of the function names and the icmp unreach table for this + * module were derived from [icmp.c 1.0.11 06/02/93] by + * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting. + * Other than that this module is a complete rewrite. + * + * Fixes: + * Mike Shaver : RFC1122 checks. + * Alan Cox : Multicast ping reply as self. + * Alan Cox : Fix atomicity lockup in ip_build_xmit + * call. + * Alan Cox : Added 216,128 byte paths to the MTU + * code. + * Martin Mares : RFC1812 checks. + * Martin Mares : Can be configured to follow redirects + * if acting as a router _without_ a + * routing protocol (RFC 1812). + * Martin Mares : Echo requests may be configured to + * be ignored (RFC 1812). + * Martin Mares : Limitation of ICMP error message + * transmit rate (RFC 1812). + * Martin Mares : TOS and Precedence set correctly + * (RFC 1812). + * Martin Mares : Now copying as much data from the + * original packet as we can without + * exceeding 576 bytes (RFC 1812). + * Willy Konynenberg : Transparent proxying support. + * Keith Owens : RFC1191 correction for 4.2BSD based + * path MTU bug. + * Thomas Quinot : ICMP Dest Unreach codes up to 15 are + * valid (RFC 1812). + * Andi Kleen : Check all packet lengths properly + * and moved all kfree_skb() up to + * icmp_rcv. + * Andi Kleen : Move the rate limit bookkeeping + * into the dest entry and use a token + * bucket filter (thanks to ANK). Make + * the rates sysctl configurable. 
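/*
 * Illustrative sketch, not from the original source: the weighted
 * next-hop draining scheme used by fib_select_multipath() above,
 * restated as a stand-alone user-space program.  The struct, the
 * function names and the use of rand() in place of the jiffies-based
 * draw are all inventions of this sketch.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Simplified stand-in for one nexthop of a multipath route. */
struct nexthop {
	int weight;		/* configured relative weight       */
	int power;		/* remaining credits in this round  */
};

/*
 * When all credits are spent, refill every nexthop with its weight.
 * Then draw w in [0, total_power) and walk the list subtracting
 * credits until w drops to zero or below; the nexthop reached pays
 * one credit.  Over many calls each nexthop is selected roughly in
 * proportion to its weight.
 */
static int select_nexthop(struct nexthop *nh, int n, int *total_power)
{
	int i, w;

	if (*total_power <= 0) {
		*total_power = 0;
		for (i = 0; i < n; i++) {
			nh[i].power = nh[i].weight;
			*total_power += nh[i].weight;
		}
	}

	w = rand() % *total_power;
	for (i = 0; i < n; i++) {
		if (nh[i].power > 0 && (w -= nh[i].power) <= 0) {
			nh[i].power--;
			(*total_power)--;
			return i;
		}
	}
	return 0;	/* unreachable while the credit total stays consistent */
}

int main(void)
{
	struct nexthop nh[2] = { { 3, 0 }, { 1, 0 } };
	int hits[2] = { 0, 0 }, power = 0, i;

	srand((unsigned)time(NULL));
	for (i = 0; i < 40000; i++)
		hits[select_nexthop(nh, 2, &power)]++;

	/* Expect close to a 3:1 split between the two nexthops. */
	printf("nh0=%d nh1=%d\n", hits[0], hits[1]);
	return 0;
}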
+ * Yu Tianli : Fixed two ugly bugs in icmp_send + * - IP option length was accounted wrongly + * - ICMP header length was not accounted at all. + * Tristan Greaves : Added sysctl option to ignore bogus broadcast + * responses from broken routers. + * + * To Fix: + * + * - Should use skb_pull() instead of all the manual checking. + * This would also greatly simply some upper layer error handlers. --AK + * + * RFC1122 (Host Requirements -- Comm. Layer) Status: + * (boy, are there a lot of rules for ICMP) + * 3.2.2 (Generic ICMP stuff) + * MUST discard messages of unknown type. (OK) + * MUST copy at least the first 8 bytes from the offending packet + * when sending ICMP errors. (OBSOLETE -- see RFC1812) + * MUST pass received ICMP errors up to protocol level. (OK) + * SHOULD send ICMP errors with TOS == 0. (OBSOLETE -- see RFC1812) + * MUST NOT send ICMP errors in reply to: + * ICMP errors (OK) + * Broadcast/multicast datagrams (OK) + * MAC broadcasts (OK) + * Non-initial fragments (OK) + * Datagram with a source address that isn't a single host. (OK) + * 3.2.2.1 (Destination Unreachable) + * All the rules govern the IP layer, and are dealt with in ip.c, not here. + * 3.2.2.2 (Redirect) + * Host SHOULD NOT send ICMP_REDIRECTs. (OK) + * MUST update routing table in response to host or network redirects. + * (host OK, network OBSOLETE) + * SHOULD drop redirects if they're not from directly connected gateway + * (OK -- we drop it if it's not from our old gateway, which is close + * enough) + * 3.2.2.3 (Source Quench) + * MUST pass incoming SOURCE_QUENCHs to transport layer (OK) + * Other requirements are dealt with at the transport layer. + * 3.2.2.4 (Time Exceeded) + * MUST pass TIME_EXCEEDED to transport layer (OK) + * Other requirements dealt with at IP (generating TIME_EXCEEDED). + * 3.2.2.5 (Parameter Problem) + * SHOULD generate these (OK) + * MUST pass received PARAMPROBLEM to transport layer (NOT YET) + * [Solaris 2.X seems to assert EPROTO when this occurs] -- AC + * 3.2.2.6 (Echo Request/Reply) + * MUST reply to ECHO_REQUEST, and give app to do ECHO stuff (OK, OK) + * MAY discard broadcast ECHO_REQUESTs. (Configurable with a sysctl.) + * MUST reply using same source address as the request was sent to. + * We're OK for unicast ECHOs, and it doesn't say anything about + * how to handle broadcast ones, since it's optional. + * MUST copy data from REQUEST to REPLY (OK) + * unless it would require illegal fragmentation (OK) + * MUST pass REPLYs to transport/user layer (OK) + * MUST use any provided source route (reversed) for REPLY. (NOT YET) + * 3.2.2.7 (Information Request/Reply) + * MUST NOT implement this. (I guess that means silently discard...?) (OK) + * 3.2.2.8 (Timestamp Request/Reply) + * MAY implement (OK) + * SHOULD be in-kernel for "minimum variability" (OK) + * MAY discard broadcast REQUESTs. (OK, but see source for inconsistency) + * MUST reply using same source address as the request was sent to. (OK) + * MUST reverse source route, as per ECHO (NOT YET) + * MUST pass REPLYs to transport/user layer (requires RAW, just like + * ECHO) (OK) + * MUST update clock for timestamp at least 15 times/sec (OK) + * MUST be "correct within a few minutes" (OK) + * 3.2.2.9 (Address Mask Request/Reply) + * MAY implement (OK) + * MUST send a broadcast REQUEST if using this system to set netmask + * (OK... we don't use it) + * MUST discard received REPLYs if not using this system (OK) + * MUST NOT send replies unless specifically made agent for this sort + * of thing. 
(OK) + * + * + * RFC 1812 (IPv4 Router Requirements) Status (even longer): + * 4.3.2.1 (Unknown Message Types) + * MUST pass messages of unknown type to ICMP user iface or silently discard + * them (OK) + * 4.3.2.2 (ICMP Message TTL) + * MUST initialize TTL when originating an ICMP message (OK) + * 4.3.2.3 (Original Message Header) + * SHOULD copy as much data from the offending packet as possible without + * the length of the ICMP datagram exceeding 576 bytes (OK) + * MUST leave original IP header of the offending packet, but we're not + * required to undo modifications made (OK) + * 4.3.2.4 (Original Message Source Address) + * MUST use one of addresses for the interface the orig. packet arrived as + * source address (OK) + * 4.3.2.5 (TOS and Precedence) + * SHOULD leave TOS set to the same value unless the packet would be + * discarded for that reason (OK) + * MUST use TOS=0 if not possible to leave original value (OK) + * MUST leave IP Precedence for Source Quench messages (OK -- not sent + * at all) + * SHOULD use IP Precedence = 6 (Internetwork Control) or 7 (Network Control) + * for all other error messages (OK, we use 6) + * MAY allow configuration of IP Precedence (OK -- not done) + * MUST leave IP Precedence and TOS for reply messages (OK) + * 4.3.2.6 (Source Route) + * SHOULD use reverse source route UNLESS sending Parameter Problem on source + * routing and UNLESS the packet would be immediately discarded (NOT YET) + * 4.3.2.7 (When Not to Send ICMP Errors) + * MUST NOT send ICMP errors in reply to: + * ICMP errors (OK) + * Packets failing IP header validation tests unless otherwise noted (OK) + * Broadcast/multicast datagrams (OK) + * MAC broadcasts (OK) + * Non-initial fragments (OK) + * Datagram with a source address that isn't a single host. (OK) + * 4.3.2.8 (Rate Limiting) + * SHOULD be able to limit error message rate (OK) + * SHOULD allow setting of rate limits (OK, in the source) + * 4.3.3.1 (Destination Unreachable) + * All the rules govern the IP layer, and are dealt with in ip.c, not here. + * 4.3.3.2 (Redirect) + * MAY ignore ICMP Redirects if running a routing protocol or if forwarding + * is enabled on the interface (OK -- ignores) + * 4.3.3.3 (Source Quench) + * SHOULD NOT originate SQ messages (OK) + * MUST be able to limit SQ rate if originates them (OK as we don't + * send them) + * MAY ignore SQ messages it receives (OK -- we don't) + * 4.3.3.4 (Time Exceeded) + * Requirements dealt with at IP (generating TIME_EXCEEDED). + * 4.3.3.5 (Parameter Problem) + * MUST generate these for all errors not covered by other messages (OK) + * MUST include original value of the value pointed by (OK) + * 4.3.3.6 (Echo Request) + * MUST implement echo server function (OK) + * MUST process at ER of at least max(576, MTU) (OK) + * MAY reject broadcast/multicast ER's (We don't, but that's OK) + * SHOULD have a config option for silently ignoring ER's (OK) + * MUST have a default value for the above switch = NO (OK) + * MUST have application layer interface for Echo Request/Reply (OK) + * MUST reply using same source address as the request was sent to. + * We're OK for unicast ECHOs, and it doesn't say anything about + * how to handle broadcast ones, since it's optional. + * MUST copy data from Request to Reply (OK) + * SHOULD update Record Route / Timestamp options (??) 
+ * MUST use reversed Source Route for Reply if possible (NOT YET) + * 4.3.3.7 (Information Request/Reply) + * SHOULD NOT originate or respond to these (OK) + * 4.3.3.8 (Timestamp / Timestamp Reply) + * MAY implement (OK) + * MUST reply to every Timestamp message received (OK) + * MAY discard broadcast REQUESTs. (OK, but see source for inconsistency) + * MUST reply using same source address as the request was sent to. (OK) + * MUST use reversed Source Route if possible (NOT YET) + * SHOULD update Record Route / Timestamp options (??) + * MUST pass REPLYs to transport/user layer (requires RAW, just like + * ECHO) (OK) + * MUST update clock for timestamp at least 16 times/sec (OK) + * MUST be "correct within a few minutes" (OK) + * 4.3.3.9 (Address Mask Request/Reply) + * MUST have support for receiving AMRq and responding with AMRe (OK, + * but only as a compile-time option) + * SHOULD have option for each interface for AMRe's, MUST default to + * NO (NOT YET) + * MUST NOT reply to AMRq before knows the correct AM (OK) + * MUST NOT respond to AMRq with source address 0.0.0.0 on physical + * interfaces having multiple logical i-faces with different masks + * (NOT YET) + * SHOULD examine all AMRe's it receives and check them (NOT YET) + * SHOULD log invalid AMRe's (AM+sender) (NOT YET) + * MUST NOT use contents of AMRe to determine correct AM (OK) + * MAY broadcast AMRe's after having configured address masks (OK -- doesn't) + * MUST NOT do broadcast AMRe's if not set by extra option (OK, no option) + * MUST use the { <NetPrefix>, -1 } form of broadcast addresses (OK) + * 4.3.3.10 (Router Advertisement and Solicitations) + * MUST support router part of Router Discovery Protocol on all networks we + * support broadcast or multicast addressing. (OK -- done by gated) + * MUST have all config parameters with the respective defaults (OK) + * 5.2.7.1 (Destination Unreachable) + * MUST generate DU's (OK) + * SHOULD choose a best-match response code (OK) + * SHOULD NOT generate Host Isolated codes (OK) + * SHOULD use Communication Administratively Prohibited when administratively + * filtering packets (NOT YET -- bug-to-bug compatibility) + * MAY include config option for not generating the above and silently + * discard the packets instead (OK) + * MAY include config option for not generating Precedence Violation and + * Precedence Cutoff messages (OK as we don't generate them at all) + * MUST use Host Unreachable or Dest. Host Unknown codes whenever other hosts + * on the same network might be reachable (OK -- no net unreach's at all) + * MUST use new form of Fragmentation Needed and DF Set messages (OK) + * 5.2.7.2 (Redirect) + * MUST NOT generate network redirects (OK) + * MUST be able to generate host redirects (OK) + * SHOULD be able to generate Host+TOS redirects (NO as we don't use TOS) + * MUST have an option to use Host redirects instead of Host+TOS ones (OK as + * no Host+TOS Redirects are used) + * MUST NOT generate redirects unless forwarding to the same i-face and the + * dest. address is on the same subnet as the src. address and no source + * routing is in use. 
(OK) + * MUST NOT follow redirects when using a routing protocol (OK) + * MAY use redirects if not using a routing protocol (OK, compile-time option) + * MUST comply to Host Requirements when not acting as a router (OK) + * 5.2.7.3 (Time Exceeded) + * MUST generate Time Exceeded Code 0 when discarding packet due to TTL=0 (OK) + * MAY have a per-interface option to disable origination of TE messages, but + * it MUST default to "originate" (OK -- we don't support it) + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/fcntl.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/string.h> +#include <net/snmp.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/protocol.h> +#include <net/icmp.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/raw.h> +#include <net/snmp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/init.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <net/checksum.h> + +#ifdef CONFIG_IP_MASQUERADE +#include <net/ip_masq.h> +#endif + +#define min(a,b) ((a)<(b)?(a):(b)) + +/* + * Statistics + */ + +struct icmp_mib icmp_statistics; + +/* An array of errno for error messages from dest unreach. */ +/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOS_UNREACH and SR_FAIELD MUST be considered 'transient errs'. */ + +struct icmp_err icmp_err_convert[] = { + { ENETUNREACH, 0 }, /* ICMP_NET_UNREACH */ + { EHOSTUNREACH, 0 }, /* ICMP_HOST_UNREACH */ + { ENOPROTOOPT, 1 }, /* ICMP_PROT_UNREACH */ + { ECONNREFUSED, 1 }, /* ICMP_PORT_UNREACH */ + { EMSGSIZE, 0 }, /* ICMP_FRAG_NEEDED */ + { EOPNOTSUPP, 0 }, /* ICMP_SR_FAILED */ + { ENETUNREACH, 1 }, /* ICMP_NET_UNKNOWN */ + { EHOSTDOWN, 1 }, /* ICMP_HOST_UNKNOWN */ + { ENONET, 1 }, /* ICMP_HOST_ISOLATED */ + { ENETUNREACH, 1 }, /* ICMP_NET_ANO */ + { EHOSTUNREACH, 1 }, /* ICMP_HOST_ANO */ + { ENETUNREACH, 0 }, /* ICMP_NET_UNR_TOS */ + { EHOSTUNREACH, 0 }, /* ICMP_HOST_UNR_TOS */ + { EHOSTUNREACH, 1 }, /* ICMP_PKT_FILTERED */ + { EHOSTUNREACH, 1 }, /* ICMP_PREC_VIOLATION */ + { EHOSTUNREACH, 1 } /* ICMP_PREC_CUTOFF */ +}; + +/* Control parameters for ECHO relies. */ +int sysctl_icmp_echo_ignore_all = 0; +int sysctl_icmp_echo_ignore_broadcasts = 0; + +/* Control parameter - ignore bogus broadcast responses? */ +int sysctl_icmp_ignore_bogus_error_responses =0; + +/* + * ICMP control array. This specifies what to do with each ICMP. + */ + +struct icmp_control +{ + unsigned long *output; /* Address to increment on output */ + unsigned long *input; /* Address to increment on input */ + void (*handler)(struct icmphdr *icmph, struct sk_buff *skb, int len); + short error; /* This ICMP is classed as an error message */ + int *timeout; /* Rate limit */ +}; + +static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; + +/* + * Build xmit assembly blocks + */ + +struct icmp_bxm +{ + void *data_ptr; + int data_len; + struct icmphdr icmph; + unsigned long csum; + struct ip_options replyopts; + unsigned char optbuf[40]; +}; + +/* + * The ICMP socket. This is the most convenient way to flow control + * our ICMP output as well as maintain a clean interface throughout + * all layers. All Socketless IP sends will soon be gone. + */ + +struct inode icmp_inode; +struct socket *icmp_socket=&icmp_inode.u.socket_i; + +/* + * Send an ICMP frame. + */ + +/* + * Check transmit rate limitation for given message. 
+ * The rate information is held in the destination cache now. + * This function is generic and could be used for other purposes + * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. + * + * Note that the same dst_entry fields are modified by functions in + * route.c too, but these work for packet destinations while xrlim_allow + * works for icmp destinations. This means the rate limiting information + * for one "ip object" is shared. + * + * Note that the same dst_entry fields are modified by functions in + * route.c too, but these work for packet destinations while xrlim_allow + * works for icmp destinations. This means the rate limiting information + * for one "ip object" is shared - and these ICMPs are twice limited: + * by source and by destination. + * + * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate + * SHOULD allow setting of rate limits + * + * Shared between ICMPv4 and ICMPv6. + */ +#define XRLIM_BURST_FACTOR 6 +int xrlim_allow(struct dst_entry *dst, int timeout) +{ + unsigned long now; + + now = jiffies; + dst->rate_tokens += now - dst->rate_last; + dst->rate_last = now; + if (dst->rate_tokens > XRLIM_BURST_FACTOR*timeout) + dst->rate_tokens = XRLIM_BURST_FACTOR*timeout; + if (dst->rate_tokens >= timeout) { + dst->rate_tokens -= timeout; + return 1; + } + return 0; +} + +static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code) +{ + struct dst_entry *dst = &rt->u.dst; + + if (type > NR_ICMP_TYPES || !icmp_pointers[type].timeout) + return 1; + + /* Don't limit PMTU discovery. */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + return 1; + + /* Redirect has its own rate limit mechanism */ + if (type == ICMP_REDIRECT) + return 1; + + /* No rate limit on loopback */ + if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) + return 1; + + return xrlim_allow(dst, *(icmp_pointers[type].timeout)); +} + +/* + * Maintain the counters used in the SNMP statistics for outgoing ICMP + */ + +static void icmp_out_count(int type) +{ + if (type>NR_ICMP_TYPES) + return; + (*icmp_pointers[type].output)++; + icmp_statistics.IcmpOutMsgs++; +} + +/* + * Checksum each fragment, and on the first include the headers and final checksum. + */ + +static int icmp_glue_bits(const void *p, char *to, unsigned int offset, unsigned int fraglen) +{ + struct icmp_bxm *icmp_param = (struct icmp_bxm *)p; + struct icmphdr *icmph; + unsigned long csum; + + if (offset) { + icmp_param->csum=csum_partial_copy(icmp_param->data_ptr+offset-sizeof(struct icmphdr), + to, fraglen,icmp_param->csum); + return 0; + } + + /* + * First fragment includes header. Note that we've done + * the other fragments first, so that we get the checksum + * for the whole packet here. + */ + csum = csum_partial_copy((void *)&icmp_param->icmph, + to, sizeof(struct icmphdr), + icmp_param->csum); + csum = csum_partial_copy(icmp_param->data_ptr, + to+sizeof(struct icmphdr), + fraglen-sizeof(struct icmphdr), csum); + icmph=(struct icmphdr *)to; + icmph->checksum = csum_fold(csum); + return 0; +} + +/* + * Driving logic for building and sending ICMP messages. 
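/*
 * Illustrative sketch, not from the original source: the token bucket
 * check performed by xrlim_allow() above, restated in user-space C.
 * The struct ratelimit container and the caller-supplied tick counter
 * are inventions of this sketch; in the kernel the two fields live in
 * the destination cache entry and the clock is jiffies.
 */
#include <stdio.h>

struct ratelimit {
	unsigned long tokens;	/* accumulated credit, in ticks      */
	unsigned long last;	/* tick count at the previous check  */
};

#define BURST_FACTOR 6

/*
 * Credit grows with elapsed time, is capped at BURST_FACTOR*timeout,
 * and each message sent costs `timeout' ticks.  Returns 1 if sending
 * is allowed now, 0 otherwise.
 */
static int rate_allow(struct ratelimit *rl, unsigned long now,
		      unsigned long timeout)
{
	rl->tokens += now - rl->last;
	rl->last = now;
	if (rl->tokens > BURST_FACTOR * timeout)
		rl->tokens = BURST_FACTOR * timeout;
	if (rl->tokens >= timeout) {
		rl->tokens -= timeout;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct ratelimit rl = { 0, 0 };
	unsigned long t;

	/* One attempt per tick, four ticks of credit per message: the
	 * first send happens at t=4 and then every fourth tick. */
	for (t = 0; t < 20; t++)
		printf("t=%2lu %s\n", t, rate_allow(&rl, t, 4) ? "send" : "drop");
	return 0;
}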
+ */ + +static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) +{ + struct sock *sk=icmp_socket->sk; + struct ipcm_cookie ipc; + struct rtable *rt = (struct rtable*)skb->dst; + u32 daddr; + + if (ip_options_echo(&icmp_param->replyopts, skb)) + return; + + icmp_param->icmph.checksum=0; + icmp_param->csum=0; + icmp_out_count(icmp_param->icmph.type); + + sk->ip_tos = skb->nh.iph->tos; + daddr = ipc.addr = rt->rt_src; + ipc.opt = &icmp_param->replyopts; + if (ipc.opt->srr) + daddr = icmp_param->replyopts.faddr; + if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0)) + return; + ip_build_xmit(sk, icmp_glue_bits, icmp_param, + icmp_param->data_len+sizeof(struct icmphdr), + &ipc, rt, MSG_DONTWAIT); + ip_rt_put(rt); +} + + +/* + * Send an ICMP message in response to a situation + * + * RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of header. MAY send more (we do). + * MUST NOT change this header information. + * MUST NOT reply to a multicast/broadcast IP address. + * MUST NOT reply to a multicast/broadcast MAC address. + * MUST reply to only the first fragment. + */ + +void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info) +{ + struct iphdr *iph; + struct icmphdr *icmph; + int room; + struct icmp_bxm icmp_param; + struct rtable *rt = (struct rtable*)skb_in->dst; + struct ipcm_cookie ipc; + u32 saddr; + u8 tos; + + /* + * Find the original header + */ + + iph = skb_in->nh.iph; + + /* + * No replies to physical multicast/broadcast + */ + + if (skb_in->pkt_type!=PACKET_HOST) + return; + + /* + * Now check at the protocol level + */ + if (!rt) { +#ifndef CONFIG_IP_ALWAYS_DEFRAG + if (net_ratelimit()) + printk(KERN_DEBUG "icmp_send: destinationless packet\n"); +#endif + return; + } + if (rt->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST)) + return; + + + /* + * Only reply to fragment 0. We byte re-order the constant + * mask for efficiency. + */ + + if (iph->frag_off&htons(IP_OFFSET)) + return; + + /* + * If we send an ICMP error to an ICMP error a mess would result.. + */ + + if (icmp_pointers[type].error) { + /* + * We are an error, check if we are replying to an ICMP error + */ + + if (iph->protocol==IPPROTO_ICMP) { + icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2)); + /* + * Assume any unknown ICMP type is an error. This isn't + * specified by the RFC, but think about it.. + */ + if (icmph->type>NR_ICMP_TYPES || icmp_pointers[icmph->type].error) + return; + } + } + + + /* + * Construct source address and options. + */ + +#ifdef CONFIG_IP_ROUTE_NAT + /* + * Restore original addresses if packet has been translated. + */ + if (rt->rt_flags&RTCF_NAT && IPCB(skb_in)->flags&IPSKB_TRANSLATED) { + iph->daddr = rt->key.dst; + iph->saddr = rt->key.src; + } +#endif +#ifdef CONFIG_IP_MASQUERADE + if (type==ICMP_DEST_UNREACH && IPCB(skb_in)->flags&IPSKB_MASQUERADED) { + ip_fw_unmasq_icmp(skb_in); + } +#endif + + saddr = iph->daddr; + if (!(rt->rt_flags & RTCF_LOCAL)) + saddr = 0; + + tos = icmp_pointers[type].error ? + ((iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL) : + iph->tos; + + /* XXX: use a more aggressive expire for routes created by + * this call (not longer than the rate limit timeout). + * It could be also worthwhile to not put them into ipv4 + * fast routing cache at first. Otherwise an attacker can + * grow the routing table. 
+ */ + if (ip_route_output(&rt, iph->saddr, saddr, RT_TOS(tos), 0)) + return; + + if (ip_options_echo(&icmp_param.replyopts, skb_in)) + goto ende; + + + /* + * Prepare data for ICMP header. + */ + + icmp_param.icmph.type=type; + icmp_param.icmph.code=code; + icmp_param.icmph.un.gateway = info; + icmp_param.icmph.checksum=0; + icmp_param.csum=0; + icmp_param.data_ptr=iph; + icmp_out_count(icmp_param.icmph.type); + icmp_socket->sk->ip_tos = tos; + ipc.addr = iph->saddr; + ipc.opt = &icmp_param.replyopts; + if (icmp_param.replyopts.srr) { + ip_rt_put(rt); + if (ip_route_output(&rt, icmp_param.replyopts.faddr, saddr, RT_TOS(tos), 0)) + return; + } + + if (!icmpv4_xrlim_allow(rt, type, code)) + goto ende; + + /* RFC says return as much as we can without exceeding 576 bytes. */ + + room = rt->u.dst.pmtu; + if (room > 576) + room = 576; + room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; + room -= sizeof(struct icmphdr); + + icmp_param.data_len=(iph->ihl<<2)+skb_in->len; + if (icmp_param.data_len > room) + icmp_param.data_len = room; + + ip_build_xmit(icmp_socket->sk, icmp_glue_bits, &icmp_param, + icmp_param.data_len+sizeof(struct icmphdr), + &ipc, rt, MSG_DONTWAIT); + +ende: + ip_rt_put(rt); +} + + +/* + * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. + */ + +static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) +{ + struct iphdr *iph; + int hash; + struct inet_protocol *ipprot; + unsigned char *dp; + struct sock *raw_sk; + + /* + * Incomplete header ? + * Only checks for the IP header, there should be an + * additional check for longer headers in upper levels. + */ + + if(len<sizeof(struct iphdr)) { + icmp_statistics.IcmpInErrors++; + return; + } + + iph = (struct iphdr *) (icmph + 1); + dp = (unsigned char*)iph; + + if(icmph->type==ICMP_DEST_UNREACH) { + switch(icmph->code & 15) { + case ICMP_NET_UNREACH: + break; + case ICMP_HOST_UNREACH: + break; + case ICMP_PROT_UNREACH: + break; + case ICMP_PORT_UNREACH: + break; + case ICMP_FRAG_NEEDED: + if (ipv4_config.no_pmtu_disc) { + if (net_ratelimit()) + printk(KERN_INFO "ICMP: %d.%d.%d.%d: fragmentation needed and DF set.\n", + NIPQUAD(iph->daddr)); + } else { + unsigned short new_mtu; + new_mtu = ip_rt_frag_needed(iph, ntohs(icmph->un.frag.mtu)); + if (!new_mtu) + return; + icmph->un.frag.mtu = htons(new_mtu); + } + break; + case ICMP_SR_FAILED: + if (net_ratelimit()) + printk(KERN_INFO "ICMP: %d.%d.%d.%d: Source Route Failed.\n", NIPQUAD(iph->daddr)); + break; + default: + break; + } + if (icmph->code>NR_ICMP_UNREACH) + return; + } + + /* + * Throw it at our lower layers + * + * RFC 1122: 3.2.2 MUST extract the protocol ID from the passed header. + * RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the transport layer. + * RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to transport layer. + */ + + /* + * Check the other end isnt violating RFC 1122. Some routers send + * bogus responses to broadcast frames. If you see this message + * first check your netmask matches at both ends, if it does then + * get the other vendor to fix their kit. + */ + + if (!sysctl_icmp_ignore_bogus_error_responses) + { + + if (inet_addr_type(iph->daddr) == RTN_BROADCAST) + { + if (net_ratelimit()) + printk(KERN_WARNING "%d.%d.%d.%d sent an invalid ICMP error to a broadcast.\n", + NIPQUAD(skb->nh.iph->saddr)); + return; + } + } + + /* + * Deliver ICMP message to raw sockets. Pretty useless feature? 
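/*
 * Illustrative sketch, not from the original source: the arithmetic
 * used by icmp_send() above to decide how much of the offending
 * datagram may be quoted, following the RFC 1812 rule that the ICMP
 * error must not exceed 576 bytes (or the path MTU, if smaller).
 * The function and constant names are inventions of this sketch and
 * assume a plain 20-byte IP header on the error datagram.
 */
#include <stdio.h>

#define IP_HDR_LEN	20	/* header of the ICMP error datagram itself */
#define ICMP_HDR_LEN	8

static int icmp_quote_len(int pmtu, int reply_optlen, int offending_len)
{
	int room = pmtu;

	if (room > 576)
		room = 576;
	room -= IP_HDR_LEN + reply_optlen;
	room -= ICMP_HDR_LEN;

	return offending_len < room ? offending_len : room;
}

int main(void)
{
	/* A 1400-byte offender over a 1500-byte path: quote 548 bytes. */
	printf("%d\n", icmp_quote_len(1500, 0, 1400));
	/* A 60-byte offender is quoted in full. */
	printf("%d\n", icmp_quote_len(1500, 0, 60));
	return 0;
}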
+ */ + + /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ + hash = iph->protocol & (MAX_INET_PROTOS - 1); + if ((raw_sk = raw_v4_htable[hash]) != NULL) + { + while ((raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, + iph->daddr, skb->dev->ifindex)) != NULL) { + raw_err(raw_sk, skb); + raw_sk = raw_sk->next; + } + } + + /* + * This can't change while we are doing it. + */ + + ipprot = (struct inet_protocol *) inet_protos[hash]; + while(ipprot != NULL) { + struct inet_protocol *nextip; + + nextip = (struct inet_protocol *) ipprot->next; + + /* + * Pass it off to everyone who wants it. + */ + + /* RFC1122: OK. Passes appropriate ICMP errors to the */ + /* appropriate protocol layer (MUST), as per 3.2.2. */ + + if (iph->protocol == ipprot->protocol && ipprot->err_handler) + ipprot->err_handler(skb, dp, len); + + ipprot = nextip; + } +} + + +/* + * Handle ICMP_REDIRECT. + */ + +static void icmp_redirect(struct icmphdr *icmph, struct sk_buff *skb, int len) +{ + struct iphdr *iph; + unsigned long ip; + + if (len < sizeof(struct iphdr)) { + icmp_statistics.IcmpInErrors++; + return; + } + + /* + * Get the copied header of the packet that caused the redirect + */ + + iph = (struct iphdr *) (icmph + 1); + ip = iph->daddr; + + switch(icmph->code & 7) { + case ICMP_REDIR_NET: + case ICMP_REDIR_NETTOS: + /* + * As per RFC recommendations now handle it as + * a host redirect. + */ + + case ICMP_REDIR_HOST: + case ICMP_REDIR_HOSTTOS: + ip_rt_redirect(skb->nh.iph->saddr, ip, icmph->un.gateway, iph->saddr, iph->tos, skb->dev); + break; + default: + break; + } +} + +/* + * Handle ICMP_ECHO ("ping") requests. + * + * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo requests. + * RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be included in the reply. + * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring echo requests, MUST have default=NOT. + * See also WRT handling of options once they are done and working. + */ + +static void icmp_echo(struct icmphdr *icmph, struct sk_buff *skb, int len) +{ + if (!sysctl_icmp_echo_ignore_all) { + struct icmp_bxm icmp_param; + + icmp_param.icmph=*icmph; + icmp_param.icmph.type=ICMP_ECHOREPLY; + icmp_param.data_ptr=(icmph+1); + icmp_param.data_len=len; + icmp_reply(&icmp_param, skb); + } +} + +/* + * Handle ICMP Timestamp requests. + * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests. + * SHOULD be in the kernel for minimum random latency. + * MUST be accurate to a few minutes. + * MUST be updated at least at 15Hz. + */ + +static void icmp_timestamp(struct icmphdr *icmph, struct sk_buff *skb, int len) +{ + struct timeval tv; + __u32 times[3]; /* So the new timestamp works on ALPHA's.. */ + struct icmp_bxm icmp_param; + + /* + * Too short. + */ + + if(len<12) { + icmp_statistics.IcmpInErrors++; + return; + } + + /* + * Fill in the current time as ms since midnight UT: + */ + + do_gettimeofday(&tv); + times[1] = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000); + times[2] = times[1]; + memcpy((void *)×[0], icmph+1, 4); /* Incoming stamp */ + icmp_param.icmph=*icmph; + icmp_param.icmph.type=ICMP_TIMESTAMPREPLY; + icmp_param.icmph.code=0; + icmp_param.data_ptr=× + icmp_param.data_len=12; + icmp_reply(&icmp_param, skb); +} + + +/* + * Handle ICMP_ADDRESS_MASK requests. (RFC950) + * + * RFC1122 (3.2.2.9). A host MUST only send replies to + * ADDRESS_MASK requests if it's been configured as an address mask + * agent. 
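/*
 * Illustrative sketch, not from the original source: the stamp format
 * produced by icmp_timestamp() above -- milliseconds since midnight UT
 * in network byte order.  clock_gettime(CLOCK_REALTIME) stands in for
 * the kernel's do_gettimeofday(); the function name is an invention of
 * this sketch.
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <arpa/inet.h>		/* htonl(), ntohl() */

static uint32_t icmp_timestamp_now(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_REALTIME, &ts);
	return htonl((uint32_t)((ts.tv_sec % 86400) * 1000
				+ ts.tv_nsec / 1000000));
}

int main(void)
{
	uint32_t stamp = icmp_timestamp_now();

	printf("receive/transmit stamp: %u ms since midnight UT\n",
	       (unsigned)ntohl(stamp));
	return 0;
}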
Receiving a request doesn't constitute implicit permission to + * act as one. Of course, implementing this correctly requires (SHOULD) + * a way to turn the functionality on and off. Another one for sysctl(), + * I guess. -- MS + * + * RFC1812 (4.3.3.9). A router MUST implement it. + * A router SHOULD have switch turning it on/off. + * This switch MUST be ON by default. + * + * Gratuitous replies, zero-source replies are not implemented, + * that complies with RFC. DO NOT implement them!!! All the idea + * of broadcast addrmask replies as specified in RFC950 is broken. + * The problem is that it is not uncommon to have several prefixes + * on one physical interface. Moreover, addrmask agent can even be + * not aware of existing another prefixes. + * If source is zero, addrmask agent cannot choose correct prefix. + * Gratuitous mask announcements suffer from the same problem. + * RFC1812 explains it, but still allows to use ADDRMASK, + * that is pretty silly. --ANK + * + * All these rules are so bizarre, that I removed kernel addrmask + * support at all. It is wrong, it is obsolete, nobody uses it in + * any case. --ANK + * + * Furthermore you can do it with a usermode address agent program + * anyway... + */ + +static void icmp_address(struct icmphdr *icmph, struct sk_buff *skb, int len) +{ +#if 0 + if (net_ratelimit()) + printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n"); +#endif +} + +/* + * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain + * loudly if an inconsistency is found. + */ + +static void icmp_address_reply(struct icmphdr *icmph, struct sk_buff *skb, int len) +{ + struct rtable *rt = (struct rtable*)skb->dst; + struct device *dev = skb->dev; + struct in_device *in_dev = dev->ip_ptr; + struct in_ifaddr *ifa; + u32 mask; + + if (!in_dev || !in_dev->ifa_list || + !IN_DEV_LOG_MARTIANS(in_dev) || + !IN_DEV_FORWARD(in_dev) || + len < 4 || + !(rt->rt_flags&RTCF_DIRECTSRC)) + return; + + mask = *(u32*)&icmph[1]; + for (ifa=in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + if (mask == ifa->ifa_mask && inet_ifa_match(rt->rt_src, ifa)) + return; + } + if (net_ratelimit()) + printk(KERN_INFO "Wrong address mask %08lX from %08lX/%s\n", + ntohl(mask), ntohl(rt->rt_src), dev->name); +} + +static void icmp_discard(struct icmphdr *icmph, struct sk_buff *skb, int len) +{ +} + +#ifdef CONFIG_IP_TRANSPARENT_PROXY +/* + * Check incoming icmp packets not addressed locally, to check whether + * they relate to a (proxying) socket on our system. + * Needed for transparent proxying. + * + * This code is presently ugly and needs cleanup. + * Probably should add a chkaddr entry to ipprot to call a chk routine + * in udp.c or tcp.c... + */ + +/* This should work with the new hashes now. 
-DaveM */ +extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif); +extern struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif); + +int icmp_chkaddr(struct sk_buff *skb) +{ + struct icmphdr *icmph=(struct icmphdr *)(skb->nh.raw + skb->nh.iph->ihl*4); + struct iphdr *iph = (struct iphdr *) (icmph + 1); + void (*handler)(struct icmphdr *icmph, struct sk_buff *skb, int len) = icmp_pointers[icmph->type].handler; + + if (handler == icmp_unreach || handler == icmp_redirect) { + struct sock *sk; + + switch (iph->protocol) { + case IPPROTO_TCP: + { + struct tcphdr *th = (struct tcphdr *)(((unsigned char *)iph)+(iph->ihl<<2)); + + sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex); + if (!sk || (sk->state == TCP_LISTEN)) + return 0; + /* + * This packet came from us. + */ + return 1; + } + case IPPROTO_UDP: + { + struct udphdr *uh = (struct udphdr *)(((unsigned char *)iph)+(iph->ihl<<2)); + + sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex); + if (!sk) return 0; + if (sk->saddr != iph->saddr && inet_addr_type(iph->saddr) != RTN_LOCAL) + return 0; + /* + * This packet may have come from us. + * Assume it did. + */ + return 1; + } + } + } + return 0; +} + +#endif + +/* + * Deal with incoming ICMP packets. + */ + +int icmp_rcv(struct sk_buff *skb, unsigned short len) +{ + struct icmphdr *icmph = skb->h.icmph; + struct rtable *rt = (struct rtable*)skb->dst; + + icmp_statistics.IcmpInMsgs++; + + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently discarded. + */ + if(len < sizeof(struct icmphdr) || + ip_compute_csum((unsigned char *) icmph, len) || + icmph->type > NR_ICMP_TYPES) + goto error; + + /* + * Parse the ICMP message + */ + + if (rt->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST)) { + /* + * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be + * silently ignored (we let user decide with a sysctl). + * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently + * discarded if to broadcast/multicast. + */ + if (icmph->type == ICMP_ECHO && + sysctl_icmp_echo_ignore_broadcasts) { + goto error; + } + if (icmph->type != ICMP_ECHO && + icmph->type != ICMP_TIMESTAMP && + icmph->type != ICMP_ADDRESS && + icmph->type != ICMP_ADDRESSREPLY) { + goto error; + } + } + + len -= sizeof(struct icmphdr); + (*icmp_pointers[icmph->type].input)++; + (icmp_pointers[icmph->type].handler)(icmph, skb, len); + +drop: + kfree_skb(skb); + return 0; +error: + icmp_statistics.IcmpInErrors++; + goto drop; +} + +/* + * A spare long used to speed up statistics updating + */ + +static unsigned long dummy; + +/* + * Configurable rate limits. + * Someone should check if these default values are correct. + * Note that these values interact with the routing cache GC timeout. + * If you chose them too high they won't take effect, because the + * dst_entry gets expired too early. The same should happen when + * the cache grows too big. + */ +int sysctl_icmp_destunreach_time = 1*HZ; +int sysctl_icmp_timeexceed_time = 1*HZ; +int sysctl_icmp_paramprob_time = 1*HZ; +int sysctl_icmp_echoreply_time = 0; /* don't limit it per default. */ + +/* + * This table is the definition of how we handle ICMP. 
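/*
 * Illustrative sketch, not from the original source: a portable RFC
 * 1071 style one's-complement checksum, equivalent in spirit to the
 * ip_compute_csum() verification done by icmp_rcv() above, but not the
 * kernel's optimized implementation.  The function name is an
 * invention of this sketch.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static uint16_t inet_checksum(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint32_t sum = 0;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)				/* trailing odd byte */
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)			/* fold the carries */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* ICMP echo request header with the checksum field zeroed. */
	uint8_t msg[8] = { 8, 0, 0x00, 0x00, 0x12, 0x34, 0x00, 0x01 };
	uint16_t csum = inet_checksum(msg, sizeof(msg));

	msg[2] = csum >> 8;			/* store big-endian */
	msg[3] = csum & 0xff;
	printf("checksum %04x, verify %04x (0 means intact)\n",
	       csum, inet_checksum(msg, sizeof(msg)));
	return 0;
}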
+ */ + +static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1] = { +/* ECHO REPLY (0) */ + { &icmp_statistics.IcmpOutEchoReps, &icmp_statistics.IcmpInEchoReps, icmp_discard, 0, &sysctl_icmp_echoreply_time}, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, +/* DEST UNREACH (3) */ + { &icmp_statistics.IcmpOutDestUnreachs, &icmp_statistics.IcmpInDestUnreachs, icmp_unreach, 1, &sysctl_icmp_destunreach_time }, +/* SOURCE QUENCH (4) */ + { &icmp_statistics.IcmpOutSrcQuenchs, &icmp_statistics.IcmpInSrcQuenchs, icmp_unreach, 1, }, +/* REDIRECT (5) */ + { &icmp_statistics.IcmpOutRedirects, &icmp_statistics.IcmpInRedirects, icmp_redirect, 1, }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, +/* ECHO (8) */ + { &icmp_statistics.IcmpOutEchos, &icmp_statistics.IcmpInEchos, icmp_echo, 0, }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, +/* TIME EXCEEDED (11) */ + { &icmp_statistics.IcmpOutTimeExcds, &icmp_statistics.IcmpInTimeExcds, icmp_unreach, 1, &sysctl_icmp_timeexceed_time }, +/* PARAMETER PROBLEM (12) */ + { &icmp_statistics.IcmpOutParmProbs, &icmp_statistics.IcmpInParmProbs, icmp_unreach, 1, &sysctl_icmp_paramprob_time }, +/* TIMESTAMP (13) */ + { &icmp_statistics.IcmpOutTimestamps, &icmp_statistics.IcmpInTimestamps, icmp_timestamp, 0, }, +/* TIMESTAMP REPLY (14) */ + { &icmp_statistics.IcmpOutTimestampReps, &icmp_statistics.IcmpInTimestampReps, icmp_discard, 0, }, +/* INFO (15) */ + { &dummy, &dummy, icmp_discard, 0, }, +/* INFO REPLY (16) */ + { &dummy, &dummy, icmp_discard, 0, }, +/* ADDR MASK (17) */ + { &icmp_statistics.IcmpOutAddrMasks, &icmp_statistics.IcmpInAddrMasks, icmp_address, 0, }, +/* ADDR MASK REPLY (18) */ + { &icmp_statistics.IcmpOutAddrMaskReps, &icmp_statistics.IcmpInAddrMaskReps, icmp_address_reply, 0, } +}; + +__initfunc(void icmp_init(struct net_proto_family *ops)) +{ + int err; + + icmp_inode.i_mode = S_IFSOCK; + icmp_inode.i_sock = 1; + icmp_inode.i_uid = 0; + icmp_inode.i_gid = 0; + + icmp_socket->inode = &icmp_inode; + icmp_socket->state = SS_UNCONNECTED; + icmp_socket->type=SOCK_RAW; + + if ((err=ops->create(icmp_socket, IPPROTO_ICMP))<0) + panic("Failed to create the ICMP control socket.\n"); + icmp_socket->sk->allocation=GFP_ATOMIC; + icmp_socket->sk->num = 256; /* Don't receive any data */ + icmp_socket->sk->ip_ttl = MAXTTL; +} diff --git a/pfinet/linux-src/net/ipv4/igmp.c b/pfinet/linux-src/net/ipv4/igmp.c new file mode 100644 index 00000000..934e8601 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/igmp.c @@ -0,0 +1,698 @@ +/* + * Linux NET3: Internet Group Management Protocol [IGMP] + * + * This code implements the IGMP protocol as defined in RFC1112. There has + * been a further revision of this protocol since which is now supported. + * + * If you have trouble with this module be careful what gcc you have used, + * the older version didn't come out right using gcc 2.5.8, the newer one + * seems to fall out with gcc 2.6.2. + * + * Version: $Id: igmp.c,v 1.30.2.1 1999/07/23 15:29:22 davem Exp $ + * + * Authors: + * Alan Cox <Alan.Cox@linux.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
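/*
 * Illustrative sketch, not from the original source: the pattern
 * behind the icmp_pointers[] table above -- a handler table indexed
 * directly by message type, with a range check before indexing so
 * unknown types are silently discarded.  The types, names and the
 * three trimmed entries are inventions of this sketch.
 */
#include <stdio.h>

#define NR_TYPES 2	/* highest known message type in this sketch */

struct msg_control {
	void (*handler)(int type);
	int error;		/* 1 if this type is an error message */
};

static void handle_echo(int type)    { printf("echo, type %d\n", type); }
static void handle_discard(int type) { printf("discard, type %d\n", type); }

/* Indexed directly by message type; every slot must be filled. */
static const struct msg_control pointers[NR_TYPES + 1] = {
	{ handle_discard, 0 },		/* 0: echo reply */
	{ handle_discard, 1 },		/* 1: reserved   */
	{ handle_echo,    0 },		/* 2: echo       */
};

static void rcv(int type)
{
	if (type < 0 || type > NR_TYPES)	/* unknown: drop silently */
		return;
	pointers[type].handler(type);
}

int main(void)
{
	rcv(2);		/* dispatched to handle_echo */
	rcv(7);		/* out of range, discarded   */
	return 0;
}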
+ * + * Fixes: + * + * Alan Cox : Added lots of __inline__ to optimise + * the memory usage of all the tiny little + * functions. + * Alan Cox : Dumped the header building experiment. + * Alan Cox : Minor tweaks ready for multicast routing + * and extended IGMP protocol. + * Alan Cox : Removed a load of inline directives. Gcc 2.5.8 + * writes utterly bogus code otherwise (sigh) + * fixed IGMP loopback to behave in the manner + * desired by mrouted, fixed the fact it has been + * broken since 1.3.6 and cleaned up a few minor + * points. + * + * Chih-Jen Chang : Tried to revise IGMP to Version 2 + * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu + * The enhancements are mainly based on Steve Deering's + * ipmulti-3.5 source code. + * Chih-Jen Chang : Added the igmp_get_mrouter_info and + * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of + * the mrouted version on that device. + * Chih-Jen Chang : Added the max_resp_time parameter to + * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter + * to identify the multicast router version + * and do what the IGMP version 2 specified. + * Chih-Jen Chang : Added a timer to revert to IGMP V2 router + * Tsu-Sheng Tsao if the specified time expired. + * Alan Cox : Stop IGMP from 0.0.0.0 being accepted. + * Alan Cox : Use GFP_ATOMIC in the right places. + * Christian Daudt : igmp timer wasn't set for local group + * memberships but was being deleted, + * which caused a "del_timer() called + * from %p with timer not initialized\n" + * message (960131). + * Christian Daudt : removed del_timer from + * igmp_timer_expire function (960205). + * Christian Daudt : igmp_heard_report now only calls + * igmp_timer_expire if tm->running is + * true (960216). + * Malcolm Beattie : ttl comparison wrong in igmp_rcv made + * igmp_heard_query never trigger. Expiry + * miscalculation fixed in igmp_heard_query + * and random() made to return unsigned to + * prevent negative expiry times. + * Alexey Kuznetsov: Wrong group leaving behaviour, backport + * fix from pending 2.1.x patches. + * Alan Cox: Forget to enable FDDI support earlier. + * Alexey Kuznetsov: Fixed leaving groups on device down. + * Alexey Kuznetsov: Accordance to igmp-v2-06 draft. + */ + + +#include <linux/config.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/if_arp.h> +#include <linux/rtnetlink.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/sock.h> +#include <net/checksum.h> +#ifdef CONFIG_IP_MROUTE +#include <linux/mroute.h> +#endif + +#define IP_MAX_MEMBERSHIPS 20 + +#ifdef CONFIG_IP_MULTICAST + +/* Parameter names and values are taken from igmp-v2-06 draft */ + +#define IGMP_V1_Router_Present_Timeout (400*HZ) +#define IGMP_Unsolicited_Report_Interval (10*HZ) +#define IGMP_Query_Response_Interval (10*HZ) +#define IGMP_Unsolicited_Report_Count 2 + + +#define IGMP_Initial_Report_Delay (1*HZ) + +/* IGMP_Initial_Report_Delay is not from IGMP specs! + * IGMP specs require to report membership immediately after + * joining a group, but we delay the first report by a + * small interval. It seems more natural and still does not + * contradict to specs provided this delay is small enough. 
+ */ + +#define IGMP_V1_SEEN(in_dev) ((in_dev)->mr_v1_seen && (long)(jiffies - (in_dev)->mr_v1_seen) < 0) + +/* + * Timer management + */ + +static __inline__ void igmp_stop_timer(struct ip_mc_list *im) +{ + if (im->tm_running) { + del_timer(&im->timer); + im->tm_running=0; + } +} + +static __inline__ void igmp_start_timer(struct ip_mc_list *im, int max_delay) +{ + int tv; + if (im->tm_running) + return; + tv=net_random() % max_delay; + im->timer.expires=jiffies+tv+2; + im->tm_running=1; + add_timer(&im->timer); +} + +/* + * Send an IGMP report. + */ + +#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4) + +static int igmp_send_report(struct device *dev, u32 group, int type) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct igmphdr *ih; + struct rtable *rt; + u32 dst; + + /* According to IGMPv2 specs, LEAVE messages are + * sent to all-routers group. + */ + dst = group; + if (type == IGMP_HOST_LEAVE_MESSAGE) + dst = IGMP_ALL_ROUTER; + + if (ip_route_output(&rt, dst, 0, 0, dev->ifindex)) + return -1; + if (rt->rt_src == 0) { + ip_rt_put(rt); + return -1; + } + + skb=alloc_skb(IGMP_SIZE+dev->hard_header_len+15, GFP_ATOMIC); + if (skb == NULL) { + ip_rt_put(rt); + return -1; + } + + skb->dst = &rt->u.dst; + + skb_reserve(skb, (dev->hard_header_len+15)&~15); + + skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4); + + iph->version = 4; + iph->ihl = (sizeof(struct iphdr)+4)>>2; + iph->tos = 0; + iph->frag_off = 0; + iph->ttl = 1; + iph->daddr = dst; + iph->saddr = rt->rt_src; + iph->protocol = IPPROTO_IGMP; + iph->tot_len = htons(IGMP_SIZE); + iph->id = htons(ip_id_count++); + ((u8*)&iph[1])[0] = IPOPT_RA; + ((u8*)&iph[1])[1] = 4; + ((u8*)&iph[1])[2] = 0; + ((u8*)&iph[1])[3] = 0; + ip_send_check(iph); + + ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); + ih->type=type; + ih->code=0; + ih->csum=0; + ih->group=group; + ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr)); + + return skb->dst->output(skb); +} + + +static void igmp_timer_expire(unsigned long data) +{ + struct ip_mc_list *im=(struct ip_mc_list *)data; + struct in_device *in_dev = im->interface; + int err; + + im->tm_running=0; + + if (IGMP_V1_SEEN(in_dev)) + err = igmp_send_report(in_dev->dev, im->multiaddr, IGMP_HOST_MEMBERSHIP_REPORT); + else + err = igmp_send_report(in_dev->dev, im->multiaddr, IGMP_HOST_NEW_MEMBERSHIP_REPORT); + + /* Failed. Retry later. */ + if (err) { + igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); + return; + } + + if (im->unsolicit_count) { + im->unsolicit_count--; + igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); + } + im->reporter = 1; +} + +static void igmp_heard_report(struct in_device *in_dev, u32 group) +{ + struct ip_mc_list *im; + + /* Timers are only set for non-local groups */ + + if (group == IGMP_ALL_HOSTS) + return; + + for (im=in_dev->mc_list; im!=NULL; im=im->next) { + if (im->multiaddr == group) { + igmp_stop_timer(im); + im->reporter = 0; + im->unsolicit_count = 0; + return; + } + } +} + +static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_time, + u32 group) +{ + struct ip_mc_list *im; + int max_delay; + + max_delay = max_resp_time*(HZ/IGMP_TIMER_SCALE); + + if (max_resp_time == 0) { + /* Alas, old v1 router presents here. 
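/*
 * Illustrative sketch, not from the original source: the wrap-safe
 * time comparison behind the IGMP_V1_SEEN() macro above.  Computing
 * the difference of two unsigned tick values and testing its sign
 * stays correct across counter wrap-around, provided the two values
 * are less than half the counter range apart.  The 32-bit counter and
 * the function name are assumptions of this sketch.
 */
#include <stdio.h>
#include <stdint.h>

static int before(uint32_t now, uint32_t deadline)
{
	return (int32_t)(now - deadline) < 0;
}

int main(void)
{
	uint32_t now = 0xfffffff0u;		/* just before wrap */
	uint32_t deadline = now + 400;		/* wraps past zero  */

	printf("%d\n", before(now, deadline));		/* 1: still pending */
	printf("%d\n", before(now + 1000, deadline));	/* 0: expired       */
	return 0;
}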
*/ + + max_delay = IGMP_Query_Response_Interval; + in_dev->mr_v1_seen = jiffies + IGMP_V1_Router_Present_Timeout; + group = 0; + } + + /* + * - Start the timers in all of our membership records + * that the query applies to for the interface on + * which the query arrived excl. those that belong + * to a "local" group (224.0.0.X) + * - For timers already running check if they need to + * be reset. + * - Use the igmp->igmp_code field as the maximum + * delay possible + */ + for (im=in_dev->mc_list; im!=NULL; im=im->next) { + if (group && group != im->multiaddr) + continue; + if (im->multiaddr == IGMP_ALL_HOSTS) + continue; + im->unsolicit_count = 0; + if (im->tm_running && (long)(im->timer.expires-jiffies) > max_delay) + igmp_stop_timer(im); + igmp_start_timer(im, max_delay); + } +} + +int igmp_rcv(struct sk_buff *skb, unsigned short len) +{ + /* This basically follows the spec line by line -- see RFC1112 */ + struct igmphdr *ih = skb->h.igmph; + struct in_device *in_dev = skb->dev->ip_ptr; + + if (len < sizeof(struct igmphdr) || ip_compute_csum((void *)ih, len) + || in_dev==NULL) { + kfree_skb(skb); + return 0; + } + + switch (ih->type) { + case IGMP_HOST_MEMBERSHIP_QUERY: + igmp_heard_query(in_dev, ih->code, ih->group); + break; + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMP_HOST_NEW_MEMBERSHIP_REPORT: + /* Is it our report looped back? */ + if (((struct rtable*)skb->dst)->key.iif == 0) + break; + igmp_heard_report(in_dev, ih->group); + break; + case IGMP_PIM: +#ifdef CONFIG_IP_PIMSM_V1 + return pim_rcv_v1(skb, len); +#endif + case IGMP_DVMRP: + case IGMP_TRACE: + case IGMP_HOST_LEAVE_MESSAGE: + case IGMP_MTRACE: + case IGMP_MTRACE_RESP: + break; + default: + NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type)); + } + kfree_skb(skb); + return 0; +} + +#endif + + +/* + * Add a filter to a device + */ + +static void ip_mc_filter_add(struct in_device *in_dev, u32 addr) +{ + char buf[MAX_ADDR_LEN]; + struct device *dev = in_dev->dev; + + /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG. + We will get multicast token leakage, when IFF_MULTICAST + is changed. This check should be done in dev->set_multicast_list + routine. Something sort of: + if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; } + --ANK + */ + if (arp_mc_map(addr, buf, dev, 0) == 0) + dev_mc_add(dev,buf,dev->addr_len,0); +} + +/* + * Remove a filter from a device + */ + +static void ip_mc_filter_del(struct in_device *in_dev, u32 addr) +{ + char buf[MAX_ADDR_LEN]; + struct device *dev = in_dev->dev; + + if (arp_mc_map(addr, buf, dev, 0) == 0) + dev_mc_delete(dev,buf,dev->addr_len,0); +} + +static void igmp_group_dropped(struct ip_mc_list *im) +{ + if (im->loaded) { + im->loaded = 0; + ip_mc_filter_del(im->interface, im->multiaddr); + } + +#ifdef CONFIG_IP_MULTICAST + if (im->multiaddr == IGMP_ALL_HOSTS) + return; + + start_bh_atomic(); + igmp_stop_timer(im); + end_bh_atomic(); + + if (im->reporter && !IGMP_V1_SEEN(im->interface)) + igmp_send_report(im->interface->dev, im->multiaddr, IGMP_HOST_LEAVE_MESSAGE); +#endif +} + +static void igmp_group_added(struct ip_mc_list *im) +{ + if (im->loaded == 0) { + im->loaded = 1; + ip_mc_filter_add(im->interface, im->multiaddr); + } + +#ifdef CONFIG_IP_MULTICAST + if (im->multiaddr == IGMP_ALL_HOSTS) + return; + + start_bh_atomic(); + igmp_start_timer(im, IGMP_Initial_Report_Delay); + end_bh_atomic(); +#endif +} + + +/* + * Multicast list managers + */ + + +/* + * A socket has joined a multicast group on device dev. 
+ */ + +void ip_mc_inc_group(struct in_device *in_dev, u32 addr) +{ + struct ip_mc_list *i, *im; + + im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL); + + for (i=in_dev->mc_list; i; i=i->next) { + if (i->multiaddr == addr) { + i->users++; + if (im) + kfree(im); + return; + } + } + if (!im) + return; + im->users=1; + im->interface=in_dev; + im->multiaddr=addr; +#ifdef CONFIG_IP_MULTICAST + im->tm_running=0; + init_timer(&im->timer); + im->timer.data=(unsigned long)im; + im->timer.function=&igmp_timer_expire; + im->unsolicit_count = IGMP_Unsolicited_Report_Count; + im->reporter = 0; + im->loaded = 0; +#endif + im->next=in_dev->mc_list; + in_dev->mc_list=im; + igmp_group_added(im); + if (in_dev->dev->flags & IFF_UP) + ip_rt_multicast_event(in_dev); + return; +} + +/* + * A socket has left a multicast group on device dev + */ + +int ip_mc_dec_group(struct in_device *in_dev, u32 addr) +{ + struct ip_mc_list *i, **ip; + + for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { + if (i->multiaddr==addr) { + if (--i->users == 0) { + *ip = i->next; + synchronize_bh(); + + igmp_group_dropped(i); + if (in_dev->dev->flags & IFF_UP) + ip_rt_multicast_event(in_dev); + kfree_s(i, sizeof(*i)); + } + return 0; + } + } + return -ESRCH; +} + +/* Device going down */ + +void ip_mc_down(struct in_device *in_dev) +{ + struct ip_mc_list *i; + + for (i=in_dev->mc_list; i; i=i->next) + igmp_group_dropped(i); + + ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); +} + +/* Device going up */ + +void ip_mc_up(struct in_device *in_dev) +{ + struct ip_mc_list *i; + + ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); + + for (i=in_dev->mc_list; i; i=i->next) + igmp_group_added(i); +} + +/* + * Device is about to be destroyed: clean up. + */ + +void ip_mc_destroy_dev(struct in_device *in_dev) +{ + struct ip_mc_list *i; + + while ((i = in_dev->mc_list) != NULL) { + in_dev->mc_list = i->next; + igmp_group_dropped(i); + kfree_s(i, sizeof(*i)); + } +} + +static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) +{ + struct rtable *rt; + struct device *dev = NULL; + + if (imr->imr_address.s_addr) { + dev = ip_dev_find(imr->imr_address.s_addr); + if (!dev) + return NULL; + } + + if (!dev && !ip_route_output(&rt, imr->imr_multiaddr.s_addr, 0, 0, 0)) { + dev = rt->u.dst.dev; + ip_rt_put(rt); + } + if (dev) { + imr->imr_ifindex = dev->ifindex; + return dev->ip_ptr; + } + return NULL; +} + +/* + * Join a socket to a group + */ +int sysctl_igmp_max_memberships = IP_MAX_MEMBERSHIPS; + +int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) +{ + int err; + u32 addr = imr->imr_multiaddr.s_addr; + struct ip_mc_socklist *iml, *i; + struct in_device *in_dev; + int count = 0; + + if (!MULTICAST(addr)) + return -EINVAL; + + rtnl_shlock(); + + if (!imr->imr_ifindex) + in_dev = ip_mc_find_dev(imr); + else + in_dev = inetdev_by_index(imr->imr_ifindex); + + if (!in_dev) { + iml = NULL; + err = -ENODEV; + goto done; + } + + iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); + + err = -EADDRINUSE; + for (i=sk->ip_mc_list; i; i=i->next) { + if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) { + /* New style additions are reference counted */ + if (imr->imr_address.s_addr == 0) { + i->count++; + err = 0; + } + goto done; + } + count++; + } + err = -ENOBUFS; + if (iml == NULL || count >= sysctl_igmp_max_memberships) + goto done; + memcpy(&iml->multi, imr, sizeof(*imr)); + iml->next = sk->ip_mc_list; + iml->count = 1; + sk->ip_mc_list = iml; + ip_mc_inc_group(in_dev, addr); + iml = NULL; + err = 0; +done: + rtnl_shunlock(); + 
if (iml) + sock_kfree_s(sk, iml, sizeof(*iml)); + return err; +} + +/* + * Ask a socket to leave a group. + */ + +int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) +{ + struct ip_mc_socklist *iml, **imlp; + + for (imlp=&sk->ip_mc_list; (iml=*imlp)!=NULL; imlp=&iml->next) { + if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr && + iml->multi.imr_address.s_addr==imr->imr_address.s_addr && + (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) { + struct in_device *in_dev; + if (--iml->count) + return 0; + + *imlp = iml->next; + synchronize_bh(); + + in_dev = inetdev_by_index(iml->multi.imr_ifindex); + if (in_dev) + ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr); + sock_kfree_s(sk, iml, sizeof(*iml)); + return 0; + } + } + return -EADDRNOTAVAIL; +} + +/* + * A socket is closing. + */ + +void ip_mc_drop_socket(struct sock *sk) +{ + struct ip_mc_socklist *iml; + + while ((iml=sk->ip_mc_list) != NULL) { + struct in_device *in_dev; + sk->ip_mc_list = iml->next; + if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) + ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); + sock_kfree_s(sk, iml, sizeof(*iml)); + } +} + + +#ifdef CONFIG_IP_MULTICAST + +int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dummy) +{ + off_t pos=0, begin=0; + struct ip_mc_list *im; + int len=0; + struct device *dev; + + len=sprintf(buffer,"Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n"); + + for(dev = dev_base; dev; dev = dev->next) + { + struct in_device *in_dev = dev->ip_ptr; + char *querier = "NONE"; + + if (in_dev == NULL) + continue; + + querier = IGMP_V1_SEEN(in_dev) ? "V1" : "V2"; + + len+=sprintf(buffer+len,"%d\t%-10s: %5d %7s\n", + dev->ifindex, dev->name, dev->mc_count, querier); + + for (im = in_dev->mc_list; im; im = im->next) { + len+=sprintf(buffer+len, + "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", + im->multiaddr, im->users, + im->tm_running, im->timer.expires-jiffies, im->reporter); + + pos=begin+len; + if(pos<offset) + { + len=0; + begin=pos; + } + if(pos>offset+length) + goto done; + } + } +done: + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + if(len<0) + len=0; + return len; +} +#endif + diff --git a/pfinet/linux-src/net/ipv4/ip_forward.c b/pfinet/linux-src/net/ipv4/ip_forward.c new file mode 100644 index 00000000..08ebbc2f --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_forward.c @@ -0,0 +1,297 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP forwarding functionality. + * + * Version: $Id: ip_forward.c,v 1.43 1999/03/21 05:22:37 davem Exp $ + * + * Authors: see ip.c + * + * Fixes: + * Many : Split from ip.c , see ip_input.c for + * history. + * Dave Gregorich : NULL ip_rt_put fix for multicast + * routing. + * Jos Vos : Add call_out_firewall before sending, + * use output device for accounting. + * Jos Vos : Call forward firewall after routing + * (always use output device). 
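/*
 * Illustrative sketch, not from the original source: when ip_forward()
 * below lowers the TTL it calls ip_decrease_ttl(), which patches the
 * header checksum incrementally rather than recomputing it.  The
 * RFC 1624 style update shown here is the general form of that
 * adjustment, in host byte order; the toy three-word "header" and the
 * helper names are inventions of this sketch.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* One's-complement sum of 16-bit words, folded to 16 bits. */
static uint16_t ocsum(const uint16_t *w, size_t n)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < n; i++)
		sum += w[i];
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/*
 * Given the stored checksum and a single 16-bit word changing from
 * old_w to new_w, return the updated checksum without rescanning the
 * rest of the header (RFC 1624, eqn. 3).
 */
static uint16_t csum_update(uint16_t check, uint16_t old_w, uint16_t new_w)
{
	uint32_t sum = (uint16_t)~check + (uint16_t)~old_w + new_w;

	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* toy header: version/ihl word, checksum slot, ttl|protocol */
	uint16_t hdr[3] = { 0x4500, 0x0000, 0x4006 };	/* ttl 64, tcp */
	uint16_t old_w;

	hdr[1] = (uint16_t)~ocsum(hdr, 3);		/* fill checksum */

	old_w = hdr[2];
	hdr[2] -= 0x0100;				/* ttl 64 -> 63  */
	hdr[1] = csum_update(hdr[1], old_w, hdr[2]);

	/* A correct header sums to 0xffff, i.e. verifies to zero. */
	printf("%s\n", ocsum(hdr, 3) == 0xffff ? "checksum OK" : "BAD");
	return 0;
}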
+ * Mike McLagan : Routing by source + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netdevice.h> +#include <net/sock.h> +#include <net/ip.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/icmp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/firewall.h> +#include <linux/ip_fw.h> +#ifdef CONFIG_IP_MASQUERADE +#include <net/ip_masq.h> +#endif +#include <net/checksum.h> +#include <linux/route.h> +#include <net/route.h> + +#ifdef CONFIG_IP_TRANSPARENT_PROXY +/* + * Check the packet against our socket administration to see + * if it is related to a connection on our system. + * Needed for transparent proxying. + */ + +int ip_chksock(struct sk_buff *skb) +{ + switch (skb->nh.iph->protocol) { + case IPPROTO_ICMP: + return icmp_chkaddr(skb); + case IPPROTO_TCP: + return tcp_chkaddr(skb); + case IPPROTO_UDP: + return udp_chkaddr(skb); + default: + return 0; + } +} +#endif + + +int ip_forward(struct sk_buff *skb) +{ + struct device *dev2; /* Output device */ + struct iphdr *iph; /* Our header */ + struct rtable *rt; /* Route we use */ + struct ip_options * opt = &(IPCB(skb)->opt); + unsigned short mtu; +#if defined(CONFIG_FIREWALL) || defined(CONFIG_IP_MASQUERADE) + int fw_res = 0; +#endif + + if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb)) + return 0; + + if (skb->pkt_type != PACKET_HOST) + goto drop; + + /* + * According to the RFC, we must first decrease the TTL field. If + * that reaches zero, we must reply an ICMP control message telling + * that the packet's lifetime expired. + */ + + iph = skb->nh.iph; + rt = (struct rtable*)skb->dst; + +#ifdef CONFIG_CPU_IS_SLOW + if (net_cpu_congestion > 1 && !(iph->tos&IPTOS_RELIABILITY) && + IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) { + if (((xtime.tv_usec&0xF)<<net_cpu_congestion) > 0x1C) + goto drop; + } +#endif + + +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (ip_chksock(skb)) + goto local_pkt; +#endif + + if (iph->ttl <= 1) + goto too_many_hops; + + if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + goto sr_failed; + + /* + * Having picked a route we can now send the frame out + * after asking the firewall permission to do so. + */ + + skb->priority = rt_tos2priority(iph->tos); + dev2 = rt->u.dst.dev; + mtu = rt->u.dst.pmtu; + +#ifdef CONFIG_NET_SECURITY + call_fw_firewall(PF_SECURITY, dev2, NULL, &mtu, NULL); +#endif + + /* + * We now generate an ICMP HOST REDIRECT giving the route + * we calculated. + */ + if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr) + ip_rt_send_redirect(skb); + + /* We are about to mangle packet. Copy it! */ + if ((skb = skb_cow(skb, dev2->hard_header_len)) == NULL) + return -1; + iph = skb->nh.iph; + opt = &(IPCB(skb)->opt); + + /* Decrease ttl after skb cow done */ + ip_decrease_ttl(iph); + + /* + * We now may allocate a new buffer, and copy the datagram into it. + * If the indicated interface is up and running, kick it. + */ + + if (skb->len > mtu && (ntohs(iph->frag_off) & IP_DF)) + goto frag_needed; + +#ifdef CONFIG_IP_ROUTE_NAT + if (rt->rt_flags & RTCF_NAT) { + if (ip_do_nat(skb)) { + kfree_skb(skb); + return -1; + } + } +#endif + +#ifdef CONFIG_IP_MASQUERADE + if(!(IPCB(skb)->flags&IPSKB_MASQUERADED)) { + /* + * Check that any ICMP packets are not for a + * masqueraded connection. 
If so rewrite them + * and skip the firewall checks + */ + if (iph->protocol == IPPROTO_ICMP) { + __u32 maddr; +#ifdef CONFIG_IP_MASQUERADE_ICMP + struct icmphdr *icmph = (struct icmphdr *)((char*)iph + (iph->ihl << 2)); + if ((icmph->type==ICMP_DEST_UNREACH)|| + (icmph->type==ICMP_SOURCE_QUENCH)|| + (icmph->type==ICMP_TIME_EXCEEDED)) + { +#endif + maddr = inet_select_addr(dev2, rt->rt_gateway, RT_SCOPE_UNIVERSE); + fw_res = ip_fw_masq_icmp(&skb, maddr); + if (fw_res < 0) { + kfree_skb(skb); + return -1; + } + + if (fw_res) + /* ICMP matched - skip firewall */ + goto skip_call_fw_firewall; +#ifdef CONFIG_IP_MASQUERADE_ICMP + } +#endif + } + if (rt->rt_flags&RTCF_MASQ) + goto skip_call_fw_firewall; +#endif /* CONFIG_IP_MASQUERADE */ + +#ifdef CONFIG_FIREWALL + fw_res=call_fw_firewall(PF_INET, dev2, iph, NULL, &skb); + switch (fw_res) { + case FW_ACCEPT: + case FW_MASQUERADE: + break; + case FW_REJECT: + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + /* fall thru */ + default: + kfree_skb(skb); + return -1; + } +#endif + +#ifdef CONFIG_IP_MASQUERADE + } + +skip_call_fw_firewall: + /* + * If this fragment needs masquerading, make it so... + * (Don't masquerade de-masqueraded fragments) + */ + if (!(IPCB(skb)->flags&IPSKB_MASQUERADED) && + (fw_res==FW_MASQUERADE || rt->rt_flags&RTCF_MASQ)) { + u32 maddr; + +#ifdef CONFIG_IP_ROUTE_NAT + maddr = (rt->rt_flags&RTCF_MASQ) ? rt->rt_src_map : 0; + + if (maddr == 0) +#endif + maddr = inet_select_addr(dev2, rt->rt_gateway, RT_SCOPE_UNIVERSE); + + if (ip_fw_masquerade(&skb, maddr) < 0) { + kfree_skb(skb); + return -1; + } else { + /* + * Masquerader may have changed skb + */ + iph = skb->nh.iph; + opt = &(IPCB(skb)->opt); + } + } +#endif + + +#ifdef CONFIG_FIREWALL + if ((fw_res = call_out_firewall(PF_INET, dev2, iph, NULL,&skb)) < FW_ACCEPT) { + /* FW_ACCEPT and FW_MASQUERADE are treated equal: + masquerading is only supported via forward rules */ + if (fw_res == FW_REJECT) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + kfree_skb(skb); + return -1; + } +#endif + + ip_statistics.IpForwDatagrams++; + + if (opt->optlen == 0) { +#ifdef CONFIG_NET_FASTROUTE + if (rt->rt_flags&RTCF_FAST && !netdev_fastroute_obstacles) { + unsigned h = ((*(u8*)&rt->key.dst)^(*(u8*)&rt->key.src))&NETDEV_FASTROUTE_HMASK; + /* Time to switch to functional programming :-) */ + dst_release_irqwait(xchg(&skb->dev->fastpath[h], dst_clone(&rt->u.dst))); + } +#endif + ip_send(skb); + return 0; + } + + ip_forward_options(skb); + ip_send(skb); + return 0; + +#ifdef CONFIG_IP_TRANSPARENT_PROXY +local_pkt: + return ip_local_deliver(skb); +#endif + +frag_needed: + ip_statistics.IpFragFails++; + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + goto drop; + +sr_failed: + /* + * Strict routing permits no gatewaying + */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0); + goto drop; + +too_many_hops: + /* Tell the sender its packet died... */ + icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); +drop: + kfree_skb(skb); + return -1; +} diff --git a/pfinet/linux-src/net/ipv4/ip_fragment.c b/pfinet/linux-src/net/ipv4/ip_fragment.c new file mode 100644 index 00000000..f066e607 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_fragment.c @@ -0,0 +1,593 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP fragmentation functionality. 
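+ *		(Fragments are queued per original datagram in a hash of
+ *		struct ipq entries and glued back together by ip_glue()
+ *		once ip_done() reports the datagram complete.)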
+ * + * Version: $Id: ip_fragment.c,v 1.40 1999/03/20 23:58:34 davem Exp $ + * + * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> + * Alan Cox <Alan.Cox@linux.org> + * + * Fixes: + * Alan Cox : Split from ip.c , see ip_input.c for history. + * David S. Miller : Begin massive cleanup... + * Andi Kleen : Add sysctls. + * xxxx : Overlapfrag bug. + * Ultima : ip_expire() kernel panic. + * Bill Hawes : Frag accounting and evictor fixes. + * John McDonald : 0 length frag bug. + */ + +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netdevice.h> +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/inet.h> +#include <linux/firewall.h> +#include <linux/ip_fw.h> + +/* Fragment cache limits. We will commit 256K at one time. Should we + * cross that limit we will prune down to 192K. This should cope with + * even the most extreme cases without allowing an attacker to measurably + * harm machine performance. + */ +int sysctl_ipfrag_high_thresh = 256*1024; +int sysctl_ipfrag_low_thresh = 192*1024; + +int sysctl_ipfrag_time = IP_FRAG_TIME; + +/* Describe an IP fragment. */ +struct ipfrag { + int offset; /* offset of fragment in IP datagram */ + int end; /* last byte of data in datagram */ + int len; /* length of this fragment */ + struct sk_buff *skb; /* complete received fragment */ + unsigned char *ptr; /* pointer into real fragment data */ + struct ipfrag *next; /* linked list pointers */ + struct ipfrag *prev; +}; + +/* Describe an entry in the "incomplete datagrams" queue. */ +struct ipq { + struct iphdr *iph; /* pointer to IP header */ + struct ipq *next; /* linked list pointers */ + struct ipfrag *fragments; /* linked list of received fragments */ + int len; /* total length of original datagram */ + short ihlen; /* length of the IP header */ + struct timer_list timer; /* when will this queue expire? */ + struct ipq **pprev; + struct device *dev; /* Device - for icmp replies */ +}; + +#define IPQ_HASHSZ 64 + +struct ipq *ipq_hash[IPQ_HASHSZ]; + +#define ipqhashfn(id, saddr, daddr, prot) \ + ((((id) >> 1) ^ (saddr) ^ (daddr) ^ (prot)) & (IPQ_HASHSZ - 1)) + +atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */ + +/* Memory Tracking Functions. */ +extern __inline__ void frag_kfree_skb(struct sk_buff *skb) +{ + atomic_sub(skb->truesize, &ip_frag_mem); + kfree_skb(skb); +} + +extern __inline__ void frag_kfree_s(void *ptr, int len) +{ + atomic_sub(len, &ip_frag_mem); + kfree(ptr); +} + +extern __inline__ void *frag_kmalloc(int size, int pri) +{ + void *vp = kmalloc(size, pri); + + if(!vp) + return NULL; + atomic_add(size, &ip_frag_mem); + return vp; +} + +/* Create a new fragment entry. */ +static struct ipfrag *ip_frag_create(int offset, int end, + struct sk_buff *skb, unsigned char *ptr) +{ + struct ipfrag *fp; + + fp = (struct ipfrag *) frag_kmalloc(sizeof(struct ipfrag), GFP_ATOMIC); + if (fp == NULL) + goto out_nomem; + + /* Fill in the structure. */ + fp->offset = offset; + fp->end = end; + fp->len = end - offset; + fp->skb = skb; + fp->ptr = ptr; + fp->next = fp->prev = NULL; + + /* Charge for the SKB as well. 
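+	 * (skb->truesize is added to ip_frag_mem, the counter that
+	 * ip_defrag() compares against sysctl_ipfrag_high_thresh before
+	 * running the evictor.)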
*/ + atomic_add(skb->truesize, &ip_frag_mem); + + return(fp); + +out_nomem: + NETDEBUG(printk(KERN_ERR "IP: frag_create: no memory left !\n")); + return(NULL); +} + +/* Find the correct entry in the "incomplete datagrams" queue for + * this IP datagram, and return the queue entry address if found. + */ +static inline struct ipq *ip_find(struct iphdr *iph, struct dst_entry *dst) +{ + __u16 id = iph->id; + __u32 saddr = iph->saddr; + __u32 daddr = iph->daddr; + __u8 protocol = iph->protocol; + unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); + struct ipq *qp; + + /* Always, we are in a BH context, so no locking. -DaveM */ + for(qp = ipq_hash[hash]; qp; qp = qp->next) { + if(qp->iph->id == id && + qp->iph->saddr == saddr && + qp->iph->daddr == daddr && + qp->iph->protocol == protocol) { + del_timer(&qp->timer); + break; + } + } + return qp; +} + +/* Remove an entry from the "incomplete datagrams" queue, either + * because we completed, reassembled and processed it, or because + * it timed out. + * + * This is called _only_ from BH contexts, on packet reception + * processing and from frag queue expiration timers. -DaveM + */ +static void ip_free(struct ipq *qp) +{ + struct ipfrag *fp; + + /* Stop the timer for this entry. */ + del_timer(&qp->timer); + + /* Remove this entry from the "incomplete datagrams" queue. */ + if(qp->next) + qp->next->pprev = qp->pprev; + *qp->pprev = qp->next; + + /* Release all fragment data. */ + fp = qp->fragments; + while (fp) { + struct ipfrag *xp = fp->next; + + frag_kfree_skb(fp->skb); + frag_kfree_s(fp, sizeof(struct ipfrag)); + fp = xp; + } + + /* Release the IP header. */ + frag_kfree_s(qp->iph, 64 + 8); + + /* Finally, release the queue descriptor itself. */ + frag_kfree_s(qp, sizeof(struct ipq)); +} + +/* + * Oops, a fragment queue timed out. Kill it and send an ICMP reply. + */ +static void ip_expire(unsigned long arg) +{ + struct ipq *qp = (struct ipq *) arg; + + if(!qp->fragments) + { +#ifdef IP_EXPIRE_DEBUG + printk("warning: possible ip-expire attack\n"); +#endif + goto out; + } + + /* Send an ICMP "Fragment Reassembly Timeout" message. */ + ip_statistics.IpReasmTimeout++; + ip_statistics.IpReasmFails++; + icmp_send(qp->fragments->skb, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); + +out: + /* Nuke the fragment queue. */ + ip_free(qp); +} + +/* Memory limiting on fragments. Evictor trashes the oldest + * fragment queue until we are back under the low threshold. + */ +static void ip_evictor(void) +{ + int i, progress; + +restart: + progress = 0; + /* FIXME: Make LRU queue of frag heads. -DaveM */ + for (i = 0; i < IPQ_HASHSZ; i++) { + struct ipq *qp; + if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh) + return; + /* We are in a BH context, so these queue + * accesses are safe. -DaveM + */ + qp = ipq_hash[i]; + if (qp) { + /* find the oldest queue for this hash bucket */ + while (qp->next) + qp = qp->next; + ip_free(qp); + progress = 1; + } + } + if (progress) + goto restart; + panic("ip_evictor: memcount"); +} + +/* Add an entry to the 'ipq' queue for a newly received IP datagram. + * We will (hopefully :-) receive all other fragments of this datagram + * in time, so we just create a queue for this datagram, in which we + * will insert the received fragments at their respective positions. 
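+ * (Each queue keeps a copy of the IP header plus 8 octets of data and a
+ * timer that ip_defrag() re-arms to sysctl_ipfrag_time on every fragment;
+ * ip_expire() reports a reassembly timeout if it fires first.)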
+ */ +static struct ipq *ip_create(struct sk_buff *skb, struct iphdr *iph) +{ + struct ipq *qp; + unsigned int hash; + int ihlen; + + qp = (struct ipq *) frag_kmalloc(sizeof(struct ipq), GFP_ATOMIC); + if (qp == NULL) + goto out_nomem; + + /* Allocate memory for the IP header (plus 8 octets for ICMP). */ + ihlen = iph->ihl * 4; + + qp->iph = (struct iphdr *) frag_kmalloc(64 + 8, GFP_ATOMIC); + if (qp->iph == NULL) + goto out_free; + + memcpy(qp->iph, iph, ihlen + 8); + qp->len = 0; + qp->ihlen = ihlen; + qp->fragments = NULL; + qp->dev = skb->dev; + + /* Initialize a timer for this entry. */ + init_timer(&qp->timer); + qp->timer.expires = 0; /* (to be set later) */ + qp->timer.data = (unsigned long) qp; /* pointer to queue */ + qp->timer.function = ip_expire; /* expire function */ + + /* Add this entry to the queue. */ + hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); + + /* We are in a BH context, no locking necessary. -DaveM */ + if((qp->next = ipq_hash[hash]) != NULL) + qp->next->pprev = &qp->next; + ipq_hash[hash] = qp; + qp->pprev = &ipq_hash[hash]; + + return qp; + +out_free: + frag_kfree_s(qp, sizeof(struct ipq)); +out_nomem: + NETDEBUG(printk(KERN_ERR "IP: create: no memory left !\n")); + return(NULL); +} + +/* See if a fragment queue is complete. */ +static int ip_done(struct ipq *qp) +{ + struct ipfrag *fp; + int offset; + + /* Only possible if we received the final fragment. */ + if (qp->len == 0) + return 0; + + /* Check all fragment offsets to see if they connect. */ + fp = qp->fragments; + offset = 0; + while (fp) { + if (fp->offset > offset) + return(0); /* fragment(s) missing */ + offset = fp->end; + fp = fp->next; + } + + /* All fragments are present. */ + return 1; +} + +/* Build a new IP datagram from all its fragments. + * + * FIXME: We copy here because we lack an effective way of handling lists + * of bits on input. Until the new skb data handling is in I'm not going + * to touch this with a bargepole. + */ +static struct sk_buff *ip_glue(struct ipq *qp) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct ipfrag *fp; + unsigned char *ptr; + int count, len; + + /* Allocate a new buffer for the datagram. */ + len = qp->ihlen + qp->len; + + if(len > 65535) + goto out_oversize; + + skb = dev_alloc_skb(len); + if (!skb) + goto out_nomem; + + /* Fill in the basic details. */ + skb->mac.raw = ptr = skb->data; + skb->nh.iph = iph = (struct iphdr *) skb_put(skb, len); + + /* Copy the original IP headers into the new buffer. */ + memcpy(ptr, qp->iph, qp->ihlen); + ptr += qp->ihlen; + + /* Copy the data portions of all fragments into the new buffer. */ + fp = qp->fragments; + count = qp->ihlen; + while(fp) { + if ((fp->len <= 0) || ((count + fp->len) > skb->len)) + goto out_invalid; + memcpy((ptr + fp->offset), fp->ptr, fp->len); + if (count == qp->ihlen) { + skb->dst = dst_clone(fp->skb->dst); + skb->dev = fp->skb->dev; + } + count += fp->len; + fp = fp->next; + } + + skb->pkt_type = qp->fragments->skb->pkt_type; + skb->protocol = qp->fragments->skb->protocol; + /* + * Clearly bogus, because security markings of the individual + * fragments should have been checked for consistency before + * gluing, and intermediate coalescing of fragments may have + * taken place in ip_defrag() before ip_glue() ever got called. + * If we're not going to do the consistency checking, we might + * as well take the value associated with the first fragment. + * --rct + */ + skb->security = qp->fragments->skb->security; + + /* Done with all fragments. Fixup the new IP header. 
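+	 * (frag_off is cleared and tot_len rewritten to the reassembled
+	 * length, so the result looks like an ordinary unfragmented
+	 * datagram to the rest of the stack.)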
*/ + iph = skb->nh.iph; + iph->frag_off = 0; + iph->tot_len = htons(count); + ip_statistics.IpReasmOKs++; + return skb; + +out_invalid: + NETDEBUG(printk(KERN_ERR + "Invalid fragment list: Fragment over size.\n")); + kfree_skb(skb); + goto out_fail; +out_nomem: + NETDEBUG(printk(KERN_ERR + "IP: queue_glue: no memory for gluing queue %p\n", + qp)); + goto out_fail; +out_oversize: + if (net_ratelimit()) + printk(KERN_INFO + "Oversized IP packet from %d.%d.%d.%d.\n", + NIPQUAD(qp->iph->saddr)); +out_fail: + ip_statistics.IpReasmFails++; + return NULL; +} + +/* Process an incoming IP datagram fragment. */ +struct sk_buff *ip_defrag(struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + struct ipfrag *prev, *next, *tmp, *tfp; + struct ipq *qp; + unsigned char *ptr; + int flags, offset; + int i, ihl, end; + + ip_statistics.IpReasmReqds++; + + /* Start by cleaning up the memory. */ + if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh) + ip_evictor(); + + /* + * Look for the entry for this IP datagram in the + * "incomplete datagrams" queue. If found, the + * timer is removed. + */ + qp = ip_find(iph, skb->dst); + + /* Is this a non-fragmented datagram? */ + offset = ntohs(iph->frag_off); + flags = offset & ~IP_OFFSET; + offset &= IP_OFFSET; + + offset <<= 3; /* offset is in 8-byte chunks */ + ihl = iph->ihl * 4; + + /* + * Check whether to create a fresh queue entry. If the + * queue already exists, its timer will be restarted as + * long as we continue to receive fragments. + */ + if (qp) { + /* ANK. If the first fragment is received, + * we should remember the correct IP header (with options) + */ + if (offset == 0) { + /* Fragmented frame replaced by unfragmented copy? */ + if ((flags & IP_MF) == 0) + goto out_freequeue; + qp->ihlen = ihl; + memcpy(qp->iph, iph, (ihl + 8)); + } + } else { + /* Fragmented frame replaced by unfragmented copy? */ + if ((offset == 0) && ((flags & IP_MF) == 0)) + goto out_skb; + + /* If we failed to create it, then discard the frame. */ + qp = ip_create(skb, iph); + if (!qp) + goto out_freeskb; + } + + /* Attempt to construct an oversize packet. */ + if((ntohs(iph->tot_len) + ((int) offset)) > 65535) + goto out_oversize; + + /* Determine the position of this fragment. */ + end = offset + ntohs(iph->tot_len) - ihl; + + /* Is this the final fragment? */ + if ((flags & IP_MF) == 0) + qp->len = end; + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = NULL; + for(next = qp->fragments; next != NULL; next = next->next) { + if (next->offset >= offset) + break; /* bingo! */ + prev = next; + } + + /* Point into the IP datagram 'data' part. */ + ptr = skb->data + ihl; + + /* We found where to put this one. Check for overlap with + * preceding fragment, and, if needed, align things so that + * any overlaps are eliminated. + */ + if ((prev != NULL) && (offset < prev->end)) { + i = prev->end - offset; + offset += i; /* ptr into datagram */ + ptr += i; /* ptr into fragment data */ + } + + /* Look for overlap with succeeding segments. + * If we can merge fragments, do it. + */ + for (tmp = next; tmp != NULL; tmp = tfp) { + tfp = tmp->next; + if (tmp->offset >= end) + break; /* no overlaps at all */ + + i = end - next->offset; /* overlap is 'i' bytes */ + tmp->len -= i; /* so reduce size of */ + tmp->offset += i; /* next fragment */ + tmp->ptr += i; + + /* If we get a frag size of <= 0, remove it and the packet + * that it goes with. 
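+		 * (A queued fragment completely overlapped by the new data
+		 * is unlinked and its skb freed; cf. the 0-length-fragment
+		 * fix noted in the changelog above.)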
+ */ + if (tmp->len <= 0) { + if (tmp->prev != NULL) + tmp->prev->next = tmp->next; + else + qp->fragments = tmp->next; + + if (tmp->next != NULL) + tmp->next->prev = tmp->prev; + + /* We have killed the original next frame. */ + next = tfp; + + frag_kfree_skb(tmp->skb); + frag_kfree_s(tmp, sizeof(struct ipfrag)); + } + } + + /* + * Create a fragment to hold this skb. + * No memory to save the fragment? throw the lot ... + */ + tfp = ip_frag_create(offset, end, skb, ptr); + if (!tfp) + goto out_freeskb; + + /* Insert this fragment in the chain of fragments. */ + tfp->prev = prev; + tfp->next = next; + if (prev != NULL) + prev->next = tfp; + else + qp->fragments = tfp; + + if (next != NULL) + next->prev = tfp; + + /* OK, so we inserted this new fragment into the chain. + * Check if we now have a full IP datagram which we can + * bump up to the IP layer... + */ + if (ip_done(qp)) { + /* Glue together the fragments. */ + skb = ip_glue(qp); + /* Free the queue entry. */ +out_freequeue: + ip_free(qp); +out_skb: + return skb; + } + + /* + * The queue is still active ... reset its timer. + */ +out_timer: + mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time); /* ~ 30 seconds */ +out: + return NULL; + + /* + * Error exits ... we need to reset the timer if there's a queue. + */ +out_oversize: + if (net_ratelimit()) + printk(KERN_INFO "Oversized packet received from %d.%d.%d.%d\n", + NIPQUAD(iph->saddr)); + /* the skb isn't in a fragment, so fall through to free it */ +out_freeskb: + kfree_skb(skb); + ip_statistics.IpReasmFails++; + if (qp) + goto out_timer; + goto out; +} diff --git a/pfinet/linux-src/net/ipv4/ip_fw.c b/pfinet/linux-src/net/ipv4/ip_fw.c new file mode 100644 index 00000000..99a91d53 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_fw.c @@ -0,0 +1,1759 @@ +/* + * This code is heavily based on the code on the old ip_fw.c code; see below for + * copyrights and attributions of the old code. This code is basically GPL. + * + * 15-Aug-1997: Major changes to allow graphs for firewall rules. + * Paul Russell <Paul.Russell@rustcorp.com.au> and + * Michael Neuling <Michael.Neuling@rustcorp.com.au> + * 24-Aug-1997: Generalised protocol handling (not just TCP/UDP/ICMP). + * Added explicit RETURN from chains. + * Removed TOS mangling (done in ipchains 1.0.1). + * Fixed read & reset bug by reworking proc handling. + * Paul Russell <Paul.Russell@rustcorp.com.au> + * 28-Sep-1997: Added packet marking for net sched code. + * Removed fw_via comparisons: all done on device name now, + * similar to changes in ip_fw.c in DaveM's CVS970924 tree. + * Paul Russell <Paul.Russell@rustcorp.com.au> + * 2-Nov-1997: Moved types across to __u16, etc. + * Added inverse flags. + * Fixed fragment bug (in args to port_match). + * Changed mark to only one flag (MARKABS). + * 21-Nov-1997: Added ability to test ICMP code. + * 19-Jan-1998: Added wildcard interfaces. + * 6-Feb-1998: Merged 2.0 and 2.1 versions. + * Initialised ip_masq for 2.0.x version. + * Added explicit NETLINK option for 2.1.x version. + * Added packet and byte counters for policy matches. + * 26-Feb-1998: Fixed race conditions, added SMP support. + * 18-Mar-1998: Fix SMP, fix race condition fix. + * 1-May-1998: Remove caching of device pointer. + * 12-May-1998: Allow tiny fragment case for TCP/UDP. + * 15-May-1998: Treat short packets as fragments, don't just block. + * 3-Jan-1999: Fixed serious procfs security hole -- users should never + * be allowed to view the chains! 
+ * Marc Santoro <ultima@snicker.emoti.com> + * 29-Jan-1999: Locally generated bogus IPs dealt with, rather than crash + * during dump_packet. --RR. + * 19-May-1999: Star Wars: The Phantom Menace opened. Rule num + * printed in log (modified from Michael Hasenstein's patch). + * Added SYN in log message. --RR + * 23-Jul-1999: Fixed small fragment security exposure opened on 15-May-1998. + * John McDonald <jm@dataprotect.com> + * Thomas Lopatic <tl@dataprotect.com> + */ + +/* + * + * The origina Linux port was done Alan Cox, with changes/fixes from + * Pauline Middlelink, Jos Vos, Thomas Quinot, Wouter Gadeyne, Juan + * Jose Ciarlante, Bernd Eckenfels, Keith Owens and others. + * + * Copyright from the original FreeBSD version follows: + * + * Copyright (c) 1993 Daniel Boulet + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. */ + + +#include <linux/config.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/errno.h> + +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/icmp.h> +#include <linux/udp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/sock.h> +#include <net/icmp.h> +#include <linux/netlink.h> +#include <linux/init.h> +#include <linux/firewall.h> +#include <linux/ip_fw.h> + +#ifdef CONFIG_IP_MASQUERADE +#include <net/ip_masq.h> +#endif + +#include <net/checksum.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> + +/* Understanding locking in this code: (thanks to Alan Cox for using + * little words to explain this to me). -- PR + * + * In UP, there can be two packets traversing the chains: + * 1) A packet from the current userspace context + * 2) A packet off the bh handlers (timer or net). + * + * For SMP (kernel v2.1+), multiply this by # CPUs. + * + * [Note that this in not correct for 2.2 - because the socket code always + * uses lock_kernel() to serialize, and bottom halves (timers and net_bhs) + * only run on one CPU at a time. This will probably change for 2.3. + * It is still good to use spinlocks because that avoids the global cli() + * for updating the tables, which is rather costly in SMP kernels -AK] + * + * This means counters and backchains can get corrupted if no precautions + * are taken. + * + * To actually alter a chain on UP, we need only do a cli(), as this will + * stop a bh handler firing, as we are in the current userspace context + * (coming from a setsockopt()). + * + * On SMP, we need a write_lock_irqsave(), which is a simple cli() in + * UP. + * + * For backchains and counters, we use an array, indexed by + * [cpu_number_map[smp_processor_id()]*2 + !in_interrupt()]; the array is of + * size [smp_num_cpus*2]. For v2.0, smp_num_cpus is effectively 1. So, + * confident of uniqueness, we modify counters even though we only + * have a read lock (to read the counters, you need a write lock, + * though). */ + +/* Why I didn't use straight locking... 
-- PR + * + * The backchains can be separated out of the ip_chains structure, and + * allocated as needed inside ip_fw_check(). + * + * The counters, however, can't. Trying to lock these means blocking + * interrupts every time we want to access them. This would suck HARD + * performance-wise. Not locking them leads to possible corruption, + * made worse on 32-bit machines (counters are 64-bit). */ + +/*#define DEBUG_IP_FIREWALL*/ +/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ +/*#define DEBUG_IP_FIREWALL_USER*/ +/*#define DEBUG_IP_FIREWALL_LOCKING*/ + +#ifdef CONFIG_IP_FIREWALL_NETLINK +static struct sock *ipfwsk; +#endif + +#ifdef __SMP__ +#define SLOT_NUMBER() (cpu_number_map[smp_processor_id()]*2 + !in_interrupt()) +#else +#define SLOT_NUMBER() (!in_interrupt()) +#endif +#define NUM_SLOTS (smp_num_cpus*2) + +#define SIZEOF_STRUCT_IP_CHAIN (sizeof(struct ip_chain) \ + + NUM_SLOTS*sizeof(struct ip_reent)) +#define SIZEOF_STRUCT_IP_FW_KERNEL (sizeof(struct ip_fwkernel) \ + + NUM_SLOTS*sizeof(struct ip_counters)) + +#ifdef DEBUG_IP_FIREWALL_LOCKING +static unsigned int fwc_rlocks, fwc_wlocks; +#define FWC_DEBUG_LOCK(d) \ +do { \ + FWC_DONT_HAVE_LOCK(d); \ + d |= (1 << SLOT_NUMBER()); \ +} while (0) + +#define FWC_DEBUG_UNLOCK(d) \ +do { \ + FWC_HAVE_LOCK(d); \ + d &= ~(1 << SLOT_NUMBER()); \ +} while (0) + +#define FWC_DONT_HAVE_LOCK(d) \ +do { \ + if ((d) & (1 << SLOT_NUMBER())) \ + printk("%s:%i: Got lock on %i already!\n", \ + __FILE__, __LINE__, SLOT_NUMBER()); \ +} while(0) + +#define FWC_HAVE_LOCK(d) \ +do { \ + if (!((d) & (1 << SLOT_NUMBER()))) \ + printk("%s:%i:No lock on %i!\n", \ + __FILE__, __LINE__, SLOT_NUMBER()); \ +} while (0) + +#else +#define FWC_DEBUG_LOCK(d) do { } while(0) +#define FWC_DEBUG_UNLOCK(d) do { } while(0) +#define FWC_DONT_HAVE_LOCK(d) do { } while(0) +#define FWC_HAVE_LOCK(d) do { } while(0) +#endif /*DEBUG_IP_FIRWALL_LOCKING*/ + +#define FWC_READ_LOCK(l) do { FWC_DEBUG_LOCK(fwc_rlocks); read_lock(l); } while (0) +#define FWC_WRITE_LOCK(l) do { FWC_DEBUG_LOCK(fwc_wlocks); write_lock(l); } while (0) +#define FWC_READ_LOCK_IRQ(l,f) do { FWC_DEBUG_LOCK(fwc_rlocks); read_lock_irqsave(l,f); } while (0) +#define FWC_WRITE_LOCK_IRQ(l,f) do { FWC_DEBUG_LOCK(fwc_wlocks); write_lock_irqsave(l,f); } while (0) +#define FWC_READ_UNLOCK(l) do { FWC_DEBUG_UNLOCK(fwc_rlocks); read_unlock(l); } while (0) +#define FWC_WRITE_UNLOCK(l) do { FWC_DEBUG_UNLOCK(fwc_wlocks); write_unlock(l); } while (0) +#define FWC_READ_UNLOCK_IRQ(l,f) do { FWC_DEBUG_UNLOCK(fwc_rlocks); read_unlock_irqrestore(l,f); } while (0) +#define FWC_WRITE_UNLOCK_IRQ(l,f) do { FWC_DEBUG_UNLOCK(fwc_wlocks); write_unlock_irqrestore(l,f); } while (0) + +struct ip_chain; + +struct ip_counters +{ + __u64 pcnt, bcnt; /* Packet and byte counters */ +}; + +struct ip_fwkernel +{ + struct ip_fw ipfw; + struct ip_fwkernel *next; /* where to go next if current + * rule doesn't match */ + struct ip_chain *branch; /* which branch to jump to if + * current rule matches */ + int simplebranch; /* Use this if branch == NULL */ + struct ip_counters counters[0]; /* Actually several of these */ +}; + +struct ip_reent +{ + struct ip_chain *prevchain; /* Pointer to referencing chain */ + struct ip_fwkernel *prevrule; /* Pointer to referencing rule */ + struct ip_counters counters; +}; + +struct ip_chain +{ + ip_chainlabel label; /* Defines the label for each block */ + struct ip_chain *next; /* Pointer to next block */ + struct ip_fwkernel *chain; /* Pointer to first rule in block */ + __u32 refcount; /* Number of 
refernces to block */ + int policy; /* Default rule for chain. Only * + * used in built in chains */ + struct ip_reent reent[0]; /* Actually several of these */ +}; + +/* + * Implement IP packet firewall + */ + +#ifdef DEBUG_IP_FIREWALL +#define dprintf(format, args...) printk(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_IP_FIREWALL_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +/* Lock around ip_fw_chains linked list structure */ +rwlock_t ip_fw_lock = RW_LOCK_UNLOCKED; + +/* Head of linked list of fw rules */ +static struct ip_chain *ip_fw_chains; + +#define IP_FW_INPUT_CHAIN ip_fw_chains +#define IP_FW_FORWARD_CHAIN (ip_fw_chains->next) +#define IP_FW_OUTPUT_CHAIN (ip_fw_chains->next->next) + +/* Returns 1 if the port is matched by the range, 0 otherwise */ +extern inline int port_match(__u16 min, __u16 max, __u16 port, + int frag, int invert) +{ + if (frag) /* Fragments fail ANY port test. */ + return (min == 0 && max == 0xFFFF); + else return (port >= min && port <= max) ^ invert; +} + +/* Returns whether matches rule or not. */ +static int ip_rule_match(struct ip_fwkernel *f, + const char *ifname, + struct iphdr *ip, + char tcpsyn, + __u16 src_port, __u16 dst_port, + char isfrag) +{ +#define FWINV(bool,invflg) ((bool) ^ !!(f->ipfw.fw_invflg & invflg)) + /* + * This is a bit simpler as we don't have to walk + * an interface chain as you do in BSD - same logic + * however. + */ + + if (FWINV((ip->saddr&f->ipfw.fw_smsk.s_addr) != f->ipfw.fw_src.s_addr, + IP_FW_INV_SRCIP) + || FWINV((ip->daddr&f->ipfw.fw_dmsk.s_addr)!=f->ipfw.fw_dst.s_addr, + IP_FW_INV_DSTIP)) { + dprintf("Source or dest mismatch.\n"); + + dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr, + f->ipfw.fw_smsk.s_addr, f->ipfw.fw_src.s_addr, + f->ipfw.fw_invflg & IP_FW_INV_SRCIP ? " (INV)" : ""); + dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr, + f->ipfw.fw_dmsk.s_addr, f->ipfw.fw_dst.s_addr, + f->ipfw.fw_invflg & IP_FW_INV_DSTIP ? " (INV)" : ""); + return 0; + } + + /* + * Look for a VIA device match + */ + if (f->ipfw.fw_flg & IP_FW_F_WILDIF) { + if (FWINV(strncmp(ifname, f->ipfw.fw_vianame, + strlen(f->ipfw.fw_vianame)) != 0, + IP_FW_INV_VIA)) { + dprintf("Wildcard interface mismatch.%s\n", + f->ipfw.fw_invflg & IP_FW_INV_VIA ? " (INV)" : ""); + return 0; /* Mismatch */ + } + } + else if (FWINV(strcmp(ifname, f->ipfw.fw_vianame) != 0, + IP_FW_INV_VIA)) { + dprintf("Interface name does not match.%s\n", + f->ipfw.fw_invflg & IP_FW_INV_VIA + ? " (INV)" : ""); + return 0; /* Mismatch */ + } + + /* + * Ok the chain addresses match. + */ + + /* If we have a fragment rule but the packet is not a fragment + * the we return zero */ + if (FWINV((f->ipfw.fw_flg&IP_FW_F_FRAG) && !isfrag, IP_FW_INV_FRAG)) { + dprintf("Fragment rule but not fragment.%s\n", + f->ipfw.fw_invflg & IP_FW_INV_FRAG ? " (INV)" : ""); + return 0; + } + + /* Fragment NEVER passes a SYN test, even an inverted one. */ + if (FWINV((f->ipfw.fw_flg&IP_FW_F_TCPSYN) && !tcpsyn, IP_FW_INV_SYN) + || (isfrag && (f->ipfw.fw_flg&IP_FW_F_TCPSYN))) { + dprintf("Rule requires SYN and packet has no SYN.%s\n", + f->ipfw.fw_invflg & IP_FW_INV_SYN ? " (INV)" : ""); + return 0; + } + + if (f->ipfw.fw_proto) { + /* + * Specific firewall - packet's protocol + * must match firewall's. 
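+		 * (FWINV() xors each test with the rule's IP_FW_INV_* flag,
+		 * so any of the checks below can be negated per rule.)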
+ */ + + if (FWINV(ip->protocol!=f->ipfw.fw_proto, IP_FW_INV_PROTO)) { + dprintf("Packet protocol %hi does not match %hi.%s\n", + ip->protocol, f->ipfw.fw_proto, + f->ipfw.fw_invflg&IP_FW_INV_PROTO ? " (INV)":""); + return 0; + } + + /* For non TCP/UDP/ICMP, port range is max anyway. */ + if (!port_match(f->ipfw.fw_spts[0], + f->ipfw.fw_spts[1], + src_port, isfrag, + !!(f->ipfw.fw_invflg&IP_FW_INV_SRCPT)) + || !port_match(f->ipfw.fw_dpts[0], + f->ipfw.fw_dpts[1], + dst_port, isfrag, + !!(f->ipfw.fw_invflg + &IP_FW_INV_DSTPT))) { + dprintf("Port match failed.\n"); + return 0; + } + } + + dprintf("Match succeeded.\n"); + return 1; +} + +static const char *branchname(struct ip_chain *branch,int simplebranch) +{ + if (branch) + return branch->label; + switch (simplebranch) + { + case FW_BLOCK: return IP_FW_LABEL_BLOCK; + case FW_ACCEPT: return IP_FW_LABEL_ACCEPT; + case FW_REJECT: return IP_FW_LABEL_REJECT; + case FW_REDIRECT: return IP_FW_LABEL_REDIRECT; + case FW_MASQUERADE: return IP_FW_LABEL_MASQUERADE; + case FW_SKIP: return "-"; + case FW_SKIP+1: return IP_FW_LABEL_RETURN; + default: + return "UNKNOWN"; + } +} + +/* + * VERY ugly piece of code which actually + * makes kernel printf for matching packets... + */ +static void dump_packet(const struct iphdr *ip, + const char *ifname, + struct ip_fwkernel *f, + const ip_chainlabel chainlabel, + __u16 src_port, + __u16 dst_port, + unsigned int count, + int syn) +{ + __u32 *opt = (__u32 *) (ip + 1); + int opti; + + if (f) + { + printk(KERN_INFO "Packet log: %s ",chainlabel); + + printk("%s ",branchname(f->branch,f->simplebranch)); + if (f->simplebranch==FW_REDIRECT) + printk("%d ",f->ipfw.fw_redirpt); + } + + printk("%s PROTO=%d %ld.%ld.%ld.%ld:%hu %ld.%ld.%ld.%ld:%hu" + " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu", + ifname, ip->protocol, + (ntohl(ip->saddr)>>24)&0xFF, + (ntohl(ip->saddr)>>16)&0xFF, + (ntohl(ip->saddr)>>8)&0xFF, + (ntohl(ip->saddr))&0xFF, + src_port, + (ntohl(ip->daddr)>>24)&0xFF, + (ntohl(ip->daddr)>>16)&0xFF, + (ntohl(ip->daddr)>>8)&0xFF, + (ntohl(ip->daddr))&0xFF, + dst_port, + ntohs(ip->tot_len), ip->tos, ntohs(ip->id), + ntohs(ip->frag_off), ip->ttl); + + for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++) + printk(" O=0x%8.8X", *opt++); + printk(" %s(#%d)\n", syn ? "SYN " : /* "PENANCE" */ "", count); +} + +/* function for checking chain labels for user space. */ +static int check_label(ip_chainlabel label) +{ + unsigned int i; + /* strlen must be < IP_FW_MAX_LABEL_LENGTH. */ + for (i = 0; i < IP_FW_MAX_LABEL_LENGTH + 1; i++) + if (label[i] == '\0') return 1; + + return 0; +} + +/* This function returns a pointer to the first chain with a label + * that matches the one given. */ +static struct ip_chain *find_label(ip_chainlabel label) +{ + struct ip_chain *tmp; + FWC_HAVE_LOCK(fwc_rlocks | fwc_wlocks); + for (tmp = ip_fw_chains; tmp; tmp = tmp->next) + if (strcmp(tmp->label,label) == 0) + break; + return tmp; +} + +/* This function returns a boolean which when true sets answer to one + of the FW_*. 
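+   (An empty label is a pass-through rule and yields FW_SKIP, while
+   IP_FW_LABEL_RETURN yields FW_SKIP+1, which ip_fw_check() treats as
+   falling off the end of the chain.)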
*/ +static int find_special(ip_chainlabel label, int *answer) +{ + if (label[0] == '\0') { + *answer = FW_SKIP; /* => pass-through rule */ + return 1; + } else if (strcmp(label,IP_FW_LABEL_ACCEPT) == 0) { + *answer = FW_ACCEPT; + return 1; + } else if (strcmp(label,IP_FW_LABEL_BLOCK) == 0) { + *answer = FW_BLOCK; + return 1; + } else if (strcmp(label,IP_FW_LABEL_REJECT) == 0) { + *answer = FW_REJECT; + return 1; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + } else if (strcmp(label,IP_FW_LABEL_REDIRECT) == 0) { + *answer = FW_REDIRECT; + return 1; +#endif +#ifdef CONFIG_IP_MASQUERADE + } else if (strcmp(label,IP_FW_LABEL_MASQUERADE) == 0) { + *answer = FW_MASQUERADE; + return 1; +#endif + } else if (strcmp(label, IP_FW_LABEL_RETURN) == 0) { + *answer = FW_SKIP+1; + return 1; + } else { + return 0; + } +} + +/* This function cleans up the prevchain and prevrule. If the verbose + * flag is set then he names of the chains will be printed as it + * cleans up. */ +static void cleanup(struct ip_chain *chain, + const int verbose, + unsigned int slot) +{ + struct ip_chain *tmpchain = chain->reent[slot].prevchain; + if (verbose) + printk(KERN_ERR "Chain backtrace: "); + while (tmpchain) { + if (verbose) + printk("%s<-",chain->label); + chain->reent[slot].prevchain = NULL; + chain = tmpchain; + tmpchain = chain->reent[slot].prevchain; + } + if (verbose) + printk("%s\n",chain->label); +} + +static inline int +ip_fw_domatch(struct ip_fwkernel *f, + struct iphdr *ip, + const char *rif, + const ip_chainlabel label, + struct sk_buff *skb, + unsigned int slot, + __u16 src_port, __u16 dst_port, + unsigned int count, + int tcpsyn) +{ + f->counters[slot].bcnt+=ntohs(ip->tot_len); + f->counters[slot].pcnt++; + if (f->ipfw.fw_flg & IP_FW_F_PRN) { + dump_packet(ip,rif,f,label,src_port,dst_port,count,tcpsyn); + } + ip->tos = (ip->tos & f->ipfw.fw_tosand) ^ f->ipfw.fw_tosxor; + +/* This functionality is useless in stock 2.0.x series, but we don't + * discard the mark thing altogether, to avoid breaking ipchains (and, + * more importantly, the ipfwadm wrapper) --PR */ + if (f->ipfw.fw_flg & IP_FW_F_MARKABS) + skb->fwmark = f->ipfw.fw_mark; + else + skb->fwmark+=f->ipfw.fw_mark; +#ifdef CONFIG_IP_FIREWALL_NETLINK + if (f->ipfw.fw_flg & IP_FW_F_NETLINK) { + size_t len = min(f->ipfw.fw_outputsize, ntohs(ip->tot_len)) + + sizeof(__u32) + sizeof(skb->fwmark) + IFNAMSIZ; + struct sk_buff *outskb=alloc_skb(len, GFP_ATOMIC); + + duprintf("Sending packet out NETLINK (length = %u).\n", + (unsigned int)len); + if (outskb) { + /* Prepend length, mark & interface */ + skb_put(outskb, len); + *((__u32 *)outskb->data) = (__u32)len; + *((__u32 *)(outskb->data+sizeof(__u32))) = skb->fwmark; + strcpy(outskb->data+sizeof(__u32)*2, rif); + memcpy(outskb->data+sizeof(__u32)*2+IFNAMSIZ, ip, + len-(sizeof(__u32)*2+IFNAMSIZ)); + netlink_broadcast(ipfwsk, outskb, 0, ~0, GFP_KERNEL); + } + else { + if (net_ratelimit()) + printk(KERN_WARNING "ip_fw: packet drop due to " + "netlink failure\n"); + return 0; + } + } +#endif + return 1; +} + +/* + * Returns one of the generic firewall policies, like FW_ACCEPT. + * + * The testing is either false for normal firewall mode or true for + * user checking mode (counters are not updated, TOS & mark not done). 
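+ * (In testing mode the result is mapped onto errno values by the
+ * IP_FW_CHECK case of ip_fw_ctl() further down.)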
+ */ +static int +ip_fw_check(struct iphdr *ip, + const char *rif, + __u16 *redirport, + struct ip_chain *chain, + struct sk_buff *skb, + unsigned int slot, + int testing) +{ + struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl); + struct udphdr *udp=(struct udphdr *)((__u32 *)ip+ip->ihl); + struct icmphdr *icmp=(struct icmphdr *)((__u32 *)ip+ip->ihl); + __u32 src, dst; + __u16 src_port = 0xFFFF, dst_port = 0xFFFF; + char tcpsyn=0; + __u16 offset; + unsigned char oldtos; + struct ip_fwkernel *f; + int ret = FW_SKIP+2; + unsigned int count; + + /* We handle fragments by dealing with the first fragment as + * if it was a normal packet. All other fragments are treated + * normally, except that they will NEVER match rules that ask + * things we don't know, ie. tcp syn flag or ports). If the + * rule is also a fragment-specific rule, non-fragments won't + * match it. */ + + offset = ntohs(ip->frag_off) & IP_OFFSET; + + /* + * Don't allow a fragment of TCP 8 bytes in. Nobody + * normal causes this. Its a cracker trying to break + * in by doing a flag overwrite to pass the direction + * checks. + */ + + if (offset == 1 && ip->protocol == IPPROTO_TCP) { + if (!testing && net_ratelimit()) { + printk("Suspect TCP fragment.\n"); + dump_packet(ip,rif,NULL,NULL,0,0,0,0); + } + return FW_BLOCK; + } + + /* If we can't investigate ports, treat as fragment. It's + * either a trucated whole packet, or a truncated first + * fragment, or a TCP first fragment of length 8-15, in which + * case the above rule stops reassembly. + */ + if (offset == 0) { + unsigned int size_req; + switch (ip->protocol) { + case IPPROTO_TCP: + /* Don't care about things past flags word */ + size_req = 16; + break; + + case IPPROTO_UDP: + case IPPROTO_ICMP: + size_req = 8; + break; + + default: + size_req = 0; + } + offset = (ntohs(ip->tot_len) < (ip->ihl<<2)+size_req); + + /* If it is a truncated first fragment then it can be + * used to rewrite port information, and thus should + * be blocked. + */ + if (offset && (ntohs(ip->frag_off) & IP_MF)) { + if (!testing && net_ratelimit()) { + printk("Suspect short first fragment.\n"); + dump_packet(ip,rif,NULL,NULL,0,0,0,0); + } + return FW_BLOCK; + } + } + + src = ip->saddr; + dst = ip->daddr; + oldtos = ip->tos; + + /* + * If we got interface from which packet came + * we can use the address directly. Linux 2.1 now uses address + * chains per device too, but unlike BSD we first check if the + * incoming packet matches a device address and the routing + * table before calling the firewall. + */ + + dprintf("Packet "); + switch(ip->protocol) + { + case IPPROTO_TCP: + dprintf("TCP "); + if (!offset) { + src_port=ntohs(tcp->source); + dst_port=ntohs(tcp->dest); + + /* Connection initilisation can only + * be made when the syn bit is set and + * neither of the ack or reset is + * set. 
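+			 * (Fragments never set tcpsyn here, and
+			 * ip_rule_match() refuses to match a SYN rule
+			 * against a fragment in any case.)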
*/ + if(tcp->syn && !(tcp->ack || tcp->rst)) + tcpsyn=1; + } + break; + case IPPROTO_UDP: + dprintf("UDP "); + if (!offset) { + src_port=ntohs(udp->source); + dst_port=ntohs(udp->dest); + } + break; + case IPPROTO_ICMP: + if (!offset) { + src_port=(__u16)icmp->type; + dst_port=(__u16)icmp->code; + } + dprintf("ICMP "); + break; + default: + dprintf("p=%d ",ip->protocol); + break; + } +#ifdef DEBUG_IP_FIREWALL + print_ip(ip->saddr); + + if (offset) + dprintf(":fragment (%i) ", ((int)offset)<<2); + else if (ip->protocol==IPPROTO_TCP || ip->protocol==IPPROTO_UDP + || ip->protocol==IPPROTO_ICMP) + dprintf(":%hu:%hu", src_port, dst_port); + dprintf("\n"); +#endif + + if (!testing) FWC_READ_LOCK(&ip_fw_lock); + else FWC_HAVE_LOCK(fwc_rlocks); + + f = chain->chain; + do { + count = 0; + for (; f; f = f->next) { + count++; + if (ip_rule_match(f,rif,ip, + tcpsyn,src_port,dst_port,offset)) { + if (!testing + && !ip_fw_domatch(f, ip, rif, chain->label, + skb, slot, + src_port, dst_port, + count, tcpsyn)) { + ret = FW_BLOCK; + goto out; + } + break; + } + } + if (f) { + if (f->branch) { + /* Do sanity check to see if we have + * already set prevchain and if so we + * must be in a loop */ + if (f->branch->reent[slot].prevchain) { + if (!testing) { + printk(KERN_ERR + "IP firewall: " + "Loop detected " + "at `%s'.\n", + f->branch->label); + cleanup(chain, 1, slot); + ret = FW_BLOCK; + } else { + cleanup(chain, 0, slot); + ret = FW_SKIP+1; + } + } + else { + f->branch->reent[slot].prevchain + = chain; + f->branch->reent[slot].prevrule + = f->next; + chain = f->branch; + f = chain->chain; + } + } + else if (f->simplebranch == FW_SKIP) + f = f->next; + else if (f->simplebranch == FW_SKIP+1) { + /* Just like falling off the chain */ + goto fall_off_chain; + } + else { + cleanup(chain, 0, slot); + ret = f->simplebranch; + } + } /* f == NULL */ + else { + fall_off_chain: + if (chain->reent[slot].prevchain) { + struct ip_chain *tmp = chain; + f = chain->reent[slot].prevrule; + chain = chain->reent[slot].prevchain; + tmp->reent[slot].prevchain = NULL; + } + else { + ret = chain->policy; + if (!testing) { + chain->reent[slot].counters.pcnt++; + chain->reent[slot].counters.bcnt + += ntohs(ip->tot_len); + } + } + } + } while (ret == FW_SKIP+2); + + out: + if (!testing) FWC_READ_UNLOCK(&ip_fw_lock); + + /* Recalculate checksum if not going to reject, and TOS changed. */ + if (ip->tos != oldtos + && ret != FW_REJECT && ret != FW_BLOCK + && !testing) + ip_send_check(ip); + +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (ret == FW_REDIRECT && redirport) { + if ((*redirport = htons(f->ipfw.fw_redirpt)) == 0) { + /* Wildcard redirection. + * Note that redirport will become + * 0xFFFF for non-TCP/UDP packets. + */ + *redirport = htons(dst_port); + } + } +#endif + +#ifdef DEBUG_ALLOW_ALL + return (testing ? ret : FW_ACCEPT); +#else + return ret; +#endif +} + +/* Must have write lock & interrupts off for any of these */ + +/* This function sets all the byte counters in a chain to zero. 
The + * input is a pointer to the chain required for zeroing */ +static int zero_fw_chain(struct ip_chain *chainptr) +{ + struct ip_fwkernel *i; + + FWC_HAVE_LOCK(fwc_wlocks); + for (i = chainptr->chain; i; i = i->next) + memset(i->counters, 0, sizeof(struct ip_counters)*NUM_SLOTS); + return 0; +} + +static int clear_fw_chain(struct ip_chain *chainptr) +{ + struct ip_fwkernel *i= chainptr->chain; + + FWC_HAVE_LOCK(fwc_wlocks); + chainptr->chain=NULL; + + while (i) { + struct ip_fwkernel *tmp = i->next; + if (i->branch) + i->branch->refcount--; + kfree(i); + i = tmp; + } + return 0; +} + +static int replace_in_chain(struct ip_chain *chainptr, + struct ip_fwkernel *frwl, + __u32 position) +{ + struct ip_fwkernel *f = chainptr->chain; + + FWC_HAVE_LOCK(fwc_wlocks); + + while (--position && f != NULL) f = f->next; + if (f == NULL) + return EINVAL; + + if (f->branch) f->branch->refcount--; + if (frwl->branch) frwl->branch->refcount++; + + frwl->next = f->next; + memcpy(f,frwl,sizeof(struct ip_fwkernel)); + kfree(frwl); + return 0; +} + +static int append_to_chain(struct ip_chain *chainptr, struct ip_fwkernel *rule) +{ + struct ip_fwkernel *i; + + FWC_HAVE_LOCK(fwc_wlocks); + /* Special case if no rules already present */ + if (chainptr->chain == NULL) { + + /* If pointer writes are atomic then turning off + * interupts is not necessary. */ + chainptr->chain = rule; + if (rule->branch) rule->branch->refcount++; + return 0; + } + + /* Find the rule before the end of the chain */ + for (i = chainptr->chain; i->next; i = i->next); + i->next = rule; + if (rule->branch) rule->branch->refcount++; + return 0; +} + +/* This function inserts a rule at the position of position in the + * chain refenced by chainptr. If position is 1 then this rule will + * become the new rule one. */ +static int insert_in_chain(struct ip_chain *chainptr, + struct ip_fwkernel *frwl, + __u32 position) +{ + struct ip_fwkernel *f = chainptr->chain; + + FWC_HAVE_LOCK(fwc_wlocks); + /* special case if the position is number 1 */ + if (position == 1) { + frwl->next = chainptr->chain; + if (frwl->branch) frwl->branch->refcount++; + chainptr->chain = frwl; + return 0; + } + position--; + while (--position && f != NULL) f = f->next; + if (f == NULL) + return EINVAL; + if (frwl->branch) frwl->branch->refcount++; + frwl->next = f->next; + + f->next = frwl; + return 0; +} + +/* This function deletes the a rule from a given rulenum and chain. + * With rulenum = 1 is the first rule is deleted. */ + +static int del_num_from_chain(struct ip_chain *chainptr, __u32 rulenum) +{ + struct ip_fwkernel *i=chainptr->chain,*tmp; + + FWC_HAVE_LOCK(fwc_wlocks); + + if (!chainptr->chain) + return ENOENT; + + /* Need a special case for the first rule */ + if (rulenum == 1) { + /* store temp to allow for freeing up of memory */ + tmp = chainptr->chain; + if (chainptr->chain->branch) chainptr->chain->branch->refcount--; + chainptr->chain = chainptr->chain->next; + kfree(tmp); /* free memory that is now unused */ + } else { + rulenum--; + while (--rulenum && i->next ) i = i->next; + if (!i->next) + return ENOENT; + tmp = i->next; + if (i->next->branch) + i->next->branch->refcount--; + i->next = i->next->next; + kfree(tmp); + } + return 0; +} + + +/* This function deletes the a rule from a given rule and chain. + * The rule that is deleted is the first occursance of that rule. 
*/ +static int del_rule_from_chain(struct ip_chain *chainptr, + struct ip_fwkernel *frwl) +{ + struct ip_fwkernel *ltmp,*ftmp = chainptr->chain ; + int was_found; + + FWC_HAVE_LOCK(fwc_wlocks); + + /* Sure, we should compare marks, but since the `ipfwadm' + * script uses it for an unholy hack... well, life is easier + * this way. We also mask it out of the flags word. --PR */ + for (ltmp=NULL, was_found=0; + !was_found && ftmp != NULL; + ltmp = ftmp,ftmp = ftmp->next) { + if (ftmp->ipfw.fw_src.s_addr!=frwl->ipfw.fw_src.s_addr + || ftmp->ipfw.fw_dst.s_addr!=frwl->ipfw.fw_dst.s_addr + || ftmp->ipfw.fw_smsk.s_addr!=frwl->ipfw.fw_smsk.s_addr + || ftmp->ipfw.fw_dmsk.s_addr!=frwl->ipfw.fw_dmsk.s_addr +#if 0 + || ftmp->ipfw.fw_flg!=frwl->ipfw.fw_flg +#else + || ((ftmp->ipfw.fw_flg & ~IP_FW_F_MARKABS) + != (frwl->ipfw.fw_flg & ~IP_FW_F_MARKABS)) +#endif + || ftmp->ipfw.fw_invflg!=frwl->ipfw.fw_invflg + || ftmp->ipfw.fw_proto!=frwl->ipfw.fw_proto +#if 0 + || ftmp->ipfw.fw_mark!=frwl->ipfw.fw_mark +#endif + || ftmp->ipfw.fw_redirpt!=frwl->ipfw.fw_redirpt + || ftmp->ipfw.fw_spts[0]!=frwl->ipfw.fw_spts[0] + || ftmp->ipfw.fw_spts[1]!=frwl->ipfw.fw_spts[1] + || ftmp->ipfw.fw_dpts[0]!=frwl->ipfw.fw_dpts[0] + || ftmp->ipfw.fw_dpts[1]!=frwl->ipfw.fw_dpts[1] + || ftmp->ipfw.fw_outputsize!=frwl->ipfw.fw_outputsize) { + duprintf("del_rule_from_chain: mismatch:" + "src:%u/%u dst:%u/%u smsk:%u/%u dmsk:%u/%u " + "flg:%hX/%hX invflg:%hX/%hX proto:%u/%u " + "mark:%u/%u " + "ports:%hu-%hu/%hu-%hu %hu-%hu/%hu-%hu " + "outputsize:%hu-%hu\n", + ftmp->ipfw.fw_src.s_addr, + frwl->ipfw.fw_src.s_addr, + ftmp->ipfw.fw_dst.s_addr, + frwl->ipfw.fw_dst.s_addr, + ftmp->ipfw.fw_smsk.s_addr, + frwl->ipfw.fw_smsk.s_addr, + ftmp->ipfw.fw_dmsk.s_addr, + frwl->ipfw.fw_dmsk.s_addr, + ftmp->ipfw.fw_flg, + frwl->ipfw.fw_flg, + ftmp->ipfw.fw_invflg, + frwl->ipfw.fw_invflg, + ftmp->ipfw.fw_proto, + frwl->ipfw.fw_proto, + ftmp->ipfw.fw_mark, + frwl->ipfw.fw_mark, + ftmp->ipfw.fw_spts[0], + frwl->ipfw.fw_spts[0], + ftmp->ipfw.fw_spts[1], + frwl->ipfw.fw_spts[1], + ftmp->ipfw.fw_dpts[0], + frwl->ipfw.fw_dpts[0], + ftmp->ipfw.fw_dpts[1], + frwl->ipfw.fw_dpts[1], + ftmp->ipfw.fw_outputsize, + frwl->ipfw.fw_outputsize); + continue; + } + + if (strncmp(ftmp->ipfw.fw_vianame, + frwl->ipfw.fw_vianame, + IFNAMSIZ)) { + duprintf("del_rule_from_chain: if mismatch: %s/%s\n", + ftmp->ipfw.fw_vianame, + frwl->ipfw.fw_vianame); + continue; + } + if (ftmp->branch != frwl->branch) { + duprintf("del_rule_from_chain: branch mismatch: " + "%s/%s\n", + ftmp->branch?ftmp->branch->label:"(null)", + frwl->branch?frwl->branch->label:"(null)"); + continue; + } + if (ftmp->branch == NULL + && ftmp->simplebranch != frwl->simplebranch) { + duprintf("del_rule_from_chain: simplebranch mismatch: " + "%i/%i\n", + ftmp->simplebranch, frwl->simplebranch); + continue; + } + was_found = 1; + if (ftmp->branch) + ftmp->branch->refcount--; + if (ltmp) + ltmp->next = ftmp->next; + else + chainptr->chain = ftmp->next; + kfree(ftmp); + break; + } + + if (was_found) + return 0; + else { + duprintf("del_rule_from_chain: no matching rule found\n"); + return EINVAL; + } +} + +/* This function takes the label of a chain and deletes the first + * chain with that name. No special cases required for the built in + * chains as they have their refcount initilised to 1 so that they are + * never deleted. 
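+ * (del_chain() below returns EBUSY for the built-in input chain and for
+ * any chain that is still referenced, and ENOTEMPTY for a chain that
+ * still contains rules.)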
*/ +static int del_chain(ip_chainlabel label) +{ + struct ip_chain *tmp,*tmp2; + + FWC_HAVE_LOCK(fwc_wlocks); + /* Corner case: return EBUSY not ENOENT for first elem ("input") */ + if (strcmp(label, ip_fw_chains->label) == 0) + return EBUSY; + + for (tmp = ip_fw_chains; tmp->next; tmp = tmp->next) + if(strcmp(tmp->next->label,label) == 0) + break; + + tmp2 = tmp->next; + if (!tmp2) + return ENOENT; + + if (tmp2->refcount) + return EBUSY; + + if (tmp2->chain) + return ENOTEMPTY; + + tmp->next = tmp2->next; + kfree(tmp2); + return 0; +} + +/* This is a function to initilise a chain. Built in rules start with + * refcount = 1 so that they cannot be deleted. User defined rules + * start with refcount = 0 so they can be deleted. */ +static struct ip_chain *ip_init_chain(ip_chainlabel name, + __u32 ref, + int policy) +{ + unsigned int i; + struct ip_chain *label + = kmalloc(SIZEOF_STRUCT_IP_CHAIN, GFP_KERNEL); + if (label == NULL) + panic("Can't kmalloc for firewall chains.\n"); + strcpy(label->label,name); + label->next = NULL; + label->chain = NULL; + label->refcount = ref; + label->policy = policy; + for (i = 0; i < smp_num_cpus*2; i++) { + label->reent[i].counters.pcnt = label->reent[i].counters.bcnt + = 0; + label->reent[i].prevchain = NULL; + label->reent[i].prevrule = NULL; + } + + return label; +} + +/* This is a function for reating a new chain. The chains is not + * created if a chain of the same name already exists */ +static int create_chain(ip_chainlabel label) +{ + struct ip_chain *tmp; + + if (!check_label(label)) + return EINVAL; + + FWC_HAVE_LOCK(fwc_wlocks); + for (tmp = ip_fw_chains; tmp->next; tmp = tmp->next) + if (strcmp(tmp->label,label) == 0) + return EEXIST; + + if (strcmp(tmp->label,label) == 0) + return EEXIST; + + tmp->next = ip_init_chain(label, 0, FW_SKIP); /* refcount is + * zero since this is a + * user defined chain * + * and therefore can be + * deleted */ + return 0; +} + +/* This function simply changes the policy on one of the built in + * chains. checking must be done before this is call to ensure that + * chainptr is pointing to one of the three possible chains */ +static int change_policy(struct ip_chain *chainptr, int policy) +{ + FWC_HAVE_LOCK(fwc_wlocks); + chainptr->policy = policy; + return 0; +} + +/* This function takes an ip_fwuser and converts it to a ip_fwkernel. It also + * performs some checks in the structure. */ +static struct ip_fwkernel *convert_ipfw(struct ip_fwuser *fwuser, int *errno) +{ + struct ip_fwkernel *fwkern; + + if ( (fwuser->ipfw.fw_flg & ~IP_FW_F_MASK) != 0 ) { + duprintf("convert_ipfw: undefined flag bits set (flags=%x)\n", + fwuser->ipfw.fw_flg); + *errno = EINVAL; + return NULL; + } + +#ifdef DEBUG_IP_FIREWALL_USER + /* These are sanity checks that don't really matter. + * We can get rid of these once testing is complete. 
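+	 * (They are compiled in only when DEBUG_IP_FIREWALL_USER is
+	 * defined; the port-range check further down is always active.)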
+ */ + if ((fwuser->ipfw.fw_flg & IP_FW_F_TCPSYN) + && ((fwuser->ipfw.fw_invflg & IP_FW_INV_PROTO) + || fwuser->ipfw.fw_proto != IPPROTO_TCP)) { + duprintf("convert_ipfw: TCP SYN flag set but proto != TCP!\n"); + *errno = EINVAL; + return NULL; + } + + if (strcmp(fwuser->label, IP_FW_LABEL_REDIRECT) != 0 + && fwuser->ipfw.fw_redirpt != 0) { + duprintf("convert_ipfw: Target not REDIR but redirpt != 0!\n"); + *errno = EINVAL; + return NULL; + } + + if ((!(fwuser->ipfw.fw_flg & IP_FW_F_FRAG) + && (fwuser->ipfw.fw_invflg & IP_FW_INV_FRAG)) + || (!(fwuser->ipfw.fw_flg & IP_FW_F_TCPSYN) + && (fwuser->ipfw.fw_invflg & IP_FW_INV_SYN))) { + duprintf("convert_ipfw: Can't have INV flag if flag unset!\n"); + *errno = EINVAL; + return NULL; + } + + if (((fwuser->ipfw.fw_invflg & IP_FW_INV_SRCPT) + && fwuser->ipfw.fw_spts[0] == 0 + && fwuser->ipfw.fw_spts[1] == 0xFFFF) + || ((fwuser->ipfw.fw_invflg & IP_FW_INV_DSTPT) + && fwuser->ipfw.fw_dpts[0] == 0 + && fwuser->ipfw.fw_dpts[1] == 0xFFFF) + || ((fwuser->ipfw.fw_invflg & IP_FW_INV_VIA) + && (fwuser->ipfw.fw_vianame)[0] == '\0') + || ((fwuser->ipfw.fw_invflg & IP_FW_INV_SRCIP) + && fwuser->ipfw.fw_smsk.s_addr == 0) + || ((fwuser->ipfw.fw_invflg & IP_FW_INV_DSTIP) + && fwuser->ipfw.fw_dmsk.s_addr == 0)) { + duprintf("convert_ipfw: INV flag makes rule unmatchable!\n"); + *errno = EINVAL; + return NULL; + } + + if ((fwuser->ipfw.fw_flg & IP_FW_F_FRAG) + && !(fwuser->ipfw.fw_invflg & IP_FW_INV_FRAG) + && (fwuser->ipfw.fw_spts[0] != 0 + || fwuser->ipfw.fw_spts[1] != 0xFFFF + || fwuser->ipfw.fw_dpts[0] != 0 + || fwuser->ipfw.fw_dpts[1] != 0xFFFF + || (fwuser->ipfw.fw_flg & IP_FW_F_TCPSYN))) { + duprintf("convert_ipfw: Can't test ports or SYN with frag!\n"); + *errno = EINVAL; + return NULL; + } +#endif + + if ((fwuser->ipfw.fw_spts[0] != 0 + || fwuser->ipfw.fw_spts[1] != 0xFFFF + || fwuser->ipfw.fw_dpts[0] != 0 + || fwuser->ipfw.fw_dpts[1] != 0xFFFF) + && ((fwuser->ipfw.fw_invflg & IP_FW_INV_PROTO) + || (fwuser->ipfw.fw_proto != IPPROTO_TCP + && fwuser->ipfw.fw_proto != IPPROTO_UDP + && fwuser->ipfw.fw_proto != IPPROTO_ICMP))) { + duprintf("convert_ipfw: Can only test ports for TCP/UDP/ICMP!\n"); + *errno = EINVAL; + return NULL; + } + + fwkern = kmalloc(SIZEOF_STRUCT_IP_FW_KERNEL, GFP_KERNEL); + if (!fwkern) { + duprintf("convert_ipfw: kmalloc failed!\n"); + *errno = ENOMEM; + return NULL; + } + memcpy(&fwkern->ipfw,&fwuser->ipfw,sizeof(struct ip_fw)); + + if (!find_special(fwuser->label, &fwkern->simplebranch)) { + fwkern->branch = find_label(fwuser->label); + if (!fwkern->branch) { + duprintf("convert_ipfw: chain doesn't exist `%s'.\n", + fwuser->label); + kfree(fwkern); + *errno = ENOENT; + return NULL; + } else if (fwkern->branch == IP_FW_INPUT_CHAIN + || fwkern->branch == IP_FW_FORWARD_CHAIN + || fwkern->branch == IP_FW_OUTPUT_CHAIN) { + duprintf("convert_ipfw: Can't branch to builtin chain `%s'.\n", + fwuser->label); + kfree(fwkern); + *errno = ENOENT; + return NULL; + } + } else + fwkern->branch = NULL; + memset(fwkern->counters, 0, sizeof(struct ip_counters)*NUM_SLOTS); + + /* Handle empty vianame by making it a wildcard */ + if ((fwkern->ipfw.fw_vianame)[0] == '\0') + fwkern->ipfw.fw_flg |= IP_FW_F_WILDIF; + + fwkern->next = NULL; + return fwkern; +} + +int ip_fw_ctl(int cmd, void *m, int len) +{ + int ret; + struct ip_chain *chain; + unsigned long flags; + + FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags); + + switch (cmd) { + case IP_FW_FLUSH: + if (len != sizeof(ip_chainlabel) || !check_label(m)) + ret = EINVAL; + else if ((chain = find_label(m)) == NULL) 
+ ret = ENOENT; + else ret = clear_fw_chain(chain); + break; + + case IP_FW_ZERO: + if (len != sizeof(ip_chainlabel) || !check_label(m)) + ret = EINVAL; + else if ((chain = find_label(m)) == NULL) + ret = ENOENT; + else ret = zero_fw_chain(chain); + break; + + case IP_FW_CHECK: { + struct ip_fwtest *new = m; + struct iphdr *ip; + + /* Don't need write lock. */ + FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags); + + if (len != sizeof(struct ip_fwtest) || !check_label(m)) + return EINVAL; + + /* Need readlock to do find_label */ + FWC_READ_LOCK(&ip_fw_lock); + + if ((chain = find_label(new->fwt_label)) == NULL) + ret = ENOENT; + else { + ip = &(new->fwt_packet.fwp_iph); + + if (ip->ihl != sizeof(struct iphdr) / sizeof(int)) { + duprintf("ip_fw_ctl: ip->ihl=%d, want %d\n", + ip->ihl, + sizeof(struct iphdr) / sizeof(int)); + ret = EINVAL; + } + else { + ret = ip_fw_check(ip, new->fwt_packet.fwp_vianame, + NULL, chain, + NULL, SLOT_NUMBER(), 1); + switch (ret) { + case FW_ACCEPT: + ret = 0; break; + case FW_REDIRECT: + ret = ECONNABORTED; break; + case FW_MASQUERADE: + ret = ECONNRESET; break; + case FW_REJECT: + ret = ECONNREFUSED; break; + /* Hack to help diag; these only get + returned when testing. */ + case FW_SKIP+1: + ret = ELOOP; break; + case FW_SKIP: + ret = ENFILE; break; + default: /* FW_BLOCK */ + ret = ETIMEDOUT; break; + } + } + } + FWC_READ_UNLOCK(&ip_fw_lock); + return ret; + } + + case IP_FW_MASQ_TIMEOUTS: { +#ifdef CONFIG_IP_MASQUERADE + ret = ip_fw_masq_timeouts(m, len); +#else + ret = EINVAL; +#endif + } + break; + + case IP_FW_REPLACE: { + struct ip_fwkernel *ip_fwkern; + struct ip_fwnew *new = m; + + if (len != sizeof(struct ip_fwnew) + || !check_label(new->fwn_label)) + ret = EINVAL; + else if ((chain = find_label(new->fwn_label)) == NULL) + ret = ENOENT; + else if ((ip_fwkern = convert_ipfw(&new->fwn_rule, &ret)) + != NULL) + ret = replace_in_chain(chain, ip_fwkern, + new->fwn_rulenum); + } + break; + + case IP_FW_APPEND: { + struct ip_fwchange *new = m; + struct ip_fwkernel *ip_fwkern; + + if (len != sizeof(struct ip_fwchange) + || !check_label(new->fwc_label)) + ret = EINVAL; + else if ((chain = find_label(new->fwc_label)) == NULL) + ret = ENOENT; + else if ((ip_fwkern = convert_ipfw(&new->fwc_rule, &ret)) + != NULL) + ret = append_to_chain(chain, ip_fwkern); + } + break; + + case IP_FW_INSERT: { + struct ip_fwkernel *ip_fwkern; + struct ip_fwnew *new = m; + + if (len != sizeof(struct ip_fwnew) + || !check_label(new->fwn_label)) + ret = EINVAL; + else if ((chain = find_label(new->fwn_label)) == NULL) + ret = ENOENT; + else if ((ip_fwkern = convert_ipfw(&new->fwn_rule, &ret)) + != NULL) + ret = insert_in_chain(chain, ip_fwkern, + new->fwn_rulenum); + } + break; + + case IP_FW_DELETE: { + struct ip_fwchange *new = m; + struct ip_fwkernel *ip_fwkern; + + if (len != sizeof(struct ip_fwchange) + || !check_label(new->fwc_label)) + ret = EINVAL; + else if ((chain = find_label(new->fwc_label)) == NULL) + ret = ENOENT; + else if ((ip_fwkern = convert_ipfw(&new->fwc_rule, &ret)) + != NULL) { + ret = del_rule_from_chain(chain, ip_fwkern); + kfree(ip_fwkern); + } + } + break; + + case IP_FW_DELETE_NUM: { + struct ip_fwdelnum *new = m; + + if (len != sizeof(struct ip_fwdelnum) + || !check_label(new->fwd_label)) + ret = EINVAL; + else if ((chain = find_label(new->fwd_label)) == NULL) + ret = ENOENT; + else ret = del_num_from_chain(chain, new->fwd_rulenum); + } + break; + + case IP_FW_CREATECHAIN: { + if (len != sizeof(ip_chainlabel)) { + duprintf("create_chain: bad size %i\n", len); + ret 
= EINVAL; + } + else ret = create_chain(m); + } + break; + + case IP_FW_DELETECHAIN: { + if (len != sizeof(ip_chainlabel)) { + duprintf("delete_chain: bad size %i\n", len); + ret = EINVAL; + } + else ret = del_chain(m); + } + break; + + case IP_FW_POLICY: { + struct ip_fwpolicy *new = m; + + if (len != sizeof(struct ip_fwpolicy) + || !check_label(new->fwp_label)) + ret = EINVAL; + else if ((chain = find_label(new->fwp_label)) == NULL) + ret = ENOENT; + else if (chain != IP_FW_INPUT_CHAIN + && chain != IP_FW_FORWARD_CHAIN + && chain != IP_FW_OUTPUT_CHAIN) { + duprintf("change_policy: can't change policy on user" + " defined chain.\n"); + ret = EINVAL; + } + else { + int pol = FW_SKIP; + find_special(new->fwp_policy, &pol); + + switch(pol) { + case FW_MASQUERADE: + if (chain != IP_FW_FORWARD_CHAIN) { + ret = EINVAL; + break; + } + /* Fall thru... */ + case FW_BLOCK: + case FW_ACCEPT: + case FW_REJECT: + ret = change_policy(chain, pol); + break; + default: + duprintf("change_policy: bad policy `%s'\n", + new->fwp_policy); + ret = EINVAL; + } + } + break; + + } + default: + duprintf("ip_fw_ctl: unknown request %d\n",cmd); + ret = EINVAL; + } + + FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags); + return ret; +} + +/* Returns bytes used - doesn't NUL terminate */ +static int dump_rule(char *buffer, + const char *chainlabel, + const struct ip_fwkernel *rule) +{ + int len; + unsigned int i; + __u64 packets = 0, bytes = 0; + + FWC_HAVE_LOCK(fwc_wlocks); + for (i = 0; i < NUM_SLOTS; i++) { + packets += rule->counters[i].pcnt; + bytes += rule->counters[i].bcnt; + } + + len=sprintf(buffer, + "%9s " /* Chain name */ + "%08lX/%08lX->%08lX/%08lX " /* Source & Destination IPs */ + "%.16s " /* Interface */ + "%X %X " /* fw_flg and fw_invflg fields */ + "%u " /* Protocol */ + "%-9u %-9u %-9u %-9u " /* Packet & byte counters */ + "%u-%u %u-%u " /* Source & Dest port ranges */ + "A%02X X%02X " /* TOS and and xor masks */ + "%08X " /* Redirection port */ + "%u " /* fw_mark field */ + "%u " /* output size */ + "%9s\n", /* Target */ + chainlabel, + ntohl(rule->ipfw.fw_src.s_addr), + ntohl(rule->ipfw.fw_smsk.s_addr), + ntohl(rule->ipfw.fw_dst.s_addr), + ntohl(rule->ipfw.fw_dmsk.s_addr), + (rule->ipfw.fw_vianame)[0] ? rule->ipfw.fw_vianame : "-", + rule->ipfw.fw_flg, + rule->ipfw.fw_invflg, + rule->ipfw.fw_proto, + (__u32)(packets >> 32), (__u32)packets, + (__u32)(bytes >> 32), (__u32)bytes, + rule->ipfw.fw_spts[0], rule->ipfw.fw_spts[1], + rule->ipfw.fw_dpts[0], rule->ipfw.fw_dpts[1], + rule->ipfw.fw_tosand, rule->ipfw.fw_tosxor, + rule->ipfw.fw_redirpt, + rule->ipfw.fw_mark, + rule->ipfw.fw_outputsize, + branchname(rule->branch,rule->simplebranch)); + + duprintf("dump_rule: %i bytes done.\n", len); + return len; +} + +/* File offset is actually in records, not bytes. */ +static int ip_chain_procinfo(char *buffer, char **start, + off_t offset, int length, int reset) +{ + struct ip_chain *i; + struct ip_fwkernel *j = ip_fw_chains->chain; + unsigned long flags; + int len = 0; + int last_len = 0; + off_t upto = 0; + + duprintf("Offset starts at %lu\n", offset); + duprintf("ip_fw_chains is 0x%0lX\n", (unsigned long int)ip_fw_chains); + + /* Need a write lock to lock out ``readers'' which update counters. 
*/ + FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags); + + for (i = ip_fw_chains; i; i = i->next) { + for (j = i->chain; j; j = j->next) { + if (upto == offset) break; + duprintf("Skipping rule in chain `%s'\n", + i->label); + upto++; + } + if (upto == offset) break; + } + + /* Don't init j first time, or once i = NULL */ + for (; i; (void)((i = i->next) && (j = i->chain))) { + duprintf("Dumping chain `%s'\n", i->label); + for (; j; j = j->next, upto++, last_len = len) + { + len += dump_rule(buffer+len, i->label, j); + if (len > length) { + duprintf("Dumped to %i (past %i). " + "Moving back to %i.\n", + len, length, last_len); + len = last_len; + goto outside; + } + else if (reset) + memset(j->counters, 0, + sizeof(struct ip_counters)*NUM_SLOTS); + } + } +outside: + FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags); + buffer[len] = '\0'; + + duprintf("ip_chain_procinfo: Length = %i (of %i). Offset = %li.\n", + len, length, upto); + /* `start' hack - see fs/proc/generic.c line ~165 */ + *start=(char *)((unsigned int)upto-offset); + return len; +} + +static int ip_chain_name_procinfo(char *buffer, char **start, + off_t offset, int length, int reset) +{ + struct ip_chain *i; + int len = 0,last_len = 0; + off_t pos = 0,begin = 0; + unsigned long flags; + + /* Need a write lock to lock out ``readers'' which update counters. */ + FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags); + + for (i = ip_fw_chains; i; i = i->next) + { + unsigned int j; + __u32 packetsHi = 0, packetsLo = 0, bytesHi = 0, bytesLo = 0; + + for (j = 0; j < NUM_SLOTS; j++) { + packetsLo += i->reent[j].counters.pcnt & 0xFFFFFFFF; + packetsHi += ((i->reent[j].counters.pcnt >> 32) + & 0xFFFFFFFF); + bytesLo += i->reent[j].counters.bcnt & 0xFFFFFFFF; + bytesHi += ((i->reent[j].counters.bcnt >> 32) + & 0xFFFFFFFF); + } + + /* print the label and the policy */ + len+=sprintf(buffer+len,"%s %s %i %u %u %u %u\n", + i->label,branchname(NULL, i->policy),i->refcount, + packetsHi, packetsLo, bytesHi, bytesLo); + pos=begin+len; + if(pos<offset) { + len=0; + begin=pos; + } + else if(pos>offset+length) { + len = last_len; + break; + } + + last_len = len; + } + FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags); + + *start = buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + return len; +} + +/* + * Interface to the generic firewall chains. + */ +int ipfw_input_check(struct firewall_ops *this, int pf, struct device *dev, + void *phdr, void *arg, struct sk_buff **pskb) +{ + return ip_fw_check(phdr, dev->name, + arg, IP_FW_INPUT_CHAIN, *pskb, SLOT_NUMBER(), 0); +} + +int ipfw_output_check(struct firewall_ops *this, int pf, struct device *dev, + void *phdr, void *arg, struct sk_buff **pskb) +{ + /* Locally generated bogus packets by root. <SIGH>. 
*/ + if (((struct iphdr *)phdr)->ihl * 4 < sizeof(struct iphdr) + || (*pskb)->len < sizeof(struct iphdr)) + return FW_ACCEPT; + return ip_fw_check(phdr, dev->name, + arg, IP_FW_OUTPUT_CHAIN, *pskb, SLOT_NUMBER(), 0); +} + +int ipfw_forward_check(struct firewall_ops *this, int pf, struct device *dev, + void *phdr, void *arg, struct sk_buff **pskb) +{ + return ip_fw_check(phdr, dev->name, + arg, IP_FW_FORWARD_CHAIN, *pskb, SLOT_NUMBER(), 0); +} + +struct firewall_ops ipfw_ops= +{ + NULL, + ipfw_forward_check, + ipfw_input_check, + ipfw_output_check, + PF_INET, + 0 /* We don't even allow a fall through so we are last */ +}; + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_ipfwchains_chain = { + PROC_NET_IPFW_CHAINS, sizeof(IP_FW_PROC_CHAINS)-1, + IP_FW_PROC_CHAINS, S_IFREG | S_IRUSR | S_IWUSR, 1, 0, 0, + 0, &proc_net_inode_operations, ip_chain_procinfo +}; + +static struct proc_dir_entry proc_net_ipfwchains_chainnames = { + PROC_NET_IPFW_CHAIN_NAMES, sizeof(IP_FW_PROC_CHAIN_NAMES)-1, + IP_FW_PROC_CHAIN_NAMES, S_IFREG | S_IRUSR | S_IWUSR, 1, 0, 0, + 0, &proc_net_inode_operations, ip_chain_name_procinfo +}; + +#endif + +__initfunc(void ip_fw_init(void)) +{ +#ifdef DEBUG_IP_FIRWALL_LOCKING + fwc_wlocks = fwc_rlocks = 0; +#endif + + IP_FW_INPUT_CHAIN = ip_init_chain(IP_FW_LABEL_INPUT, 1, FW_ACCEPT); + IP_FW_FORWARD_CHAIN = ip_init_chain(IP_FW_LABEL_FORWARD, 1, FW_ACCEPT); + IP_FW_OUTPUT_CHAIN = ip_init_chain(IP_FW_LABEL_OUTPUT, 1, FW_ACCEPT); + + if(register_firewall(PF_INET,&ipfw_ops)<0) + panic("Unable to register IP firewall.\n"); + +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_ipfwchains_chain); + proc_net_register(&proc_net_ipfwchains_chainnames); +#endif + +#ifdef CONFIG_IP_FIREWALL_NETLINK + ipfwsk = netlink_kernel_create(NETLINK_FIREWALL, NULL); + if (ipfwsk == NULL) + panic("ip_fw_init: cannot initialize netlink\n"); +#endif +#if defined(DEBUG_IP_FIREWALL) || defined(DEBUG_IP_FIREWALL_USER) + printk("Firewall graphs enabled! Untested kernel coming thru. \n"); +#endif +} diff --git a/pfinet/linux-src/net/ipv4/ip_gre.c b/pfinet/linux-src/net/ipv4/ip_gre.c new file mode 100644 index 00000000..6a7546fd --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_gre.c @@ -0,0 +1,1223 @@ +/* + * Linux NET3: GRE over IP protocol decoder. + * + * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <asm/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <linux/in6.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> + +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <net/ipip.h> +#include <net/arp.h> +#include <net/checksum.h> + +#ifdef CONFIG_IPV6 +#include <net/ipv6.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> +#endif + +/* + Problems & solutions + -------------------- + + 1. The most important issue is detecting local dead loops. 
+ They would cause a complete host lockup in transmit, which
+ would be "resolved" by stack overflow or, if queueing is enabled,
+ with infinite looping in net_bh.
+
+ We cannot track such dead loops during route installation,
+ it is an infeasible task. The most general solution would be
+ to keep an skb->encapsulation counter (sort of local ttl),
+ and silently drop the packet when it expires. It is the best
+ solution, but it supposes maintaining a new variable in ALL
+ skbs, even if no tunneling is used.
+
+ Current solution: t->recursion lock breaks dead loops. It looks
+ like the dev->tbusy flag, but I preferred a new variable, because
+ the semantics are different. One day, when hard_start_xmit
+ is multithreaded, we will have to use skb->encapsulation.
+
+
+
+ 2. Networking dead loops would not kill routers, but would really
+ kill the network. The IP hop limit plays the role of "t->recursion" in this case,
+ if we copy it from the packet being encapsulated to the upper header.
+ It is a very good solution, but it introduces two problems:
+
+ - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
+ do not work over tunnels.
+ - traceroute does not work. I planned to relay ICMP from the tunnel,
+ so that this problem would be solved and traceroute output
+ would be even more informative. This idea appeared to be wrong:
+ only Linux complies with rfc1812 now (yes, guys, Linux is the only
+ true router now :-)), all routers (at least, in my neighbourhood)
+ return only 8 bytes of payload. It is the end.
+
+ Hence, if we want OSPF to work or traceroute to say something reasonable,
+ we should search for another solution.
+
+ One of them is to parse the packet, trying to detect inner encapsulation
+ made by our node. It is difficult or even impossible, especially
+ taking into account fragmentation. To be short, it is not a solution at all.
+
+ Current solution: The solution was UNEXPECTEDLY SIMPLE.
+ We force the DF flag on tunnels with a preconfigured hop limit,
+ that is ALL. :-) Well, it does not remove the problem completely,
+ but exponential growth of network traffic is changed to linear
+ (branches that exceed the pmtu are pruned) and the tunnel mtu
+ quickly degrades to a value <68, where looping stops.
+ Yes, it is not good if there is a router in the loop
+ which does not force DF, even when encapsulating packets have DF set.
+ But it is not our problem! Nobody could accuse us; we did
+ all that we could. Even if it is your gated that injected the
+ fatal route into the network, even if it was you who configured the
+ fatal static route: you are innocent. :-)
+
+
+
+ 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
+ practically identical code. It would be good to glue them
+ together, but it is not very evident how to make them modular.
+ sit is an integral part of IPv6; ipip and gre are naturally modular.
+ We could extract the common parts (hash table, ioctl etc.)
+ into a separate module (ip_tunnel.c).
+
+ Alexey Kuznetsov.
+ */
+
+static int ipgre_tunnel_init(struct device *dev);
+
+/* Fallback tunnel: no source, no destination, no key, no options */
+
+static int ipgre_fb_tunnel_init(struct device *dev);
+
+static struct device ipgre_fb_tunnel_dev = {
+ NULL, 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipgre_fb_tunnel_init,
+};
+
+static struct ip_tunnel ipgre_fb_tunnel = {
+ NULL, &ipgre_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"gre0", }
+};
+
+/* Tunnel hash table */
+
+/*
+ 4 hash tables:
+
+ 3: (remote,local)
+ 2: (remote,*)
+ 1: (*,local)
+ 0: (*,*)
+
+ We require exact key match i.e. 
if a key is present in packet + it will match only tunnel with the same key; if it is not present, + it will match only keyless tunnel. + + All keysless packets, if not matched configured keyless tunnels + will match fallback tunnel. + */ + +#define HASH_SIZE 16 +#define HASH(addr) ((addr^(addr>>4))&0xF) + +static struct ip_tunnel *tunnels[4][HASH_SIZE]; + +#define tunnels_r_l (tunnels[3]) +#define tunnels_r (tunnels[2]) +#define tunnels_l (tunnels[1]) +#define tunnels_wc (tunnels[0]) + +/* Given src, dst and key, find approriate for input tunnel. */ + +static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key) +{ + unsigned h0 = HASH(remote); + unsigned h1 = HASH(key); + struct ip_tunnel *t; + + for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + } + for (t = tunnels_r[h0^h1]; t; t = t->next) { + if (remote == t->parms.iph.daddr) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + } + for (t = tunnels_l[h1]; t; t = t->next) { + if (local == t->parms.iph.saddr || + (local == t->parms.iph.daddr && MULTICAST(local))) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + } + for (t = tunnels_wc[h1]; t; t = t->next) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + if (ipgre_fb_tunnel_dev.flags&IFF_UP) + return &ipgre_fb_tunnel; + return NULL; +} + +static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t) +{ + u32 remote = t->parms.iph.daddr; + u32 local = t->parms.iph.saddr; + u32 key = t->parms.i_key; + unsigned h = HASH(key); + int prio = 0; + + if (local) + prio |= 1; + if (remote && !MULTICAST(remote)) { + prio |= 2; + h ^= HASH(remote); + } + + return &tunnels[prio][h]; +} + +static void ipgre_tunnel_link(struct ip_tunnel *t) +{ + struct ip_tunnel **tp = ipgre_bucket(t); + + t->next = *tp; + wmb(); + *tp = t; +} + +static void ipgre_tunnel_unlink(struct ip_tunnel *t) +{ + struct ip_tunnel **tp; + + for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) { + if (t == *tp) { + *tp = t->next; + synchronize_bh(); + break; + } + } +} + +static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create) +{ + u32 remote = parms->iph.daddr; + u32 local = parms->iph.saddr; + u32 key = parms->i_key; + struct ip_tunnel *t, **tp, *nt; + struct device *dev; + unsigned h = HASH(key); + int prio = 0; + + if (local) + prio |= 1; + if (remote && !MULTICAST(remote)) { + prio |= 2; + h ^= HASH(remote); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { + if (key == t->parms.i_key) + return t; + } + } + if (!create) + return NULL; + + MOD_INC_USE_COUNT; + dev = kmalloc(sizeof(*dev) + sizeof(*t), GFP_KERNEL); + if (dev == NULL) { + MOD_DEC_USE_COUNT; + return NULL; + } + memset(dev, 0, sizeof(*dev) + sizeof(*t)); + dev->priv = (void*)(dev+1); + nt = (struct ip_tunnel*)dev->priv; + nt->dev = dev; + dev->name = nt->parms.name; + dev->init = ipgre_tunnel_init; + memcpy(&nt->parms, parms, sizeof(*parms)); + if (dev->name[0] == 0) { + int i; + for (i=1; i<100; i++) { + sprintf(dev->name, "gre%d", i); + if (dev_get(dev->name) == NULL) + break; + } + if (i==100) + goto failed; + memcpy(parms->name, dev->name, IFNAMSIZ); + } + if (register_netdevice(dev) < 0) + goto failed; + + ipgre_tunnel_link(nt); + /* Do not decrement MOD_USE_COUNT here. 
*/ + return nt; + +failed: + kfree(dev); + MOD_DEC_USE_COUNT; + return NULL; +} + +static void ipgre_tunnel_destroy(struct device *dev) +{ + ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv); + + if (dev != &ipgre_fb_tunnel_dev) { + kfree(dev); + MOD_DEC_USE_COUNT; + } +} + + +void ipgre_err(struct sk_buff *skb, unsigned char *dp, int len) +{ +#ifndef I_WISH_WORLD_WERE_PERFECT + +/* It is not :-( All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. + + Moreover, Cisco "wise men" put GRE key to the third word + in GRE header. It makes impossible maintaining even soft state for keyed + GRE tunnels with enabled checksum. Tell them "thank you". + + Well, I wonder, rfc1812 was written by Cisco employee, + what the hell these idiots break standrads established + by themself??? + */ + + struct iphdr *iph = (struct iphdr*)dp; + u16 *p = (u16*)(dp+(iph->ihl<<2)); + int grehlen = (iph->ihl<<2) + 4; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct ip_tunnel *t; + u16 flags; + + flags = p[0]; + if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { + if (flags&(GRE_VERSION|GRE_ROUTING)) + return; + if (flags&GRE_KEY) { + grehlen += 4; + if (flags&GRE_CSUM) + grehlen += 4; + } + } + + /* If only 8 bytes returned, keyed message will be dropped here */ + if (len < grehlen) + return; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. --ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0); + if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr)) + return; + + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + return; + + if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; + return; +#else + struct iphdr *iph = (struct iphdr*)dp; + struct iphdr *eiph; + u16 *p = (u16*)(dp+(iph->ihl<<2)); + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + int rel_type = 0; + int rel_code = 0; + int rel_info = 0; + u16 flags; + int grehlen = (iph->ihl<<2) + 4; + struct sk_buff *skb2; + struct rtable *rt; + + if (p[1] != __constant_htons(ETH_P_IP)) + return; + + flags = p[0]; + if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { + if (flags&(GRE_VERSION|GRE_ROUTING)) + return; + if (flags&GRE_CSUM) + grehlen += 4; + if (flags&GRE_KEY) + grehlen += 4; + if (flags&GRE_SEQ) + grehlen += 4; + } + if (len < grehlen + sizeof(struct iphdr)) + return; + eiph = (struct iphdr*)(dp + grehlen); + + switch (type) { + default: + return; + case ICMP_PARAMETERPROB: + if (skb->h.icmph->un.gateway < (iph->ihl<<2)) + return; + + /* So... This guy found something strange INSIDE encapsulated + packet. Well, he is fool, but what can we do ? 
+ */ + rel_type = ICMP_PARAMETERPROB; + rel_info = skb->h.icmph->un.gateway - grehlen; + break; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* And it is the only really necesary thing :-) */ + rel_info = ntohs(skb->h.icmph->un.frag.mtu); + if (rel_info < grehlen+68) + return; + rel_info -= grehlen; + /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */ + if (rel_info > ntohs(eiph->tot_len)) + return; + break; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe, it is just ether pollution. --ANK + */ + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + /* Prepare fake skb to feed it to icmp_send */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 == NULL) + return; + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, skb->data - (u8*)eiph); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) { + kfree_skb(skb2); + return; + } + skb2->dev = rt->u.dst.dev; + + /* route "incoming" packet */ + if (rt->rt_flags&RTCF_LOCAL) { + ip_rt_put(rt); + rt = NULL; + if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) || + rt->u.dst.dev->type != ARPHRD_IPGRE) { + ip_rt_put(rt); + kfree_skb(skb2); + return; + } + } else { + ip_rt_put(rt); + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || + skb2->dst->dev->type != ARPHRD_IPGRE) { + kfree_skb(skb2); + return; + } + } + + /* change mtu on this route */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + if (rel_info > skb2->dst->pmtu) { + kfree_skb(skb2); + return; + } + skb2->dst->pmtu = rel_info; + rel_info = htonl(rel_info); + } else if (type == ICMP_TIME_EXCEEDED) { + struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv; + if (t->parms.iph.ttl) { + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + } + } + + icmp_send(skb2, rel_type, rel_code, rel_info); + kfree_skb(skb2); +#endif +} + +int ipgre_rcv(struct sk_buff *skb, unsigned short len) +{ + struct iphdr *iph = skb->nh.iph; + u8 *h = skb->h.raw; + u16 flags = *(u16*)h; + u16 csum = 0; + u32 key = 0; + u32 seqno = 0; + struct ip_tunnel *tunnel; + int offset = 4; + + if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { + /* - Version must be 0. + - We do not support routing headers. + */ + if (flags&(GRE_VERSION|GRE_ROUTING)) + goto drop; + + if (flags&GRE_CSUM) { + csum = ip_compute_csum(h, len); + offset += 4; + } + if (flags&GRE_KEY) { + key = *(u32*)(h + offset); + offset += 4; + } + if (flags&GRE_SEQ) { + seqno = ntohl(*(u32*)(h + offset)); + offset += 4; + } + } + + if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) { + skb->mac.raw = skb->nh.raw; + skb->nh.raw = skb_pull(skb, h + offset - skb->data); + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->ip_summed = 0; + skb->protocol = *(u16*)(h + 2); + skb->pkt_type = PACKET_HOST; +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (MULTICAST(iph->daddr)) { + /* Looped back packet, drop it! 
*/ + if (((struct rtable*)skb->dst)->key.iif == 0) + goto drop; + tunnel->stat.multicast++; + skb->pkt_type = PACKET_BROADCAST; + } +#endif + + if (((flags&GRE_CSUM) && csum) || + (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { + tunnel->stat.rx_crc_errors++; + tunnel->stat.rx_errors++; + goto drop; + } + if (tunnel->parms.i_flags&GRE_SEQ) { + if (!(flags&GRE_SEQ) || + (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { + tunnel->stat.rx_fifo_errors++; + tunnel->stat.rx_errors++; + goto drop; + } + tunnel->i_seqno = seqno + 1; + } + tunnel->stat.rx_packets++; + tunnel->stat.rx_bytes += skb->len; + skb->dev = tunnel->dev; + dst_release(skb->dst); + skb->dst = NULL; + netif_rx(skb); + return(0); + } + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + +drop: + kfree_skb(skb); + return(0); +} + +static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct net_device_stats *stats = &tunnel->stat; + struct iphdr *old_iph = skb->nh.iph; + struct iphdr *tiph; + u8 tos; + u16 df; + struct rtable *rt; /* Route to the other host */ + struct device *tdev; /* Device to other host */ + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + int gre_hlen; + u32 dst; + int mtu; + + if (tunnel->recursion++) { + tunnel->stat.collisions++; + goto tx_error; + } + + if (dev->hard_header) { + gre_hlen = 0; + tiph = (struct iphdr*)skb->data; + } else { + gre_hlen = tunnel->hlen; + tiph = &tunnel->parms.iph; + } + + if ((dst = tiph->daddr) == 0) { + /* NBMA tunnel */ + + if (skb->dst == NULL) { + tunnel->stat.tx_fifo_errors++; + goto tx_error; + } + + if (skb->protocol == __constant_htons(ETH_P_IP)) { + rt = (struct rtable*)skb->dst; + if ((dst = rt->rt_gateway) == 0) + goto tx_error_icmp; + } +#ifdef CONFIG_IPV6 + else if (skb->protocol == __constant_htons(ETH_P_IPV6)) { + struct in6_addr *addr6; + int addr_type; + struct neighbour *neigh = skb->dst->neighbour; + + if (neigh == NULL) + goto tx_error; + + addr6 = (struct in6_addr*)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &skb->nh.ipv6h->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) == 0) + goto tx_error_icmp; + + dst = addr6->s6_addr32[3]; + } +#endif + else + goto tx_error; + } + + tos = tiph->tos; + if (tos&1) { + if (skb->protocol == __constant_htons(ETH_P_IP)) + tos = old_iph->tos; + tos &= ~1; + } + + if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) { + tunnel->stat.tx_carrier_errors++; + goto tx_error; + } + tdev = rt->u.dst.dev; + + if (tdev == dev) { + ip_rt_put(rt); + tunnel->stat.collisions++; + goto tx_error; + } + + df = tiph->frag_off; + mtu = rt->u.dst.pmtu - tunnel->hlen; + + if (skb->protocol == __constant_htons(ETH_P_IP)) { + if (skb->dst && mtu < skb->dst->pmtu && mtu >= 68) + skb->dst->pmtu = mtu; + + df |= (old_iph->frag_off&__constant_htons(IP_DF)); + + if ((old_iph->frag_off&__constant_htons(IP_DF)) && + mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + goto tx_error; + } + } +#ifdef CONFIG_IPV6 + else if (skb->protocol == __constant_htons(ETH_P_IPV6)) { + struct rt6_info *rt6 = (struct rt6_info*)skb->dst; + + if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= IPV6_MIN_MTU) { + if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) || + rt6->rt6i_dst.plen == 128) { + rt6->rt6i_flags |= 
RTF_MODIFIED; + skb->dst->pmtu = mtu; + } + } + + if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + ip_rt_put(rt); + goto tx_error; + } + } +#endif + + if (tunnel->err_count > 0) { + if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + tunnel->err_count--; + + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + skb->h.raw = skb->nh.raw; + + max_headroom = ((tdev->hard_header_len+15)&~15)+ gre_hlen; + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + stats->tx_dropped++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; + } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + dev_kfree_skb(skb); + skb = new_skb; + } + + skb->nh.raw = skb_push(skb, gre_hlen); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. + */ + + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->frag_off = df; + iph->protocol = IPPROTO_GRE; + iph->tos = tos; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + + if ((iph->ttl = tiph->ttl) == 0) { + if (skb->protocol == __constant_htons(ETH_P_IP)) + iph->ttl = old_iph->ttl; +#ifdef CONFIG_IPV6 + else if (skb->protocol == __constant_htons(ETH_P_IPV6)) + iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit; +#endif + else + iph->ttl = ip_statistics.IpDefaultTTL; + } + + ((u16*)(iph+1))[0] = tunnel->parms.o_flags; + ((u16*)(iph+1))[1] = skb->protocol; + + if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { + u32 *ptr = (u32*)(((u8*)iph) + tunnel->hlen - 4); + + if (tunnel->parms.o_flags&GRE_SEQ) { + ++tunnel->o_seqno; + *ptr = htonl(tunnel->o_seqno); + ptr--; + } + if (tunnel->parms.o_flags&GRE_KEY) { + *ptr = tunnel->parms.o_key; + ptr--; + } + if (tunnel->parms.o_flags&GRE_CSUM) { + *ptr = 0; + *(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr)); + } + } + + iph->tot_len = htons(skb->len); + iph->id = htons(ip_id_count++); + ip_send_check(iph); + + stats->tx_bytes += skb->len; + stats->tx_packets++; + ip_send(skb); + tunnel->recursion--; + return 0; + +tx_error_icmp: + dst_link_failure(skb); + +tx_error: + stats->tx_errors++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; +} + +static int +ipgre_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel *t; + + MOD_INC_USE_COUNT; + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == &ipgre_fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipgre_tunnel_locate(&p, 0); + } + if (t == NULL) + t = (struct ip_tunnel*)dev->priv; + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || + p.iph.ihl != 5 || (p.iph.frag_off&__constant_htons(~IP_DF)) || + ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= __constant_htons(IP_DF); + + if (!(p.i_flags&GRE_KEY)) + p.i_key = 0; + if 
(!(p.o_flags&GRE_KEY)) + p.o_key = 0; + + t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + + if (dev != &ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL && + t != &ipgre_fb_tunnel) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + unsigned nflags=0; + + t = (struct ip_tunnel*)dev->priv; + + if (MULTICAST(p.iph.daddr)) + nflags = IFF_BROADCAST; + else if (p.iph.daddr) + nflags = IFF_POINTOPOINT; + + if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { + err = -EINVAL; + break; + } + start_bh_atomic(); + ipgre_tunnel_unlink(t); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + t->parms.i_key = p.i_key; + t->parms.o_key = p.o_key; + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + ipgre_tunnel_link(t); + end_bh_atomic(); + netdev_state_change(dev); + } + } + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + t->parms.iph.frag_off = p.iph.frag_off; + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + if (dev == &ipgre_fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipgre_tunnel_locate(&p, 0)) == NULL) + goto done; + err = -EPERM; + if (t == &ipgre_fb_tunnel) + goto done; + } + err = unregister_netdevice(dev); + break; + + default: + err = -EINVAL; + } + +done: + MOD_DEC_USE_COUNT; + return err; +} + +static struct net_device_stats *ipgre_tunnel_get_stats(struct device *dev) +{ + return &(((struct ip_tunnel*)dev->priv)->stat); +} + +static int ipgre_tunnel_change_mtu(struct device *dev, int new_mtu) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +#ifdef CONFIG_NET_IPGRE_BROADCAST +/* Nice toy. Unfortunately, useless in real life :-) + It allows to construct virtual multiprotocol broadcast "LAN" + over the Internet, provided multicast routing is tuned. + + + I have no idea was this bicycle invented before me, + so that I had to set ARPHRD_IPGRE to a random value. + I have an impression, that Cisco could make something similar, + but this feature is apparently missing in IOS<=11.2(8). + + I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks + with broadcast 224.66.66.66. If you have access to mbone, play with me :-) + + ping -t 255 224.66.66.66 + + If nobody answers, mbone does not work. + + ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255 + ip addr add 10.66.66.<somewhat>/24 dev Universe + ifconfig Universe up + ifconfig Universe add fe80::<Your_real_addr>/10 + ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96 + ftp 10.66.66.66 + ... + ftp fec0:6666:6666::193.233.7.65 + ... + + */ + +static int ipgre_header(struct sk_buff *skb, struct device *dev, unsigned short type, + void *daddr, void *saddr, unsigned len) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); + u16 *p = (u16*)(iph+1); + + memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); + p[0] = t->parms.o_flags; + p[1] = htons(type); + + /* + * Set the source hardware address. 
+ */ + + if (saddr) + memcpy(&iph->saddr, saddr, 4); + + if (daddr) { + memcpy(&iph->daddr, daddr, 4); + return t->hlen; + } + if (iph->daddr && !MULTICAST(iph->daddr)) + return t->hlen; + + return -t->hlen; +} + +static int ipgre_open(struct device *dev) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + + MOD_INC_USE_COUNT; + if (MULTICAST(t->parms.iph.daddr)) { + struct rtable *rt; + if (ip_route_output(&rt, t->parms.iph.daddr, + t->parms.iph.saddr, RT_TOS(t->parms.iph.tos), + t->parms.link)) { + MOD_DEC_USE_COUNT; + return -EADDRNOTAVAIL; + } + dev = rt->u.dst.dev; + ip_rt_put(rt); + if (dev->ip_ptr == NULL) { + MOD_DEC_USE_COUNT; + return -EADDRNOTAVAIL; + } + t->mlink = dev->ifindex; + ip_mc_inc_group(dev->ip_ptr, t->parms.iph.daddr); + } + return 0; +} + +static int ipgre_close(struct device *dev) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + if (MULTICAST(t->parms.iph.daddr) && t->mlink) { + dev = dev_get_by_index(t->mlink); + if (dev && dev->ip_ptr) + ip_mc_dec_group(dev->ip_ptr, t->parms.iph.daddr); + } + MOD_DEC_USE_COUNT; + return 0; +} + +#endif + +static void ipgre_tunnel_init_gen(struct device *dev) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + + dev->destructor = ipgre_tunnel_destroy; + dev->hard_start_xmit = ipgre_tunnel_xmit; + dev->get_stats = ipgre_tunnel_get_stats; + dev->do_ioctl = ipgre_tunnel_ioctl; + dev->change_mtu = ipgre_tunnel_change_mtu; + + dev_init_buffers(dev); + + dev->type = ARPHRD_IPGRE; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4; + dev->mtu = 1500 - sizeof(struct iphdr) - 4; + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; + memcpy(dev->dev_addr, &t->parms.iph.saddr, 4); + memcpy(dev->broadcast, &t->parms.iph.daddr, 4); +} + +static int ipgre_tunnel_init(struct device *dev) +{ + struct device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + int hlen = LL_MAX_HEADER; + int mtu = 1500; + int addend = sizeof(struct iphdr) + 4; + + tunnel = (struct ip_tunnel*)dev->priv; + iph = &tunnel->parms.iph; + + ipgre_tunnel_init_gen(dev); + + /* Guess output device to choose reasonable mtu and hard_header_len */ + + if (iph->daddr) { + struct rtable *rt; + if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + + dev->flags |= IFF_POINTOPOINT; + +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (MULTICAST(iph->daddr)) { + if (!iph->saddr) + return -EINVAL; + dev->flags = IFF_BROADCAST; + dev->hard_header = ipgre_header; + dev->open = ipgre_open; + dev->stop = ipgre_close; + } +#endif + } + + if (!tdev && tunnel->parms.link) + tdev = dev_get_by_index(tunnel->parms.link); + + if (tdev) { + hlen = tdev->hard_header_len; + mtu = tdev->mtu; + } + dev->iflink = tunnel->parms.link; + + /* Precalculate GRE options length */ + if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { + if (tunnel->parms.o_flags&GRE_CSUM) + addend += 4; + if (tunnel->parms.o_flags&GRE_KEY) + addend += 4; + if (tunnel->parms.o_flags&GRE_SEQ) + addend += 4; + } + dev->hard_header_len = hlen + addend; + dev->mtu = mtu - addend; + tunnel->hlen = addend; + return 0; +} + +#ifdef MODULE +static int ipgre_fb_tunnel_open(struct device *dev) +{ + MOD_INC_USE_COUNT; + return 0; +} + +static int ipgre_fb_tunnel_close(struct device *dev) +{ + MOD_DEC_USE_COUNT; + return 0; +} +#endif + +__initfunc(int ipgre_fb_tunnel_init(struct device *dev)) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct iphdr *iph; + + 
ipgre_tunnel_init_gen(dev); +#ifdef MODULE + dev->open = ipgre_fb_tunnel_open; + dev->stop = ipgre_fb_tunnel_close; +#endif + + iph = &ipgre_fb_tunnel.parms.iph; + iph->version = 4; + iph->protocol = IPPROTO_GRE; + iph->ihl = 5; + tunnel->hlen = sizeof(struct iphdr) + 4; + + tunnels_wc[0] = &ipgre_fb_tunnel; + return 0; +} + + +static struct inet_protocol ipgre_protocol = { + ipgre_rcv, /* GRE handler */ + ipgre_err, /* TUNNEL error control */ + 0, /* next */ + IPPROTO_GRE, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "GRE" /* name */ +}; + + +/* + * And now the modules code and kernel interface. + */ + +#ifdef MODULE +int init_module(void) +#else +__initfunc(int ipgre_init(void)) +#endif +{ + printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); + + ipgre_fb_tunnel_dev.priv = (void*)&ipgre_fb_tunnel; + ipgre_fb_tunnel_dev.name = ipgre_fb_tunnel.parms.name; +#ifdef MODULE + register_netdev(&ipgre_fb_tunnel_dev); +#else + register_netdevice(&ipgre_fb_tunnel_dev); +#endif + + inet_add_protocol(&ipgre_protocol); + return 0; +} + +#ifdef MODULE + +void cleanup_module(void) +{ + if ( inet_del_protocol(&ipgre_protocol) < 0 ) + printk(KERN_INFO "ipgre close: can't remove protocol\n"); + + unregister_netdev(&ipgre_fb_tunnel_dev); +} + +#endif diff --git a/pfinet/linux-src/net/ipv4/ip_input.c b/pfinet/linux-src/net/ipv4/ip_input.c new file mode 100644 index 00000000..7a3e2618 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_input.c @@ -0,0 +1,549 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The Internet Protocol (IP) module. + * + * Version: $Id: ip_input.c,v 1.37 1999/04/22 10:38:36 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Donald Becker, <becker@super.org> + * Alan Cox, <Alan.Cox@linux.org> + * Richard Underwood + * Stefan Becker, <stefanb@yello.ping.de> + * Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * + * + * Fixes: + * Alan Cox : Commented a couple of minor bits of surplus code + * Alan Cox : Undefining IP_FORWARD doesn't include the code + * (just stops a compiler warning). + * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes + * are junked rather than corrupting things. + * Alan Cox : Frames to bad broadcast subnets are dumped + * We used to process them non broadcast and + * boy could that cause havoc. + * Alan Cox : ip_forward sets the free flag on the + * new frame it queues. Still crap because + * it copies the frame but at least it + * doesn't eat memory too. + * Alan Cox : Generic queue code and memory fixes. + * Fred Van Kempen : IP fragment support (borrowed from NET2E) + * Gerhard Koerting: Forward fragmented frames correctly. + * Gerhard Koerting: Fixes to my fix of the above 8-). + * Gerhard Koerting: IP interface addressing fix. + * Linus Torvalds : More robustness checks + * Alan Cox : Even more checks: Still not as robust as it ought to be + * Alan Cox : Save IP header pointer for later + * Alan Cox : ip option setting + * Alan Cox : Use ip_tos/ip_ttl settings + * Alan Cox : Fragmentation bogosity removed + * (Thanks to Mark.Bush@prg.ox.ac.uk) + * Dmitry Gorodchanin : Send of a raw packet crash fix. + * Alan Cox : Silly ip bug when an overlength + * fragment turns up. Now frees the + * queue. 
+ * Linus Torvalds/ : Memory leakage on fragmentation + * Alan Cox : handling. + * Gerhard Koerting: Forwarding uses IP priority hints + * Teemu Rantanen : Fragment problems. + * Alan Cox : General cleanup, comments and reformat + * Alan Cox : SNMP statistics + * Alan Cox : BSD address rule semantics. Also see + * UDP as there is a nasty checksum issue + * if you do things the wrong way. + * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file + * Alan Cox : IP options adjust sk->priority. + * Pedro Roque : Fix mtu/length error in ip_forward. + * Alan Cox : Avoid ip_chk_addr when possible. + * Richard Underwood : IP multicasting. + * Alan Cox : Cleaned up multicast handlers. + * Alan Cox : RAW sockets demultiplex in the BSD style. + * Gunther Mayer : Fix the SNMP reporting typo + * Alan Cox : Always in group 224.0.0.1 + * Pauline Middelink : Fast ip_checksum update when forwarding + * Masquerading support. + * Alan Cox : Multicast loopback error for 224.0.0.1 + * Alan Cox : IP_MULTICAST_LOOP option. + * Alan Cox : Use notifiers. + * Bjorn Ekwall : Removed ip_csum (from slhc.c too) + * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) + * Stefan Becker : Send out ICMP HOST REDIRECT + * Arnt Gulbrandsen : ip_build_xmit + * Alan Cox : Per socket routing cache + * Alan Cox : Fixed routing cache, added header cache. + * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it. + * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. + * Alan Cox : Incoming IP option handling. + * Alan Cox : Set saddr on raw output frames as per BSD. + * Alan Cox : Stopped broadcast source route explosions. + * Alan Cox : Can disable source routing + * Takeshi Sone : Masquerading didn't work. + * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible. + * Alan Cox : Memory leaks, tramples, misc debugging. + * Alan Cox : Fixed multicast (by popular demand 8)) + * Alan Cox : Fixed forwarding (by even more popular demand 8)) + * Alan Cox : Fixed SNMP statistics [I think] + * Gerhard Koerting : IP fragmentation forwarding fix + * Alan Cox : Device lock against page fault. + * Alan Cox : IP_HDRINCL facility. + * Werner Almesberger : Zero fragment bug + * Alan Cox : RAW IP frame length bug + * Alan Cox : Outgoing firewall on build_xmit + * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel + * Alan Cox : Multicast routing hooks + * Jos Vos : Do accounting *before* call_in_firewall + * Willy Konynenberg : Transparent proxying support + * + * + * + * To Fix: + * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient + * and could be made very efficient with the addition of some virtual memory hacks to permit + * the allocation of a buffer that can then be 'grown' by twiddling page tables. + * Output fragmentation wants updating along with the buffer management to use a single + * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet + * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause + * fragmentation anyway. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/config.h> + +#include <linux/net.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> + +#include <net/snmp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/arp.h> +#include <net/icmp.h> +#include <net/raw.h> +#include <net/checksum.h> +#include <linux/ip_fw.h> +#ifdef CONFIG_IP_MASQUERADE +#include <net/ip_masq.h> +#endif +#include <linux/firewall.h> +#include <linux/mroute.h> +#include <linux/netlink.h> + +/* + * SNMP management statistics + */ + +struct ip_mib ip_statistics={2,IPDEFTTL,}; /* Forwarding=No, Default TTL=64 */ + + +/* + * Handle the issuing of an ioctl() request + * for the ip device. This is scheduled to + * disappear + */ + +int ip_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch(cmd) + { + default: + return(-EINVAL); + } +} + + +#if defined(CONFIG_IP_TRANSPARENT_PROXY) && !defined(CONFIG_IP_ALWAYS_DEFRAG) +#define CONFIG_IP_ALWAYS_DEFRAG 1 +#endif + +/* + * 0 - deliver + * 1 - block + */ +static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) +{ + int type; + + type = skb->h.icmph->type; + if (type < 32) + return test_bit(type, &sk->tp_pinfo.tp_raw4.filter); + + /* Do not block unknown ICMP types */ + return 0; +} + +/* + * Process Router Attention IP option + */ +int ip_call_ra_chain(struct sk_buff *skb) +{ + struct ip_ra_chain *ra; + u8 protocol = skb->nh.iph->protocol; + struct sock *last = NULL; + + for (ra = ip_ra_chain; ra; ra = ra->next) { + struct sock *sk = ra->sk; + if (sk && sk->num == protocol) { + if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + skb = ip_defrag(skb); + if (skb == NULL) + return 1; + } + if (last) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + raw_rcv(last, skb2); + } + last = sk; + } + } + + if (last) { + raw_rcv(last, skb); + return 1; + } + return 0; +} + +/* + * Deliver IP Packets to the higher protocol layers. + */ +int ip_local_deliver(struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + struct inet_protocol *ipprot; + struct sock *raw_sk=NULL; + unsigned char hash; + int flag = 0; + +#ifndef CONFIG_IP_ALWAYS_DEFRAG + /* + * Reassemble IP fragments. + */ + + if (iph->frag_off & htons(IP_MF|IP_OFFSET)) { + skb = ip_defrag(skb); + if (!skb) + return 0; + iph = skb->nh.iph; + } +#endif + +#ifdef CONFIG_IP_MASQUERADE + /* + * Do we need to de-masquerade this packet? + */ + { + int ret; + /* + * Some masq modules can re-inject packets if + * bad configured. + */ + + if((IPCB(skb)->flags&IPSKB_MASQUERADED)) { + printk(KERN_DEBUG "ip_input(): demasq recursion detected. Check masq modules configuration\n"); + kfree_skb(skb); + return 0; + } + + ret = ip_fw_demasquerade(&skb); + if (ret < 0) { + kfree_skb(skb); + return 0; + } + + if (ret) { + iph=skb->nh.iph; + IPCB(skb)->flags |= IPSKB_MASQUERADED; + dst_release(skb->dst); + skb->dst = NULL; + if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev)) { + kfree_skb(skb); + return 0; + } + return skb->dst->input(skb); + } + } +#endif + + /* + * Point into the IP datagram, just past the header. + */ + + skb->h.raw = skb->nh.raw + iph->ihl*4; + + /* + * Deliver to raw sockets. This is fun as to avoid copies we want to make no + * surplus copies. 
+ * + * RFC 1122: SHOULD pass TOS value up to the transport layer. + * -> It does. And not only TOS, but all IP header. + */ + + /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ + hash = iph->protocol & (MAX_INET_PROTOS - 1); + + /* + * If there maybe a raw socket we must check - if not we don't care less + */ + + if((raw_sk = raw_v4_htable[hash]) != NULL) { + struct sock *sknext = NULL; + struct sk_buff *skb1; + raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex); + if(raw_sk) { /* Any raw sockets */ + do { + /* Find the next */ + sknext = raw_v4_lookup(raw_sk->next, iph->protocol, + iph->saddr, iph->daddr, skb->dev->ifindex); + if (iph->protocol != IPPROTO_ICMP || !icmp_filter(raw_sk, skb)) { + if (sknext == NULL) + break; + skb1 = skb_clone(skb, GFP_ATOMIC); + if(skb1) + { + raw_rcv(raw_sk, skb1); + } + } + raw_sk = sknext; + } while(raw_sk!=NULL); + + /* Here either raw_sk is the last raw socket, or NULL if + * none. We deliver to the last raw socket AFTER the + * protocol checks as it avoids a surplus copy. + */ + } + } + + /* + * skb->h.raw now points at the protocol beyond the IP header. + */ + + for (ipprot = (struct inet_protocol *)inet_protos[hash];ipprot != NULL;ipprot=(struct inet_protocol *)ipprot->next) + { + struct sk_buff *skb2; + + if (ipprot->protocol != iph->protocol) + continue; + /* + * See if we need to make a copy of it. This will + * only be set if more than one protocol wants it. + * and then not for the last one. If there is a pending + * raw delivery wait for that + */ + + if (ipprot->copy || raw_sk) + { + skb2 = skb_clone(skb, GFP_ATOMIC); + if(skb2==NULL) + continue; + } + else + { + skb2 = skb; + } + flag = 1; + + /* + * Pass on the datagram to each protocol that wants it, + * based on the datagram protocol. We should really + * check the protocol handler's return values here... + */ + + ipprot->handler(skb2, ntohs(iph->tot_len) - (iph->ihl * 4)); + } + + /* + * All protocols checked. + * If this packet was a broadcast, we may *not* reply to it, since that + * causes (proven, grin) ARP storms and a leakage of memory (i.e. all + * ICMP reply messages get queued up for transmission...) + */ + + if(raw_sk!=NULL) /* Shift to last raw user */ + { + raw_rcv(raw_sk, skb); + + } + else if (!flag) /* Free and report errors */ + { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + kfree_skb(skb); + } + + return(0); +} + +/* + * Main IP Receive routine. + */ +int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) +{ + struct iphdr *iph = skb->nh.iph; +#ifdef CONFIG_FIREWALL + int fwres; + u16 rport; +#endif /* CONFIG_FIREWALL */ + + /* + * When the interface is in promisc. mode, drop all the crap + * that it receives, do not try to analyse it. + */ + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + ip_statistics.IpInReceives++; + + /* + * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum. + * + * Is the datagram acceptable? + * + * 1. Length at least the size of an ip header + * 2. Version of 4 + * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] + * 4. Doesn't have a bogus length + */ + + if (skb->len < sizeof(struct iphdr)) + goto inhdr_error; + if (iph->ihl < 5 || iph->version != 4 || ip_fast_csum((u8 *)iph, iph->ihl) != 0) + goto inhdr_error; + + { + __u32 len = ntohs(iph->tot_len); + if (skb->len < len) + goto inhdr_error; + + /* + * Our transport medium may have padded the buffer out. 
Now we know it + * is IP we can trim to the true length of the frame. + * Note this now means skb->len holds ntohs(iph->tot_len). + */ + + __skb_trim(skb, len); + } + +#ifdef CONFIG_IP_ALWAYS_DEFRAG + /* Won't send ICMP reply, since skb->dst == NULL. --RR */ + if (iph->frag_off & htons(IP_MF|IP_OFFSET)) { + skb = ip_defrag(skb); + if (!skb) + return 0; + iph = skb->nh.iph; + ip_send_check(iph); + } +#endif + +#ifdef CONFIG_FIREWALL + /* + * See if the firewall wants to dispose of the packet. + * + * We can't do ICMP reply or local delivery before routing, + * so we delay those decisions until after route. --RR + */ + fwres = call_in_firewall(PF_INET, dev, iph, &rport, &skb); + if (fwres < FW_ACCEPT && fwres != FW_REJECT) + goto drop; + iph = skb->nh.iph; +#endif /* CONFIG_FIREWALL */ + + /* + * Initialise the virtual path cache for the packet. It describes + * how the packet travels inside Linux networking. + */ + if (skb->dst == NULL) { + if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) + goto drop; +#ifdef CONFIG_CPU_IS_SLOW + if (net_cpu_congestion > 10 && !(iph->tos&IPTOS_RELIABILITY) && + IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) { + goto drop; + } +#endif + } + +#ifdef CONFIG_NET_CLS_ROUTE + if (skb->dst->tclassid) { + u32 idx = skb->dst->tclassid; + ip_rt_acct[idx&0xFF].o_packets++; + ip_rt_acct[idx&0xFF].o_bytes+=skb->len; + ip_rt_acct[(idx>>16)&0xFF].i_packets++; + ip_rt_acct[(idx>>16)&0xFF].i_bytes+=skb->len; + } +#endif + + if (iph->ihl > 5) { + struct ip_options *opt; + + /* It looks as overkill, because not all + IP options require packet mangling. + But it is the easiest for now, especially taking + into account that combination of IP options + and running sniffer is extremely rare condition. + --ANK (980813) + */ + + skb = skb_cow(skb, skb_headroom(skb)); + if (skb == NULL) + return 0; + iph = skb->nh.iph; + + skb->ip_summed = 0; + if (ip_options_compile(NULL, skb)) + goto inhdr_error; + + opt = &(IPCB(skb)->opt); + if (opt->srr) { + struct in_device *in_dev = dev->ip_ptr; + if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev)) { + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) + printk(KERN_INFO "source route option %d.%d.%d.%d -> %d.%d.%d.%d\n", + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + goto drop; + } + if (ip_options_rcv_srr(skb)) + goto drop; + } + } + +#ifdef CONFIG_FIREWALL +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (fwres == FW_REDIRECT && (IPCB(skb)->redirport = rport) != 0) + return ip_local_deliver(skb); +#endif /* CONFIG_IP_TRANSPARENT_PROXY */ + + if (fwres == FW_REJECT) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + goto drop; + } +#endif /* CONFIG_FIREWALL */ + + return skb->dst->input(skb); + +inhdr_error: + ip_statistics.IpInHdrErrors++; +drop: + kfree_skb(skb); + return(0); +} + diff --git a/pfinet/linux-src/net/ipv4/ip_masq.c b/pfinet/linux-src/net/ipv4/ip_masq.c new file mode 100644 index 00000000..0187c58d --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_masq.c @@ -0,0 +1,2545 @@ +/* + * + * Masquerading functionality + * + * Copyright (c) 1994 Pauline Middelink + * + * $Id: ip_masq.c,v 1.34.2.2 1999/08/07 10:56:28 davem Exp $ + * + * + * See ip_fw.c for original log + * + * Fixes: + * Juan Jose Ciarlante : Modularized application masquerading (see ip_masq_app.c) + * Juan Jose Ciarlante : New struct ip_masq_seq that holds output/input delta seq. 
+ * Juan Jose Ciarlante : Added hashed lookup by proto,maddr,mport and proto,saddr,sport + * Juan Jose Ciarlante : Fixed deadlock if free ports get exhausted + * Juan Jose Ciarlante : Added NO_ADDR status flag. + * Richard Lynch : Added IP Autoforward + * Nigel Metheringham : Added ICMP handling for demasquerade + * Nigel Metheringham : Checksum checking of masqueraded data + * Nigel Metheringham : Better handling of timeouts of TCP conns + * Delian Delchev : Added support for ICMP requests and replys + * Nigel Metheringham : ICMP in ICMP handling, tidy ups, bug fixes, made ICMP optional + * Juan Jose Ciarlante : re-assign maddr if no packet received from outside + * Juan Jose Ciarlante : ported to 2.1 tree + * Juan Jose Ciarlante : reworked control connections + * Steven Clarke : Added Port Forwarding + * Juan Jose Ciarlante : Just ONE ip_masq_new (!) + * Juan Jose Ciarlante : IP masq modules support + * Juan Jose Ciarlante : don't go into search loop if mport specified + * Juan Jose Ciarlante : locking + * Steven Clarke : IP_MASQ_S_xx state design + * Juan Jose Ciarlante : IP_MASQ_S state implementation + * Juan Jose Ciarlante : xx_get() clears timer, _put() inserts it + * Juan Jose Ciarlante : create /proc/net/ip_masq/ + * Juan Jose Ciarlante : reworked checksums (save payload csum if possible) + * Juan Jose Ciarlante : added missing ip_fw_masquerade checksum + * Juan Jose Ciarlante : csum savings + * Juan Jose Ciarlante : added user-space tunnel creation/del, etc + * Juan Jose Ciarlante : (last) moved to ip_masq_user runtime module + * Juan Jose Ciarlante : user timeout handling again + * Juan Jose Ciarlante : make new modules support optional + * Juan Jose Ciarlante : u-space context => locks reworked + * Juan Jose Ciarlante : fixed stupid SMP locking bug + * Juan Jose Ciarlante : fixed "tap"ing in demasq path by copy-on-w + * Juan Jose Ciarlante : make masq_proto_doff() robust against fake sized/corrupted packets + * Kai Bankett : do not toss other IP protos in proto_doff() + * Dan Kegel : pointed correct NAT behavior for UDP streams + * Julian Anastasov : use daddr and dport as hash keys + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#ifdef CONFIG_KMOD +#include <linux/kmod.h> +#endif +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <asm/system.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/inet.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <net/icmp.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/checksum.h> +#include <net/ip_masq.h> + +#ifdef CONFIG_IP_MASQUERADE_MOD +#include <net/ip_masq_mod.h> +#endif + +#include <linux/sysctl.h> +#include <linux/ip_fw.h> +#include <linux/ip_masq.h> + +int sysctl_ip_masq_debug = 0; + +/* + * Exported wrapper + */ +int ip_masq_get_debug_level(void) +{ + return sysctl_ip_masq_debug; +} + +struct ip_masq_hook *ip_masq_user_hook = NULL; + +/* + * Timeout table[state] + */ +/* static int masq_timeout_table[IP_MASQ_S_LAST+1] = { */ +static struct ip_masq_timeout_table masq_timeout_table = { + ATOMIC_INIT(0), /* refcnt */ + 0, /* scale */ + { + 30*60*HZ, /* IP_MASQ_S_NONE, */ + 15*60*HZ, /* IP_MASQ_S_ESTABLISHED, */ + 2*60*HZ, /* IP_MASQ_S_SYN_SENT, */ + 1*60*HZ, /* IP_MASQ_S_SYN_RECV, */ + 2*60*HZ, /* IP_MASQ_S_FIN_WAIT, */ + 2*60*HZ, /* IP_MASQ_S_TIME_WAIT, */ + 10*HZ, /* IP_MASQ_S_CLOSE, */ + 60*HZ, /* IP_MASQ_S_CLOSE_WAIT, */ + 30*HZ, /* IP_MASQ_S_LAST_ACK, */ 
+ 2*60*HZ, /* IP_MASQ_S_LISTEN, */ + 5*60*HZ, /* IP_MASQ_S_UDP, */ + 1*60*HZ, /* IP_MASQ_S_ICMP, */ + 2*HZ,/* IP_MASQ_S_LAST */ + }, /* timeout */ +}; + +#define MASQUERADE_EXPIRE_RETRY masq_timeout_table.timeout[IP_MASQ_S_TIME_WAIT] + +static const char * state_name_table[IP_MASQ_S_LAST+1] = { + "NONE", /* IP_MASQ_S_NONE, */ + "ESTABLISHED", /* IP_MASQ_S_ESTABLISHED, */ + "SYN_SENT", /* IP_MASQ_S_SYN_SENT, */ + "SYN_RECV", /* IP_MASQ_S_SYN_RECV, */ + "FIN_WAIT", /* IP_MASQ_S_FIN_WAIT, */ + "TIME_WAIT", /* IP_MASQ_S_TIME_WAIT, */ + "CLOSE", /* IP_MASQ_S_CLOSE, */ + "CLOSE_WAIT", /* IP_MASQ_S_CLOSE_WAIT, */ + "LAST_ACK", /* IP_MASQ_S_LAST_ACK, */ + "LISTEN", /* IP_MASQ_S_LISTEN, */ + "UDP", /* IP_MASQ_S_UDP, */ + "ICMP", /* IP_MASQ_S_ICMP, */ + "BUG!", /* IP_MASQ_S_LAST */ +}; + +#define mNO IP_MASQ_S_NONE +#define mES IP_MASQ_S_ESTABLISHED +#define mSS IP_MASQ_S_SYN_SENT +#define mSR IP_MASQ_S_SYN_RECV +#define mFW IP_MASQ_S_FIN_WAIT +#define mTW IP_MASQ_S_TIME_WAIT +#define mCL IP_MASQ_S_CLOSE +#define mCW IP_MASQ_S_CLOSE_WAIT +#define mLA IP_MASQ_S_LAST_ACK +#define mLI IP_MASQ_S_LISTEN + +struct masq_tcp_states_t { + int next_state[IP_MASQ_S_LAST]; /* should be _LAST_TCP */ +}; + +const char * ip_masq_state_name(int state) +{ + if (state >= IP_MASQ_S_LAST) + return "ERR!"; + return state_name_table[state]; +} + +struct masq_tcp_states_t masq_tcp_states [] = { +/* INPUT */ +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */ +/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR }}, +/*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI }}, +/*ack*/ {{mCL, mES, mSS, mSR, mFW, mTW, mCL, mCW, mCL, mLI }}, +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI }}, + +/* OUTPUT */ +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */ +/*syn*/ {{mSS, mES, mSS, mES, mSS, mSS, mSS, mSS, mSS, mLI }}, +/*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI }}, +/*ack*/ {{mES, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mES }}, +/*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL }}, +}; + +static __inline__ int masq_tcp_state_idx(struct tcphdr *th, int output) +{ + /* + * [0-3]: input states, [4-7]: output. + */ + if (output) + output=4; + + if (th->rst) + return output+3; + if (th->syn) + return output+0; + if (th->fin) + return output+1; + if (th->ack) + return output+2; + return -1; +} + + + +static int masq_set_state_timeout(struct ip_masq *ms, int state) +{ + struct ip_masq_timeout_table *mstim = ms->timeout_table; + int scale; + + /* + * Use default timeout table if no specific for this entry + */ + if (!mstim) + mstim = &masq_timeout_table; + + ms->timeout = mstim->timeout[ms->state=state]; + scale = mstim->scale; + + if (scale<0) + ms->timeout >>= -scale; + else if (scale > 0) + ms->timeout <<= scale; + + return state; +} + +static int masq_tcp_state(struct ip_masq *ms, int output, struct tcphdr *th) +{ + int state_idx; + int new_state = IP_MASQ_S_CLOSE; + + if ((state_idx = masq_tcp_state_idx(th, output)) < 0) { + IP_MASQ_DEBUG(1, "masq_state_idx(%d)=%d!!!\n", + output, state_idx); + goto tcp_state_out; + } + + new_state = masq_tcp_states[state_idx].next_state[ms->state]; + +tcp_state_out: + if (new_state!=ms->state) + IP_MASQ_DEBUG(1, "%s %s [%c%c%c%c] %08lX:%04X-%08lX:%04X state: %s->%s\n", + masq_proto_name(ms->protocol), + output? "output" : "input ", + th->syn? 'S' : '.', + th->fin? 'F' : '.', + th->ack? 'A' : '.', + th->rst? 
'R' : '.', + ntohl(ms->saddr), ntohs(ms->sport), + ntohl(ms->daddr), ntohs(ms->dport), + ip_masq_state_name(ms->state), + ip_masq_state_name(new_state)); + return masq_set_state_timeout(ms, new_state); +} + + +/* + * Handle state transitions + */ +static int masq_set_state(struct ip_masq *ms, int output, struct iphdr *iph, void *tp) +{ + switch (iph->protocol) { + case IPPROTO_ICMP: + return masq_set_state_timeout(ms, IP_MASQ_S_ICMP); + case IPPROTO_UDP: + return masq_set_state_timeout(ms, IP_MASQ_S_UDP); + case IPPROTO_TCP: + return masq_tcp_state(ms, output, tp); + } + return -1; +} + +/* + * Set LISTEN timeout. (ip_masq_put will setup timer) + */ +int ip_masq_listen(struct ip_masq *ms) +{ + masq_set_state_timeout(ms, IP_MASQ_S_LISTEN); + return ms->timeout; +} + +/* + * Dynamic address rewriting + */ +extern int sysctl_ip_dynaddr; + +/* + * Lookup lock + */ +rwlock_t __ip_masq_lock = RW_LOCK_UNLOCKED; + +/* + * Implement IP packet masquerading + */ + +/* + * Converts an ICMP reply code into the equivalent request code + */ +static __inline__ const __u8 icmp_type_request(__u8 type) +{ + switch (type) + { + case ICMP_ECHOREPLY: return ICMP_ECHO; break; + case ICMP_TIMESTAMPREPLY: return ICMP_TIMESTAMP; break; + case ICMP_INFO_REPLY: return ICMP_INFO_REQUEST; break; + case ICMP_ADDRESSREPLY: return ICMP_ADDRESS; break; + default: return (255); break; + } +} + +/* + * Helper macros - attempt to make code clearer! + */ + +/* ID used in ICMP lookups */ +#define icmp_id(icmph) ((icmph->un).echo.id) +/* (port) hash value using in ICMP lookups for requests */ +#define icmp_hv_req(icmph) ((__u16)(icmph->code+(__u16)(icmph->type<<8))) +/* (port) hash value using in ICMP lookups for replies */ +#define icmp_hv_rep(icmph) ((__u16)(icmph->code+(__u16)(icmp_type_request(icmph->type)<<8))) + +/* + * Last masq_port number in use. + * Will cycle in MASQ_PORT boundaries. + */ +static __u16 masq_port = PORT_MASQ_BEGIN; +static spinlock_t masq_port_lock = SPIN_LOCK_UNLOCKED; + +/* + * free ports counters (UDP & TCP) + * + * Their value is _less_ or _equal_ to actual free ports: + * same masq port, diff masq addr (firewall iface address) allocated + * entries are accounted but their actually don't eat a more than 1 port. + * + * Greater values could lower MASQ_EXPIRATION setting as a way to + * manage 'masq_entries resource'. + * + * By default we will reuse masq.port iff (output) connection + * (5-upla) if not duplicated. + * This may break midentd and others ... + */ + +#ifdef CONFIG_IP_MASQ_NREUSE +#define PORT_MASQ_MUL 1 +#else +#define PORT_MASQ_MUL 10 +#endif + +/* + * At the moment, hardcore in sync with masq_proto_num + */ +atomic_t ip_masq_free_ports[3] = { + ATOMIC_INIT((PORT_MASQ_END-PORT_MASQ_BEGIN) * PORT_MASQ_MUL),/* UDP */ + ATOMIC_INIT((PORT_MASQ_END-PORT_MASQ_BEGIN) * PORT_MASQ_MUL),/* TCP */ + ATOMIC_INIT((PORT_MASQ_END-PORT_MASQ_BEGIN) * PORT_MASQ_MUL),/* ICMP */ +}; + +/* + * Counts entries that have been requested with specific mport. + * Used for incoming packets to "relax" input rule (port in MASQ range). 
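+ *
+ *	Illustration (hypothetical names, optional masq-mod hook ignored):
+ *	the input test in ip_fw_demasquerade() below effectively computes
+ *
+ *		in_range = ntohs(dport) >= PORT_MASQ_BEGIN &&
+ *			   ntohs(dport) <= PORT_MASQ_END;
+ *		accept	 = in_range || atomic_read(&mport_count) != 0;
+ *
+ *	so packets outside the masq port range are still considered for
+ *	demasquerading while at least one entry created with an explicit
+ *	mport exists.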
+ */ +atomic_t mport_count = ATOMIC_INIT(0); + +EXPORT_SYMBOL(ip_masq_get_debug_level); +EXPORT_SYMBOL(ip_masq_new); +EXPORT_SYMBOL(ip_masq_listen); +EXPORT_SYMBOL(ip_masq_free_ports); +EXPORT_SYMBOL(ip_masq_out_get); +EXPORT_SYMBOL(ip_masq_in_get); +EXPORT_SYMBOL(ip_masq_put); +EXPORT_SYMBOL(ip_masq_control_add); +EXPORT_SYMBOL(ip_masq_control_del); +EXPORT_SYMBOL(ip_masq_control_get); +EXPORT_SYMBOL(ip_masq_user_hook); +EXPORT_SYMBOL(ip_masq_state_name); +EXPORT_SYMBOL(ip_masq_select_addr); +EXPORT_SYMBOL(__ip_masq_lock); +EXPORT_SYMBOL(ip_masq_m_table); +EXPORT_SYMBOL(ip_masq_s_table); +EXPORT_SYMBOL(ip_masq_d_table); + +/* + * 3 ip_masq hash double linked tables: + * 2 for input m{addr,port} and output s{addr,port} pkts lookups. + * 1 for extra modules support (daddr) + */ + +#define IP_MASQ_NTABLES 3 + +struct list_head ip_masq_m_table[IP_MASQ_TAB_SIZE]; +struct list_head ip_masq_s_table[IP_MASQ_TAB_SIZE]; +struct list_head ip_masq_d_table[IP_MASQ_TAB_SIZE]; + +/* + * timeouts + */ + +#if 000 /* FIXED timeout handling */ +static struct ip_fw_masq ip_masq_dummy = { + MASQUERADE_EXPIRE_TCP, + MASQUERADE_EXPIRE_TCP_FIN, + MASQUERADE_EXPIRE_UDP +}; + +EXPORT_SYMBOL(ip_masq_expire); +struct ip_fw_masq *ip_masq_expire = &ip_masq_dummy; +#endif + +/* + * These flags enable non-strict d{addr,port} checks + * Given that both (in/out) lookup tables are hashed + * by m{addr,port} and s{addr,port} this is quite easy + */ + +#define MASQ_DADDR_PASS (IP_MASQ_F_NO_DADDR|IP_MASQ_F_DLOOSE) +#define MASQ_DPORT_PASS (IP_MASQ_F_NO_DPORT|IP_MASQ_F_DLOOSE) + +/* + * By default enable dest loose semantics + */ +#define CONFIG_IP_MASQ_LOOSE_DEFAULT 1 + + +/* + * Set masq expiration (deletion) and adds timer, + * if timeout==0 cancel expiration. + * Warning: it does not check/delete previous timer! + */ + +static void __ip_masq_set_expire(struct ip_masq *ms, unsigned long tout) +{ + if (tout) { + ms->timer.expires = jiffies+tout; + add_timer(&ms->timer); + } else { + del_timer(&ms->timer); + } +} + + +/* + * Returns hash value + */ + +static __inline__ unsigned +ip_masq_hash_key(unsigned proto, __u32 addr, __u16 port) +{ + return (proto^ntohl(addr)^ntohs(port)) & (IP_MASQ_TAB_SIZE-1); +} + +/* + * Hashes ip_masq by its proto,addrs,ports. + * should be called with locked tables. + * returns bool success. 
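+ *
+ *	A minimal sketch of the rehash pattern used later in this file
+ *	(see ip_fw_masquerade); example_rehash() is a hypothetical helper
+ *	and is guarded out with #if 0, so it is never compiled:
+ */
+
+#if 0
+static int example_rehash(struct ip_masq *ms, __u32 new_maddr)
+{
+	int hashed;
+
+	write_lock(&__ip_masq_lock);	/* caller provides the table lock */
+	ip_masq_unhash(ms);		/* drop the old hash position...  */
+	ms->maddr = new_maddr;		/* ...change the key...           */
+	hashed = ip_masq_hash(ms);	/* ...and re-insert the entry     */
+	write_unlock(&__ip_masq_lock);
+	return hashed;
+}
+#endif
+
+/*
+ *	ip_masq_hash() itself follows; caller must hold the table lock.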
+ */ + +static int ip_masq_hash(struct ip_masq *ms) +{ + unsigned hash; + + if (ms->flags & IP_MASQ_F_HASHED) { + IP_MASQ_ERR( "ip_masq_hash(): request for already hashed, called from %p\n", + __builtin_return_address(0)); + return 0; + } + atomic_add(IP_MASQ_NTABLES, &ms->refcnt); + + if ((ms->flags & (MASQ_DADDR_PASS | MASQ_DPORT_PASS | + IP_MASQ_F_SIMPLE_HASH)) == 0) + /* + * Hash by proto,m{addr,port},d{addr,port} + */ + hash = ip_masq_hash_key(ms->protocol, + ms->maddr^ms->daddr, ms->mport^ms->dport); + else + /* + * Hash by proto,m{addr,port} + */ + hash = ip_masq_hash_key(ms->protocol, ms->maddr, ms->mport); + + list_add(&ms->m_list, &ip_masq_m_table[hash]); + + if ((ms->flags & (MASQ_DADDR_PASS | MASQ_DPORT_PASS | + IP_MASQ_F_NO_SADDR | IP_MASQ_F_NO_SPORT | + IP_MASQ_F_SIMPLE_HASH)) == 0) + /* + * Hash by proto,s{addr,port},d{addr,port} + */ + hash = ip_masq_hash_key(ms->protocol, + ms->saddr^ms->daddr, ms->sport^ms->dport); + else + /* + * Hash by proto,s{addr,port} + */ + hash = ip_masq_hash_key(ms->protocol, ms->saddr, ms->sport); + + list_add(&ms->s_list, &ip_masq_s_table[hash]); + + /* + * Hash by proto,d{addr,port} + */ + hash = ip_masq_hash_key(ms->protocol, ms->daddr, ms->dport); + list_add(&ms->d_list, &ip_masq_d_table[hash]); + + + ms->flags |= IP_MASQ_F_HASHED; + return 1; +} + +/* + * UNhashes ip_masq from ip_masq_[ms]_tables. + * should be called with locked tables. + * returns bool success. + */ + +static int ip_masq_unhash(struct ip_masq *ms) +{ + if (!(ms->flags & IP_MASQ_F_HASHED)) { + IP_MASQ_ERR( "ip_masq_unhash(): request for unhash flagged, called from %p\n", + __builtin_return_address(0)); + return 0; + } + list_del(&ms->m_list); + list_del(&ms->s_list); + list_del(&ms->d_list); + + atomic_sub(IP_MASQ_NTABLES, &ms->refcnt); + + ms->flags &= ~IP_MASQ_F_HASHED; + return 1; +} + +/* + * Returns ip_masq associated with supplied parameters, either + * broken out of the ip/tcp headers or directly supplied for those + * pathological protocols with address/port in the data stream + * (ftp, irc). addresses and ports are in network order. + * called for pkts coming from OUTside-to-INside the firewall. + * + * s_addr, s_port: pkt source address (foreign host) + * d_addr, d_port: pkt dest address (firewall) + * + * NB. Cannot check destination address, just for the incoming port. + * reason: archie.doc.ac.uk has 6 interfaces, you send to + * phoenix and get a reply from any other interface(==dst)! 
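+ *
+ *	For a TCP reply arriving from the remote peer at the firewall's
+ *	masqueraded port the lookup reads (illustrative only, header
+ *	pointers assumed):
+ *
+ *		ms = __ip_masq_in_get(IPPROTO_TCP,
+ *				      iph->saddr, th->source,	-- foreign host
+ *				      iph->daddr, th->dest);	-- maddr:mport
+ *
+ *	which is roughly what ip_masq_in_get_iph() expands to for TCP/UDP.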
+ * + * [Only for UDP] - AC + * + * Caller must lock tables + */ + +static struct ip_masq * __ip_masq_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + unsigned hash; + struct ip_masq *ms = NULL; + struct list_head *l,*e; + + hash = ip_masq_hash_key(protocol, d_addr^s_addr, d_port^s_port); + + l = &ip_masq_m_table[hash]; + for (e=l->next; e!=l; e=e->next) { + ms = list_entry(e, struct ip_masq, m_list); + if (s_port==ms->dport && s_addr==ms->daddr && + d_port==ms->mport && protocol==ms->protocol && + d_addr==ms->maddr && + ((ms->flags & (MASQ_DADDR_PASS | MASQ_DPORT_PASS)) == 0) + ) { + IP_MASQ_DEBUG(2, "look/in %d %08X:%04hX->%08X:%04hX OK\n", + protocol, + s_addr, + s_port, + d_addr, + d_port); + atomic_inc(&ms->refcnt); + goto out; + } + } + + hash = ip_masq_hash_key(protocol, d_addr, d_port); + + l = &ip_masq_m_table[hash]; + for (e=l->next; e!=l; e=e->next) { + ms = list_entry(e, struct ip_masq, m_list); + if (protocol==ms->protocol && + (d_addr==ms->maddr && d_port==ms->mport) && + (s_addr==ms->daddr || ms->flags & MASQ_DADDR_PASS) && + (s_port==ms->dport || ms->flags & MASQ_DPORT_PASS) + ) { + IP_MASQ_DEBUG(2, "look/in %d %08X:%04hX->%08X:%04hX OK\n", + protocol, + s_addr, + s_port, + d_addr, + d_port); + atomic_inc(&ms->refcnt); + goto out; + } + } + IP_MASQ_DEBUG(2, "look/in %d %08X:%04hX->%08X:%04hX fail\n", + protocol, + s_addr, + s_port, + d_addr, + d_port); + + ms = NULL; +out: + return ms; +} + +/* + * Returns ip_masq associated with supplied parameters, either + * broken out of the ip/tcp headers or directly supplied for those + * pathological protocols with address/port in the data stream + * (ftp, irc). addresses and ports are in network order. + * called for pkts coming from inside-to-OUTside the firewall. + * + * Normally we know the source address and port but for some protocols + * (e.g. ftp PASV) we do not know the source port initially. Alas the + * hash is keyed on source port so if the first lookup fails then try again + * with a zero port, this time only looking at entries marked "no source + * port". 
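+ *
+ *	Lookup order implemented below (summary; flag names as used in
+ *	this file):
+ *
+ *		1. exact tuple:	    hash(proto, saddr^daddr, sport^dport),
+ *				    only entries with no PASS/NO_S* flags
+ *		2. loose dest:	    hash(proto, saddr, sport), daddr/dport
+ *				    compared unless MASQ_DADDR_PASS or
+ *				    MASQ_DPORT_PASS is set
+ *		3. no source port:  hash(proto, saddr, 0), entries flagged
+ *				    IP_MASQ_F_NO_SPORT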
+ * + * Caller must lock tables + */ + +static struct ip_masq * __ip_masq_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + unsigned hash; + struct ip_masq *ms = NULL; + struct list_head *l,*e; + + /* + * Check for "full" addressed entries + */ + hash = ip_masq_hash_key(protocol, s_addr^d_addr, s_port^d_port); + + l = &ip_masq_s_table[hash]; + for (e=l->next; e!=l; e=e->next) { + ms = list_entry(e, struct ip_masq, s_list); + if (d_addr==ms->daddr && d_port==ms->dport && + s_addr==ms->saddr && s_port==ms->sport && + protocol==ms->protocol && + ((ms->flags & (MASQ_DADDR_PASS | MASQ_DPORT_PASS | + IP_MASQ_F_NO_SADDR | IP_MASQ_F_NO_SPORT)) == 0) + ) { + IP_MASQ_DEBUG(2, "lk/out0 %d %08X:%04hX->%08X:%04hX OK\n", + protocol, + s_addr, + s_port, + d_addr, + d_port); + + atomic_inc(&ms->refcnt); + goto out; + } + + } + + hash = ip_masq_hash_key(protocol, s_addr, s_port); + + l = &ip_masq_s_table[hash]; + for (e=l->next; e!=l; e=e->next) { + ms = list_entry(e, struct ip_masq, s_list); + if (protocol == ms->protocol && + s_addr == ms->saddr && s_port == ms->sport && + (d_addr==ms->daddr || ms->flags & MASQ_DADDR_PASS) && + (d_port==ms->dport || ms->flags & MASQ_DPORT_PASS) + ) { + IP_MASQ_DEBUG(2, "lk/out1 %d %08X:%04hX->%08X:%04hX OK\n", + protocol, + s_addr, + s_port, + d_addr, + d_port); + + atomic_inc(&ms->refcnt); + goto out; + } + + } + + /* + * Check for NO_SPORT entries + */ + hash = ip_masq_hash_key(protocol, s_addr, 0); + l = &ip_masq_s_table[hash]; + for (e=l->next; e!=l; e=e->next) { + ms = list_entry(e, struct ip_masq, s_list); + if (ms->flags & IP_MASQ_F_NO_SPORT && + protocol == ms->protocol && + s_addr == ms->saddr && + (d_addr==ms->daddr || ms->flags & MASQ_DADDR_PASS) && + (d_port==ms->dport || ms->flags & MASQ_DPORT_PASS) + ) { + IP_MASQ_DEBUG(2, "lk/out2 %d %08X:%04hX->%08X:%04hX OK\n", + protocol, + s_addr, + s_port, + d_addr, + d_port); + + atomic_inc(&ms->refcnt); + goto out; + } + } + IP_MASQ_DEBUG(2, "lk/out1 %d %08X:%04hX->%08X:%04hX fail\n", + protocol, + s_addr, + s_port, + d_addr, + d_port); + + ms = NULL; +out: + return ms; +} + +#ifdef CONFIG_IP_MASQ_NREUSE +/* + * Returns ip_masq for given proto,m_addr,m_port. + * called by allocation routine to find an unused m_port. 
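+ *
+ *	ip_masq_new() probes candidate ports roughly as (locking and
+ *	refcounting elided, illustrative only):
+ *
+ *		do {
+ *			mport = get_next_mport();
+ *		} while (__ip_masq_getbym(proto, maddr, mport) != NULL);
+ *
+ *	Note that, as written, this CONFIG_IP_MASQ_NREUSE variant still
+ *	walks the older ip_masq_m_tab/m_link open hash rather than the
+ *	ip_masq_m_table list heads used by the rest of this file.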
+ * + * Caller must lock tables + */ + +static struct ip_masq * __ip_masq_getbym(int protocol, __u32 m_addr, __u16 m_port) +{ + unsigned hash; + struct ip_masq *ms = NULL; + + hash = ip_masq_hash_key(protocol, m_addr, m_port); + + for(ms = ip_masq_m_tab[hash]; ms ; ms = ms->m_link) { + if ( protocol==ms->protocol && + (m_addr==ms->maddr && m_port==ms->mport)) { + atomic_inc(&ms->refcnt); + goto out; + } + } + +out: + return ms; +} +#endif + +struct ip_masq * ip_masq_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + struct ip_masq *ms; + + read_lock(&__ip_masq_lock); + ms = __ip_masq_out_get(protocol, s_addr, s_port, d_addr, d_port); + read_unlock(&__ip_masq_lock); + + if (ms) + __ip_masq_set_expire(ms, 0); + return ms; +} + +struct ip_masq * ip_masq_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + struct ip_masq *ms; + + read_lock(&__ip_masq_lock); + ms = __ip_masq_in_get(protocol, s_addr, s_port, d_addr, d_port); + read_unlock(&__ip_masq_lock); + + if (ms) + __ip_masq_set_expire(ms, 0); + return ms; +} + +static __inline__ void __ip_masq_put(struct ip_masq *ms) +{ + atomic_dec(&ms->refcnt); +} + +void ip_masq_put(struct ip_masq *ms) +{ + /* + * Decrement refcnt + */ + __ip_masq_put(ms); + + /* + * if refcnt==IP_MASQ_NTABLES + */ + if (atomic_read(&ms->refcnt)==IP_MASQ_NTABLES) { + __ip_masq_set_expire(ms, ms->timeout); + } else { + IP_MASQ_DEBUG(0, "did not set timer with refcnt=%d, called from %p\n", + atomic_read(&ms->refcnt), + __builtin_return_address(0)); + } +} + +static void masq_expire(unsigned long data) +{ + struct ip_masq *ms = (struct ip_masq *)data; + ms->timeout = MASQUERADE_EXPIRE_RETRY; + + /* + * hey, I'm using it + */ + atomic_inc(&ms->refcnt); + + IP_MASQ_DEBUG(1, "Masqueraded %s %08lX:%04X expired\n", + masq_proto_name(ms->protocol), + ntohl(ms->saddr),ntohs(ms->sport)); + + write_lock(&__ip_masq_lock); + +#if 0000 + /* + * Already locked, do bounce ... + */ + if (ip_masq_nlocks(&__ip_masq_lock) != 1) { + goto masq_expire_later; + } + +#endif + /* + * do I control anybody? + */ + if (atomic_read(&ms->n_control)) + goto masq_expire_later; + + /* + * does anybody controls me? + */ + + if (ms->control) + ip_masq_control_del(ms); + + if (ip_masq_unhash(ms)) { + if (ms->flags&IP_MASQ_F_MPORT) { + atomic_dec(&mport_count); + } else { + atomic_inc(ip_masq_free_ports + masq_proto_num(ms->protocol)); + } + ip_masq_unbind_app(ms); + } + + /* + * refcnt==1 implies I'm the only one referrer + */ + if (atomic_read(&ms->refcnt) == 1) { + kfree_s(ms,sizeof(*ms)); + MOD_DEC_USE_COUNT; + goto masq_expire_out; + } + +masq_expire_later: + IP_MASQ_DEBUG(0, "masq_expire delayed: %s %08lX:%04X->%08lX:%04X masq.refcnt-1=%d masq.n_control=%d\n", + masq_proto_name(ms->protocol), + ntohl(ms->saddr), ntohs(ms->sport), + ntohl(ms->daddr), ntohs(ms->dport), + atomic_read(&ms->refcnt)-1, + atomic_read(&ms->n_control)); + + ip_masq_put(ms); + +masq_expire_out: + write_unlock(&__ip_masq_lock); +} + +static __u16 get_next_mport(void) +{ + __u16 mport; + + spin_lock_irq(&masq_port_lock); + /* + * Try the next available port number + */ + mport = htons(masq_port++); + if (masq_port==PORT_MASQ_END) masq_port = PORT_MASQ_BEGIN; + + spin_unlock_irq(&masq_port_lock); + return mport; +} + +/* + * Create a new masquerade list entry, also allocate an + * unused mport, keeping the portnumber between the + * given boundaries MASQ_BEGIN and MASQ_END. 
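+ *
+ *	Typical call from the output path (see ip_fw_masquerade below):
+ *
+ *		ms = ip_masq_new(iph->protocol,
+ *				 maddr, 0,			-- mport 0: allocate one
+ *				 iph->saddr, h.portp[0],	-- original source
+ *				 iph->daddr, h.portp[1],	-- destination
+ *				 0);				-- no flags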
+ * + * Be careful, it can be called from u-space + */ + +struct ip_masq * ip_masq_new(int proto, __u32 maddr, __u16 mport, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned mflags) +{ + struct ip_masq *ms, *mst; + int ports_tried; + atomic_t *free_ports_p = NULL; + static int n_fails = 0; + int prio; + + + if (masq_proto_num(proto)!=-1 && mport == 0) { + free_ports_p = ip_masq_free_ports + masq_proto_num(proto); + + if (atomic_read(free_ports_p) == 0) { + if (++n_fails < 5) + IP_MASQ_ERR( "ip_masq_new(proto=%s): no free ports.\n", + masq_proto_name(proto)); + return NULL; + } + } + + prio = (mflags&IP_MASQ_F_USER) ? GFP_KERNEL : GFP_ATOMIC; + + ms = (struct ip_masq *) kmalloc(sizeof(struct ip_masq), prio); + if (ms == NULL) { + if (++n_fails < 5) + IP_MASQ_ERR("ip_masq_new(proto=%s): no memory available.\n", + masq_proto_name(proto)); + return NULL; + } + MOD_INC_USE_COUNT; + memset(ms, 0, sizeof(*ms)); + INIT_LIST_HEAD(&ms->s_list); + INIT_LIST_HEAD(&ms->m_list); + INIT_LIST_HEAD(&ms->d_list); + init_timer(&ms->timer); + ms->timer.data = (unsigned long)ms; + ms->timer.function = masq_expire; + ms->protocol = proto; + ms->saddr = saddr; + ms->sport = sport; + ms->daddr = daddr; + ms->dport = dport; + ms->flags = mflags; + ms->app_data = NULL; + ms->control = NULL; + + atomic_set(&ms->n_control,0); + atomic_set(&ms->refcnt,0); + + if (proto == IPPROTO_UDP && !mport) +#ifdef CONFIG_IP_MASQ_LOOSE_DEFAULT + /* + * Flag this tunnel as "dest loose" + * + */ + ms->flags |= IP_MASQ_F_DLOOSE; +#else + ms->flags |= IP_MASQ_F_NO_DADDR; +#endif + + + /* get masq address from rif */ + ms->maddr = maddr; + + /* + * This flag will allow masq. addr (ms->maddr) + * to follow forwarding interface address. + */ + ms->flags |= IP_MASQ_F_NO_REPLY; + + /* + * We want a specific mport. Be careful. + */ + if (masq_proto_num(proto) == -1 || mport) { + ms->mport = mport; + + /* + * Check 5-upla uniqueness + */ + if (mflags & IP_MASQ_F_USER) + write_lock_bh(&__ip_masq_lock); + else + write_lock(&__ip_masq_lock); + + mst = __ip_masq_in_get(proto, daddr, dport, maddr, mport); + if (mst==NULL) { + ms->flags |= IP_MASQ_F_MPORT; + + atomic_inc(&mport_count); + ip_masq_hash(ms); + + if (mflags & IP_MASQ_F_USER) + write_unlock_bh(&__ip_masq_lock); + else + write_unlock(&__ip_masq_lock); + + ip_masq_bind_app(ms); + atomic_inc(&ms->refcnt); + masq_set_state_timeout(ms, IP_MASQ_S_NONE); + return ms; + } + if (mflags & IP_MASQ_F_USER) + write_unlock_bh(&__ip_masq_lock); + else + write_unlock(&__ip_masq_lock); + + __ip_masq_put(mst); + + IP_MASQ_ERR( "Already used connection: %s, %d.%d.%d.%d:%d => %d.%d.%d.%d:%d, called from %p\n", + masq_proto_name(proto), + NIPQUAD(maddr), ntohs(mport), + NIPQUAD(daddr), ntohs(dport), + __builtin_return_address(0)); + + + goto mport_nono; + } + + + for (ports_tried = 0; + (atomic_read(free_ports_p) && (ports_tried <= (PORT_MASQ_END - PORT_MASQ_BEGIN))); + ports_tried++){ + + mport = ms->mport = get_next_mport(); + /* + * lookup to find out if this connection is used. 
+ */ + + if (mflags & IP_MASQ_F_USER) + write_lock_bh(&__ip_masq_lock); + else + write_lock(&__ip_masq_lock); + +#ifdef CONFIG_IP_MASQ_NREUSE + mst = __ip_masq_getbym(proto, maddr, mport); +#else + mst = __ip_masq_in_get(proto, daddr, dport, maddr, mport); +#endif + if (mst == NULL) { + + if (atomic_read(free_ports_p) == 0) { + if (mflags & IP_MASQ_F_USER) + write_unlock_bh(&__ip_masq_lock); + else + write_unlock(&__ip_masq_lock); + + break; + } + atomic_dec(free_ports_p); + ip_masq_hash(ms); + + if (mflags & IP_MASQ_F_USER) + write_unlock_bh(&__ip_masq_lock); + else + write_unlock(&__ip_masq_lock); + + ip_masq_bind_app(ms); + n_fails = 0; + atomic_inc(&ms->refcnt); + masq_set_state_timeout(ms, IP_MASQ_S_NONE); + return ms; + } + if (mflags & IP_MASQ_F_USER) + write_unlock_bh(&__ip_masq_lock); + else + write_unlock(&__ip_masq_lock); + + __ip_masq_put(mst); + } + + if (++n_fails < 5) + IP_MASQ_ERR( "ip_masq_new(proto=%s): could not get free masq entry (free=%d).\n", + masq_proto_name(ms->protocol), + atomic_read(free_ports_p)); +mport_nono: + kfree_s(ms, sizeof(*ms)); + + MOD_DEC_USE_COUNT; + return NULL; +} + +/* + * Get transport protocol data offset, check against size + * return: + * 0 if other IP proto + * -1 if error + */ +static __inline__ int proto_doff(unsigned proto, char *th, unsigned size) +{ + int ret = -1; + switch (proto) { + case IPPROTO_ICMP: + if (size >= sizeof(struct icmphdr)) + ret = sizeof(struct icmphdr); + break; + case IPPROTO_UDP: + if (size >= sizeof(struct udphdr)) + ret = sizeof(struct udphdr); + break; + case IPPROTO_TCP: + /* + * Is this case, this check _also_ avoids + * touching an invalid pointer if + * size is invalid + */ + if (size >= sizeof(struct tcphdr)) { + ret = ((struct tcphdr*)th)->doff << 2; + if (ret > size) { + ret = -1 ; + } + } + + break; + default: + /* Other proto: nothing to say, by now :) */ + ret = 0; + } + if (ret < 0) + IP_MASQ_DEBUG(0, "mess proto_doff for proto=%d, size =%d\n", + proto, size); + return ret; +} + +int ip_fw_masquerade(struct sk_buff **skb_p, __u32 maddr) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph = skb->nh.iph; + union ip_masq_tphdr h; + struct ip_masq *ms; + int size; + + /* + * doff holds transport protocol data offset + * csum holds its checksum + * csum_ok says if csum is valid + */ + int doff = 0; + int csum = 0; + int csum_ok = 0; + + /* + * We can only masquerade protocols with ports... and hack some ICMPs + */ + + h.raw = (char*) iph + iph->ihl * 4; + size = ntohs(iph->tot_len) - (iph->ihl * 4); + + + doff = proto_doff(iph->protocol, h.raw, size); + if (doff <= 0) { + /* + * Output path: do not pass other IP protos nor + * invalid packets. 
+ */ + return -1; + } + + switch (iph->protocol) { + case IPPROTO_ICMP: + return(ip_fw_masq_icmp(skb_p, maddr)); + case IPPROTO_UDP: + if (h.uh->check == 0) + /* No UDP checksum */ + break; + case IPPROTO_TCP: + /* Make sure packet is in the masq range */ + IP_MASQ_DEBUG(3, "O-pkt: %s size=%d\n", + masq_proto_name(iph->protocol), + size); + +#ifdef CONFIG_IP_MASQ_DEBUG + if (ip_masq_get_debug_level() > 3) { + skb->ip_summed = CHECKSUM_NONE; + } +#endif + /* Check that the checksum is OK */ + switch (skb->ip_summed) + { + case CHECKSUM_NONE: + { + csum = csum_partial(h.raw + doff, size - doff, 0); + IP_MASQ_DEBUG(3, "O-pkt: %s I-datacsum=%d\n", + masq_proto_name(iph->protocol), + csum); + + skb->csum = csum_partial(h.raw , doff, csum); + } + case CHECKSUM_HW: + if (csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, skb->csum)) + { + IP_MASQ_DEBUG(0, "Outgoing failed %s checksum from %d.%d.%d.%d (size=%d)!\n", + masq_proto_name(iph->protocol), + NIPQUAD(iph->saddr), + size); + return -1; + } + default: + /* CHECKSUM_UNNECESSARY */ + } + break; + default: + return -1; + } + /* + * Now hunt the list to see if we have an old entry + */ + + /* h.raw = (char*) iph + iph->ihl * 4; */ + + IP_MASQ_DEBUG(2, "Outgoing %s %08lX:%04X -> %08lX:%04X\n", + masq_proto_name(iph->protocol), + ntohl(iph->saddr), ntohs(h.portp[0]), + ntohl(iph->daddr), ntohs(h.portp[1])); + + ms = ip_masq_out_get_iph(iph); + if (ms!=NULL) { + + /* + * If sysctl !=0 and no pkt has been received yet + * in this tunnel and routing iface address has changed... + * "You are welcome, diald". + */ + if ( sysctl_ip_dynaddr && ms->flags & IP_MASQ_F_NO_REPLY && maddr != ms->maddr) { + + if (sysctl_ip_dynaddr > 1) { + IP_MASQ_INFO( "ip_fw_masquerade(): change masq.addr from %d.%d.%d.%d to %d.%d.%d.%d\n", + NIPQUAD(ms->maddr),NIPQUAD(maddr)); + } + + write_lock(&__ip_masq_lock); + + ip_masq_unhash(ms); + ms->maddr = maddr; + ip_masq_hash(ms); + + write_unlock(&__ip_masq_lock); + } + + /* + * Set sport if not defined yet (e.g. ftp PASV). Because + * masq entries are hashed on sport, unhash with old value + * and hash with new. + */ + + if ( ms->flags & IP_MASQ_F_NO_SPORT && ms->protocol == IPPROTO_TCP ) { + + write_lock(&__ip_masq_lock); + + ip_masq_unhash(ms); + ms->flags &= ~IP_MASQ_F_NO_SPORT; + ms->sport = h.portp[0]; + ip_masq_hash(ms); /* hash on new sport */ + + write_unlock(&__ip_masq_lock); + + IP_MASQ_DEBUG(1, "ip_fw_masquerade(): filled sport=%d\n", + ntohs(ms->sport)); + } + if (ms->flags & IP_MASQ_F_DLOOSE) { + /* + * update dest loose values + */ + ms->dport = h.portp[1]; + ms->daddr = iph->daddr; + } + } else { + /* + * Nope, not found, create a new entry for it + */ + +#ifdef CONFIG_IP_MASQUERADE_MOD + if (!(ms = ip_masq_mod_out_create(skb, iph, maddr))) +#endif + ms = ip_masq_new(iph->protocol, + maddr, 0, + iph->saddr, h.portp[0], + iph->daddr, h.portp[1], + 0); + if (ms == NULL) + return -1; + } + + /* + * Call module's output update hook + */ + +#ifdef CONFIG_IP_MASQUERADE_MOD + ip_masq_mod_out_update(skb, iph, ms); +#endif + + /* + * Change the fragments origin + */ + + size = skb->len - (h.raw - skb->nh.raw); + + /* + * Set iph addr and port from ip_masq obj. + */ + iph->saddr = ms->maddr; + h.portp[0] = ms->mport; + + /* + * Invalidate csum saving if tunnel has masq helper + */ + + if (ms->app) + csum_ok = 0; + + /* + * Attempt ip_masq_app call. + * will fix ip_masq and iph seq stuff + */ + if (ip_masq_app_pkt_out(ms, skb_p, maddr) != 0) + { + /* + * skb has possibly changed, update pointers. 
+ */ + skb = *skb_p; + iph = skb->nh.iph; + h.raw = (char*) iph + iph->ihl *4; + size = skb->len - (h.raw - skb->nh.raw); + /* doff should have not changed */ + } + + /* + * Adjust packet accordingly to protocol + */ + + /* + * Transport's payload partial csum + */ + + if (!csum_ok) { + csum = csum_partial(h.raw + doff, size - doff, 0); + } + skb->csum = csum; + + IP_MASQ_DEBUG(3, "O-pkt: %s size=%d O-datacsum=%d\n", + masq_proto_name(iph->protocol), + size, + csum); + + /* + * Protocol csum + */ + switch (iph->protocol) { + case IPPROTO_TCP: + h.th->check = 0; + h.th->check=csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + csum_partial(h.raw , doff, csum)); + IP_MASQ_DEBUG(3, "O-pkt: %s O-csum=%d (+%d)\n", + masq_proto_name(iph->protocol), + h.th->check, + (char*) & (h.th->check) - (char*) h.raw); + + break; + case IPPROTO_UDP: + h.uh->check = 0; + h.uh->check=csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + csum_partial(h.raw , doff, csum)); + if (h.uh->check == 0) + h.uh->check = 0xFFFF; + IP_MASQ_DEBUG(3, "O-pkt: %s O-csum=%d (+%d)\n", + masq_proto_name(iph->protocol), + h.uh->check, + (char*) &(h.uh->check)- (char*) h.raw); + break; + } + ip_send_check(iph); + + IP_MASQ_DEBUG(2, "O-routed from %08lX:%04X with masq.addr %08lX\n", + ntohl(ms->maddr),ntohs(ms->mport),ntohl(maddr)); + + masq_set_state(ms, 1, iph, h.portp); + ip_masq_put(ms); + + return 0; + } + +/* + * Restore original addresses and ports in the original IP + * datagram if the failing packet has been [de]masqueraded. + * This is ugly in the extreme. We no longer have the original + * packet so we have to reconstruct it from the failing packet + * plus data in the masq tables. The resulting "original data" + * should be good enough to tell the sender which session to + * throttle. Relies on far too much knowledge of masq internals, + * there ought to be a better way - KAO 990303. + * + * Moved here from icmp.c - JJC. + * Already known: type == ICMP_DEST_UNREACH, IPSKB_MASQUERADED + * skb->nh.iph points to original header. + * + * Must try both OUT and IN tables; we could add a flag + * ala IPSKB_MASQUERADED to avoid 2nd tables lookup, but this is VERY + * unlike because routing makes mtu decision before reaching + * ip_fw_masquerade(). 
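+ *
+ *	In short (rewrites performed below):
+ *
+ *		"out" table hit: daddr/dport (internal host) is put back to
+ *				 maddr/mport, as the outside peer saw it;
+ *		"in" table hit:	 saddr/sport (masq address) is put back to
+ *				 the internal host's saddr/sport.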
+ * + */ +int ip_fw_unmasq_icmp(struct sk_buff *skb) { + struct ip_masq *ms; + struct iphdr *iph = skb->nh.iph; + __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); + + /* + * Always called from _bh context: use read_[un]lock() + */ + + /* + * Peek "out" table, this packet has bounced: + * out->in(frag_needed!)->OUT[icmp] + * + * iph->daddr is IN host + * iph->saddr is OUT host + */ + read_lock(&__ip_masq_lock); + ms = __ip_masq_out_get(iph->protocol, + iph->daddr, portp[1], + iph->saddr, portp[0]); + read_unlock(&__ip_masq_lock); + if (ms) { + IP_MASQ_DEBUG(1, "Incoming frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n", + NIPQUAD(iph->daddr), NIPQUAD(ms->maddr)); + iph->daddr = ms->maddr; + portp[1] = ms->mport; + __ip_masq_put(ms); + return 1; + } + /* + * Peek "in" table + * in->out(frag_needed!)->IN[icmp] + * + * iph->daddr is OUT host + * iph->saddr is MASQ host + * + */ + read_lock(&__ip_masq_lock); + ms = __ip_masq_in_get(iph->protocol, + iph->daddr, portp[1], + iph->saddr, portp[0]); + read_unlock(&__ip_masq_lock); + if (ms) { + IP_MASQ_DEBUG(1, "Outgoing frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n", + NIPQUAD(iph->saddr), NIPQUAD(ms->saddr)); + iph->saddr = ms->saddr; + portp[0] = ms->sport; + __ip_masq_put(ms); + return 1; + } + return 0; + +} +/* + * Handle ICMP messages in forward direction. + * Find any that might be relevant, check against existing connections, + * forward to masqueraded host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded + */ + +int ip_fw_masq_icmp(struct sk_buff **skb_p, __u32 maddr) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph = skb->nh.iph; + struct icmphdr *icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2)); + struct iphdr *ciph; /* The ip header contained within the ICMP */ + __u16 *pptr; /* port numbers from TCP/UDP contained header */ + struct ip_masq *ms; + unsigned short len = ntohs(iph->tot_len) - (iph->ihl * 4); + + IP_MASQ_DEBUG(2, "Incoming forward ICMP (%d,%d) %lX -> %lX\n", + icmph->type, ntohs(icmp_id(icmph)), + ntohl(iph->saddr), ntohl(iph->daddr)); + +#ifdef CONFIG_IP_MASQUERADE_ICMP + if ((icmph->type == ICMP_ECHO ) || + (icmph->type == ICMP_TIMESTAMP ) || + (icmph->type == ICMP_INFO_REQUEST ) || + (icmph->type == ICMP_ADDRESS )) { + + IP_MASQ_DEBUG(2, "icmp request rcv %lX->%lX id %d type %d\n", + ntohl(iph->saddr), + ntohl(iph->daddr), + ntohs(icmp_id(icmph)), + icmph->type); + + ms = ip_masq_out_get(iph->protocol, + iph->saddr, + icmp_id(icmph), + iph->daddr, + icmp_hv_req(icmph)); + if (ms == NULL) { + ms = ip_masq_new(iph->protocol, + maddr, 0, + iph->saddr, icmp_id(icmph), + iph->daddr, icmp_hv_req(icmph), + 0); + if (ms == NULL) + return (-1); + IP_MASQ_DEBUG(1, "Created new icmp entry\n"); + } + /* Rewrite source address */ + + /* + * If sysctl !=0 and no pkt has been received yet + * in this tunnel and routing iface address has changed... + * "You are welcome, diald". 
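+		 *
+		 *	(Any non-zero sysctl_ip_dynaddr enables this rewrite;
+		 *	values greater than 1 also log the address change.
+		 *	The value is normally set via the ip_dynaddr sysctl.)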
+ */ + if ( sysctl_ip_dynaddr && ms->flags & IP_MASQ_F_NO_REPLY && maddr != ms->maddr) { + + if (sysctl_ip_dynaddr > 1) { + IP_MASQ_INFO( "ip_fw_masq_icmp(): change masq.addr %d.%d.%d.%d to %d.%d.%d.%d", + NIPQUAD(ms->maddr), NIPQUAD(maddr)); + } + + write_lock(&__ip_masq_lock); + + ip_masq_unhash(ms); + ms->maddr = maddr; + ip_masq_hash(ms); + + write_unlock(&__ip_masq_lock); + } + + iph->saddr = ms->maddr; + ip_send_check(iph); + /* Rewrite port (id) */ + (icmph->un).echo.id = ms->mport; + icmph->checksum = 0; + icmph->checksum = ip_compute_csum((unsigned char *)icmph, len); + + IP_MASQ_DEBUG(2, "icmp request rwt %lX->%lX id %d type %d\n", + ntohl(iph->saddr), + ntohl(iph->daddr), + ntohs(icmp_id(icmph)), + icmph->type); + + masq_set_state(ms, 1, iph, icmph); + ip_masq_put(ms); + + return 1; + } +#endif + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that + * means easy things are checked first to speed up + * processing.... however this means that some + * packets will manage to get a long way down this + * stack and then be rejected, but thats life + */ + if ((icmph->type != ICMP_DEST_UNREACH) && + (icmph->type != ICMP_SOURCE_QUENCH) && + (icmph->type != ICMP_TIME_EXCEEDED)) + return 0; + + /* Now find the contained IP header */ + ciph = (struct iphdr *) (icmph + 1); + +#ifdef CONFIG_IP_MASQUERADE_ICMP + if (ciph->protocol == IPPROTO_ICMP) { + /* + * This section handles ICMP errors for ICMP packets + */ + struct icmphdr *cicmph = (struct icmphdr *)((char *)ciph + + (ciph->ihl<<2)); + + + IP_MASQ_DEBUG(2, "fw icmp/icmp rcv %lX->%lX id %d type %d\n", + ntohl(ciph->saddr), + ntohl(ciph->daddr), + ntohs(icmp_id(cicmph)), + cicmph->type); + + read_lock(&__ip_masq_lock); + ms = __ip_masq_out_get(ciph->protocol, + ciph->daddr, + icmp_id(cicmph), + ciph->saddr, + icmp_hv_rep(cicmph)); + read_unlock(&__ip_masq_lock); + + if (ms == NULL) + return 0; + + /* Now we do real damage to this packet...! */ + /* First change the source IP address, and recalc checksum */ + iph->saddr = ms->maddr; + ip_send_check(iph); + + /* Now change the *dest* address in the contained IP */ + ciph->daddr = ms->maddr; + __ip_masq_put(ms); + + ip_send_check(ciph); + + /* Change the ID to the masqed one! */ + (cicmph->un).echo.id = ms->mport; + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_compute_csum((unsigned char *) icmph, len); + + + IP_MASQ_DEBUG(2, "fw icmp/icmp rwt %lX->%lX id %d type %d\n", + ntohl(ciph->saddr), + ntohl(ciph->daddr), + ntohs(icmp_id(cicmph)), + cicmph->type); + + return 1; + } +#endif /* CONFIG_IP_MASQUERADE_ICMP */ + + /* We are only interested ICMPs generated from TCP or UDP packets */ + if ((ciph->protocol != IPPROTO_UDP) && (ciph->protocol != IPPROTO_TCP)) + return 0; + + /* + * Find the ports involved - this packet was + * incoming so the ports are right way round + * (but reversed relative to outer IP header!) + */ + pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]); +#if 0 + if (ntohs(pptr[1]) < PORT_MASQ_BEGIN || + ntohs(pptr[1]) > PORT_MASQ_END) + return 0; +#endif + + /* Ensure the checksum is correct */ + if (ip_compute_csum((unsigned char *) icmph, len)) + { + /* Failed checksum! 
*/ + IP_MASQ_DEBUG(0, "forward ICMP: failed checksum from %d.%d.%d.%d!\n", + NIPQUAD(iph->saddr)); + return(-1); + } + + + IP_MASQ_DEBUG(2, "Handling forward ICMP for %08lX:%04X -> %08lX:%04X\n", + ntohl(ciph->saddr), ntohs(pptr[0]), + ntohl(ciph->daddr), ntohs(pptr[1])); + + +#if 0 + /* This is pretty much what __ip_masq_in_get_iph() does */ + ms = __ip_masq_in_get(ciph->protocol, ciph->saddr, pptr[0], ciph->daddr, pptr[1]); +#endif + read_lock(&__ip_masq_lock); + ms = __ip_masq_out_get(ciph->protocol, + ciph->daddr, + pptr[1], + ciph->saddr, + pptr[0]); + read_unlock(&__ip_masq_lock); + + if (ms == NULL) + return 0; + + /* Now we do real damage to this packet...! */ + /* First change the source IP address, and recalc checksum */ + iph->saddr = ms->maddr; + ip_send_check(iph); + + /* Now change the *dest* address in the contained IP */ + ciph->daddr = ms->maddr; + ip_send_check(ciph); + + /* the TCP/UDP dest port - cannot redo check */ + pptr[1] = ms->mport; + __ip_masq_put(ms); + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_compute_csum((unsigned char *) icmph, len); + + + IP_MASQ_DEBUG(2, "Rewrote forward ICMP to %08lX:%04X -> %08lX:%04X\n", + ntohl(ciph->saddr), ntohs(pptr[0]), + ntohl(ciph->daddr), ntohs(pptr[1])); + + + return 1; +} + + +/* + * Own skb_cow() beast, tweaked for rewriting commonly + * used pointers in masq code + */ +static struct sk_buff * masq_skb_cow(struct sk_buff **skb_p, + struct iphdr **iph_p, unsigned char **t_p) { + struct sk_buff *skb=(*skb_p); + if (skb_cloned(skb)) { + skb = skb_copy(skb, GFP_ATOMIC); + if (skb) { + /* + * skb changed, update other pointers + */ + struct iphdr *iph = skb->nh.iph; + kfree_skb(*skb_p); + *skb_p = skb; + *iph_p = iph; + *t_p = (char*) iph + iph->ihl * 4; + } + } + return skb; +} + +/* + * Handle ICMP messages in reverse (demasquerade) direction. + * Find any that might be relevant, check against existing connections, + * forward to masqueraded host if relevant. 
+ * Currently handles error types - unreachable, quench, ttl exceeded + */ + +int ip_fw_demasq_icmp(struct sk_buff **skb_p) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph = skb->nh.iph; + struct icmphdr *icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2)); + struct iphdr *ciph; /* The ip header contained within the ICMP */ + __u16 *pptr; /* port numbers from TCP/UDP contained header */ + struct ip_masq *ms; + unsigned short len = ntohs(iph->tot_len) - (iph->ihl * 4); + + + IP_MASQ_DEBUG(2, "icmp in/rev (%d,%d) %lX -> %lX\n", + icmph->type, ntohs(icmp_id(icmph)), + ntohl(iph->saddr), ntohl(iph->daddr)); + + +#ifdef CONFIG_IP_MASQUERADE_ICMP + if ((icmph->type == ICMP_ECHOREPLY) || + (icmph->type == ICMP_TIMESTAMPREPLY) || + (icmph->type == ICMP_INFO_REPLY) || + (icmph->type == ICMP_ADDRESSREPLY)) { + + IP_MASQ_DEBUG(2, "icmp reply rcv %lX->%lX id %d type %d, req %d\n", + ntohl(iph->saddr), + ntohl(iph->daddr), + ntohs(icmp_id(icmph)), + icmph->type, + icmp_type_request(icmph->type)); + + ms = ip_masq_in_get(iph->protocol, + iph->saddr, + icmp_hv_rep(icmph), + iph->daddr, + icmp_id(icmph)); + if (ms == NULL) + return 0; + + /* + * got reply, so clear flag + */ + ms->flags &= ~IP_MASQ_F_NO_REPLY; + + if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) { + ip_masq_put(ms); + return -1; + } + + /* Reset source address */ + iph->daddr = ms->saddr; + /* Redo IP header checksum */ + ip_send_check(iph); + /* Set ID to fake port number */ + (icmph->un).echo.id = ms->sport; + /* Reset ICMP checksum and set expiry */ + icmph->checksum=0; + icmph->checksum=ip_compute_csum((unsigned char *)icmph,len); + + + + IP_MASQ_DEBUG(2, "icmp reply rwt %lX->%lX id %d type %d\n", + ntohl(iph->saddr), + ntohl(iph->daddr), + ntohs(icmp_id(icmph)), + icmph->type); + + masq_set_state(ms, 0, iph, icmph); + ip_masq_put(ms); + + return 1; + } else { +#endif + if ((icmph->type != ICMP_DEST_UNREACH) && + (icmph->type != ICMP_SOURCE_QUENCH) && + (icmph->type != ICMP_TIME_EXCEEDED)) + return 0; +#ifdef CONFIG_IP_MASQUERADE_ICMP + } +#endif + /* + * If we get here we have an ICMP error of one of the above 3 types + * Now find the contained IP header + */ + + ciph = (struct iphdr *) (icmph + 1); + +#ifdef CONFIG_IP_MASQUERADE_ICMP + if (ciph->protocol == IPPROTO_ICMP) { + /* + * This section handles ICMP errors for ICMP packets + * + * First get a new ICMP header structure out of the IP packet + */ + struct icmphdr *cicmph = (struct icmphdr *)((char *)ciph + + (ciph->ihl<<2)); + + + IP_MASQ_DEBUG(2, "rv icmp/icmp rcv %lX->%lX id %d type %d\n", + ntohl(ciph->saddr), + ntohl(ciph->daddr), + ntohs(icmp_id(cicmph)), + cicmph->type); + + read_lock(&__ip_masq_lock); + ms = __ip_masq_in_get(ciph->protocol, + ciph->daddr, + icmp_hv_req(cicmph), + ciph->saddr, + icmp_id(cicmph)); + read_unlock(&__ip_masq_lock); + + if (ms == NULL) + return 0; + + if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) { + __ip_masq_put(ms); + return -1; + } + ciph = (struct iphdr *) (icmph + 1); + cicmph = (struct icmphdr *)((char *)ciph + + (ciph->ihl<<2)); + /* Now we do real damage to this packet...! */ + /* First change the dest IP address, and recalc checksum */ + iph->daddr = ms->saddr; + ip_send_check(iph); + + /* Now change the *source* address in the contained IP */ + ciph->saddr = ms->saddr; + ip_send_check(ciph); + + /* Change the ID to the original one! 
*/ + (cicmph->un).echo.id = ms->sport; + __ip_masq_put(ms); + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_compute_csum((unsigned char *) icmph, len); + + + IP_MASQ_DEBUG(2, "rv icmp/icmp rwt %lX->%lX id %d type %d\n", + ntohl(ciph->saddr), + ntohl(ciph->daddr), + ntohs(icmp_id(cicmph)), + cicmph->type); + + return 1; + } +#endif /* CONFIG_IP_MASQUERADE_ICMP */ + + /* We are only interested ICMPs generated from TCP or UDP packets */ + if ((ciph->protocol != IPPROTO_UDP) && + (ciph->protocol != IPPROTO_TCP)) + return 0; + + /* + * Find the ports involved - remember this packet was + * *outgoing* so the ports are reversed (and addresses) + */ + pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]); + if (ntohs(pptr[0]) < PORT_MASQ_BEGIN || + ntohs(pptr[0]) > PORT_MASQ_END) + return 0; + + /* Ensure the checksum is correct */ + if (ip_compute_csum((unsigned char *) icmph, len)) + { + /* Failed checksum! */ + IP_MASQ_ERR( "reverse ICMP: failed checksum from %d.%d.%d.%d!\n", + NIPQUAD(iph->saddr)); + return(-1); + } + + + IP_MASQ_DEBUG(2, "Handling reverse ICMP for %08lX:%04X -> %08lX:%04X\n", + ntohl(ciph->saddr), ntohs(pptr[0]), + ntohl(ciph->daddr), ntohs(pptr[1])); + + + /* This is pretty much what __ip_masq_in_get_iph() does, except params are wrong way round */ + read_lock(&__ip_masq_lock); + ms = __ip_masq_in_get(ciph->protocol, + ciph->daddr, + pptr[1], + ciph->saddr, + pptr[0]); + read_unlock(&__ip_masq_lock); + + if (ms == NULL) + return 0; + + if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) { + __ip_masq_put(ms); + return -1; + } + ciph = (struct iphdr *) (icmph + 1); + pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]); + + /* Now we do real damage to this packet...! */ + /* First change the dest IP address, and recalc checksum */ + iph->daddr = ms->saddr; + ip_send_check(iph); + + /* Now change the *source* address in the contained IP */ + ciph->saddr = ms->saddr; + ip_send_check(ciph); + + /* the TCP/UDP source port - cannot redo check */ + pptr[0] = ms->sport; + __ip_masq_put(ms); + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_compute_csum((unsigned char *) icmph, len); + + + IP_MASQ_DEBUG(2, "Rewrote reverse ICMP to %08lX:%04X -> %08lX:%04X\n", + ntohl(ciph->saddr), ntohs(pptr[0]), + ntohl(ciph->daddr), ntohs(pptr[1])); + + + return 1; +} + + /* + * Check if it's an masqueraded port, look it up, + * and send it on its way... + * + * Better not have many hosts using the designated portrange + * as 'normal' ports, or you'll be spending many time in + * this function. + */ + +int ip_fw_demasquerade(struct sk_buff **skb_p) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph = skb->nh.iph; + union ip_masq_tphdr h; + struct ip_masq *ms; + unsigned short size; + int doff = 0; + int csum = 0; + int csum_ok = 0; + __u32 maddr; + + /* + * Big tappo: only PACKET_HOST (nor loopback neither mcasts) + * ... don't know why 1st test DOES NOT include 2nd (?) + */ + + if (skb->pkt_type != PACKET_HOST || skb->dev == &loopback_dev) { + IP_MASQ_DEBUG(2, "ip_fw_demasquerade(): packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", + skb->pkt_type, + iph->protocol, + NIPQUAD(iph->daddr)); + return 0; + } + + h.raw = (char*) iph + iph->ihl * 4; + + /* + * IP payload size + */ + size = ntohs(iph->tot_len) - (iph->ihl * 4); + + doff = proto_doff(iph->protocol, h.raw, size); + + switch (doff) { + case 0: + /* + * Input path: other IP protos Ok, will + * reach local sockets path. 
+ */ + return 0; + case -1: + IP_MASQ_DEBUG(0, "I-pkt invalid packet data size\n"); + return -1; + } + + maddr = iph->daddr; + switch (iph->protocol) { + case IPPROTO_ICMP: + return(ip_fw_demasq_icmp(skb_p)); + case IPPROTO_TCP: + case IPPROTO_UDP: + /* + * Make sure packet is in the masq range + * ... or some mod-ule relaxes input range + * ... or there is still some `special' mport opened + */ + if ((ntohs(h.portp[1]) < PORT_MASQ_BEGIN + || ntohs(h.portp[1]) > PORT_MASQ_END) +#ifdef CONFIG_IP_MASQUERADE_MOD + && (ip_masq_mod_in_rule(skb, iph) != 1) +#endif + && atomic_read(&mport_count) == 0 ) + return 0; + + /* Check that the checksum is OK */ + if ((iph->protocol == IPPROTO_UDP) && (h.uh->check == 0)) + /* No UDP checksum */ + break; +#ifdef CONFIG_IP_MASQ_DEBUG + if (ip_masq_get_debug_level() > 3) { + skb->ip_summed = CHECKSUM_NONE; + } +#endif + + switch (skb->ip_summed) + { + case CHECKSUM_NONE: + csum = csum_partial(h.raw + doff, size - doff, 0); + csum_ok++; + skb->csum = csum_partial(h.raw , doff, csum); + + case CHECKSUM_HW: + if (csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, skb->csum)) + { + IP_MASQ_DEBUG(0, "Incoming failed %s checksum from %d.%d.%d.%d (size=%d)!\n", + masq_proto_name(iph->protocol), + NIPQUAD(iph->saddr), + size); + return -1; + } + default: + /* CHECKSUM_UNNECESSARY */ + } + break; + default: + return 0; + } + + + + IP_MASQ_DEBUG(2, "Incoming %s %08lX:%04X -> %08lX:%04X\n", + masq_proto_name(iph->protocol), + ntohl(iph->saddr), ntohs(h.portp[0]), + ntohl(iph->daddr), ntohs(h.portp[1])); + + /* + * reroute to original host:port if found... + */ + + ms = ip_masq_in_get_iph(iph); + + /* + * Give additional modules a chance to create an entry + */ +#ifdef CONFIG_IP_MASQUERADE_MOD + if (!ms) + ms = ip_masq_mod_in_create(skb, iph, maddr); + + /* + * Call module's input update hook + */ + ip_masq_mod_in_update(skb, iph, ms); +#endif + + + if (ms != NULL) + { + + /* + * got reply, so clear flag + */ + ms->flags &= ~IP_MASQ_F_NO_REPLY; + + /* + * Set daddr,dport if not defined yet + * and tunnel is not setup as "dest loose" + */ + + if (ms->flags & IP_MASQ_F_DLOOSE) { + /* + * update dest loose values + */ + ms->dport = h.portp[0]; + ms->daddr = iph->saddr; + } else { + if ( ms->flags & IP_MASQ_F_NO_DPORT ) { /* && ms->protocol == IPPROTO_TCP ) { */ + + write_lock(&__ip_masq_lock); + + ip_masq_unhash(ms); + ms->flags &= ~IP_MASQ_F_NO_DPORT; + ms->dport = h.portp[0]; + ip_masq_hash(ms); /* hash on new dport */ + + write_unlock(&__ip_masq_lock); + + IP_MASQ_DEBUG(1, "ip_fw_demasquerade(): filled dport=%d\n", + ntohs(ms->dport)); + + } + if (ms->flags & IP_MASQ_F_NO_DADDR ) { /* && ms->protocol == IPPROTO_TCP) { */ + + write_lock(&__ip_masq_lock); + + ip_masq_unhash(ms); + ms->flags &= ~IP_MASQ_F_NO_DADDR; + ms->daddr = iph->saddr; + ip_masq_hash(ms); /* hash on new daddr */ + + write_unlock(&__ip_masq_lock); + + IP_MASQ_DEBUG(1, "ip_fw_demasquerade(): filled daddr=%lX\n", + ntohl(ms->daddr)); + + } + } + if ((skb=masq_skb_cow(skb_p, &iph, &h.raw)) == NULL) { + ip_masq_put(ms); + return -1; + } + iph->daddr = ms->saddr; + h.portp[1] = ms->sport; + + /* + * Invalidate csum saving if tunnel has masq helper + */ + + if (ms->app) + csum_ok = 0; + + /* + * Attempt ip_masq_app call. + * will fix ip_masq and iph ack_seq stuff + */ + + if (ip_masq_app_pkt_in(ms, skb_p, maddr) != 0) + { + /* + * skb has changed, update pointers. 
+ */ + + skb = *skb_p; + iph = skb->nh.iph; + h.raw = (char*) iph + iph->ihl*4; + size = ntohs(iph->tot_len) - (iph->ihl * 4); + } + + /* + * Yug! adjust UDP/TCP checksums + */ + + /* + * Transport's payload partial csum + */ + + if (!csum_ok) { + csum = csum_partial(h.raw + doff, size - doff, 0); + } + skb->csum = csum; + + /* + * Protocol csum + */ + switch (iph->protocol) { + case IPPROTO_TCP: + h.th->check = 0; + h.th->check=csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + csum_partial(h.raw , doff, csum)); + break; + case IPPROTO_UDP: + h.uh->check = 0; + h.uh->check=csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + csum_partial(h.raw , doff, csum)); + if (h.uh->check == 0) + h.uh->check = 0xFFFF; + break; + } + ip_send_check(iph); + + IP_MASQ_DEBUG(2, "I-routed to %08lX:%04X\n",ntohl(iph->daddr),ntohs(h.portp[1])); + + masq_set_state (ms, 0, iph, h.portp); + ip_masq_put(ms); + + return 1; + } + + /* sorry, all this trouble for a no-hit :) */ + return 0; +} + + +void ip_masq_control_add(struct ip_masq *ms, struct ip_masq* ctl_ms) +{ + if (ms->control) { + IP_MASQ_ERR( "request control ADD for already controlled: %d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n", + NIPQUAD(ms->saddr),ntohs(ms->sport), + NIPQUAD(ms->daddr),ntohs(ms->dport)); + ip_masq_control_del(ms); + } + IP_MASQ_DEBUG(1, "ADDing control for: ms.dst=%d.%d.%d.%d:%d ctl_ms.dst=%d.%d.%d.%d:%d\n", + NIPQUAD(ms->daddr),ntohs(ms->dport), + NIPQUAD(ctl_ms->daddr),ntohs(ctl_ms->dport)); + ms->control = ctl_ms; + atomic_inc(&ctl_ms->n_control); +} + +void ip_masq_control_del(struct ip_masq *ms) +{ + struct ip_masq *ctl_ms = ms->control; + if (!ctl_ms) { + IP_MASQ_ERR( "request control DEL for uncontrolled: %d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n", + NIPQUAD(ms->saddr),ntohs(ms->sport), + NIPQUAD(ms->daddr),ntohs(ms->dport)); + return; + } + IP_MASQ_DEBUG(1, "DELeting control for: ms.dst=%d.%d.%d.%d:%d ctl_ms.dst=%d.%d.%d.%d:%d\n", + NIPQUAD(ms->daddr),ntohs(ms->dport), + NIPQUAD(ctl_ms->daddr),ntohs(ctl_ms->dport)); + ms->control = NULL; + if (atomic_read(&ctl_ms->n_control) == 0) { + IP_MASQ_ERR( "BUG control DEL with n=0 : %d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n", + NIPQUAD(ms->saddr),ntohs(ms->sport), + NIPQUAD(ms->daddr),ntohs(ms->dport)); + return; + + } + atomic_dec(&ctl_ms->n_control); +} + +struct ip_masq * ip_masq_control_get(struct ip_masq *ms) +{ + return ms->control; +} + + +#ifdef CONFIG_PROC_FS +/* + * /proc/net entries + * From userspace + */ +static int ip_msqhst_procinfo(char *buffer, char **start, off_t offset, + int length, int unused) +{ + off_t pos=0, begin; + struct ip_masq *ms; + char temp[129]; + int idx = 0; + int len=0; + struct list_head *l,*e; + + if (offset < 128) + { + sprintf(temp, + "Prc FromIP FPrt ToIP TPrt Masq Init-seq Delta PDelta Expires (free=%d,%d,%d)", + atomic_read(ip_masq_free_ports), + atomic_read(ip_masq_free_ports+1), + atomic_read(ip_masq_free_ports+2)); + len = sprintf(buffer, "%-127s\n", temp); + } + pos = 128; + + for(idx = 0; idx < IP_MASQ_TAB_SIZE; idx++) + { + /* + * Lock is actually only need in next loop + * we are called from uspace: must stop bh. 
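+	 *
+	 *	(read_lock_bh is what "stops bh" here: masq_expire() runs from
+	 *	the timer bottom half and takes this lock for writing, so a
+	 *	plain read_lock from process context could deadlock.)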
+ */ + read_lock_bh(&__ip_masq_lock); + + l = &ip_masq_m_table[idx]; + for (e=l->next; e!=l; e=e->next) { + ms = list_entry(e, struct ip_masq, m_list); + pos += 128; + if (pos <= offset) { + len = 0; + continue; + } + + /* + * We have locked the tables, no need to del/add timers + * nor cli() 8) + */ + + sprintf(temp,"%s %08lX:%04X %08lX:%04X %04X %08X %6d %6d %7lu", + masq_proto_name(ms->protocol), + ntohl(ms->saddr), ntohs(ms->sport), + ntohl(ms->daddr), ntohs(ms->dport), + ntohs(ms->mport), + ms->out_seq.init_seq, + ms->out_seq.delta, + ms->out_seq.previous_delta, + ms->timer.expires-jiffies); + len += sprintf(buffer+len, "%-127s\n", temp); + + if(len >= length) { + + read_unlock_bh(&__ip_masq_lock); + goto done; + } + } + read_unlock_bh(&__ip_masq_lock); + + } +done: + + + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; + if(len>length) + len = length; + return len; +} + +#endif + +/* + * Timeouts handling by ipfwadm/ipchains + * From ip_fw.c + */ + +int ip_fw_masq_timeouts(void *m, int len) +{ + struct ip_fw_masq *masq; + int ret = EINVAL; + + if (len != sizeof(struct ip_fw_masq)) { + IP_MASQ_DEBUG(1, "ip_fw_masq_timeouts: length %d, expected %d\n", + len, sizeof(struct ip_fw_masq)); + } else { + masq = (struct ip_fw_masq *)m; + if (masq->tcp_timeout) + masq_timeout_table.timeout[IP_MASQ_S_ESTABLISHED] + = masq->tcp_timeout; + + if (masq->tcp_fin_timeout) + masq_timeout_table.timeout[IP_MASQ_S_FIN_WAIT] + = masq->tcp_fin_timeout; + + if (masq->udp_timeout) + masq_timeout_table.timeout[IP_MASQ_S_UDP] + = masq->udp_timeout; + ret = 0; + } + return ret; +} +/* + * Module autoloading stuff + */ + +static int ip_masq_user_check_hook(void) { +#ifdef CONFIG_KMOD + if (ip_masq_user_hook == NULL) { + IP_MASQ_DEBUG(1, "About to request \"ip_masq_user\" module\n"); + request_module("ip_masq_user"); + } +#endif /* CONFIG_KMOD */ + return (ip_masq_user_hook != NULL); +} + +/* + * user module hook- info + */ +static int ip_masq_user_info(char *buffer, char **start, off_t offset, + int len, int *eof, void *data) +{ + int ret = -ENOPKG; + if (ip_masq_user_check_hook()) { + ret = ip_masq_user_hook->info(buffer, start, offset, len, (int) data); + } + return ret; +} + +/* + * user module hook- entry mgmt + */ +static int ip_masq_user_ctl(int optname, void *arg, int arglen) +{ + int ret = -ENOPKG; + if (ip_masq_user_check_hook()) { + ret = ip_masq_user_hook->ctl(optname, arg, arglen); + } + return ret; +} + +/* + * Control from ip_sockglue + * MAIN ENTRY point from userspace (apart from /proc *info entries) + * Returns errno + */ +int ip_masq_uctl(int optname, char * optval , int optlen) +{ + struct ip_masq_ctl masq_ctl; + int ret = -EINVAL; + + if(optlen>sizeof(masq_ctl)) + return -EINVAL; + + if(copy_from_user(&masq_ctl,optval,optlen)) + return -EFAULT; + + IP_MASQ_DEBUG(1,"ip_masq_ctl(optname=%d, optlen=%d, target=%d, cmd=%d)\n", + optname, optlen, masq_ctl.m_target, masq_ctl.m_cmd); + + switch (masq_ctl.m_target) { + case IP_MASQ_TARGET_USER: + ret = ip_masq_user_ctl(optname, &masq_ctl, optlen); + break; +#ifdef CONFIG_IP_MASQUERADE_MOD + case IP_MASQ_TARGET_MOD: + ret = ip_masq_mod_ctl(optname, &masq_ctl, optlen); + break; +#endif + } + + /* + * If ret>0, copy to user space + */ + + if (ret > 0 && ret <= sizeof (masq_ctl)) { + if (copy_to_user(optval, &masq_ctl, ret) ) + return -EFAULT; + ret = 0; + } + + return ret; +} + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry *proc_net_ip_masq = NULL; + +#ifdef MODULE +static void ip_masq_proc_count(struct inode *inode, int 
fill) +{ + if (fill) + MOD_INC_USE_COUNT; + else + MOD_DEC_USE_COUNT; +} +#endif + +int ip_masq_proc_register(struct proc_dir_entry *ent) +{ + if (!proc_net_ip_masq) return -1; + IP_MASQ_DEBUG(1, "registering \"/proc/net/ip_masq/%s\" entry\n", + ent->name); + return proc_register(proc_net_ip_masq, ent); +} +void ip_masq_proc_unregister(struct proc_dir_entry *ent) +{ + if (!proc_net_ip_masq) return; + IP_MASQ_DEBUG(1, "unregistering \"/proc/net/ip_masq/%s\" entry\n", + ent->name); + proc_unregister(proc_net_ip_masq, ent->low_ino); +} + + +__initfunc(static void masq_proc_init(void)) +{ + IP_MASQ_DEBUG(1,"registering /proc/net/ip_masq\n"); + if (!proc_net_ip_masq) { + struct proc_dir_entry *ent; + ent = create_proc_entry("net/ip_masq", S_IFDIR, 0); + if (ent) { +#ifdef MODULE + ent->fill_inode = ip_masq_proc_count; +#endif + proc_net_ip_masq = ent; + } else { + IP_MASQ_ERR("Could not create \"/proc/net/ip_masq\" entry\n"); + } + } +} +#endif /* CONFIG_PROC_FS */ +/* + * Wrapper over inet_select_addr() + */ +u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope) +{ + return inet_select_addr(dev, dst, scope); +} + +/* + * Initialize ip masquerading + */ +__initfunc(int ip_masq_init(void)) +{ + int idx; + for(idx = 0; idx < IP_MASQ_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_masq_s_table[idx]); + INIT_LIST_HEAD(&ip_masq_m_table[idx]); + INIT_LIST_HEAD(&ip_masq_d_table[idx]); + } +#ifdef CONFIG_PROC_FS + proc_net_register(&(struct proc_dir_entry) { + PROC_NET_IPMSQHST, 13, "ip_masquerade", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + ip_msqhst_procinfo + }); + masq_proc_init(); + + ip_masq_proc_register(&(struct proc_dir_entry) { + 0, 3, "tcp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + NULL, /* get_info */ + NULL, /* fill_inode */ + NULL, NULL, NULL, + (char *) IPPROTO_TCP, + ip_masq_user_info + }); + ip_masq_proc_register(&(struct proc_dir_entry) { + 0, 3, "udp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + NULL, /* get_info */ + NULL, /* fill_inode */ + NULL, NULL, NULL, + (char *) IPPROTO_UDP, + ip_masq_user_info + }); + ip_masq_proc_register(&(struct proc_dir_entry) { + 0, 4, "icmp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + NULL, /* get_info */ + NULL, /* fill_inode */ + NULL, NULL, NULL, + (char *) IPPROTO_ICMP, + ip_masq_user_info + }); +#endif +#ifdef CONFIG_IP_MASQUERADE_IPAUTOFW + ip_autofw_init(); +#endif +#ifdef CONFIG_IP_MASQUERADE_IPPORTFW + ip_portfw_init(); +#endif +#ifdef CONFIG_IP_MASQUERADE_MFW + ip_mfw_init(); +#endif + ip_masq_app_init(); + + return 0; +} diff --git a/pfinet/linux-src/net/ipv4/ip_masq_app.c b/pfinet/linux-src/net/ipv4/ip_masq_app.c new file mode 100644 index 00000000..84e059fa --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_masq_app.c @@ -0,0 +1,603 @@ +/* + * IP_MASQ_APP application masquerading module + * + * + * $Id: ip_masq_app.c,v 1.16 1998/08/29 23:51:14 davem Exp $ + * + * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * JJC : Implemented also input pkt hook + * Miquel van Smoorenburg : Copy more stuff when resizing skb + * + * + * FIXME: + * - ip_masq_skb_replace(): use same skb if space available. 
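+ *	  (one possible, untested approach: when n_len <= o_len the
+ *	  new bytes could be copied over the old ones, the packet tail
+ *	  moved down with memmove() and the skb shortened with
+ *	  skb_trim(), avoiding the alloc_skb()/copy path used now)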
+ * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <asm/system.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> +#include <net/ip_masq.h> + +#define IP_MASQ_APP_TAB_SIZE 16 /* must be power of 2 */ + +#define IP_MASQ_APP_HASH(proto, port) ((port^proto) & (IP_MASQ_APP_TAB_SIZE-1)) +#define IP_MASQ_APP_TYPE(proto, port) ( proto<<16 | port ) +#define IP_MASQ_APP_PORT(type) ( type & 0xffff ) +#define IP_MASQ_APP_PROTO(type) ( (type>>16) & 0x00ff ) + + +EXPORT_SYMBOL(register_ip_masq_app); +EXPORT_SYMBOL(unregister_ip_masq_app); +EXPORT_SYMBOL(ip_masq_skb_replace); + +/* + * will hold masq app. hashed list heads + */ + +struct ip_masq_app *ip_masq_app_base[IP_MASQ_APP_TAB_SIZE]; + +/* + * ip_masq_app registration routine + * port: host byte order. + */ + +int register_ip_masq_app(struct ip_masq_app *mapp, unsigned short proto, __u16 port) +{ + unsigned long flags; + unsigned hash; + if (!mapp) { + IP_MASQ_ERR("register_ip_masq_app(): NULL arg\n"); + return -EINVAL; + } + mapp->type = IP_MASQ_APP_TYPE(proto, port); + mapp->n_attach = 0; + hash = IP_MASQ_APP_HASH(proto, port); + + save_flags(flags); + cli(); + mapp->next = ip_masq_app_base[hash]; + ip_masq_app_base[hash] = mapp; + restore_flags(flags); + + return 0; +} + +/* + * ip_masq_app unreg. routine. + */ + +int unregister_ip_masq_app(struct ip_masq_app *mapp) +{ + struct ip_masq_app **mapp_p; + unsigned hash; + unsigned long flags; + if (!mapp) { + IP_MASQ_ERR("unregister_ip_masq_app(): NULL arg\n"); + return -EINVAL; + } + /* + * only allow unregistration if it has no attachments + */ + if (mapp->n_attach) { + IP_MASQ_ERR("unregister_ip_masq_app(): has %d attachments. failed\n", + mapp->n_attach); + return -EINVAL; + } + hash = IP_MASQ_APP_HASH(IP_MASQ_APP_PROTO(mapp->type), IP_MASQ_APP_PORT(mapp->type)); + + save_flags(flags); + cli(); + for (mapp_p = &ip_masq_app_base[hash]; *mapp_p ; mapp_p = &(*mapp_p)->next) + if (mapp == (*mapp_p)) { + *mapp_p = mapp->next; + restore_flags(flags); + return 0; + } + + restore_flags(flags); + IP_MASQ_ERR("unregister_ip_masq_app(proto=%s,port=%u): not hashed!\n", + masq_proto_name(IP_MASQ_APP_PROTO(mapp->type)), IP_MASQ_APP_PORT(mapp->type)); + return -EINVAL; +} + +/* + * get ip_masq_app object by its proto and port (net byte order). + */ + +struct ip_masq_app * ip_masq_app_get(unsigned short proto, __u16 port) +{ + struct ip_masq_app *mapp; + unsigned hash; + unsigned type; + + port = ntohs(port); + type = IP_MASQ_APP_TYPE(proto,port); + hash = IP_MASQ_APP_HASH(proto,port); + for(mapp = ip_masq_app_base[hash]; mapp ; mapp = mapp->next) { + if (type == mapp->type) return mapp; + } + return NULL; +} + +/* + * ip_masq_app object binding related funcs. 
+ */ + +/* + * change ip_masq_app object's number of bindings + */ + +static __inline__ int ip_masq_app_bind_chg(struct ip_masq_app *mapp, int delta) +{ + unsigned long flags; + int n_at; + if (!mapp) return -1; + save_flags(flags); + cli(); + n_at = mapp->n_attach + delta; + if (n_at < 0) { + restore_flags(flags); + IP_MASQ_ERR("ip_masq_app: tried to set n_attach < 0 for (proto=%s,port==%d) ip_masq_app object.\n", + masq_proto_name(IP_MASQ_APP_PROTO(mapp->type)), + IP_MASQ_APP_PORT(mapp->type)); + return -1; + } + mapp->n_attach = n_at; + restore_flags(flags); + return 0; +} + +/* + * Bind ip_masq to its ip_masq_app based on proto and dport ALREADY + * set in ip_masq struct. Also calls constructor. + */ + +struct ip_masq_app * ip_masq_bind_app(struct ip_masq *ms) +{ + struct ip_masq_app * mapp; + + if (ms->protocol != IPPROTO_TCP && ms->protocol != IPPROTO_UDP) + return NULL; + + mapp = ip_masq_app_get(ms->protocol, ms->dport); + +#if 0000 +/* #ifdef CONFIG_IP_MASQUERADE_IPAUTOFW */ + if (mapp == NULL) + mapp = ip_masq_app_get(ms->protocol, ms->sport); +/* #endif */ +#endif + + if (mapp != NULL) { + /* + * don't allow binding if already bound + */ + + if (ms->app != NULL) { + IP_MASQ_ERR("ip_masq_bind_app() called for already bound object.\n"); + return ms->app; + } + + ms->app = mapp; + if (mapp->masq_init_1) mapp->masq_init_1(mapp, ms); + ip_masq_app_bind_chg(mapp, +1); + } + return mapp; +} + +/* + * Unbind ms from type object and call ms destructor (does not kfree()). + */ + +int ip_masq_unbind_app(struct ip_masq *ms) +{ + struct ip_masq_app * mapp; + mapp = ms->app; + + if (ms->protocol != IPPROTO_TCP && ms->protocol != IPPROTO_UDP) + return 0; + + if (mapp != NULL) { + if (mapp->masq_done_1) mapp->masq_done_1(mapp, ms); + ms->app = NULL; + ip_masq_app_bind_chg(mapp, -1); + } + return (mapp != NULL); +} + +/* + * Fixes th->seq based on ip_masq_seq info. + */ + +static __inline__ void masq_fix_seq(const struct ip_masq_seq *ms_seq, struct tcphdr *th) +{ + __u32 seq; + + seq = ntohl(th->seq); + + /* + * Adjust seq with delta-offset for all packets after + * the most recent resized pkt seq and with previous_delta offset + * for all packets before most recent resized pkt seq. + */ + + if (ms_seq->delta || ms_seq->previous_delta) { + if(after(seq,ms_seq->init_seq) ) { + th->seq = htonl(seq + ms_seq->delta); + IP_MASQ_DEBUG(1, "masq_fix_seq() : added delta (%d) to seq\n",ms_seq->delta); + } else { + th->seq = htonl(seq + ms_seq->previous_delta); + IP_MASQ_DEBUG(1, "masq_fix_seq() : added previous_delta (%d) to seq\n",ms_seq->previous_delta); + } + } + + +} + +/* + * Fixes th->ack_seq based on ip_masq_seq info. + */ + +static __inline__ void masq_fix_ack_seq(const struct ip_masq_seq *ms_seq, struct tcphdr *th) +{ + __u32 ack_seq; + + ack_seq=ntohl(th->ack_seq); + + /* + * Adjust ack_seq with delta-offset for + * the packets AFTER most recent resized pkt has caused a shift + * for packets before most recent resized pkt, use previous_delta + */ + + if (ms_seq->delta || ms_seq->previous_delta) { + if(after(ack_seq,ms_seq->init_seq)) { + th->ack_seq = htonl(ack_seq-ms_seq->delta); + IP_MASQ_DEBUG(1, "masq_fix_ack_seq() : subtracted delta (%d) from ack_seq\n",ms_seq->delta); + + } else { + th->ack_seq = htonl(ack_seq-ms_seq->previous_delta); + IP_MASQ_DEBUG(1, "masq_fix_ack_seq() : subtracted previous_delta (%d) from ack_seq\n",ms_seq->previous_delta); + } + } + +} + +/* + * Updates ip_masq_seq if pkt has been resized + * Assumes already checked proto==IPPROTO_TCP and diff!=0. 
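+ *
+ *	For example: a PORT/DCC rewrite grows an outgoing segment that
+ *	starts at sequence S by 7 bytes.  previous_delta takes the old
+ *	delta, delta grows by 7 and init_seq becomes S; from then on
+ *	masq_fix_seq() adds delta to output packets past S (and
+ *	previous_delta to older ones), while masq_fix_ack_seq()
+ *	subtracts the same amounts from the peer's acknowledgements.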
+ */ + +static __inline__ void masq_seq_update(struct ip_masq *ms, struct ip_masq_seq *ms_seq, unsigned mflag, __u32 seq, int diff) +{ + /* if (diff == 0) return; */ + + if ( !(ms->flags & mflag) || after(seq, ms_seq->init_seq)) + { + ms_seq->previous_delta=ms_seq->delta; + ms_seq->delta+=diff; + ms_seq->init_seq=seq; + ms->flags |= mflag; + } +} + +/* + * Output pkt hook. Will call bound ip_masq_app specific function + * called by ip_fw_masquerade(), assumes previously checked ms!=NULL + * returns (new - old) skb->len diff. + */ + +int ip_masq_app_pkt_out(struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) +{ + struct ip_masq_app * mapp; + struct iphdr *iph; + struct tcphdr *th; + int diff; + __u32 seq; + + /* + * check if application masquerading is bound to + * this ip_masq. + * assumes that once an ip_masq is bound, + * it will not be unbound during its life. + */ + + if ( (mapp = ms->app) == NULL) + return 0; + + iph = (*skb_p)->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* + * Remember seq number in case this pkt gets resized + */ + + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + + if (ms->protocol == IPPROTO_TCP) { + if (ms->flags & IP_MASQ_F_OUT_SEQ) + masq_fix_seq(&ms->out_seq, th); + if (ms->flags & IP_MASQ_F_IN_SEQ) + masq_fix_ack_seq(&ms->in_seq, th); + } + + /* + * Call private output hook function + */ + + if ( mapp->pkt_out == NULL ) + return 0; + + diff = mapp->pkt_out(mapp, ms, skb_p, maddr); + + /* + * Update ip_masq seq stuff if len has changed. + */ + + if (diff != 0 && ms->protocol == IPPROTO_TCP) + masq_seq_update(ms, &ms->out_seq, IP_MASQ_F_OUT_SEQ, seq, diff); + + return diff; +} + +/* + * Input pkt hook. Will call bound ip_masq_app specific function + * called by ip_fw_demasquerade(), assumes previously checked ms!=NULL. + * returns (new - old) skb->len diff. + */ + +int ip_masq_app_pkt_in(struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) +{ + struct ip_masq_app * mapp; + struct iphdr *iph; + struct tcphdr *th; + int diff; + __u32 seq; + + /* + * check if application masquerading is bound to + * this ip_masq. + * assumes that once an ip_masq is bound, + * it will not be unbound during its life. + */ + + if ( (mapp = ms->app) == NULL) + return 0; + + iph = (*skb_p)->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* + * Remember seq number in case this pkt gets resized + */ + + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + + if (ms->protocol == IPPROTO_TCP) { + if (ms->flags & IP_MASQ_F_IN_SEQ) + masq_fix_seq(&ms->in_seq, th); + if (ms->flags & IP_MASQ_F_OUT_SEQ) + masq_fix_ack_seq(&ms->out_seq, th); + } + + /* + * Call private input hook function + */ + + if ( mapp->pkt_in == NULL ) + return 0; + + diff = mapp->pkt_in(mapp, ms, skb_p, maddr); + + /* + * Update ip_masq seq stuff if len has changed. 
+ */ + + if (diff != 0 && ms->protocol == IPPROTO_TCP) + masq_seq_update(ms, &ms->in_seq, IP_MASQ_F_IN_SEQ, seq, diff); + + return diff; +} + +/* + * /proc/ip_masq_app entry function + */ + +int ip_masq_app_getinfo(char *buffer, char **start, off_t offset, int length, int dummy) +{ + off_t pos=0, begin=0; + int len=0; + struct ip_masq_app * mapp; + unsigned idx; + + if (offset < 40) + len=sprintf(buffer,"%-39s\n", "prot port n_attach name"); + pos = 40; + + for (idx=0 ; idx < IP_MASQ_APP_TAB_SIZE; idx++) + for (mapp = ip_masq_app_base[idx]; mapp ; mapp = mapp->next) { + /* + * If you change the length of this sprintf, then all + * the length calculations need fixing too! + * Line length = 40 (3 + 2 + 7 + 1 + 7 + 1 + 2 + 17) + */ + pos += 40; + if (pos < offset) + continue; + + len += sprintf(buffer+len, "%-3s %-7u %-7d %-17s\n", + masq_proto_name(IP_MASQ_APP_PROTO(mapp->type)), + IP_MASQ_APP_PORT(mapp->type), mapp->n_attach, + mapp->name); + + if(len >= length) + goto done; + } +done: + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; + if (len > length) + len = length; + return len; +} + + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_ip_masq_app = { + PROC_NET_IP_MASQ_APP, 3, "app", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + ip_masq_app_getinfo +}; +#endif + +/* + * Initialization routine + */ + +__initfunc(int ip_masq_app_init(void)) +{ +#ifdef CONFIG_PROC_FS + ip_masq_proc_register(&proc_net_ip_masq_app); +#endif + return 0; +} + +/* + * Replace a segment (of skb->data) with a new one. + * FIXME: Should re-use same skb if space available, this could + * be done if n_len < o_len, unless some extra space + * were already allocated at driver level :P . + */ + +static struct sk_buff * skb_replace(struct sk_buff *skb, int pri, char *o_buf, int o_len, char *n_buf, int n_len) +{ + int maxsize, diff, o_offset; + struct sk_buff *n_skb; + int offset; + + maxsize = skb->truesize; + + diff = n_len - o_len; + o_offset = o_buf - (char*) skb->data; + + if (maxsize <= n_len) { + if (diff != 0) { + memcpy(skb->data + o_offset + n_len,o_buf + o_len, + skb->len - (o_offset + o_len)); + } + + memcpy(skb->data + o_offset, n_buf, n_len); + + n_skb = skb; + skb->len = n_len; + skb->end = skb->head+n_len; + } else { + /* + * Sizes differ, make a copy. + * + * FIXME: move this to core/sbuff.c:skb_grow() + */ + + n_skb = alloc_skb(MAX_HEADER + skb->len + diff, pri); + if (n_skb == NULL) { + IP_MASQ_ERR("skb_replace(): no room left (from %p)\n", + __builtin_return_address(0)); + return skb; + + } + skb_reserve(n_skb, MAX_HEADER); + skb_put(n_skb, skb->len + diff); + + /* + * Copy as much data from the old skb as possible. Even + * though we're only forwarding packets, we need stuff + * like skb->protocol (PPP driver wants it). 
+ */ + offset = n_skb->data - skb->data; + n_skb->nh.raw = skb->nh.raw + offset; + n_skb->h.raw = skb->h.raw + offset; + n_skb->dev = skb->dev; + n_skb->mac.raw = skb->mac.raw + offset; + n_skb->pkt_type = skb->pkt_type; + n_skb->protocol = skb->protocol; + n_skb->ip_summed = skb->ip_summed; + n_skb->dst = dst_clone(skb->dst); + + /* + * Copy pkt in new buffer + */ + + memcpy(n_skb->data, skb->data, o_offset); + memcpy(n_skb->data + o_offset, n_buf, n_len); + memcpy(n_skb->data + o_offset + n_len, o_buf + o_len, + skb->len - (o_offset + o_len) ); + + /* + * Problem, how to replace the new skb with old one, + * preferably inplace + */ + + kfree_skb(skb); + } + return n_skb; +} + +/* + * calls skb_replace() and update ip header if new skb was allocated + */ + +struct sk_buff * ip_masq_skb_replace(struct sk_buff *skb, int pri, char *o_buf, int o_len, char *n_buf, int n_len) +{ + int diff; + struct sk_buff *n_skb; + unsigned skb_len; + + diff = n_len - o_len; + n_skb = skb_replace(skb, pri, o_buf, o_len, n_buf, n_len); + skb_len = skb->len; + + if (diff) + { + struct iphdr *iph; + IP_MASQ_DEBUG(1, "masq_skb_replace(): pkt resized for %d bytes (len=%d)\n", diff, skb->len); + /* + * update ip header + */ + iph = n_skb->nh.iph; + iph->check = 0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + iph->tot_len = htons(skb_len + diff); + } + return n_skb; +} diff --git a/pfinet/linux-src/net/ipv4/ip_masq_autofw.c b/pfinet/linux-src/net/ipv4/ip_masq_autofw.c new file mode 100644 index 00000000..d2a1729c --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_masq_autofw.c @@ -0,0 +1,448 @@ +/* + * IP_MASQ_AUTOFW auto forwarding module + * + * + * $Id: ip_masq_autofw.c,v 1.3 1998/08/29 23:51:10 davem Exp $ + * + * Author: Richard Lynch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + * Fixes: + * Juan Jose Ciarlante : created this new file from ip_masq.c and ip_fw.c + * Juan Jose Ciarlante : modularized + * Juan Jose Ciarlante : use GFP_KERNEL when creating entries + * Juan Jose Ciarlante : call del_timer() when freeing entries (!) 
+ * FIXME: + * - implement refcnt + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/errno.h> +#include <asm/system.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> +#include <linux/if.h> +#include <linux/init.h> +#include <linux/ip_fw.h> +#include <net/ip_masq.h> +#include <net/ip_masq_mod.h> +#include <linux/ip_masq.h> + +#define IP_AUTOFW_EXPIRE 15*HZ + +/* WARNING: bitwise equal to ip_autofw_user in linux/ip_masq.h */ +struct ip_autofw { + struct ip_autofw * next; + __u16 type; + __u16 low; + __u16 hidden; + __u16 high; + __u16 visible; + __u16 protocol; + __u32 lastcontact; + __u32 where; + __u16 ctlproto; + __u16 ctlport; + __u16 flags; + struct timer_list timer; +}; + +/* + * Debug level + */ +#ifdef CONFIG_IP_MASQ_DEBUG +static int debug=0; +MODULE_PARM(debug, "i"); +#endif + +/* + * Auto-forwarding table + */ + +static struct ip_autofw * ip_autofw_hosts = NULL; +static struct ip_masq_mod * mmod_self = NULL; + +/* + * Check if a masq entry should be created for a packet + */ + +static __inline__ struct ip_autofw * ip_autofw_check_range (__u32 where, __u16 port, __u16 protocol, int reqact) +{ + struct ip_autofw *af; + af=ip_autofw_hosts; + port=ntohs(port); + while (af) { + if (af->type==IP_FWD_RANGE && + port>=af->low && + port<=af->high && + protocol==af->protocol && + + /* + * It's ok to create masq entries after + * the timeout if we're in insecure mode + */ + (af->flags & IP_AUTOFW_ACTIVE || !reqact || !(af->flags & IP_AUTOFW_SECURE)) && + (!(af->flags & IP_AUTOFW_SECURE) || af->lastcontact==where || !reqact)) + return(af); + af=af->next; + } + return(NULL); +} + +static __inline__ struct ip_autofw * ip_autofw_check_port (__u16 port, __u16 protocol) +{ + struct ip_autofw *af; + af=ip_autofw_hosts; + port=ntohs(port); + while (af) + { + if (af->type==IP_FWD_PORT && port==af->visible && protocol==af->protocol) + return(af); + af=af->next; + } + return(NULL); +} + +static __inline__ struct ip_autofw * ip_autofw_check_direct (__u16 port, __u16 protocol) +{ + struct ip_autofw *af; + af=ip_autofw_hosts; + port=ntohs(port); + while (af) + { + if (af->type==IP_FWD_DIRECT && af->low<=port && af->high>=port) + return(af); + af=af->next; + } + return(NULL); +} + +static __inline__ void ip_autofw_update_out (__u32 who, __u32 where, __u16 port, __u16 protocol) +{ + struct ip_autofw *af; + af=ip_autofw_hosts; + port=ntohs(port); + while (af) + { + if (af->type==IP_FWD_RANGE && af->ctlport==port && af->ctlproto==protocol) + { + if (af->flags & IP_AUTOFW_USETIME) + { + mod_timer(&af->timer, + jiffies+IP_AUTOFW_EXPIRE); + } + af->flags|=IP_AUTOFW_ACTIVE; + af->lastcontact=where; + af->where=who; + } + af=af->next; + } +} + +#if 0 +static __inline__ void ip_autofw_update_in (__u32 where, __u16 port, __u16 protocol) +{ + struct ip_autofw *af; + af=ip_autofw_check_range(where, port,protocol); + if (af) + { + mod_timer(&af->timer, jiffies+IP_AUTOFW_EXPIRE); + } +} +#endif + + +static __inline__ void ip_autofw_expire(unsigned long data) +{ + struct ip_autofw * af; + af=(struct ip_autofw *) data; + af->flags &= ~IP_AUTOFW_ACTIVE; + af->timer.expires=0; + af->lastcontact=0; + if (af->flags & IP_AUTOFW_SECURE) + af->where=0; +} + + + +static __inline__ int ip_autofw_add(struct ip_autofw_user * af) +{ + struct ip_autofw * newaf; + newaf = kmalloc( sizeof(struct ip_autofw), GFP_KERNEL ); + init_timer(&newaf->timer); + if ( newaf == NULL ) + { + printk("ip_autofw_add: malloc said no\n"); + 
return( ENOMEM ); + } + + MOD_INC_USE_COUNT; + + memcpy(newaf, af, sizeof(struct ip_autofw_user)); + newaf->timer.data = (unsigned long) newaf; + newaf->timer.function = ip_autofw_expire; + newaf->timer.expires = 0; + newaf->lastcontact=0; + newaf->next=ip_autofw_hosts; + ip_autofw_hosts=newaf; + ip_masq_mod_inc_nent(mmod_self); + return(0); +} + +static __inline__ int ip_autofw_del(struct ip_autofw_user * af) +{ + struct ip_autofw ** af_p, *curr; + + for (af_p=&ip_autofw_hosts, curr=*af_p; (curr=*af_p); af_p = &(*af_p)->next) { + if (af->type == curr->type && + af->low == curr->low && + af->high == curr->high && + af->hidden == curr->hidden && + af->visible == curr->visible && + af->protocol == curr->protocol && + af->where == curr->where && + af->ctlproto == curr->ctlproto && + af->ctlport == curr->ctlport) + { + ip_masq_mod_dec_nent(mmod_self); + *af_p = curr->next; + if (af->flags&IP_AUTOFW_ACTIVE) + del_timer(&curr->timer); + kfree_s(curr,sizeof(struct ip_autofw)); + MOD_DEC_USE_COUNT; + return 0; + } + curr=curr->next; + } + return EINVAL; +} + +static __inline__ int ip_autofw_flush(void) +{ + struct ip_autofw * af; + + while (ip_autofw_hosts) + { + af=ip_autofw_hosts; + ip_masq_mod_dec_nent(mmod_self); + ip_autofw_hosts=ip_autofw_hosts->next; + if (af->flags&IP_AUTOFW_ACTIVE) + del_timer(&af->timer); + kfree_s(af,sizeof(struct ip_autofw)); + MOD_DEC_USE_COUNT; + } + return(0); +} + +/* + * Methods for registered object + */ + +static int autofw_ctl(int optname, struct ip_masq_ctl *mctl, int optlen) +{ + struct ip_autofw_user *af = &mctl->u.autofw_user; + + switch (mctl->m_cmd) { + case IP_MASQ_CMD_ADD: + case IP_MASQ_CMD_INSERT: + if (optlen<sizeof(*af)) + return EINVAL; + return ip_autofw_add(af); + case IP_MASQ_CMD_DEL: + if (optlen<sizeof(*af)) + return EINVAL; + return ip_autofw_del(af); + case IP_MASQ_CMD_FLUSH: + return ip_autofw_flush(); + + } + return EINVAL; +} + + +static int autofw_out_update(const struct sk_buff *skb, const struct iphdr *iph, struct ip_masq *ms) +{ + const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); + /* + * Update any ipautofw entries ... 
+ */ + + ip_autofw_update_out(iph->saddr, iph->daddr, portp[1], iph->protocol); + return IP_MASQ_MOD_NOP; +} + +static struct ip_masq * autofw_out_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr) +{ + const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); + /* + * If the source port is supposed to match the masq port, then + * make it so + */ + + if (ip_autofw_check_direct(portp[1],iph->protocol)) { + return ip_masq_new(iph->protocol, + maddr, portp[0], + iph->saddr, portp[0], + iph->daddr, portp[1], + 0); + } + return NULL; +} + +#if 0 +static int autofw_in_update(const struct sk_buff *skb, const struct iphdr *iph, __u16 *portp, struct ip_masq *ms) +{ + const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); + ip_autofw_update_in(iph->saddr, portp[1], iph->protocol); + return IP_MASQ_MOD_NOP; +} +#endif + +static int autofw_in_rule(const struct sk_buff *skb, const struct iphdr *iph) +{ + const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); + return (ip_autofw_check_range(iph->saddr, portp[1], iph->protocol, 0) + || ip_autofw_check_direct(portp[1], iph->protocol) + || ip_autofw_check_port(portp[1], iph->protocol)); +} + +static struct ip_masq * autofw_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr) +{ + const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); + struct ip_autofw *af; + + if ((af=ip_autofw_check_range(iph->saddr, portp[1], iph->protocol, 0))) { + IP_MASQ_DEBUG(1-debug, "autofw_check_range HIT\n"); + return ip_masq_new(iph->protocol, + maddr, portp[1], + af->where, portp[1], + iph->saddr, portp[0], + 0); + } + if ((af=ip_autofw_check_port(portp[1], iph->protocol)) ) { + IP_MASQ_DEBUG(1-debug, "autofw_check_port HIT\n"); + return ip_masq_new(iph->protocol, + maddr, htons(af->visible), + af->where, htons(af->hidden), + iph->saddr, portp[0], + 0); + } + return NULL; +} + +#ifdef CONFIG_PROC_FS +static int autofw_procinfo(char *buffer, char **start, off_t offset, + int length, int unused) +{ + off_t pos=0, begin=0; + struct ip_autofw * af; + int len=0; + + len=sprintf(buffer,"Type Prot Low High Vis Hid Where Last CPto CPrt Timer Flags\n"); + + for(af = ip_autofw_hosts; af ; af = af->next) + { + len+=sprintf(buffer+len,"%4X %4X %04X-%04X/%04X %04X %08lX %08lX %04X %04X %6lu %4X\n", + af->type, + af->protocol, + af->low, + af->high, + af->visible, + af->hidden, + ntohl(af->where), + ntohl(af->lastcontact), + af->ctlproto, + af->ctlport, + (af->timer.expires<jiffies ? 
0 : af->timer.expires-jiffies), + af->flags); + + pos=begin+len; + if(pos<offset) + { + len=0; + begin=pos; + } + if(pos>offset+length) + break; + } + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + return len; +} + +static struct proc_dir_entry autofw_proc_entry = { + 0, 0, NULL, + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + autofw_procinfo +}; + +#define proc_ent &autofw_proc_entry +#else /* !CONFIG_PROC_FS */ + +#define proc_ent NULL +#endif + + +#define autofw_in_update NULL +#define autofw_out_rule NULL +#define autofw_mod_init NULL +#define autofw_mod_done NULL + +static struct ip_masq_mod autofw_mod = { + NULL, /* next */ + NULL, /* next_reg */ + "autofw", /* name */ + ATOMIC_INIT(0), /* nent */ + ATOMIC_INIT(0), /* refcnt */ + proc_ent, + autofw_ctl, + autofw_mod_init, + autofw_mod_done, + autofw_in_rule, + autofw_in_update, + autofw_in_create, + autofw_out_rule, + autofw_out_update, + autofw_out_create, +}; + +__initfunc(int ip_autofw_init(void)) +{ + return register_ip_masq_mod ((mmod_self=&autofw_mod)); +} + +int ip_autofw_done(void) +{ + return unregister_ip_masq_mod(&autofw_mod); +} + +#ifdef MODULE +EXPORT_NO_SYMBOLS; + +int init_module(void) +{ + if (ip_autofw_init() != 0) + return -EIO; + return 0; +} + +void cleanup_module(void) +{ + if (ip_autofw_done() != 0) + printk(KERN_INFO "ip_autofw_done(): can't remove module"); +} + +#endif /* MODULE */ diff --git a/pfinet/linux-src/net/ipv4/ip_masq_cuseeme.c b/pfinet/linux-src/net/ipv4/ip_masq_cuseeme.c new file mode 100644 index 00000000..9b412baf --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_masq_cuseeme.c @@ -0,0 +1,264 @@ +/* + * IP_MASQ_FTP CUSeeMe masquerading module + * + * + * Version: @(#)$Id: ip_masq_cuseeme.c,v 1.4 1998/10/06 04:48:57 davem Exp $ + * + * Author: Richard Lynch + * + * + * Fixes: + * Richard Lynch : Updated patch to conform to new module + * specifications + * Nigel Metheringham : Multiple port support + * Michael Owings : Fixed broken init code + * Added code to update inbound + * packets with correct local addresses. + * Fixes audio and "chat" problems + * Thanx to the CU-SeeMe Consortium for + * technical docs + * Steven Clarke : Small changes for 2.1 + * + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Multiple Port Support + * The helper can be made to handle up to MAX_MASQ_APP_PORTS (normally 12) + * with the port numbers being defined at module load time. The module + * uses the symbol "ports" to define a list of monitored ports, which can + * be specified on the insmod command line as + * ports=x1,x2,x3... + * where x[n] are integer port numbers. This option can be put into + * /etc/conf.modules (or /etc/modules.conf depending on your config) + * where modload will pick it up should you use modload to load your + * modules. 
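+ *
+ *	For example, to monitor the default port plus one extra one:
+ *		insmod ip_masq_cuseeme.o ports=7648,7649
+ *	(7649 is only an illustration; 7648 is the built-in default.)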
+ * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <net/udp.h> + +/* #define IP_MASQ_NDEBUG */ +#include <net/ip_masq.h> + +#pragma pack(1) +/* CU-SeeMe Data Header */ +typedef struct { + u_short dest_family; + u_short dest_port; + u_long dest_addr; + short family; + u_short port; + u_long addr; + u_long seq; + u_short msg; + u_short data_type; + u_short packet_len; +} cu_header; + +/* Open Continue Header */ +typedef struct { + cu_header cu_head; + u_short client_count; /* Number of client info structs */ + u_long seq_no; + char user_name[20]; + char stuff[4]; /* flags, version stuff, etc */ +}oc_header; + +/* client info structures */ +typedef struct { + u_long address; /* Client address */ + char stuff[8]; /* Flags, pruning bitfield, packet counts etc */ +} client_info; +#pragma pack() + +/* + * List of ports (up to MAX_MASQ_APP_PORTS) to be handled by helper + * First port is set to the default port. + */ +static int ports[MAX_MASQ_APP_PORTS] = {7648}; /* I rely on the trailing items being set to zero */ +struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS]; + +/* + * Debug level + */ +#ifdef CONFIG_IP_MASQ_DEBUG +static int debug=0; +MODULE_PARM(debug, "i"); +#endif + +MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i"); + +static int +masq_cuseeme_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_INC_USE_COUNT; + return 0; +} + +static int +masq_cuseeme_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_DEC_USE_COUNT; + return 0; +} + +int +masq_cuseeme_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph = skb->nh.iph; + struct udphdr *uh = (struct udphdr *)&(((char *)iph)[iph->ihl*4]); + cu_header *cu_head; + char *data=(char *)&uh[1]; + + if (skb->len - ((unsigned char *) data - skb->h.raw) >= sizeof(cu_header)) + { + cu_head = (cu_header *) data; + /* cu_head->port = ms->mport; */ + if( cu_head->addr ) + cu_head->addr = (u_long) maddr; + if(ntohs(cu_head->data_type) == 257) + IP_MASQ_DEBUG(1-debug, "Sending talk packet!\n"); + } + return 0; +} + +int +masq_cuseeme_in (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph = skb->nh.iph; + struct udphdr *uh = (struct udphdr *)&(((char *)iph)[iph->ihl*4]); + cu_header *cu_head; + oc_header *oc; + client_info *ci; + char *data=(char *)&uh[1]; + u_short len = skb->len - ((unsigned char *) data - skb->h.raw); + int i, off; + + if (len >= sizeof(cu_header)) + { + cu_head = (cu_header *) data; + if(cu_head->dest_addr) /* Correct destination address */ + cu_head->dest_addr = (u_long) ms->saddr; + if(ntohs(cu_head->data_type)==101 && len > sizeof(oc_header)) + { + oc = (oc_header * ) data; + /* Spin (grovel) thru client_info structs till we find our own */ + off=sizeof(oc_header); + for(i=0; + (i < oc->client_count && off+sizeof(client_info) <= len); + i++) + { + ci=(client_info *)(data+off); + if(ci->address==(u_long) maddr) + { + /* Update w/ our real ip address and exit */ + ci->address = (u_long) ms->saddr; + break; + } + else + off+=sizeof(client_info); + } + } + } + return 0; +} + +struct ip_masq_app ip_masq_cuseeme = { + NULL, /* next */ + "cuseeme", + 0, /* type */ + 0, /* n_attach */ + 
masq_cuseeme_init_1, /* ip_masq_init_1 */ + masq_cuseeme_done_1, /* ip_masq_done_1 */ + masq_cuseeme_out, /* pkt_out */ + masq_cuseeme_in /* pkt_in */ +}; + + +/* + * ip_masq_cuseeme initialization + */ + +__initfunc(int ip_masq_cuseeme_init(void)) +{ + int i, j; + + for (i=0; (i<MAX_MASQ_APP_PORTS); i++) { + if (ports[i]) { + if ((masq_incarnations[i] = kmalloc(sizeof(struct ip_masq_app), + GFP_KERNEL)) == NULL) + return -ENOMEM; + memcpy(masq_incarnations[i], &ip_masq_cuseeme, sizeof(struct ip_masq_app)); + if ((j = register_ip_masq_app(masq_incarnations[i], + IPPROTO_UDP, + ports[i]))) { + return j; + } +#if DEBUG_CONFIG_IP_MASQ_CUSEEME + IP_MASQ_DEBUG(1-debug, "CuSeeMe: loaded support on port[%d] = %d\n", + i, ports[i]); +#endif + } else { + /* To be safe, force the incarnation table entry to NULL */ + masq_incarnations[i] = NULL; + } + } + return 0; +} + +/* + * ip_masq_cuseeme fin. + */ + +int ip_masq_cuseeme_done(void) +{ + int i, j, k; + + k=0; + for (i=0; (i<MAX_MASQ_APP_PORTS); i++) { + if (masq_incarnations[i]) { + if ((j = unregister_ip_masq_app(masq_incarnations[i]))) { + k = j; + } else { + kfree(masq_incarnations[i]); + masq_incarnations[i] = NULL; + IP_MASQ_DEBUG(1-debug, "CuSeeMe: unloaded support on port[%d] = %d\n", i, ports[i]); + } + } + } + return k; +} + +#ifdef MODULE +EXPORT_NO_SYMBOLS; + +int init_module(void) +{ + if (ip_masq_cuseeme_init() != 0) + return -EIO; + return 0; +} + +void cleanup_module(void) +{ + if (ip_masq_cuseeme_done() != 0) + IP_MASQ_DEBUG(1-debug, "ip_masq_cuseeme: can't remove module"); +} + +#endif /* MODULE */ diff --git a/pfinet/linux-src/net/ipv4/ip_masq_ftp.c b/pfinet/linux-src/net/ipv4/ip_masq_ftp.c new file mode 100644 index 00000000..35d1f544 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_masq_ftp.c @@ -0,0 +1,393 @@ +/* + * IP_MASQ_FTP ftp masquerading module + * + * + * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 + * + * Author: Wouter Gadeyne + * + * + * Fixes: + * Wouter Gadeyne : Fixed masquerading support of ftp PORT commands + * Juan Jose Ciarlante : Code moved and adapted from ip_fw.c + * Keith Owens : Add keep alive for ftp control channel + * Nigel Metheringham : Added multiple port support + * Juan Jose Ciarlante : Use control_add() for ftp control chan + * Juan Jose Ciarlante : Litl bits for 2.1 + * Juan Jose Ciarlante : use ip_masq_listen() + * Juan Jose Ciarlante : use private app_data for own flag(s) + * + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Multiple Port Support + * The helper can be made to handle up to MAX_MASQ_APP_PORTS (normally 12) + * with the port numbers being defined at module load time. The module + * uses the symbol "ports" to define a list of monitored ports, which can + * be specified on the insmod command line as + * ports=x1,x2,x3... + * where x[n] are integer port numbers. This option can be put into + * /etc/conf.modules (or /etc/modules.conf depending on your config) + * where modload will pick it up should you use modload to load your + * modules. 
+ * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <net/tcp.h> + +/* #define IP_MASQ_NDEBUG */ +#include <net/ip_masq.h> + + +/* + * List of ports (up to MAX_MASQ_APP_PORTS) to be handled by helper + * First port is set to the default port. + */ +static int ports[MAX_MASQ_APP_PORTS] = {21}; /* I rely on the trailing items being set to zero */ +struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS]; + +/* + * Debug level + */ +#ifdef CONFIG_IP_MASQ_DEBUG +static int debug=0; +MODULE_PARM(debug, "i"); +#endif + +MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i"); + +/* Dummy variable */ +static int masq_ftp_pasv; + +static int +masq_ftp_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_INC_USE_COUNT; + return 0; +} + +static int +masq_ftp_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_DEC_USE_COUNT; + return 0; +} + +int +masq_ftp_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct tcphdr *th; + char *p, *data, *data_limit; + unsigned char p1,p2,p3,p4,p5,p6; + __u32 from; + __u16 port; + struct ip_masq *n_ms; + char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ + unsigned buf_len; + int diff; + + skb = *skb_p; + iph = skb->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)&th[1]; + + data_limit = skb->h.raw + skb->len - 18; + if (skb->len >= 6 && (memcmp(data, "PASV\r\n", 6) == 0 || memcmp(data, "pasv\r\n", 6) == 0)) + ms->app_data = &masq_ftp_pasv; + + while (data < data_limit) + { + if (memcmp(data,"PORT ",5) && memcmp(data,"port ",5)) + { + data ++; + continue; + } + p = data+5; + p1 = simple_strtoul(data+5,&data,10); + if (*data!=',') + continue; + p2 = simple_strtoul(data+1,&data,10); + if (*data!=',') + continue; + p3 = simple_strtoul(data+1,&data,10); + if (*data!=',') + continue; + p4 = simple_strtoul(data+1,&data,10); + if (*data!=',') + continue; + p5 = simple_strtoul(data+1,&data,10); + if (*data!=',') + continue; + p6 = simple_strtoul(data+1,&data,10); + if (*data!='\r' && *data!='\n') + continue; + + from = (p1<<24) | (p2<<16) | (p3<<8) | p4; + port = (p5<<8) | p6; + + IP_MASQ_DEBUG(1-debug, "PORT %X:%X detected\n",from,port); + + /* + * Now update or create an masquerade entry for it + */ + + IP_MASQ_DEBUG(1-debug, "protocol %d %lX:%X %X:%X\n", iph->protocol, htonl(from), htons(port), iph->daddr, 0); + + n_ms = ip_masq_out_get(iph->protocol, + htonl(from), htons(port), + iph->daddr, 0); + if (!n_ms) { + n_ms = ip_masq_new(IPPROTO_TCP, + maddr, 0, + htonl(from), htons(port), + iph->daddr, 0, + IP_MASQ_F_NO_DPORT); + + if (n_ms==NULL) + return 0; + ip_masq_control_add(n_ms, ms); + } + + /* + * Replace the old PORT with the new one + */ + from = ntohl(n_ms->maddr); + port = ntohs(n_ms->mport); + sprintf(buf,"%d,%d,%d,%d,%d,%d", + from>>24&255,from>>16&255,from>>8&255,from&255, + port>>8&255,port&255); + buf_len = strlen(buf); + + IP_MASQ_DEBUG(1-debug, "new PORT %X:%X\n",from,port); + + /* + * Calculate required delta-offset to keep TCP happy + */ + + diff = buf_len - (data-p); + + /* + * No shift. 
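+		 *	(If diff != 0 the segment is resized through
+		 *	ip_masq_skb_replace() below, and the diff returned by
+		 *	this helper is fed to masq_seq_update() by
+		 *	ip_masq_app_pkt_out() so later seq/ack numbers can be
+		 *	corrected.)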
+ */ + + if (diff==0) { + /* + * simple case, just replace the old PORT cmd + */ + memcpy(p,buf,buf_len); + } else { + + *skb_p = ip_masq_skb_replace(skb, GFP_ATOMIC, p, data-p, buf, buf_len); + } + /* + * Move tunnel to listen state + */ + ip_masq_listen(n_ms); + ip_masq_put(n_ms); + + return diff; + + } + return 0; + +} + +/* + * Look at incoming ftp packets to catch the response to a PASV command. When + * we see one we build a masquerading entry for the client address, client port + * 0 (unknown at the moment), the server address and the server port. Mark the + * current masquerade entry as a control channel and point the new entry at the + * control entry. All this work just for ftp keepalive across masquerading. + * + * The incoming packet should be something like + * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". + * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. + * ncftp 2.3.0 cheats by skipping the leading number then going 22 bytes into + * the data so we do the same. If it's good enough for ncftp then it's good + * enough for me. + * + * In this case, the client is the source machine being masqueraded, the server + * is the destination for ftp requests. It all depends on your point of view ... + */ + +int +masq_ftp_in (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_limit; + unsigned char p1,p2,p3,p4,p5,p6; + __u32 to; + __u16 port; + struct ip_masq *n_ms; + + if (ms->app_data != &masq_ftp_pasv) + return 0; /* quick exit if no outstanding PASV */ + + skb = *skb_p; + iph = skb->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)&th[1]; + data_limit = skb->h.raw + skb->len; + + while (data < data_limit && *data != ' ') + ++data; + while (data < data_limit && *data == ' ') + ++data; + data += 22; + if (data >= data_limit || *data != '(') + return 0; + p1 = simple_strtoul(data+1, &data, 10); + if (data >= data_limit || *data != ',') + return 0; + p2 = simple_strtoul(data+1, &data, 10); + if (data >= data_limit || *data != ',') + return 0; + p3 = simple_strtoul(data+1, &data, 10); + if (data >= data_limit || *data != ',') + return 0; + p4 = simple_strtoul(data+1, &data, 10); + if (data >= data_limit || *data != ',') + return 0; + p5 = simple_strtoul(data+1, &data, 10); + if (data >= data_limit || *data != ',') + return 0; + p6 = simple_strtoul(data+1, &data, 10); + if (data >= data_limit || *data != ')') + return 0; + + to = (p1<<24) | (p2<<16) | (p3<<8) | p4; + port = (p5<<8) | p6; + + /* + * Now update or create an masquerade entry for it + */ + IP_MASQ_DEBUG(1-debug, "PASV response %lX:%X %X:%X detected\n", ntohl(ms->saddr), 0, to, port); + + n_ms = ip_masq_out_get(iph->protocol, + ms->saddr, 0, + htonl(to), htons(port)); + if (!n_ms) { + n_ms = ip_masq_new(IPPROTO_TCP, + maddr, 0, + ms->saddr, 0, + htonl(to), htons(port), + IP_MASQ_F_NO_SPORT); + + if (n_ms==NULL) + return 0; + ip_masq_control_add(n_ms, ms); + } + +#if 0 /* v0.12 state processing */ + + /* + * keep for a bit longer than tcp_fin, client may not issue open + * to server port before tcp_fin_timeout. 
+ */ + n_ms->timeout = ip_masq_expire->tcp_fin_timeout*3; +#endif + ms->app_data = NULL; + ip_masq_put(n_ms); + + return 0; /* no diff required for incoming packets, thank goodness */ +} + +struct ip_masq_app ip_masq_ftp = { + NULL, /* next */ + "ftp", /* name */ + 0, /* type */ + 0, /* n_attach */ + masq_ftp_init_1, /* ip_masq_init_1 */ + masq_ftp_done_1, /* ip_masq_done_1 */ + masq_ftp_out, /* pkt_out */ + masq_ftp_in, /* pkt_in */ +}; + +/* + * ip_masq_ftp initialization + */ + +__initfunc(int ip_masq_ftp_init(void)) +{ + int i, j; + + for (i=0; (i<MAX_MASQ_APP_PORTS); i++) { + if (ports[i]) { + if ((masq_incarnations[i] = kmalloc(sizeof(struct ip_masq_app), + GFP_KERNEL)) == NULL) + return -ENOMEM; + memcpy(masq_incarnations[i], &ip_masq_ftp, sizeof(struct ip_masq_app)); + if ((j = register_ip_masq_app(masq_incarnations[i], + IPPROTO_TCP, + ports[i]))) { + return j; + } + IP_MASQ_DEBUG(1-debug, "Ftp: loaded support on port[%d] = %d\n", + i, ports[i]); + } else { + /* To be safe, force the incarnation table entry to NULL */ + masq_incarnations[i] = NULL; + } + } + return 0; +} + +/* + * ip_masq_ftp fin. + */ + +int ip_masq_ftp_done(void) +{ + int i, j, k; + + k=0; + for (i=0; (i<MAX_MASQ_APP_PORTS); i++) { + if (masq_incarnations[i]) { + if ((j = unregister_ip_masq_app(masq_incarnations[i]))) { + k = j; + } else { + kfree(masq_incarnations[i]); + masq_incarnations[i] = NULL; + IP_MASQ_DEBUG(1-debug, "Ftp: unloaded support on port[%d] = %d\n", + i, ports[i]); + } + } + } + return k; +} + +#ifdef MODULE +EXPORT_NO_SYMBOLS; + +int init_module(void) +{ + if (ip_masq_ftp_init() != 0) + return -EIO; + return 0; +} + +void cleanup_module(void) +{ + if (ip_masq_ftp_done() != 0) + printk(KERN_INFO "ip_masq_ftp: can't remove module"); +} + +#endif /* MODULE */ diff --git a/pfinet/linux-src/net/ipv4/ip_masq_irc.c b/pfinet/linux-src/net/ipv4/ip_masq_irc.c new file mode 100644 index 00000000..e52a5720 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_masq_irc.c @@ -0,0 +1,345 @@ +/* + * IP_MASQ_IRC irc masquerading module + * + * + * Version: @(#)ip_masq_irc.c 0.04 99/06/19 + * + * Author: Juan Jose Ciarlante + * + * Additions: + * - recognize a few non-irc-II DCC requests (Oliver Wagner) + * DCC MOVE (AmIRC/DCC.MOVE; SEND with resuming) + * DCC SCHAT (AmIRC IDEA encrypted CHAT) + * DCC TSEND (AmIRC/PIRCH SEND without ACKs) + * Fixes: + * Juan Jose Ciarlante : set NO_DADDR flag in ip_masq_new() + * Nigel Metheringham : Added multiple port support + * Juan Jose Ciarlante : litl bits for 2.1 + * Oliver Wagner : more IRC cmds processing + * <winmute@lucifer.gv.kotnet.org> + * Juan Jose Ciarlante : put new ms entry to listen() + * Scottie Shore : added support for clients that add extra args + * <sshore@escape.ca> + * + * FIXME: + * - detect also previous "PRIVMSG" string ?. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Multiple Port Support + * The helper can be made to handle up to MAX_MASQ_APP_PORTS (normally 12) + * with the port numbers being defined at module load time. The module + * uses the symbol "ports" to define a list of monitored ports, which can + * be specified on the insmod command line as + * ports=x1,x2,x3... + * where x[n] are integer port numbers. 
This option can be put into + * /etc/conf.modules (or /etc/modules.conf depending on your config) + * where modload will pick it up should you use modload to load your + * modules. + * + */ + +#include <linux/config.h> +#include <linux/module.h> + +#include <linux/types.h> +#include <linux/kernel.h> +#include <asm/system.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/ip_masq.h> + + +/* + * List of ports (up to MAX_MASQ_APP_PORTS) to be handled by helper + * First port is set to the default port. + */ +int ports[MAX_MASQ_APP_PORTS] = {6667}; /* I rely on the trailing items being set to zero */ +struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS]; +/* + * Debug level + */ +#ifdef CONFIG_IP_MASQ_DEBUG +static int debug=0; +MODULE_PARM(debug, "i"); +#endif + +MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i"); + + +/* + * List of supported DCC protocols + */ + +#define NUM_DCCPROTO 5 + +struct dccproto +{ + char *match; + int matchlen; +}; + +struct dccproto dccprotos[NUM_DCCPROTO] = { + { "SEND ", 5 }, + { "CHAT ", 5 }, + { "MOVE ", 5 }, + { "TSEND ", 6 }, + { "SCHAT ", 6 } +}; +#define MAXMATCHLEN 6 + +static int +masq_irc_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_INC_USE_COUNT; + return 0; +} + +static int +masq_irc_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_DEC_USE_COUNT; + return 0; +} + +int +masq_irc_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_limit; + __u32 s_addr; + __u16 s_port; + struct ip_masq *n_ms; + char buf[20]; /* "m_addr m_port" (dec base)*/ + unsigned buf_len; + int diff; + char *dcc_p, *addr_beg_p, *addr_end_p; + + skb = *skb_p; + iph = skb->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)&th[1]; + + /* + * Hunt irc DCC string, the _shortest_: + * + * strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27 + * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28 + * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26 + * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26 + * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27 + * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits) + * P: bound port (min 1 d ) + * F: filename (min 1 d ) + * S: size (min 1 d ) + * 0x01, \n: terminators + */ + + data_limit = skb->h.raw + skb->len; + + while (data < (data_limit - ( 22 + MAXMATCHLEN ) ) ) + { + int i; + if (memcmp(data,"\1DCC ",5)) { + data ++; + continue; + } + + dcc_p = data; + data += 5; /* point to DCC cmd */ + + for(i=0; i<NUM_DCCPROTO; i++) + { + /* + * go through the table and hunt a match string + */ + + if( memcmp(data, dccprotos[i].match, dccprotos[i].matchlen ) == 0 ) + { + data += dccprotos[i].matchlen; + + /* + * skip next string. + */ + + while( *data++ != ' ') + + /* + * must still parse, at least, "AAAAAAAA P\1\n", + * 12 bytes left. + */ + if (data > (data_limit-12)) return 0; + + + addr_beg_p = data; + + /* + * client bound address in dec base + */ + + s_addr = simple_strtoul(data,&data,10); + if (*data++ !=' ') + continue; + + /* + * client bound port in dec base + */ + + s_port = simple_strtoul(data,&data,10); + addr_end_p = data; + + /* + * Now create an masquerade entry for it + * must set NO_DPORT and NO_DADDR because + * connection is requested by another client. 
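+			 *	e.g. an outgoing "\1DCC SEND f 3232235778 1024\1"
+			 *	advertises the client's own address and port in
+			 *	decimal; the sprintf() below substitutes
+			 *	ntohl(n_ms->maddr) and ntohs(n_ms->mport) so the
+			 *	peer connects to the masquerading host instead
+			 *	(3232235778 is just an illustrative address).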
+ */ + + n_ms = ip_masq_new(IPPROTO_TCP, + maddr, 0, + htonl(s_addr),htons(s_port), + 0, 0, + IP_MASQ_F_NO_DPORT|IP_MASQ_F_NO_DADDR); + if (n_ms==NULL) + return 0; + + /* + * Replace the old "address port" with the new one + */ + + buf_len = sprintf(buf,"%lu %u", + ntohl(n_ms->maddr),ntohs(n_ms->mport)); + + /* + * Calculate required delta-offset to keep TCP happy + */ + + diff = buf_len - (addr_end_p-addr_beg_p); + + *addr_beg_p = '\0'; + IP_MASQ_DEBUG(1-debug, "masq_irc_out(): '%s' %X:%X detected (diff=%d)\n", dcc_p, s_addr,s_port, diff); + + /* + * No shift. + */ + + if (diff==0) { + /* + * simple case, just copy. + */ + memcpy(addr_beg_p,buf,buf_len); + } else { + + *skb_p = ip_masq_skb_replace(skb, GFP_ATOMIC, + addr_beg_p, addr_end_p-addr_beg_p, + buf, buf_len); + } + ip_masq_listen(n_ms); + ip_masq_put(n_ms); + return diff; + } + } + } + return 0; + +} + +/* + * Main irc object + * You need 1 object per port in case you need + * to offer also other used irc ports (6665,6666,etc), + * they will share methods but they need own space for + * data. + */ + +struct ip_masq_app ip_masq_irc = { + NULL, /* next */ + "irc", /* name */ + 0, /* type */ + 0, /* n_attach */ + masq_irc_init_1, /* init_1 */ + masq_irc_done_1, /* done_1 */ + masq_irc_out, /* pkt_out */ + NULL /* pkt_in */ +}; + +/* + * ip_masq_irc initialization + */ + +__initfunc(int ip_masq_irc_init(void)) +{ + int i, j; + + for (i=0; (i<MAX_MASQ_APP_PORTS); i++) { + if (ports[i]) { + if ((masq_incarnations[i] = kmalloc(sizeof(struct ip_masq_app), + GFP_KERNEL)) == NULL) + return -ENOMEM; + memcpy(masq_incarnations[i], &ip_masq_irc, sizeof(struct ip_masq_app)); + if ((j = register_ip_masq_app(masq_incarnations[i], + IPPROTO_TCP, + ports[i]))) { + return j; + } + IP_MASQ_DEBUG(1-debug, + "Irc: loaded support on port[%d] = %d\n", + i, ports[i]); + } else { + /* To be safe, force the incarnation table entry to NULL */ + masq_incarnations[i] = NULL; + } + } + return 0; +} + +/* + * ip_masq_irc fin. 
+ */ + +int ip_masq_irc_done(void) +{ + int i, j, k; + + k=0; + for (i=0; (i<MAX_MASQ_APP_PORTS); i++) { + if (masq_incarnations[i]) { + if ((j = unregister_ip_masq_app(masq_incarnations[i]))) { + k = j; + } else { + kfree(masq_incarnations[i]); + masq_incarnations[i] = NULL; + IP_MASQ_DEBUG(1-debug, "Irc: unloaded support on port[%d] = %d\n", + i, ports[i]); + } + } + } + return k; +} + + +#ifdef MODULE +EXPORT_NO_SYMBOLS; + +int init_module(void) +{ + if (ip_masq_irc_init() != 0) + return -EIO; + return 0; +} + +void cleanup_module(void) +{ + if (ip_masq_irc_done() != 0) + printk(KERN_INFO "ip_masq_irc: can't remove module"); +} + +#endif /* MODULE */ diff --git a/pfinet/linux-src/net/ipv4/ip_masq_mfw.c b/pfinet/linux-src/net/ipv4/ip_masq_mfw.c new file mode 100644 index 00000000..60c77970 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_masq_mfw.c @@ -0,0 +1,769 @@ +/* + * IP_MASQ_MARKFW masquerading module + * + * Does (reverse-masq) forwarding based on skb->fwmark value + * + * $Id: ip_masq_mfw.c,v 1.3.2.1 1999/07/02 10:10:03 davem Exp $ + * + * Author: Juan Jose Ciarlante <jjciarla@raiz.uncu.edu.ar> + * based on Steven Clarke's portfw + * + * Fixes: + * JuanJo Ciarlante: added u-space sched support + * JuanJo Ciarlante: if rport==0, use packet dest port *grin* + * JuanJo Ciarlante: fixed tcp syn&&!ack creation + * + * + */ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/list.h> +#include <net/ip.h> +#include <linux/ip_fw.h> +#include <linux/ip_masq.h> +#include <net/ip_masq.h> +#include <net/ip_masq_mod.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <asm/softirq.h> +#include <asm/spinlock.h> +#include <asm/atomic.h> + +static struct ip_masq_mod *mmod_self = NULL; +#ifdef CONFIG_IP_MASQ_DEBUG +static int debug=0; +MODULE_PARM(debug, "i"); +#endif + +/* + * Lists structure: + * There is a "main" linked list with entries hashed + * by fwmark value (struct ip_masq_mfw, the "m-entries"). + * + * Each of this m-entry holds a double linked list + * of "forward-to" hosts (struct ip_masq_mfw_host, the "m.host"), + * the round-robin scheduling takes place by rotating m.host entries + * "inside" its m-entry. + */ + +/* + * Each forwarded host (addr:port) is stored here + */ +struct ip_masq_mfw_host { + struct list_head list; + __u32 addr; + __u16 port; + __u16 pad0; + __u32 fwmark; + int pref; + atomic_t pref_cnt; +}; + +#define IP_MASQ_MFW_HSIZE 16 +/* + * This entries are indexed by fwmark, + * they hold a list of forwarded addr:port + */ + +struct ip_masq_mfw { + struct ip_masq_mfw *next; /* linked list */ + __u32 fwmark; /* key: firewall mark */ + struct list_head hosts; /* list of forward-to hosts */ + atomic_t nhosts; /* number of "" */ + rwlock_t lock; +}; + + +static struct semaphore mfw_sema = MUTEX; +static rwlock_t mfw_lock = RW_LOCK_UNLOCKED; + +static struct ip_masq_mfw *ip_masq_mfw_table[IP_MASQ_MFW_HSIZE]; + +static __inline__ int mfw_hash_val(int fwmark) +{ + return fwmark & 0x0f; +} + +/* + * Get m-entry by "fwmark" + * Caller must lock tables. + */ + +static struct ip_masq_mfw *__mfw_get(int fwmark) +{ + struct ip_masq_mfw* mfw; + int hash = mfw_hash_val(fwmark); + + for (mfw=ip_masq_mfw_table[hash];mfw;mfw=mfw->next) { + if (mfw->fwmark==fwmark) { + goto out; + } + } +out: + return mfw; +} + +/* + * Links m-entry. + * Caller should have checked if already present for same fwmark + * + * Caller must lock tables. 
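+ *
+ *	(New entries go at the head of the hash chain selected by
+ *	mfw_hash_val(), i.e. fwmark & 0x0f; marks sharing the low four
+ *	bits land on the same chain and are told apart by the fwmark
+ *	compare in __mfw_get().)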
+ */ +static int __mfw_add(struct ip_masq_mfw *mfw) +{ + int fwmark = mfw->fwmark; + int hash = mfw_hash_val(fwmark); + + mfw->next = ip_masq_mfw_table[hash]; + ip_masq_mfw_table[hash] = mfw; + ip_masq_mod_inc_nent(mmod_self); + + return 0; +} + +/* + * Creates a m-entry (doesn't link it) + */ + +static struct ip_masq_mfw * mfw_new(int fwmark) +{ + struct ip_masq_mfw *mfw; + + mfw = kmalloc(sizeof(*mfw), GFP_KERNEL); + if (mfw == NULL) + goto out; + + MOD_INC_USE_COUNT; + memset(mfw, 0, sizeof(*mfw)); + mfw->fwmark = fwmark; + mfw->lock = RW_LOCK_UNLOCKED; + + INIT_LIST_HEAD(&mfw->hosts); +out: + return mfw; +} + +static void mfw_host_to_user(struct ip_masq_mfw_host *h, struct ip_mfw_user *mu) +{ + mu->raddr = h->addr; + mu->rport = h->port; + mu->fwmark = h->fwmark; + mu->pref = h->pref; +} + +/* + * Creates a m.host (doesn't link it in a m-entry) + */ +static struct ip_masq_mfw_host * mfw_host_new(struct ip_mfw_user *mu) +{ + struct ip_masq_mfw_host * mfw_host; + mfw_host = kmalloc(sizeof (*mfw_host), GFP_KERNEL); + if (!mfw_host) + return NULL; + + MOD_INC_USE_COUNT; + memset(mfw_host, 0, sizeof(*mfw_host)); + mfw_host->addr = mu->raddr; + mfw_host->port = mu->rport; + mfw_host->fwmark = mu->fwmark; + mfw_host->pref = mu->pref; + atomic_set(&mfw_host->pref_cnt, mu->pref); + + return mfw_host; +} + +/* + * Create AND link m.host to m-entry. + * It locks m.lock. + */ +static int mfw_addhost(struct ip_masq_mfw *mfw, struct ip_mfw_user *mu, int attail) +{ + struct ip_masq_mfw_host *mfw_host; + + mfw_host = mfw_host_new(mu); + if (!mfw_host) + return -ENOMEM; + + write_lock_bh(&mfw->lock); + list_add(&mfw_host->list, attail? mfw->hosts.prev : &mfw->hosts); + atomic_inc(&mfw->nhosts); + write_unlock_bh(&mfw->lock); + + return 0; +} + +/* + * Unlink AND destroy m.host(s) from m-entry. + * Wildcard (nul host or addr) ok. + * It uses m.lock. + */ +static int mfw_delhost(struct ip_masq_mfw *mfw, struct ip_mfw_user *mu) +{ + + struct list_head *l,*e; + struct ip_masq_mfw_host *h; + int n_del = 0; + l = &mfw->hosts; + + write_lock_bh(&mfw->lock); + for (e=l->next; e!=l; e=e->next) + { + h = list_entry(e, struct ip_masq_mfw_host, list); + if ((!mu->raddr || h->addr == mu->raddr) && + (!mu->rport || h->port == mu->rport)) { + /* HIT */ + atomic_dec(&mfw->nhosts); + list_del(&h->list); + kfree_s(h, sizeof(*h)); + MOD_DEC_USE_COUNT; + n_del++; + } + + } + write_unlock_bh(&mfw->lock); + return n_del? 0 : -ESRCH; +} + +/* + * Changes m.host parameters + * Wildcards ok + * + * Caller must lock tables. + */ +static int __mfw_edithost(struct ip_masq_mfw *mfw, struct ip_mfw_user *mu) +{ + + struct list_head *l,*e; + struct ip_masq_mfw_host *h; + int n_edit = 0; + l = &mfw->hosts; + + for (e=l->next; e!=l; e=e->next) + { + h = list_entry(e, struct ip_masq_mfw_host, list); + if ((!mu->raddr || h->addr == mu->raddr) && + (!mu->rport || h->port == mu->rport)) { + /* HIT */ + h->pref = mu->pref; + atomic_set(&h->pref_cnt, mu->pref); + n_edit++; + } + + } + return n_edit? 0 : -ESRCH; +} + +/* + * Destroys m-entry. + * Caller must have checked that it doesn't hold any m.host(s) + */ +static void mfw_destroy(struct ip_masq_mfw *mfw) +{ + kfree_s(mfw, sizeof(*mfw)); + MOD_DEC_USE_COUNT; +} + +/* + * Unlink m-entry. + * + * Caller must lock tables. 
+ */ +static int __mfw_del(struct ip_masq_mfw *mfw) +{ + struct ip_masq_mfw **mfw_p; + int ret = -EINVAL; + + + for(mfw_p=&ip_masq_mfw_table[mfw_hash_val(mfw->fwmark)]; + *mfw_p; + mfw_p = &((*mfw_p)->next)) + { + if (mfw==(*mfw_p)) { + *mfw_p = mfw->next; + ip_masq_mod_dec_nent(mmod_self); + ret = 0; + goto out; + } + } +out: + return ret; +} + +/* + * Crude m.host scheduler + * This interface could be exported to allow playing with + * other sched policies. + * + * Caller must lock m-entry. + */ +static struct ip_masq_mfw_host * __mfw_sched(struct ip_masq_mfw *mfw, int force) +{ + struct ip_masq_mfw_host *h = NULL; + + if (atomic_read(&mfw->nhosts) == 0) + goto out; + + /* + * Here resides actual sched policy: + * When pref_cnt touches 0, entry gets shifted to tail and + * its pref_cnt reloaded from h->pref (actual value + * passed from u-space). + * + * Exception is pref==0: avoid scheduling. + */ + + h = list_entry(mfw->hosts.next, struct ip_masq_mfw_host, list); + + if (atomic_read(&mfw->nhosts) <= 1) + goto out; + + if ((h->pref && atomic_dec_and_test(&h->pref_cnt)) || force) { + atomic_set(&h->pref_cnt, h->pref); + list_del(&h->list); + list_add(&h->list, mfw->hosts.prev); + } +out: + return h; +} + +/* + * Main lookup routine. + * HITs fwmark and schedules m.host entries if required + */ +static struct ip_masq_mfw_host * mfw_lookup(int fwmark) +{ + struct ip_masq_mfw *mfw; + struct ip_masq_mfw_host *h = NULL; + + read_lock(&mfw_lock); + mfw = __mfw_get(fwmark); + + if (mfw) { + write_lock(&mfw->lock); + h = __mfw_sched(mfw, 0); + write_unlock(&mfw->lock); + } + + read_unlock(&mfw_lock); + return h; +} + +#ifdef CONFIG_PROC_FS +static int mfw_procinfo(char *buffer, char **start, off_t offset, + int length, int dummy) +{ + struct ip_masq_mfw *mfw; + struct ip_masq_mfw_host *h; + struct list_head *l,*e; + off_t pos=0, begin; + char temp[129]; + int idx = 0; + int len=0; + + MOD_INC_USE_COUNT; + + IP_MASQ_DEBUG(1-debug, "Entered mfw_info\n"); + + if (offset < 64) + { + sprintf(temp, "FwMark > RAddr RPort PrCnt Pref"); + len = sprintf(buffer, "%-63s\n", temp); + } + pos = 64; + + for(idx = 0; idx < IP_MASQ_MFW_HSIZE; idx++) + { + read_lock(&mfw_lock); + for(mfw = ip_masq_mfw_table[idx]; mfw ; mfw = mfw->next) + { + read_lock_bh(&mfw->lock); + l=&mfw->hosts; + + for(e=l->next;l!=e;e=e->next) { + h = list_entry(e, struct ip_masq_mfw_host, list); + pos += 64; + if (pos <= offset) { + len = 0; + continue; + } + + sprintf(temp,"0x%x > %08lX %5u %5d %5d", + h->fwmark, + ntohl(h->addr), ntohs(h->port), + atomic_read(&h->pref_cnt), h->pref); + len += sprintf(buffer+len, "%-63s\n", temp); + + if(len >= length) { + read_unlock_bh(&mfw->lock); + read_unlock(&mfw_lock); + goto done; + } + } + read_unlock_bh(&mfw->lock); + } + read_unlock(&mfw_lock); + } + +done: + + if (len) { + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; + } + if(len>length) + len = length; + MOD_DEC_USE_COUNT; + return len; +} +static struct proc_dir_entry mfw_proc_entry = { +/* 0, 0, NULL", */ + 0, 3, "mfw", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + mfw_procinfo +}; + +#define proc_ent &mfw_proc_entry +#else /* !CONFIG_PROC_FS */ + +#define proc_ent NULL +#endif + + +static void mfw_flush(void) +{ + struct ip_masq_mfw *mfw, *local_table[IP_MASQ_MFW_HSIZE]; + struct ip_masq_mfw_host *h; + struct ip_masq_mfw *mfw_next; + int idx; + struct list_head *l,*e; + + write_lock_bh(&mfw_lock); + memcpy(local_table, ip_masq_mfw_table, sizeof ip_masq_mfw_table); + memset(ip_masq_mfw_table, 
0, sizeof ip_masq_mfw_table); + write_unlock_bh(&mfw_lock); + + /* + * For every hash table row ... + */ + for(idx=0;idx<IP_MASQ_MFW_HSIZE;idx++) { + + /* + * For every m-entry in row ... + */ + for(mfw=local_table[idx];mfw;mfw=mfw_next) { + /* + * For every m.host in m-entry ... + */ + l=&mfw->hosts; + while((e=l->next) != l) { + h = list_entry(e, struct ip_masq_mfw_host, list); + atomic_dec(&mfw->nhosts); + list_del(&h->list); + kfree_s(h, sizeof(*h)); + MOD_DEC_USE_COUNT; + } + + if (atomic_read(&mfw->nhosts)) { + IP_MASQ_ERR("mfw_flush(): after flushing row nhosts=%d\n", + atomic_read(&mfw->nhosts)); + } + mfw_next = mfw->next; + kfree_s(mfw, sizeof(*mfw)); + MOD_DEC_USE_COUNT; + ip_masq_mod_dec_nent(mmod_self); + } + } +} + +/* + * User space control entry point + */ +static int mfw_ctl(int optname, struct ip_masq_ctl *mctl, int optlen) +{ + struct ip_mfw_user *mu = &mctl->u.mfw_user; + struct ip_masq_mfw *mfw; + int ret = EINVAL; + int arglen = optlen - IP_MASQ_CTL_BSIZE; + int cmd; + + + IP_MASQ_DEBUG(1-debug, "ip_masq_user_ctl(len=%d/%d|%d/%d)\n", + arglen, + sizeof (*mu), + optlen, + sizeof (*mctl)); + + /* + * checks ... + */ + if (arglen != sizeof(*mu) && optlen != sizeof(*mctl)) + return -EINVAL; + + /* + * Don't trust the lusers - plenty of error checking! + */ + cmd = mctl->m_cmd; + IP_MASQ_DEBUG(1-debug, "ip_masq_mfw_ctl(cmd=%d, fwmark=%d)\n", + cmd, mu->fwmark); + + + switch(cmd) { + case IP_MASQ_CMD_NONE: + return 0; + case IP_MASQ_CMD_FLUSH: + break; + case IP_MASQ_CMD_ADD: + case IP_MASQ_CMD_INSERT: + case IP_MASQ_CMD_SET: + if (mu->fwmark == 0) { + IP_MASQ_DEBUG(1-debug, "invalid fwmark==0\n"); + return -EINVAL; + } + if (mu->pref < 0) { + IP_MASQ_DEBUG(1-debug, "invalid pref==%d\n", + mu->pref); + return -EINVAL; + } + break; + } + + + ret = -EINVAL; + + switch(cmd) { + case IP_MASQ_CMD_ADD: + case IP_MASQ_CMD_INSERT: + if (!mu->raddr) { + IP_MASQ_DEBUG(0-debug, "ip_masq_mfw_ctl(ADD): invalid redirect 0x%x:%d\n", + mu->raddr, mu->rport); + goto out; + } + + /* + * Cannot just use mfw_lock because below + * are allocations that can sleep; so + * to assure "new entry" atomic creation + * I use a semaphore. + * + */ + down(&mfw_sema); + + read_lock(&mfw_lock); + mfw = __mfw_get(mu->fwmark); + read_unlock(&mfw_lock); + + /* + * If first host, create m-entry + */ + if (mfw == NULL) { + mfw = mfw_new(mu->fwmark); + if (mfw == NULL) + ret = -ENOMEM; + } + + if (mfw) { + /* + * Put m.host in m-entry. + */ + ret = mfw_addhost(mfw, mu, cmd == IP_MASQ_CMD_ADD); + + /* + * If first host, link m-entry to hash table. + * Already protected by global lock. + */ + if (ret == 0 && atomic_read(&mfw->nhosts) == 1) { + write_lock_bh(&mfw_lock); + __mfw_add(mfw); + write_unlock_bh(&mfw_lock); + } + if (atomic_read(&mfw->nhosts) == 0) { + mfw_destroy(mfw); + } + } + + up(&mfw_sema); + + break; + + case IP_MASQ_CMD_DEL: + down(&mfw_sema); + + read_lock(&mfw_lock); + mfw = __mfw_get(mu->fwmark); + read_unlock(&mfw_lock); + + if (mfw) { + ret = mfw_delhost(mfw, mu); + + /* + * Last lease will free + * XXX check logic XXX + */ + if (atomic_read(&mfw->nhosts) == 0) { + write_lock_bh(&mfw_lock); + __mfw_del(mfw); + write_unlock_bh(&mfw_lock); + mfw_destroy(mfw); + } + } else + ret = -ESRCH; + + up(&mfw_sema); + break; + case IP_MASQ_CMD_FLUSH: + + down(&mfw_sema); + mfw_flush(); + up(&mfw_sema); + ret = 0; + break; + case IP_MASQ_CMD_SET: + /* + * No need to semaphorize here, main list is not + * modified. 
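+ *
+ * With IP_MASQ_MFW_SCHED set, a scheduling pass is forced via
+ * __mfw_sched(mfw, 1) and the selected host is copied back to
+ * user space; otherwise the matching hosts are edited in place.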
+ */ + read_lock(&mfw_lock); + + mfw = __mfw_get(mu->fwmark); + if (mfw) { + write_lock_bh(&mfw->lock); + + if (mu->flags & IP_MASQ_MFW_SCHED) { + struct ip_masq_mfw_host *h; + if ((h=__mfw_sched(mfw, 1))) { + mfw_host_to_user(h, mu); + ret = 0; + } + } else { + ret = __mfw_edithost(mfw, mu); + } + + write_unlock_bh(&mfw->lock); + } + + read_unlock(&mfw_lock); + break; + } +out: + + return ret; +} + +/* + * Module stubs called from ip_masq core module + */ + +/* + * Input rule stub, called very early for each incoming packet, + * to see if this module has "interest" in packet. + */ +static int mfw_in_rule(const struct sk_buff *skb, const struct iphdr *iph) +{ + int val; + read_lock(&mfw_lock); + val = ( __mfw_get(skb->fwmark) != 0); + read_unlock(&mfw_lock); + return val; +} + +/* + * Input-create stub, called to allow "custom" masq creation + */ +static struct ip_masq * mfw_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr) +{ + union ip_masq_tphdr tph; + struct ip_masq *ms = NULL; + struct ip_masq_mfw_host *h = NULL; + + tph.raw = (char*) iph + iph->ihl * 4; + + switch (iph->protocol) { + case IPPROTO_TCP: + /* + * Only open TCP tunnel if SYN+!ACK packet + */ + if (!tph.th->syn && tph.th->ack) + return NULL; + case IPPROTO_UDP: + break; + default: + return NULL; + } + + /* + * If no entry exists in the masquerading table + * and the port is involved + * in port forwarding, create a new masq entry + */ + + if ((h=mfw_lookup(skb->fwmark))) { + ms = ip_masq_new(iph->protocol, + iph->daddr, tph.portp[1], + /* if no redir-port, use packet dest port */ + h->addr, h->port? h->port : tph.portp[1], + iph->saddr, tph.portp[0], + 0); + + if (ms != NULL) + ip_masq_listen(ms); + } + return ms; +} + + +#define mfw_in_update NULL +#define mfw_out_rule NULL +#define mfw_out_create NULL +#define mfw_out_update NULL + +static struct ip_masq_mod mfw_mod = { + NULL, /* next */ + NULL, /* next_reg */ + "mfw", /* name */ + ATOMIC_INIT(0), /* nent */ + ATOMIC_INIT(0), /* refcnt */ + proc_ent, + mfw_ctl, + NULL, /* masq_mod_init */ + NULL, /* masq_mod_done */ + mfw_in_rule, + mfw_in_update, + mfw_in_create, + mfw_out_rule, + mfw_out_update, + mfw_out_create, +}; + + +__initfunc(int ip_mfw_init(void)) +{ + return register_ip_masq_mod ((mmod_self=&mfw_mod)); +} + +int ip_mfw_done(void) +{ + return unregister_ip_masq_mod(&mfw_mod); +} + +#ifdef MODULE +EXPORT_NO_SYMBOLS; + +int init_module(void) +{ + if (ip_mfw_init() != 0) + return -EIO; + return 0; +} + +void cleanup_module(void) +{ + if (ip_mfw_done() != 0) + printk(KERN_INFO "can't remove module"); +} + +#endif /* MODULE */ diff --git a/pfinet/linux-src/net/ipv4/ip_masq_mod.c b/pfinet/linux-src/net/ipv4/ip_masq_mod.c new file mode 100644 index 00000000..b99502f3 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_masq_mod.c @@ -0,0 +1,322 @@ +/* + * IP_MASQ_MOD masq modules support + * + * + * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> + * + * $Id: ip_masq_mod.c,v 1.5.2.1 1999/07/02 10:10:03 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Cyrus Durgin: fixed kerneld stuff for kmod. 
+ */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <net/ip_masq.h> +#include <net/ip_masq_mod.h> + +#include <linux/ip_masq.h> +#ifdef CONFIG_KMOD +#include <linux/kmod.h> +#endif + +EXPORT_SYMBOL(register_ip_masq_mod); +EXPORT_SYMBOL(unregister_ip_masq_mod); +EXPORT_SYMBOL(ip_masq_mod_lkp_link); +EXPORT_SYMBOL(ip_masq_mod_lkp_unlink); + +static spinlock_t masq_mod_lock = SPIN_LOCK_UNLOCKED; + +/* + * Base pointer for registered modules + */ +struct ip_masq_mod * ip_masq_mod_reg_base = NULL; + +/* + * Base pointer for lookup (subset of above, a module could be + * registered, but it could have no active rule); will avoid + * unnecessary lookups. + */ +struct ip_masq_mod * ip_masq_mod_lkp_base = NULL; + +int ip_masq_mod_register_proc(struct ip_masq_mod *mmod) +{ +#ifdef CONFIG_PROC_FS + int ret; + + struct proc_dir_entry *ent = mmod->mmod_proc_ent; + + if (!ent) + return 0; + if (!ent->name) { + ent->name = mmod->mmod_name; + ent->namelen = strlen (mmod->mmod_name); + } + ret = ip_masq_proc_register(ent); + if (ret) mmod->mmod_proc_ent = NULL; + + return ret; +#else + return 0; +#endif +} + +void ip_masq_mod_unregister_proc(struct ip_masq_mod *mmod) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *ent = mmod->mmod_proc_ent; + if (!ent) + return; + ip_masq_proc_unregister(ent); +#endif +} + +/* + * Link/unlink object for lookups + */ + +int ip_masq_mod_lkp_unlink(struct ip_masq_mod *mmod) +{ + struct ip_masq_mod **mmod_p; + + write_lock_bh(&masq_mod_lock); + + for (mmod_p = &ip_masq_mod_lkp_base; *mmod_p ; mmod_p = &(*mmod_p)->next) + if (mmod == (*mmod_p)) { + *mmod_p = mmod->next; + mmod->next = NULL; + write_unlock_bh(&masq_mod_lock); + return 0; + } + + write_unlock_bh(&masq_mod_lock); + return -EINVAL; +} + +int ip_masq_mod_lkp_link(struct ip_masq_mod *mmod) +{ + write_lock_bh(&masq_mod_lock); + + mmod->next = ip_masq_mod_lkp_base; + ip_masq_mod_lkp_base=mmod; + + write_unlock_bh(&masq_mod_lock); + return 0; +} + +int register_ip_masq_mod(struct ip_masq_mod *mmod) +{ + if (!mmod) { + IP_MASQ_ERR("register_ip_masq_mod(): NULL arg\n"); + return -EINVAL; + } + if (!mmod->mmod_name) { + IP_MASQ_ERR("register_ip_masq_mod(): NULL mmod_name\n"); + return -EINVAL; + } + ip_masq_mod_register_proc(mmod); + + mmod->next_reg = ip_masq_mod_reg_base; + ip_masq_mod_reg_base=mmod; + + return 0; +} + +int unregister_ip_masq_mod(struct ip_masq_mod *mmod) +{ + struct ip_masq_mod **mmod_p; + + if (!mmod) { + IP_MASQ_ERR( "unregister_ip_masq_mod(): NULL arg\n"); + return -EINVAL; + } + + /* + * Only allow unregistration if it is not referenced + */ + if (atomic_read(&mmod->refcnt)) { + IP_MASQ_ERR( "unregister_ip_masq_mod(): is in use by %d guys. 
failed\n", + atomic_read(&mmod->refcnt)); + return -EINVAL; + } + + /* + * Must be already unlinked from lookup list + */ + if (mmod->next) { + IP_MASQ_WARNING("MASQ: unregistering \"%s\" while in lookup list.fixed.", + mmod->mmod_name); + ip_masq_mod_lkp_unlink(mmod); + } + + for (mmod_p = &ip_masq_mod_reg_base; *mmod_p ; mmod_p = &(*mmod_p)->next_reg) + if (mmod == (*mmod_p)) { + ip_masq_mod_unregister_proc(mmod); + *mmod_p = mmod->next_reg; + return 0; + } + + IP_MASQ_ERR("unregister_ip_masq_mod(%s): not linked \n", mmod->mmod_name); + return -EINVAL; +} + +int ip_masq_mod_in_rule(const struct sk_buff *skb, const struct iphdr *iph) +{ + struct ip_masq_mod *mmod; + int ret = IP_MASQ_MOD_NOP; + + for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) { + if (!mmod->mmod_in_rule) continue; + switch (ret=mmod->mmod_in_rule(skb, iph)) { + case IP_MASQ_MOD_NOP: + continue; + case IP_MASQ_MOD_ACCEPT: + case IP_MASQ_MOD_REJECT: + goto out; + } + } +out: + return ret; +} + +int ip_masq_mod_out_rule(const struct sk_buff *skb, const struct iphdr *iph) +{ + struct ip_masq_mod *mmod; + int ret = IP_MASQ_MOD_NOP; + + for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) { + if (!mmod->mmod_out_rule) continue; + switch (ret=mmod->mmod_out_rule(skb, iph)) { + case IP_MASQ_MOD_NOP: + continue; + case IP_MASQ_MOD_ACCEPT: + case IP_MASQ_MOD_REJECT: + goto out; + } + } +out: + return ret; +} + +struct ip_masq * ip_masq_mod_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr) +{ + struct ip_masq_mod *mmod; + struct ip_masq *ms = NULL; + + for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) { + if (!mmod->mmod_in_create) continue; + if ((ms=mmod->mmod_in_create(skb, iph, maddr))) { + goto out; + } + } +out: + return ms; +} + +struct ip_masq * ip_masq_mod_out_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr) +{ + struct ip_masq_mod *mmod; + struct ip_masq *ms = NULL; + + for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) { + if (!mmod->mmod_out_create) continue; + if ((ms=mmod->mmod_out_create(skb, iph, maddr))) { + goto out; + } + } +out: + return ms; +} + +int ip_masq_mod_in_update(const struct sk_buff *skb, const struct iphdr *iph, struct ip_masq *ms) +{ + struct ip_masq_mod *mmod; + int ret = IP_MASQ_MOD_NOP; + + for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) { + if (!mmod->mmod_in_update) continue; + switch (ret=mmod->mmod_in_update(skb, iph, ms)) { + case IP_MASQ_MOD_NOP: + continue; + case IP_MASQ_MOD_ACCEPT: + case IP_MASQ_MOD_REJECT: + goto out; + } + } +out: + return ret; +} + +int ip_masq_mod_out_update(const struct sk_buff *skb, const struct iphdr *iph, struct ip_masq *ms) +{ + struct ip_masq_mod *mmod; + int ret = IP_MASQ_MOD_NOP; + + for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) { + if (!mmod->mmod_out_update) continue; + switch (ret=mmod->mmod_out_update(skb, iph, ms)) { + case IP_MASQ_MOD_NOP: + continue; + case IP_MASQ_MOD_ACCEPT: + case IP_MASQ_MOD_REJECT: + goto out; + } + } +out: + return ret; +} + +struct ip_masq_mod * ip_masq_mod_getbyname(const char *mmod_name) +{ + struct ip_masq_mod * mmod; + + IP_MASQ_DEBUG(1, "searching mmod_name \"%s\"\n", mmod_name); + + for (mmod=ip_masq_mod_reg_base; mmod ; mmod=mmod->next_reg) { + if (mmod->mmod_ctl && *(mmod_name) + && (strcmp(mmod_name, mmod->mmod_name)==0)) { + /* HIT */ + return mmod; + } + } + return NULL; +} + +/* + * Module control entry + */ +int ip_masq_mod_ctl(int optname, struct ip_masq_ctl *mctl, int optlen) +{ + struct ip_masq_mod * mmod; +#ifdef CONFIG_KMOD + char 
kmod_name[IP_MASQ_TNAME_MAX+8]; +#endif + /* tappo */ + mctl->m_tname[IP_MASQ_TNAME_MAX-1] = 0; + + mmod = ip_masq_mod_getbyname(mctl->m_tname); + if (mmod) + return mmod->mmod_ctl(optname, mctl, optlen); +#ifdef CONFIG_KMOD + sprintf(kmod_name,"ip_masq_%s", mctl->m_tname); + + IP_MASQ_DEBUG(1, "About to request \"%s\" module\n", kmod_name); + + /* + * Let sleep for a while ... + */ + request_module(kmod_name); + mmod = ip_masq_mod_getbyname(mctl->m_tname); + if (mmod) + return mmod->mmod_ctl(optname, mctl, optlen); +#endif + return ESRCH; +} diff --git a/pfinet/linux-src/net/ipv4/ip_masq_portfw.c b/pfinet/linux-src/net/ipv4/ip_masq_portfw.c new file mode 100644 index 00000000..6c697a10 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_masq_portfw.c @@ -0,0 +1,508 @@ +/* + * IP_MASQ_PORTFW masquerading module + * + * + * $Id: ip_masq_portfw.c,v 1.3.2.1 1999/07/02 10:10:02 davem Exp $ + * + * Author: Steven Clarke <steven.clarke@monmouth.demon.co.uk> + * + * Fixes: + * Juan Jose Ciarlante : created this new file from ip_masq.c and ip_fw.c + * Juan Jose Ciarlante : modularized + * Juan Jose Ciarlante : use GFP_KERNEL + * Juan Jose Ciarlante : locking + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/list.h> +#include <net/ip.h> +#include <linux/ip_fw.h> +#include <linux/ip_masq.h> +#include <net/ip_masq.h> +#include <net/ip_masq_mod.h> +#include <linux/proc_fs.h> +#include <linux/init.h> + +#define IP_PORTFW_PORT_MIN 1 +#define IP_PORTFW_PORT_MAX 60999 + +struct ip_portfw { + struct list_head list; + __u32 laddr, raddr; + __u16 lport, rport; + atomic_t pref_cnt; /* pref "counter" down to 0 */ + int pref; /* user set pref */ +}; + +static struct ip_masq_mod *mmod_self = NULL; +/* + * Debug level + */ +#ifdef CONFIG_IP_MASQ_DEBUG +static int debug=0; +MODULE_PARM(debug, "i"); +#endif + +/* + * Lock + */ +static rwlock_t portfw_lock = RW_LOCK_UNLOCKED; + +static struct list_head portfw_list[2]; +static __inline__ int portfw_idx(int protocol) +{ + return (protocol==IPPROTO_TCP); +} + +/* + * + * Delete forwarding entry(s): + * called from _DEL, u-space. + * . "relaxed" match, except for lport + * + */ + +static __inline__ int ip_portfw_del(__u16 protocol, __u16 lport, __u32 laddr, __u16 rport, __u32 raddr) +{ + int prot = portfw_idx(protocol); + struct ip_portfw *n; + struct list_head *entry; + struct list_head *list = &portfw_list[prot]; + int nent; + + nent = atomic_read(&mmod_self->mmod_nent); + + write_lock_bh(&portfw_lock); + + for (entry=list->next;entry != list;entry = entry->next) { + n = list_entry(entry, struct ip_portfw, list); + if (n->lport == lport && + (!laddr || n->laddr == laddr) && + (!raddr || n->raddr == raddr) && + (!rport || n->rport == rport)) { + list_del(entry); + ip_masq_mod_dec_nent(mmod_self); + kfree_s(n, sizeof(struct ip_portfw)); + MOD_DEC_USE_COUNT; + } + } + write_unlock_bh(&portfw_lock); + + return nent==atomic_read(&mmod_self->mmod_nent)? ESRCH : 0; +} + +/* + * Flush tables + * called from _FLUSH, u-space. 
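+ * Unlinks and frees every entry in both the UDP and TCP lists,
+ * dropping one module reference per entry.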
+ */ +static __inline__ void ip_portfw_flush(void) +{ + int prot; + struct list_head *l; + struct list_head *e; + struct ip_portfw *n; + + write_lock_bh(&portfw_lock); + + for (prot = 0; prot < 2;prot++) { + l = &portfw_list[prot]; + while((e=l->next) != l) { + ip_masq_mod_dec_nent(mmod_self); + n = list_entry (e, struct ip_portfw, list); + list_del(e); + kfree_s(n, sizeof (*n)); + MOD_DEC_USE_COUNT; + } + } + + write_unlock_bh(&portfw_lock); +} + +/* + * Lookup routine for lport,laddr match + * must be called with locked tables + */ +static __inline__ struct ip_portfw *ip_portfw_lookup(__u16 protocol, __u16 lport, __u32 laddr, __u32 *daddr_p, __u16 *dport_p) +{ + int prot = portfw_idx(protocol); + + struct ip_portfw *n = NULL; + struct list_head *l, *e; + + l = &portfw_list[prot]; + + for (e=l->next;e!=l;e=e->next) { + n = list_entry(e, struct ip_portfw, list); + if (lport == n->lport && laddr == n->laddr) { + /* Please be nice, don't pass only a NULL dport */ + if (daddr_p) { + *daddr_p = n->raddr; + *dport_p = n->rport; + } + + goto out; + } + } + n = NULL; +out: + return n; +} + +/* + * Edit routine for lport,[laddr], [raddr], [rport] match + * By now, only called from u-space + */ +static __inline__ int ip_portfw_edit(__u16 protocol, __u16 lport, __u32 laddr, __u16 rport, __u32 raddr, int pref) +{ + int prot = portfw_idx(protocol); + + struct ip_portfw *n = NULL; + struct list_head *l, *e; + int count = 0; + + + read_lock_bh(&portfw_lock); + + l = &portfw_list[prot]; + + for (e=l->next;e!=l;e=e->next) { + n = list_entry(e, struct ip_portfw, list); + if (lport == n->lport && + (!laddr || laddr == n->laddr) && + (!rport || rport == n->rport) && + (!raddr || raddr == n->raddr)) { + n->pref = pref; + atomic_set(&n->pref_cnt, pref); + count++; + } + } + + read_unlock_bh(&portfw_lock); + + return count; +} + +/* + * Add/edit en entry + * called from _ADD, u-space. + * must return 0 or +errno + */ +static __inline__ int ip_portfw_add(__u16 protocol, __u16 lport, __u32 laddr, __u16 rport, __u32 raddr, int pref) +{ + struct ip_portfw *npf; + int prot = portfw_idx(protocol); + + if (pref <= 0) + return EINVAL; + + if (ip_portfw_edit(protocol, lport, laddr, rport, raddr, pref)) { + /* + * Edit ok ... + */ + return 0; + } + + /* may block ... */ + npf = (struct ip_portfw*) kmalloc(sizeof(struct ip_portfw), GFP_KERNEL); + + if (!npf) + return ENOMEM; + + MOD_INC_USE_COUNT; + memset(npf, 0, sizeof(*npf)); + + npf->laddr = laddr; + npf->lport = lport; + npf->rport = rport; + npf->raddr = raddr; + npf->pref = pref; + + atomic_set(&npf->pref_cnt, npf->pref); + INIT_LIST_HEAD(&npf->list); + + write_lock_bh(&portfw_lock); + + /* + * Add at head + */ + list_add(&npf->list, &portfw_list[prot]); + + write_unlock_bh(&portfw_lock); + + ip_masq_mod_inc_nent(mmod_self); + return 0; +} + + + +static __inline__ int portfw_ctl(int optname, struct ip_masq_ctl *mctl, int optlen) +{ + struct ip_portfw_user *mm = &mctl->u.portfw_user; + int ret = EINVAL; + int arglen = optlen - IP_MASQ_CTL_BSIZE; + int cmd; + + + IP_MASQ_DEBUG(1-debug, "ip_masq_user_ctl(len=%d/%d|%d/%d)\n", + arglen, + sizeof (*mm), + optlen, + sizeof (*mctl)); + + /* + * Yes, I'm a bad guy ... + */ + if (arglen != sizeof(*mm) && optlen != sizeof(*mctl)) + return EINVAL; + + /* + * Don't trust the lusers - plenty of error checking! 
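+ *
+ * For ADD/DEL the local port must lie within
+ * IP_PORTFW_PORT_MIN..IP_PORTFW_PORT_MAX and the protocol must be
+ * TCP or UDP; FLUSH skips these checks.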
+ */ + cmd = mctl->m_cmd; + IP_MASQ_DEBUG(1-debug, "ip_masq_portfw_ctl(cmd=%d)\n", cmd); + + + switch (cmd) { + case IP_MASQ_CMD_NONE: + return 0; + case IP_MASQ_CMD_FLUSH: + break; + default: + if (htons(mm->lport) < IP_PORTFW_PORT_MIN || htons(mm->lport) > IP_PORTFW_PORT_MAX) + return EINVAL; + + if (mm->protocol!=IPPROTO_TCP && mm->protocol!=IPPROTO_UDP) + return EINVAL; + } + + switch(cmd) { + case IP_MASQ_CMD_ADD: + ret = ip_portfw_add(mm->protocol, + mm->lport, mm->laddr, + mm->rport, mm->raddr, + mm->pref); + break; + + case IP_MASQ_CMD_DEL: + ret = ip_portfw_del(mm->protocol, + mm->lport, mm->laddr, + mm->rport, mm->raddr); + break; + case IP_MASQ_CMD_FLUSH: + ip_portfw_flush(); + ret = 0; + break; + } + + + return ret; +} + + + + +#ifdef CONFIG_PROC_FS + +static int portfw_procinfo(char *buffer, char **start, off_t offset, + int length, int unused) +{ + off_t pos=0, begin; + struct ip_portfw *pf; + struct list_head *l, *e; + char temp[65]; + int ind; + int len=0; + + + if (offset < 64) + { + sprintf(temp, "Prot LAddr LPort > RAddr RPort PrCnt Pref"); + len = sprintf(buffer, "%-63s\n", temp); + } + pos = 64; + + read_lock_bh(&portfw_lock); + + for(ind = 0; ind < 2; ind++) + { + l = &portfw_list[ind]; + for (e=l->next; e!=l; e=e->next) + { + pf = list_entry(e, struct ip_portfw, list); + pos += 64; + if (pos <= offset) { + len = 0; + continue; + } + + sprintf(temp,"%s %08lX %5u > %08lX %5u %5d %5d", + ind ? "TCP" : "UDP", + ntohl(pf->laddr), ntohs(pf->lport), + ntohl(pf->raddr), ntohs(pf->rport), + atomic_read(&pf->pref_cnt), pf->pref); + len += sprintf(buffer+len, "%-63s\n", temp); + + if (len >= length) + goto done; + } + } +done: + read_unlock_bh(&portfw_lock); + + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; + if(len>length) + len = length; + return len; +} + +static struct proc_dir_entry portfw_proc_entry = { +/* 0, 0, NULL", */ + 0, 6, "portfw", /* Just for compatibility, for now ... */ + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + portfw_procinfo +}; + +#define proc_ent &portfw_proc_entry +#else /* !CONFIG_PROC_FS */ + +#define proc_ent NULL +#endif + +static int portfw_in_rule(const struct sk_buff *skb, const struct iphdr *iph) +{ + const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); +#ifdef CONFIG_IP_MASQ_DEBUG + struct rtable *rt = (struct rtable *)skb->dst; +#endif + struct ip_portfw *pfw; + + IP_MASQ_DEBUG(2, "portfw_in_rule(): skb:= dev=%s (index=%d), rt_iif=%d, rt_flags=0x%x rt_dev___=%s daddr=%d.%d.%d.%d dport=%d\n", + skb->dev->name, skb->dev->ifindex, rt->rt_iif, rt->rt_flags, + rt->u.dst.dev->name, + NIPQUAD(iph->daddr), ntohs(portp[1])); + + read_lock(&portfw_lock); + pfw = ip_portfw_lookup(iph->protocol, portp[1], iph->daddr, NULL, NULL); + read_unlock(&portfw_lock); + return (pfw!=0); +} + +static struct ip_masq * portfw_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr) +{ + /* + * If no entry exists in the masquerading table + * and the port is involved + * in port forwarding, create a new masq entry + */ + + __u32 raddr; + __u16 rport; + const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); + struct ip_masq *ms = NULL; + struct ip_portfw *pf; + + /* + * Lock for writing. 
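+ * A write lock is needed (rather than a read lock) because the
+ * round-robin step below may re-link the matched entry at the
+ * tail of its list.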
+ */ + write_lock(&portfw_lock); + + if ((pf=ip_portfw_lookup(iph->protocol, + portp[1], iph->daddr, + &raddr, &rport))) { + ms = ip_masq_new(iph->protocol, + iph->daddr, portp[1], + raddr, rport, + iph->saddr, portp[0], + 0); + ip_masq_listen(ms); + + if (!ms || atomic_read(&mmod_self->mmod_nent) <= 1 + /* || ip_masq_nlocks(&portfw_lock) != 1 */ ) + /* + * Maybe later... + */ + goto out; + + /* + * Entry created, lock==1. + * if pref_cnt == 0, move + * entry at _tail_. + * This is a simple load balance scheduling + */ + + if (atomic_dec_and_test(&pf->pref_cnt)) { + + atomic_set(&pf->pref_cnt, pf->pref); + list_del(&pf->list); + list_add(&pf->list, + portfw_list[portfw_idx(iph->protocol)].prev); + + } + } +out: + write_unlock(&portfw_lock); + return ms; +} + +#define portfw_in_update NULL +#define portfw_out_rule NULL +#define portfw_out_create NULL +#define portfw_out_update NULL + +static struct ip_masq_mod portfw_mod = { + NULL, /* next */ + NULL, /* next_reg */ + "portfw", /* name */ + ATOMIC_INIT(0), /* nent */ + ATOMIC_INIT(0), /* refcnt */ + proc_ent, + portfw_ctl, + NULL, /* masq_mod_init */ + NULL, /* masq_mod_done */ + portfw_in_rule, + portfw_in_update, + portfw_in_create, + portfw_out_rule, + portfw_out_update, + portfw_out_create, +}; + + + +__initfunc(int ip_portfw_init(void)) +{ + INIT_LIST_HEAD(&portfw_list[0]); + INIT_LIST_HEAD(&portfw_list[1]); + return register_ip_masq_mod ((mmod_self=&portfw_mod)); +} + +int ip_portfw_done(void) +{ + return unregister_ip_masq_mod(&portfw_mod); +} + +#ifdef MODULE +EXPORT_NO_SYMBOLS; + +int init_module(void) +{ + if (ip_portfw_init() != 0) + return -EIO; + return 0; +} + +void cleanup_module(void) +{ + if (ip_portfw_done() != 0) + printk(KERN_INFO "ip_portfw_done(): can't remove module"); +} + +#endif /* MODULE */ diff --git a/pfinet/linux-src/net/ipv4/ip_masq_quake.c b/pfinet/linux-src/net/ipv4/ip_masq_quake.c new file mode 100644 index 00000000..995c3a0a --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_masq_quake.c @@ -0,0 +1,322 @@ +/* + * IP_MASQ_QUAKE quake masquerading module + * + * + * Version: @(#)ip_masq_quake.c 0.02 22/02/97 + * + * Author: Harald Hoyer mailto:HarryH@Royal.Net + * + * + * Fixes: + * Harald Hoyer : Unofficial Quake Specs found at + * http://www.gamers.org/dEngine/quake/spec/ + * Harald Hoyer : Check for QUAKE-STRING + * Juan Jose Ciarlante : litl bits for 2.1 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + */ + +#include <linux/module.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <net/udp.h> +#include <net/ip_masq.h> + +#define DEBUG_CONFIG_IP_MASQ_QUAKE 0 + +typedef struct +{ + __u16 type; // (Little Endian) Type of message. + __u16 length; // (Little Endian) Length of message, header included. + char message[0]; // The contents of the message. 
+} QUAKEHEADER; + +struct quake_priv_data { + /* Have we seen a client connect message */ + signed char cl_connect; +}; + +static int +masq_quake_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_INC_USE_COUNT; + if ((ms->app_data = kmalloc(sizeof(struct quake_priv_data), + GFP_ATOMIC)) == NULL) + printk(KERN_INFO "Quake: No memory for application data\n"); + else + { + struct quake_priv_data *priv = + (struct quake_priv_data *)ms->app_data; + priv->cl_connect = 0; + } + return 0; +} + +static int +masq_quake_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_DEC_USE_COUNT; + if (ms->app_data) + kfree_s(ms->app_data, sizeof(struct quake_priv_data)); + return 0; +} + +int +masq_quake_in (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct udphdr *uh; + QUAKEHEADER *qh; + __u16 udp_port; + char *data; + unsigned char code; + struct quake_priv_data *priv = (struct quake_priv_data *)ms->app_data; + + if(priv->cl_connect == -1) + return 0; + + skb = *skb_p; + + iph = skb->nh.iph; + uh = (struct udphdr *)&(((char *)iph)[iph->ihl*4]); + + /* Check for lenght */ + if(ntohs(uh->len) < 5) + return 0; + + qh = (QUAKEHEADER *)&uh[1]; + + if(qh->type != 0x0080) + return 0; + + + code = qh->message[0]; + +#if DEBUG_CONFIG_IP_MASQ_QUAKE + printk("Quake_in: code = %d \n", (int)code); +#endif + + switch(code) { + case 0x01: + /* Connection Request */ + + if(ntohs(qh->length) < 0x0c) { +#if DEBUG_CONFIG_IP_MASQ_QUAKE + printk("Quake_in: length < 0xc \n"); +#endif + return 0; + } + + data = &qh->message[1]; + + /* Check for stomping string */ + if(memcmp(data,"QUAKE\0\3",7)) { +#if DEBUG_CONFIG_IP_MASQ_QUAKE + printk("Quake_out: memcmp failed \n"); +#endif + return 0; + } + else { + priv->cl_connect = 1; +#if DEBUG_CONFIG_IP_MASQ_QUAKE + printk("Quake_out: memcmp ok \n"); +#endif + } + break; + + case 0x81: + /* Accept Connection */ + if((ntohs(qh->length) < 0x09) || (priv->cl_connect == 0)) + return 0; + data = &qh->message[1]; + + memcpy(&udp_port, data, 2); + + ms->dport = htons(udp_port); + +#if DEBUG_CONFIG_IP_MASQ_QUAKE + printk("Quake_in: in_rewrote UDP port %d \n", udp_port); +#endif + priv->cl_connect = -1; + + break; + } + + return 0; +} + +int +masq_quake_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct udphdr *uh; + QUAKEHEADER *qh; + __u16 udp_port; + char *data; + unsigned char code; + struct ip_masq *n_ms; + struct quake_priv_data *priv = (struct quake_priv_data *)ms->app_data; + + if(priv->cl_connect == -1) + return 0; + + skb = *skb_p; + + iph = skb->nh.iph; + uh = (struct udphdr *)&(((char *)iph)[iph->ihl*4]); + + /* Check for lenght */ + if(ntohs(uh->len) < 5) + return 0; + + qh = (QUAKEHEADER *)&uh[1]; + +#if DEBUG_CONFIG_IP_MASQ_QUAKE + printk("Quake_out: qh->type = %d \n", (int)qh->type); +#endif + + if(qh->type != 0x0080) + return 0; + + code = qh->message[0]; + +#if DEBUG_CONFIG_IP_MASQ_QUAKE + printk("Quake_out: code = %d \n", (int)code); +#endif + + switch(code) { + case 0x01: + /* Connection Request */ + + if(ntohs(qh->length) < 0x0c) { +#if DEBUG_CONFIG_IP_MASQ_QUAKE + printk("Quake_out: length < 0xc \n"); +#endif + return 0; + } + + data = &qh->message[1]; + + /* Check for stomping string */ + if(memcmp(data,"QUAKE\0\3",7)) { +#if DEBUG_CONFIG_IP_MASQ_QUAKE + printk("Quake_out: memcmp failed \n"); +#endif + return 0; + } + else { + priv->cl_connect = 1; +#if 
DEBUG_CONFIG_IP_MASQ_QUAKE + printk("Quake_out: memcmp ok \n"); +#endif + } + break; + + case 0x81: + /* Accept Connection */ + if((ntohs(qh->length) < 0x09) || (priv->cl_connect == 0)) + return 0; + + data = &qh->message[1]; + + memcpy(&udp_port, data, 2); + + n_ms = ip_masq_new(IPPROTO_UDP, + maddr, 0, + ms->saddr, htons(udp_port), + ms->daddr, ms->dport, + 0); + + if (n_ms==NULL) + return 0; + +#if DEBUG_CONFIG_IP_MASQ_QUAKE + printk("Quake_out: out_rewrote UDP port %d -> %d\n", + udp_port, ntohs(n_ms->mport)); +#endif + udp_port = ntohs(n_ms->mport); + memcpy(data, &udp_port, 2); + + ip_masq_listen(n_ms); + ip_masq_control_add(n_ms, ms); + ip_masq_put(n_ms); + + break; + } + + return 0; +} + +struct ip_masq_app ip_masq_quake = { + NULL, /* next */ + "Quake_26", /* name */ + 0, /* type */ + 0, /* n_attach */ + masq_quake_init_1, /* ip_masq_init_1 */ + masq_quake_done_1, /* ip_masq_done_1 */ + masq_quake_out, /* pkt_out */ + masq_quake_in /* pkt_in */ +}; +struct ip_masq_app ip_masq_quakenew = { + NULL, /* next */ + "Quake_27", /* name */ + 0, /* type */ + 0, /* n_attach */ + masq_quake_init_1, /* ip_masq_init_1 */ + masq_quake_done_1, /* ip_masq_done_1 */ + masq_quake_out, /* pkt_out */ + masq_quake_in /* pkt_in */ +}; + +/* + * ip_masq_quake initialization + */ + +__initfunc(int ip_masq_quake_init(void)) +{ + return (register_ip_masq_app(&ip_masq_quake, IPPROTO_UDP, 26000) + + register_ip_masq_app(&ip_masq_quakenew, IPPROTO_UDP, 27000)); +} + +/* + * ip_masq_quake fin. + */ + +int ip_masq_quake_done(void) +{ + return (unregister_ip_masq_app(&ip_masq_quake) + + unregister_ip_masq_app(&ip_masq_quakenew)); +} + +#ifdef MODULE +EXPORT_NO_SYMBOLS; + +int init_module(void) +{ + if (ip_masq_quake_init() != 0) + return -EIO; + return 0; +} + +void cleanup_module(void) +{ + if (ip_masq_quake_done() != 0) + printk("ip_masq_quake: can't remove module"); +} + +#endif /* MODULE */ + + diff --git a/pfinet/linux-src/net/ipv4/ip_masq_raudio.c b/pfinet/linux-src/net/ipv4/ip_masq_raudio.c new file mode 100644 index 00000000..ee3e276b --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_masq_raudio.c @@ -0,0 +1,578 @@ +/* + * IP_MASQ_RAUDIO - Real Audio masquerading module + * + * + * Version: @(#)$Id: ip_masq_raudio.c,v 1.11 1998/10/06 04:49:04 davem Exp $ + * + * Author: Nigel Metheringham + * Real Time Streaming code by Progressive Networks + * [strongly based on ftp module by Juan Jose Ciarlante & Wouter Gadeyne] + * [Real Audio information taken from Progressive Networks firewall docs] + * [Kudos to Progressive Networks for making the protocol specs available] + * + * + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + * Limitations + * The IP Masquerading proxies at present do not have access to a processed + * data stream. Hence for a protocol like the Real Audio control protocol, + * which depends on knowing where you are in the data stream, you either + * to keep a *lot* of state in your proxy, or you cheat and simplify the + * problem [needless to say I did the latter]. + * + * This proxy only handles data in the first packet. Everything else is + * passed transparently. This means it should work under all normal + * circumstances, but it could be fooled by new data formats or a + * malicious application! 
+ * + * At present the "first packet" is defined as a packet starting with + * the protocol ID string - "PNA". + * When the link is up there appears to be enough control data + * crossing the control link to keep it open even if a long audio + * piece is playing. + * + * The Robust UDP support added in RealAudio 3.0 is supported, but due + * to servers/clients not making great use of this has not been greatly + * tested. RealVideo (as used in the Real client version 4.0beta1) is + * supported but again is not greatly tested (bandwidth requirements + * appear to exceed that available at the sites supporting the protocol). + * + * Multiple Port Support + * The helper can be made to handle up to MAX_MASQ_APP_PORTS (normally 12) + * with the port numbers being defined at module load time. The module + * uses the symbol "ports" to define a list of monitored ports, which can + * be specified on the insmod command line as + * ports=x1,x2,x3... + * where x[n] are integer port numbers. This option can be put into + * /etc/conf.modules (or /etc/modules.conf depending on your config) + * where modload will pick it up should you use modload to load your + * modules. + * + * Fixes: + * Juan Jose Ciarlante : Use control_add() for control chan + * 10/15/97 - Modifications to allow masquerading of RTSP connections as + * well as PNA, which can potentially exist on the same port. + * Joe Rumsey <ogre@real.com> + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/ctype.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/ip_masq.h> + +/* +#ifndef DEBUG_CONFIG_IP_MASQ_RAUDIO +#define DEBUG_CONFIG_IP_MASQ_RAUDIO 0 +#endif +*/ + +#define TOLOWER(c) (((c) >= 'A' && (c) <= 'Z') ? ((c) - 'A' + 'a') : (c)) +#define ISDIGIT(c) (((c) >= '0') && ((c) <= '9')) + +struct raudio_priv_data { + /* Associated data connection - setup but not used at present */ + struct ip_masq *data_conn; + /* UDP Error correction connection - setup but not used at present */ + struct ip_masq *error_conn; + /* Have we seen and performed setup */ + short seen_start; + short is_rtsp; +}; + +int +masq_rtsp_out (struct ip_masq_app *mapp, + struct ip_masq *ms, + struct sk_buff **skb_p, + __u32 maddr); + +/* + * List of ports (up to MAX_MASQ_APP_PORTS) to be handled by helper + * First port is set to the default port. 
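+ *
+ * Illustrative example (the third port is arbitrary):
+ * insmod ip_masq_raudio ports=554,7070,8554
+ * registers one helper incarnation per listed TCP control port.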
+ */ +int ports[MAX_MASQ_APP_PORTS] = {554, 7070, 0}; /* I rely on the trailing items being set to zero */ +struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS]; + +/* + * Debug level + */ +#ifdef CONFIG_IP_MASQ_DEBUG +static int debug=0; +MODULE_PARM(debug, "i"); +#endif + +MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i"); + + +static int +masq_raudio_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_INC_USE_COUNT; + if ((ms->app_data = kmalloc(sizeof(struct raudio_priv_data), + GFP_ATOMIC)) == NULL) + printk(KERN_INFO "RealAudio: No memory for application data\n"); + else + { + struct raudio_priv_data *priv = + (struct raudio_priv_data *)ms->app_data; + priv->seen_start = 0; + priv->data_conn = NULL; + priv->error_conn = NULL; + priv->is_rtsp = 0; + } + return 0; +} + +static int +masq_raudio_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_DEC_USE_COUNT; + if (ms->app_data) + kfree_s(ms->app_data, sizeof(struct raudio_priv_data)); + return 0; +} + +int +masq_raudio_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct tcphdr *th; + char *p, *data, *data_limit; + struct ip_masq *n_ms; + unsigned short version, msg_id, msg_len, udp_port; + struct raudio_priv_data *priv = + (struct raudio_priv_data *)ms->app_data; + + /* Everything running correctly already */ + if (priv && priv->seen_start) + return 0; + + if(priv && priv->is_rtsp) + return masq_rtsp_out(mapp, ms, skb_p, maddr); + + skb = *skb_p; + iph = skb->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)&th[1]; + + data_limit = skb->h.raw + skb->len; + + if(memcmp(data, "OPTIONS", 7) == 0 || + memcmp(data, "DESCRIBE", 8) == 0) + { + IP_MASQ_DEBUG(1-debug, "RealAudio: Detected RTSP connection\n"); + /* This is an RTSP client */ + if(priv) + priv->is_rtsp = 1; + return masq_rtsp_out(mapp, ms, skb_p, maddr); + } + + /* Check to see if this is the first packet with protocol ID */ + if (memcmp(data, "PNA", 3)) { + IP_MASQ_DEBUG(1-debug, "RealAudio: not initial protocol packet - ignored\n"); + return(0); + } + data += 3; + memcpy(&version, data, 2); + + IP_MASQ_DEBUG(1-debug, "RealAudio: initial seen - protocol version %d\n", + ntohs(version)); + if (priv) + priv->seen_start = 1; + + if (ntohs(version) >= 256) + { + printk(KERN_INFO "RealAudio: version (%d) not supported\n", + ntohs(version)); + return 0; + } + + data += 2; + while (data+4 < data_limit) { + memcpy(&msg_id, data, 2); + data += 2; + memcpy(&msg_len, data, 2); + data += 2; + if (ntohs(msg_id) == 0) { + /* The zero tag indicates the end of options */ + IP_MASQ_DEBUG(1-debug, "RealAudio: packet end tag seen\n"); + return 0; + } + IP_MASQ_DEBUG(1-debug, "RealAudio: msg %d - %d byte\n", + ntohs(msg_id), ntohs(msg_len)); + if (ntohs(msg_id) == 0) { + /* The zero tag indicates the end of options */ + return 0; + } + p = data; + data += ntohs(msg_len); + if (data > data_limit) + { + printk(KERN_INFO "RealAudio: Packet too short for data\n"); + return 0; + } + if ((ntohs(msg_id) == 1) || (ntohs(msg_id) == 7)) { + /* + * MsgId == 1 + * Audio UDP data port on client + * + * MsgId == 7 + * Robust UDP error correction port number on client + * + * Since these messages are treated just the same, they + * are bundled together here.... + */ + memcpy(&udp_port, p, 2); + + /* + * Sometimes a server sends a message 7 with a zero UDP port + * Rather than do anything with this, just ignore it! 
+ */ + if (udp_port == 0) + continue; + + + n_ms = ip_masq_new(IPPROTO_UDP, + maddr, 0, + ms->saddr, udp_port, + ms->daddr, 0, + IP_MASQ_F_NO_DPORT); + + if (n_ms==NULL) + return 0; + + ip_masq_listen(n_ms); + ip_masq_control_add(n_ms, ms); + + memcpy(p, &(n_ms->mport), 2); + IP_MASQ_DEBUG(1-debug, "RealAudio: rewrote UDP port %d -> %d in msg %d\n", + ntohs(udp_port), ntohs(n_ms->mport), ntohs(msg_id)); + + /* Make ref in application data to data connection */ + if (priv) { + if (ntohs(msg_id) == 1) + priv->data_conn = n_ms; + else + priv->error_conn = n_ms; + } + + ip_masq_put(n_ms); + } + } + return 0; +} + +/* + * masq_rtsp_out + * + * + */ +int +masq_rtsp_out (struct ip_masq_app *mapp, + struct ip_masq *ms, + struct sk_buff **skb_p, + __u32 maddr) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_limit; + struct ip_masq *n_ms, *n_ms2; + unsigned short udp_port; + struct raudio_priv_data *priv = + (struct raudio_priv_data *)ms->app_data; + const char* srch = "transport:"; + const char* srchpos = srch; + const char* srchend = srch + strlen(srch); + int state = 0; + char firstport[6]; + int firstportpos = 0; + char secondport[6]; + int secondportpos = 0; + char *portstart = NULL, *portend = NULL; + int diff; + + /* Everything running correctly already */ + if (priv && priv->seen_start) + return 0; + + skb = *skb_p; + iph = skb->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)&th[1]; + + data_limit = skb->h.raw + skb->len; + + firstport[0] = 0; + secondport[0] = 0; + + while(data < data_limit && state >= 0) + { + switch(state) + { + case 0: + case 1: + if(TOLOWER(*data) == *srchpos) + { + srchpos++; + if(srchpos == srchend) + { + IP_MASQ_DEBUG(1-debug, "Found string %s in message\n", + srch); + state++; + if(state == 1) + { + srch = "client_port"; + srchpos = srch; + srchend = srch + strlen(srch); + } + } + } + else + { + srchpos = srch; + } + break; + case 2: + if(*data == '=') + state = 3; + break; + case 3: + if(ISDIGIT(*data)) + { + portstart = data; + firstportpos = 0; + firstport[firstportpos++] = *data; + state = 4; + } + break; + case 4: + if(*data == '-') + { + state = 5; + } + else if(*data == ';') + { + portend = data - 1; + firstport[firstportpos] = 0; + state = -1; + } + else if(ISDIGIT(*data)) + { + firstport[firstportpos++] = *data; + } + else if(*data != ' ' && *data != '\t') + { + /* This is a badly formed RTSP message, let's bail out */ + IP_MASQ_DEBUG(1-debug, "Badly formed RTSP Message\n"); + return 0; + } + break; + case 5: + if(ISDIGIT(*data)) + { + secondportpos = 0; + secondport[secondportpos++] = *data; + state = 6; + } + else if(*data == ';') + { + portend = data - 1; + secondport[secondportpos] = 0; + state = -1; + } + break; + case 6: + if(*data == ';') + { + portend = data - 1; + secondport[secondportpos] = 0; + state = -1; + } + else if(ISDIGIT(*data)) + { + secondport[secondportpos++] = *data; + } + else if(*data != ' ' && *data != '\t') + { + /* This is a badly formed RTSP message, let's bail out */ + IP_MASQ_DEBUG(1-debug, "Badly formed RTSP Message\n"); + return 0; + } + break; + } + data++; + } + + if(state >= 0) + return 0; + + if(firstportpos > 0) + { + char newbuf[12]; /* xxxxx-xxxxx\0 */ + char* tmpptr; + + udp_port = htons(simple_strtoul(firstport, &tmpptr, 10)); + n_ms = ip_masq_new(IPPROTO_UDP, + maddr, 0, + ms->saddr, udp_port, + ms->daddr, 0, + IP_MASQ_F_NO_DPORT); + if (n_ms==NULL) + return 0; + + ip_masq_listen(n_ms); + ip_masq_control_add(n_ms, ms); + + if(secondportpos > 0) 
+ { + udp_port = htons(simple_strtoul(secondport, &tmpptr, 10)); + n_ms2 = ip_masq_new(IPPROTO_UDP, + maddr, 0, + ms->saddr, udp_port, + ms->daddr, 0, + IP_MASQ_F_NO_DPORT); + if (n_ms2==NULL) { + ip_masq_put(n_ms); + return 0; + } + + ip_masq_listen(n_ms2); + ip_masq_control_add(n_ms2, ms); + sprintf(newbuf, "%d-%d", ntohs(n_ms->mport), + ntohs(n_ms2->mport)); + } + else + { + sprintf(newbuf, "%d", ntohs(n_ms->mport)); + n_ms2 = NULL; + } + *skb_p = ip_masq_skb_replace(skb, GFP_ATOMIC, + portstart, portend - portstart + 1, + newbuf, strlen(newbuf)); + IP_MASQ_DEBUG(1-debug, "RTSP: rewrote client_port to %s\n", newbuf); + diff = strlen(newbuf) - (portend - portstart); + } + else + { + return 0; + } + + if(priv) + { + priv->seen_start = 1; + if(n_ms) + priv->data_conn = n_ms; + if(n_ms2) + priv->error_conn = n_ms2; + } + /* + * Release tunnels + */ + + if (n_ms) + ip_masq_put(n_ms); + + if (n_ms2) + ip_masq_put(n_ms2); + + return diff; +} + +struct ip_masq_app ip_masq_raudio = { + NULL, /* next */ + "RealAudio", /* name */ + 0, /* type */ + 0, /* n_attach */ + masq_raudio_init_1, /* ip_masq_init_1 */ + masq_raudio_done_1, /* ip_masq_done_1 */ + masq_raudio_out, /* pkt_out */ + NULL /* pkt_in */ +}; + +/* + * ip_masq_raudio initialization + */ + +__initfunc(int ip_masq_raudio_init(void)) +{ + int i, j; + + for (i=0; (i<MAX_MASQ_APP_PORTS); i++) { + if (ports[i]) { + if ((masq_incarnations[i] = kmalloc(sizeof(struct ip_masq_app), + GFP_KERNEL)) == NULL) + return -ENOMEM; + memcpy(masq_incarnations[i], &ip_masq_raudio, sizeof(struct ip_masq_app)); + if ((j = register_ip_masq_app(masq_incarnations[i], + IPPROTO_TCP, + ports[i]))) { + return j; + } + IP_MASQ_DEBUG(1-debug, "RealAudio: loaded support on port[%d] = %d\n", + i, ports[i]); + } else { + /* To be safe, force the incarnation table entry to NULL */ + masq_incarnations[i] = NULL; + } + } + return 0; +} + +/* + * ip_masq_raudio fin. 
+ */ + +int ip_masq_raudio_done(void) +{ + int i, j, k; + + k=0; + for (i=0; (i<MAX_MASQ_APP_PORTS); i++) { + if (masq_incarnations[i]) { + if ((j = unregister_ip_masq_app(masq_incarnations[i]))) { + k = j; + } else { + kfree(masq_incarnations[i]); + masq_incarnations[i] = NULL; + IP_MASQ_DEBUG(1-debug, "RealAudio: unloaded support on port[%d] = %d\n", + i, ports[i]); + } + } + } + return k; +} + +#ifdef MODULE +EXPORT_NO_SYMBOLS; + +int init_module(void) +{ + if (ip_masq_raudio_init() != 0) + return -EIO; + return 0; +} + +void cleanup_module(void) +{ + if (ip_masq_raudio_done() != 0) + printk(KERN_INFO "ip_masq_raudio: can't remove module"); +} + +#endif /* MODULE */ diff --git a/pfinet/linux-src/net/ipv4/ip_masq_user.c b/pfinet/linux-src/net/ipv4/ip_masq_user.c new file mode 100644 index 00000000..51297441 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_masq_user.c @@ -0,0 +1,473 @@ +/* + * IP_MASQ_USER user space control module + * + * + * $Id: ip_masq_user.c,v 1.1.2.1 1999/08/07 10:56:33 davem Exp $ + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <asm/system.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/inet.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <net/icmp.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/checksum.h> +#include <net/ip_masq.h> +#include <net/ip_masq_mod.h> +#include <linux/sysctl.h> +#include <linux/ip_fw.h> + +#include <linux/ip_masq.h> + +/* + * Debug level + */ +static int debug=0; + +MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i"); +MODULE_PARM(debug, "i"); + +/* +static int check_5uple (struct ip_masq_user *ums) { + return 0; +} +*/ +static void masq_user_k2u(const struct ip_masq *ms, struct ip_masq_user *ums) +{ + ums->protocol = ms->protocol; + ums->daddr = ms->daddr; + ums->dport = ms->dport; + ums->maddr = ms->maddr; + ums->mport = ms->mport; + ums->saddr = ms->saddr; + ums->sport = ms->sport; + ums->timeout = ms->timeout; +} + + +static int ip_masq_user_maddr(struct ip_masq_user *ums) +{ + struct device *dev; + struct rtable *rt; + int ret = -EINVAL; + u32 rt_daddr, rt_saddr; + u32 tos; + + /* + * Did specify masq address. + */ + if (ums->maddr) + return 0; + + /* + * Select address to use for routing query + */ + + rt_daddr = ums->rt_daddr? ums->rt_daddr : ums->daddr; + rt_saddr = ums->rt_saddr? 
ums->rt_saddr : ums->saddr; + + + /* + * No address for routing, cannot continue + */ + if (rt_daddr == 0) { + IP_MASQ_DEBUG(1-debug, "cannot setup maddr with daddr=%lX, rt_addr=%lX\n", + ntohl(ums->daddr), ntohl(ums->rt_daddr)); + return -EINVAL; + } + + /* + * Find out rt device + */ + + rt_saddr = 0; + tos = RT_TOS(ums->ip_tos) | RTO_CONN; + + if ((ret=ip_route_output(&rt, rt_daddr, rt_saddr, tos, 0 /* dev */))) { + IP_MASQ_DEBUG(0-debug, "could not setup maddr for routing daddr=%lX, saddr=%lX\n", + ntohl(rt_daddr), ntohl(rt_saddr)); + return ret; + } + dev = rt->u.dst.dev; + ums->maddr = ip_masq_select_addr(dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); + + IP_MASQ_DEBUG(1-debug, "did setup maddr=%lX\n", ntohl(ums->maddr)); + ip_rt_put(rt); + return 0; +} + +/* + * Create new entry (from uspace) + */ +static int ip_masq_user_new(struct ip_masq_user *ums) +{ + struct ip_masq *ms = NULL; + unsigned mflags = 0; + int ret; + + if (masq_proto_num (ums->protocol) == -1) { + return EPROTONOSUPPORT; + } + + if (ums->dport == 0) { + ums->flags |= IP_MASQ_USER_F_LISTEN; + } + + if (ums->flags | IP_MASQ_USER_F_LISTEN) { + if ((ums->saddr == 0) || (ums->sport == 0)) { + return EINVAL; + } + mflags |= (IP_MASQ_F_NO_DPORT|IP_MASQ_F_NO_DADDR); + + } + + if ((ret = ip_masq_user_maddr(ums)) < 0) { + return -ret; + } + + mflags |= IP_MASQ_F_USER; + ms = ip_masq_new(ums->protocol, + ums->maddr, ums->mport, + ums->saddr, ums->sport, + ums->daddr, ums->dport, + mflags); + + if (ms == NULL) { + /* + * FIXME: ip_masq_new() should return errno + */ + return EBUSY; + } + + /* + * Setup timeouts for this new entry + */ + + if (ums->timeout) { + ms->timeout = ums->timeout; + } else if (ums->flags | IP_MASQ_USER_F_LISTEN) { + ip_masq_listen(ms); + } + + masq_user_k2u(ms, ums); + ip_masq_put(ms); + return 0; +} + +/* + * Delete existing entry + */ +static int ip_masq_user_del(struct ip_masq_user *ums) +{ + struct ip_masq *ms=NULL; + + if (masq_proto_num (ums->protocol) == -1) { + return EPROTONOSUPPORT; + } + start_bh_atomic(); + if (ums->mport && ums->maddr) { + ms = ip_masq_in_get(ums->protocol, + ums->daddr, ums->dport, + ums->maddr, ums->mport); + end_bh_atomic(); + } else if (ums->sport && ums->saddr) { + ms = ip_masq_out_get(ums->protocol, + ums->saddr, ums->sport, + ums->daddr, ums->dport); + end_bh_atomic(); + } else + return EINVAL; + + if (ms == NULL) { + return ESRCH; + } + + /* + * got (locked) entry, setup almost tiny timeout :) and + * give away + * + * FIXME: should use something better than S_CLOSE + */ + ms->timeout = IP_MASQ_S_CLOSE; + + masq_user_k2u(ms, ums); + ip_masq_put(ms); + return 0; +} + +static struct ip_masq * ip_masq_user_locked_get (struct ip_masq_user *ums, int *err) +{ + struct ip_masq *ms=NULL; + if (masq_proto_num (ums->protocol) == -1) { + *err = EPROTONOSUPPORT; + } + + start_bh_atomic(); + if (ums->mport && ums->maddr) { + ms = ip_masq_in_get(ums->protocol, + ums->daddr, ums->dport, + ums->maddr, ums->mport); + end_bh_atomic(); + } else if (ums->sport && ums->saddr) { + ms = ip_masq_out_get(ums->protocol, + ums->saddr, ums->sport, + ums->daddr, ums->dport); + end_bh_atomic(); + } else + *err = EINVAL; + + if (ms == NULL) *err = ESRCH; + return ms; +} + +/* + * Get existing entry (complete full tunnel info) + */ +static int ip_masq_user_get(struct ip_masq_user *ums) +{ + struct ip_masq *ms=NULL; + int err; + + ms = ip_masq_user_locked_get(ums, &err); + if (ms == NULL) + return err; + + masq_user_k2u(ms, ums); + + ip_masq_put(ms); + return 0; +} + +/* + * Set (some, valid) entry 
parameters + */ +static int ip_masq_user_set(struct ip_masq_user *ums) +{ + struct ip_masq *ms = NULL; + int err; + + ms = ip_masq_user_locked_get(ums, &err); + if (ms == NULL) + return err; + + /* + * FIXME: must allow selecting what you want to set + */ + ms->timeout = ums->timeout; + + masq_user_k2u(ms, ums); + + ip_masq_put(ms); + return 0; +} + + +/* + * Entry point + * ret value: + * <0 err + * ==0 ok + * >0 ok, copy to user + */ +static int ip_masq_user_ctl(int optname, struct ip_masq_ctl *mctl, int optlen) +{ + struct ip_masq_user *ums = &mctl->u.user; + int ret = EINVAL; + int arglen = optlen - IP_MASQ_CTL_BSIZE; + int cmd; + + IP_MASQ_DEBUG(1-debug, "ip_masq_user_ctl(len=%d/%d|%d/%d)\n", + arglen, + sizeof (*ums), + optlen, + sizeof (*mctl)); + + /* + * Yes, I'm a bad guy ... + */ + if (arglen != sizeof(*ums) && optlen != sizeof(*mctl)) + return EINVAL; + + MOD_INC_USE_COUNT; + + /* + * Don't trust the lusers - plenty of error checking! + */ + cmd = mctl->m_cmd; + IP_MASQ_DEBUG(1-debug, "ip_masq_user_ctl(cmd=%d)\n", cmd); + + switch (mctl->m_cmd) { + case IP_MASQ_CMD_ADD: + case IP_MASQ_CMD_INSERT: + ret = ip_masq_user_new(ums); + break; + case IP_MASQ_CMD_DEL: + ret = ip_masq_user_del(ums); + break; + case IP_MASQ_CMD_SET: + ret = ip_masq_user_set(ums); + break; + case IP_MASQ_CMD_GET: + ret = ip_masq_user_get(ums); + break; + } + + /* + * For all of the above, return masq tunnel info + */ + + ret = -ret; + + if (ret == 0) { + ret = sizeof (*ums) + IP_MASQ_CTL_BSIZE; + IP_MASQ_DEBUG(1-debug, "will return %d bytes to user\n", ret); + } + + MOD_DEC_USE_COUNT; + return ret; +} + + +#ifdef CONFIG_PROC_FS +static int ip_masq_user_info(char *buffer, char **start, off_t offset, + int length, int proto) +{ + off_t pos=0, begin; + struct ip_masq *ms; + char temp[129]; + int idx = 0; + int col; + int len=0; + int magic_control; + struct list_head *l,*e; + + MOD_INC_USE_COUNT; + + IP_MASQ_DEBUG(1-debug, "Entered user_info with proto=%d\n", proto); + + if (offset < 128) + { + sprintf(temp, + "Prot SrcIP SPrt DstIP DPrt MAddr MPrt State Flgs Ref Ctl Expires HRow HCol (free=%d,%d,%d)", + atomic_read(ip_masq_free_ports), + atomic_read(ip_masq_free_ports+1), + atomic_read(ip_masq_free_ports+2)); + len = sprintf(buffer, "%-127s\n", temp); + } + pos = 128; + + for(idx = 0; idx < IP_MASQ_TAB_SIZE; idx++) + { + /* + * Lock is actually only need in next loop + * we are called from uspace: must stop bh. 
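+ * read_lock_bh() is used because the masquerading table is also
+ * touched from bottom-half (packet) context.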
+ */ + col=0; + read_lock_bh(&__ip_masq_lock); + l = &ip_masq_m_table[idx]; + for (e=l->next; e!=l; e=e->next) { + col++; + ms = list_entry(e, struct ip_masq, m_list); + if (ms->protocol != proto) { + continue; + } + + pos += 128; + if (pos <= offset) { + len = 0; + continue; + } + + /* + * We have locked the tables, no need to del/add timers + * nor cli() 8) + */ + + + magic_control = atomic_read(&ms->n_control); + if (!magic_control && ms->control) magic_control = -1; + sprintf(temp,"%-4s %08lX:%04X %08lX:%04X %08lX:%04X %-12s %3X %4d %3d %7lu %4d %4d", + masq_proto_name(ms->protocol), + ntohl(ms->saddr), ntohs(ms->sport), + ntohl(ms->daddr), ntohs(ms->dport), + ntohl(ms->maddr), ntohs(ms->mport), + ip_masq_state_name(ms->state), + ms->flags, + atomic_read(&ms->refcnt), + magic_control, + (ms->timer.expires-jiffies)/HZ, + idx, col); + len += sprintf(buffer+len, "%-127s\n", temp); + + if(len >= length) { + read_unlock_bh(&__ip_masq_lock); + goto done; + } + } + read_unlock_bh(&__ip_masq_lock); + } + +done: + + if (len) { + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; + } + if(len>length) + len = length; + MOD_DEC_USE_COUNT; + return len; +} +#else +#define ip_masq_user_info NULL +#endif + +static struct ip_masq_hook ip_masq_user = { + ip_masq_user_ctl, + ip_masq_user_info +}; + +int ip_masq_user_init(void) +{ + if (ip_masq_user_hook != NULL) + return -EEXIST; + ip_masq_user_hook = &ip_masq_user; + return 0; +} + +int ip_masq_user_done(void) +{ + if (ip_masq_user_hook == NULL) + return ENOENT; + ip_masq_user_hook = NULL; + return 0; +} + +#ifdef MODULE +EXPORT_NO_SYMBOLS; +int init_module(void) +{ + if (ip_masq_user_init() != 0) + return -EIO; + return 0; +} + +void cleanup_module(void) +{ + if (ip_masq_user_done() != 0) + printk(KERN_INFO "ip_masq_user_done(): can't remove module"); +} + +#endif /* MODULE */ diff --git a/pfinet/linux-src/net/ipv4/ip_masq_vdolive.c b/pfinet/linux-src/net/ipv4/ip_masq_vdolive.c new file mode 100644 index 00000000..4724e3b9 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_masq_vdolive.c @@ -0,0 +1,294 @@ +/* + * IP_MASQ_VDOLIVE - VDO Live masquerading module + * + * + * Version: @(#)$Id: ip_masq_vdolive.c,v 1.4 1998/10/06 04:49:07 davem Exp $ + * + * Author: Nigel Metheringham <Nigel.Metheringham@ThePLAnet.net> + * PLAnet Online Ltd + * + * Fixes: Minor changes for 2.1 by + * Steven Clarke <Steven.Clarke@ThePlanet.Net>, Planet Online Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Thanks: + * Thank you to VDOnet Corporation for allowing me access to + * a protocol description without an NDA. This means that + * this module can be distributed as source - a great help! + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <asm/system.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/ip_masq.h> + +struct vdolive_priv_data { + /* Ports used */ + unsigned short origport; + unsigned short masqport; + /* State of decode */ + unsigned short state; +}; + +/* + * List of ports (up to MAX_MASQ_APP_PORTS) to be handled by helper + * First port is set to the default port. 
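+ *
+ * As with the RealAudio helper, an illustrative load would be
+ * insmod ip_masq_vdolive ports=7000,7010
+ * (the second port is arbitrary), one incarnation per listed TCP port.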
+ */ +static int ports[MAX_MASQ_APP_PORTS] = {7000}; /* I rely on the trailing items being set to zero */ +struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS]; + +/* + * Debug level + */ +#ifdef CONFIG_IP_MASQ_DEBUG +static int debug=0; +MODULE_PARM(debug, "i"); +#endif + +MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i"); + +static int +masq_vdolive_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_INC_USE_COUNT; + if ((ms->app_data = kmalloc(sizeof(struct vdolive_priv_data), + GFP_ATOMIC)) == NULL) + IP_MASQ_DEBUG(1-debug, "VDOlive: No memory for application data\n"); + else + { + struct vdolive_priv_data *priv = + (struct vdolive_priv_data *)ms->app_data; + priv->origport = 0; + priv->masqport = 0; + priv->state = 0; + } + return 0; +} + +static int +masq_vdolive_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_DEC_USE_COUNT; + if (ms->app_data) + kfree_s(ms->app_data, sizeof(struct vdolive_priv_data)); + return 0; +} + +int +masq_vdolive_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_limit; + unsigned int tagval; /* This should be a 32 bit quantity */ + struct ip_masq *n_ms; + struct vdolive_priv_data *priv = + (struct vdolive_priv_data *)ms->app_data; + + /* This doesn't work at all if no priv data was allocated on startup */ + if (!priv) + return 0; + + /* Everything running correctly already */ + if (priv->state == 3) + return 0; + + skb = *skb_p; + iph = skb->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)&th[1]; + + data_limit = skb->h.raw + skb->len; + + if (data+8 > data_limit) { + IP_MASQ_DEBUG(1-debug, "VDOlive: packet too short for ID %p %p\n", data, data_limit); + return 0; + } + memcpy(&tagval, data+4, 4); + IP_MASQ_DEBUG(1-debug, "VDOlive: packet seen, tag %ld, in initial state %d\n", ntohl(tagval), priv->state); + + /* Check for leading packet ID */ + if ((ntohl(tagval) != 6) && (ntohl(tagval) != 1)) { + IP_MASQ_DEBUG(1-debug, "VDOlive: unrecognised tag %ld, in initial state %d\n", ntohl(tagval), priv->state); + return 0; + } + + + /* Check packet is long enough for data - ignore if not */ + if ((ntohl(tagval) == 6) && (data+36 > data_limit)) { + IP_MASQ_DEBUG(1-debug, "VDOlive: initial packet too short %p %p\n", data, data_limit); + return 0; + } else if ((ntohl(tagval) == 1) && (data+20 > data_limit)) { + IP_MASQ_DEBUG(1-debug,"VDOlive: secondary packet too short %p %p\n", data, data_limit); + return 0; + } + + /* Adjust data pointers */ + /* + * I could check the complete protocol version tag + * in here however I am just going to look for the + * "VDO Live" tag in the hope that this part will + * remain constant even if the version changes + */ + if (ntohl(tagval) == 6) { + data += 24; + IP_MASQ_DEBUG(1-debug, "VDOlive: initial packet found\n"); + } else { + data += 8; + IP_MASQ_DEBUG(1-debug, "VDOlive: secondary packet found\n"); + } + + if (memcmp(data, "VDO Live", 8) != 0) { + IP_MASQ_DEBUG(1-debug,"VDOlive: did not find tag\n"); + return 0; + } + /* + * The port number is the next word after the tag. 
+ * VDOlive encodes all of these values + * in 32 bit words, so in this case I am + * skipping the first 2 bytes of the next + * word to get to the relevant 16 bits + */ + data += 10; + + /* + * If we have not seen the port already, + * set the masquerading tunnel up + */ + if (!priv->origport) { + memcpy(&priv->origport, data, 2); + IP_MASQ_DEBUG(1-debug, "VDOlive: found port %d\n", ntohs(priv->origport)); + + /* Open up a tunnel */ + n_ms = ip_masq_new(IPPROTO_UDP, + maddr, 0, + ms->saddr, priv->origport, + ms->daddr, 0, + IP_MASQ_F_NO_DPORT); + + if (n_ms==NULL) { + ip_masq_put(n_ms); + IP_MASQ_DEBUG(1-debug, "VDOlive: unable to build UDP tunnel for %x:%x\n", ms->saddr, priv->origport); + /* Leave state as unset */ + priv->origport = 0; + return 0; + } + ip_masq_listen(n_ms); + + ip_masq_put(ms); + priv->masqport = n_ms->mport; + } else if (memcmp(data, &(priv->origport), 2)) { + IP_MASQ_DEBUG(1-debug, "VDOlive: ports do not match\n"); + /* Write the port in anyhow!!! */ + } + + /* + * Write masq port into packet + */ + memcpy(data, &(priv->masqport), 2); + IP_MASQ_DEBUG(1-debug, "VDOlive: rewrote port %d to %d, server %08X\n", ntohs(priv->origport), ntohs(priv->masqport), ms->saddr); + + /* + * Set state bit to make which bit has been done + */ + + priv->state |= (ntohl(tagval) == 6) ? 1 : 2; + + return 0; +} + + +struct ip_masq_app ip_masq_vdolive = { + NULL, /* next */ + "VDOlive", /* name */ + 0, /* type */ + 0, /* n_attach */ + masq_vdolive_init_1, /* ip_masq_init_1 */ + masq_vdolive_done_1, /* ip_masq_done_1 */ + masq_vdolive_out, /* pkt_out */ + NULL /* pkt_in */ +}; + +/* + * ip_masq_vdolive initialization + */ + +__initfunc(int ip_masq_vdolive_init(void)) +{ + int i, j; + + for (i=0; (i<MAX_MASQ_APP_PORTS); i++) { + if (ports[i]) { + if ((masq_incarnations[i] = kmalloc(sizeof(struct ip_masq_app), + GFP_KERNEL)) == NULL) + return -ENOMEM; + memcpy(masq_incarnations[i], &ip_masq_vdolive, sizeof(struct ip_masq_app)); + if ((j = register_ip_masq_app(masq_incarnations[i], + IPPROTO_TCP, + ports[i]))) { + return j; + } + IP_MASQ_DEBUG(1-debug, "RealAudio: loaded support on port[%d] = %d\n", i, ports[i]); + } else { + /* To be safe, force the incarnation table entry to NULL */ + masq_incarnations[i] = NULL; + } + } + return 0; +} + +/* + * ip_masq_vdolive fin. + */ + +int ip_masq_vdolive_done(void) +{ + int i, j, k; + + k=0; + for (i=0; (i<MAX_MASQ_APP_PORTS); i++) { + if (masq_incarnations[i]) { + if ((j = unregister_ip_masq_app(masq_incarnations[i]))) { + k = j; + } else { + kfree(masq_incarnations[i]); + masq_incarnations[i] = NULL; + IP_MASQ_DEBUG(1-debug,"VDOlive: unloaded support on port[%d] = %d\n", i, ports[i]); + } + } + } + return k; +} + + +#ifdef MODULE +EXPORT_NO_SYMBOLS; + +int init_module(void) +{ + if (ip_masq_vdolive_init() != 0) + return -EIO; + return 0; +} + +void cleanup_module(void) +{ + if (ip_masq_vdolive_done() != 0) + IP_MASQ_DEBUG(1-debug, "ip_masq_vdolive: can't remove module"); +} + +#endif /* MODULE */ diff --git a/pfinet/linux-src/net/ipv4/ip_nat_dumb.c b/pfinet/linux-src/net/ipv4/ip_nat_dumb.c new file mode 100644 index 00000000..5a1c6d75 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_nat_dumb.c @@ -0,0 +1,158 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Dumb Network Address Translation. 
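+ *
+ * In outline (a reader's paraphrase of the code below; new_s, new_d,
+ * old_s, old_d stand for the rewritten and original addresses): each
+ * translated packet has its addresses replaced by the route's
+ * rt_src_map/rt_dst_map, the IP header checksum is recomputed, and the
+ * TCP/UDP checksum is patched incrementally, roughly
+ *
+ *	check  = csum_tcpudp_magic(new_s, new_d, 0, 0, ~(*cksum));
+ *	*cksum = csum_tcpudp_magic(~old_s, ~old_d, 0, 0, ~check);
+ *
+ * so only the pseudo-header contribution changes and the payload is
+ * never re-summed; UDP's all-zero "no checksum" value is special-cased.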
+ * + * Version: $Id: ip_nat_dumb.c,v 1.8 1999/03/21 05:22:40 davem Exp $ + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Rani Assaf : A zero checksum is a special case + * only in UDP + * Rani Assaf : Added ICMP messages rewriting + * Rani Assaf : Repaired wrong changes, made by ANK. + * + * + * NOTE: It is just working model of real NAT. + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netdevice.h> +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/firewall.h> +#include <linux/ip_fw.h> +#include <net/checksum.h> +#include <linux/route.h> +#include <net/route.h> +#include <net/ip_fib.h> + + +int +ip_do_nat(struct sk_buff *skb) +{ + struct rtable *rt = (struct rtable*)skb->dst; + struct iphdr *iph = skb->nh.iph; + u32 odaddr = iph->daddr; + u32 osaddr = iph->saddr; + u16 check; + + IPCB(skb)->flags |= IPSKB_TRANSLATED; + + /* Rewrite IP header */ + iph->daddr = rt->rt_dst_map; + iph->saddr = rt->rt_src_map; + iph->check = 0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + + /* If it is the first fragment, rewrite protocol headers */ + + if (!(iph->frag_off & htons(IP_OFFSET))) { + u16 *cksum; + + switch(iph->protocol) { + case IPPROTO_TCP: + cksum = (u16*)&((struct tcphdr*)(((char*)iph) + (iph->ihl<<2)))->check; + if ((u8*)(cksum+1) > skb->tail) + goto truncated; + check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~(*cksum)); + *cksum = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check); + break; + case IPPROTO_UDP: + cksum = (u16*)&((struct udphdr*)(((char*)iph) + (iph->ihl<<2)))->check; + if ((u8*)(cksum+1) > skb->tail) + goto truncated; + if ((check = *cksum) != 0) { + check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~check); + check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check); + *cksum = check ? 
: 0xFFFF; + } + break; + case IPPROTO_ICMP: + { + struct icmphdr *icmph = (struct icmphdr*)((char*)iph + (iph->ihl<<2)); + struct iphdr *ciph; + u32 idaddr, isaddr; + int updated; + + if ((icmph->type != ICMP_DEST_UNREACH) && + (icmph->type != ICMP_TIME_EXCEEDED) && + (icmph->type != ICMP_PARAMETERPROB)) + break; + + ciph = (struct iphdr *) (icmph + 1); + + if ((u8*)(ciph+1) > skb->tail) + goto truncated; + + isaddr = ciph->saddr; + idaddr = ciph->daddr; + updated = 0; + + if (rt->rt_flags&RTCF_DNAT && ciph->saddr == odaddr) { + ciph->saddr = iph->daddr; + updated = 1; + } + if (rt->rt_flags&RTCF_SNAT) { + if (ciph->daddr != osaddr) { + struct fib_result res; + struct rt_key key; + unsigned flags = 0; + + key.src = ciph->daddr; + key.dst = ciph->saddr; + key.iif = skb->dev->ifindex; + key.oif = 0; +#ifdef CONFIG_IP_ROUTE_TOS + key.tos = RT_TOS(ciph->tos); +#endif +#ifdef CONFIG_IP_ROUTE_FWMARK + key.fwmark = 0; +#endif + /* Use fib_lookup() until we get our own + * hash table of NATed hosts -- Rani + */ + if (fib_lookup(&key, &res) == 0 && res.r) { + ciph->daddr = fib_rules_policy(ciph->daddr, &res, &flags); + if (ciph->daddr != idaddr) + updated = 1; + } + } else { + ciph->daddr = iph->saddr; + updated = 1; + } + } + if (updated) { + cksum = &icmph->checksum; + /* Using tcpudp primitive. Why not? */ + check = csum_tcpudp_magic(ciph->saddr, ciph->daddr, 0, 0, ~(*cksum)); + *cksum = csum_tcpudp_magic(~isaddr, ~idaddr, 0, 0, ~check); + } + break; + } + default: + break; + } + } + return 0; + +truncated: + return -EINVAL; +} diff --git a/pfinet/linux-src/net/ipv4/ip_options.c b/pfinet/linux-src/net/ipv4/ip_options.c new file mode 100644 index 00000000..a3d1f0aa --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_options.c @@ -0,0 +1,617 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The options processing module for ip.c + * + * Version: $Id: ip_options.c,v 1.16.2.1 1999/06/02 04:06:19 davem Exp $ + * + * Authors: A.N.Kuznetsov + * + */ + +#include <linux/types.h> +#include <asm/uaccess.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> + +/* + * Write options to IP header, record destination address to + * source route option, address of outgoing interface + * (we should already know it, so that this function is allowed be + * called only after routing decision) and timestamp, + * if we originate this datagram. + * + * daddr is real destination address, next hop is recorded in IP header. + * saddr is address of outgoing interface. 
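+ *
+ * A note on the offset arithmetic below, derived from the RFC 791 option
+ * layout: an option starts with a type byte, a length byte and a 1-based
+ * "pointer" byte, and ip_options_compile() has already advanced that
+ * pointer past the slot it reserved.  So the record-route slot to fill
+ * begins at opt->rr + iph[opt->rr+2] - 5 (pointer stepped by 4, minus one
+ * for the 1-based offset), a timestamp+address pair begins 9 bytes back
+ * from its pointer (8 bytes were reserved), and the real destination of a
+ * source-routed packet is parked in the last slot of the SRR option,
+ * opt->srr + iph[opt->srr+1] - 4, while iph->daddr carries the first hop.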
+ */ + +void ip_options_build(struct sk_buff * skb, struct ip_options * opt, + u32 daddr, struct rtable *rt, int is_frag) +{ + unsigned char * iph = skb->nh.raw; + + memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); + memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen); + opt = &(IPCB(skb)->opt); + opt->is_data = 0; + + if (opt->srr) + memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4); + + if (!is_frag) { + if (opt->rr_needaddr) + ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt); + if (opt->ts_needaddr) + ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt); + if (opt->ts_needtime) { + struct timeval tv; + __u32 midtime; + do_gettimeofday(&tv); + midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000); + memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4); + } + return; + } + if (opt->rr) { + memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]); + opt->rr = 0; + opt->rr_needaddr = 0; + } + if (opt->ts) { + memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]); + opt->ts = 0; + opt->ts_needaddr = opt->ts_needtime = 0; + } +} + +/* + * Provided (sopt, skb) points to received options, + * build in dopt compiled option set appropriate for answering. + * i.e. invert SRR option, copy anothers, + * and grab room in RR/TS options. + * + * NOTE: dopt cannot point to skb. + */ + +int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) +{ + struct ip_options *sopt; + unsigned char *sptr, *dptr; + int soffset, doffset; + int optlen; + u32 daddr; + + memset(dopt, 0, sizeof(struct ip_options)); + + dopt->is_data = 1; + + sopt = &(IPCB(skb)->opt); + + if (sopt->optlen == 0) { + dopt->optlen = 0; + return 0; + } + + sptr = skb->nh.raw; + dptr = dopt->__data; + + if (skb->dst) + daddr = ((struct rtable*)skb->dst)->rt_spec_dst; + else + daddr = skb->nh.iph->daddr; + + if (sopt->rr) { + optlen = sptr[sopt->rr+1]; + soffset = sptr[sopt->rr+2]; + dopt->rr = dopt->optlen + sizeof(struct iphdr); + memcpy(dptr, sptr+sopt->rr, optlen); + if (sopt->rr_needaddr && soffset <= optlen) { + if (soffset + 3 > optlen) + return -EINVAL; + dptr[2] = soffset + 4; + dopt->rr_needaddr = 1; + } + dptr += optlen; + dopt->optlen += optlen; + } + if (sopt->ts) { + optlen = sptr[sopt->ts+1]; + soffset = sptr[sopt->ts+2]; + dopt->ts = dopt->optlen + sizeof(struct iphdr); + memcpy(dptr, sptr+sopt->ts, optlen); + if (soffset <= optlen) { + if (sopt->ts_needaddr) { + if (soffset + 3 > optlen) + return -EINVAL; + dopt->ts_needaddr = 1; + soffset += 4; + } + if (sopt->ts_needtime) { + if (soffset + 3 > optlen) + return -EINVAL; + if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) { + dopt->ts_needtime = 1; + soffset += 4; + } else { + dopt->ts_needtime = 0; + + if (soffset + 8 <= optlen) { + __u32 addr; + + memcpy(&addr, sptr+soffset-1, 4); + if (inet_addr_type(addr) != RTN_LOCAL) { + dopt->ts_needtime = 1; + soffset += 8; + } + } + } + } + dptr[2] = soffset; + } + dptr += optlen; + dopt->optlen += optlen; + } + if (sopt->srr) { + unsigned char * start = sptr+sopt->srr; + u32 faddr; + + optlen = start[1]; + soffset = start[2]; + doffset = 0; + if (soffset > optlen) + soffset = optlen + 1; + soffset -= 4; + if (soffset > 3) { + memcpy(&faddr, &start[soffset-1], 4); + for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4) + memcpy(&dptr[doffset-1], &start[soffset-1], 4); + /* + * RFC1812 requires to fix illegal source routes. 
+ */ + if (memcmp(&skb->nh.iph->saddr, &start[soffset+3], 4) == 0) + doffset -= 4; + } + if (doffset > 3) { + memcpy(&start[doffset-1], &daddr, 4); + dopt->faddr = faddr; + dptr[0] = start[0]; + dptr[1] = doffset+3; + dptr[2] = 4; + dptr += doffset+3; + dopt->srr = dopt->optlen + sizeof(struct iphdr); + dopt->optlen += doffset+3; + dopt->is_strictroute = sopt->is_strictroute; + } + } + while (dopt->optlen & 3) { + *dptr++ = IPOPT_END; + dopt->optlen++; + } + return 0; +} + +/* + * Options "fragmenting", just fill options not + * allowed in fragments with NOOPs. + * Simple and stupid 8), but the most efficient way. + */ + +void ip_options_fragment(struct sk_buff * skb) +{ + unsigned char * optptr = skb->nh.raw; + struct ip_options * opt = &(IPCB(skb)->opt); + int l = opt->optlen; + int optlen; + + while (l > 0) { + switch (*optptr) { + case IPOPT_END: + return; + case IPOPT_NOOP: + l--; + optptr++; + continue; + } + optlen = optptr[1]; + if (optlen<2 || optlen>l) + return; + if (!IPOPT_COPIED(*optptr)) + memset(optptr, IPOPT_NOOP, optlen); + l -= optlen; + optptr += optlen; + } + opt->ts = 0; + opt->rr = 0; + opt->rr_needaddr = 0; + opt->ts_needaddr = 0; + opt->ts_needtime = 0; + return; +} + +/* + * Verify options and fill pointers in struct options. + * Caller should clear *opt, and set opt->data. + * If opt == NULL, then skb->data should point to IP header. + */ + +int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) +{ + int l; + unsigned char * iph; + unsigned char * optptr; + int optlen; + unsigned char * pp_ptr = NULL; + struct rtable *rt = skb ? (struct rtable*)skb->dst : NULL; + + if (!opt) { + opt = &(IPCB(skb)->opt); + memset(opt, 0, sizeof(struct ip_options)); + iph = skb->nh.raw; + opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr); + optptr = iph + sizeof(struct iphdr); + opt->is_data = 0; + } else { + optptr = opt->is_data ? 
opt->__data : (unsigned char*)&(skb->nh.iph[1]); + iph = optptr - sizeof(struct iphdr); + } + + for (l = opt->optlen; l > 0; ) { + switch (*optptr) { + case IPOPT_END: + for (optptr++, l--; l>0; l--) { + if (*optptr != IPOPT_END) { + *optptr = IPOPT_END; + opt->is_changed = 1; + } + } + goto eol; + case IPOPT_NOOP: + l--; + optptr++; + continue; + } + optlen = optptr[1]; + if (optlen<2 || optlen>l) { + pp_ptr = optptr; + goto error; + } + switch (*optptr) { + case IPOPT_SSRR: + case IPOPT_LSRR: + if (optlen < 3) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] < 4) { + pp_ptr = optptr + 2; + goto error; + } + /* NB: cf RFC-1812 5.2.4.1 */ + if (opt->srr) { + pp_ptr = optptr; + goto error; + } + if (!skb) { + if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) { + pp_ptr = optptr + 1; + goto error; + } + memcpy(&opt->faddr, &optptr[3], 4); + if (optlen > 7) + memmove(&optptr[3], &optptr[7], optlen-7); + } + opt->is_strictroute = (optptr[0] == IPOPT_SSRR); + opt->srr = optptr - iph; + break; + case IPOPT_RR: + if (opt->rr) { + pp_ptr = optptr; + goto error; + } + if (optlen < 3) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] < 4) { + pp_ptr = optptr + 2; + goto error; + } + if (optptr[2] <= optlen) { + if (optptr[2]+3 > optlen) { + pp_ptr = optptr + 2; + goto error; + } + if (skb) { + memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); + opt->is_changed = 1; + } + optptr[2] += 4; + opt->rr_needaddr = 1; + } + opt->rr = optptr - iph; + break; + case IPOPT_TIMESTAMP: + if (opt->ts) { + pp_ptr = optptr; + goto error; + } + if (optlen < 4) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] < 5) { + pp_ptr = optptr + 2; + goto error; + } + if (optptr[2] <= optlen) { + __u32 * timeptr = NULL; + if (optptr[2]+3 > optptr[1]) { + pp_ptr = optptr + 2; + goto error; + } + switch (optptr[3]&0xF) { + case IPOPT_TS_TSONLY: + opt->ts = optptr - iph; + if (skb) + timeptr = (__u32*)&optptr[optptr[2]-1]; + opt->ts_needtime = 1; + optptr[2] += 4; + break; + case IPOPT_TS_TSANDADDR: + if (optptr[2]+7 > optptr[1]) { + pp_ptr = optptr + 2; + goto error; + } + opt->ts = optptr - iph; + if (skb) { + memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); + timeptr = (__u32*)&optptr[optptr[2]+3]; + } + opt->ts_needaddr = 1; + opt->ts_needtime = 1; + optptr[2] += 8; + break; + case IPOPT_TS_PRESPEC: + if (optptr[2]+7 > optptr[1]) { + pp_ptr = optptr + 2; + goto error; + } + opt->ts = optptr - iph; + { + u32 addr; + memcpy(&addr, &optptr[optptr[2]-1], 4); + if (inet_addr_type(addr) == RTN_UNICAST) + break; + if (skb) + timeptr = (__u32*)&optptr[optptr[2]+3]; + } + opt->ts_needtime = 1; + optptr[2] += 8; + break; + default: + if (!skb && !capable(CAP_NET_RAW)) { + pp_ptr = optptr + 3; + goto error; + } + break; + } + if (timeptr) { + struct timeval tv; + __u32 midtime; + do_gettimeofday(&tv); + midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000); + memcpy(timeptr, &midtime, sizeof(__u32)); + opt->is_changed = 1; + } + } else { + unsigned overflow = optptr[3]>>4; + if (overflow == 15) { + pp_ptr = optptr + 3; + goto error; + } + opt->ts = optptr - iph; + if (skb) { + optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4); + opt->is_changed = 1; + } + } + break; + case IPOPT_RA: + if (optlen < 4) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] == 0 && optptr[3] == 0) + opt->router_alert = optptr - iph; + break; + case IPOPT_SEC: + case IPOPT_SID: + default: + if (!skb && !capable(CAP_NET_RAW)) { + pp_ptr = optptr; + goto error; + } + break; + } + l -= optlen; + optptr += optlen; + 
} + +eol: + if (!pp_ptr) + return 0; + +error: + if (skb) { + icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24)); + } + return -EINVAL; +} + + +/* + * Undo all the changes done by ip_options_compile(). + */ + +void ip_options_undo(struct ip_options * opt) +{ + if (opt->srr) { + unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr); + memmove(optptr+7, optptr+3, optptr[1]-7); + memcpy(optptr+3, &opt->faddr, 4); + } + if (opt->rr_needaddr) { + unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr); + optptr[2] -= 4; + memset(&optptr[optptr[2]-1], 0, 4); + } + if (opt->ts) { + unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr); + if (opt->ts_needtime) { + optptr[2] -= 4; + memset(&optptr[optptr[2]-1], 0, 4); + if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC) + optptr[2] -= 4; + } + if (opt->ts_needaddr) { + optptr[2] -= 4; + memset(&optptr[optptr[2]-1], 0, 4); + } + } +} + +int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user) +{ + struct ip_options *opt; + + opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL); + if (!opt) + return -ENOMEM; + memset(opt, 0, sizeof(struct ip_options)); + if (optlen) { + if (user) { + if (copy_from_user(opt->__data, data, optlen)) + return -EFAULT; + } else + memcpy(opt->__data, data, optlen); + } + while (optlen & 3) + opt->__data[optlen++] = IPOPT_END; + opt->optlen = optlen; + opt->is_data = 1; + opt->is_setbyuser = 1; + if (optlen && ip_options_compile(opt, NULL)) { + kfree_s(opt, sizeof(struct ip_options) + optlen); + return -EINVAL; + } + *optp = opt; + return 0; +} + +void ip_forward_options(struct sk_buff *skb) +{ + struct ip_options * opt = &(IPCB(skb)->opt); + unsigned char * optptr; + struct rtable *rt = (struct rtable*)skb->dst; + unsigned char *raw = skb->nh.raw; + + if (opt->rr_needaddr) { + optptr = (unsigned char *)raw + opt->rr; + ip_rt_get_source(&optptr[optptr[2]-5], rt); + opt->is_changed = 1; + } + if (opt->srr_is_hit) { + int srrptr, srrspace; + + optptr = raw + opt->srr; + + for ( srrptr=optptr[2], srrspace = optptr[1]; + srrptr <= srrspace; + srrptr += 4 + ) { + if (srrptr + 3 > srrspace) + break; + if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0) + break; + } + if (srrptr + 3 <= srrspace) { + opt->is_changed = 1; + ip_rt_get_source(&optptr[srrptr-1], rt); + skb->nh.iph->daddr = rt->rt_dst; + optptr[2] = srrptr+4; + } else + printk(KERN_CRIT "ip_forward(): Argh! 
Destination lost!\n"); + if (opt->ts_needaddr) { + optptr = raw + opt->ts; + ip_rt_get_source(&optptr[optptr[2]-9], rt); + opt->is_changed = 1; + } + } + if (opt->is_changed) { + opt->is_changed = 0; + ip_send_check(skb->nh.iph); + } +} + +int ip_options_rcv_srr(struct sk_buff *skb) +{ + struct ip_options *opt = &(IPCB(skb)->opt); + int srrspace, srrptr; + u32 nexthop; + struct iphdr *iph = skb->nh.iph; + unsigned char * optptr = skb->nh.raw + opt->srr; + struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt2; + int err; + + if (!opt->srr) + return 0; + + if (skb->pkt_type != PACKET_HOST) + return -EINVAL; + if (rt->rt_type == RTN_UNICAST) { + if (!opt->is_strictroute) + return 0; + icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24)); + return -EINVAL; + } + if (rt->rt_type != RTN_LOCAL) + return -EINVAL; + + for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { + if (srrptr + 3 > srrspace) { + icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24)); + return -EINVAL; + } + memcpy(&nexthop, &optptr[srrptr-1], 4); + + rt = (struct rtable*)skb->dst; + skb->dst = NULL; + err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); + rt2 = (struct rtable*)skb->dst; + if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { + ip_rt_put(rt2); + skb->dst = &rt->u.dst; + return -EINVAL; + } + ip_rt_put(rt); + if (rt2->rt_type != RTN_LOCAL) + break; + /* Superfast 8) loopback forward */ + memcpy(&iph->daddr, &optptr[srrptr-1], 4); + opt->is_changed = 1; + } + if (srrptr <= srrspace) { + opt->srr_is_hit = 1; + opt->is_changed = 1; + } + return 0; +} diff --git a/pfinet/linux-src/net/ipv4/ip_output.c b/pfinet/linux-src/net/ipv4/ip_output.c new file mode 100644 index 00000000..44d63557 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_output.c @@ -0,0 +1,992 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The Internet Protocol (IP) output module. + * + * Version: $Id: ip_output.c,v 1.67 1999/03/25 00:43:00 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Donald Becker, <becker@super.org> + * Alan Cox, <Alan.Cox@linux.org> + * Richard Underwood + * Stefan Becker, <stefanb@yello.ping.de> + * Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * + * See ip_input.c for original log + * + * Fixes: + * Alan Cox : Missing nonblock feature in ip_build_xmit. + * Mike Kilburn : htons() missing in ip_build_xmit. + * Bradford Johnson: Fix faulty handling of some frames when + * no route is found. + * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit + * (in case if packet not accepted by + * output firewall rules) + * Mike McLagan : Routing by source + * Alexey Kuznetsov: use new route cache + * Andi Kleen: Fix broken PMTU recovery and remove + * some redundant tests. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Andi Kleen : Replace ip_reply with ip_send_reply. + * Andi Kleen : Split fast and slow ip_build_xmit path + * for decreased register pressure on x86 + * and more readibility. + * Marc Boucher : When call_out_firewall returns FW_QUEUE, + * silently drop skb instead of failing with -EPERM. 
+ */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/config.h> + +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/init.h> + +#include <net/snmp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/arp.h> +#include <net/icmp.h> +#include <net/raw.h> +#include <net/checksum.h> +#include <linux/igmp.h> +#include <linux/ip_fw.h> +#include <linux/firewall.h> +#include <linux/mroute.h> +#include <linux/netlink.h> + +/* + * Shall we try to damage output packets if routing dev changes? + */ + +int sysctl_ip_dynaddr = 0; + + +int ip_id_count = 0; + +/* Generate a checksum for an outgoing IP datagram. */ +__inline__ void ip_send_check(struct iphdr *iph) +{ + iph->check = 0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); +} + +/* + * Add an ip header to a skbuff and send it out. + */ +void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, + u32 saddr, u32 daddr, struct ip_options *opt) +{ + struct rtable *rt = (struct rtable *)skb->dst; + struct iphdr *iph; + struct device *dev; + + /* Build the IP header. */ + if (opt) + iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen); + else + iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr)); + + iph->version = 4; + iph->ihl = 5; + iph->tos = sk->ip_tos; + iph->frag_off = 0; + if (ip_dont_fragment(sk, &rt->u.dst)) + iph->frag_off |= htons(IP_DF); + iph->ttl = sk->ip_ttl; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + iph->protocol = sk->protocol; + iph->tot_len = htons(skb->len); + iph->id = htons(ip_id_count++); + skb->nh.iph = iph; + + if (opt && opt->optlen) { + iph->ihl += opt->optlen>>2; + ip_options_build(skb, opt, daddr, rt, 0); + } + + dev = rt->u.dst.dev; + +#ifdef CONFIG_FIREWALL + /* Now we have no better mechanism to notify about error. */ + switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) { + case FW_REJECT: + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + /* Fall thru... */ + case FW_BLOCK: + case FW_QUEUE: + kfree_skb(skb); + return; + } +#endif + + ip_send_check(iph); + + /* Send it out. */ + skb->dst->output(skb); + return; +} + +int __ip_finish_output(struct sk_buff *skb) +{ + return ip_finish_output(skb); +} + +int ip_mc_output(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct rtable *rt = (struct rtable*)skb->dst; + struct device *dev = rt->u.dst.dev; + + /* + * If the indicated interface is up and running, send the packet. + */ + + ip_statistics.IpOutRequests++; +#ifdef CONFIG_IP_ROUTE_NAT + if (rt->rt_flags & RTCF_NAT) + ip_do_nat(skb); +#endif + + skb->dev = dev; + skb->protocol = __constant_htons(ETH_P_IP); + + /* + * Multicasts are looped back for other local users + */ + + if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->ip_mc_loop)) { +#ifdef CONFIG_IP_MROUTE + /* Small optimization: do not loopback not local frames, + which returned after forwarding; they will be dropped + by ip_mr_input in any case. + Note, that local frames are looped back to be delivered + to local recipients. + + This check is duplicated in ip_mr_input at the moment. 
+ */ + if ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED)) +#endif + dev_loopback_xmit(skb); + + /* Multicasts with ttl 0 must not go beyond the host */ + + if (skb->nh.iph->ttl == 0) { + kfree_skb(skb); + return 0; + } + } + + if (rt->rt_flags&RTCF_BROADCAST) + dev_loopback_xmit(skb); + + return ip_finish_output(skb); +} + +int ip_output(struct sk_buff *skb) +{ +#ifdef CONFIG_IP_ROUTE_NAT + struct rtable *rt = (struct rtable*)skb->dst; +#endif + + ip_statistics.IpOutRequests++; + +#ifdef CONFIG_IP_ROUTE_NAT + if (rt->rt_flags&RTCF_NAT) + ip_do_nat(skb); +#endif + + return ip_finish_output(skb); +} + +/* Queues a packet to be sent, and starts the transmitter if necessary. + * This routine also needs to put in the total length and compute the + * checksum. We use to do this in two stages, ip_build_header() then + * this, but that scheme created a mess when routes disappeared etc. + * So we do it all here, and the TCP send engine has been changed to + * match. (No more unroutable FIN disasters, etc. wheee...) This will + * most likely make other reliable transport layers above IP easier + * to implement under Linux. + */ +void ip_queue_xmit(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct ip_options *opt = sk->opt; + struct rtable *rt; + struct device *dev; + struct iphdr *iph; + unsigned int tot_len; + + /* Make sure we can route this packet. */ + rt = (struct rtable *) sk->dst_cache; + if(rt == NULL || rt->u.dst.obsolete) { + u32 daddr; + + sk->dst_cache = NULL; + ip_rt_put(rt); + + /* Use correct destination address if we have options. */ + daddr = sk->daddr; + if(opt && opt->srr) + daddr = opt->faddr; + + /* If this fails, retransmit mechanism of transport layer will + * keep trying until route appears or the connection times itself + * out. + */ + if(ip_route_output(&rt, daddr, sk->saddr, + RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute, + sk->bound_dev_if)) + goto drop; + sk->dst_cache = &rt->u.dst; + } + if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + goto no_route; + + /* We have a route, so grab a reference. */ + skb->dst = dst_clone(sk->dst_cache); + + /* OK, we know where to send it, allocate and build IP header. */ + iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + iph->version = 4; + iph->ihl = 5; + iph->tos = sk->ip_tos; + iph->frag_off = 0; + iph->ttl = sk->ip_ttl; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + iph->protocol = sk->protocol; + skb->nh.iph = iph; + /* Transport layer set skb->h.foo itself. */ + + if(opt && opt->optlen) { + iph->ihl += opt->optlen >> 2; + ip_options_build(skb, opt, sk->daddr, rt, 0); + } + + tot_len = skb->len; + iph->tot_len = htons(tot_len); + iph->id = htons(ip_id_count++); + + dev = rt->u.dst.dev; + +#ifdef CONFIG_FIREWALL + /* Now we have no better mechanism to notify about error. */ + switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) { + case FW_REJECT: + start_bh_atomic(); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + end_bh_atomic(); + /* Fall thru... */ + case FW_BLOCK: + case FW_QUEUE: + goto drop; + } +#endif + + /* This can happen when the transport layer has segments queued + * with a cached route, and by the time we get here things are + * re-routed to a device with a different MTU than the original + * device. Sick, but we must cover it. 
+ */ + if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) { + struct sk_buff *skb2; + + skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15); + kfree_skb(skb); + if (skb2 == NULL) + return; + if (sk) + skb_set_owner_w(skb, sk); + skb = skb2; + iph = skb->nh.iph; + } + + /* Do we need to fragment. Again this is inefficient. We + * need to somehow lock the original buffer and use bits of it. + */ + if (tot_len > rt->u.dst.pmtu) + goto fragment; + + if (ip_dont_fragment(sk, &rt->u.dst)) + iph->frag_off |= __constant_htons(IP_DF); + + /* Add an IP checksum. */ + ip_send_check(iph); + + skb->priority = sk->priority; + skb->dst->output(skb); + return; + +fragment: + if (ip_dont_fragment(sk, &rt->u.dst) && + tot_len > (iph->ihl<<2) + sizeof(struct tcphdr)+16) { + /* Reject packet ONLY if TCP might fragment + it itself, if were careful enough. + Test is not precise (f.e. it does not take sacks + into account). Actually, tcp should make it. --ANK (980801) + */ + iph->frag_off |= __constant_htons(IP_DF); + NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big to self\n")); + + /* icmp_send is not reenterable, so that bh_atomic... --ANK */ + start_bh_atomic(); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(rt->u.dst.pmtu)); + end_bh_atomic(); + goto drop; + } + ip_fragment(skb, skb->dst->output); + return; + +no_route: + sk->dst_cache = NULL; + ip_rt_put(rt); + ip_statistics.IpOutNoRoutes++; + /* Fall through... */ +drop: + kfree_skb(skb); +} + +/* + * Build and send a packet, with as little as one copy + * + * Doesn't care much about ip options... option length can be + * different for fragment at 0 and other fragments. + * + * Note that the fragment at the highest offset is sent first, + * so the getfrag routine can fill in the TCP/UDP checksum header + * field in the last fragment it sends... actually it also helps + * the reassemblers, they can put most packets in at the head of + * the fragment queue, and they know the total size in advance. This + * last feature will measurably improve the Linux fragment handler one + * day. + * + * The callback has five args, an arbitrary pointer (copy of frag), + * the source IP address (may depend on the routing table), the + * destination address (char *), the offset to copy from, and the + * length to be copied. + */ + +int ip_build_xmit_slow(struct sock *sk, + int getfrag (const void *, + char *, + unsigned int, + unsigned int), + const void *frag, + unsigned length, + struct ipcm_cookie *ipc, + struct rtable *rt, + int flags) +{ + unsigned int fraglen, maxfraglen, fragheaderlen; + int err; + int offset, mf; + int mtu; + unsigned short id; + + int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15; + int nfrags=0; + struct ip_options *opt = ipc->opt; + int df = 0; + + mtu = rt->u.dst.pmtu; + if (ip_dont_fragment(sk, &rt->u.dst)) + df = htons(IP_DF); + + length -= sizeof(struct iphdr); + + if (opt) { + fragheaderlen = sizeof(struct iphdr) + opt->optlen; + maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen; + } else { + fragheaderlen = sizeof(struct iphdr); + + /* + * Fragheaderlen is the size of 'overhead' on each buffer. Now work + * out the size of the frames to send. + */ + + maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen; + } + + if (length + fragheaderlen > 0xFFFF) { + ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu); + return -EMSGSIZE; + } + + /* + * Start at the end of the frame by handling the remainder. 
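+ *
+ * Worked example (sizes invented): with a 1500 byte path MTU and no IP
+ * options, maxfraglen - fragheaderlen is 1480, so a 4000 byte payload
+ * gives offset = 4000 - (4000 % 1480) = 2960 and a first buffer of
+ * 1040 + 20 = 1060 bytes -- the tail fragment really is built and sent
+ * first.  When the payload is an exact multiple of 1480, the fix-up a
+ * few lines below steps offset back by one full fragment instead.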
+ */ + + offset = length - (length % (maxfraglen - fragheaderlen)); + + /* + * Amount of memory to allocate for final fragment. + */ + + fraglen = length - offset + fragheaderlen; + + if (length-offset==0) { + fraglen = maxfraglen; + offset -= maxfraglen-fragheaderlen; + } + + + /* + * The last fragment will not have MF (more fragments) set. + */ + + mf = 0; + + /* + * Don't fragment packets for path mtu discovery. + */ + + if (offset > 0 && df) { + ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu); + return(-EMSGSIZE); + } + + /* + * Lock the device lists. + */ + + dev_lock_list(); + + /* + * Get an identifier + */ + + id = htons(ip_id_count++); + + /* + * Begin outputting the bytes. + */ + + do { + char *data; + struct sk_buff * skb; + + /* + * Get the memory we require with some space left for alignment. + */ + + skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &err); + if (skb == NULL) + goto error; + + /* + * Fill in the control structures + */ + + skb->priority = sk->priority; + skb->dst = dst_clone(&rt->u.dst); + skb_reserve(skb, hh_len); + + /* + * Find where to start putting bytes. + */ + + data = skb_put(skb, fraglen); + skb->nh.iph = (struct iphdr *)data; + + /* + * Only write IP header onto non-raw packets + */ + + { + struct iphdr *iph = (struct iphdr *)data; + + iph->version = 4; + iph->ihl = 5; + if (opt) { + iph->ihl += opt->optlen>>2; + ip_options_build(skb, opt, + ipc->addr, rt, offset); + } + iph->tos = sk->ip_tos; + iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4); + iph->id = id; + iph->frag_off = htons(offset>>3); + iph->frag_off |= mf|df; + if (rt->rt_type == RTN_MULTICAST) + iph->ttl = sk->ip_mc_ttl; + else + iph->ttl = sk->ip_ttl; + iph->protocol = sk->protocol; + iph->check = 0; + iph->saddr = rt->rt_src; + iph->daddr = rt->rt_dst; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + data += iph->ihl*4; + + /* + * Any further fragments will have MF set. + */ + + mf = htons(IP_MF); + } + + /* + * User data callback + */ + + if (getfrag(frag, data, offset, fraglen-fragheaderlen)) { + err = -EFAULT; + kfree_skb(skb); + goto error; + } + + offset -= (maxfraglen-fragheaderlen); + fraglen = maxfraglen; + + nfrags++; + +#ifdef CONFIG_FIREWALL + switch (call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb)) { + case FW_QUEUE: + kfree_skb(skb); + continue; + case FW_BLOCK: + case FW_REJECT: + kfree_skb(skb); + err = -EPERM; + goto error; + } +#endif + + err = -ENETDOWN; + if (rt->u.dst.output(skb)) + goto error; + } while (offset >= 0); + + if (nfrags>1) + ip_statistics.IpFragCreates += nfrags; + dev_unlock_list(); + return 0; + +error: + ip_statistics.IpOutDiscards++; + if (nfrags>1) + ip_statistics.IpFragCreates += nfrags; + dev_unlock_list(); + return err; +} + + +/* + * Fast path for unfragmented packets. + */ +int ip_build_xmit(struct sock *sk, + int getfrag (const void *, + char *, + unsigned int, + unsigned int), + const void *frag, + unsigned length, + struct ipcm_cookie *ipc, + struct rtable *rt, + int flags) +{ + int err; + struct sk_buff *skb; + int df; + struct iphdr *iph; + + /* + * Try the simple case first. This leaves fragmented frames, and by + * choice RAW frames within 20 bytes of maximum size(rare) to the long path + */ + + if (!sk->ip_hdrincl) { + length += sizeof(struct iphdr); + + /* + * Check for slow path. 
+ */ + if (length > rt->u.dst.pmtu || ipc->opt != NULL) + return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags); + } else { + if (length > rt->u.dst.dev->mtu) { + ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu); + return -EMSGSIZE; + } + } + + /* + * Do path mtu discovery if needed. + */ + df = 0; + if (ip_dont_fragment(sk, &rt->u.dst)) + df = htons(IP_DF); + + /* + * Fast path for unfragmented frames without options. + */ + { + int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15; + + skb = sock_alloc_send_skb(sk, length+hh_len+15, + 0, flags&MSG_DONTWAIT, &err); + if(skb==NULL) + goto error; + skb_reserve(skb, hh_len); + } + + skb->priority = sk->priority; + skb->dst = dst_clone(&rt->u.dst); + + skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length); + + dev_lock_list(); + + if(!sk->ip_hdrincl) { + iph->version=4; + iph->ihl=5; + iph->tos=sk->ip_tos; + iph->tot_len = htons(length); + iph->id=htons(ip_id_count++); + iph->frag_off = df; + iph->ttl=sk->ip_mc_ttl; + if (rt->rt_type != RTN_MULTICAST) + iph->ttl=sk->ip_ttl; + iph->protocol=sk->protocol; + iph->saddr=rt->rt_src; + iph->daddr=rt->rt_dst; + iph->check=0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4); + } + else + err = getfrag(frag, (void *)iph, 0, length); + + dev_unlock_list(); + + if (err) + goto error_fault; + +#ifdef CONFIG_FIREWALL + switch (call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb)) { + case FW_QUEUE: + kfree_skb(skb); + return 0; + case FW_BLOCK: + case FW_REJECT: + kfree_skb(skb); + err = -EPERM; + goto error; + } +#endif + + return rt->u.dst.output(skb); + +error_fault: + err = -EFAULT; + kfree_skb(skb); +error: + ip_statistics.IpOutDiscards++; + return err; +} + + + +/* + * This IP datagram is too large to be sent in one piece. Break it up into + * smaller pieces (each of size equal to IP header plus + * a block of the data of the original IP data part) that will yet fit in a + * single device frame, and queue such a frame for sending. + * + * Yes this is inefficient, feel free to submit a quicker one. + */ + +void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) +{ + struct iphdr *iph; + unsigned char *raw; + unsigned char *ptr; + struct device *dev; + struct sk_buff *skb2; + unsigned int mtu, hlen, left, len; + int offset; + int not_last_frag; + struct rtable *rt = (struct rtable*)skb->dst; + + dev = rt->u.dst.dev; + + /* + * Point into the IP datagram header. + */ + + raw = skb->nh.raw; + iph = (struct iphdr*)raw; + + /* + * Setup starting values. + */ + + hlen = iph->ihl * 4; + left = ntohs(iph->tot_len) - hlen; /* Space per frame */ + mtu = rt->u.dst.pmtu - hlen; /* Size of data space */ + ptr = raw + hlen; /* Where to start from */ + + /* + * The protocol doesn't seem to say what to do in the case that the + * frame + options doesn't fit the mtu. As it used to fall down dead + * in this case we were fortunate it didn't happen + * + * It is impossible, because mtu>=68. --ANK (980801) + */ + +#ifdef CONFIG_NET_PARANOIA + if (mtu<8) + goto fail; +#endif + + /* + * Fragment the datagram. + */ + + offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; + not_last_frag = iph->frag_off & htons(IP_MF); + + /* + * Keep copying data until we run out. 
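+ *
+ * For instance (sizes invented): a 4020 byte datagram with a 20 byte
+ * header leaving over a 1500 byte path has mtu = 1480 here, so the loop
+ * emits fragments carrying 1480, 1480 and 1040 data bytes at offsets
+ * 0, 1480 and 2960 -- stored as 0, 185 and 370 in the 8-byte units of
+ * frag_off -- with IP_MF set on all but the last (assuming the original
+ * datagram was not itself a fragment).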
+ */ + + while(left > 0) { + len = left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > mtu) + len = mtu; + /* IF: we are not sending upto and including the packet end + then align the next start on an eight byte boundary */ + if (len < left) { + len &= ~7; + } + /* + * Allocate buffer. + */ + + if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) { + NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); + goto fail; + } + + /* + * Set up data on packet + */ + + skb2->pkt_type = skb->pkt_type; + skb2->priority = skb->priority; + skb_reserve(skb2, (dev->hard_header_len+15)&~15); + skb_put(skb2, len + hlen); + skb2->nh.raw = skb2->data; + skb2->h.raw = skb2->data + hlen; + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + skb2->dst = dst_clone(skb->dst); + + /* + * Copy the packet header into the new buffer. + */ + + memcpy(skb2->nh.raw, raw, hlen); + + /* + * Copy a block of the IP datagram. + */ + memcpy(skb2->h.raw, ptr, len); + left -= len; + + /* + * Fill in the new header fields. + */ + iph = skb2->nh.iph; + iph->frag_off = htons((offset >> 3)); + + /* ANK: dirty, but effective trick. Upgrade options only if + * the segment to be fragmented was THE FIRST (otherwise, + * options are already fixed) and make it ONCE + * on the initial skb, so that all the following fragments + * will inherit fixed options. + */ + if (offset == 0) + ip_options_fragment(skb); + + /* + * Added AC : If we are fragmenting a fragment that's not the + * last fragment then keep MF on each bit + */ + if (left > 0 || not_last_frag) + iph->frag_off |= htons(IP_MF); + ptr += len; + offset += len; + + /* + * Put this fragment into the sending queue. + */ + + ip_statistics.IpFragCreates++; + + iph->tot_len = htons(len + hlen); + + ip_send_check(iph); + + output(skb2); + } + kfree_skb(skb); + ip_statistics.IpFragOKs++; + return; + +fail: + kfree_skb(skb); + ip_statistics.IpFragFails++; +} + +/* + * Fetch data from kernel space and fill in checksum if needed. + */ +static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset, + unsigned int fraglen) +{ + struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr; + u16 *pktp = (u16 *)to; + struct iovec *iov; + int len; + int hdrflag = 1; + + iov = &dp->iov[0]; + if (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + hdrflag = 0; + } + len = iov->iov_len - offset; + if (fraglen > len) { /* overlapping. */ + dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len, + dp->csum); + offset = 0; + fraglen -= len; + to += len; + iov++; + } + + dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen, + dp->csum); + + if (hdrflag && dp->csumoffset) + *(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */ + return 0; +} + +/* + * Generic function to send a packet as reply to another packet. + * Used to send TCP resets so far. ICMP should use this function too. + * + * Should run single threaded per socket because it uses the sock + * structure to pass arguments. 
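+ *
+ * A rough caller sketch, modelled on the TCP reset path (rst_th and
+ * ctl_sk are placeholder names, not the exact identifiers used there):
+ *
+ *	struct ip_reply_arg arg;
+ *	arg.iov[0].iov_base = (unsigned char *) &rst_th;
+ *	arg.iov[0].iov_len  = sizeof(rst_th);
+ *	arg.csum = csum_partial((char *) &rst_th, sizeof(rst_th), 0);
+ *	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+ *	ip_send_reply(ctl_sk, skb, &arg, sizeof(rst_th));
+ *
+ * csumoffset is counted in 16 bit words because ip_reply_glue_bits()
+ * above patches the checksum through a u16 pointer.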
+ */ +void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, + unsigned int len) +{ + struct { + struct ip_options opt; + char data[40]; + } replyopts; + struct ipcm_cookie ipc; + u32 daddr; + struct rtable *rt = (struct rtable*)skb->dst; + + if (ip_options_echo(&replyopts.opt, skb)) + return; + + sk->ip_tos = skb->nh.iph->tos; + sk->priority = skb->priority; + sk->protocol = skb->nh.iph->protocol; + + daddr = ipc.addr = rt->rt_src; + ipc.opt = &replyopts.opt; + + if (ipc.opt->srr) + daddr = replyopts.opt.faddr; + if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0)) + return; + + /* And let IP do all the hard work. */ + ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT); + ip_rt_put(rt); +} + +/* + * IP protocol layer initialiser + */ + +static struct packet_type ip_packet_type = +{ + __constant_htons(ETH_P_IP), + NULL, /* All devices */ + ip_rcv, + NULL, + NULL, +}; + + + +#ifdef CONFIG_PROC_FS +#ifdef CONFIG_IP_MULTICAST +static struct proc_dir_entry proc_net_igmp = { + PROC_NET_IGMP, 4, "igmp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + ip_mc_procinfo +}; +#endif +#endif + +/* + * IP registers the packet type and then calls the subprotocol initialisers + */ + +__initfunc(void ip_init(void)) +{ + dev_add_pack(&ip_packet_type); + + ip_rt_init(); + +#ifdef CONFIG_PROC_FS +#ifdef CONFIG_IP_MULTICAST + proc_net_register(&proc_net_igmp); +#endif +#endif +} + diff --git a/pfinet/linux-src/net/ipv4/ip_sockglue.c b/pfinet/linux-src/net/ipv4/ip_sockglue.c new file mode 100644 index 00000000..369a6770 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ip_sockglue.c @@ -0,0 +1,739 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP to API glue. + * + * Version: $Id: ip_sockglue.c,v 1.42 1999/04/22 10:07:34 davem Exp $ + * + * Authors: see ip.c + * + * Fixes: + * Many : Split from ip.c , see ip.c for history. + * Martin Mares : TOS setting fixed. + * Alan Cox : Fixed a couple of oopses in Martin's + * TOS tweaks. + * Mike McLagan : Routing by source + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netdevice.h> +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/tcp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/igmp.h> +#include <linux/firewall.h> +#include <linux/ip_fw.h> +#include <linux/route.h> +#include <linux/mroute.h> +#include <net/route.h> +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#include <net/transp_v6.h> +#endif + +#ifdef CONFIG_IP_MASQUERADE +#include <linux/ip_masq.h> +#endif + +#include <linux/errqueue.h> +#include <asm/uaccess.h> + +#define MAX(a,b) ((a)>(b)?(a):(b)) + +#define IP_CMSG_PKTINFO 1 +#define IP_CMSG_TTL 2 +#define IP_CMSG_TOS 4 +#define IP_CMSG_RECVOPTS 8 +#define IP_CMSG_RETOPTS 16 + +/* + * SOL_IP control messages. 
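+ *
+ * Seen from user space these arrive as ancillary data once the matching
+ * socket option is enabled; a minimal, purely illustrative receiver
+ * (msg is assumed to have been set up with an iovec and a control buffer):
+ *
+ *	int on = 1;
+ *	struct cmsghdr *cmsg;
+ *	struct in_pktinfo *pi;
+ *
+ *	setsockopt(fd, SOL_IP, IP_PKTINFO, &on, sizeof(on));
+ *	recvmsg(fd, &msg, 0);
+ *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
+ *		if (cmsg->cmsg_level == SOL_IP && cmsg->cmsg_type == IP_PKTINFO)
+ *			pi = (struct in_pktinfo *) CMSG_DATA(cmsg);
+ *
+ * msg.msg_control must point at a buffer big enough for all enabled
+ * options, otherwise MSG_CTRUNC is reported.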
+ */ + +static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) +{ + struct in_pktinfo info; + struct rtable *rt = (struct rtable *)skb->dst; + + info.ipi_addr.s_addr = skb->nh.iph->daddr; + if (rt) { + info.ipi_ifindex = rt->rt_iif; + info.ipi_spec_dst.s_addr = rt->rt_spec_dst; + } else { + info.ipi_ifindex = 0; + info.ipi_spec_dst.s_addr = 0; + } + + put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); +} + +static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb) +{ + int ttl = skb->nh.iph->ttl; + put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl); +} + +static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb) +{ + put_cmsg(msg, SOL_IP, IP_TOS, 1, &skb->nh.iph->tos); +} + +static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) +{ + if (IPCB(skb)->opt.optlen == 0) + return; + + put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, skb->nh.iph+1); +} + + +void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) +{ + unsigned char optbuf[sizeof(struct ip_options) + 40]; + struct ip_options * opt = (struct ip_options*)optbuf; + + if (IPCB(skb)->opt.optlen == 0) + return; + + if (ip_options_echo(opt, skb)) { + msg->msg_flags |= MSG_CTRUNC; + return; + } + ip_options_undo(opt); + + put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data); +} + + +void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) +{ + unsigned flags = skb->sk->ip_cmsg_flags; + + /* Ordered by supposed usage frequency */ + if (flags & 1) + ip_cmsg_recv_pktinfo(msg, skb); + if ((flags>>=1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_ttl(msg, skb); + if ((flags>>=1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_tos(msg, skb); + if ((flags>>=1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_opts(msg, skb); + if ((flags>>=1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_retopts(msg, skb); +} + +int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc) +{ + int err; + struct cmsghdr *cmsg; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (cmsg->cmsg_len < sizeof(struct cmsghdr) || + (unsigned long)(((char*)cmsg - (char*)msg->msg_control) + + cmsg->cmsg_len) > msg->msg_controllen) { + return -EINVAL; + } + if (cmsg->cmsg_level != SOL_IP) + continue; + switch (cmsg->cmsg_type) { + case IP_RETOPTS: + err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0); + if (err) + return err; + break; + case IP_PKTINFO: + { + struct in_pktinfo *info; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) + return -EINVAL; + info = (struct in_pktinfo *)CMSG_DATA(cmsg); + ipc->oif = info->ipi_ifindex; + ipc->addr = info->ipi_spec_dst.s_addr; + break; + } + default: + return -EINVAL; + } + } + return 0; +} + + +/* Special input handler for packets catched by router alert option. + They are selected only by protocol field, and then processed likely + local ones; but only if someone wants them! Otherwise, router + not running rsvpd will kill RSVP. + + It is user level problem, what it will make with them. + I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), + but receiver should be enough clever f.e. to forward mtrace requests, + sent to multicast group to reach destination designated router. 
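+ *
+ * Usage sketch (illustrative, not lifted from a particular daemon): an
+ * RSVP daemon opens socket(AF_INET, SOCK_RAW, IPPROTO_RSVP) and sets
+ *
+ *	int on = 1;
+ *	setsockopt(fd, SOL_IP, IP_ROUTER_ALERT, &on, sizeof(on));
+ *
+ * which lands in ip_ra_control() below; from then on the input path hands
+ * that socket every packet of its protocol carrying the router alert
+ * option, whether or not this host is the final destination.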
+ */ +struct ip_ra_chain *ip_ra_chain; + +int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)) +{ + struct ip_ra_chain *ra, *new_ra, **rap; + + if (sk->type != SOCK_RAW || sk->num == IPPROTO_RAW) + return -EINVAL; + + new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + + for (rap = &ip_ra_chain; (ra=*rap) != NULL; rap = &ra->next) { + if (ra->sk == sk) { + if (on) { + if (new_ra) + kfree(new_ra); + return -EADDRINUSE; + } + *rap = ra->next; + synchronize_bh(); + + if (ra->destructor) + ra->destructor(sk); + kfree(ra); + return 0; + } + } + if (new_ra == NULL) + return -ENOBUFS; + new_ra->sk = sk; + new_ra->destructor = destructor; + + new_ra->next = ra; + wmb(); + *rap = new_ra; + + return 0; +} + +void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, + u16 port, u32 info, u8 *payload) +{ + struct sock_exterr_skb *serr; + + if (!sk->ip_recverr) + return; + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + return; + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_ICMP; + serr->ee.ee_type = skb->h.icmph->type; + serr->ee.ee_code = skb->h.icmph->code; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8*)&(((struct iphdr*)(skb->h.icmph+1))->daddr) - skb->nh.raw; + serr->port = port; + + skb->h.raw = payload; + skb_pull(skb, payload - skb->data); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +void ip_local_error(struct sock *sk, int err, u32 daddr, u16 port, u32 info) +{ + struct sock_exterr_skb *serr; + struct iphdr *iph; + struct sk_buff *skb; + + if (!sk->ip_recverr) + return; + + skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC); + if (!skb) + return; + + iph = (struct iphdr*)skb_put(skb, sizeof(struct iphdr)); + skb->nh.iph = iph; + iph->daddr = daddr; + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_type = 0; + serr->ee.ee_code = 0; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw; + serr->port = port; + + skb->h.raw = skb->tail; + skb_pull(skb, skb->tail - skb->data); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +/* + * Handle MSG_ERRQUEUE + */ +int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + struct sockaddr_in *sin; + struct { + struct sock_extended_err ee; + struct sockaddr_in offender; + } errhdr; + int err; + int copied; + + err = -EAGAIN; + skb = skb_dequeue(&sk->error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = memcpy_toiovec(msg->msg_iov, skb->data, copied); + if (err) + goto out_free_skb; + + serr = SKB_EXT_ERR(skb); + + sin = (struct sockaddr_in *)msg->msg_name; + if (sin) { + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = *(u32*)(skb->nh.raw + serr->addr_offset); + sin->sin_port = serr->port; + } + + memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); + sin = &errhdr.offender; + sin->sin_family = AF_UNSPEC; + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = skb->nh.iph->saddr; + if (sk->ip_cmsg_flags) + ip_cmsg_recv(msg, skb); + } + + put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr); + + /* Now we could try to dump offended packet options */ + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and 
regenerate socket error */ + sk->err = 0; + if ((skb2 = skb_peek(&sk->error_queue)) != NULL) { + sk->err = SKB_EXT_ERR(skb2)->ee.ee_errno; + sk->error_report(sk); + } + +out_free_skb: + kfree_skb(skb); +out: + return err; +} + + +/* + * Socket option code for IP. This is the end of the line after any TCP,UDP etc options on + * an IP socket. + * + * We implement IP_TOS (type of service), IP_TTL (time to live). + */ + +int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) +{ + int val=0,err; +#if defined(CONFIG_IP_FIREWALL) + char tmp_fw[MAX(sizeof(struct ip_fwtest),sizeof(struct ip_fwnew))]; +#endif + if(optlen>=sizeof(int)) { + if(get_user(val, (int *) optval)) + return -EFAULT; + } else if(optlen>=sizeof(char)) { + unsigned char ucval; + if(get_user(ucval, (unsigned char *) optval)) + return -EFAULT; + val = (int)ucval; + } + /* If optlen==0, it is equivalent to val == 0 */ + + if(level!=SOL_IP) + return -ENOPROTOOPT; +#ifdef CONFIG_IP_MROUTE + if(optname>=MRT_BASE && optname <=MRT_BASE+10) + { + return ip_mroute_setsockopt(sk,optname,optval,optlen); + } +#endif + + switch(optname) + { + case IP_OPTIONS: + { + struct ip_options * opt = NULL; + if (optlen > 40 || optlen < 0) + return -EINVAL; + err = ip_options_get(&opt, optval, optlen, 1); + if (err) + return err; + lock_sock(sk); + if (sk->type == SOCK_STREAM) { + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (sk->family == PF_INET || + ((tcp_connected(sk->state) || sk->state == TCP_SYN_SENT) + && sk->daddr != LOOPBACK4_IPV6)) { +#endif + if (opt) + tp->ext_header_len = opt->optlen; + tcp_sync_mss(sk, tp->pmtu_cookie); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + } +#endif + } + opt = xchg(&sk->opt, opt); + release_sock(sk); + if (opt) + kfree_s(opt, sizeof(struct ip_options) + opt->optlen); + return 0; + } + case IP_PKTINFO: + if (val) + sk->ip_cmsg_flags |= IP_CMSG_PKTINFO; + else + sk->ip_cmsg_flags &= ~IP_CMSG_PKTINFO; + return 0; + case IP_RECVTTL: + if (val) + sk->ip_cmsg_flags |= IP_CMSG_TTL; + else + sk->ip_cmsg_flags &= ~IP_CMSG_TTL; + return 0; + case IP_RECVTOS: + if (val) + sk->ip_cmsg_flags |= IP_CMSG_TOS; + else + sk->ip_cmsg_flags &= ~IP_CMSG_TOS; + return 0; + case IP_RECVOPTS: + if (val) + sk->ip_cmsg_flags |= IP_CMSG_RECVOPTS; + else + sk->ip_cmsg_flags &= ~IP_CMSG_RECVOPTS; + return 0; + case IP_RETOPTS: + if (val) + sk->ip_cmsg_flags |= IP_CMSG_RETOPTS; + else + sk->ip_cmsg_flags &= ~IP_CMSG_RETOPTS; + return 0; + case IP_TOS: /* This sets both TOS and Precedence */ + /* Reject setting of unused bits */ + if (val & ~(IPTOS_TOS_MASK|IPTOS_PREC_MASK)) + return -EINVAL; + if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP && + !capable(CAP_NET_ADMIN)) + return -EPERM; + if (sk->ip_tos != val) { + lock_sock(sk); + sk->ip_tos=val; + sk->priority = rt_tos2priority(val); + dst_release(xchg(&sk->dst_cache, NULL)); + release_sock(sk); + } + return 0; + case IP_TTL: + if (optlen<1) + return -EINVAL; + if(val==-1) + val = ip_statistics.IpDefaultTTL; + if(val<1||val>255) + return -EINVAL; + sk->ip_ttl=val; + return 0; + case IP_HDRINCL: + if(sk->type!=SOCK_RAW) + return -ENOPROTOOPT; + sk->ip_hdrincl=val?1:0; + return 0; + case IP_MTU_DISCOVER: + if (val<0 || val>2) + return -EINVAL; + sk->ip_pmtudisc = val; + return 0; + case IP_RECVERR: + sk->ip_recverr = !!val; + if (!val) + skb_queue_purge(&sk->error_queue); + return 0; + case IP_MULTICAST_TTL: + if (optlen<1) + return -EINVAL; + if (val==-1) + val = 1; + if (val < 0 || val > 
255) + return -EINVAL; + sk->ip_mc_ttl=val; + return 0; + case IP_MULTICAST_LOOP: + if (optlen<1) + return -EINVAL; + sk->ip_mc_loop = val ? 1 : 0; + return 0; + case IP_MULTICAST_IF: + { + struct ip_mreqn mreq; + struct device *dev = NULL; + + /* + * Check the arguments are allowable + */ + + if (optlen >= sizeof(struct ip_mreqn)) { + if (copy_from_user(&mreq,optval,sizeof(mreq))) + return -EFAULT; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (optlen >= sizeof(struct in_addr) && + copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr))) + return -EFAULT; + } + + if (!mreq.imr_ifindex) { + if (mreq.imr_address.s_addr == INADDR_ANY) { + sk->ip_mc_index = 0; + sk->ip_mc_addr = 0; + return 0; + } + dev = ip_dev_find(mreq.imr_address.s_addr); + } else + dev = dev_get_by_index(mreq.imr_ifindex); + + if (!dev) + return -EADDRNOTAVAIL; + + if (sk->bound_dev_if && dev->ifindex != sk->bound_dev_if) + return -EINVAL; + + sk->ip_mc_index = mreq.imr_ifindex; + sk->ip_mc_addr = mreq.imr_address.s_addr; + return 0; + } + + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + { + struct ip_mreqn mreq; + + if (optlen < sizeof(struct ip_mreq)) + return -EINVAL; + if (optlen >= sizeof(struct ip_mreqn)) { + if(copy_from_user(&mreq,optval,sizeof(mreq))) + return -EFAULT; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq))) + return -EFAULT; + } + + if (optname == IP_ADD_MEMBERSHIP) + return ip_mc_join_group(sk,&mreq); + else + return ip_mc_leave_group(sk,&mreq); + } + case IP_ROUTER_ALERT: + return ip_ra_control(sk, val ? 1 : 0, NULL); + +#ifdef CONFIG_IP_FIREWALL + case IP_FW_MASQ_TIMEOUTS: + case IP_FW_APPEND: + case IP_FW_REPLACE: + case IP_FW_DELETE: + case IP_FW_DELETE_NUM: + case IP_FW_INSERT: + case IP_FW_FLUSH: + case IP_FW_ZERO: + case IP_FW_CHECK: + case IP_FW_CREATECHAIN: + case IP_FW_DELETECHAIN: + case IP_FW_POLICY: + if(!capable(CAP_NET_ADMIN)) + return -EACCES; + if(optlen>sizeof(tmp_fw) || optlen<1) + return -EINVAL; + if(copy_from_user(&tmp_fw,optval,optlen)) + return -EFAULT; + err=ip_fw_ctl(optname, &tmp_fw,optlen); + return -err; /* -0 is 0 after all */ +#endif /* CONFIG_IP_FIREWALL */ +#ifdef CONFIG_IP_MASQUERADE + case IP_FW_MASQ_CTL: + if(!capable(CAP_NET_ADMIN)) + return -EPERM; + if(optlen<1) + return -EINVAL; + err=ip_masq_uctl(optname, optval ,optlen); + return err; + +#endif + default: + return(-ENOPROTOOPT); + } +} + +/* + * Get the options. Note for future reference. The GET of IP options gets the + * _received_ ones. The set sets the _sent_ ones. 
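For illustration (an editor's sketch, not part of the imported source), this is how the IP_RECVERR option and the MSG_ERRQUEUE path served by ip_recv_error() above are typically consumed from userspace; the helper name, buffer sizes and printed fields are assumptions made for the example:

```c
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <linux/errqueue.h>	/* struct sock_extended_err */

/* Drain one entry from the per-socket error queue that ip_icmp_error()
 * and ip_local_error() fill and ip_recv_error() hands back. */
static int read_one_icmp_error(int fd)
{
	char data[512], cbuf[512];
	struct iovec iov = { data, sizeof(data) };
	struct sockaddr_in from;
	struct msghdr msg;
	struct cmsghdr *cm;
	int n;

	memset(&msg, 0, sizeof(msg));
	msg.msg_name = &from;
	msg.msg_namelen = sizeof(from);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	n = recvmsg(fd, &msg, MSG_ERRQUEUE);
	if (n < 0)
		return -1;			/* EAGAIN: the error queue is empty */

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) {
			struct sock_extended_err *ee =
				(struct sock_extended_err *) CMSG_DATA(cm);
			printf("err=%d origin=%d type=%d code=%d info=%u\n",
			       ee->ee_errno, ee->ee_origin,
			       ee->ee_type, ee->ee_code, ee->ee_info);
		}
	}
	return n;
}
```

Error queueing has to be enabled first with int on = 1; setsockopt(fd, SOL_IP, IP_RECVERR, &on, sizeof(on)); which lands in the IP_RECVERR case of ip_setsockopt() above.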
+ */ + +int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen) +{ + int val; + int len; + + if(level!=SOL_IP) + return -EOPNOTSUPP; + +#ifdef CONFIG_IP_MROUTE + if(optname>=MRT_BASE && optname <=MRT_BASE+10) + { + return ip_mroute_getsockopt(sk,optname,optval,optlen); + } +#endif + + if(get_user(len,optlen)) + return -EFAULT; + + switch(optname) + { + case IP_OPTIONS: + { + unsigned char optbuf[sizeof(struct ip_options)+40]; + struct ip_options * opt = (struct ip_options*)optbuf; + lock_sock(sk); + opt->optlen = 0; + if (sk->opt) + memcpy(optbuf, sk->opt, sizeof(struct ip_options)+sk->opt->optlen); + release_sock(sk); + if (opt->optlen == 0) + return put_user(0, optlen); + + ip_options_undo(opt); + + len=min(len, opt->optlen); + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval, opt->__data, len)) + return -EFAULT; + return 0; + } + case IP_PKTINFO: + val = (sk->ip_cmsg_flags & IP_CMSG_PKTINFO) != 0; + break; + case IP_RECVTTL: + val = (sk->ip_cmsg_flags & IP_CMSG_TTL) != 0; + break; + case IP_RECVTOS: + val = (sk->ip_cmsg_flags & IP_CMSG_TOS) != 0; + break; + case IP_RECVOPTS: + val = (sk->ip_cmsg_flags & IP_CMSG_RECVOPTS) != 0; + break; + case IP_RETOPTS: + val = (sk->ip_cmsg_flags & IP_CMSG_RETOPTS) != 0; + break; + case IP_TOS: + val=sk->ip_tos; + break; + case IP_TTL: + val=sk->ip_ttl; + break; + case IP_HDRINCL: + val=sk->ip_hdrincl; + break; + case IP_MTU_DISCOVER: + val=sk->ip_pmtudisc; + break; + case IP_MTU: + val = 0; + lock_sock(sk); + if (sk->dst_cache) + val = sk->dst_cache->pmtu; + release_sock(sk); + if (!val) + return -ENOTCONN; + break; + case IP_RECVERR: + val=sk->ip_recverr; + break; + case IP_MULTICAST_TTL: + val=sk->ip_mc_ttl; + break; + case IP_MULTICAST_LOOP: + val=sk->ip_mc_loop; + break; + case IP_MULTICAST_IF: + { + struct ip_mreqn mreq; + len = min(len,sizeof(struct ip_mreqn)); + if(put_user(len, optlen)) + return -EFAULT; + mreq.imr_ifindex = sk->ip_mc_index; + mreq.imr_address.s_addr = sk->ip_mc_addr; + mreq.imr_multiaddr.s_addr = 0; + if(copy_to_user((void *)optval, &mreq, len)) + return -EFAULT; + return 0; + } + default: + return(-ENOPROTOOPT); + } + + if (len < sizeof(int) && len > 0 && val>=0 && val<255) { + unsigned char ucval = (unsigned char)val; + len = 1; + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval,&ucval,1)) + return -EFAULT; + } else { + len=min(sizeof(int),len); + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval,&val,len)) + return -EFAULT; + } + return 0; +} diff --git a/pfinet/linux-src/net/ipv4/ipconfig.c b/pfinet/linux-src/net/ipv4/ipconfig.c new file mode 100644 index 00000000..0770bad1 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ipconfig.c @@ -0,0 +1,970 @@ +/* + * $Id: ipconfig.c,v 1.20.2.1 1999/06/28 11:33:27 davem Exp $ + * + * Automatic Configuration of IP -- use BOOTP or RARP or user-supplied + * information to configure own IP address and routes. + * + * Copyright (C) 1996--1998 Martin Mares <mj@atrey.karlin.mff.cuni.cz> + * + * Derived from network configuration code in fs/nfs/nfsroot.c, + * originally Copyright (C) 1995, 1996 Gero Kuhlmann and me. + * + * BOOTP rewritten to construct and analyse packets itself instead + * of misusing the IP layer. 
num_bugs_causing_wrong_arp_replies--; + * -- MJ, December 1998 + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/random.h> +#include <linux/init.h> +#include <linux/utsname.h> +#include <linux/in.h> +#include <linux/if.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/socket.h> +#include <linux/route.h> +#include <linux/udp.h> +#include <net/arp.h> +#include <net/ip.h> +#include <net/ipconfig.h> + +#include <asm/segment.h> +#include <asm/uaccess.h> +#include <asm/checksum.h> + +/* Define this to allow debugging output */ +#undef IPCONFIG_DEBUG + +#ifdef IPCONFIG_DEBUG +#define DBG(x) printk x +#else +#define DBG(x) do { } while(0) +#endif + +/* Define the timeout for waiting for a RARP/BOOTP reply */ +#define CONF_BASE_TIMEOUT (HZ*5) /* Initial timeout: 5 seconds */ +#define CONF_RETRIES 10 /* 10 retries */ +#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */ +#define CONF_TIMEOUT_MULT *5/4 /* Rate of timeout growth */ +#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */ + +/* IP configuration */ +static char user_dev_name[IFNAMSIZ] __initdata = { 0, };/* Name of user-selected boot device */ +u32 ic_myaddr __initdata = INADDR_NONE; /* My IP address */ +u32 ic_servaddr __initdata = INADDR_NONE; /* Server IP address */ +u32 ic_gateway __initdata = INADDR_NONE; /* Gateway IP address */ +u32 ic_netmask __initdata = INADDR_NONE; /* Netmask for local subnet */ +int ic_enable __initdata = 1; /* Automatic IP configuration enabled */ +int ic_host_name_set __initdata = 0; /* Host name configured manually */ +int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */ + +u32 root_server_addr __initdata = INADDR_NONE; /* Address of boot server */ +u8 root_server_path[256] __initdata = { 0, }; /* Path to mount as root */ + +#if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_RARP) + +#define CONFIG_IP_PNP_DYNAMIC + +static int ic_proto_enabled __initdata = 0 /* Protocols enabled */ +#ifdef CONFIG_IP_PNP_BOOTP + | IC_BOOTP +#endif +#ifdef CONFIG_IP_PNP_RARP + | IC_RARP +#endif + ; +static int ic_got_reply __initdata = 0; /* Protocol(s) we got reply from */ + +#else + +static int ic_proto_enabled __initdata = 0; + +#endif + +static int ic_proto_have_if __initdata = 0; + +/* + * Network devices + */ + +struct ic_device { + struct ic_device *next; + struct device *dev; + unsigned short flags; + int able; +}; + +static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ +static struct device *ic_dev __initdata = NULL; /* Selected device */ + +static int __init ic_open_devs(void) +{ + struct ic_device *d, **last; + struct device *dev; + unsigned short oflags; + + last = &ic_first_dev; + for (dev = dev_base; dev; dev = dev->next) + if (user_dev_name[0] ? 
!strcmp(dev->name, user_dev_name) : + (!(dev->flags & IFF_LOOPBACK) && + (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && + strncmp(dev->name, "dummy", 5))) { + int able = 0; + if (dev->mtu >= 364) + able |= IC_BOOTP; + else + printk(KERN_WARNING "BOOTP: Ignoring device %s, MTU %d too small", dev->name, dev->mtu); + if (!(dev->flags & IFF_NOARP)) + able |= IC_RARP; + able &= ic_proto_enabled; + if (ic_proto_enabled && !able) + continue; + oflags = dev->flags; + if (dev_change_flags(dev, oflags | IFF_UP) < 0) { + printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); + continue; + } + if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) + return -1; + d->dev = dev; + *last = d; + last = &d->next; + d->flags = oflags; + d->able = able; + ic_proto_have_if |= able; + DBG(("IP-Config: Opened %s (able=%d)\n", dev->name, able)); + } + *last = NULL; + + if (!ic_first_dev) { + if (user_dev_name[0]) + printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name); + else + printk(KERN_ERR "IP-Config: No network devices available.\n"); + return -1; + } + return 0; +} + +static void __init ic_close_devs(void) +{ + struct ic_device *d, *next; + struct device *dev; + + next = ic_first_dev; + while ((d = next)) { + next = d->next; + dev = d->dev; + if (dev != ic_dev) { + DBG(("IP-Config: Downing %s\n", dev->name)); + dev_change_flags(dev, d->flags); + } + kfree_s(d, sizeof(struct ic_device)); + } +} + +/* + * Interface to various network functions. + */ + +static inline void +set_sockaddr(struct sockaddr_in *sin, u32 addr, u16 port) +{ + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = addr; + sin->sin_port = port; +} + +static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg) +{ + int res; + + mm_segment_t oldfs = get_fs(); + set_fs(get_ds()); + res = devinet_ioctl(cmd, arg); + set_fs(oldfs); + return res; +} + +static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg) +{ + int res; + + mm_segment_t oldfs = get_fs(); + set_fs(get_ds()); + res = ip_rt_ioctl(cmd, arg); + set_fs(oldfs); + return res; +} + +/* + * Set up interface addresses and routes. + */ + +static int __init ic_setup_if(void) +{ + struct ifreq ir; + struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr; + int err; + + memset(&ir, 0, sizeof(ir)); + strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name); + set_sockaddr(sin, ic_myaddr, 0); + if ((err = ic_dev_ioctl(SIOCSIFADDR, &ir)) < 0) { + printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err); + return -1; + } + set_sockaddr(sin, ic_netmask, 0); + if ((err = ic_dev_ioctl(SIOCSIFNETMASK, &ir)) < 0) { + printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err); + return -1; + } + set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0); + if ((err = ic_dev_ioctl(SIOCSIFBRDADDR, &ir)) < 0) { + printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err); + return -1; + } + return 0; +} + +static int __init ic_setup_routes(void) +{ + /* No need to setup device routes, only the default route... 
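As a worked example for the interface setup above (the addresses are hypothetical): with ic_myaddr = 192.168.1.5 and ic_netmask = 255.255.255.0, ic_setup_if() derives the broadcast address as ic_myaddr | ~ic_netmask = 192.168.1.255. ic_setup_routes() below only accepts a gateway for which (ic_gateway ^ ic_myaddr) & ic_netmask == 0, so 192.168.1.1 is installed as the default route while 10.0.0.1 would be rejected as not directly connected. If no netmask was obtained at all, ic_defaults() below falls back to the classful rules: 10.1.2.3 gets 255.0.0.0, 172.16.5.9 gets 255.255.0.0, and 192.168.1.5 gets 255.255.255.0.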
*/ + + if (ic_gateway != INADDR_NONE) { + struct rtentry rm; + int err; + + memset(&rm, 0, sizeof(rm)); + if ((ic_gateway ^ ic_myaddr) & ic_netmask) { + printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n"); + return -1; + } + set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0); + set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0); + set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0); + rm.rt_flags = RTF_UP | RTF_GATEWAY; + if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) { + printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err); + return -1; + } + } + + return 0; +} + +/* + * Fill in default values for all missing parameters. + */ + +static int __init ic_defaults(void) +{ + /* + * At this point we have no userspace running so need not + * claim locks on system_utsname + */ + + if (!ic_host_name_set) + strcpy(system_utsname.nodename, in_ntoa(ic_myaddr)); + + if (root_server_addr == INADDR_NONE) + root_server_addr = ic_servaddr; + + if (ic_netmask == INADDR_NONE) { + if (IN_CLASSA(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSA_NET); + else if (IN_CLASSB(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSB_NET); + else if (IN_CLASSC(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSC_NET); + else { + printk(KERN_ERR "IP-Config: Unable to guess netmask for address %08x\n", ic_myaddr); + return -1; + } + printk("IP-Config: Guessing netmask %s\n", in_ntoa(ic_netmask)); + } + + return 0; +} + +/* + * RARP support. + */ + +#ifdef CONFIG_IP_PNP_RARP + +static int ic_rarp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt); + +static struct packet_type rarp_packet_type __initdata = { + __constant_htons(ETH_P_RARP), + NULL, /* Listen to all devices */ + ic_rarp_recv, + NULL, + NULL +}; + +static inline void ic_rarp_init(void) +{ + dev_add_pack(&rarp_packet_type); +} + +static inline void ic_rarp_cleanup(void) +{ + dev_remove_pack(&rarp_packet_type); +} + +/* + * Process received RARP packet. + */ +static int __init +ic_rarp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) +{ + struct arphdr *rarp = (struct arphdr *)skb->h.raw; + unsigned char *rarp_ptr = (unsigned char *) (rarp + 1); + unsigned long sip, tip; + unsigned char *sha, *tha; /* s for "source", t for "target" */ + + /* If we already have a reply, just drop the packet */ + if (ic_got_reply) + goto drop; + + /* If this test doesn't pass, it's not IP, or we should ignore it anyway */ + if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd)) + goto drop; + + /* If it's not a RARP reply, delete it. */ + if (rarp->ar_op != htons(ARPOP_RREPLY)) + goto drop; + + /* If it's not Ethernet, delete it. */ + if (rarp->ar_pro != htons(ETH_P_IP)) + goto drop; + + /* Extract variable-width fields */ + sha = rarp_ptr; + rarp_ptr += dev->addr_len; + memcpy(&sip, rarp_ptr, 4); + rarp_ptr += 4; + tha = rarp_ptr; + rarp_ptr += dev->addr_len; + memcpy(&tip, rarp_ptr, 4); + + /* Discard packets which are not meant for us. */ + if (memcmp(tha, dev->dev_addr, dev->addr_len)) + goto drop; + + /* Discard packets which are not from specified server. */ + if (ic_servaddr != INADDR_NONE && ic_servaddr != sip) + goto drop; + + /* Victory! The packet is what we were looking for! */ + if (!ic_got_reply) { + ic_got_reply = IC_RARP; + ic_dev = dev; + if (ic_myaddr == INADDR_NONE) + ic_myaddr = tip; + ic_servaddr = sip; + } + + /* And throw the packet out... 
*/ +drop: + kfree_skb(skb); + return 0; +} + + +/* + * Send RARP request packet over all devices which allow RARP. + */ +static void __init ic_rarp_send(void) +{ + struct ic_device *d; + + for (d=ic_first_dev; d; d=d->next) + if (d->able & IC_RARP) { + struct device *dev = d->dev; + arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL, + dev->dev_addr, dev->dev_addr); + } +} + +#endif + +/* + * BOOTP support. + */ + +#ifdef CONFIG_IP_PNP_BOOTP + +struct bootp_pkt { /* BOOTP packet format */ + struct iphdr iph; /* IP header */ + struct udphdr udph; /* UDP header */ + u8 op; /* 1=request, 2=reply */ + u8 htype; /* HW address type */ + u8 hlen; /* HW address length */ + u8 hops; /* Used only by gateways */ + u32 xid; /* Transaction ID */ + u16 secs; /* Seconds since we started */ + u16 flags; /* Just what it says */ + u32 client_ip; /* Client's IP address if known */ + u32 your_ip; /* Assigned IP address */ + u32 server_ip; /* Server's IP address */ + u32 relay_ip; /* IP address of BOOTP relay */ + u8 hw_addr[16]; /* Client's HW address */ + u8 serv_name[64]; /* Server host name */ + u8 boot_file[128]; /* Name of boot file */ + u8 vendor_area[128]; /* Area for extensions */ +}; + +#define BOOTP_REQUEST 1 +#define BOOTP_REPLY 2 + +static u32 ic_bootp_xid; + +static int ic_bootp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt); + +static struct packet_type bootp_packet_type __initdata = { + __constant_htons(ETH_P_IP), + NULL, /* Listen to all devices */ + ic_bootp_recv, + NULL, + NULL +}; + + +/* + * Initialize BOOTP extension fields in the request. + */ +static void __init ic_bootp_init_ext(u8 *e) +{ + *e++ = 99; /* RFC1048 Magic Cookie */ + *e++ = 130; + *e++ = 83; + *e++ = 99; + *e++ = 1; /* Subnet mask request */ + *e++ = 4; + e += 4; + *e++ = 3; /* Default gateway request */ + *e++ = 4; + e += 4; + *e++ = 12; /* Host name request */ + *e++ = 32; + e += 32; + *e++ = 40; /* NIS Domain name request */ + *e++ = 32; + e += 32; + *e++ = 17; /* Boot path */ + *e++ = 32; + e += 32; + *e = 255; /* End of the list */ +} + + +/* + * Initialize the BOOTP mechanism. + */ +static inline void ic_bootp_init(void) +{ + get_random_bytes(&ic_bootp_xid, sizeof(u32)); + DBG(("BOOTP: XID=%08x\n", ic_bootp_xid)); + dev_add_pack(&bootp_packet_type); +} + + +/* + * BOOTP cleanup. + */ +static inline void ic_bootp_cleanup(void) +{ + dev_remove_pack(&bootp_packet_type); +} + + +/* + * Send BOOTP request to single interface. 
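For illustration, the RFC 1048 vendor area that ic_bootp_init_ext() above builds is a sequence of tag/length/value triples (the zeroed value bytes are placeholders the server fills in): 99 130 83 99 (magic cookie), 1 4 plus four bytes (subnet mask request), 3 4 plus four bytes (default gateway request), 12 32 plus 32 bytes (host name), 40 32 (NIS domain), 17 32 (boot path), terminated by a single 255. ic_do_bootp_ext() below walks the reply in the same tag/length/value form.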
+ */ +static void __init ic_bootp_send_if(struct ic_device *d, u32 jiffies) +{ + struct device *dev = d->dev; + struct sk_buff *skb; + struct bootp_pkt *b; + int hh_len = (dev->hard_header_len + 15) & ~15; + struct iphdr *h; + + /* Allocate packet */ + skb = alloc_skb(sizeof(struct bootp_pkt) + hh_len + 15, GFP_KERNEL); + if (!skb) + return; + skb_reserve(skb, hh_len); + b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt)); + memset(b, 0, sizeof(struct bootp_pkt)); + + /* Construct IP header */ + skb->nh.iph = h = &b->iph; + h->version = 4; + h->ihl = 5; + h->tot_len = htons(sizeof(struct bootp_pkt)); + h->frag_off = htons(IP_DF); + h->ttl = 64; + h->protocol = IPPROTO_UDP; + h->daddr = INADDR_BROADCAST; + h->check = ip_fast_csum((unsigned char *) h, h->ihl); + + /* Construct UDP header */ + b->udph.source = htons(68); + b->udph.dest = htons(67); + b->udph.len = htons(sizeof(struct bootp_pkt) - sizeof(struct iphdr)); + /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */ + + /* Construct BOOTP header */ + b->op = BOOTP_REQUEST; + b->htype = dev->type; + b->hlen = dev->addr_len; + memcpy(b->hw_addr, dev->dev_addr, dev->addr_len); + b->secs = htons(jiffies / HZ); + b->xid = ic_bootp_xid; + ic_bootp_init_ext(b->vendor_area); + + /* Chain packet down the line... */ + skb->dev = dev; + skb->protocol = __constant_htons(ETH_P_IP); + if ((dev->hard_header && + dev->hard_header(skb, dev, ntohs(skb->protocol), dev->broadcast, dev->dev_addr, skb->len) < 0) || + dev_queue_xmit(skb) < 0) + printk("E"); +} + + +/* + * Send BOOTP requests to all interfaces. + */ +static void __init ic_bootp_send(u32 jiffies) +{ + struct ic_device *d; + + for(d=ic_first_dev; d; d=d->next) + if (d->able & IC_BOOTP) + ic_bootp_send_if(d, jiffies); +} + + +/* + * Copy BOOTP-supplied string if not already set. + */ +static int __init ic_bootp_string(char *dest, char *src, int len, int max) +{ + if (!len) + return 0; + if (len > max-1) + len = max-1; + strncpy(dest, src, len); + dest[len] = '\0'; + return 1; +} + + +/* + * Process BOOTP extension. + */ +static void __init ic_do_bootp_ext(u8 *ext) +{ +#ifdef IPCONFIG_DEBUG + u8 *c; + + printk("BOOTP: Got extension %02x",*ext); + for(c=ext+2; c<ext+2+ext[1]; c++) + printk(" %02x", *c); + printk("\n"); +#endif + + switch (*ext++) { + case 1: /* Subnet mask */ + if (ic_netmask == INADDR_NONE) + memcpy(&ic_netmask, ext+1, 4); + break; + case 3: /* Default gateway */ + if (ic_gateway == INADDR_NONE) + memcpy(&ic_gateway, ext+1, 4); + break; + case 12: /* Host name */ + ic_bootp_string(system_utsname.nodename, ext+1, *ext, __NEW_UTS_LEN); + ic_host_name_set = 1; + break; + case 40: /* NIS Domain name */ + ic_bootp_string(system_utsname.domainname, ext+1, *ext, __NEW_UTS_LEN); + break; + case 17: /* Root path */ + if (!root_server_path[0]) + ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path)); + break; + } +} + + +/* + * Receive BOOTP reply. 
+ */ +static int __init ic_bootp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) +{ + struct bootp_pkt *b = (struct bootp_pkt *) skb->nh.iph; + struct iphdr *h = &b->iph; + int len; + + /* If we already have a reply, just drop the packet */ + if (ic_got_reply) + goto drop; + + /* Check whether it's a BOOTP packet */ + if (skb->pkt_type == PACKET_OTHERHOST || + skb->len < sizeof(struct udphdr) + sizeof(struct iphdr) || + h->ihl != 5 || + h->version != 4 || + ip_fast_csum((char *) h, h->ihl) != 0 || + skb->len < ntohs(h->tot_len) || + h->protocol != IPPROTO_UDP || + b->udph.source != htons(67) || + b->udph.dest != htons(68) || + ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr)) + goto drop; + + /* Fragments are not supported */ + if (h->frag_off & htons(IP_OFFSET|IP_MF)) { + printk(KERN_ERR "BOOTP: Ignoring fragmented reply.\n"); + goto drop; + } + + /* Is it a reply to our BOOTP request? */ + len = ntohs(b->udph.len) - sizeof(struct udphdr); + if (len < 300 || /* See RFC 951:2.1 */ + b->op != BOOTP_REPLY || + b->xid != ic_bootp_xid) { + printk("?"); + goto drop; + } + + /* Extract basic fields */ + ic_myaddr = b->your_ip; + ic_servaddr = b->server_ip; + ic_got_reply = IC_BOOTP; + ic_dev = dev; + + /* Parse extensions */ + if (b->vendor_area[0] == 99 && /* Check magic cookie */ + b->vendor_area[1] == 130 && + b->vendor_area[2] == 83 && + b->vendor_area[3] == 99) { + u8 *ext = &b->vendor_area[4]; + u8 *end = (u8 *) b + ntohs(b->iph.tot_len); + while (ext < end && *ext != 0xff) { + if (*ext == 0) /* Padding */ + ext++; + else { + u8 *opt = ext; + ext += ext[1] + 2; + if (ext <= end) + ic_do_bootp_ext(opt); + } + } + } + + if (ic_gateway == INADDR_NONE && b->relay_ip) + ic_gateway = b->relay_ip; + +drop: + kfree_skb(skb); + return 0; +} + + +#endif + + +/* + * Dynamic IP configuration -- BOOTP and RARP. + */ + +#ifdef CONFIG_IP_PNP_DYNAMIC + +static int __init ic_dynamic(void) +{ + int retries; + unsigned long timeout, jiff; + unsigned long start_jiffies; + int do_rarp = ic_proto_have_if & IC_RARP; + int do_bootp = ic_proto_have_if & IC_BOOTP; + + /* + * If neither BOOTP nor RARP was selected, return with an error. This + * routine gets only called when some pieces of information are mis- + * sing, and without BOOTP and RARP we are not able to get that in- + * formation. + */ + if (!ic_proto_enabled) { + printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); + return -1; + } + +#ifdef CONFIG_IP_PNP_BOOTP + if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP) + printk(KERN_ERR "BOOTP: No suitable device found.\n"); +#endif + +#ifdef CONFIG_IP_PNP_RARP + if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP) + printk(KERN_ERR "RARP: No suitable device found.\n"); +#endif + + if (!ic_proto_have_if) + /* Error message already printed */ + return -1; + + /* + * Setup RARP and BOOTP protocols + */ +#ifdef CONFIG_IP_PNP_RARP + if (do_rarp) + ic_rarp_init(); +#endif +#ifdef CONFIG_IP_PNP_BOOTP + if (do_bootp) + ic_bootp_init(); +#endif + + /* + * Send requests and wait, until we get an answer. This loop + * seems to be a terrible waste of CPU time, but actually there is + * only one process running at all, so we don't need to use any + * scheduler functions. + * [Actually we could now, but the nothing else running note still + * applies.. - AC] + */ + printk(KERN_NOTICE "Sending %s%s%s requests...", + do_bootp ? "BOOTP" : "", + do_bootp && do_rarp ? " and " : "", + do_rarp ? 
"RARP" : ""); + start_jiffies = jiffies; + retries = CONF_RETRIES; + get_random_bytes(&timeout, sizeof(timeout)); + timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM); + for(;;) { +#ifdef CONFIG_IP_PNP_BOOTP + if (do_bootp) + ic_bootp_send(jiffies - start_jiffies); +#endif +#ifdef CONFIG_IP_PNP_RARP + if (do_rarp) + ic_rarp_send(); +#endif + printk("."); + jiff = jiffies + timeout; + while (jiffies < jiff && !ic_got_reply) + ; + if (ic_got_reply) { + printk(" OK\n"); + break; + } + if (! --retries) { + printk(" timed out!\n"); + break; + } + timeout = timeout CONF_TIMEOUT_MULT; + if (timeout > CONF_TIMEOUT_MAX) + timeout = CONF_TIMEOUT_MAX; + } + +#ifdef CONFIG_IP_PNP_RARP + if (do_rarp) + ic_rarp_cleanup(); +#endif +#ifdef CONFIG_IP_PNP_BOOTP + if (do_bootp) + ic_bootp_cleanup(); +#endif + + if (!ic_got_reply) + return -1; + + printk("IP-Config: Got %s answer from %s, ", + (ic_got_reply & IC_BOOTP) ? "BOOTP" : "RARP", + in_ntoa(ic_servaddr)); + printk("my address is %s\n", in_ntoa(ic_myaddr)); + + return 0; +} + +#endif + +/* + * IP Autoconfig dispatcher. + */ + +int __init ip_auto_config(void) +{ + if (!ic_enable) + return 0; + + DBG(("IP-Config: Entered.\n")); + + /* Setup all network devices */ + if (ic_open_devs() < 0) + return -1; + + /* + * If the config information is insufficient (e.g., our IP address or + * IP address of the boot server is missing or we have multiple network + * interfaces and no default was set), use BOOTP or RARP to get the + * missing values. + */ + if (ic_myaddr == INADDR_NONE || +#ifdef CONFIG_ROOT_NFS + (root_server_addr == INADDR_NONE && ic_servaddr == INADDR_NONE) || +#endif + ic_first_dev->next) { +#ifdef CONFIG_IP_PNP_DYNAMIC + if (ic_dynamic() < 0) { + printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n"); + ic_close_devs(); + return -1; + } +#else + printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); + ic_close_devs(); + return -1; +#endif + } else { + ic_dev = ic_first_dev->dev; /* Device selected manually or only one device -> use it */ + } + + /* + * Use defaults whereever applicable. + */ + if (ic_defaults() < 0) + return -1; + + /* + * Close all network devices except the device we've + * autoconfigured and set up routes. + */ + ic_close_devs(); + if (ic_setup_if() < 0 || ic_setup_routes() < 0) + return -1; + + DBG(("IP-Config: device=%s, local=%08x, server=%08x, boot=%08x, gw=%08x, mask=%08x\n", + ic_dev->name, ic_myaddr, ic_servaddr, root_server_addr, ic_gateway, ic_netmask)); + DBG(("IP-Config: host=%s, domain=%s, path=`%s'\n", system_utsname.nodename, + system_utsname.domainname, root_server_path)); + return 0; +} + +/* + * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel + * command line parameter. 
It consists of option fields separated by colons in + * the following order: + * + * <client-ip>:<server-ip>:<gw-ip>:<netmask>:<host name>:<device>:<bootp|rarp> + * + * Any of the fields can be empty which means to use a default value: + * <client-ip> - address given by BOOTP or RARP + * <server-ip> - address of host returning BOOTP or RARP packet + * <gw-ip> - none, or the address returned by BOOTP + * <netmask> - automatically determined from <client-ip>, or the + * one returned by BOOTP + * <host name> - <client-ip> in ASCII notation, or the name returned + * by BOOTP + * <device> - use all available devices + * <bootp|rarp|both|off> - use both protocols to determine my own address + */ +static int __init ic_proto_name(char *name) +{ + if (!strcmp(name, "off")) { + ic_proto_enabled = 0; + return 1; + } +#ifdef CONFIG_IP_PNP_BOOTP + else if (!strcmp(name, "bootp")) { + ic_proto_enabled &= ~IC_RARP; + return 1; + } +#endif +#ifdef CONFIG_IP_PNP_RARP + else if (!strcmp(name, "rarp")) { + ic_proto_enabled &= ~IC_BOOTP; + return 1; + } +#endif +#ifdef CONFIG_IP_PNP_DYNAMIC + else if (!strcmp(name, "both")) { + return 1; + } +#endif + return 0; +} + +void __init ip_auto_config_setup(char *addrs, int *ints) +{ + char *cp, *ip, *dp; + int num = 0; + + ic_set_manually = 1; + if (!strcmp(addrs, "off")) { + ic_enable = 0; + return; + } + if (ic_proto_name(addrs)) + return; + + /* Parse the whole string */ + ip = addrs; + while (ip && *ip) { + if ((cp = strchr(ip, ':'))) + *cp++ = '\0'; + if (strlen(ip) > 0) { + DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip)); + switch (num) { + case 0: + if ((ic_myaddr = in_aton(ip)) == INADDR_ANY) + ic_myaddr = INADDR_NONE; + break; + case 1: + if ((ic_servaddr = in_aton(ip)) == INADDR_ANY) + ic_servaddr = INADDR_NONE; + break; + case 2: + if ((ic_gateway = in_aton(ip)) == INADDR_ANY) + ic_gateway = INADDR_NONE; + break; + case 3: + if ((ic_netmask = in_aton(ip)) == INADDR_ANY) + ic_netmask = INADDR_NONE; + break; + case 4: + if ((dp = strchr(ip, '.'))) { + *dp++ = '\0'; + strncpy(system_utsname.domainname, dp, __NEW_UTS_LEN); + system_utsname.domainname[__NEW_UTS_LEN] = '\0'; + } + strncpy(system_utsname.nodename, ip, __NEW_UTS_LEN); + system_utsname.nodename[__NEW_UTS_LEN] = '\0'; + ic_host_name_set = 1; + break; + case 5: + strncpy(user_dev_name, ip, IFNAMSIZ); + user_dev_name[IFNAMSIZ-1] = '\0'; + break; + case 6: + ic_proto_name(ip); + break; + } + } + ip = cp; + num++; + } +} diff --git a/pfinet/linux-src/net/ipv4/ipip.c b/pfinet/linux-src/net/ipv4/ipip.c new file mode 100644 index 00000000..0aeef4a3 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ipip.c @@ -0,0 +1,870 @@ +/* + * Linux NET3: IP/IP protocol decoder. + * + * Version: $Id: ipip.c,v 1.26 1999/03/25 10:04:32 davem Exp $ + * + * Authors: + * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 + * + * Fixes: + * Alan Cox : Merged and made usable non modular (its so tiny its silly as + * a module taking up 2 pages). + * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph) + * to keep ip_forward happy. + * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8). + * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL + * David Woodhouse : Perform some basic ICMP handling. + * IPIP Routing without decapsulation. + * Carlos Picoto : GRE over IP support + * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c. + * I do not want to merge them together. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +/* tunnel.c: an IP tunnel driver + + The purpose of this driver is to provide an IP tunnel through + which you can tunnel network traffic transparently across subnets. + + This was written by looking at Nick Holloway's dummy driver + Thanks for the great code! + + -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 + + Minor tweaks: + Cleaned up the code a little and added some pre-1.3.0 tweaks. + dev->hard_header/hard_header_len changed to use no headers. + Comments/bracketing tweaked. + Made the tunnels use dev->name not tunnel: when error reporting. + Added tx_dropped stat + + -Alan Cox (Alan.Cox@linux.org) 21 March 95 + + Reworked: + Changed to tunnel to destination gateway in addition to the + tunnel's pointopoint address + Almost completely rewritten + Note: There is currently no firewall or ICMP handling done. + + -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96 + +*/ + +/* Things I wish I had known when writing the tunnel driver: + + When the tunnel_xmit() function is called, the skb contains the + packet to be sent (plus a great deal of extra info), and dev + contains the tunnel device that _we_ are. + + When we are passed a packet, we are expected to fill in the + source address with our source IP address. + + What is the proper way to allocate, copy and free a buffer? + After you allocate it, it is a "0 length" chunk of memory + starting at zero. If you want to add headers to the buffer + later, you'll have to call "skb_reserve(skb, amount)" with + the amount of memory you want reserved. Then, you call + "skb_put(skb, amount)" with the amount of space you want in + the buffer. skb_put() returns a pointer to the top (#0) of + that buffer. skb->len is set to the amount of space you have + "allocated" with skb_put(). You can then write up to skb->len + bytes to that buffer. If you need more, you can call skb_put() + again with the additional amount of space you need. You can + find out how much more space you can allocate by calling + "skb_tailroom(skb)". + Now, to add header space, call "skb_push(skb, header_len)". + This creates space at the beginning of the buffer and returns + a pointer to this new space. If later you need to strip a + header from a buffer, call "skb_pull(skb, header_len)". + skb_headroom() will return how much space is left at the top + of the buffer (before the main data). Remember, this headroom + space must be reserved before the skb_put() function is called. 
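As a compact illustration of the allocation pattern described above (an editor's sketch, not part of the imported source; the function name and sizes are made up), this is the usual alloc_skb/skb_reserve/skb_put/skb_push sequence, the same one ic_bootp_send_if() in ipconfig.c and ipip_tunnel_xmit() below follow:

```c
#include <linux/mm.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/ip.h>
#include <linux/string.h>

static struct sk_buff *build_packet(struct device *dev, int payload_len)
{
	int hh_len = (dev->hard_header_len + 15) & ~15;	/* room for the link-layer header */
	struct sk_buff *skb;
	unsigned char *data;
	struct iphdr *iph;

	skb = alloc_skb(hh_len + sizeof(struct iphdr) + payload_len, GFP_ATOMIC);
	if (!skb)
		return NULL;

	/* Reserve headroom first; nothing may be written there yet. */
	skb_reserve(skb, hh_len + sizeof(struct iphdr));

	/* Claim space for the payload; skb->len grows by payload_len. */
	data = skb_put(skb, payload_len);
	memset(data, 0, payload_len);

	/* Prepend the IP header into part of the reserved headroom. */
	iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr));
	skb->nh.raw = (unsigned char *) iph;

	/* skb_headroom(skb) == hh_len here, so the driver can still
	 * push its hardware header in front of the IP header. */
	return skb;
}
```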
+ */ + +/* + This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c + + For comments look at net/ipv4/ip_gre.c --ANK + */ + + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <asm/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/init.h> + +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <net/ipip.h> + +#define HASH_SIZE 16 +#define HASH(addr) ((addr^(addr>>4))&0xF) + +static int ipip_fb_tunnel_init(struct device *dev); +static int ipip_tunnel_init(struct device *dev); + +static struct device ipip_fb_tunnel_dev = { + NULL, 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipip_fb_tunnel_init, +}; + +static struct ip_tunnel ipip_fb_tunnel = { + NULL, &ipip_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"tunl0", } +}; + +static struct ip_tunnel *tunnels_r_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_r[HASH_SIZE]; +static struct ip_tunnel *tunnels_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_wc[1]; +static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l }; + +static struct ip_tunnel * ipip_tunnel_lookup(u32 remote, u32 local) +{ + unsigned h0 = HASH(remote); + unsigned h1 = HASH(local); + struct ip_tunnel *t; + + for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_r[h0]; t; t = t->next) { + if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_l[h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) + return t; + } + if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) + return t; + return NULL; +} + +static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t) +{ + u32 remote = t->parms.iph.daddr; + u32 local = t->parms.iph.saddr; + unsigned h = 0; + int prio = 0; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + return &tunnels[prio][h]; +} + + +static void ipip_tunnel_unlink(struct ip_tunnel *t) +{ + struct ip_tunnel **tp; + + for (tp = ipip_bucket(t); *tp; tp = &(*tp)->next) { + if (t == *tp) { + *tp = t->next; + synchronize_bh(); + break; + } + } +} + +static void ipip_tunnel_link(struct ip_tunnel *t) +{ + struct ip_tunnel **tp = ipip_bucket(t); + + t->next = *tp; + wmb(); + *tp = t; +} + +struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create) +{ + u32 remote = parms->iph.daddr; + u32 local = parms->iph.saddr; + struct ip_tunnel *t, **tp, *nt; + struct device *dev; + unsigned h = 0; + int prio = 0; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) + return t; + } + if (!create) + return NULL; + + MOD_INC_USE_COUNT; + dev = kmalloc(sizeof(*dev) + sizeof(*t), GFP_KERNEL); + if (dev == NULL) { + MOD_DEC_USE_COUNT; + return NULL; + } + memset(dev, 0, sizeof(*dev) + sizeof(*t)); + dev->priv = (void*)(dev+1); + nt = (struct ip_tunnel*)dev->priv; + nt->dev = dev; + dev->name = nt->parms.name; + dev->init = ipip_tunnel_init; + memcpy(&nt->parms, parms, sizeof(*parms)); + if (dev->name[0] == 0) { + int i; + for (i=1; 
i<100; i++) { + sprintf(dev->name, "tunl%d", i); + if (dev_get(dev->name) == NULL) + break; + } + if (i==100) + goto failed; + memcpy(parms->name, dev->name, IFNAMSIZ); + } + if (register_netdevice(dev) < 0) + goto failed; + + ipip_tunnel_link(nt); + /* Do not decrement MOD_USE_COUNT here. */ + return nt; + +failed: + kfree(dev); + MOD_DEC_USE_COUNT; + return NULL; +} + + +static void ipip_tunnel_destroy(struct device *dev) +{ + if (dev == &ipip_fb_tunnel_dev) { + tunnels_wc[0] = NULL; + synchronize_bh(); + } else { + ipip_tunnel_unlink((struct ip_tunnel*)dev->priv); + kfree(dev); + MOD_DEC_USE_COUNT; + } +} + +void ipip_err(struct sk_buff *skb, unsigned char *dp, int len) +{ +#ifndef I_WISH_WORLD_WERE_PERFECT + +/* It is not :-( All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. + */ + struct iphdr *iph = (struct iphdr*)dp; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct ip_tunnel *t; + + if (len < sizeof(struct iphdr)) + return; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. --ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + t = ipip_tunnel_lookup(iph->daddr, iph->saddr); + if (t == NULL || t->parms.iph.daddr == 0) + return; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + return; + + if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; + return; +#else + struct iphdr *iph = (struct iphdr*)dp; + int hlen = iph->ihl<<2; + struct iphdr *eiph; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + int rel_type = 0; + int rel_code = 0; + int rel_info = 0; + struct sk_buff *skb2; + struct rtable *rt; + + if (len < hlen + sizeof(struct iphdr)) + return; + eiph = (struct iphdr*)(dp + hlen); + + switch (type) { + default: + return; + case ICMP_PARAMETERPROB: + if (skb->h.icmph->un.gateway < hlen) + return; + + /* So... This guy found something strange INSIDE encapsulated + packet. Well, he is fool, but what can we do ? + */ + rel_type = ICMP_PARAMETERPROB; + rel_info = skb->h.icmph->un.gateway - hlen; + break; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* And it is the only really necesary thing :-) */ + rel_info = ntohs(skb->h.icmph->un.frag.mtu); + if (rel_info < hlen+68) + return; + rel_info -= hlen; + /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */ + if (rel_info > ntohs(eiph->tot_len)) + return; + break; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe, it is just ether pollution. 
--ANK + */ + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + /* Prepare fake skb to feed it to icmp_send */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 == NULL) + return; + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, skb->data - (u8*)eiph); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) { + kfree_skb(skb2); + return; + } + skb2->dev = rt->u.dst.dev; + + /* route "incoming" packet */ + if (rt->rt_flags&RTCF_LOCAL) { + ip_rt_put(rt); + rt = NULL; + if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) || + rt->u.dst.dev->type != ARPHRD_IPGRE) { + ip_rt_put(rt); + kfree_skb(skb2); + return; + } + } else { + ip_rt_put(rt); + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || + skb2->dst->dev->type != ARPHRD_IPGRE) { + kfree_skb(skb2); + return; + } + } + + /* change mtu on this route */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + if (rel_info > skb2->dst->pmtu) { + kfree_skb(skb2); + return; + } + skb2->dst->pmtu = rel_info; + rel_info = htonl(rel_info); + } else if (type == ICMP_TIME_EXCEEDED) { + struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv; + if (t->parms.iph.ttl) { + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + } + } + + icmp_send(skb2, rel_type, rel_code, rel_info); + kfree_skb(skb2); + return; +#endif +} + +int ipip_rcv(struct sk_buff *skb, unsigned short len) +{ + struct iphdr *iph; + struct ip_tunnel *tunnel; + + iph = skb->nh.iph; + skb->mac.raw = skb->nh.raw; + skb->nh.raw = skb_pull(skb, skb->h.raw - skb->data); + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = __constant_htons(ETH_P_IP); + skb->ip_summed = 0; + skb->pkt_type = PACKET_HOST; + + if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { + tunnel->stat.rx_packets++; + tunnel->stat.rx_bytes += skb->len; + skb->dev = tunnel->dev; + dst_release(skb->dst); + skb->dst = NULL; + netif_rx(skb); + return 0; + } + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + kfree_skb(skb); + return 0; +} + +/* + * This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. 
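Before the transmit path, a brief aside on how a tunnel like this is created in the first place (an editor's sketch, not part of the imported source; the tunnel name and addresses are made up): userspace issues SIOCADDTUNNEL against the fallback device "tunl0", which ends up in ipip_tunnel_ioctl() further down in this file:

```c
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/ip.h>
#include <linux/if_tunnel.h>	/* struct ip_tunnel_parm, SIOCADDTUNNEL */

/* fd is any AF_INET socket; local/remote are dotted-quad endpoint addresses. */
static int add_ipip_tunnel(int fd, const char *local, const char *remote)
{
	struct ip_tunnel_parm p;
	struct ifreq ifr;

	memset(&p, 0, sizeof(p));
	strcpy(p.name, "tunl1");		/* hypothetical tunnel name */
	p.iph.version  = 4;
	p.iph.ihl      = 5;
	p.iph.protocol = IPPROTO_IPIP;
	p.iph.ttl      = 64;			/* non-zero TTL: the handler also sets DF */
	p.iph.saddr    = inet_addr(local);
	p.iph.daddr    = inet_addr(remote);

	memset(&ifr, 0, sizeof(ifr));
	strcpy(ifr.ifr_name, "tunl0");		/* requests go through the fallback device */
	ifr.ifr_data = (void *) &p;

	return ioctl(fd, SIOCADDTUNNEL, &ifr);
}
```

The call needs CAP_NET_ADMIN, mirroring the capable() check in the ioctl handler, and the parameters must pass the same version/protocol/ihl validation shown there.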
+ */ + +static int ipip_tunnel_xmit(struct sk_buff *skb, struct device *dev) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct net_device_stats *stats = &tunnel->stat; + struct iphdr *tiph = &tunnel->parms.iph; + u8 tos = tunnel->parms.iph.tos; + u16 df = tiph->frag_off; + struct rtable *rt; /* Route to the other host */ + struct device *tdev; /* Device to other host */ + struct iphdr *old_iph = skb->nh.iph; + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + u32 dst = tiph->daddr; + int mtu; + + if (tunnel->recursion++) { + tunnel->stat.collisions++; + goto tx_error; + } + + if (skb->protocol != __constant_htons(ETH_P_IP)) + goto tx_error; + + if (tos&1) + tos = old_iph->tos; + + if (!dst) { + /* NBMA tunnel */ + if ((rt = (struct rtable*)skb->dst) == NULL) { + tunnel->stat.tx_fifo_errors++; + goto tx_error; + } + if ((dst = rt->rt_gateway) == 0) + goto tx_error_icmp; + } + + if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) { + tunnel->stat.tx_carrier_errors++; + goto tx_error_icmp; + } + tdev = rt->u.dst.dev; + + if (tdev == dev) { + ip_rt_put(rt); + tunnel->stat.collisions++; + goto tx_error; + } + + mtu = rt->u.dst.pmtu - sizeof(struct iphdr); + if (mtu < 68) { + tunnel->stat.collisions++; + ip_rt_put(rt); + goto tx_error; + } + if (skb->dst && mtu < skb->dst->pmtu) + skb->dst->pmtu = mtu; + + df |= (old_iph->frag_off&__constant_htons(IP_DF)); + + if ((old_iph->frag_off&__constant_htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + goto tx_error; + } + + if (tunnel->err_count > 0) { + if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + tunnel->err_count--; + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + skb->h.raw = skb->nh.raw; + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr)); + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + stats->tx_dropped++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; + } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + dev_kfree_skb(skb); + skb = new_skb; + } + + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. 
+ */ + + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = tos; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + + if ((iph->ttl = tiph->ttl) == 0) + iph->ttl = old_iph->ttl; + + iph->tot_len = htons(skb->len); + iph->id = htons(ip_id_count++); + ip_send_check(iph); + + stats->tx_bytes += skb->len; + stats->tx_packets++; + ip_send(skb); + tunnel->recursion--; + return 0; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + stats->tx_errors++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; +} + +static int +ipip_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel *t; + + MOD_INC_USE_COUNT; + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == &ipip_fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipip_tunnel_locate(&p, 0); + } + if (t == NULL) + t = (struct ip_tunnel*)dev->priv; + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || + p.iph.ihl != 5 || (p.iph.frag_off&__constant_htons(~IP_DF))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= __constant_htons(IP_DF); + + t = ipip_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + + if (dev != &ipip_fb_tunnel_dev && cmd == SIOCCHGTUNNEL && + t != &ipip_fb_tunnel) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || + (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { + err = -EINVAL; + break; + } + t = (struct ip_tunnel*)dev->priv; + start_bh_atomic(); + ipip_tunnel_unlink(t); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + ipip_tunnel_link(t); + end_bh_atomic(); + netdev_state_change(dev); + } + } + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + t->parms.iph.frag_off = p.iph.frag_off; + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + if (dev == &ipip_fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipip_tunnel_locate(&p, 0)) == NULL) + goto done; + err = -EPERM; + if (t == &ipip_fb_tunnel) + goto done; + } + err = unregister_netdevice(dev); + break; + + default: + err = -EINVAL; + } + +done: + MOD_DEC_USE_COUNT; + return err; +} + +static struct net_device_stats *ipip_tunnel_get_stats(struct device *dev) +{ + return &(((struct ip_tunnel*)dev->priv)->stat); +} + +static int ipip_tunnel_change_mtu(struct device *dev, int new_mtu) +{ + if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static void ipip_tunnel_init_gen(struct device *dev) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + + dev->destructor = ipip_tunnel_destroy; + dev->hard_start_xmit = ipip_tunnel_xmit; + dev->get_stats = ipip_tunnel_get_stats; + dev->do_ioctl = ipip_tunnel_ioctl; + dev->change_mtu = ipip_tunnel_change_mtu; + + dev_init_buffers(dev); + + dev->type = ARPHRD_TUNNEL; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); + dev->mtu = 1500 - sizeof(struct iphdr); + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; + memcpy(dev->dev_addr, &t->parms.iph.saddr, 4); + memcpy(dev->broadcast, &t->parms.iph.daddr, 4); +} + +static int ipip_tunnel_init(struct device *dev) +{ + struct device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + + tunnel = (struct ip_tunnel*)dev->priv; + iph = &tunnel->parms.iph; + + ipip_tunnel_init_gen(dev); + + if (iph->daddr) { + struct rtable *rt; + if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = dev_get_by_index(tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); + dev->mtu = tdev->mtu - sizeof(struct iphdr); + } + dev->iflink = tunnel->parms.link; + + return 0; +} + +#ifdef MODULE +static int ipip_fb_tunnel_open(struct device *dev) +{ + MOD_INC_USE_COUNT; + return 0; +} + +static int ipip_fb_tunnel_close(struct device *dev) +{ + MOD_DEC_USE_COUNT; + return 0; +} +#endif + +__initfunc(int ipip_fb_tunnel_init(struct device *dev)) +{ + struct iphdr *iph; + + ipip_tunnel_init_gen(dev); +#ifdef MODULE + dev->open = ipip_fb_tunnel_open; + dev->stop = ipip_fb_tunnel_close; +#endif + + iph = &ipip_fb_tunnel.parms.iph; + iph->version = 4; + iph->protocol = IPPROTO_IPIP; + iph->ihl = 5; + + tunnels_wc[0] = &ipip_fb_tunnel; + return 0; +} + +static struct inet_protocol ipip_protocol = { + ipip_rcv, /* IPIP handler */ + ipip_err, /* TUNNEL error control */ + 0, /* next */ + IPPROTO_IPIP, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "IPIP" /* name */ +}; + +#ifdef MODULE +int init_module(void) +#else +__initfunc(int ipip_init(void)) +#endif +{ + printk(KERN_INFO "IPv4 over IPv4 tunneling driver\n"); + + ipip_fb_tunnel_dev.priv = (void*)&ipip_fb_tunnel; + ipip_fb_tunnel_dev.name = ipip_fb_tunnel.parms.name; +#ifdef MODULE + register_netdev(&ipip_fb_tunnel_dev); +#else + register_netdevice(&ipip_fb_tunnel_dev); +#endif + + inet_add_protocol(&ipip_protocol); + return 0; +} + +#ifdef MODULE + +void cleanup_module(void) +{ + if ( inet_del_protocol(&ipip_protocol) < 0 ) + printk(KERN_INFO 
"ipip close: can't remove protocol\n"); + + unregister_netdevice(&ipip_fb_tunnel_dev); +} + +#endif diff --git a/pfinet/linux-src/net/ipv4/ipmr.c b/pfinet/linux-src/net/ipv4/ipmr.c new file mode 100644 index 00000000..cd51cd9a --- /dev/null +++ b/pfinet/linux-src/net/ipv4/ipmr.c @@ -0,0 +1,1609 @@ +/* + * IP multicast routing support for mrouted 3.6/3.8 + * + * (c) 1995 Alan Cox, <alan@redhat.com> + * Linux Consultancy and Custom Driver Development + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Version: $Id: ipmr.c,v 1.40.2.2 1999/06/20 21:27:44 davem Exp $ + * + * Fixes: + * Michael Chastain : Incorrect size of copying. + * Alan Cox : Added the cache manager code + * Alan Cox : Fixed the clone/copy bug and device race. + * Mike McLagan : Routing by source + * Malcolm Beattie : Buffer handling fixes. + * Alexey Kuznetsov : Double buffer free and other fixes. + * SVR Anand : Fixed several multicast bugs and problems. + * Alexey Kuznetsov : Status, optimisations and more. + * Brad Parker : Better behaviour on mrouted upcall + * overflow. + * Carlos Picoto : PIMv1 Support + * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header + * Relax this requrement to work with older peers. + * + */ + +#include <linux/config.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/fcntl.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/proc_fs.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/raw.h> +#include <linux/notifier.h> +#include <linux/if_arp.h> +#include <linux/ip_fw.h> +#include <linux/firewall.h> +#include <net/ipip.h> +#include <net/checksum.h> + +#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) +#define CONFIG_IP_PIMSM 1 +#endif + +/* + * Multicast router control variables + */ + +static struct vif_device vif_table[MAXVIFS]; /* Devices */ +static unsigned long vifc_map; /* Active device map */ +static int maxvif; +int mroute_do_assert = 0; /* Set in PIM assert */ +int mroute_do_pim = 0; +static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */ +int cache_resolve_queue_len = 0; /* Size of unresolved */ + +static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); +static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); +static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); + +extern struct inet_protocol pim_protocol; + +static +struct device *ipmr_new_tunnel(struct vifctl *v) +{ + struct device *dev = NULL; + + rtnl_lock(); + dev = dev_get("tunl0"); + + if (dev) { + int err; + struct ifreq ifr; + mm_segment_t oldfs; + struct ip_tunnel_parm p; + struct in_device *in_dev; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = v->vifc_rmt_addr.s_addr; + p.iph.saddr = v->vifc_lcl_addr.s_addr; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPIP; + sprintf(p.name, 
"dvmrp%d", v->vifc_vifi); + ifr.ifr_ifru.ifru_data = (void*)&p; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); + set_fs(oldfs); + + if (err == 0 && (dev = dev_get(p.name)) != NULL) { + dev->flags |= IFF_MULTICAST; + + in_dev = dev->ip_ptr; + if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL) + goto failure; + in_dev->cnf.rp_filter = 0; + + if (dev_open(dev)) + goto failure; + } + } + rtnl_unlock(); + return dev; + +failure: + unregister_netdevice(dev); + rtnl_unlock(); + return NULL; +} + +#ifdef CONFIG_IP_PIMSM + +static int reg_vif_num = -1; +static struct device * reg_dev; + +static int reg_vif_xmit(struct sk_buff *skb, struct device *dev) +{ + ((struct net_device_stats*)dev->priv)->tx_bytes += skb->len; + ((struct net_device_stats*)dev->priv)->tx_packets++; + ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT); + kfree_skb(skb); + return 0; +} + +static struct net_device_stats *reg_vif_get_stats(struct device *dev) +{ + return (struct net_device_stats*)dev->priv; +} + +static +struct device *ipmr_reg_vif(struct vifctl *v) +{ + struct device *dev; + struct in_device *in_dev; + int size; + + size = sizeof(*dev) + IFNAMSIZ + sizeof(struct net_device_stats); + dev = kmalloc(size, GFP_KERNEL); + if (!dev) + return NULL; + + memset(dev, 0, size); + + dev->priv = dev + 1; + dev->name = dev->priv + sizeof(struct net_device_stats); + + strcpy(dev->name, "pimreg"); + + dev->type = ARPHRD_PIMREG; + dev->mtu = 1500 - sizeof(struct iphdr) - 8; + dev->flags = IFF_NOARP; + dev->hard_start_xmit = reg_vif_xmit; + dev->get_stats = reg_vif_get_stats; + + rtnl_lock(); + + if (register_netdevice(dev)) { + rtnl_unlock(); + kfree(dev); + return NULL; + } + dev->iflink = 0; + + if ((in_dev = inetdev_init(dev)) == NULL) + goto failure; + + in_dev->cnf.rp_filter = 0; + + if (dev_open(dev)) + goto failure; + + rtnl_unlock(); + reg_dev = dev; + return dev; + +failure: + unregister_netdevice(dev); + rtnl_unlock(); + kfree(dev); + return NULL; +} +#endif + +/* + * Delete a VIF entry + */ + +static int vif_delete(int vifi) +{ + struct vif_device *v; + struct device *dev; + struct in_device *in_dev; + + if (vifi < 0 || vifi >= maxvif || !(vifc_map&(1<<vifi))) + return -EADDRNOTAVAIL; + + v = &vif_table[vifi]; + + dev = v->dev; + v->dev = NULL; + vifc_map &= ~(1<<vifi); + + if ((in_dev = dev->ip_ptr) != NULL) + in_dev->cnf.mc_forwarding = 0; + + dev_set_allmulti(dev, -1); + ip_rt_multicast_event(in_dev); + + if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) { +#ifdef CONFIG_IP_PIMSM + if (vifi == reg_vif_num) { + reg_vif_num = -1; + reg_dev = NULL; + } +#endif + unregister_netdevice(dev); + if (v->flags&VIFF_REGISTER) + kfree(dev); + } + + if (vifi+1 == maxvif) { + int tmp; + for (tmp=vifi-1; tmp>=0; tmp--) { + if (vifc_map&(1<<tmp)) + break; + } + maxvif = tmp+1; + } + return 0; +} + +static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls) +{ + int vifi; + + start_bh_atomic(); + + cache->mfc_minvif = MAXVIFS; + cache->mfc_maxvif = 0; + memset(cache->mfc_ttls, 255, MAXVIFS); + + for (vifi=0; vifi<maxvif; vifi++) { + if (vifc_map&(1<<vifi) && ttls[vifi] && ttls[vifi] < 255) { + cache->mfc_ttls[vifi] = ttls[vifi]; + if (cache->mfc_minvif > vifi) + cache->mfc_minvif = vifi; + if (cache->mfc_maxvif <= vifi) + cache->mfc_maxvif = vifi + 1; + } + } + end_bh_atomic(); +} + +/* + * Delete a multicast route cache entry + */ + +static void ipmr_cache_delete(struct mfc_cache *cache) +{ + struct sk_buff *skb; + int line; + struct mfc_cache **cp; + + /* + * 
Find the right cache line + */ + + line=MFC_HASH(cache->mfc_mcastgrp,cache->mfc_origin); + cp=&(mfc_cache_array[line]); + + if(cache->mfc_flags&MFC_QUEUED) + del_timer(&cache->mfc_timer); + + /* + * Unlink the buffer + */ + + while(*cp!=NULL) + { + if(*cp==cache) + { + *cp=cache->next; + break; + } + cp=&((*cp)->next); + } + + /* + * Free the buffer. If it is a pending resolution + * clean up the other resources. + */ + + if(cache->mfc_flags&MFC_QUEUED) + { + cache_resolve_queue_len--; + while((skb=skb_dequeue(&cache->mfc_unresolved))) { +#ifdef CONFIG_RTNETLINK + if (skb->nh.iph->version == 0) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT; + netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); + } else +#endif + kfree_skb(skb); + } + } + kfree_s(cache,sizeof(cache)); +} + +/* + * Cache expiry timer + */ + +static void ipmr_cache_timer(unsigned long data) +{ + struct mfc_cache *cache=(struct mfc_cache *)data; + ipmr_cache_delete(cache); +} + +/* + * Insert a multicast cache entry + */ + +static void ipmr_cache_insert(struct mfc_cache *c) +{ + int line=MFC_HASH(c->mfc_mcastgrp,c->mfc_origin); + c->next=mfc_cache_array[line]; + mfc_cache_array[line]=c; +} + +/* + * Find a multicast cache entry + */ + +struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp) +{ + int line=MFC_HASH(mcastgrp,origin); + struct mfc_cache *cache; + + cache=mfc_cache_array[line]; + while(cache!=NULL) + { + if(cache->mfc_origin==origin && cache->mfc_mcastgrp==mcastgrp) + return cache; + cache=cache->next; + } + return NULL; +} + +/* + * Allocate a multicast cache entry + */ + +static struct mfc_cache *ipmr_cache_alloc(int priority) +{ + struct mfc_cache *c=(struct mfc_cache *)kmalloc(sizeof(struct mfc_cache), priority); + if(c==NULL) + return NULL; + memset(c, 0, sizeof(*c)); + skb_queue_head_init(&c->mfc_unresolved); + init_timer(&c->mfc_timer); + c->mfc_timer.data=(long)c; + c->mfc_timer.function=ipmr_cache_timer; + c->mfc_minvif = MAXVIFS; + return c; +} + +/* + * A cache entry has gone into a resolved state from queued + */ + +static void ipmr_cache_resolve(struct mfc_cache *cache) +{ + struct sk_buff *skb; + + start_bh_atomic(); + + /* + * Kill the queue entry timer. + */ + + del_timer(&cache->mfc_timer); + + if (cache->mfc_flags&MFC_QUEUED) { + cache->mfc_flags&=~MFC_QUEUED; + cache_resolve_queue_len--; + } + + end_bh_atomic(); + + /* + * Play the pending entries through our router + */ + while((skb=skb_dequeue(&cache->mfc_unresolved))) { +#ifdef CONFIG_RTNETLINK + if (skb->nh.iph->version == 0) { + int err; + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); + + if (ipmr_fill_mroute(skb, cache, NLMSG_DATA(nlh)) > 0) { + nlh->nlmsg_len = skb->tail - (u8*)nlh; + } else { + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; + } + err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); + } else +#endif + ip_mr_forward(skb, cache, 0); + } +} + +/* + * Bounce a cache query up to mrouted. We could use netlink for this but mrouted + * expects the following bizarre scheme.. 
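+ *
+ * (The scheme, as implemented below: for IGMPMSG_NOCACHE/WRONGVIF the
+ * kernel builds a fake IGMP packet - the original IP header with
+ * iph->protocol forced to 0, overlaid by struct igmpmsg, plus an IGMP
+ * header whose type is the upcall code - and queues it on mroute_socket
+ * with sock_queue_rcv_skb(); IGMPMSG_WHOLEPKT upcalls instead carry the
+ * complete packet behind a prepended pseudo IP header.)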
+ */ + +static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) +{ + struct sk_buff *skb; + int ihl = pkt->nh.iph->ihl<<2; + struct igmphdr *igmp; + struct igmpmsg *msg; + int ret; + + if (mroute_socket==NULL) + return -EINVAL; + +#ifdef CONFIG_IP_PIMSM + if (assert == IGMPMSG_WHOLEPKT) + skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); + else +#endif + skb = alloc_skb(128, GFP_ATOMIC); + + if(!skb) + return -ENOBUFS; + +#ifdef CONFIG_IP_PIMSM + if (assert == IGMPMSG_WHOLEPKT) { + /* Ugly, but we have no choice with this interface. + Duplicate old header, fix ihl, length etc. + And all this only to mangle msg->im_msgtype and + to set msg->im_mbz to "mbz" :-) + */ + msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr)); + skb->nh.raw = skb->h.raw = (u8*)msg; + memcpy(msg, pkt->nh.raw, sizeof(struct iphdr)); + msg->im_msgtype = IGMPMSG_WHOLEPKT; + msg->im_mbz = 0; + msg->im_vif = reg_vif_num; + skb->nh.iph->ihl = sizeof(struct iphdr) >> 2; + skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr)); + } else +#endif + { + + /* + * Copy the IP header + */ + + skb->nh.iph = (struct iphdr *)skb_put(skb, ihl); + memcpy(skb->data,pkt->data,ihl); + skb->nh.iph->protocol = 0; /* Flag to the kernel this is a route add */ + msg = (struct igmpmsg*)skb->nh.iph; + msg->im_vif = vifi; + skb->dst = dst_clone(pkt->dst); + + /* + * Add our header + */ + + igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr)); + igmp->type = + msg->im_msgtype = assert; + igmp->code = 0; + skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */ + skb->h.raw = skb->nh.raw; + } + + /* + * Deliver to mrouted + */ + if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) { + if (net_ratelimit()) + printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); + kfree_skb(skb); + } + + return ret; +} + +/* + * Queue a packet for resolution + */ + +static int ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct sk_buff *skb) +{ + if(cache==NULL) + { + /* + * Create a new entry if allowable + */ + if(cache_resolve_queue_len>=10 || (cache=ipmr_cache_alloc(GFP_ATOMIC))==NULL) + { + kfree_skb(skb); + return -ENOBUFS; + } + /* + * Fill in the new cache entry + */ + cache->mfc_parent=ALL_VIFS; + cache->mfc_origin=skb->nh.iph->saddr; + cache->mfc_mcastgrp=skb->nh.iph->daddr; + cache->mfc_flags=MFC_QUEUED; + /* + * Link to the unresolved list + */ + ipmr_cache_insert(cache); + cache_resolve_queue_len++; + /* + * Fire off the expiry timer + */ + cache->mfc_timer.expires=jiffies+10*HZ; + add_timer(&cache->mfc_timer); + /* + * Reflect first query at mrouted. + */ + if(mroute_socket) + { + /* If the report failed throw the cache entry + out - Brad Parker + + OK, OK, Brad. 
Only do not forget to free skb + and return :-) --ANK + */ + if (ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE)<0) { + ipmr_cache_delete(cache); + kfree_skb(skb); + return -ENOBUFS; + } + } + } + /* + * See if we can append the packet + */ + if(cache->mfc_queuelen>3) + { + kfree_skb(skb); + return -ENOBUFS; + } + cache->mfc_queuelen++; + skb_queue_tail(&cache->mfc_unresolved,skb); + return 0; +} + +/* + * MFC cache manipulation by user space mroute daemon + */ + +int ipmr_mfc_modify(int action, struct mfcctl *mfc) +{ + struct mfc_cache *cache; + + if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr)) + return -EINVAL; + /* + * Find the cache line + */ + + start_bh_atomic(); + + cache=ipmr_cache_find(mfc->mfcc_origin.s_addr,mfc->mfcc_mcastgrp.s_addr); + + /* + * Delete an entry + */ + if(action==MRT_DEL_MFC) + { + if(cache) + { + ipmr_cache_delete(cache); + end_bh_atomic(); + return 0; + } + end_bh_atomic(); + return -ENOENT; + } + if(cache) + { + + /* + * Update the cache, see if it frees a pending queue + */ + + cache->mfc_flags|=MFC_RESOLVED; + cache->mfc_parent=mfc->mfcc_parent; + ipmr_update_threshoulds(cache, mfc->mfcc_ttls); + + /* + * Check to see if we resolved a queued list. If so we + * need to send on the frames and tidy up. + */ + + if(cache->mfc_flags&MFC_QUEUED) + ipmr_cache_resolve(cache); /* Unhook & send the frames */ + end_bh_atomic(); + return 0; + } + + /* + * Unsolicited update - that's ok, add anyway. + */ + + + cache=ipmr_cache_alloc(GFP_ATOMIC); + if(cache==NULL) + { + end_bh_atomic(); + return -ENOMEM; + } + cache->mfc_flags=MFC_RESOLVED; + cache->mfc_origin=mfc->mfcc_origin.s_addr; + cache->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr; + cache->mfc_parent=mfc->mfcc_parent; + ipmr_update_threshoulds(cache, mfc->mfcc_ttls); + ipmr_cache_insert(cache); + end_bh_atomic(); + return 0; +} + +static void mrtsock_destruct(struct sock *sk) +{ + if (sk == mroute_socket) { + ipv4_devconf.mc_forwarding = 0; + + mroute_socket=NULL; + synchronize_bh(); + + mroute_close(sk); + } +} + +/* + * Socket options and virtual interface manipulation. The whole + * virtual interface system is a complete heap, but unfortunately + * that's how BSD mrouted happens to think. Maybe one day with a proper + * MOSPF/PIM router set up we can clean this up. + */ + +int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) +{ + struct vifctl vif; + struct mfcctl mfc; + + if(optname!=MRT_INIT) + { + if(sk!=mroute_socket) + return -EACCES; + } + + switch(optname) + { + case MRT_INIT: + if(sk->type!=SOCK_RAW || sk->num!=IPPROTO_IGMP) + return -EOPNOTSUPP; + if(optlen!=sizeof(int)) + return -ENOPROTOOPT; + { + int opt; + if (get_user(opt,(int *)optval)) + return -EFAULT; + if (opt != 1) + return -ENOPROTOOPT; + } + if(mroute_socket) + return -EADDRINUSE; + mroute_socket=sk; + ipv4_devconf.mc_forwarding = 1; + if (ip_ra_control(sk, 1, mrtsock_destruct) == 0) + return 0; + mrtsock_destruct(sk); + return -EADDRINUSE; + case MRT_DONE: + return ip_ra_control(sk, 0, NULL); + case MRT_ADD_VIF: + case MRT_DEL_VIF: + if(optlen!=sizeof(vif)) + return -EINVAL; + if (copy_from_user(&vif,optval,sizeof(vif))) + return -EFAULT; + if(vif.vifc_vifi >= MAXVIFS) + return -ENFILE; + if(optname==MRT_ADD_VIF) + { + struct vif_device *v=&vif_table[vif.vifc_vifi]; + struct device *dev; + struct in_device *in_dev; + + /* Is vif busy ? 
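+			   (vifc_map is a plain bitmask, one bit per VIF
+			   slot, so a set bit here means the slot is in use)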
*/ + if (vifc_map&(1<<vif.vifc_vifi)) + return -EADDRINUSE; + + switch (vif.vifc_flags) { +#ifdef CONFIG_IP_PIMSM + case VIFF_REGISTER: + + /* + * Special Purpose VIF in PIM + * All the packets will be sent to the daemon + */ + if (reg_vif_num >= 0) + return -EADDRINUSE; + reg_vif_num = vif.vifc_vifi; + dev = ipmr_reg_vif(&vif); + if (!dev) { + reg_vif_num = -1; + return -ENOBUFS; + } + break; +#endif + case VIFF_TUNNEL: + dev = ipmr_new_tunnel(&vif); + if (!dev) + return -ENOBUFS; + break; + case 0: + dev=ip_dev_find(vif.vifc_lcl_addr.s_addr); + if (!dev) + return -EADDRNOTAVAIL; + break; + default: +#if 0 + printk(KERN_DEBUG "ipmr_add_vif: flags %02x\n", vif.vifc_flags); +#endif + return -EINVAL; + } + + if ((in_dev = dev->ip_ptr) == NULL) + return -EADDRNOTAVAIL; + if (in_dev->cnf.mc_forwarding) + return -EADDRINUSE; + in_dev->cnf.mc_forwarding = 1; + dev_set_allmulti(dev, +1); + ip_rt_multicast_event(in_dev); + + /* + * Fill in the VIF structures + */ + start_bh_atomic(); + v->rate_limit=vif.vifc_rate_limit; + v->local=vif.vifc_lcl_addr.s_addr; + v->remote=vif.vifc_rmt_addr.s_addr; + v->flags=vif.vifc_flags; + v->threshold=vif.vifc_threshold; + v->dev=dev; + v->bytes_in = 0; + v->bytes_out = 0; + v->pkt_in = 0; + v->pkt_out = 0; + v->link = dev->ifindex; + if (vif.vifc_flags&(VIFF_TUNNEL|VIFF_REGISTER)) + v->link = dev->iflink; + vifc_map|=(1<<vif.vifc_vifi); + if (vif.vifc_vifi+1 > maxvif) + maxvif = vif.vifc_vifi+1; + end_bh_atomic(); + return 0; + } else { + int ret; + rtnl_lock(); + ret = vif_delete(vif.vifc_vifi); + rtnl_unlock(); + return ret; + } + + /* + * Manipulate the forwarding caches. These live + * in a sort of kernel/user symbiosis. + */ + case MRT_ADD_MFC: + case MRT_DEL_MFC: + if(optlen!=sizeof(mfc)) + return -EINVAL; + if (copy_from_user(&mfc,optval, sizeof(mfc))) + return -EFAULT; + return ipmr_mfc_modify(optname, &mfc); + /* + * Control PIM assert. + */ + case MRT_ASSERT: + { + int v; + if(get_user(v,(int *)optval)) + return -EFAULT; + mroute_do_assert=(v)?1:0; + return 0; + } +#ifdef CONFIG_IP_PIMSM + case MRT_PIM: + { + int v; + if(get_user(v,(int *)optval)) + return -EFAULT; + v = (v)?1:0; + if (v != mroute_do_pim) { + mroute_do_pim = v; + mroute_do_assert = v; +#ifdef CONFIG_IP_PIMSM_V2 + if (mroute_do_pim) + inet_add_protocol(&pim_protocol); + else + inet_del_protocol(&pim_protocol); +#endif + } + return 0; + } +#endif + /* + * Spurious command, or MRT_VERSION which you cannot + * set. + */ + default: + return -ENOPROTOOPT; + } +} + +/* + * Getsock opt support for the multicast routing system. + */ + +int ip_mroute_getsockopt(struct sock *sk,int optname,char *optval,int *optlen) +{ + int olr; + int val; + + if(sk!=mroute_socket) + return -EACCES; + if(optname!=MRT_VERSION && +#ifdef CONFIG_IP_PIMSM + optname!=MRT_PIM && +#endif + optname!=MRT_ASSERT) + return -ENOPROTOOPT; + + if(get_user(olr, optlen)) + return -EFAULT; + + olr=min(olr,sizeof(int)); + if(put_user(olr,optlen)) + return -EFAULT; + if(optname==MRT_VERSION) + val=0x0305; +#ifdef CONFIG_IP_PIMSM + else if(optname==MRT_PIM) + val=mroute_do_pim; +#endif + else + val=mroute_do_assert; + if(copy_to_user(optval,&val,olr)) + return -EFAULT; + return 0; +} + +/* + * The IP multicast ioctl support routines. 
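+ *
+ * SIOCGETVIFCNT copies out the packet/byte counters of one VIF and
+ * SIOCGETSGCNT those of one (origin, group) cache entry.  A rough
+ * caller-side sketch (not part of this file; error handling omitted):
+ *
+ *	struct sioc_vif_req vr;
+ *	vr.vifi = 0;
+ *	ioctl(fd, SIOCGETVIFCNT, &vr);
+ *
+ * on return vr.icount/vr.ocount and vr.ibytes/vr.obytes hold the
+ * input/output packet and byte counts for that VIF.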
+ */ + +int ipmr_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + struct sioc_sg_req sr; + struct sioc_vif_req vr; + struct vif_device *vif; + struct mfc_cache *c; + + switch(cmd) + { + case SIOCGETVIFCNT: + if (copy_from_user(&vr,(void *)arg,sizeof(vr))) + return -EFAULT; + if(vr.vifi>=maxvif) + return -EINVAL; + vif=&vif_table[vr.vifi]; + if(vifc_map&(1<<vr.vifi)) + { + vr.icount=vif->pkt_in; + vr.ocount=vif->pkt_out; + vr.ibytes=vif->bytes_in; + vr.obytes=vif->bytes_out; + if (copy_to_user((void *)arg,&vr,sizeof(vr))) + return -EFAULT; + return 0; + } + return -EADDRNOTAVAIL; + case SIOCGETSGCNT: + if (copy_from_user(&sr,(void *)arg,sizeof(sr))) + return -EFAULT; + for (c = mfc_cache_array[MFC_HASH(sr.grp.s_addr, sr.src.s_addr)]; + c; c = c->next) { + if (sr.grp.s_addr == c->mfc_mcastgrp && + sr.src.s_addr == c->mfc_origin) { + sr.pktcnt = c->mfc_pkt; + sr.bytecnt = c->mfc_bytes; + sr.wrong_if = c->mfc_wrong_if; + if (copy_to_user((void *)arg,&sr,sizeof(sr))) + return -EFAULT; + return 0; + } + } + return -EADDRNOTAVAIL; + default: + return -ENOIOCTLCMD; + } +} + +/* + * Close the multicast socket, and clear the vif tables etc + */ + +void mroute_close(struct sock *sk) +{ + int i; + + /* + * Shut down all active vif entries + */ + rtnl_lock(); + for(i=0; i<maxvif; i++) + vif_delete(i); + rtnl_unlock(); + + /* + * Wipe the cache + */ + for(i=0;i<MFC_LINES;i++) + { + start_bh_atomic(); + while(mfc_cache_array[i]!=NULL) + ipmr_cache_delete(mfc_cache_array[i]); + end_bh_atomic(); + } +} + +static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct vif_device *v; + int ct; + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + v=&vif_table[0]; + for(ct=0;ct<maxvif;ct++) { + if (vifc_map&(1<<ct) && v->dev==ptr) + vif_delete(ct); + v++; + } + return NOTIFY_DONE; +} + + +static struct notifier_block ip_mr_notifier={ + ipmr_device_event, + NULL, + 0 +}; + +/* + * Encapsulate a packet by attaching a valid IPIP header to it. + * This avoids tunnel drivers and other mess and gives us the speed so + * important for multicast video. 
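+ *
+ * ip_encap() just prepends a 20 byte outer header: tos and ttl are
+ * copied from the inner header, saddr/daddr are the tunnel endpoints,
+ * protocol is IPPROTO_IPIP and the checksum is set by ip_send_check().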
+ */ + +static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr) +{ + struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr)); + + iph->version = 4; + iph->tos = skb->nh.iph->tos; + iph->ttl = skb->nh.iph->ttl; + iph->frag_off = 0; + iph->daddr = daddr; + iph->saddr = saddr; + iph->protocol = IPPROTO_IPIP; + iph->ihl = 5; + iph->tot_len = htons(skb->len); + iph->id = htons(ip_id_count++); + ip_send_check(iph); + + skb->h.ipiph = skb->nh.iph; + skb->nh.iph = iph; +} + +/* + * Processing handlers for ipmr_forward + */ + +static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, + int vifi, int last) +{ + struct iphdr *iph = skb->nh.iph; + struct vif_device *vif = &vif_table[vifi]; + struct device *dev; + struct rtable *rt; + int encap = 0; + struct sk_buff *skb2; + +#ifdef CONFIG_IP_PIMSM + if (vif->flags & VIFF_REGISTER) { + vif->pkt_out++; + vif->bytes_out+=skb->len; + ((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len; + ((struct net_device_stats*)vif->dev->priv)->tx_packets++; + ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT); + return; + } +#endif + + if (vif->flags&VIFF_TUNNEL) { + if (ip_route_output(&rt, vif->remote, vif->local, RT_TOS(iph->tos), vif->link)) + return; + encap = sizeof(struct iphdr); + } else { + if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos), vif->link)) + return; + } + + dev = rt->u.dst.dev; + + if (skb->len+encap > rt->u.dst.pmtu && (ntohs(iph->frag_off) & IP_DF)) { + /* Do not fragment multicasts. Alas, IPv4 does not + allow to send ICMP, so that packets will disappear + to blackhole. + */ + + ip_statistics.IpFragFails++; + ip_rt_put(rt); + return; + } + + encap += dev->hard_header_len; + + if (skb_headroom(skb) < encap || skb_cloned(skb) || !last) + skb2 = skb_realloc_headroom(skb, (encap + 15)&~15); + else if (atomic_read(&skb->users) != 1) + skb2 = skb_clone(skb, GFP_ATOMIC); + else { + atomic_inc(&skb->users); + skb2 = skb; + } + + if (skb2 == NULL) { + ip_rt_put(rt); + return; + } + + vif->pkt_out++; + vif->bytes_out+=skb->len; + + dst_release(skb2->dst); + skb2->dst = &rt->u.dst; + iph = skb2->nh.iph; + ip_decrease_ttl(iph); + +#ifdef CONFIG_FIREWALL + if (call_fw_firewall(PF_INET, vif->dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) { + kfree_skb(skb2); + return; + } + if (call_out_firewall(PF_INET, vif->dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) { + kfree_skb(skb2); + return; + } +#endif + if (vif->flags & VIFF_TUNNEL) { + ip_encap(skb2, vif->local, vif->remote); +#ifdef CONFIG_FIREWALL + /* Double output firewalling on tunnels: one is on tunnel + another one is on real device. + */ + if (call_out_firewall(PF_INET, dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) { + kfree_skb(skb2); + return; + } +#endif + ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++; + ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb2->len; + } + + IPCB(skb2)->flags |= IPSKB_FORWARDED; + + + /* + * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally + * not only before forwarding, but after forwarding on all output + * interfaces. It is clear, if mrouter runs a multicasting + * program, it should receive packets not depending to what interface + * program is joined. + * If we will not make it, the program will have to join on all + * interfaces. On the other hand, multihoming host (or router, but + * not mrouter) cannot join to more than one interface - it will + * result in receiving multiple packets. 
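+ *
+ * (Packets that fit the path MTU go straight to dst->output(); larger
+ * ones are fragmented here, since DF-marked oversized packets were
+ * already dropped above.)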
+ */ + if (skb2->len <= rt->u.dst.pmtu) + skb2->dst->output(skb2); + else + ip_fragment(skb2, skb2->dst->output); +} + +int ipmr_find_vif(struct device *dev) +{ + int ct; + for (ct=0; ct<maxvif; ct++) { + if (vifc_map&(1<<ct) && vif_table[ct].dev == dev) + return ct; + } + return ALL_VIFS; +} + +/* "local" means that we should preserve one skb (for local delivery) */ + +int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local) +{ + int psend = -1; + int vif, ct; + + vif = cache->mfc_parent; + cache->mfc_pkt++; + cache->mfc_bytes += skb->len; + + /* + * Wrong interface: drop packet and (maybe) send PIM assert. + */ + if (vif_table[vif].dev != skb->dev) { + int true_vifi; + + if (((struct rtable*)skb->dst)->key.iif == 0) { + /* It is our own packet, looped back. + Very complicated situation... + + The best workaround until routing daemons will be + fixed is not to redistribute packet, if it was + send through wrong interface. It means, that + multicast applications WILL NOT work for + (S,G), which have default multicast route pointing + to wrong oif. In any case, it is not a good + idea to use multicasting applications on router. + */ + goto dont_forward; + } + + cache->mfc_wrong_if++; + true_vifi = ipmr_find_vif(skb->dev); + + if (true_vifi < MAXVIFS && mroute_do_assert && + /* pimsm uses asserts, when switching from RPT to SPT, + so that we cannot check that packet arrived on an oif. + It is bad, but otherwise we would need to move pretty + large chunk of pimd to kernel. Ough... --ANK + */ + (mroute_do_pim || cache->mfc_ttls[true_vifi] < 255) && + jiffies - cache->mfc_last_assert > MFC_ASSERT_THRESH) { + cache->mfc_last_assert = jiffies; + ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF); + } + goto dont_forward; + } + + vif_table[vif].pkt_in++; + vif_table[vif].bytes_in+=skb->len; + + /* + * Forward the frame + */ + for (ct = cache->mfc_maxvif-1; ct >= cache->mfc_minvif; ct--) { + if (skb->nh.iph->ttl > cache->mfc_ttls[ct]) { + if (psend != -1) + ipmr_queue_xmit(skb, cache, psend, 0); + psend=ct; + } + } + if (psend != -1) + ipmr_queue_xmit(skb, cache, psend, !local); + +dont_forward: + if (!local) + kfree_skb(skb); + return 0; +} + + +/* + * Multicast packets for forwarding arrive here + */ + +int ip_mr_input(struct sk_buff *skb) +{ + struct mfc_cache *cache; + int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL; + + /* Packet is looped back after forward, it should not be + forwarded second time, but still can be delivered locally. + */ + if (IPCB(skb)->flags&IPSKB_FORWARDED) + goto dont_forward; + + if (!local) { + if (IPCB(skb)->opt.router_alert) { + if (ip_call_ra_chain(skb)) + return 0; + } else if (skb->nh.iph->protocol == IPPROTO_IGMP && mroute_socket) { + /* IGMPv1 (and broken IGMPv2 implementations sort of + Cisco IOS <= 11.2(8)) do not put router alert + option to IGMP packets destined to routable + groups. It is very bad, because it means + that we can forward NO IGMP messages. 
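+ *
+ * Any IGMP that arrives here without the router alert option is
+ * therefore handed straight to mrouted via raw_rcv() on mroute_socket.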
+ */ + raw_rcv(mroute_socket, skb); + return 0; + } + } + + cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr); + + /* + * No usable cache entry + */ + + if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) { + int vif; + + if (local) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + ip_local_deliver(skb); + if (skb2 == NULL) + return -ENOBUFS; + skb = skb2; + } + + vif = ipmr_find_vif(skb->dev); + if (vif != ALL_VIFS) { + ipmr_cache_unresolved(cache, vif, skb); + return -EAGAIN; + } + kfree_skb(skb); + return 0; + } + + ip_mr_forward(skb, cache, local); + + if (local) + return ip_local_deliver(skb); + return 0; + +dont_forward: + if (local) + return ip_local_deliver(skb); + kfree_skb(skb); + return 0; +} + +#ifdef CONFIG_IP_PIMSM_V1 +/* + * Handle IGMP messages of PIMv1 + */ + +int pim_rcv_v1(struct sk_buff * skb, unsigned short len) +{ + struct igmphdr *pim = (struct igmphdr*)skb->h.raw; + struct iphdr *encap; + + if (!mroute_do_pim || + len < sizeof(*pim) + sizeof(*encap) || + pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER || + reg_dev == NULL) { + kfree_skb(skb); + return -EINVAL; + } + + encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr)); + /* + Check that: + a. packet is really destinted to a multicast group + b. packet is not a NULL-REGISTER + c. packet is not truncated + */ + if (!MULTICAST(encap->daddr) || + ntohs(encap->tot_len) == 0 || + ntohs(encap->tot_len) + sizeof(*pim) > len) { + kfree_skb(skb); + return -EINVAL; + } + skb->mac.raw = skb->nh.raw; + skb_pull(skb, (u8*)encap - skb->data); + skb->nh.iph = (struct iphdr *)skb->data; + skb->dev = reg_dev; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = __constant_htons(ETH_P_IP); + skb->ip_summed = 0; + skb->pkt_type = PACKET_HOST; + dst_release(skb->dst); + skb->dst = NULL; + ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len; + ((struct net_device_stats*)reg_dev->priv)->rx_packets++; + netif_rx(skb); + return 0; +} +#endif + +#ifdef CONFIG_IP_PIMSM_V2 +int pim_rcv(struct sk_buff * skb, unsigned short len) +{ + struct pimreghdr *pim = (struct pimreghdr*)skb->h.raw; + struct iphdr *encap; + + if (len < sizeof(*pim) + sizeof(*encap) || + pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || + (pim->flags&PIM_NULL_REGISTER) || + reg_dev == NULL || + (ip_compute_csum((void *)pim, sizeof(*pim)) && + ip_compute_csum((void *)pim, len))) { + kfree_skb(skb); + return -EINVAL; + } + + /* check if the inner packet is destined to mcast group */ + encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr)); + if (!MULTICAST(encap->daddr) || + ntohs(encap->tot_len) == 0 || + ntohs(encap->tot_len) + sizeof(*pim) > len) { + kfree_skb(skb); + return -EINVAL; + } + skb->mac.raw = skb->nh.raw; + skb_pull(skb, (u8*)encap - skb->data); + skb->nh.iph = (struct iphdr *)skb->data; + skb->dev = reg_dev; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = __constant_htons(ETH_P_IP); + skb->ip_summed = 0; + skb->pkt_type = PACKET_HOST; + dst_release(skb->dst); + ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len; + ((struct net_device_stats*)reg_dev->priv)->rx_packets++; + skb->dst = NULL; + netif_rx(skb); + return 0; +} +#endif + +#ifdef CONFIG_RTNETLINK + +static int +ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) +{ + int ct; + struct rtnexthop *nhp; + struct device *dev = vif_table[c->mfc_parent].dev; + u8 *b = skb->tail; + struct rtattr *mp_head; + + if (dev) + RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); 
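+
+	/* Build one RTA_MULTIPATH attribute: one nexthop per output VIF,
+	 * with rtnh_hops carrying that VIF's TTL threshold. */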
+ + mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0)); + + for (ct = c->mfc_minvif; ct < c->mfc_maxvif; ct++) { + if (c->mfc_ttls[ct] < 255) { + if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) + goto rtattr_failure; + nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + nhp->rtnh_flags = 0; + nhp->rtnh_hops = c->mfc_ttls[ct]; + nhp->rtnh_ifindex = vif_table[ct].dev->ifindex; + nhp->rtnh_len = sizeof(*nhp); + } + } + mp_head->rta_type = RTA_MULTIPATH; + mp_head->rta_len = skb->tail - (u8*)mp_head; + rtm->rtm_type = RTN_MULTICAST; + return 1; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -EMSGSIZE; +} + +int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) +{ + struct mfc_cache *cache; + struct rtable *rt = (struct rtable*)skb->dst; + + start_bh_atomic(); + cache = ipmr_cache_find(rt->rt_src, rt->rt_dst); + if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) { + struct device *dev; + int vif; + int err; + + if (nowait) { + end_bh_atomic(); + return -EAGAIN; + } + + dev = skb->dev; + if (dev == NULL || (vif = ipmr_find_vif(dev)) == ALL_VIFS) { + end_bh_atomic(); + return -ENODEV; + } + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + skb->nh.iph->ihl = sizeof(struct iphdr)>>2; + skb->nh.iph->saddr = rt->rt_src; + skb->nh.iph->daddr = rt->rt_dst; + skb->nh.iph->version = 0; + err = ipmr_cache_unresolved(cache, vif, skb); + end_bh_atomic(); + return err; + } + /* Resolved cache entry is not changed by net bh, + so that we are allowed to enable it. + */ + end_bh_atomic(); + + if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) + cache->mfc_flags |= MFC_NOTIFY; + return ipmr_fill_mroute(skb, cache, rtm); +} +#endif + +/* + * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif + */ + +int ipmr_vif_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + struct vif_device *vif; + int len=0; + off_t pos=0; + off_t begin=0; + int size; + int ct; + + len += sprintf(buffer, + "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); + pos=len; + + for (ct=0;ct<maxvif;ct++) + { + char *name = "none"; + vif=&vif_table[ct]; + if(!(vifc_map&(1<<ct))) + continue; + if (vif->dev) + name = vif->dev->name; + size = sprintf(buffer+len, "%2d %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", + ct, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, + vif->flags, vif->local, vif->remote); + len+=size; + pos+=size; + if(pos<offset) + { + len=0; + begin=pos; + } + if(pos>offset+length) + break; + } + + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + return len; +} + +int ipmr_mfc_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + struct mfc_cache *mfc; + int len=0; + off_t pos=0; + off_t begin=0; + int size; + int ct; + + len += sprintf(buffer, + "Group Origin Iif Pkts Bytes Wrong Oifs\n"); + pos=len; + + for (ct=0;ct<MFC_LINES;ct++) + { + start_bh_atomic(); + mfc=mfc_cache_array[ct]; + while(mfc!=NULL) + { + int n; + + /* + * Interface forwarding map + */ + size = sprintf(buffer+len, "%08lX %08lX %-3d %8ld %8ld %8ld", + (unsigned long)mfc->mfc_mcastgrp, + (unsigned long)mfc->mfc_origin, + mfc->mfc_parent == ALL_VIFS ? -1 : mfc->mfc_parent, + (mfc->mfc_flags & MFC_QUEUED) ? 
mfc->mfc_unresolved.qlen : mfc->mfc_pkt, + mfc->mfc_bytes, + mfc->mfc_wrong_if); + for(n=mfc->mfc_minvif;n<mfc->mfc_maxvif;n++) + { + if(vifc_map&(1<<n) && mfc->mfc_ttls[n] < 255) + size += sprintf(buffer+len+size, " %2d:%-3d", n, mfc->mfc_ttls[n]); + } + size += sprintf(buffer+len+size, "\n"); + len+=size; + pos+=size; + if(pos<offset) + { + len=0; + begin=pos; + } + if(pos>offset+length) + { + end_bh_atomic(); + goto done; + } + mfc=mfc->next; + } + end_bh_atomic(); + } +done: + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + if (len < 0) { + len = 0; + } + return len; +} + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_ipmr_vif = { + PROC_NET_IPMR_VIF, 9 ,"ip_mr_vif", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + ipmr_vif_info +}; +static struct proc_dir_entry proc_net_ipmr_mfc = { + PROC_NET_IPMR_MFC, 11 ,"ip_mr_cache", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + ipmr_mfc_info +}; +#endif + +#ifdef CONFIG_IP_PIMSM_V2 +struct inet_protocol pim_protocol = +{ + pim_rcv, /* PIM handler */ + NULL, /* PIM error control */ + NULL, /* next */ + IPPROTO_PIM, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "PIM" /* name */ +}; +#endif + + +/* + * Setup for IP multicast routing + */ + +__initfunc(void ip_mr_init(void)) +{ + printk(KERN_INFO "Linux IP multicast router 0.06 plus PIM-SM\n"); + register_netdevice_notifier(&ip_mr_notifier); +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_ipmr_vif); + proc_net_register(&proc_net_ipmr_mfc); +#endif +} diff --git a/pfinet/linux-src/net/ipv4/proc.c b/pfinet/linux-src/net/ipv4/proc.c new file mode 100644 index 00000000..1640a056 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/proc.c @@ -0,0 +1,387 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * This file implements the various access functions for the + * PROC file system. It is mainly used for debugging and + * statistics. + * + * Version: $Id: proc.c,v 1.34 1999/02/08 11:20:34 davem Exp $ + * + * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de> + * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de> + * Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de> + * + * Fixes: + * Alan Cox : UDP sockets show the rxqueue/txqueue + * using hint flag for the netinfo. + * Pauline Middelink : identd support + * Alan Cox : Make /proc safer. + * Erik Schoenfelder : /proc/net/snmp + * Alan Cox : Handle dead sockets properly. + * Gerhard Koerting : Show both timers + * Alan Cox : Allow inode to be NULL (kernel socket) + * Andi Kleen : Add support for open_requests and + * split functions for more readibility. + * Andi Kleen : Add support for /proc/net/netstat + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <asm/system.h> +#include <linux/sched.h> +#include <linux/socket.h> +#include <linux/net.h> +#include <linux/un.h> +#include <linux/in.h> +#include <linux/param.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/raw.h> + +/* Format a single open_request into tmpbuf. */ +static inline void get__openreq(struct sock *sk, struct open_request *req, + char *tmpbuf, + int i) +{ + sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u", + i, + (long unsigned int)req->af.v4_req.loc_addr, + ntohs(sk->sport), + (long unsigned int)req->af.v4_req.rmt_addr, + ntohs(req->rmt_port), + TCP_SYN_RECV, + 0,0, /* could print option size, but that is af dependent. */ + 1, /* timers active (only the expire timer) */ + (unsigned long)(req->expires - jiffies), + req->retrans, + sk->socket ? sk->socket->inode->i_uid : 0, + 0, /* non standard timer */ + 0 /* open_requests have no inode */ + ); +} + +/* Format a single socket into tmpbuf. */ +static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format) +{ + unsigned long dest, src; + unsigned short destp, srcp; + int timer_active, timer_active1, timer_active2; + int tw_bucket = 0; + unsigned long timer_expires; + struct tcp_opt *tp = &sp->tp_pinfo.af_tcp; + + dest = sp->daddr; + src = sp->rcv_saddr; + destp = sp->dport; + srcp = sp->sport; + + /* FIXME: The fact that retransmit_timer occurs as a field + * in two different parts of the socket structure is, + * to say the least, confusing. This code now uses the + * right retransmit_timer variable, but I'm not sure + * the rest of the timer stuff is still correct. + * In particular I'm not sure what the timeout value + * is suppose to reflect (as opposed to tm->when). -- erics + */ + + destp = ntohs(destp); + srcp = ntohs(srcp); + if((format == 0) && (sp->state == TCP_TIME_WAIT)) { + extern int tcp_tw_death_row_slot; + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sp; + int slot_dist; + + tw_bucket = 1; + timer_active1 = timer_active2 = 0; + timer_active = 3; + slot_dist = tw->death_slot; + if(slot_dist > tcp_tw_death_row_slot) + slot_dist = (TCP_TWKILL_SLOTS - slot_dist) + tcp_tw_death_row_slot; + else + slot_dist = tcp_tw_death_row_slot - slot_dist; + timer_expires = jiffies + (slot_dist * TCP_TWKILL_PERIOD); + } else { + timer_active1 = del_timer(&tp->retransmit_timer); + timer_active2 = del_timer(&sp->timer); + if (!timer_active1) tp->retransmit_timer.expires=0; + if (!timer_active2) sp->timer.expires=0; + timer_active = 0; + timer_expires = (unsigned) -1; + } + if (timer_active1 && tp->retransmit_timer.expires < timer_expires) { + timer_active = 1; + timer_expires = tp->retransmit_timer.expires; + } + if (timer_active2 && sp->timer.expires < timer_expires) { + timer_active = 2; + timer_expires = sp->timer.expires; + } + if(timer_active == 0) + timer_expires = jiffies; + sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld", + i, src, srcp, dest, destp, sp->state, + (tw_bucket ? + 0 : + (format == 0) ? + tp->write_seq-tp->snd_una : atomic_read(&sp->wmem_alloc)), + (tw_bucket ? + 0 : + (format == 0) ? + tp->rcv_nxt-tp->copied_seq: atomic_read(&sp->rmem_alloc)), + timer_active, timer_expires-jiffies, + (tw_bucket ? 0 : tp->retransmits), + (!tw_bucket && sp->socket) ? 
sp->socket->inode->i_uid : 0, + (!tw_bucket && timer_active) ? sp->timeout : 0, + (!tw_bucket && sp->socket) ? sp->socket->inode->i_ino : 0); + + if (timer_active1) add_timer(&tp->retransmit_timer); + if (timer_active2) add_timer(&sp->timer); +} + +/* + * Get__netinfo returns the length of that string. + * + * KNOWN BUGS + * As in get_unix_netinfo, the buffer might be too small. If this + * happens, get__netinfo returns only part of the available infos. + * + * Assumes that buffer length is a multiply of 128 - if not it will + * write past the end. + */ +static int +get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t offset, int length) +{ + struct sock *sp, *next; + int len=0, i = 0; + off_t pos=0; + off_t begin; + char tmpbuf[129]; + + if (offset < 128) + len += sprintf(buffer, "%-127s\n", + " sl local_address rem_address st tx_queue " + "rx_queue tr tm->when retrnsmt uid timeout inode"); + pos = 128; + SOCKHASH_LOCK(); + sp = pro->sklist_next; + while(sp != (struct sock *)pro) { + if (format == 0 && sp->state == TCP_LISTEN) { + struct open_request *req; + + for (req = sp->tp_pinfo.af_tcp.syn_wait_queue; req; + i++, req = req->dl_next) { + if (req->sk) + continue; + pos += 128; + if (pos < offset) + continue; + get__openreq(sp, req, tmpbuf, i); + len += sprintf(buffer+len, "%-127s\n", tmpbuf); + if(len >= length) + goto out; + } + } + + pos += 128; + if (pos < offset) + goto next; + + get__sock(sp, tmpbuf, i, format); + + len += sprintf(buffer+len, "%-127s\n", tmpbuf); + if(len >= length) + break; + next: + next = sp->sklist_next; + sp = next; + i++; + } +out: + SOCKHASH_UNLOCK(); + + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; + if(len>length) + len = length; + if (len<0) + len = 0; + return len; +} + +int tcp_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + return get__netinfo(&tcp_prot, buffer,0, start, offset, length); +} + +int udp_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + return get__netinfo(&udp_prot, buffer,1, start, offset, length); +} + +int raw_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + return get__netinfo(&raw_prot, buffer,1, start, offset, length); +} + +/* + * Report socket allocation statistics [mea@utu.fi] + */ +int afinet_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + /* From net/socket.c */ + extern int socket_get_info(char *, char **, off_t, int); + + int len = socket_get_info(buffer,start,offset,length); + + len += sprintf(buffer+len,"TCP: inuse %d highest %d\n", + tcp_prot.inuse, tcp_prot.highestinuse); + len += sprintf(buffer+len,"UDP: inuse %d highest %d\n", + udp_prot.inuse, udp_prot.highestinuse); + len += sprintf(buffer+len,"RAW: inuse %d highest %d\n", + raw_prot.inuse, raw_prot.highestinuse); + if (offset >= len) + { + *start = buffer; + return 0; + } + *start = buffer + offset; + len -= offset; + if (len > length) + len = length; + if (len < 0) + len = 0; + return len; +} + + +/* + * Called from the PROCfs module. This outputs /proc/net/snmp. 
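+ *
+ * Each protocol contributes a pair of lines - a header line naming the
+ * counters and a value line built with sprintf() - and the offset/length
+ * arithmetic at the end implements the usual /proc read window.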
+ */ + +int snmp_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + extern struct tcp_mib tcp_statistics; + extern struct udp_mib udp_statistics; + int len; +/* + extern unsigned long tcp_rx_miss, tcp_rx_hit1,tcp_rx_hit2; +*/ + + len = sprintf (buffer, + "Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates\n" + "Ip: %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + ip_statistics.IpForwarding, ip_statistics.IpDefaultTTL, + ip_statistics.IpInReceives, ip_statistics.IpInHdrErrors, + ip_statistics.IpInAddrErrors, ip_statistics.IpForwDatagrams, + ip_statistics.IpInUnknownProtos, ip_statistics.IpInDiscards, + ip_statistics.IpInDelivers, ip_statistics.IpOutRequests, + ip_statistics.IpOutDiscards, ip_statistics.IpOutNoRoutes, + ip_statistics.IpReasmTimeout, ip_statistics.IpReasmReqds, + ip_statistics.IpReasmOKs, ip_statistics.IpReasmFails, + ip_statistics.IpFragOKs, ip_statistics.IpFragFails, + ip_statistics.IpFragCreates); + + len += sprintf (buffer + len, + "Icmp: InMsgs InErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps\n" + "Icmp: %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + icmp_statistics.IcmpInMsgs, icmp_statistics.IcmpInErrors, + icmp_statistics.IcmpInDestUnreachs, icmp_statistics.IcmpInTimeExcds, + icmp_statistics.IcmpInParmProbs, icmp_statistics.IcmpInSrcQuenchs, + icmp_statistics.IcmpInRedirects, icmp_statistics.IcmpInEchos, + icmp_statistics.IcmpInEchoReps, icmp_statistics.IcmpInTimestamps, + icmp_statistics.IcmpInTimestampReps, icmp_statistics.IcmpInAddrMasks, + icmp_statistics.IcmpInAddrMaskReps, icmp_statistics.IcmpOutMsgs, + icmp_statistics.IcmpOutErrors, icmp_statistics.IcmpOutDestUnreachs, + icmp_statistics.IcmpOutTimeExcds, icmp_statistics.IcmpOutParmProbs, + icmp_statistics.IcmpOutSrcQuenchs, icmp_statistics.IcmpOutRedirects, + icmp_statistics.IcmpOutEchos, icmp_statistics.IcmpOutEchoReps, + icmp_statistics.IcmpOutTimestamps, icmp_statistics.IcmpOutTimestampReps, + icmp_statistics.IcmpOutAddrMasks, icmp_statistics.IcmpOutAddrMaskReps); + + len += sprintf (buffer + len, + "Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts\n" + "Tcp: %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + tcp_statistics.TcpRtoAlgorithm, tcp_statistics.TcpRtoMin, + tcp_statistics.TcpRtoMax, tcp_statistics.TcpMaxConn, + tcp_statistics.TcpActiveOpens, tcp_statistics.TcpPassiveOpens, + tcp_statistics.TcpAttemptFails, tcp_statistics.TcpEstabResets, + tcp_statistics.TcpCurrEstab, tcp_statistics.TcpInSegs, + tcp_statistics.TcpOutSegs, tcp_statistics.TcpRetransSegs, + tcp_statistics.TcpInErrs, tcp_statistics.TcpOutRsts); + + len += sprintf (buffer + len, + "Udp: InDatagrams NoPorts InErrors OutDatagrams\nUdp: %lu %lu %lu %lu\n", + udp_statistics.UdpInDatagrams, udp_statistics.UdpNoPorts, + udp_statistics.UdpInErrors, udp_statistics.UdpOutDatagrams); +/* + len += sprintf( buffer + len, + "TCP fast path RX: H2: %ul H1: %ul L: %ul\n", + 
tcp_rx_hit2,tcp_rx_hit1,tcp_rx_miss); +*/ + + if (offset >= len) + { + *start = buffer; + return 0; + } + *start = buffer + offset; + len -= offset; + if (len > length) + len = length; + if (len < 0) + len = 0; + return len; +} + +/* + * Output /proc/net/netstat + */ + +int netstat_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + extern struct linux_mib net_statistics; + int len; + + len = sprintf(buffer, + "TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed" + " EmbryonicRsts PruneCalled RcvPruned OfoPruned" + " OutOfWindowIcmps LockDroppedIcmps\n" + "TcpExt: %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + net_statistics.SyncookiesSent, + net_statistics.SyncookiesRecv, + net_statistics.SyncookiesFailed, + net_statistics.EmbryonicRsts, + net_statistics.PruneCalled, + net_statistics.RcvPruned, + net_statistics.OfoPruned, + net_statistics.OutOfWindowIcmps, + net_statistics.LockDroppedIcmps); + + if (offset >= len) + { + *start = buffer; + return 0; + } + *start = buffer + offset; + len -= offset; + if (len > length) + len = length; + if (len < 0) + len = 0; + return len; +} diff --git a/pfinet/linux-src/net/ipv4/protocol.c b/pfinet/linux-src/net/ipv4/protocol.c new file mode 100644 index 00000000..b47480be --- /dev/null +++ b/pfinet/linux-src/net/ipv4/protocol.c @@ -0,0 +1,211 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * INET protocol dispatch tables. + * + * Version: $Id: protocol.c,v 1.9 1997/10/29 20:27:34 kuznet Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * + * Fixes: + * Alan Cox : Ahah! udp icmp errors don't work because + * udp_err is never called! + * Alan Cox : Added new fields for init and ready for + * proper fragmentation (_NO_ 4K limits!) + * Richard Colella : Hang on hash collision + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/config.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/timer.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/ipip.h> +#include <linux/igmp.h> + +#define IPPROTO_PREVIOUS NULL + +#ifdef CONFIG_IP_MULTICAST + +static struct inet_protocol igmp_protocol = +{ + igmp_rcv, /* IGMP handler */ + NULL, /* IGMP error control */ + IPPROTO_PREVIOUS, /* next */ + IPPROTO_IGMP, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "IGMP" /* name */ +}; + +#undef IPPROTO_PREVIOUS +#define IPPROTO_PREVIOUS &igmp_protocol + +#endif + +static struct inet_protocol tcp_protocol = +{ + tcp_v4_rcv, /* TCP handler */ + tcp_v4_err, /* TCP error control */ + IPPROTO_PREVIOUS, + IPPROTO_TCP, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "TCP" /* name */ +}; + +#undef IPPROTO_PREVIOUS +#define IPPROTO_PREVIOUS &tcp_protocol + +static struct inet_protocol udp_protocol = +{ + udp_rcv, /* UDP handler */ + udp_err, /* UDP error control */ + IPPROTO_PREVIOUS, /* next */ + IPPROTO_UDP, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "UDP" /* name */ +}; + +#undef IPPROTO_PREVIOUS +#define IPPROTO_PREVIOUS &udp_protocol + + +static struct inet_protocol icmp_protocol = +{ + icmp_rcv, /* ICMP handler */ + NULL, /* ICMP error control */ + IPPROTO_PREVIOUS, /* next */ + IPPROTO_ICMP, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "ICMP" /* name */ +}; + +#undef IPPROTO_PREVIOUS +#define IPPROTO_PREVIOUS &icmp_protocol + + +struct inet_protocol *inet_protocol_base = IPPROTO_PREVIOUS; + +struct inet_protocol *inet_protos[MAX_INET_PROTOS] = +{ + NULL +}; + + +/* + * Find a protocol in the protocol tables given its + * IP type. + */ + +struct inet_protocol *inet_get_protocol(unsigned char prot) +{ + unsigned char hash; + struct inet_protocol *p; + + hash = prot & (MAX_INET_PROTOS - 1); + for (p = inet_protos[hash] ; p != NULL; p=p->next) + { + if (p->protocol == prot) + return((struct inet_protocol *) p); + } + return(NULL); +} + +/* + * Add a protocol handler to the hash tables + */ + +void inet_add_protocol(struct inet_protocol *prot) +{ + unsigned char hash; + struct inet_protocol *p2; + + hash = prot->protocol & (MAX_INET_PROTOS - 1); + prot ->next = inet_protos[hash]; + inet_protos[hash] = prot; + prot->copy = 0; + + /* + * Set the copy bit if we need to. + */ + + p2 = (struct inet_protocol *) prot->next; + while(p2 != NULL) + { + if (p2->protocol == prot->protocol) + { + prot->copy = 1; + break; + } + p2 = (struct inet_protocol *) p2->next; + } +} + +/* + * Remove a protocol from the hash tables. + */ + +int inet_del_protocol(struct inet_protocol *prot) +{ + struct inet_protocol *p; + struct inet_protocol *lp = NULL; + unsigned char hash; + + hash = prot->protocol & (MAX_INET_PROTOS - 1); + if (prot == inet_protos[hash]) + { + inet_protos[hash] = (struct inet_protocol *) inet_protos[hash]->next; + return(0); + } + + p = (struct inet_protocol *) inet_protos[hash]; + while(p != NULL) + { + /* + * We have to worry if the protocol being deleted is + * the last one on the list, then we may need to reset + * someone's copied bit. 
+ */ + if (p->next != NULL && p->next == prot) + { + /* + * if we are the last one with this protocol and + * there is a previous one, reset its copy bit. + */ + if (p->copy == 0 && lp != NULL) + lp->copy = 0; + p->next = prot->next; + return(0); + } + if (p->next != NULL && p->next->protocol == prot->protocol) + lp = p; + + p = (struct inet_protocol *) p->next; + } + return(-1); +} diff --git a/pfinet/linux-src/net/ipv4/rarp.c b/pfinet/linux-src/net/ipv4/rarp.c new file mode 100644 index 00000000..7f7c7e3f --- /dev/null +++ b/pfinet/linux-src/net/ipv4/rarp.c @@ -0,0 +1,606 @@ +/* linux/net/inet/rarp.c + * + * Copyright (C) 1994 by Ross Martin + * Based on linux/net/inet/arp.c, Copyright (C) 1994 by Florian La Roche + * + * $Id: rarp.c,v 1.25 1998/06/19 13:22:34 davem Exp $ + * + * This module implements the Reverse Address Resolution Protocol + * (RARP, RFC 903), which is used to convert low level addresses such + * as Ethernet addresses into high level addresses such as IP addresses. + * The most common use of RARP is as a means for a diskless workstation + * to discover its IP address during a network boot. + * + ** + *** WARNING:::::::::::::::::::::::::::::::::WARNING + **** + ***** SUN machines seem determined to boot solely from the person who + **** answered their RARP query. NEVER add a SUN to your RARP table + *** unless you have all the rest to boot the box from it. + ** + * + * Currently, only Ethernet address -> IP address is likely to work. + * (Is RARP ever used for anything else?) + * + * This code is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes + * Alan Cox : Rarp delete on device down needed as + * reported by Walter Wolfgang. + * Mike McLagan : Routing by source + * + */ + +#include <linux/module.h> + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/in.h> +#include <linux/config.h> +#include <linux/init.h> + +#include <asm/system.h> +#include <asm/uaccess.h> +#include <stdarg.h> +#include <linux/inet.h> +#include <linux/etherdevice.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/arp.h> +#include <net/rarp.h> +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#include <net/ax25.h> +#endif +#include <linux/proc_fs.h> +#include <linux/stat.h> + +extern int (*rarp_ioctl_hook)(unsigned int,void*); + +/* + * This structure defines the RARP mapping cache. As long as we make + * changes in this structure, we keep interrupts off. + */ + +struct rarp_table +{ + struct rarp_table *next; /* Linked entry list */ + unsigned long ip; /* ip address of entry */ + unsigned char ha[MAX_ADDR_LEN]; /* Hardware address */ + unsigned char hlen; /* Length of hardware address */ + unsigned char htype; /* Type of hardware in use */ + struct device *dev; /* Device the entry is tied to */ +}; + +struct rarp_table *rarp_tables = NULL; + +static int rarp_rcv(struct sk_buff *, struct device *, struct packet_type *); + +static struct packet_type rarp_packet_type = +{ + 0, /* Should be: __constant_htons(ETH_P_RARP) - but this _doesn't_ come out constant! 
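+	   (The real value is filled in at run time: rarp_init_pkt() sets
+	    rarp_packet_type.type = htons(ETH_P_RARP) before calling
+	    dev_add_pack().)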
*/ + 0, /* copy */ + rarp_rcv, + NULL, + NULL +}; + +static int initflag = 1; + + +/* + * Release the memory for this entry. + */ + +static inline void rarp_release_entry(struct rarp_table *entry) +{ + kfree_s(entry, sizeof(struct rarp_table)); + MOD_DEC_USE_COUNT; + return; +} + +/* + * Delete a RARP mapping entry in the cache. + */ + +static void rarp_destroy(unsigned long ip_addr) +{ + struct rarp_table *entry; + struct rarp_table **pentry; + + start_bh_atomic(); + pentry = &rarp_tables; + while ((entry = *pentry) != NULL) + { + if (entry->ip == ip_addr) + { + *pentry = entry->next; + end_bh_atomic(); + rarp_release_entry(entry); + return; + } + pentry = &entry->next; + } + end_bh_atomic(); +} + +/* + * Flush a device. + */ + +static void rarp_destroy_dev(struct device *dev) +{ + struct rarp_table *entry; + struct rarp_table **pentry; + + start_bh_atomic(); + pentry = &rarp_tables; + while ((entry = *pentry) != NULL) + { + if (entry->dev == dev) + { + *pentry = entry->next; + rarp_release_entry(entry); + } + else + pentry = &entry->next; + } + end_bh_atomic(); +} + +static int rarp_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + if(event!=NETDEV_DOWN) + return NOTIFY_DONE; + rarp_destroy_dev((struct device *)ptr); + return NOTIFY_DONE; +} + +/* + * Called once when data first added to rarp cache with ioctl. + */ + +static struct notifier_block rarp_dev_notifier={ + rarp_device_event, + NULL, + 0 +}; + +static int rarp_pkt_inited=0; + +static void rarp_init_pkt (void) +{ + /* Register the packet type */ + rarp_packet_type.type=htons(ETH_P_RARP); + dev_add_pack(&rarp_packet_type); + register_netdevice_notifier(&rarp_dev_notifier); + rarp_pkt_inited=1; +} + +#ifdef MODULE + +static void rarp_end_pkt(void) +{ + if(!rarp_pkt_inited) + return; + dev_remove_pack(&rarp_packet_type); + unregister_netdevice_notifier(&rarp_dev_notifier); + rarp_pkt_inited=0; +} + +#endif + +/* + * Receive an arp request by the device layer. Maybe it should be + * rewritten to use the incoming packet for the reply. The current + * "overhead" time isn't that high... + */ + +static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) +{ +/* + * We shouldn't use this type conversion. Check later. + */ + struct arphdr *rarp = (struct arphdr *) skb->data; + unsigned char *rarp_ptr = skb_pull(skb,sizeof(struct arphdr)); + struct rarp_table *entry; + struct in_device *in_dev = dev->ip_ptr; + long sip,tip; + unsigned char *sha,*tha; /* s for "source", t for "target" */ + +/* + * If this test doesn't pass, it's not IP, or we should ignore it anyway + */ + + if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd) + || dev->flags&IFF_NOARP || !in_dev || !in_dev->ifa_list) + { + kfree_skb(skb); + return 0; + } + +/* + * If it's not a RARP request, delete it. + */ + if (rarp->ar_op != htons(ARPOP_RREQUEST)) + { + kfree_skb(skb); + return 0; + } + +/* + * For now we will only deal with IP addresses. + */ + + if ( +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + (rarp->ar_pro != htons(AX25_P_IP) && dev->type == ARPHRD_AX25) || +#endif + (rarp->ar_pro != htons(ETH_P_IP) && dev->type != ARPHRD_AX25) + || rarp->ar_pln != 4) + { + /* + * This packet is not for us. Remove it. + */ + kfree_skb(skb); + return 0; + } + +/* + * Extract variable width fields + */ + + sha=rarp_ptr; + rarp_ptr+=dev->addr_len; + memcpy(&sip,rarp_ptr,4); + rarp_ptr+=4; + tha=rarp_ptr; + rarp_ptr+=dev->addr_len; + memcpy(&tip,rarp_ptr,4); + +/* + * Process entry. 
Use tha for table lookup according to RFC903. + */ + + for (entry = rarp_tables; entry != NULL; entry = entry->next) + if (!memcmp(entry->ha, tha, rarp->ar_hln)) + break; + + if (entry != NULL) + { + sip=entry->ip; + + arp_send(ARPOP_RREPLY, ETH_P_RARP, sip, dev, in_dev->ifa_list->ifa_address, sha, + dev->dev_addr, sha); + } + + kfree_skb(skb); + return 0; +} + + +/* + * Set (create) a RARP cache entry. + */ + +static int rarp_req_set(struct arpreq *req) +{ + struct arpreq r; + struct rarp_table *entry; + struct sockaddr_in *si; + int htype, hlen; + unsigned long ip; + struct rtable *rt; + struct device * dev; + int err; + + err = copy_from_user(&r, req, sizeof(r)); + if (err) + return -EFAULT; + + /* + * We only understand about IP addresses... + */ + + if (r.arp_pa.sa_family != AF_INET) + return -EPFNOSUPPORT; + + switch (r.arp_ha.sa_family) + { + case ARPHRD_ETHER: + htype = ARPHRD_ETHER; + hlen = ETH_ALEN; + break; +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + case ARPHRD_AX25: + htype = ARPHRD_AX25; + hlen = 7; + break; +#endif + default: + return -EPFNOSUPPORT; + } + + si = (struct sockaddr_in *) &r.arp_pa; + ip = si->sin_addr.s_addr; + if (ip == 0) + { + printk(KERN_DEBUG "RARP: SETRARP: requested PA is 0.0.0.0 !\n"); + return -EINVAL; + } + +/* + * Is it reachable directly ? + */ + + err = ip_route_output(&rt, ip, 0, 1, 0); + if (err) + return err; + if (rt->rt_flags&(RTCF_LOCAL|RTCF_BROADCAST|RTCF_MULTICAST|RTCF_DNAT)) { + ip_rt_put(rt); + return -EINVAL; + } + dev = rt->u.dst.dev; + +/* + * Is there an existing entry for this address? Find out... + */ + + for (entry = rarp_tables; entry != NULL; entry = entry->next) + if (entry->ip == ip) + break; + +/* + * If no entry was found, create a new one. + */ + + if (entry == NULL) + { + entry = (struct rarp_table *) kmalloc(sizeof(struct rarp_table), + GFP_ATOMIC); + if (entry == NULL) + { + return -ENOMEM; + } + if (initflag) + { + rarp_init_pkt(); + initflag=0; + } + + /* Block interrupts until table modification is finished */ + + cli(); + entry->next = rarp_tables; + rarp_tables = entry; + } + cli(); + entry->ip = ip; + entry->hlen = hlen; + entry->htype = htype; + memcpy(&entry->ha, &r.arp_ha.sa_data, hlen); + entry->dev = dev; + sti(); + + /* Don't unlink if we have entries to serve. */ + MOD_INC_USE_COUNT; + + return 0; +} + + +/* + * Get a RARP cache entry. + */ + +static int rarp_req_get(struct arpreq *req) +{ + struct arpreq r; + struct rarp_table *entry; + struct sockaddr_in *si; + unsigned long ip; + int err; + +/* + * We only understand about IP addresses... + */ + + err = copy_from_user(&r, req, sizeof(r)); + if (err) + return -EFAULT; + + if (r.arp_pa.sa_family != AF_INET) + return -EPFNOSUPPORT; + +/* + * Is there an existing entry for this address? + */ + + si = (struct sockaddr_in *) &r.arp_pa; + ip = si->sin_addr.s_addr; + + for (entry = rarp_tables; entry != NULL; entry = entry->next) + if (entry->ip == ip) + break; + + if (entry == NULL) + { + return -ENXIO; + } + +/* + * We found it; copy into structure. + */ + + memcpy(r.arp_ha.sa_data, &entry->ha, entry->hlen); + r.arp_ha.sa_family = entry->htype; + +/* + * Copy the information back + */ + + return copy_to_user(req, &r, sizeof(r)) ? -EFAULT : 0; +} + + +/* + * Handle a RARP layer I/O control request. 
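+ *
+ * SIOCSRARP and SIOCDRARP require superuser rights, SIOCGRARP does not.
+ * A rough caller-side sketch (not part of this file):
+ *
+ *	struct arpreq r;
+ *	struct sockaddr_in *si = (struct sockaddr_in *) &r.arp_pa;
+ *	memset(&r, 0, sizeof(r));
+ *	si->sin_family = AF_INET;
+ *	si->sin_addr.s_addr = inet_addr("10.0.0.2");
+ *	ioctl(fd, SIOCGRARP, &r);
+ *
+ * on success r.arp_ha then holds the cached hardware address.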
+ */ + +int rarp_ioctl(unsigned int cmd, void *arg) +{ + struct arpreq r; + struct sockaddr_in *si; + int err; + + switch(cmd) + { + case SIOCDRARP: + if (!suser()) + return -EPERM; + err = copy_from_user(&r, arg, sizeof(r)); + if (err) + return -EFAULT; + if (r.arp_pa.sa_family != AF_INET) + return -EPFNOSUPPORT; + si = (struct sockaddr_in *) &r.arp_pa; + rarp_destroy(si->sin_addr.s_addr); + return 0; + + case SIOCGRARP: + + return rarp_req_get((struct arpreq *)arg); + case SIOCSRARP: + if (!suser()) + return -EPERM; + return rarp_req_set((struct arpreq *)arg); + default: + return -EINVAL; + } + + /*NOTREACHED*/ + return 0; +} + +#ifdef CONFIG_PROC_FS +int rarp_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + int len=0; + off_t begin=0; + off_t pos=0; + int size; + struct rarp_table *entry; + char ipbuffer[20]; + unsigned long netip; + if (initflag) + { + size = sprintf(buffer,"RARP disabled until entries added to cache.\n"); + pos+=size; + len+=size; + } + else + { + size = sprintf(buffer, + "IP address HW type HW address\n"); + pos+=size; + len+=size; + + for(entry=rarp_tables; entry!=NULL; entry=entry->next) + { + netip=htonl(entry->ip); /* switch to network order */ + sprintf(ipbuffer,"%d.%d.%d.%d", + (unsigned int)(netip>>24)&255, + (unsigned int)(netip>>16)&255, + (unsigned int)(netip>>8)&255, + (unsigned int)(netip)&255); + + size = sprintf(buffer+len, + "%-17s%-20s%02x:%02x:%02x:%02x:%02x:%02x\n", + ipbuffer, + "10Mbps Ethernet", + (unsigned int)entry->ha[0], + (unsigned int)entry->ha[1], + (unsigned int)entry->ha[2], + (unsigned int)entry->ha[3], + (unsigned int)entry->ha[4], + (unsigned int)entry->ha[5]); + + len+=size; + pos=begin+len; + + if(pos<offset) + { + len=0; + begin=pos; + } + if(pos>offset+length) + break; + } + } + + *start = buffer+(offset-begin); /* Start of wanted data */ + len -= (offset-begin); /* Start slop */ + if (len>length) + len = length; /* Ending slop */ + return len; +} + +struct proc_dir_entry proc_net_rarp = { + PROC_NET_RARP, 4, "rarp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + rarp_get_info +}; +#endif + +__initfunc(void +rarp_init(void)) +{ +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_rarp); +#endif + rarp_ioctl_hook = rarp_ioctl; +} + +#ifdef MODULE + +int init_module(void) +{ + rarp_init(); + return 0; +} + +void cleanup_module(void) +{ + struct rarp_table *rt, *rt_next; +#ifdef CONFIG_PROC_FS + proc_net_unregister(PROC_NET_RARP); +#endif + rarp_ioctl_hook = NULL; + cli(); + /* Destroy the RARP-table */ + rt = rarp_tables; + rarp_tables = NULL; + sti(); + /* ... and free it. */ + for ( ; rt != NULL; rt = rt_next) { + rt_next = rt->next; + rarp_release_entry(rt); + } + rarp_end_pkt(); +} +#endif diff --git a/pfinet/linux-src/net/ipv4/raw.c b/pfinet/linux-src/net/ipv4/raw.c new file mode 100644 index 00000000..5e7910dd --- /dev/null +++ b/pfinet/linux-src/net/ipv4/raw.c @@ -0,0 +1,573 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * RAW - implementation of IP "raw" sockets. + * + * Version: $Id: raw.c,v 1.39.2.1 1999/06/20 20:14:50 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * + * Fixes: + * Alan Cox : verify_area() fixed up + * Alan Cox : ICMP error handling + * Alan Cox : EMSGSIZE if you send too big a packet + * Alan Cox : Now uses generic datagrams and shared skbuff + * library. No more peek crashes, no more backlogs + * Alan Cox : Checks sk->broadcast. + * Alan Cox : Uses skb_free_datagram/skb_copy_datagram + * Alan Cox : Raw passes ip options too + * Alan Cox : Setsocketopt added + * Alan Cox : Fixed error return for broadcasts + * Alan Cox : Removed wake_up calls + * Alan Cox : Use ttl/tos + * Alan Cox : Cleaned up old debugging + * Alan Cox : Use new kernel side addresses + * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets. + * Alan Cox : BSD style RAW socket demultiplexing. + * Alan Cox : Beginnings of mrouted support. + * Alan Cox : Added IP_HDRINCL option. + * Alan Cox : Skip broadcast check if BSDism set. + * David S. Miller : New socket lookup architecture. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/fcntl.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/mroute.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/raw.h> +#include <net/checksum.h> + +#ifdef CONFIG_IP_MROUTE +struct sock *mroute_socket=NULL; +#endif + +struct sock *raw_v4_htable[RAWV4_HTABLE_SIZE]; + +static void raw_v4_hash(struct sock *sk) +{ + struct sock **skp = &raw_v4_htable[sk->num & (RAWV4_HTABLE_SIZE - 1)]; + + SOCKHASH_LOCK(); + if ((sk->next = *skp) != NULL) + (*skp)->pprev = &sk->next; + *skp = sk; + sk->pprev = skp; + SOCKHASH_UNLOCK(); +} + +static void raw_v4_unhash(struct sock *sk) +{ + SOCKHASH_LOCK(); + if (sk->pprev) { + if (sk->next) + sk->next->pprev = sk->pprev; + *sk->pprev = sk->next; + sk->pprev = NULL; + } + SOCKHASH_UNLOCK(); +} + +/* Grumble... icmp and ip_input want to get at this... */ +struct sock *raw_v4_lookup(struct sock *sk, unsigned short num, + unsigned long raddr, unsigned long laddr, int dif) +{ + struct sock *s = sk; + + SOCKHASH_LOCK(); + for(s = sk; s; s = s->next) { + if((s->num == num) && + !(s->dead && (s->state == TCP_CLOSE)) && + !(s->daddr && s->daddr != raddr) && + !(s->rcv_saddr && s->rcv_saddr != laddr) && + !(s->bound_dev_if && s->bound_dev_if != dif)) + break; /* gotcha */ + } + SOCKHASH_UNLOCK(); + return s; +} + +void raw_err (struct sock *sk, struct sk_buff *skb) +{ + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + u32 info = 0; + int err = 0; + int harderr = 0; + + /* Report error on raw socket, if: + 1. User requested ip_recverr. + 2. Socket is connected (otherwise the error indication + is useless without ip_recverr and error is hard. 
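Raw sockets are demultiplexed purely by IP protocol number: raw_v4_hash() picks a bucket with sk->num & (RAWV4_HTABLE_SIZE - 1), and raw_v4_lookup() walks that one chain, treating each zero address or interface field as a wildcard (it also skips dead, closed sockets). A hedged sketch of the match rule, using an invented raw_sock_key type:

#include <stdint.h>

/* Illustrative stand-in for the handful of sk fields the lookup consults. */
struct raw_sock_key {
    unsigned short num;      /* protocol number the socket was created with */
    uint32_t daddr;          /* connected peer address, 0 = any */
    uint32_t rcv_saddr;      /* bound local address, 0 = any */
    int bound_dev_if;        /* bound interface index, 0 = any */
};

/*
 * Mirror of the predicate in raw_v4_lookup(): the protocol number must
 * match exactly, while the remaining fields only constrain the match
 * when the socket actually set them (non-zero).
 */
static int raw_sock_matches(const struct raw_sock_key *s,
                            unsigned short num, uint32_t raddr,
                            uint32_t laddr, int dif)
{
    return s->num == num &&
           (!s->daddr        || s->daddr == raddr) &&
           (!s->rcv_saddr    || s->rcv_saddr == laddr) &&
           (!s->bound_dev_if || s->bound_dev_if == dif);
}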
+ */ + if (!sk->ip_recverr && sk->state != TCP_ESTABLISHED) + return; + + switch (type) { + default: + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + case ICMP_SOURCE_QUENCH: + return; + case ICMP_PARAMETERPROB: + err = EPROTO; + info = ntohl(skb->h.icmph->un.gateway)>>24; + harderr = 1; + break; + case ICMP_DEST_UNREACH: + err = EHOSTUNREACH; + if (code > NR_ICMP_UNREACH) + break; + err = icmp_err_convert[code].errno; + harderr = icmp_err_convert[code].fatal; + if (code == ICMP_FRAG_NEEDED) { + harderr = (sk->ip_pmtudisc != IP_PMTUDISC_DONT); + err = EMSGSIZE; + info = ntohs(skb->h.icmph->un.frag.mtu); + } + } + + if (sk->ip_recverr) + ip_icmp_error(sk, skb, err, 0, info, (u8 *)(skb->h.icmph + 1)); + + if (sk->ip_recverr || harderr) { + sk->err = err; + sk->error_report(sk); + } +} + +static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) +{ + /* Charge it to the socket. */ + + if (sock_queue_rcv_skb(sk,skb)<0) + { + ip_statistics.IpInDiscards++; + kfree_skb(skb); + return -1; + } + + ip_statistics.IpInDelivers++; + return 0; +} + +/* + * This should be the easiest of all, all we do is + * copy it into a buffer. All demultiplexing is done + * in ip.c + */ + +int raw_rcv(struct sock *sk, struct sk_buff *skb) +{ + /* Now we need to copy this into memory. */ + skb_trim(skb, ntohs(skb->nh.iph->tot_len)); + + skb->h.raw = skb->nh.raw; + + raw_rcv_skb(sk, skb); + return 0; +} + +struct rawfakehdr +{ + struct iovec *iov; + u32 saddr; +}; + +/* + * Send a RAW IP packet. + */ + +/* + * Callback support is trivial for SOCK_RAW + */ + +static int raw_getfrag(const void *p, char *to, unsigned int offset, unsigned int fraglen) +{ + struct rawfakehdr *rfh = (struct rawfakehdr *) p; + return memcpy_fromiovecend(to, rfh->iov, offset, fraglen); +} + +/* + * IPPROTO_RAW needs extra work. + */ + +static int raw_getrawfrag(const void *p, char *to, unsigned int offset, unsigned int fraglen) +{ + struct rawfakehdr *rfh = (struct rawfakehdr *) p; + + if (memcpy_fromiovecend(to, rfh->iov, offset, fraglen)) + return -EFAULT; + + if (offset==0) { + struct iphdr *iph = (struct iphdr *)to; + if (!iph->saddr) + iph->saddr = rfh->saddr; + iph->check=0; + iph->tot_len=htons(fraglen); /* This is right as you can't frag + RAW packets */ + /* + * Deliberate breach of modularity to keep + * ip_build_xmit clean (well less messy). + */ + if (!iph->id) + iph->id = htons(ip_id_count++); + iph->check=ip_fast_csum((unsigned char *)iph, iph->ihl); + } + return 0; +} + +static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len) +{ + struct ipcm_cookie ipc; + struct rawfakehdr rfh; + struct rtable *rt = NULL; + int free = 0; + u32 daddr; + u8 tos; + int err; + + /* This check is ONLY to check for arithmetic overflow + on integer(!) len. Not more! Real check will be made + in ip_build_xmit --ANK + + BTW socket.c -> af_*.c -> ... make multiple + invalid conversions size_t -> int. We MUST repair it f.e. + by replacing all of them with size_t and revise all + the places sort of len += sizeof(struct iphdr) + If len was ULONG_MAX-10 it would be cathastrophe --ANK + */ + + if (len < 0 || len > 0xFFFF) + return -EMSGSIZE; + + /* + * Check the flags. + */ + + if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ + return -EOPNOTSUPP; + + if (msg->msg_flags & ~(MSG_DONTROUTE|MSG_DONTWAIT)) + return(-EINVAL); + + /* + * Get and verify the address. 
+ */ + + if (msg->msg_namelen) { + struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name; + if (msg->msg_namelen < sizeof(*usin)) + return(-EINVAL); + if (usin->sin_family != AF_INET) { + static int complained; + if (!complained++) + printk(KERN_INFO "%s forgot to set AF_INET in raw sendmsg. Fix it!\n", current->comm); + if (usin->sin_family) + return -EINVAL; + } + daddr = usin->sin_addr.s_addr; + /* ANK: I did not forget to get protocol from port field. + * I just do not know, who uses this weirdness. + * IP_HDRINCL is much more convenient. + */ + } else { + if (sk->state != TCP_ESTABLISHED) + return(-EINVAL); + daddr = sk->daddr; + } + + ipc.addr = sk->saddr; + ipc.opt = NULL; + ipc.oif = sk->bound_dev_if; + + if (msg->msg_controllen) { + int tmp = ip_cmsg_send(msg, &ipc); + if (tmp) + return tmp; + if (ipc.opt) + free=1; + } + + rfh.saddr = ipc.addr; + ipc.addr = daddr; + + if (!ipc.opt) + ipc.opt = sk->opt; + + if (ipc.opt) { + err = -EINVAL; + /* Linux does not mangle headers on raw sockets, + * so that IP options + IP_HDRINCL is non-sense. + */ + if (sk->ip_hdrincl) + goto done; + if (ipc.opt->srr) { + if (!daddr) + goto done; + daddr = ipc.opt->faddr; + } + } + tos = RT_TOS(sk->ip_tos) | sk->localroute; + if (msg->msg_flags&MSG_DONTROUTE) + tos |= RTO_ONLINK; + + if (MULTICAST(daddr)) { + if (!ipc.oif) + ipc.oif = sk->ip_mc_index; + if (!rfh.saddr) + rfh.saddr = sk->ip_mc_addr; + } + + err = ip_route_output(&rt, daddr, rfh.saddr, tos, ipc.oif); + + if (err) + goto done; + + err = -EACCES; + if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast) + goto done; + + rfh.iov = msg->msg_iov; + rfh.saddr = rt->rt_src; + if (!ipc.addr) + ipc.addr = rt->rt_dst; + err=ip_build_xmit(sk, sk->ip_hdrincl ? raw_getrawfrag : raw_getfrag, + &rfh, len, &ipc, rt, msg->msg_flags); + +done: + if (free) + kfree(ipc.opt); + ip_rt_put(rt); + + return err<0 ? err : len; +} + +static void raw_close(struct sock *sk, long timeout) +{ + /* Observation: when raw_close is called, processes have + no access to socket anymore. But net still has. + Step one, detach it from networking: + + A. Remove from hash tables. + */ + sk->state = TCP_CLOSE; + raw_v4_unhash(sk); + /* + B. Raw sockets may have direct kernel refereneces. Kill them. + */ + ip_ra_control(sk, 0, NULL); + + /* In this point socket cannot receive new packets anymore */ + + + /* But we still have packets pending on receive + queue and probably, our own packets waiting in device queues. + sock_destroy will drain receive queue, but transmitted + packets will delay socket destruction. + Set sk->dead=1 in order to prevent wakeups, when these + packet will be freed. + */ + sk->dead=1; + destroy_sock(sk); + + /* That's all. No races here. */ +} + +/* This gets rid of all the nasties in af_inet. -DaveM */ +static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; + int chk_addr_ret; + + if((sk->state != TCP_CLOSE) || (addr_len < sizeof(struct sockaddr_in))) + return -EINVAL; + chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + if(addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) { +#ifdef CONFIG_IP_TRANSPARENT_PROXY + /* Superuser may bind to any address to allow transparent proxying. 
*/ + if(chk_addr_ret != RTN_UNICAST || !capable(CAP_NET_ADMIN)) +#endif + return -EADDRNOTAVAIL; + } + sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr; + if(chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) + sk->saddr = 0; /* Use device */ + dst_release(xchg(&sk->dst_cache, NULL)); + return 0; +} + +/* + * This should be easy, if there is something there + * we return it, otherwise we block. + */ + +int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len, + int noblock, int flags,int *addr_len) +{ + int copied=0; + struct sk_buff *skb; + int err; + struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name; + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + if (addr_len) + *addr_len=sizeof(*sin); + + if (flags & MSG_ERRQUEUE) + return ip_recv_error(sk, msg, len); + + skb=skb_recv_datagram(sk,flags,noblock,&err); + if(skb==NULL) + return err; + + copied = skb->len; + if (len < copied) + { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto done; + + sk->stamp=skb->stamp; + + /* Copy the address. */ + if (sin) { + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = skb->nh.iph->saddr; + } + if (sk->ip_cmsg_flags) + ip_cmsg_recv(msg, skb); +done: + skb_free_datagram(sk, skb); + return (err ? : copied); +} + +static int raw_init(struct sock *sk) +{ + struct raw_opt *tp = &(sk->tp_pinfo.tp_raw4); + if (sk->num == IPPROTO_ICMP) + memset(&tp->filter, 0, sizeof(tp->filter)); + return 0; +} + +static int raw_seticmpfilter(struct sock *sk, char *optval, int optlen) +{ + if (optlen > sizeof(struct icmp_filter)) + optlen = sizeof(struct icmp_filter); + if (copy_from_user(&sk->tp_pinfo.tp_raw4.filter, optval, optlen)) + return -EFAULT; + return 0; +} + +static int raw_geticmpfilter(struct sock *sk, char *optval, int *optlen) +{ + int len; + + if (get_user(len,optlen)) + return -EFAULT; + if (len > sizeof(struct icmp_filter)) + len = sizeof(struct icmp_filter); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &sk->tp_pinfo.tp_raw4.filter, len)) + return -EFAULT; + return 0; +} + +static int raw_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (level != SOL_RAW) + return ip_setsockopt(sk, level, optname, optval, optlen); + + switch (optname) { + case ICMP_FILTER: + if (sk->num != IPPROTO_ICMP) + return -EOPNOTSUPP; + return raw_seticmpfilter(sk, optval, optlen); + }; + + return -ENOPROTOOPT; +} + +static int raw_getsockopt(struct sock *sk, int level, int optname, + char *optval, int *optlen) +{ + if (level != SOL_RAW) + return ip_getsockopt(sk, level, optname, optval, optlen); + + switch (optname) { + case ICMP_FILTER: + if (sk->num != IPPROTO_ICMP) + return -EOPNOTSUPP; + return raw_geticmpfilter(sk, optval, optlen); + }; + + return -ENOPROTOOPT; +} + +struct proto raw_prot = { + (struct sock *)&raw_prot, /* sklist_next */ + (struct sock *)&raw_prot, /* sklist_prev */ + raw_close, /* close */ + udp_connect, /* connect */ + NULL, /* accept */ + NULL, /* retransmit */ + NULL, /* write_wakeup */ + NULL, /* read_wakeup */ + datagram_poll, /* poll */ +#ifdef CONFIG_IP_MROUTE + ipmr_ioctl, /* ioctl */ +#else + NULL, /* ioctl */ +#endif + raw_init, /* init */ + NULL, /* destroy */ + NULL, /* shutdown */ + raw_setsockopt, /* setsockopt */ + raw_getsockopt, /* getsockopt */ + raw_sendmsg, /* sendmsg */ + raw_recvmsg, /* recvmsg */ + raw_bind, /* bind */ + raw_rcv_skb, /* backlog_rcv */ + raw_v4_hash, /* hash */ + raw_v4_unhash, /* unhash */ + 
NULL, /* get_port */ + 128, /* max_header */ + 0, /* retransmits */ + "RAW", /* name */ + 0, /* inuse */ + 0 /* highestinuse */ +}; diff --git a/pfinet/linux-src/net/ipv4/route.c b/pfinet/linux-src/net/ipv4/route.c new file mode 100644 index 00000000..06eb5fe5 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/route.c @@ -0,0 +1,2048 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * ROUTE - implementation of the IP router. + * + * Version: $Id: route.c,v 1.67.2.3 1999/08/08 08:43:12 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Linus Torvalds, <Linus.Torvalds@helsinki.fi> + * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Fixes: + * Alan Cox : Verify area fixes. + * Alan Cox : cli() protects routing changes + * Rui Oliveira : ICMP routing table updates + * (rco@di.uminho.pt) Routing table insertion and update + * Linus Torvalds : Rewrote bits to be sensible + * Alan Cox : Added BSD route gw semantics + * Alan Cox : Super /proc >4K + * Alan Cox : MTU in route table + * Alan Cox : MSS actually. Also added the window + * clamper. + * Sam Lantinga : Fixed route matching in rt_del() + * Alan Cox : Routing cache support. + * Alan Cox : Removed compatibility cruft. + * Alan Cox : RTF_REJECT support. + * Alan Cox : TCP irtt support. + * Jonathan Naylor : Added Metric support. + * Miquel van Smoorenburg : BSD API fixes. + * Miquel van Smoorenburg : Metrics. + * Alan Cox : Use __u32 properly + * Alan Cox : Aligned routing errors more closely with BSD + * our system is still very different. + * Alan Cox : Faster /proc handling + * Alexey Kuznetsov : Massive rework to support tree based routing, + * routing caches and better behaviour. + * + * Olaf Erb : irtt wasn't being copied right. + * Bjorn Ekwall : Kerneld route support. + * Alan Cox : Multicast fixed (I hope) + * Pavel Krauz : Limited broadcast fixed + * Mike McLagan : Routing by source + * Alexey Kuznetsov : End of old history. Splitted to fib.c and + * route.c and rewritten from scratch. + * Andi Kleen : Load-limit warning messages. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Vitaly E. Lavrov : Race condition in ip_route_input_slow. + * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. + * Vladimir V. Ivanov : IP rule info (flowid) is really useful. + * Marc Boucher : routing by fwmark + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/config.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/pkt_sched.h> +#include <linux/mroute.h> +#include <net/protocol.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/sock.h> +#include <net/ip_fib.h> +#include <net/arp.h> +#include <net/tcp.h> +#include <net/icmp.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +#define IP_MAX_MTU 0xFFF0 + +#define RT_GC_TIMEOUT (300*HZ) + +int ip_rt_min_delay = 2*HZ; +int ip_rt_max_delay = 10*HZ; +int ip_rt_gc_thresh = RT_HASH_DIVISOR; +int ip_rt_max_size = RT_HASH_DIVISOR*16; +int ip_rt_gc_timeout = RT_GC_TIMEOUT; +int ip_rt_gc_interval = 60*HZ; +int ip_rt_gc_min_interval = 5*HZ; +int ip_rt_redirect_number = 9; +int ip_rt_redirect_load = HZ/50; +int ip_rt_redirect_silence = ((HZ/50) << (9+1)); +int ip_rt_error_cost = HZ; +int ip_rt_error_burst = 5*HZ; +int ip_rt_gc_elasticity = 8; +int ip_rt_mtu_expires = 10*60*HZ; + +static unsigned long rt_deadline = 0; + +#define RTprint(a...) printk(KERN_DEBUG a) + +static void rt_run_flush(unsigned long dummy); + +static struct timer_list rt_flush_timer = + { NULL, NULL, 0, 0L, rt_run_flush }; +static struct timer_list rt_periodic_timer = + { NULL, NULL, 0, 0L, NULL }; + +/* + * Interface to generic destination cache. + */ + +static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32); +static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst, + struct sk_buff *); +static struct dst_entry * ipv4_negative_advice(struct dst_entry *); +static void ipv4_link_failure(struct sk_buff *skb); +static int rt_garbage_collect(void); + + +struct dst_ops ipv4_dst_ops = +{ + AF_INET, + __constant_htons(ETH_P_IP), + RT_HASH_DIVISOR, + + rt_garbage_collect, + ipv4_dst_check, + ipv4_dst_reroute, + NULL, + ipv4_negative_advice, + ipv4_link_failure, +}; + +__u8 ip_tos2prio[16] = { + TC_PRIO_BESTEFFORT, + TC_PRIO_FILLER, + TC_PRIO_BESTEFFORT, + TC_PRIO_FILLER, + TC_PRIO_BULK, + TC_PRIO_FILLER, + TC_PRIO_BULK, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE_BULK, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE_BULK, + TC_PRIO_FILLER +}; + + +/* + * Route cache. 
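The cache declared below is a fixed array of RT_HASH_DIVISOR chains, and rt_hash_code(), defined a few lines further down, folds destination, source and TOS into a single byte (hence the final & 0xFF). A standalone restatement of that fold, useful for reasoning about bucket distribution; the helper name is purely illustrative:

#include <stdint.h>

/*
 * Same folding as rt_hash_code(): swap the nibbles of daddr, mix in
 * saddr and tos, then fold the 32-bit value down to 8 bits.
 */
static unsigned rt_hash_sketch(uint32_t daddr, uint32_t saddr, uint8_t tos)
{
    unsigned hash = ((daddr & 0xF0F0F0F0u) >> 4) | ((daddr & 0x0F0F0F0Fu) << 4);

    hash ^= saddr ^ tos;
    hash ^= hash >> 16;
    return (hash ^ (hash >> 8)) & 0xFF;   /* one of 256 buckets */
}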
+ */ + +struct rtable *rt_hash_table[RT_HASH_DIVISOR]; + +static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res); + +static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos) +{ + unsigned hash = ((daddr&0xF0F0F0F0)>>4)|((daddr&0x0F0F0F0F)<<4); + hash = hash^saddr^tos; + hash = hash^(hash>>16); + return (hash^(hash>>8)) & 0xFF; +} + +#ifdef CONFIG_PROC_FS + +static int rt_cache_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + int len=0; + off_t pos=0; + char temp[129]; + struct rtable *r; + int i; + + pos = 128; + + if (offset<128) { + sprintf(buffer,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst"); + len = 128; + } + + + start_bh_atomic(); + + for (i = 0; i<RT_HASH_DIVISOR; i++) { + for (r = rt_hash_table[i]; r; r = r->u.rt_next) { + /* + * Spin through entries until we are ready + */ + pos += 128; + + if (pos <= offset) { + len = 0; + continue; + } + sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X", + r->u.dst.dev ? r->u.dst.dev->name : "*", + (unsigned long)r->rt_dst, + (unsigned long)r->rt_gateway, + r->rt_flags, + atomic_read(&r->u.dst.use), + atomic_read(&r->u.dst.refcnt), + 0, + (unsigned long)r->rt_src, (int)r->u.dst.pmtu, + r->u.dst.window, + (int)r->u.dst.rtt, r->key.tos, + r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, + r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0, + r->rt_spec_dst); + sprintf(buffer+len,"%-127s\n",temp); + len += 128; + if (pos >= offset+length) + goto done; + } + } + +done: + end_bh_atomic(); + + *start = buffer+len-(pos-offset); + len = pos-offset; + if (len>length) + len = length; + return len; +} +#endif + +static __inline__ void rt_free(struct rtable *rt) +{ + dst_free(&rt->u.dst); +} + +static __inline__ void rt_drop(struct rtable *rt) +{ + ip_rt_put(rt); + dst_free(&rt->u.dst); +} + +static __inline__ int rt_fast_clean(struct rtable *rth) +{ + /* Kill broadcast/multicast entries very aggresively, if they + collide in hash table with more useful entries */ + return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST)) + && rth->key.iif && rth->u.rt_next); +} + +static __inline__ int rt_valuable(struct rtable *rth) +{ + return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY)) + || rth->u.dst.expires); +} + +static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2) +{ + int age; + + if (atomic_read(&rth->u.dst.use)) + return 0; + + if (rth->u.dst.expires && (long)(rth->u.dst.expires - jiffies) <= 0) + return 1; + + age = jiffies - rth->u.dst.lastuse; + if (age <= tmo1 && !rt_fast_clean(rth)) + return 0; + if (age <= tmo2 && rt_valuable(rth)) + return 0; + return 1; +} + +static void rt_check_expire(unsigned long dummy) +{ + int i; + static int rover; + struct rtable *rth, **rthp; + unsigned long now = jiffies; + + for (i=0; i<RT_HASH_DIVISOR/5; i++) { + unsigned tmo = ip_rt_gc_timeout; + + rover = (rover + 1) & (RT_HASH_DIVISOR-1); + rthp = &rt_hash_table[rover]; + + while ((rth = *rthp) != NULL) { + if (rth->u.dst.expires) { + /* Entrie is expired even if it is in use */ + if ((long)(now - rth->u.dst.expires) <= 0) { + tmo >>= 1; + rthp = &rth->u.rt_next; + continue; + } + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { + tmo >>= 1; + rthp = &rth->u.rt_next; + continue; + } + + /* + * Cleanup aged off entries. + */ + *rthp = rth->u.rt_next; + rt_free(rth); + } + + /* Fallback loop breaker. 
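The aging decision shared by the periodic timer and the garbage collector lives in rt_may_expire() above: an entry still in use is never dropped, a hard-expired one always is, and otherwise two age limits apply, a short one for easily recreated broadcast/multicast entries and a longer one for "valuable" (redirected or expiring) ones. A compact model of that rule, with the struct fields flattened into parameters:

/*
 * Paraphrase of rt_may_expire(); returns non-zero when the entry may be
 * evicted.  All inputs are plain values instead of struct rtable fields,
 * so this is a model of the rule, not kernel code.
 */
static int may_expire(int in_use, int hard_expired,
                      long age, long tmo_fast, long tmo_valuable,
                      int fast_clean, int valuable)
{
    if (in_use)
        return 0;                      /* still referenced: never evict */
    if (hard_expired)
        return 1;                      /* explicit expiry (e.g. PMTU) passed */
    if (age <= tmo_fast && !fast_clean)
        return 0;                      /* fresh ordinary entry: keep */
    if (age <= tmo_valuable && valuable)
        return 0;                      /* redirected/expiring entry: keep longer */
    return 1;
}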
*/ + if ((jiffies - now) > 0) + break; + } + rt_periodic_timer.expires = now + ip_rt_gc_interval; + add_timer(&rt_periodic_timer); +} + +static void rt_run_flush(unsigned long dummy) +{ + int i; + struct rtable * rth, * next; + + rt_deadline = 0; + + start_bh_atomic(); + for (i=0; i<RT_HASH_DIVISOR; i++) { + if ((rth = xchg(&rt_hash_table[i], NULL)) == NULL) + continue; + end_bh_atomic(); + + for (; rth; rth=next) { + next = rth->u.rt_next; + rth->u.rt_next = NULL; + rt_free(rth); + } + + start_bh_atomic(); + } + end_bh_atomic(); +} + +void rt_cache_flush(int delay) +{ + unsigned long now = jiffies; + int user_mode = !in_interrupt(); + + if (delay < 0) + delay = ip_rt_min_delay; + + start_bh_atomic(); + + if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) { + long tmo = (long)(rt_deadline - now); + + /* If flush timer is already running + and flush request is not immediate (delay > 0): + + if deadline is not achieved, prolongate timer to "delay", + otherwise fire it at deadline time. + */ + + if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay) + tmo = 0; + + if (delay > tmo) + delay = tmo; + } + + if (delay <= 0) { + end_bh_atomic(); + rt_run_flush(0); + return; + } + + if (rt_deadline == 0) + rt_deadline = now + ip_rt_max_delay; + + rt_flush_timer.expires = now + delay; + add_timer(&rt_flush_timer); + end_bh_atomic(); +} + +/* + Short description of GC goals. + + We want to build algorithm, which will keep routing cache + at some equilibrium point, when number of aged off entries + is kept approximately equal to newly generated ones. + + Current expiration strength is variable "expire". + We try to adjust it dynamically, so that if networking + is idle expires is large enough to keep enough of warm entries, + and when load increases it reduces to limit cache size. + */ + +static int rt_garbage_collect(void) +{ + static unsigned expire = RT_GC_TIMEOUT; + static unsigned long last_gc; + static int rover; + static int equilibrium; + struct rtable *rth, **rthp; + unsigned long now = jiffies; + int goal; + + /* + * Garbage collection is pretty expensive, + * do not make it too frequently. + */ + if (now - last_gc < ip_rt_gc_min_interval && + atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) + return 0; + + /* Calculate number of entries, which we want to expire now. */ + goal = atomic_read(&ipv4_dst_ops.entries) - RT_HASH_DIVISOR*ip_rt_gc_elasticity; + if (goal <= 0) { + if (equilibrium < ipv4_dst_ops.gc_thresh) + equilibrium = ipv4_dst_ops.gc_thresh; + goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; + if (goal > 0) { + equilibrium += min(goal/2, RT_HASH_DIVISOR); + goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; + } + } else { + /* We are in dangerous area. Try to reduce cache really + * aggressively. + */ + goal = max(goal/2, RT_HASH_DIVISOR); + equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; + } + + if (now - last_gc >= ip_rt_gc_min_interval) + last_gc = now; + + if (goal <= 0) { + equilibrium += goal; + goto work_done; + } + + do { + int i, k; + + start_bh_atomic(); + for (i=0, k=rover; i<RT_HASH_DIVISOR; i++) { + unsigned tmo = expire; + + k = (k + 1) & (RT_HASH_DIVISOR-1); + rthp = &rt_hash_table[k]; + while ((rth = *rthp) != NULL) { + if (!rt_may_expire(rth, tmo, expire)) { + tmo >>= 1; + rthp = &rth->u.rt_next; + continue; + } + *rthp = rth->u.rt_next; + rth->u.rt_next = NULL; + rt_free(rth); + goal--; + } + if (goal <= 0) + break; + } + rover = k; + end_bh_atomic(); + + if (goal <= 0) + goto work_done; + + /* Goal is not achieved. 
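rt_garbage_collect() works toward an equilibrium size rather than a hard cap: it computes how many entries should go, then sweeps the table with an eviction age threshold that shrinks (the code below halves `expire`) whenever a pass fails to reach the goal. The loop skeleton below restates only that control flow; sweep_once() is a hypothetical helper, not a kernel function:

/*
 * Control-flow sketch of the collector's inner loop.  goal is the number
 * of entries we want gone; sweep_once(threshold) stands in for the real
 * hash-table walk, freeing every unreferenced entry older than
 * `threshold` and returning how many it freed.
 */
static void gc_pressure_loop(long goal, unsigned long expire,
                             long (*sweep_once)(unsigned long threshold))
{
    while (goal > 0 && expire > 0) {
        goal -= sweep_once(expire);
        expire >>= 1;          /* goal not met: shrink the age threshold */
    }
}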
We stop process if: + + - if expire reduced to zero. Otherwise, expire is halfed. + - if table is not full. + - if we are called from interrupt. + - jiffies check is just fallback/debug loop breaker. + We will not spin here for long time in any case. + */ + + if (expire == 0) + break; + + expire >>= 1; +#if RT_CACHE_DEBUG >= 2 + printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i); +#endif + + if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) + return 0; + } while (!in_interrupt() && jiffies - now < 1); + + if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) + return 0; + if (net_ratelimit()) + printk("dst cache overflow\n"); + return 1; + +work_done: + expire += ip_rt_gc_min_interval; + if (expire > ip_rt_gc_timeout || + atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) + expire = ip_rt_gc_timeout; +#if RT_CACHE_DEBUG >= 2 + printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover); +#endif + return 0; +} + +static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp) +{ + struct rtable *rth, **rthp; + unsigned long now = jiffies; + int attempts = !in_interrupt(); + +restart: + start_bh_atomic(); + + rthp = &rt_hash_table[hash]; + + while ((rth = *rthp) != NULL) { + if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) { + /* Put it first */ + *rthp = rth->u.rt_next; + rth->u.rt_next = rt_hash_table[hash]; + rt_hash_table[hash] = rth; + + atomic_inc(&rth->u.dst.refcnt); + atomic_inc(&rth->u.dst.use); + rth->u.dst.lastuse = now; + end_bh_atomic(); + + rt_drop(rt); + *rp = rth; + return 0; + } + + rthp = &rth->u.rt_next; + } + + /* Try to bind route to arp only if it is output + route or unicast forwarding path. + */ + if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) { + if (!arp_bind_neighbour(&rt->u.dst)) { + end_bh_atomic(); + + /* Neighbour tables are full and nothing + can be released. Try to shrink route cache, + it is most likely it holds some neighbour records. + */ + if (attempts-- > 0) { + int saved_elasticity = ip_rt_gc_elasticity; + int saved_int = ip_rt_gc_min_interval; + ip_rt_gc_elasticity = 1; + ip_rt_gc_min_interval = 0; + rt_garbage_collect(); + ip_rt_gc_min_interval = saved_int; + ip_rt_gc_elasticity = saved_elasticity; + goto restart; + } + + rt_drop(rt); + if (net_ratelimit()) + printk("neighbour table overflow\n"); + return -ENOBUFS; + } + } + + rt->u.rt_next = rt_hash_table[hash]; +#if RT_CACHE_DEBUG >= 2 + if (rt->u.rt_next) { + struct rtable * trt; + printk("rt_cache @%02x: %08x", hash, rt->rt_dst); + for (trt=rt->u.rt_next; trt; trt=trt->u.rt_next) + printk(" . 
%08x", trt->rt_dst); + printk("\n"); + } +#endif + rt_hash_table[hash] = rt; + end_bh_atomic(); + *rp = rt; + return 0; +} + +static void rt_del(unsigned hash, struct rtable *rt) +{ + struct rtable **rthp; + + start_bh_atomic(); + ip_rt_put(rt); + for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) { + if (*rthp == rt) { + *rthp = rt->u.rt_next; + rt_free(rt); + break; + } + } + end_bh_atomic(); +} + +void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, + u32 saddr, u8 tos, struct device *dev) +{ + int i, k; + struct in_device *in_dev = dev->ip_ptr; + struct rtable *rth, **rthp; + u32 skeys[2] = { saddr, 0 }; + int ikeys[2] = { dev->ifindex, 0 }; + + tos &= IPTOS_TOS_MASK; + + if (!in_dev) + return; + + if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) + || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw)) + goto reject_redirect; + + if (!IN_DEV_SHARED_MEDIA(in_dev)) { + if (!inet_addr_onlink(in_dev, new_gw, old_gw)) + goto reject_redirect; + if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) + goto reject_redirect; + } else { + if (inet_addr_type(new_gw) != RTN_UNICAST) + goto reject_redirect; + } + + for (i=0; i<2; i++) { + for (k=0; k<2; k++) { + unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos); + + rthp=&rt_hash_table[hash]; + + while ( (rth = *rthp) != NULL) { + struct rtable *rt; + + if (rth->key.dst != daddr || + rth->key.src != skeys[i] || + rth->key.tos != tos || + rth->key.oif != ikeys[k] || + rth->key.iif != 0) { + rthp = &rth->u.rt_next; + continue; + } + + if (rth->rt_dst != daddr || + rth->rt_src != saddr || + rth->u.dst.error || + rth->rt_gateway != old_gw || + rth->u.dst.dev != dev) + break; + + dst_clone(&rth->u.dst); + + rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); + if (rt == NULL) { + ip_rt_put(rth); + return; + } + + /* + * Copy all the information. + */ + *rt = *rth; + atomic_set(&rt->u.dst.refcnt, 1); + atomic_set(&rt->u.dst.use, 1); + rt->u.dst.lastuse = jiffies; + rt->u.dst.neighbour = NULL; + rt->u.dst.hh = NULL; + rt->u.dst.obsolete = 0; + + rt->rt_flags |= RTCF_REDIRECTED; + + /* Gateway is different ... */ + rt->rt_gateway = new_gw; + + /* Redirect received -> path was valid */ + dst_confirm(&rth->u.dst); + + if (!arp_bind_neighbour(&rt->u.dst) || + !(rt->u.dst.neighbour->nud_state&NUD_VALID)) { + if (rt->u.dst.neighbour) + neigh_event_send(rt->u.dst.neighbour, NULL); + ip_rt_put(rth); + rt_drop(rt); + break; + } + + rt_del(hash, rth); + + if (!rt_intern_hash(hash, rt, &rt)) + ip_rt_put(rt); + break; + } + } + } + return; + +reject_redirect: +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) + printk(KERN_INFO "Redirect from %lX/%s to %lX ignored." + "Path = %lX -> %lX, tos %02x\n", + ntohl(old_gw), dev->name, ntohl(new_gw), + ntohl(saddr), ntohl(daddr), tos); +#endif +} + +static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) +{ + struct rtable *rt = (struct rtable*)dst; + + if (rt != NULL) { + if (dst->obsolete) { + ip_rt_put(rt); + return NULL; + } + if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) { + unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos); +#if RT_CACHE_DEBUG >= 1 + printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos); +#endif + rt_del(hash, rt); + return NULL; + } + } + return dst; +} + +/* + * Algorithm: + * 1. 
The first ip_rt_redirect_number redirects are sent + * with exponential backoff, then we stop sending them at all, + * assuming that the host ignores our redirects. + * 2. If we did not see packets requiring redirects + * during ip_rt_redirect_silence, we assume that the host + * forgot redirected route and start to send redirects again. + * + * This algorithm is much cheaper and more intelligent than dumb load limiting + * in icmp.c. + * + * NOTE. Do not forget to inhibit load limiting for redirects (redundant) + * and "frag. need" (breaks PMTU discovery) in icmp.c. + */ + +void ip_rt_send_redirect(struct sk_buff *skb) +{ + struct rtable *rt = (struct rtable*)skb->dst; + struct in_device *in_dev = (struct in_device*)rt->u.dst.dev->ip_ptr; + + if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) + return; + + /* No redirected packets during ip_rt_redirect_silence; + * reset the algorithm. + */ + if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence) + rt->u.dst.rate_tokens = 0; + + /* Too many ignored redirects; do not send anything + * set u.dst.rate_last to the last seen redirected packet. + */ + if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { + rt->u.dst.rate_last = jiffies; + return; + } + + /* Check for load limit; set rate_last to the latest sent + * redirect. + */ + if (jiffies - rt->u.dst.rate_last > (ip_rt_redirect_load<<rt->u.dst.rate_tokens)) { + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + rt->u.dst.rate_last = jiffies; + ++rt->u.dst.rate_tokens; +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && + rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit()) + printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n", + rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway); +#endif + } +} + +static int ip_error(struct sk_buff *skb) +{ + struct rtable *rt = (struct rtable*)skb->dst; + unsigned long now; + int code; + + switch (rt->u.dst.error) { + case EINVAL: + default: + kfree_skb(skb); + return 0; + case EHOSTUNREACH: + code = ICMP_HOST_UNREACH; + break; + case ENETUNREACH: + code = ICMP_NET_UNREACH; + break; + case EACCES: + code = ICMP_PKT_FILTERED; + break; + } + + now = jiffies; + if ((rt->u.dst.rate_tokens += (now - rt->u.dst.rate_last)) > ip_rt_error_burst) + rt->u.dst.rate_tokens = ip_rt_error_burst; + rt->u.dst.rate_last = now; + if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { + rt->u.dst.rate_tokens -= ip_rt_error_cost; + icmp_send(skb, ICMP_DEST_UNREACH, code, 0); + } + + kfree_skb(skb); + return 0; +} + +/* + * The last two values are not from the RFC but + * are needed for AMPRnet AX.25 paths. 
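ip_rt_send_redirect() above realises the algorithm from its header comment with two per-destination fields, rate_tokens (redirects already sent) and rate_last (when the last one went out): a quiet period of ip_rt_redirect_silence resets the counter, the required gap doubles with every redirect sent, and after ip_rt_redirect_number of them no more are emitted. A self-contained restatement of that gate; the struct and helper names are illustrative:

/*
 * Exponential-backoff gate modelled on ip_rt_send_redirect().
 * Returns non-zero when a redirect should be emitted now and updates
 * the per-destination state.
 */
struct redirect_state {
    unsigned long rate_last;     /* time of last redirect sent */
    unsigned int  rate_tokens;   /* redirects sent since last reset */
};

static int should_send_redirect(struct redirect_state *st, unsigned long now,
                                unsigned long load, unsigned long silence,
                                unsigned int max_redirects)
{
    if (now - st->rate_last > silence)
        st->rate_tokens = 0;               /* peer was quiet: start over */

    if (st->rate_tokens >= max_redirects) {
        st->rate_last = now;               /* give up, just note the packet */
        return 0;
    }

    if (now - st->rate_last > (load << st->rate_tokens)) {
        st->rate_last = now;               /* spacing doubles each time */
        st->rate_tokens++;
        return 1;
    }
    return 0;
}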
+ */ + +static unsigned short mtu_plateau[] = +{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; + +static __inline__ unsigned short guess_mtu(unsigned short old_mtu) +{ + int i; + + for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++) + if (old_mtu > mtu_plateau[i]) + return mtu_plateau[i]; + return 68; +} + +unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) +{ + int i; + unsigned short old_mtu = ntohs(iph->tot_len); + struct rtable *rth; + u32 skeys[2] = { iph->saddr, 0, }; + u32 daddr = iph->daddr; + u8 tos = iph->tos & IPTOS_TOS_MASK; + unsigned short est_mtu = 0; + + if (ipv4_config.no_pmtu_disc) + return 0; + + for (i=0; i<2; i++) { + unsigned hash = rt_hash_code(daddr, skeys[i], tos); + + for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) { + if (rth->key.dst == daddr && + rth->key.src == skeys[i] && + rth->rt_dst == daddr && + rth->rt_src == iph->saddr && + rth->key.tos == tos && + rth->key.iif == 0 && + !(rth->u.dst.mxlock&(1<<RTAX_MTU))) { + unsigned short mtu = new_mtu; + + if (new_mtu < 68 || new_mtu >= old_mtu) { + + /* BSD 4.2 compatibility hack :-( */ + if (mtu == 0 && old_mtu >= rth->u.dst.pmtu && + old_mtu >= 68 + (iph->ihl<<2)) + old_mtu -= iph->ihl<<2; + + mtu = guess_mtu(old_mtu); + } + if (mtu <= rth->u.dst.pmtu) { + if (mtu < rth->u.dst.pmtu) { + dst_confirm(&rth->u.dst); + rth->u.dst.pmtu = mtu; + dst_set_expires(&rth->u.dst, ip_rt_mtu_expires); + } + est_mtu = mtu; + } + } + } + } + return est_mtu ? : new_mtu; +} + +void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu) +{ + if (dst->pmtu > mtu && mtu >= 68 && + !(dst->mxlock&(1<<RTAX_MTU))) { + dst->pmtu = mtu; + dst_set_expires(dst, ip_rt_mtu_expires); + } +} + +static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32 cookie) +{ + dst_release(dst); + return NULL; +} + +static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst, + struct sk_buff *skb) +{ + return NULL; +} + +static void ipv4_link_failure(struct sk_buff *skb) +{ + struct rtable *rt; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + + rt = (struct rtable *) skb->dst; + if (rt) + dst_set_expires(&rt->u.dst, 0); +} + +static int ip_rt_bug(struct sk_buff *skb) +{ + printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr, + skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?"); + kfree_skb(skb); + return 0; +} + +/* + We do not cache source address of outgoing interface, + because it is used only by IP RR, TS and SRR options, + so that it out of fast path. + + BTW remember: "addr" is allowed to be not aligned + in IP options! 
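When an ICMP "fragmentation needed" arrives without an MTU value (old routers), ip_rt_frag_needed() falls back on guess_mtu() above, which walks the RFC 1191 plateau table and returns the first plateau strictly below the offending packet's size, bottoming out at 68. The same search, reproduced standalone for illustration:

/* The plateau search of guess_mtu(), copied here for illustration. */
static const unsigned short plateaus[] =
    { 32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static unsigned short guess_mtu_sketch(unsigned short old_mtu)
{
    unsigned int i;

    for (i = 0; i < sizeof(plateaus) / sizeof(plateaus[0]); i++)
        if (old_mtu > plateaus[i])
            return plateaus[i];      /* first plateau strictly below old_mtu */
    return 68;                       /* minimum IPv4 MTU the stack will use */
}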
+ */ + +void ip_rt_get_source(u8 *addr, struct rtable *rt) +{ + u32 src; + struct fib_result res; + + if (rt->key.iif == 0) + src = rt->rt_src; + else if (fib_lookup(&rt->key, &res) == 0 && res.type != RTN_NAT) + src = FIB_RES_PREFSRC(res); + else + src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); + memcpy(addr, &src, 4); +} + +#ifdef CONFIG_NET_CLS_ROUTE +static void set_class_tag(struct rtable *rt, u32 tag) +{ + if (!(rt->u.dst.tclassid&0xFFFF)) + rt->u.dst.tclassid |= tag&0xFFFF; + if (!(rt->u.dst.tclassid&0xFFFF0000)) + rt->u.dst.tclassid |= tag&0xFFFF0000; +} +#endif + +static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) +{ + struct fib_info *fi = res->fi; + + if (fi) { + if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) + rt->rt_gateway = FIB_RES_GW(*res); + rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1]; + rt->u.dst.pmtu = fi->fib_mtu; + if (fi->fib_mtu == 0) { + rt->u.dst.pmtu = rt->u.dst.dev->mtu; + if (rt->u.dst.pmtu > IP_MAX_MTU) + rt->u.dst.pmtu = IP_MAX_MTU; + if (rt->u.dst.pmtu < 68) + rt->u.dst.pmtu = 68; + if (rt->u.dst.mxlock&(1<<RTAX_MTU) && + rt->rt_gateway != rt->rt_dst && + rt->u.dst.pmtu > 576) + rt->u.dst.pmtu = 576; + } + rt->u.dst.window= fi->fib_window ? : 0; + rt->u.dst.rtt = fi->fib_rtt ? : TCP_TIMEOUT_INIT; +#ifdef CONFIG_NET_CLS_ROUTE + rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; +#endif + } else { + rt->u.dst.pmtu = rt->u.dst.dev->mtu; + if (rt->u.dst.pmtu > IP_MAX_MTU) + rt->u.dst.pmtu = IP_MAX_MTU; + if (rt->u.dst.pmtu < 68) + rt->u.dst.pmtu = 68; + rt->u.dst.window= 0; + rt->u.dst.rtt = TCP_TIMEOUT_INIT; + } +#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_MULTIPLE_TABLES + set_class_tag(rt, fib_rules_tclass(res)); +#endif + set_class_tag(rt, itag); +#endif + rt->rt_type = res->type; +} + +static int +ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, + u8 tos, struct device *dev, int our) +{ + unsigned hash; + struct rtable *rth; + u32 spec_dst; + struct in_device *in_dev = dev->ip_ptr; + u32 itag = 0; + + /* Primary sanity checks. */ + + if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) || + in_dev == NULL || skb->protocol != __constant_htons(ETH_P_IP)) + return -EINVAL; + + if (ZERONET(saddr)) { + if (!LOCAL_MCAST(daddr)) + return -EINVAL; + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); + } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0) + return -EINVAL; + + rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); + if (!rth) + return -ENOBUFS; + + rth->u.dst.output= ip_rt_bug; + + atomic_set(&rth->u.dst.use, 1); + rth->key.dst = daddr; + rth->rt_dst = daddr; + rth->key.tos = tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->key.fwmark = skb->fwmark; +#endif + rth->key.src = saddr; + rth->rt_src = saddr; +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_dst_map = daddr; + rth->rt_src_map = saddr; +#endif +#ifdef CONFIG_NET_CLS_ROUTE + rth->u.dst.tclassid = itag; +#endif + rth->rt_iif = + rth->key.iif = dev->ifindex; + rth->u.dst.dev = &loopback_dev; + rth->key.oif = 0; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->rt_type = RTN_MULTICAST; + rth->rt_flags = RTCF_MULTICAST; + if (our) { + rth->u.dst.input= ip_local_deliver; + rth->rt_flags |= RTCF_LOCAL; + } + +#ifdef CONFIG_IP_MROUTE + if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) + rth->u.dst.input = ip_mr_input; +#endif + + hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos); + return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); +} + +/* + * NOTE. 
We drop all the packets that has local source + * addresses, because every properly looped back packet + * must have correct destination already attached by output routine. + * + * Such approach solves two big problems: + * 1. Not simplex devices are handled properly. + * 2. IP spoofing attempts are filtered with 100% of guarantee. + */ + +int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, + u8 tos, struct device *dev) +{ + struct rt_key key; + struct fib_result res; + struct in_device *in_dev = dev->ip_ptr; + struct in_device *out_dev; + unsigned flags = 0; + u32 itag = 0; + struct rtable * rth; + unsigned hash; + u32 spec_dst; + int err = -EINVAL; + + /* + * IP on this device is disabled. + */ + + if (!in_dev) + return -EINVAL; + + key.dst = daddr; + key.src = saddr; + key.tos = tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + key.fwmark = skb->fwmark; +#endif + key.iif = dev->ifindex; + key.oif = 0; + key.scope = RT_SCOPE_UNIVERSE; + + hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos); + + /* Check for the most weird martians, which can be not detected + by fib_lookup. + */ + + if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr)) + goto martian_source; + + if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0)) + goto brd_input; + + /* Accept zero addresses only to limited broadcast; + * I even do not know to fix it or not. Waiting for complains :-) + */ + if (ZERONET(saddr)) + goto martian_source; + + if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr)) + goto martian_destination; + + /* + * Now we are ready to route packet. + */ + if ((err = fib_lookup(&key, &res))) { + if (!IN_DEV_FORWARD(in_dev)) + return -EINVAL; + goto no_route; + } + +#ifdef CONFIG_IP_ROUTE_NAT + /* Policy is applied before mapping destination, + but rerouting after map should be made with old source. + */ + + if (1) { + u32 src_map = saddr; + if (res.r) + src_map = fib_rules_policy(saddr, &res, &flags); + + if (res.type == RTN_NAT) { + key.dst = fib_rules_map_destination(daddr, &res); + if (fib_lookup(&key, &res) || res.type != RTN_UNICAST) + return -EINVAL; + flags |= RTCF_DNAT; + } + key.src = src_map; + } +#endif + + if (res.type == RTN_BROADCAST) + goto brd_input; + + if (res.type == RTN_LOCAL) { + int result; + result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex, + dev, &spec_dst, &itag); + if (result < 0) + goto martian_source; + if (result) + flags |= RTCF_DIRECTSRC; + spec_dst = daddr; + goto local_input; + } + + if (!IN_DEV_FORWARD(in_dev)) + return -EINVAL; + if (res.type != RTN_UNICAST) + goto martian_destination; + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res.fi->fib_nhs > 1 && key.oif == 0) + fib_select_multipath(&key, &res); +#endif + out_dev = FIB_RES_DEV(res)->ip_ptr; + if (out_dev == NULL) { + if (net_ratelimit()) + printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n"); + return -EINVAL; + } + + err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag); + if (err < 0) + goto martian_source; + + if (err) + flags |= RTCF_DIRECTSRC; + + if (out_dev == in_dev && err && !(flags&(RTCF_NAT|RTCF_MASQ)) && + (IN_DEV_SHARED_MEDIA(out_dev) + || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res)))) + flags |= RTCF_DOREDIRECT; + + if (skb->protocol != __constant_htons(ETH_P_IP)) { + /* Not IP (i.e. ARP). Do not create route, if it is + * invalid for proxy arp. DNAT routes are always valid. 
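Before any of the forwarding logic above runs, ip_route_input_slow() screens out "martian" packets as RFC 1812 requires: multicast, class E or loopback source addresses are rejected, zero sources are only tolerated for the limited broadcast, and class E, zero-net or loopback destinations are refused outright. A condensed version of that screen, with the kernel's MULTICAST/BADCLASS/ZERONET/LOOPBACK macros replaced by host-byte-order stand-ins:

#include <stdint.h>

/* Host-order address class tests (stand-ins for the kernel macros). */
static int is_multicast(uint32_t a) { return (a & 0xF0000000u) == 0xE0000000u; }
static int is_badclass(uint32_t a)  { return (a & 0xF0000000u) == 0xF0000000u; }
static int is_zeronet(uint32_t a)   { return (a & 0xFF000000u) == 0; }
static int is_loopback(uint32_t a)  { return (a & 0xFF000000u) == 0x7F000000u; }

enum verdict { MARTIAN_SRC, MARTIAN_DST, BROADCAST_INPUT, ROUTE_NORMALLY };

/* Mirrors the early checks in ip_route_input_slow(), host byte order. */
static enum verdict screen(uint32_t saddr, uint32_t daddr)
{
    if (is_multicast(saddr) || is_badclass(saddr) || is_loopback(saddr))
        return MARTIAN_SRC;
    if (daddr == 0xFFFFFFFFu || (saddr == 0 && daddr == 0))
        return BROADCAST_INPUT;
    if (is_zeronet(saddr))
        return MARTIAN_SRC;        /* zero source only valid for limited bcast */
    if (is_badclass(daddr) || is_zeronet(daddr) || is_loopback(daddr))
        return MARTIAN_DST;
    return ROUTE_NORMALLY;
}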
+ */ + if (out_dev == in_dev && !(flags&RTCF_DNAT)) + return -EINVAL; + } + + rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); + if (!rth) + return -ENOBUFS; + + atomic_set(&rth->u.dst.use, 1); + rth->key.dst = daddr; + rth->rt_dst = daddr; + rth->key.tos = tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->key.fwmark = skb->fwmark; +#endif + rth->key.src = saddr; + rth->rt_src = saddr; + rth->rt_gateway = daddr; +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_src_map = key.src; + rth->rt_dst_map = key.dst; + if (flags&RTCF_DNAT) + rth->rt_gateway = key.dst; +#endif + rth->rt_iif = + rth->key.iif = dev->ifindex; + rth->u.dst.dev = out_dev->dev; + rth->key.oif = 0; + rth->rt_spec_dst= spec_dst; + + rth->u.dst.input = ip_forward; + rth->u.dst.output = ip_output; + + rt_set_nexthop(rth, &res, itag); + + rth->rt_flags = flags; + +#ifdef CONFIG_NET_FASTROUTE + if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) { + struct device *odev = rth->u.dst.dev; + if (odev != dev && + dev->accept_fastpath && + odev->mtu >= dev->mtu && + dev->accept_fastpath(dev, &rth->u.dst) == 0) + rth->rt_flags |= RTCF_FAST; + } +#endif + + return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); + +brd_input: + if (skb->protocol != __constant_htons(ETH_P_IP)) + return -EINVAL; + + if (ZERONET(saddr)) { + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); + } else { + err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag); + if (err < 0) + goto martian_source; + if (err) + flags |= RTCF_DIRECTSRC; + } + flags |= RTCF_BROADCAST; + res.type = RTN_BROADCAST; + +local_input: + rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); + if (!rth) + return -ENOBUFS; + + rth->u.dst.output= ip_rt_bug; + + atomic_set(&rth->u.dst.use, 1); + rth->key.dst = daddr; + rth->rt_dst = daddr; + rth->key.tos = tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->key.fwmark = skb->fwmark; +#endif + rth->key.src = saddr; + rth->rt_src = saddr; +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_dst_map = key.dst; + rth->rt_src_map = key.src; +#endif +#ifdef CONFIG_NET_CLS_ROUTE + rth->u.dst.tclassid = itag; +#endif + rth->rt_iif = + rth->key.iif = dev->ifindex; + rth->u.dst.dev = &loopback_dev; + rth->key.oif = 0; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->u.dst.input= ip_local_deliver; + rth->rt_flags = flags|RTCF_LOCAL; + if (res.type == RTN_UNREACHABLE) { + rth->u.dst.input= ip_error; + rth->u.dst.error= -err; + rth->rt_flags &= ~RTCF_LOCAL; + } + rth->rt_type = res.type; + return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); + +no_route: + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + res.type = RTN_UNREACHABLE; + goto local_input; + + /* + * Do not cache martian addresses: they should be logged (RFC1812) + */ +martian_destination: +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) + printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name); +#endif + return -EINVAL; + +martian_source: +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { + /* + * RFC1812 recommenadtion, if source is martian, + * the only hint is MAC header. 
+ */ + printk(KERN_WARNING "martian source %08x for %08x, dev %s\n", saddr, daddr, dev->name); + if (dev->hard_header_len) { + int i; + unsigned char *p = skb->mac.raw; + printk(KERN_WARNING "ll header:"); + for (i=0; i<dev->hard_header_len; i++, p++) + printk(" %02x", *p); + printk("\n"); + } + } +#endif + return -EINVAL; +} + +int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, + u8 tos, struct device *dev) +{ + struct rtable * rth; + unsigned hash; + int iif = dev->ifindex; + + tos &= IPTOS_TOS_MASK; + hash = rt_hash_code(daddr, saddr^(iif<<5), tos); + + for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { + if (rth->key.dst == daddr && + rth->key.src == saddr && + rth->key.iif == iif && + rth->key.oif == 0 && +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->key.fwmark == skb->fwmark && +#endif + rth->key.tos == tos) { + rth->u.dst.lastuse = jiffies; + atomic_inc(&rth->u.dst.use); + atomic_inc(&rth->u.dst.refcnt); + skb->dst = (struct dst_entry*)rth; + return 0; + } + } + + /* Multicast recognition logic is moved from route cache to here. + The problem was that too many Ethernet cards have broken/missing + hardware multicast filters :-( As result the host on multicasting + network acquires a lot of useless route cache entries, sort of + SDR messages from all the world. Now we try to get rid of them. + Really, provided software IP multicast filter is organized + reasonably (at least, hashed), it does not result in a slowdown + comparing with route cache reject entries. + Note, that multicast routers are not affected, because + route cache entry is created eventually. + */ + if (MULTICAST(daddr)) { + int our = ip_check_mc(dev, daddr); + if (!our +#ifdef CONFIG_IP_MROUTE + && (LOCAL_MCAST(daddr) || !dev->ip_ptr || + !IN_DEV_MFORWARD((struct in_device*)dev->ip_ptr)) +#endif + ) return -EINVAL; + return ip_route_input_mc(skb, daddr, saddr, tos, dev, our); + } + return ip_route_input_slow(skb, daddr, saddr, tos, dev); +} + +/* + * Major route resolver routine. + */ + +int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif) +{ + struct rt_key key; + struct fib_result res; + unsigned flags = 0; + struct rtable *rth; + struct device *dev_out = NULL; + unsigned hash; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + u32 nochecksrc = (tos & RTO_TPROXY); +#endif + + tos &= IPTOS_TOS_MASK|RTO_ONLINK; + key.dst = daddr; + key.src = saddr; + key.tos = tos&IPTOS_TOS_MASK; + key.iif = loopback_dev.ifindex; + key.oif = oif; + key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; + res.fi = NULL; +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif + + if (saddr) { + if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr)) + return -EINVAL; + + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = ip_dev_find(saddr); +#ifdef CONFIG_IP_TRANSPARENT_PROXY + /* If address is not local, test for transparent proxy flag; + if address is local --- clear the flag. + */ + if (dev_out == NULL) { + if (nochecksrc == 0 || inet_addr_type(saddr) != RTN_UNICAST) + return -EINVAL; + flags |= RTCF_TPROXY; + } +#else + if (dev_out == NULL) + return -EINVAL; +#endif + + /* I removed check for oif == dev_out->oif here. + It was wrong by three reasons: + 1. ip_dev_find(saddr) can return wrong iface, if saddr is + assigned to multiple interfaces. + 2. Moreover, we are allowed to send packets with saddr + of another iface. 
--ANK + */ + + if (oif == 0 && +#ifdef CONFIG_IP_TRANSPARENT_PROXY + dev_out && +#endif + (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) { + /* Special hack: user can direct multicasts + and limited broadcast via necessary interface + without fiddling with IP_MULTICAST_IF or IP_PKTINFO. + This hack is not just for fun, it allows + vic,vat and friends to work. + They bind socket to loopback, set ttl to zero + and expect that it will work. + From the viewpoint of routing cache they are broken, + because we are not allowed to build multicast path + with loopback source addr (look, routing cache + cannot know, that ttl is zero, so that packet + will not leave this host and route is valid). + Luckily, this hack is good workaround. + */ + + key.oif = dev_out->ifindex; + goto make_route; + } + dev_out = NULL; + } + if (oif) { + dev_out = dev_get_by_index(oif); + if (dev_out == NULL) + return -ENODEV; + if (dev_out->ip_ptr == NULL) + return -ENODEV; /* Wrong error code */ + + if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) { + if (!key.src) + key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); + goto make_route; + } + if (!key.src) { + if (MULTICAST(daddr)) + key.src = inet_select_addr(dev_out, 0, key.scope); + else if (!daddr) + key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); + } + } + + if (!key.dst) { + key.dst = key.src; + if (!key.dst) + key.dst = key.src = htonl(INADDR_LOOPBACK); + dev_out = &loopback_dev; + key.oif = loopback_dev.ifindex; + res.type = RTN_LOCAL; + flags |= RTCF_LOCAL; + goto make_route; + } + + if (fib_lookup(&key, &res)) { + res.fi = NULL; + if (oif) { + /* Apparently, routing tables are wrong. Assume, + that the destination is on link. + + WHY? DW. + Because we are allowed to send to iface + even if it has NO routes and NO assigned + addresses. When oif is specified, routing + tables are looked up with only one purpose: + to catch if destination is gatewayed, rather than + direct. Moreover, if MSG_DONTROUTE is set, + we send packet, ignoring both routing tables + and ifaddr state. --ANK + + + We could make it even if oif is unknown, + likely IPv6, but we do not. + */ + + if (key.src == 0) + key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); + res.type = RTN_UNICAST; + goto make_route; + } + return -ENETUNREACH; + } + + if (res.type == RTN_NAT) + return -EINVAL; + + if (res.type == RTN_LOCAL) { + if (!key.src) + key.src = key.dst; + dev_out = &loopback_dev; + key.oif = dev_out->ifindex; + res.fi = NULL; + flags |= RTCF_LOCAL; + goto make_route; + } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res.fi->fib_nhs > 1 && key.oif == 0) + fib_select_multipath(&key, &res); + else +#endif + if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0) + fib_select_default(&key, &res); + + if (!key.src) + key.src = FIB_RES_PREFSRC(res); + + dev_out = FIB_RES_DEV(res); + key.oif = dev_out->ifindex; + +make_route: + if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK)) + return -EINVAL; + + if (key.dst == 0xFFFFFFFF) + res.type = RTN_BROADCAST; + else if (MULTICAST(key.dst)) + res.type = RTN_MULTICAST; + else if (BADCLASS(key.dst) || ZERONET(key.dst)) + return -EINVAL; + + if (dev_out->flags&IFF_LOOPBACK) + flags |= RTCF_LOCAL; + + if (res.type == RTN_BROADCAST) { + flags |= RTCF_BROADCAST|RTCF_LOCAL; + res.fi = NULL; + } else if (res.type == RTN_MULTICAST) { + flags |= RTCF_MULTICAST|RTCF_LOCAL; + if (!ip_check_mc(dev_out, daddr)) + flags &= ~RTCF_LOCAL; + /* If multicast route do not exist use + default one, but do not gateway in this case. + Yes, it is hack. 
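Once ip_route_output_slow() has settled on a source address and an output device, the make_route: section classifies the destination before the cache entry is filled in: limited broadcast first, then multicast, then the reserved classes are rejected, and a loopback device additionally forces the LOCAL flag. A compact restatement of just that classification; the address tests are simplified stand-ins for the kernel macros:

#include <stdint.h>

enum out_type { OUT_BROADCAST, OUT_MULTICAST, OUT_UNICAST, OUT_INVALID };

/*
 * Destination classification performed at make_route: in
 * ip_route_output_slow(), host byte order.
 */
static enum out_type classify_output(uint32_t dst)
{
    if (dst == 0xFFFFFFFFu)
        return OUT_BROADCAST;                       /* limited broadcast */
    if ((dst & 0xF0000000u) == 0xE0000000u)
        return OUT_MULTICAST;                       /* class D */
    if ((dst & 0xF0000000u) == 0xF0000000u ||       /* class E */
        (dst & 0xFF000000u) == 0)                   /* zero network */
        return OUT_INVALID;
    return OUT_UNICAST;
}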
+ */ + if (res.fi && res.prefixlen < 4) + res.fi = NULL; + } + + rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); + if (!rth) + return -ENOBUFS; + + atomic_set(&rth->u.dst.use, 1); + rth->key.dst = daddr; + rth->key.tos = tos; + rth->key.src = saddr; + rth->key.iif = 0; + rth->key.oif = oif; + rth->rt_dst = key.dst; + rth->rt_src = key.src; +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_dst_map = key.dst; + rth->rt_src_map = key.src; +#endif + rth->rt_iif = oif ? : dev_out->ifindex; + rth->u.dst.dev = dev_out; + rth->rt_gateway = key.dst; + rth->rt_spec_dst= key.src; + + rth->u.dst.output=ip_output; + + if (flags&RTCF_LOCAL) { + rth->u.dst.input = ip_local_deliver; + rth->rt_spec_dst = key.dst; + } + if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) { + rth->rt_spec_dst = key.src; + if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK)) + rth->u.dst.output = ip_mc_output; +#ifdef CONFIG_IP_MROUTE + if (res.type == RTN_MULTICAST && dev_out->ip_ptr) { + struct in_device *in_dev = dev_out->ip_ptr; + if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) { + rth->u.dst.input = ip_mr_input; + rth->u.dst.output = ip_mc_output; + } + } +#endif + } + + rt_set_nexthop(rth, &res, 0); + + rth->rt_flags = flags; + + hash = rt_hash_code(daddr, saddr^(oif<<5), tos); + return rt_intern_hash(hash, rth, rp); +} + +int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif) +{ + unsigned hash; + struct rtable *rth; + + hash = rt_hash_code(daddr, saddr^(oif<<5), tos); + + start_bh_atomic(); + for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { + if (rth->key.dst == daddr && + rth->key.src == saddr && + rth->key.iif == 0 && + rth->key.oif == oif && +#ifndef CONFIG_IP_TRANSPARENT_PROXY + rth->key.tos == tos +#else + !((rth->key.tos^tos)&(IPTOS_TOS_MASK|RTO_ONLINK)) && + ((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY)) +#endif + ) { + rth->u.dst.lastuse = jiffies; + atomic_inc(&rth->u.dst.use); + atomic_inc(&rth->u.dst.refcnt); + end_bh_atomic(); + *rp = rth; + return 0; + } + } + end_bh_atomic(); + + return ip_route_output_slow(rp, daddr, saddr, tos, oif); +} + +#ifdef CONFIG_RTNETLINK + +static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait) +{ + struct rtable *rt = (struct rtable*)skb->dst; + struct rtmsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct rta_cacheinfo ci; +#ifdef CONFIG_IP_MROUTE + struct rtattr *eptr; +#endif + struct rtattr *mx; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); + r = NLMSG_DATA(nlh); + nlh->nlmsg_flags = (nowait && pid) ? 
NLM_F_MULTI : 0; + r->rtm_family = AF_INET; + r->rtm_dst_len = 32; + r->rtm_src_len = 0; + r->rtm_tos = rt->key.tos; + r->rtm_table = RT_TABLE_MAIN; + r->rtm_type = rt->rt_type; + r->rtm_scope = RT_SCOPE_UNIVERSE; + r->rtm_protocol = RTPROT_UNSPEC; + r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED; + if (rt->rt_flags & RTCF_NOTIFY) + r->rtm_flags |= RTM_F_NOTIFY; + RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); + if (rt->key.src) { + r->rtm_src_len = 32; + RTA_PUT(skb, RTA_SRC, 4, &rt->key.src); + } + if (rt->u.dst.dev) + RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); +#ifdef CONFIG_NET_CLS_ROUTE + if (rt->u.dst.tclassid) + RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid); +#endif + if (rt->key.iif) + RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); + else if (rt->rt_src != rt->key.src) + RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src); + if (rt->rt_dst != rt->rt_gateway) + RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway); + mx = (struct rtattr*)skb->tail; + RTA_PUT(skb, RTA_METRICS, 0, NULL); + if (rt->u.dst.mxlock) + RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock); + if (rt->u.dst.pmtu) + RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu); + if (rt->u.dst.window) + RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window); + if (rt->u.dst.rtt) + RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt); + mx->rta_len = skb->tail - (u8*)mx; + if (mx->rta_len == RTA_LENGTH(0)) + skb_trim(skb, (u8*)mx - skb->data); + ci.rta_lastuse = jiffies - rt->u.dst.lastuse; + ci.rta_used = atomic_read(&rt->u.dst.refcnt); + ci.rta_clntref = atomic_read(&rt->u.dst.use); + if (rt->u.dst.expires) + ci.rta_expires = rt->u.dst.expires - jiffies; + else + ci.rta_expires = 0; + ci.rta_error = rt->u.dst.error; +#ifdef CONFIG_IP_MROUTE + eptr = (struct rtattr*)skb->tail; +#endif + RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); + if (rt->key.iif) { +#ifdef CONFIG_IP_MROUTE + u32 dst = rt->rt_dst; + + if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) { + int err = ipmr_get_route(skb, r, nowait); + if (err <= 0) { + if (!nowait) { + if (err == 0) + return 0; + goto nlmsg_failure; + } else { + if (err == -EMSGSIZE) + goto nlmsg_failure; + ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err; + } + } + } else +#endif + { + RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif); + } + } + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtattr **rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct rtable *rt = NULL; + u32 dst = 0; + u32 src = 0; + int iif = 0; + int err; + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + return -ENOBUFS; + + /* Reserve room for dummy headers, this skb can pass + through good chunk of routing engine. 
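+	   Reserving MAX_HEADER plus an IP header below means the routing
+	   code can treat this locally built skb like a freshly received
+	   packet when an input interface (RTA_IIF) is supplied and
+	   ip_route_input() is used for the lookup.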
+ */ + skb->mac.raw = skb->data; + skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); + + if (rta[RTA_SRC-1]) + memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4); + if (rta[RTA_DST-1]) + memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4); + if (rta[RTA_IIF-1]) + memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int)); + + if (iif) { + struct device *dev; + dev = dev_get_by_index(iif); + if (!dev) + return -ENODEV; + skb->protocol = __constant_htons(ETH_P_IP); + skb->dev = dev; + start_bh_atomic(); + err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); + end_bh_atomic(); + rt = (struct rtable*)skb->dst; + if (!err && rt->u.dst.error) + err = -rt->u.dst.error; + } else { + int oif = 0; + if (rta[RTA_OIF-1]) + memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int)); + err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif); + } + if (err) { + kfree_skb(skb); + return err; + } + + skb->dst = &rt->u.dst; + if (rtm->rtm_flags & RTM_F_NOTIFY) + rt->rt_flags |= RTCF_NOTIFY; + + NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; + + err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0); + if (err == 0) + return 0; + if (err < 0) + return -EMSGSIZE; + + err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + if (err < 0) + return err; + return 0; +} + + +int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rtable *rt; + int h, s_h; + int idx, s_idx; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + for (h=0; h < RT_HASH_DIVISOR; h++) { + if (h < s_h) continue; + if (h > s_h) + s_idx = 0; + start_bh_atomic(); + for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) { + if (idx < s_idx) + continue; + skb->dst = dst_clone(&rt->u.dst); + if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) { + dst_release(xchg(&skb->dst, NULL)); + end_bh_atomic(); + goto done; + } + dst_release(xchg(&skb->dst, NULL)); + } + end_bh_atomic(); + } + +done: + cb->args[0] = h; + cb->args[1] = idx; + return skb->len; +} + +#endif /* CONFIG_RTNETLINK */ + +void ip_rt_multicast_event(struct in_device *in_dev) +{ + rt_cache_flush(0); +} + + + +#ifdef CONFIG_SYSCTL + +static int flush_delay; + +static +int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp, + void *buffer, size_t *lenp) +{ + if (write) { + proc_dointvec(ctl, write, filp, buffer, lenp); + rt_cache_flush(flush_delay); + return 0; + } else + return -EINVAL; +} + +static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name, int nlen, + void *oldval, size_t *oldlenp, + void *newval, size_t newlen, + void **context) +{ + int delay; + if (newlen != sizeof(int)) + return -EINVAL; + if (get_user(delay,(int *)newval)) + return -EFAULT; + rt_cache_flush(delay); + return 0; +} + +ctl_table ipv4_route_table[] = { + {NET_IPV4_ROUTE_FLUSH, "flush", + &flush_delay, sizeof(int), 0644, NULL, + &ipv4_sysctl_rtcache_flush, &ipv4_sysctl_rtcache_flush_strategy }, + {NET_IPV4_ROUTE_MIN_DELAY, "min_delay", + &ip_rt_min_delay, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies, &sysctl_jiffies}, + {NET_IPV4_ROUTE_MAX_DELAY, "max_delay", + &ip_rt_max_delay, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies, &sysctl_jiffies}, + {NET_IPV4_ROUTE_GC_THRESH, "gc_thresh", + &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ROUTE_MAX_SIZE, "max_size", + &ip_rt_max_size, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval", + &ip_rt_gc_min_interval, sizeof(int), 0644, NULL, + 
&proc_dointvec_jiffies, &sysctl_jiffies}, + {NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout", + &ip_rt_gc_timeout, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies, &sysctl_jiffies}, + {NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval", + &ip_rt_gc_interval, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies, &sysctl_jiffies}, + {NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load", + &ip_rt_redirect_load, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number", + &ip_rt_redirect_number, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence", + &ip_rt_redirect_silence, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ROUTE_ERROR_COST, "error_cost", + &ip_rt_error_cost, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ROUTE_ERROR_BURST, "error_burst", + &ip_rt_error_burst, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity", + &ip_rt_gc_elasticity, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires", + &ip_rt_mtu_expires, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies, &sysctl_jiffies}, + {0} +}; +#endif + +#ifdef CONFIG_NET_CLS_ROUTE +struct ip_rt_acct ip_rt_acct[256]; + +#ifdef CONFIG_PROC_FS +static int ip_rt_acct_read(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + *start=buffer; + + if (offset + length > sizeof(ip_rt_acct)) { + length = sizeof(ip_rt_acct) - offset; + *eof = 1; + } + if (length > 0) { + start_bh_atomic(); + memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length); + end_bh_atomic(); + return length; + } + return 0; +} +#endif +#endif + + +__initfunc(void ip_rt_init(void)) +{ +#ifdef CONFIG_PROC_FS +#ifdef CONFIG_NET_CLS_ROUTE + struct proc_dir_entry *ent; +#endif +#endif + devinet_init(); + ip_fib_init(); + rt_periodic_timer.function = rt_check_expire; + /* All the timers, started at system startup tend + to synchronize. Perturb it a bit. + */ + rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval + + ip_rt_gc_interval; + add_timer(&rt_periodic_timer); + +#ifdef CONFIG_PROC_FS + proc_net_register(&(struct proc_dir_entry) { + PROC_NET_RTCACHE, 8, "rt_cache", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + rt_cache_get_info + }); +#ifdef CONFIG_NET_CLS_ROUTE + ent = create_proc_entry("net/rt_acct", 0, 0); + ent->read_proc = ip_rt_acct_read; +#endif +#endif +} diff --git a/pfinet/linux-src/net/ipv4/syncookies.c b/pfinet/linux-src/net/ipv4/syncookies.c new file mode 100644 index 00000000..fb4e8f80 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/syncookies.c @@ -0,0 +1,201 @@ +/* + * Syncookies implementation for the Linux kernel + * + * Copyright (C) 1997 Andi Kleen + * Based on ideas by D.J.Bernstein and Eric Schenk. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * $Id: syncookies.c,v 1.7.2.1 1999/08/08 08:43:13 davem Exp $ + * + * Missing: IPv6 support. + */ + +#include <linux/config.h> +#if defined(CONFIG_SYN_COOKIES) +#include <linux/tcp.h> +#include <linux/malloc.h> +#include <linux/random.h> +#include <net/tcp.h> + +extern int sysctl_tcp_syncookies; + +static unsigned long tcp_lastsynq_overflow; + +/* + * This table has to be sorted and terminated with (__u16)-1. + * XXX generate a better table. 
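+ *
+ * For illustration: the entries hold MSS-1, cookie_v4_init_sequence()
+ * scans for the largest value not above the client's MSS (the 64-byte
+ * entry acting as a floor), and only the resulting index is folded into
+ * the cookie.  A SYN advertising an MSS of 1400 stops at 1024-1
+ * (index 4), so the cookie encodes 4 and the connection proceeds with
+ * an MSS of 1024.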
+ * Unresolved Issues: HIPPI with a 64k MSS is not well supported. + */ +static __u16 const msstab[] = { + 64-1, + 256-1, + 512-1, + 536-1, + 1024-1, + 1440-1, + 1460-1, + 4312-1, + (__u16)-1 +}; +/* The number doesn't include the -1 terminator */ +#define NUM_MSS (sizeof(msstab)/sizeof(msstab[0]) - 1) + +/* + * Generate a syncookie. mssp points to the mss, which is returned + * rounded down to the value encoded in the cookie. + */ +__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, + __u16 *mssp) +{ + int mssind; + const __u16 mss = *mssp; + + tcp_lastsynq_overflow = jiffies; + /* XXX sort msstab[] by probability? Binary search? */ + for (mssind = 0; mss > msstab[mssind+1]; mssind++) + ; + *mssp = msstab[mssind]+1; + + net_statistics.SyncookiesSent++; + + return secure_tcp_syn_cookie(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->h.th->source, skb->h.th->dest, + ntohl(skb->h.th->seq), + jiffies / (HZ*60), mssind); +} + +/* + * This (misnamed) value is the age of syncookie which is permitted. + * Its ideal value should be dependent on TCP_TIMEOUT_INIT and + * sysctl_tcp_retries1. It's a rather complicated formula (exponential + * backoff) to compute at runtime so it's currently hardcoded here. + */ +#define COUNTER_TRIES 4 +/* + * Check if a ack sequence number is a valid syncookie. + * Return the decoded mss if it is, or 0 if not. + */ +static inline int cookie_check(struct sk_buff *skb, __u32 cookie) +{ + __u32 seq; + __u32 mssind; + + if ((jiffies - tcp_lastsynq_overflow) > TCP_TIMEOUT_INIT) + return 0; + + seq = ntohl(skb->h.th->seq)-1; + mssind = check_tcp_syn_cookie(cookie, + skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->h.th->source, skb->h.th->dest, + seq, jiffies/(HZ*60), COUNTER_TRIES); + + return mssind < NUM_MSS ? msstab[mssind]+1 : 0; +} + +extern struct or_calltable or_ipv4; + +static inline struct sock * +get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct open_request *req, + struct dst_entry *dst) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + sk = tp->af_specific->syn_recv_sock(sk, skb, req, dst); + req->sk = sk; + + /* Queue up for accept() */ + tcp_synq_queue(tp, req); + + return sk; +} + +struct sock * +cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) +{ + __u32 cookie = ntohl(skb->h.th->ack_seq)-1; + struct open_request *req; + int mss; + struct rtable *rt; + __u8 rcv_wscale; + + if (!sysctl_tcp_syncookies) + return sk; + if (!skb->h.th->ack) + return sk; + + mss = cookie_check(skb, cookie); + if (mss == 0) { + net_statistics.SyncookiesFailed++; + return sk; + } + + net_statistics.SyncookiesRecv++; + + req = tcp_openreq_alloc(); + if (req == NULL) + return NULL; + + req->rcv_isn = htonl(skb->h.th->seq)-1; + req->snt_isn = cookie; + req->mss = mss; + req->rmt_port = skb->h.th->source; + req->af.v4_req.loc_addr = skb->nh.iph->daddr; + req->af.v4_req.rmt_addr = skb->nh.iph->saddr; + req->class = &or_ipv4; /* for safety */ +#ifdef CONFIG_IP_TRANSPARENT_PROXY + req->lcl_port = skb->h.th->dest; +#endif + + req->af.v4_req.opt = NULL; + + /* We throwed the options of the initial SYN away, so we hope + * the ACK carries the same options again (see RFC1122 4.2.3.8) + */ + if (opt && opt->optlen) { + int opt_size = sizeof(struct ip_options) + opt->optlen; + + req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC); + if (req->af.v4_req.opt) { + if (ip_options_echo(req->af.v4_req.opt, skb)) { + kfree_s(req->af.v4_req.opt, opt_size); + req->af.v4_req.opt = NULL; + } + } + } + + req->snd_wscale = req->rcv_wscale = 
req->tstamp_ok = 0; + req->wscale_ok = 0; + req->expires = 0UL; + req->retrans = 0; + + /* + * We need to lookup the route here to get at the correct + * window size. We should better make sure that the window size + * hasn't changed since we received the original syn, but I see + * no easy way to do this. + */ + if (ip_route_output(&rt, + opt && + opt->srr ? opt->faddr : req->af.v4_req.rmt_addr, + req->af.v4_req.loc_addr, + sk->ip_tos | RTO_CONN, + 0)) { + tcp_openreq_free(req); + return NULL; + } + + /* Try to redo what tcp_v4_send_synack did. */ + req->window_clamp = rt->u.dst.window; + tcp_select_initial_window(sock_rspace(sk)/2,req->mss, + &req->rcv_wnd, &req->window_clamp, + 0, &rcv_wscale); + req->rcv_wscale = rcv_wscale; + + return get_cookie_sock(sk, skb, req, &rt->u.dst); +} + +#endif diff --git a/pfinet/linux-src/net/ipv4/sysctl_net_ipv4.c b/pfinet/linux-src/net/ipv4/sysctl_net_ipv4.c new file mode 100644 index 00000000..e578e4e7 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/sysctl_net_ipv4.c @@ -0,0 +1,205 @@ +/* + * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. + * + * $Id: sysctl_net_ipv4.c,v 1.38.2.1 1999/08/08 08:43:14 davem Exp $ + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <linux/config.h> +#include <net/snmp.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/tcp.h> + +/* + * TCP configuration parameters + */ + +#define TCP_PMTU_DISC 0x00000001 /* perform PMTU discovery */ +#define TCP_CONG_AVOID 0x00000002 /* congestion avoidance algorithm */ +#define TCP_DELAY_ACKS 0x00000003 /* delayed ack stategy */ + +#if 0 +static int boolean_min = 0; +static int boolean_max = 1; +#endif + +/* From icmp.c */ +extern int sysctl_icmp_echo_ignore_all; +extern int sysctl_icmp_echo_ignore_broadcasts; +extern int sysctl_icmp_ignore_bogus_error_responses; + +/* From ip_fragment.c */ +extern int sysctl_ipfrag_low_thresh; +extern int sysctl_ipfrag_high_thresh; +extern int sysctl_ipfrag_time; + +/* From ip_output.c */ +extern int sysctl_ip_dynaddr; + +/* From ip_masq.c */ +extern int sysctl_ip_masq_debug; + +extern int sysctl_tcp_timestamps; +extern int sysctl_tcp_window_scaling; +extern int sysctl_tcp_sack; +extern int sysctl_tcp_retrans_collapse; +extern int sysctl_tcp_keepalive_time; +extern int sysctl_tcp_keepalive_probes; +extern int sysctl_tcp_max_ka_probes; +extern int sysctl_tcp_retries1; +extern int sysctl_tcp_retries2; +extern int sysctl_tcp_fin_timeout; +extern int sysctl_tcp_syncookies; +extern int sysctl_tcp_syn_retries; +extern int sysctl_tcp_stdurg; +extern int sysctl_tcp_rfc1337; +extern int sysctl_tcp_syn_taildrop; +extern int sysctl_max_syn_backlog; + +/* From icmp.c */ +extern int sysctl_icmp_destunreach_time; +extern int sysctl_icmp_timeexceed_time; +extern int sysctl_icmp_paramprob_time; +extern int sysctl_icmp_echoreply_time; + +/* From igmp.c */ +extern int sysctl_igmp_max_memberships; + +int tcp_retr1_max = 255; + +struct ipv4_config ipv4_config; + +extern ctl_table ipv4_route_table[]; + +#ifdef CONFIG_SYSCTL + +static +int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, + void *buffer, size_t *lenp) +{ + int val = ipv4_devconf.forwarding; + int ret; + + ret = proc_dointvec(ctl, write, filp, buffer, lenp); + + if (write && ipv4_devconf.forwarding != val) + inet_forward_change(); + + return ret; +} + +static int ipv4_sysctl_forward_strategy(ctl_table *table, int *name, int nlen, + void *oldval, size_t 
*oldlenp, + void *newval, size_t newlen, + void **context) +{ + int new; + if (newlen != sizeof(int)) + return -EINVAL; + if (get_user(new,(int *)newval)) + return -EFAULT; + if (new != ipv4_devconf.forwarding) + inet_forward_change(); + return 0; /* caller does change again and handles handles oldval */ +} + +ctl_table ipv4_table[] = { + {NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps", + &sysctl_tcp_timestamps, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling", + &sysctl_tcp_window_scaling, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_TCP_SACK, "tcp_sack", + &sysctl_tcp_sack, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse", + &sysctl_tcp_retrans_collapse, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_FORWARD, "ip_forward", + &ipv4_devconf.forwarding, sizeof(int), 0644, NULL, + &ipv4_sysctl_forward,&ipv4_sysctl_forward_strategy}, + {NET_IPV4_DEFAULT_TTL, "ip_default_ttl", + &ip_statistics.IpDefaultTTL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_AUTOCONFIG, "ip_autoconfig", + &ipv4_config.autoconfig, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc", + &ipv4_config.no_pmtu_disc, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries", + &sysctl_tcp_syn_retries, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh", + &sysctl_ipfrag_high_thresh, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh", + &sysctl_ipfrag_low_thresh, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_DYNADDR, "ip_dynaddr", + &sysctl_ip_dynaddr, sizeof(int), 0644, NULL, &proc_dointvec}, +#ifdef CONFIG_IP_MASQUERADE + {NET_IPV4_IP_MASQ_DEBUG, "ip_masq_debug", + &sysctl_ip_masq_debug, sizeof(int), 0644, NULL, &proc_dointvec}, +#endif + {NET_IPV4_IPFRAG_TIME, "ipfrag_time", + &sysctl_ipfrag_time, sizeof(int), 0644, NULL, &proc_dointvec_jiffies, + &sysctl_jiffies}, + {NET_IPV4_TCP_MAX_KA_PROBES, "tcp_max_ka_probes", + &sysctl_tcp_max_ka_probes, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time", + &sysctl_tcp_keepalive_time, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies, &sysctl_jiffies}, + {NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes", + &sysctl_tcp_keepalive_probes, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_TCP_RETRIES1, "tcp_retries1", + &sysctl_tcp_retries1, sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, NULL, &tcp_retr1_max}, + {NET_IPV4_TCP_RETRIES2, "tcp_retries2", + &sysctl_tcp_retries2, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout", + &sysctl_tcp_fin_timeout, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies, &sysctl_jiffies}, +#ifdef CONFIG_SYN_COOKIES + {NET_TCP_SYNCOOKIES, "tcp_syncookies", + &sysctl_tcp_syncookies, sizeof(int), 0644, NULL, &proc_dointvec}, +#endif + {NET_TCP_STDURG, "tcp_stdurg", &sysctl_tcp_stdurg, + sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_RFC1337, "tcp_rfc1337", &sysctl_tcp_rfc1337, + sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog", &sysctl_max_syn_backlog, + sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range", + &sysctl_local_port_range, sizeof(sysctl_local_port_range), 0644, + NULL, &proc_dointvec}, + {NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all", + 
&sysctl_icmp_echo_ignore_all, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts", + &sysctl_icmp_echo_ignore_broadcasts, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses", + &sysctl_icmp_ignore_bogus_error_responses, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ICMP_DESTUNREACH_RATE, "icmp_destunreach_rate", + &sysctl_icmp_destunreach_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ICMP_TIMEEXCEED_RATE, "icmp_timeexceed_rate", + &sysctl_icmp_timeexceed_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ICMP_PARAMPROB_RATE, "icmp_paramprob_rate", + &sysctl_icmp_paramprob_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ICMP_ECHOREPLY_RATE, "icmp_echoreply_rate", + &sysctl_icmp_echoreply_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ROUTE, "route", NULL, 0, 0555, ipv4_route_table}, +#ifdef CONFIG_IP_MULTICAST + {NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships", + &sysctl_igmp_max_memberships, sizeof(int), 0644, NULL, &proc_dointvec}, +#endif + {0} +}; + +#endif /* CONFIG_SYSCTL */ diff --git a/pfinet/linux-src/net/ipv4/tcp.c b/pfinet/linux-src/net/ipv4/tcp.c new file mode 100644 index 00000000..65763215 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/tcp.c @@ -0,0 +1,1826 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp.c,v 1.140.2.4 1999/08/09 03:13:12 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * Corey Minyard <wf-rch!minyard@relay.EU.net> + * Florian La Roche, <flla@stud.uni-sb.de> + * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> + * Linus Torvalds, <torvalds@cs.helsinki.fi> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Matthew Dillon, <dillon@apollo.west.oic.com> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Jorge Cwik, <jorge@laser.satlink.net> + * + * Fixes: + * Alan Cox : Numerous verify_area() calls + * Alan Cox : Set the ACK bit on a reset + * Alan Cox : Stopped it crashing if it closed while + * sk->inuse=1 and was trying to connect + * (tcp_err()). + * Alan Cox : All icmp error handling was broken + * pointers passed where wrong and the + * socket was looked up backwards. Nobody + * tested any icmp error code obviously. + * Alan Cox : tcp_err() now handled properly. It + * wakes people on errors. poll + * behaves and the icmp error race + * has gone by moving it into sock.c + * Alan Cox : tcp_send_reset() fixed to work for + * everything not just packets for + * unknown sockets. + * Alan Cox : tcp option processing. + * Alan Cox : Reset tweaked (still not 100%) [Had + * syn rule wrong] + * Herp Rosmanith : More reset fixes + * Alan Cox : No longer acks invalid rst frames. + * Acking any kind of RST is right out. + * Alan Cox : Sets an ignore me flag on an rst + * receive otherwise odd bits of prattle + * escape still + * Alan Cox : Fixed another acking RST frame bug. + * Should stop LAN workplace lockups. 
+ * Alan Cox : Some tidyups using the new skb list + * facilities + * Alan Cox : sk->keepopen now seems to work + * Alan Cox : Pulls options out correctly on accepts + * Alan Cox : Fixed assorted sk->rqueue->next errors + * Alan Cox : PSH doesn't end a TCP read. Switched a + * bit to skb ops. + * Alan Cox : Tidied tcp_data to avoid a potential + * nasty. + * Alan Cox : Added some better commenting, as the + * tcp is hard to follow + * Alan Cox : Removed incorrect check for 20 * psh + * Michael O'Reilly : ack < copied bug fix. + * Johannes Stille : Misc tcp fixes (not all in yet). + * Alan Cox : FIN with no memory -> CRASH + * Alan Cox : Added socket option proto entries. + * Also added awareness of them to accept. + * Alan Cox : Added TCP options (SOL_TCP) + * Alan Cox : Switched wakeup calls to callbacks, + * so the kernel can layer network + * sockets. + * Alan Cox : Use ip_tos/ip_ttl settings. + * Alan Cox : Handle FIN (more) properly (we hope). + * Alan Cox : RST frames sent on unsynchronised + * state ack error. + * Alan Cox : Put in missing check for SYN bit. + * Alan Cox : Added tcp_select_window() aka NET2E + * window non shrink trick. + * Alan Cox : Added a couple of small NET2E timer + * fixes + * Charles Hedrick : TCP fixes + * Toomas Tamm : TCP window fixes + * Alan Cox : Small URG fix to rlogin ^C ack fight + * Charles Hedrick : Rewrote most of it to actually work + * Linus : Rewrote tcp_read() and URG handling + * completely + * Gerhard Koerting: Fixed some missing timer handling + * Matthew Dillon : Reworked TCP machine states as per RFC + * Gerhard Koerting: PC/TCP workarounds + * Adam Caldwell : Assorted timer/timing errors + * Matthew Dillon : Fixed another RST bug + * Alan Cox : Move to kernel side addressing changes. + * Alan Cox : Beginning work on TCP fastpathing + * (not yet usable) + * Arnt Gulbrandsen: Turbocharged tcp_check() routine. + * Alan Cox : TCP fast path debugging + * Alan Cox : Window clamping + * Michael Riepe : Bug in tcp_check() + * Matt Dillon : More TCP improvements and RST bug fixes + * Matt Dillon : Yet more small nasties remove from the + * TCP code (Be very nice to this man if + * tcp finally works 100%) 8) + * Alan Cox : BSD accept semantics. + * Alan Cox : Reset on closedown bug. + * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). + * Michael Pall : Handle poll() after URG properly in + * all cases. + * Michael Pall : Undo the last fix in tcp_read_urg() + * (multi URG PUSH broke rlogin). + * Michael Pall : Fix the multi URG PUSH problem in + * tcp_readable(), poll() after URG + * works now. + * Michael Pall : recv(...,MSG_OOB) never blocks in the + * BSD api. + * Alan Cox : Changed the semantics of sk->socket to + * fix a race and a signal problem with + * accept() and async I/O. + * Alan Cox : Relaxed the rules on tcp_sendto(). + * Yury Shevchuk : Really fixed accept() blocking problem. + * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for + * clients/servers which listen in on + * fixed ports. + * Alan Cox : Cleaned the above up and shrank it to + * a sensible code size. + * Alan Cox : Self connect lockup fix. + * Alan Cox : No connect to multicast. + * Ross Biro : Close unaccepted children on master + * socket close. + * Alan Cox : Reset tracing code. + * Alan Cox : Spurious resets on shutdown. + * Alan Cox : Giant 15 minute/60 second timer error + * Alan Cox : Small whoops in polling before an + * accept. + * Alan Cox : Kept the state trace facility since + * it's handy for debugging. 
+ * Alan Cox : More reset handler fixes. + * Alan Cox : Started rewriting the code based on + * the RFC's for other useful protocol + * references see: Comer, KA9Q NOS, and + * for a reference on the difference + * between specifications and how BSD + * works see the 4.4lite source. + * A.N.Kuznetsov : Don't time wait on completion of tidy + * close. + * Linus Torvalds : Fin/Shutdown & copied_seq changes. + * Linus Torvalds : Fixed BSD port reuse to work first syn + * Alan Cox : Reimplemented timers as per the RFC + * and using multiple timers for sanity. + * Alan Cox : Small bug fixes, and a lot of new + * comments. + * Alan Cox : Fixed dual reader crash by locking + * the buffers (much like datagram.c) + * Alan Cox : Fixed stuck sockets in probe. A probe + * now gets fed up of retrying without + * (even a no space) answer. + * Alan Cox : Extracted closing code better + * Alan Cox : Fixed the closing state machine to + * resemble the RFC. + * Alan Cox : More 'per spec' fixes. + * Jorge Cwik : Even faster checksumming. + * Alan Cox : tcp_data() doesn't ack illegal PSH + * only frames. At least one pc tcp stack + * generates them. + * Alan Cox : Cache last socket. + * Alan Cox : Per route irtt. + * Matt Day : poll()->select() match BSD precisely on error + * Alan Cox : New buffers + * Marc Tamsky : Various sk->prot->retransmits and + * sk->retransmits misupdating fixed. + * Fixed tcp_write_timeout: stuck close, + * and TCP syn retries gets used now. + * Mark Yarvis : In tcp_read_wakeup(), don't send an + * ack if state is TCP_CLOSED. + * Alan Cox : Look up device on a retransmit - routes may + * change. Doesn't yet cope with MSS shrink right + * but its a start! + * Marc Tamsky : Closing in closing fixes. + * Mike Shaver : RFC1122 verifications. + * Alan Cox : rcv_saddr errors. + * Alan Cox : Block double connect(). + * Alan Cox : Small hooks for enSKIP. + * Alexey Kuznetsov: Path MTU discovery. + * Alan Cox : Support soft errors. + * Alan Cox : Fix MTU discovery pathological case + * when the remote claims no mtu! + * Marc Tamsky : TCP_CLOSE fix. + * Colin (G3TNE) : Send a reset on syn ack replies in + * window but wrong (fixes NT lpd problems) + * Pedro Roque : Better TCP window handling, delayed ack. + * Joerg Reuter : No modification of locked buffers in + * tcp_do_retransmit() + * Eric Schenk : Changed receiver side silly window + * avoidance algorithm to BSD style + * algorithm. This doubles throughput + * against machines running Solaris, + * and seems to result in general + * improvement. + * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD + * Willy Konynenberg : Transparent proxying support. + * Mike McLagan : Routing by source + * Keith Owens : Do proper merging with partial SKB's in + * tcp_do_sendmsg to avoid burstiness. + * Eric Schenk : Fix fast close down bug with + * shutdown() followed by close(). + * Andi Kleen : Make poll agree with SIGIO + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + * + * Description of States: + * + * TCP_SYN_SENT sent a connection request, waiting for ack + * + * TCP_SYN_RECV received a connection request, sent ack, + * waiting for final ack in three-way handshake. 
+ * + * TCP_ESTABLISHED connection established + * + * TCP_FIN_WAIT1 our side has shutdown, waiting to complete + * transmission of remaining buffered data + * + * TCP_FIN_WAIT2 all buffered data sent, waiting for remote + * to shutdown + * + * TCP_CLOSING both sides have shutdown but we still have + * data we have to finish sending + * + * TCP_TIME_WAIT timeout to catch resent junk before entering + * closed, can only be entered from FIN_WAIT2 + * or CLOSING. Required because the other end + * may not have gotten our last ACK causing it + * to retransmit the data packet (which we ignore) + * + * TCP_CLOSE_WAIT remote side has shutdown and is waiting for + * us to finish writing our data and to shutdown + * (we have to close() to move on to LAST_ACK) + * + * TCP_LAST_ACK out side has shutdown after remote has + * shutdown. There may still be data in our + * buffer that we have to finish sending + * + * TCP_CLOSE socket is finished + */ + +/* + * RFC1122 status: + * NOTE: I'm not going to be doing comments in the code for this one except + * for violations and the like. tcp.c is just too big... If I say something + * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out + * with Alan. -- MS 950903 + * [Note: Most of the TCP code has been rewriten/redesigned since this + * RFC1122 check. It is probably not correct anymore. It should be redone + * before 2.2. -AK] + * + * Use of PSH (4.2.2.2) + * MAY aggregate data sent without the PSH flag. (does) + * MAY queue data received without the PSH flag. (does) + * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) + * MAY implement PSH on send calls. (doesn't, thus:) + * MUST NOT buffer data indefinitely (doesn't [1 second]) + * MUST set PSH on last segment (does) + * MAY pass received PSH to application layer (doesn't) + * SHOULD send maximum-sized segment whenever possible. (almost always does) + * + * Window Size (4.2.2.3, 4.2.2.16) + * MUST treat window size as an unsigned number (does) + * SHOULD treat window size as a 32-bit number (does not) + * MUST NOT shrink window once it is offered (does not normally) + * + * Urgent Pointer (4.2.2.4) + * **MUST point urgent pointer to last byte of urgent data (not right + * after). (doesn't, to be like BSD. That's configurable, but defaults + * to off) + * MUST inform application layer asynchronously of incoming urgent + * data. (does) + * MUST provide application with means of determining the amount of + * urgent data pending. (does) + * **MUST support urgent data sequence of arbitrary length. (doesn't, but + * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) + * [Follows BSD 1 byte of urgent data] + * + * TCP Options (4.2.2.5) + * MUST be able to receive TCP options in any segment. (does) + * MUST ignore unsupported options (does) + * + * Maximum Segment Size Option (4.2.2.6) + * MUST implement both sending and receiving MSS. (does, but currently + * only uses the smaller of both of them) + * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send + * it always). (does, even when MSS == 536, which is legal) + * MUST assume MSS == 536 if no MSS received at connection setup (does) + * MUST calculate "effective send MSS" correctly: + * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) + * (does - but allows operator override) + * + * TCP Checksum (4.2.2.7) + * MUST generate and check TCP checksum. (does) + * + * Initial Sequence Number Selection (4.2.2.8) + * MUST use the RFC 793 clock selection mechanism. 
(doesn't, but it's + * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is + * necessary for 10Mbps networks - and harder than BSD to spoof! + * With syncookies we don't) + * + * Simultaneous Open Attempts (4.2.2.10) + * MUST support simultaneous open attempts (does) + * + * Recovery from Old Duplicate SYN (4.2.2.11) + * MUST keep track of active vs. passive open (does) + * + * RST segment (4.2.2.12) + * SHOULD allow an RST segment to contain data (does, but doesn't do + * anything with it, which is standard) + * + * Closing a Connection (4.2.2.13) + * MUST inform application of whether connection was closed by RST or + * normal close. (does) + * MAY allow "half-duplex" close (treat connection as closed for the + * local app, even before handshake is done). (does) + * MUST linger in TIME_WAIT for 2 * MSL (does) + * + * Retransmission Timeout (4.2.2.15) + * MUST implement Jacobson's slow start and congestion avoidance + * stuff. (does) + * + * Probing Zero Windows (4.2.2.17) + * MUST support probing of zero windows. (does) + * MAY keep offered window closed indefinitely. (does) + * MUST allow remote window to stay closed indefinitely. (does) + * + * Passive Open Calls (4.2.2.18) + * MUST NOT let new passive open affect other connections. (doesn't) + * MUST support passive opens (LISTENs) concurrently. (does) + * + * Time to Live (4.2.2.19) + * MUST make TCP TTL configurable. (does - IP_TTL option) + * + * Event Processing (4.2.2.20) + * SHOULD queue out-of-order segments. (does) + * MUST aggregate ACK segments whenever possible. (does but badly) + * + * Retransmission Timeout Calculation (4.2.3.1) + * MUST implement Karn's algorithm and Jacobson's algorithm for RTO + * calculation. (does, or at least explains them in the comments 8*b) + * SHOULD initialize RTO to 0 and RTT to 3. (does) + * + * When to Send an ACK Segment (4.2.3.2) + * SHOULD implement delayed ACK. (does) + * MUST keep ACK delay < 0.5 sec. (does) + * + * When to Send a Window Update (4.2.3.3) + * MUST implement receiver-side SWS. (does) + * + * When to Send Data (4.2.3.4) + * MUST implement sender-side SWS. (does) + * SHOULD implement Nagle algorithm. (does) + * + * TCP Connection Failures (4.2.3.5) + * MUST handle excessive retransmissions "properly" (see the RFC). (does) + * SHOULD inform application layer of soft errors. (does) + * + * TCP Keep-Alives (4.2.3.6) + * MAY provide keep-alives. (does) + * MUST make keep-alives configurable on a per-connection basis. (does) + * MUST default to no keep-alives. (does) + * MUST make keep-alive interval configurable. (does) + * MUST make default keep-alive interval > 2 hours. (does) + * MUST NOT interpret failure to ACK keep-alive packet as dead + * connection. (doesn't) + * SHOULD send keep-alive with no data. (does) + * + * TCP Multihoming (4.2.3.7) + * MUST get source address from IP layer before sending first + * SYN. (does) + * MUST use same local address for all segments of a connection. (does) + * + * IP Options (4.2.3.8) + * MUST ignore unsupported IP options. (does) + * MAY support Time Stamp and Record Route. (does) + * MUST allow application to specify a source route. (does) + * MUST allow received Source Route option to set route for all future + * segments on this connection. (does not (security issues)) + * + * ICMP messages (4.2.3.9) + * MUST act on ICMP errors. (does) + * MUST slow transmission upon receipt of a Source Quench. 
(doesn't anymore + * because that is deprecated now by the IETF, can be turned on) + * MUST NOT abort connection upon receipt of soft Destination + * Unreachables (0, 1, 5), Time Exceededs and Parameter + * Problems. (doesn't) + * SHOULD report soft Destination Unreachables etc. to the + * application. (does, except during SYN_RECV and may drop messages + * in some rare cases before accept() - ICMP is unreliable) + * SHOULD abort connection upon receipt of hard Destination Unreachable + * messages (2, 3, 4). (does, but see above) + * + * Remote Address Validation (4.2.3.10) + * MUST reject as an error OPEN for invalid remote IP address. (does) + * MUST ignore SYN with invalid source address. (does) + * MUST silently discard incoming SYN for broadcast/multicast + * address. (does) + * + * Asynchronous Reports (4.2.4.1) + * MUST provide mechanism for reporting soft errors to application + * layer. (does) + * + * Type of Service (4.2.4.2) + * MUST allow application layer to set Type of Service. (does IP_TOS) + * + * (Whew. -- MS 950903) + * (Updated by AK, but not complete yet.) + **/ + +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/poll.h> +#include <linux/init.h> + +#include <net/icmp.h> +#include <net/tcp.h> + +#include <asm/uaccess.h> + +int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; + +struct tcp_mib tcp_statistics; + +kmem_cache_t *tcp_openreq_cachep; +kmem_cache_t *tcp_bucket_cachep; +kmem_cache_t *tcp_timewait_cachep; + +/* + * Find someone to 'accept'. Must be called with + * the socket locked or with interrupts disabled + */ + +static struct open_request *tcp_find_established(struct tcp_opt *tp, + struct open_request **prevp) +{ + struct open_request *req = tp->syn_wait_queue; + struct open_request *prev = (struct open_request *)&tp->syn_wait_queue; + while(req) { + if (req->sk && + ((1 << req->sk->state) & + ~(TCPF_SYN_SENT|TCPF_SYN_RECV))) + break; + prev = req; + req = req->dl_next; + } + *prevp = prev; + return req; +} + +/* + * Walk down the receive queue counting readable data. + * + * Must be called with the socket lock held. + */ + +static int tcp_readable(struct sock *sk) +{ + unsigned long counted; + unsigned long amount; + struct sk_buff *skb; + int sum; + + SOCK_DEBUG(sk, "tcp_readable: %p - ",sk); + + skb = skb_peek(&sk->receive_queue); + if (skb == NULL) { + SOCK_DEBUG(sk, "empty\n"); + return(0); + } + + counted = sk->tp_pinfo.af_tcp.copied_seq; /* Where we are at the moment */ + amount = 0; + + /* Do until a push or until we are out of data. */ + do { + /* Found a hole so stops here. */ + if (before(counted, TCP_SKB_CB(skb)->seq)) /* should not happen */ + break; + + /* Length - header but start from where we are up to + * avoid overlaps. + */ + sum = skb->len - (counted - TCP_SKB_CB(skb)->seq); + if (sum >= 0) { + /* Add it up, move on. */ + amount += sum; + counted += sum; + if (skb->h.th->syn) + counted++; + } + + /* Don't count urg data ... but do it in the right place! + * Consider: "old_data (ptr is here) URG PUSH data" + * The old code would stop at the first push because + * it counted the urg (amount==1) and then does amount-- + * *after* the loop. This means tcp_readable() always + * returned zero if any URG PUSH was in the queue, even + * though there was normal data available. If we subtract + * the urg data right here, we even get it to work for more + * than one URG PUSH skb without normal data. + * This means that poll() finally works now with urg data + * in the queue. 
Note that rlogin was never affected + * because it doesn't use poll(); it uses two processes + * and a blocking read(). And the queue scan in tcp_read() + * was correct. Mike <pall@rz.uni-karlsruhe.de> + */ + + /* Don't count urg data. */ + if (skb->h.th->urg) + amount--; +#if 0 + if (amount && skb->h.th->psh) break; +#endif + skb = skb->next; + } while(skb != (struct sk_buff *)&sk->receive_queue); + + SOCK_DEBUG(sk, "got %lu bytes.\n",amount); + return(amount); +} + +/* + * LISTEN is a special case for poll.. + */ +static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) +{ + struct open_request *req, *dummy; + + lock_sock(sk); + req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy); + release_sock(sk); + if (req) + return POLLIN | POLLRDNORM; + return 0; +} + +/* + * Compute minimal free write space needed to queue new packets. + */ +#define tcp_min_write_space(__sk) \ + (atomic_read(&(__sk)->wmem_alloc) / 2) + +/* + * Wait for a TCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. + */ +unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) +{ + unsigned int mask; + struct sock *sk = sock->sk; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + poll_wait(file, sk->sleep, wait); + if (sk->state == TCP_LISTEN) + return tcp_listen_poll(sk, wait); + + mask = 0; + if (sk->err) + mask = POLLERR; + + /* + * POLLHUP is certainly not done right. But poll() doesn't + * have a notion of HUP in just one direction, and for a + * socket the read side is more interesting. + * + * Some poll() documentation says that POLLHUP is incompatible + * with the POLLOUT/POLLWR flags, so somebody should check this + * all. But careful, it tends to be safer to return too many + * bits than too few, and you can easily break real applications + * if you don't tell them that something has hung up! + * + * Check-me. + */ + if (sk->shutdown & RCV_SHUTDOWN) + mask |= POLLHUP; + + /* Connected? */ + if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) { + if ((tp->rcv_nxt != tp->copied_seq) && + (tp->urg_seq != tp->copied_seq || + tp->rcv_nxt != tp->copied_seq+1 || + sk->urginline || !tp->urg_data)) + mask |= POLLIN | POLLRDNORM; + + if (!(sk->shutdown & SEND_SHUTDOWN)) { + if (sock_wspace(sk) >= tcp_min_write_space(sk)) { + mask |= POLLOUT | POLLWRNORM; + } else { /* send SIGIO later */ + sk->socket->flags |= SO_NOSPACE; + } + } + + if (tp->urg_data & URG_VALID) + mask |= POLLPRI; + } + return mask; +} + +/* + * Socket write_space callback. + * This (or rather the sock_wake_async) should agree with poll. 
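+ * Both sides use tcp_min_write_space(): the async wakeup (SIGIO) is only
+ * issued once sock_wspace() has reached half of the memory currently
+ * charged to the send queue, which is also the point at which tcp_poll()
+ * starts reporting POLLOUT | POLLWRNORM.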
+ */ +void tcp_write_space(struct sock *sk) +{ + if (sk->dead) + return; + + wake_up_interruptible(sk->sleep); + if (sock_wspace(sk) >= + tcp_min_write_space(sk)) + sock_wake_async(sk->socket, 2); +} + + +int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + int answ; + + switch(cmd) { + case TIOCINQ: +#ifdef FIXME /* FIXME: */ + case FIONREAD: +#endif + if (sk->state == TCP_LISTEN) + return(-EINVAL); + lock_sock(sk); + answ = tcp_readable(sk); + release_sock(sk); + break; + case SIOCATMARK: + { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + answ = tp->urg_data && tp->urg_seq == tp->copied_seq; + break; + } + case TIOCOUTQ: + if (sk->state == TCP_LISTEN) + return(-EINVAL); + answ = sock_wspace(sk); + break; + default: + return(-ENOIOCTLCMD); + }; + + return put_user(answ, (int *)arg); +} + +/* + * Wait for a socket to get into the connected state + * + * Note: must be called with the socket locked. + */ +static int wait_for_tcp_connect(struct sock * sk, int flags) +{ + struct task_struct *tsk = current; + struct wait_queue wait = { tsk, NULL }; + + while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { + if(sk->err) + return sock_error(sk); + if((1 << sk->state) & + ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { + if(sk->keepopen && !(flags&MSG_NOSIGNAL)) + send_sig(SIGPIPE, tsk, 0); + return -EPIPE; + } + if(flags & MSG_DONTWAIT) + return -EAGAIN; + if(signal_pending(tsk)) + return -ERESTARTSYS; + + tsk->state = TASK_INTERRUPTIBLE; + add_wait_queue(sk->sleep, &wait); + release_sock(sk); + + if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) && + sk->err == 0) + schedule(); + + tsk->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + lock_sock(sk); + } + return 0; +} + +static inline int tcp_memory_free(struct sock *sk) +{ + return atomic_read(&sk->wmem_alloc) < sk->sndbuf; +} + +/* + * Wait for more memory for a socket + */ +static void wait_for_tcp_memory(struct sock * sk) +{ + release_sock(sk); + if (!tcp_memory_free(sk)) { + struct wait_queue wait = { current, NULL }; + + sk->socket->flags &= ~SO_NOSPACE; + add_wait_queue(sk->sleep, &wait); + for (;;) { + if (signal_pending(current)) + break; + current->state = TASK_INTERRUPTIBLE; + if (tcp_memory_free(sk)) + break; + if (sk->shutdown & SEND_SHUTDOWN) + break; + if (sk->err) + break; + schedule(); + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + } + lock_sock(sk); +} + +/* + * Wait for a buffer. + */ +static int wait_for_buffer(struct sock *sk) +{ + struct wait_queue wait = { current, NULL }; + + release_sock(sk); + add_wait_queue(sk->sleep, &wait); + current->state = TASK_INTERRUPTIBLE; + schedule(); + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + lock_sock(sk); + return 0; +} + +/* When all user supplied data has been queued set the PSH bit */ +#define PSH_NEEDED (seglen == 0 && iovlen == 0) + +/* + * This routine copies from a user buffer into a socket, + * and starts the transmit system. + * + * Note: must be called with the socket locked. + */ + +int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) +{ + struct iovec *iov; + struct tcp_opt *tp; + struct sk_buff *skb; + int iovlen, flags; + int mss_now; + int err, copied; + + lock_sock(sk); + + err = 0; + tp = &(sk->tp_pinfo.af_tcp); + + /* Wait for a connection to finish. 
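+	   The wait_for_tcp_connect() call below sleeps until the socket is
+	   ESTABLISHED or CLOSE_WAIT; MSG_DONTWAIT turns that into -EAGAIN,
+	   and a connection that dies first returns -EPIPE (with SIGPIPE if
+	   sk->keepopen is set and MSG_NOSIGNAL is not).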
*/ + flags = msg->msg_flags; + if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + if((err = wait_for_tcp_connect(sk, flags)) != 0) + goto out; + + /* This should be in poll */ + sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */ + + mss_now = tcp_current_mss(sk); + + /* Ok commence sending. */ + iovlen = msg->msg_iovlen; + iov = msg->msg_iov; + copied = 0; + + while(--iovlen >= 0) { + int seglen=iov->iov_len; + unsigned char * from=iov->iov_base; + + iov++; + + while(seglen > 0) { + int copy, tmp, queue_it, psh; + + if (err) + goto do_fault2; + + /* Stop on errors. */ + if (sk->err) + goto do_sock_err; + + /* Make sure that we are established. */ + if (sk->shutdown & SEND_SHUTDOWN) + goto do_shutdown; + + /* Now we need to check if we have a half + * built packet we can tack some data onto. + */ + if (tp->send_head && !(flags & MSG_OOB)) { + skb = sk->write_queue.prev; + copy = skb->len; + /* If the remote does SWS avoidance we should + * queue the best we can if not we should in + * fact send multiple packets... + * A method for detecting this would be most + * welcome. + */ + if (skb_tailroom(skb) > 0 && + (mss_now - copy) > 0 && + tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) { + int last_byte_was_odd = (copy % 4); + + /* + * Check for parallel writers sleeping in user access. + */ + if (tp->partial_writers++ > 0) { + wait_for_buffer(sk); + tp->partial_writers--; + continue; + } + + copy = mss_now - copy; + if(copy > skb_tailroom(skb)) + copy = skb_tailroom(skb); + if(copy > seglen) + copy = seglen; + + if(last_byte_was_odd) { + if(copy_from_user(skb_put(skb, copy), + from, copy)) + err = -EFAULT; + skb->csum = csum_partial(skb->data, + skb->len, 0); + } else { + skb->csum = + csum_and_copy_from_user( + from, skb_put(skb, copy), + copy, skb->csum, &err); + } + + /* + * FIXME: the *_user functions should + * return how much data was + * copied before the fault + * occurred and then a partial + * packet with this data should + * be sent. Unfortunately + * csum_and_copy_from_user doesn't + * return this information. + * ATM it might send partly zeroed + * data in this case. + */ + tp->write_seq += copy; + TCP_SKB_CB(skb)->end_seq += copy; + from += copy; + copied += copy; + seglen -= copy; + if (PSH_NEEDED) + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + + if (--tp->partial_writers > 0) + wake_up_interruptible(sk->sleep); + + continue; + } + } + + /* We also need to worry about the window. If + * window < 1/2 the maximum window we've seen + * from this host, don't use it. This is + * sender side silly window prevention, as + * specified in RFC1122. (Note that this is + * different than earlier versions of SWS + * prevention, e.g. RFC813.). What we + * actually do is use the whole MSS. Since + * the results in the right edge of the packet + * being outside the window, it will be queued + * for later rather than sent. + */ + psh = 0; + copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); + if(copy > (tp->max_window >> 1)) { + copy = min(copy, mss_now); + psh = 1; + } else { + copy = mss_now; + } + if(copy > seglen) + copy = seglen; + + /* Determine how large of a buffer to allocate. */ + tmp = MAX_HEADER + sk->prot->max_header; + if (copy < min(mss_now, tp->max_window >> 1) && + !(flags & MSG_OOB)) { + tmp += min(mss_now, tp->max_window); + + /* What is happening here is that we want to + * tack on later members of the users iovec + * if possible into a single frame. When we + * leave this loop our caller checks to see if + * we can send queued frames onto the wire. 
+ * See tcp_v[46]_sendmsg() for this. + */ + queue_it = 1; + } else { + tmp += copy; + queue_it = 0; + } + skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL); + + /* If we didn't get any memory, we need to sleep. */ + if (skb == NULL) { + sk->socket->flags |= SO_NOSPACE; + if (flags&MSG_DONTWAIT) { + err = -EAGAIN; + goto do_interrupted; + } + if (signal_pending(current)) { + err = -ERESTARTSYS; + goto do_interrupted; + } + tcp_push_pending_frames(sk, tp); + wait_for_tcp_memory(sk); + + /* If SACK's were formed or PMTU events happened, + * we must find out about it. + */ + mss_now = tcp_current_mss(sk); + continue; + } + + seglen -= copy; + + /* Prepare control bits for TCP header creation engine. */ + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | + ((PSH_NEEDED || psh) ? + TCPCB_FLAG_PSH : 0)); + TCP_SKB_CB(skb)->sacked = 0; + if (flags & MSG_OOB) { + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG; + TCP_SKB_CB(skb)->urg_ptr = copy; + } else + TCP_SKB_CB(skb)->urg_ptr = 0; + + /* TCP data bytes are SKB_PUT() on top, later + * TCP+IP+DEV headers are SKB_PUSH()'d beneath. + * Reserve header space and checksum the data. + */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb->csum = csum_and_copy_from_user(from, + skb_put(skb, copy), copy, 0, &err); + + if (err) + goto do_fault; + + from += copy; + copied += copy; + + TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy; + + /* This advances tp->write_seq for us. */ + tcp_send_skb(sk, skb, queue_it); + } + } + sk->err = 0; + err = copied; + goto out; + +do_sock_err: + if(copied) + err = copied; + else + err = sock_error(sk); + goto out; +do_shutdown: + if(copied) + err = copied; + else { + if (!(flags&MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + err = -EPIPE; + } + goto out; +do_interrupted: + if(copied) + err = copied; + goto out; +do_fault: + kfree_skb(skb); +do_fault2: + err = -EFAULT; +out: + tcp_push_pending_frames(sk, tp); + release_sock(sk); + return err; +} + +#undef PSH_NEEDED + +/* + * Send an ack if one is backlogged at this point. Ought to merge + * this with tcp_send_ack(). + * This is called for delayed acks also. + */ + +void tcp_read_wakeup(struct sock *sk) +{ + /* If we're closed, don't send an ack, or we'll get a RST + * from the closed destination. + */ + if (sk->state != TCP_CLOSE) + tcp_send_ack(sk); +} + +/* + * Handle reading urgent data. BSD has very simple semantics for + * this, no blocking and very strange errors 8) + */ + +static int tcp_recv_urg(struct sock * sk, int nonblock, + struct msghdr *msg, int len, int flags, + int *addr_len) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* No URG data to read. */ + if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ) + return -EINVAL; /* Yes this is right ! */ + + if (sk->err) + return sock_error(sk); + + if (sk->done) + return -ENOTCONN; + + if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) { + sk->done = 1; + return 0; + } + + lock_sock(sk); + if (tp->urg_data & URG_VALID) { + int err = 0; + char c = tp->urg_data; + + if (!(flags & MSG_PEEK)) + tp->urg_data = URG_READ; + + if(msg->msg_name) + tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) + msg->msg_name); + + if(addr_len) + *addr_len = tp->af_specific->sockaddr_len; + + /* Read urgent data. */ + msg->msg_flags|=MSG_OOB; + release_sock(sk); + + if(len>0) + { + err = memcpy_toiovec(msg->msg_iov, &c, 1); + /* N.B. already set above ... */ + msg->msg_flags|=MSG_OOB; + } + else + msg->msg_flags|=MSG_TRUNC; + + /* N.B. Is this right?? 
If len == 0 we didn't read any data */ + return err ? -EFAULT : 1; + } + release_sock(sk); + + /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and + * the available implementations agree in this case: + * this call should never block, independent of the + * blocking state of the socket. + * Mike <pall@rz.uni-karlsruhe.de> + */ + return -EAGAIN; +} + +/* + * Release a skb if it is no longer needed. This routine + * must be called with interrupts disabled or with the + * socket locked so that the sk_buff queue operation is ok. + */ + +static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb) +{ + __skb_unlink(skb, &sk->receive_queue); + kfree_skb(skb); +} + +/* Clean up the receive buffer for full frames taken by the user, + * then send an ACK if necessary. COPIED is the number of bytes + * tcp_recvmsg has given to the user so far, it speeds up the + * calculation of whether or not we must ACK for the sake of + * a window update. + */ +static void cleanup_rbuf(struct sock *sk, int copied) +{ + struct sk_buff *skb; + + /* NOTE! The socket must be locked, so that we don't get + * a messed-up receive queue. + */ + while ((skb=skb_peek(&sk->receive_queue)) != NULL) { + if (!skb->used || atomic_read(&skb->users) > 1) + break; + tcp_eat_skb(sk, skb); + } + + /* We send an ACK if we can now advertise a non-zero window + * which has been raised "significantly". + */ + if(copied > 0) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + __u32 rcv_window_now = tcp_receive_window(tp); + __u32 new_window = __tcp_select_window(sk); + + /* We won't be raising the window any further than + * the window-clamp allows. Our window selection + * also keeps things a nice multiple of MSS. These + * checks are necessary to prevent spurious ACKs + * which don't advertize a larger window. + */ + if((new_window && (new_window >= rcv_window_now * 2)) && + ((rcv_window_now + tp->mss_cache) <= tp->window_clamp)) + tcp_read_wakeup(sk); + } +} + + +/* + * This routine copies from a sock struct into the user buffer. + */ + +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, + int len, int nonblock, int flags, int *addr_len) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct wait_queue wait = { current, NULL }; + int copied = 0; + u32 peek_seq; + volatile u32 *seq; /* So gcc doesn't overoptimise */ + unsigned long used; + int err = 0; + int target = 1; /* Read at least this many bytes */ + + if (sk->err) + return sock_error(sk); + + if (sk->state == TCP_LISTEN) + return -ENOTCONN; + + /* Urgent data needs to be handled specially. */ + if (flags & MSG_OOB) + return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len); + + /* Copying sequence to update. This is volatile to handle + * the multi-reader case neatly (memcpy_to/fromfs might be + * inline and thus not flush cached variables otherwise). + */ + peek_seq = tp->copied_seq; + seq = &tp->copied_seq; + if (flags & MSG_PEEK) + seq = &peek_seq; + + /* Handle the POSIX bogosity MSG_WAITALL. */ + if (flags & MSG_WAITALL) + target=len; + + add_wait_queue(sk->sleep, &wait); + lock_sock(sk); + + /* + * BUG BUG BUG + * This violates 1003.1g compliance. We must wait for + * data to exist even if we read none! + */ + + while (len > 0) { + struct sk_buff * skb; + u32 offset; + + /* Are we at urgent data? Stop if we have read anything. */ + if (copied && tp->urg_data && tp->urg_seq == *seq) + break; + + /* We need to check signals first, to get correct SIGURG + * handling. 
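cleanup_rbuf() above only sends a window-update ACK when the window it could now advertise has at least doubled and the currently advertised window plus one MSS still fits under the clamp; otherwise it stays quiet and lets a later segment carry the update. A small self-contained restatement of that test, written as plain userland C rather than kernel code:

#include <stdio.h>
#include <stdint.h>

/* Illustrative only: the "send a window-update ACK?" test from
 * cleanup_rbuf(), pulled out as a pure predicate. rcv_window_now is the
 * window currently advertised, new_window what we could advertise after
 * the reader drained data; mss and window_clamp are as in tcp_opt.
 */
static int should_ack_window_update(uint32_t rcv_window_now,
                                    uint32_t new_window,
                                    uint32_t mss, uint32_t window_clamp)
{
	/* Only ACK when the advertised window would at least double and we
	 * are not already pressed up against the clamp.
	 */
	return new_window != 0 &&
	       new_window >= rcv_window_now * 2 &&
	       rcv_window_now + mss <= window_clamp;
}

int main(void)
{
	printf("%d\n", should_ack_window_update(2000, 8000, 1460, 65535));	/* 1: worth an ACK */
	printf("%d\n", should_ack_window_update(6000, 8000, 1460, 65535));	/* 0: not doubled yet */
	return 0;
}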
FIXME: Need to check this doesnt impact 1003.1g + * and move it down to the bottom of the loop + */ + if (signal_pending(current)) { + if (copied) + break; + copied = -ERESTARTSYS; + if (nonblock) + copied = -EAGAIN; + break; + } + + /* Next get a buffer. */ + current->state = TASK_INTERRUPTIBLE; + + skb = skb_peek(&sk->receive_queue); + do { + if (!skb) + break; + + /* Now that we have two receive queues this + * shouldn't happen. + */ + if (before(*seq, TCP_SKB_CB(skb)->seq)) { + printk(KERN_INFO "recvmsg bug: copied %X seq %X\n", + *seq, TCP_SKB_CB(skb)->seq); + break; + } + offset = *seq - TCP_SKB_CB(skb)->seq; + if (skb->h.th->syn) + offset--; + if (offset < skb->len) + goto found_ok_skb; + if (skb->h.th->fin) + goto found_fin_ok; + if (!(flags & MSG_PEEK)) + skb->used = 1; + skb = skb->next; + } while (skb != (struct sk_buff *)&sk->receive_queue); + + if (copied >= target) + break; + + /* + These three lines and clause if (sk->state == TCP_CLOSE) + are unlikely to be correct, if target > 1. + I DO NOT FIX IT, because I have no idea, what + POSIX prescribes to make here. Probably, it really + wants to lose data 8), if not all target is received. + --ANK + */ + if (sk->err && !(flags&MSG_PEEK)) { + copied = sock_error(sk); + break; + } + + if (sk->shutdown & RCV_SHUTDOWN) { + sk->done = 1; + break; + } + + if (sk->state == TCP_CLOSE) { + if (!sk->done) { + sk->done = 1; + break; + } + copied = -ENOTCONN; + break; + } + + if (nonblock) { + copied = -EAGAIN; + break; + } + + cleanup_rbuf(sk, copied); + release_sock(sk); + sk->socket->flags |= SO_WAITDATA; + schedule(); + sk->socket->flags &= ~SO_WAITDATA; + lock_sock(sk); + continue; + + found_ok_skb: + /* Lock the buffer. We can be fairly relaxed as + * an interrupt will never steal a buffer we are + * using unless I've missed something serious in + * tcp_data. + */ + atomic_inc(&skb->users); + + /* Ok so how much can we use? */ + used = skb->len - offset; + if (len < used) + used = len; + + /* Do we have urgent data here? */ + if (tp->urg_data) { + u32 urg_offset = tp->urg_seq - *seq; + if (urg_offset < used) { + if (!urg_offset) { + if (!sk->urginline) { + ++*seq; + offset++; + used--; + } + } else + used = urg_offset; + } + } + + /* Copy it - We _MUST_ update *seq first so that we + * don't ever double read when we have dual readers + */ + *seq += used; + + /* This memcpy_toiovec can sleep. If it sleeps and we + * do a second read it relies on the skb->users to avoid + * a crash when cleanup_rbuf() gets called. + */ + err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used); + if (err) { + /* Exception. Bailout! */ + atomic_dec(&skb->users); + copied = -EFAULT; + break; + } + + copied += used; + len -= used; + + /* We now will not sleep again until we are finished + * with skb. Sorry if you are doing the SMP port + * but you'll just have to fix it neatly ;) + */ + atomic_dec(&skb->users); + + if (after(tp->copied_seq,tp->urg_seq)) + tp->urg_data = 0; + if (used + offset < skb->len) + continue; + + /* Process the FIN. We may also need to handle PSH + * here and make it break out of MSG_WAITALL. + */ + if (skb->h.th->fin) + goto found_fin_ok; + if (flags & MSG_PEEK) + continue; + skb->used = 1; + if (atomic_read(&skb->users) == 1) + tcp_eat_skb(sk, skb); + continue; + + found_fin_ok: + ++*seq; + if (flags & MSG_PEEK) + break; + + /* All is done. 
*/ + skb->used = 1; + sk->shutdown |= RCV_SHUTDOWN; + break; + } + + if(copied >= 0 && msg->msg_name) { + tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) + msg->msg_name); + if(addr_len) + *addr_len = tp->af_specific->sockaddr_len; + } + + remove_wait_queue(sk->sleep, &wait); + current->state = TASK_RUNNING; + + /* Clean up data we have read: This will do ACK frames. */ + cleanup_rbuf(sk, copied); + release_sock(sk); + return copied; +} + +/* + * Check whether to renew the timer. + */ +static inline void tcp_check_fin_timer(struct sock *sk) +{ + if (sk->state == TCP_FIN_WAIT2 && !sk->timer.prev) + tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); +} + +/* + * State processing on a close. This implements the state shift for + * sending our FIN frame. Note that we only send a FIN for some + * states. A shutdown() may have already sent the FIN, or we may be + * closed. + */ + +static unsigned char new_state[16] = { + /* current state: new state: action: */ + /* (Invalid) */ TCP_CLOSE, + /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_SYN_SENT */ TCP_CLOSE, + /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, + /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, + /* TCP_TIME_WAIT */ TCP_CLOSE, + /* TCP_CLOSE */ TCP_CLOSE, + /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, + /* TCP_LAST_ACK */ TCP_LAST_ACK, + /* TCP_LISTEN */ TCP_CLOSE, + /* TCP_CLOSING */ TCP_CLOSING, +}; + +static int tcp_close_state(struct sock *sk, int dead) +{ + int next = (int) new_state[sk->state]; + int ns = (next & TCP_STATE_MASK); + + tcp_set_state(sk, ns); + + /* This is a (useful) BSD violating of the RFC. There is a + * problem with TCP as specified in that the other end could + * keep a socket open forever with no application left this end. + * We use a 3 minute timeout (about the same as BSD) then kill + * our end. If they send after that then tough - BUT: long enough + * that we won't make the old 4*rto = almost no time - whoops + * reset mistake. + */ + if (dead) + tcp_check_fin_timer(sk); + + return (next & TCP_ACTION_FIN); +} + +/* + * Shutdown the sending side of a connection. Much like close except + * that we don't receive shut down or set sk->dead. + */ + +void tcp_shutdown(struct sock *sk, int how) +{ + /* We need to grab some memory, and put together a FIN, + * and then put it into the queue to be sent. + * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. + */ + if (!(how & SEND_SHUTDOWN)) + return; + + /* If we've already sent a FIN, or it's a closed state, skip this. */ + if ((1 << sk->state) & + (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) { + lock_sock(sk); + + /* Clear out any half completed packets. FIN if needed. */ + if (tcp_close_state(sk,0)) + tcp_send_fin(sk); + + release_sock(sk); + } +} + + +/* + * Return 1 if we still have things to send in our buffers. + */ + +static inline int closing(struct sock * sk) +{ + return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK)); +} + +/* + * This routine closes sockets which have been at least partially + * opened, but not yet accepted. Currently it is only called by + * tcp_close, and timeout mirrors the value there. 
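The new_state[] table above packs the next TCP state and a "send a FIN now" action flag into one byte per current state, which is what lets tcp_close_state() be a table lookup plus a mask. Below is a toy standalone illustration of the same packing; the enum values, names and mask constants are invented for the sketch and only the scheme mirrors the kernel.

#include <stdio.h>

/* Illustrative only: a table-driven close transition in the style of
 * new_state[]/tcp_close_state(). All identifiers here are toy names.
 */
enum toy_state { T_ESTABLISHED = 1, T_FIN_WAIT1, T_CLOSE_WAIT, T_LAST_ACK, T_CLOSE };

#define T_STATE_MASK	0x0f
#define T_ACTION_FIN	0x10

static const unsigned char toy_new_state[] = {
	[T_ESTABLISHED] = T_FIN_WAIT1 | T_ACTION_FIN,	/* closing a live connection sends a FIN */
	[T_CLOSE_WAIT]  = T_LAST_ACK  | T_ACTION_FIN,	/* peer already FINned; we FIN back */
	[T_FIN_WAIT1]   = T_FIN_WAIT1,			/* FIN already queued, nothing more to do */
	[T_LAST_ACK]    = T_LAST_ACK,
	[T_CLOSE]       = T_CLOSE,
};

/* Returns nonzero when the caller should emit a FIN, like tcp_close_state(). */
static int toy_close_state(enum toy_state *state)
{
	unsigned char next = toy_new_state[*state];

	*state = (enum toy_state)(next & T_STATE_MASK);
	return next & T_ACTION_FIN;
}

int main(void)
{
	enum toy_state s = T_ESTABLISHED;
	int send_fin = toy_close_state(&s);

	printf("send_fin=%d new_state=%d\n", send_fin != 0, (int)s);	/* 1, T_FIN_WAIT1 */
	return 0;
}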
+ */ + +static void tcp_close_pending (struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct open_request *req = tp->syn_wait_queue; + + while(req) { + struct open_request *iter; + + if (req->sk) + tcp_close(req->sk, 0); + + iter = req; + req = req->dl_next; + + (*iter->class->destructor)(iter); + tcp_dec_slow_timer(TCP_SLT_SYNACK); + sk->ack_backlog--; + tcp_openreq_free(iter); + } + + tcp_synq_init(tp); +} + +void tcp_close(struct sock *sk, long timeout) +{ + struct sk_buff *skb; + int data_was_unread = 0; + + /* + * Check whether the socket is locked ... supposedly + * it's impossible to tcp_close() a locked socket. + */ + if (atomic_read(&sk->sock_readers)) + printk("tcp_close: socket already locked!\n"); + + /* We need to grab some memory, and put together a FIN, + * and then put it into the queue to be sent. + */ + lock_sock(sk); + if(sk->state == TCP_LISTEN) { + /* Special case. */ + tcp_set_state(sk, TCP_CLOSE); + tcp_close_pending(sk); + release_sock(sk); + sk->dead = 1; + return; + } + + /* It is questionable, what the role of this is now. + * In any event either it should be removed, or + * increment of SLT_KEEPALIVE be done, this is causing + * big problems. For now I comment it out. -DaveM + */ + /* sk->keepopen = 1; */ + sk->shutdown = SHUTDOWN_MASK; + + if (!sk->dead) + sk->state_change(sk); + + /* We need to flush the recv. buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + * reader process may not have drained the data yet! + */ + while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) { + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin; + data_was_unread += len; + kfree_skb(skb); + } + + /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section + * 3.10, we send a RST here because data was lost. To + * witness the awful effects of the old behavior of always + * doing a FIN, run an older 2.1.x kernel or 2.0.x, start + * a bulk GET in an FTP client, suspend the process, wait + * for the client to advertise a zero window, then kill -9 + * the FTP client, wheee... Note: timeout is always zero + * in such a case. + */ + if(data_was_unread != 0) { + /* Unread data was tossed, zap the connection. */ + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk); + } else if (tcp_close_state(sk,1)) { + /* We FIN if the application ate all the data before + * zapping the connection. + */ + tcp_send_fin(sk); + } + + if (timeout) { + struct task_struct *tsk = current; + struct wait_queue wait = { tsk, NULL }; + + add_wait_queue(sk->sleep, &wait); + release_sock(sk); + + while (1) { + tsk->state = TASK_INTERRUPTIBLE; + if (!closing(sk)) + break; + timeout = schedule_timeout(timeout); + if (signal_pending(tsk) || !timeout) + break; + } + + tsk->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + + lock_sock(sk); + } + + /* Now that the socket is dead, if we are in the FIN_WAIT2 state + * we may need to set up a timer. + */ + tcp_check_fin_timer(sk); + + release_sock(sk); + sk->dead = 1; +} + +/* + * Wait for an incoming connection, avoid race + * conditions. This must be called with the socket locked. 
+ */ +static struct open_request * wait_for_connect(struct sock * sk, + struct open_request **pprev) +{ + struct wait_queue wait = { current, NULL }; + struct open_request *req; + + add_wait_queue(sk->sleep, &wait); + for (;;) { + current->state = TASK_INTERRUPTIBLE; + release_sock(sk); + schedule(); + lock_sock(sk); + req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev); + if (req) + break; + if (signal_pending(current)) + break; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + return req; +} + +/* + * This will accept the next outstanding connection. + * + * Be careful about race conditions here - this is subtle. + */ + +struct sock *tcp_accept(struct sock *sk, int flags) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct open_request *req, *prev; + struct sock *newsk = NULL; + int error; + + lock_sock(sk); + + /* We need to make sure that this socket is listening, + * and that it has something pending. + */ + error = EINVAL; + if (sk->state != TCP_LISTEN) + goto out; + + /* Find already established connection */ + req = tcp_find_established(tp, &prev); + if (!req) { + /* If this is a non blocking socket don't sleep */ + error = EAGAIN; + if (flags & O_NONBLOCK) + goto out; + + error = ERESTARTSYS; + req = wait_for_connect(sk, &prev); + if (!req) + goto out; + } + + tcp_synq_unlink(tp, req, prev); + newsk = req->sk; + req->class->destructor(req); + tcp_openreq_free(req); + sk->ack_backlog--; + if(sk->keepopen) + tcp_inc_slow_timer(TCP_SLT_KEEPALIVE); + + release_sock(sk); + return newsk; + +out: + /* sk should be in LISTEN state, thus accept can use sk->err for + * internal purposes without stomping one anyone's feed. + */ + sk->err = error; + release_sock(sk); + return newsk; +} + +/* + * Socket option code for TCP. + */ + +int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, + int optlen) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int val; + + if (level != SOL_TCP) + return tp->af_specific->setsockopt(sk, level, optname, + optval, optlen); + + if(optlen<sizeof(int)) + return -EINVAL; + + if (get_user(val, (int *)optval)) + return -EFAULT; + + switch(optname) { + case TCP_MAXSEG: + /* values greater than interface MTU won't take effect. however at + * the point when this call is done we typically don't yet know + * which interface is going to be used + */ + if(val < 1 || val > MAX_WINDOW) + return -EINVAL; + tp->user_mss = val; + return 0; + + case TCP_NODELAY: + /* You cannot try to use this and TCP_CORK in + * tandem, so let the user know. + */ + if (sk->nonagle == 2) + return -EINVAL; + sk->nonagle = (val == 0) ? 0 : 1; + return 0; + + case TCP_CORK: + /* When set indicates to always queue non-full frames. + * Later the user clears this option and we transmit + * any pending partial frames in the queue. This is + * meant to be used alongside sendfile() to get properly + * filled frames when the user (for example) must write + * out headers with a write() call first and then use + * sendfile to send out the data parts. + * + * You cannot try to use TCP_NODELAY and this mechanism + * at the same time, so let the user know. 
+ */ + if (sk->nonagle == 1) + return -EINVAL; + if (val != 0) { + sk->nonagle = 2; + } else { + sk->nonagle = 0; + + lock_sock(sk); + tcp_push_pending_frames(sk, tp); + release_sock(sk); + } + return 0; + + default: + return -ENOPROTOOPT; + }; +} + +int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, + int *optlen) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int val, len; + + if(level != SOL_TCP) + return tp->af_specific->getsockopt(sk, level, optname, + optval, optlen); + + if(get_user(len,optlen)) + return -EFAULT; + + len = min(len, sizeof(int)); + + switch(optname) { + case TCP_MAXSEG: + val = tp->user_mss; + break; + case TCP_NODELAY: + val = (sk->nonagle == 1); + break; + case TCP_CORK: + val = (sk->nonagle == 2); + break; + default: + return -ENOPROTOOPT; + }; + + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval, &val,len)) + return -EFAULT; + return 0; +} + +void tcp_set_keepalive(struct sock *sk, int val) +{ + if (!sk->keepopen && val) + tcp_inc_slow_timer(TCP_SLT_KEEPALIVE); + else if (sk->keepopen && !val) + tcp_dec_slow_timer(TCP_SLT_KEEPALIVE); +} + +extern void __skb_cb_too_small_for_tcp(int, int); + +void __init tcp_init(void) +{ + struct sk_buff *skb = NULL; + + if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)) + __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), + sizeof(skb->cb)); + + tcp_openreq_cachep = kmem_cache_create("tcp_open_request", + sizeof(struct open_request), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if(!tcp_openreq_cachep) + panic("tcp_init: Cannot alloc open_request cache."); + + tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", + sizeof(struct tcp_bind_bucket), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if(!tcp_bucket_cachep) + panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); + + tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket", + sizeof(struct tcp_tw_bucket), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if(!tcp_timewait_cachep) + panic("tcp_init: Cannot alloc tcp_tw_bucket cache."); +} diff --git a/pfinet/linux-src/net/ipv4/tcp_input.c b/pfinet/linux-src/net/ipv4/tcp_input.c new file mode 100644 index 00000000..a753b128 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/tcp_input.c @@ -0,0 +1,2432 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_input.c,v 1.164.2.7 1999/08/13 16:14:27 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * Corey Minyard <wf-rch!minyard@relay.EU.net> + * Florian La Roche, <flla@stud.uni-sb.de> + * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> + * Linus Torvalds, <torvalds@cs.helsinki.fi> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Matthew Dillon, <dillon@apollo.west.oic.com> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Jorge Cwik, <jorge@laser.satlink.net> + */ + +/* + * Changes: + * Pedro Roque : Fast Retransmit/Recovery. + * Two receive queues. + * Retransmit queue handled by TCP. + * Better retransmit timer handling. + * New congestion avoidance. + * Header prediction. + * Variable renaming. + * + * Eric : Fast Retransmit. + * Randy Scott : MSS option defines. + * Eric Schenk : Fixes to slow start algorithm. + * Eric Schenk : Yet another double ACK bug. + * Eric Schenk : Delayed ACK bug fixes. 
+ * Eric Schenk : Floyd style fast retrans war avoidance. + * David S. Miller : Don't allow zero congestion window. + * Eric Schenk : Fix retransmitter so that it sends + * next packet on ack of previous packet. + * Andi Kleen : Moved open_request checking here + * and process RSTs for open_requests. + * Andi Kleen : Better prune_queue, and other fixes. + * Andrey Savochkin: Fix RTT measurements in the presnce of + * timestamps. + * Andrey Savochkin: Check sequence numbers correctly when + * removing SACKs due to in sequence incoming + * data segments. + * Andi Kleen: Make sure we never ack data there is not + * enough room for. Also make this condition + * a fatal error if it might still happen. + * Andi Kleen: Add tcp_measure_rcv_mss to make + * connections with MSS<min(MTU,ann. MSS) + * work without delayed acks. + * Andi Kleen: Process packets with PSH set in the + * fast path. + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <net/tcp.h> +#include <linux/ipsec.h> + +#ifdef CONFIG_SYSCTL +#define SYNC_INIT 0 /* let the user enable it */ +#else +#define SYNC_INIT 1 +#endif + +extern int sysctl_tcp_fin_timeout; + +/* These are on by default so the code paths get tested. + * For the final 2.2 this may be undone at our discretion. -DaveM + */ +int sysctl_tcp_timestamps = 1; +int sysctl_tcp_window_scaling = 1; +int sysctl_tcp_sack = 1; + +int sysctl_tcp_syncookies = SYNC_INIT; +int sysctl_tcp_stdurg; +int sysctl_tcp_rfc1337; + +static int prune_queue(struct sock *sk); + +/* There is something which you must keep in mind when you analyze the + * behavior of the tp->ato delayed ack timeout interval. When a + * connection starts up, we want to ack as quickly as possible. The + * problem is that "good" TCP's do slow start at the beginning of data + * transmission. The means that until we send the first few ACK's the + * sender will sit on his end and only queue most of his data, because + * he can only send snd_cwnd unacked packets at any given time. For + * each ACK we send, he increments snd_cwnd and transmits more of his + * queue. -DaveM + */ +static void tcp_delack_estimator(struct tcp_opt *tp) +{ + if(tp->ato == 0) { + tp->lrcvtime = tcp_time_stamp; + + /* Help sender leave slow start quickly, + * and also makes sure we do not take this + * branch ever again for this connection. + */ + tp->ato = 1; + tcp_enter_quickack_mode(tp); + } else { + int m = tcp_time_stamp - tp->lrcvtime; + + tp->lrcvtime = tcp_time_stamp; + if(m <= 0) + m = 1; + if(m > tp->rto) + tp->ato = tp->rto; + else { + /* This funny shift makes sure we + * clear the "quick ack mode" bit. + */ + tp->ato = ((tp->ato << 1) >> 2) + m; + } + } +} + +/* + * Remember to send an ACK later. + */ +static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th, + struct sk_buff *skb) +{ + tp->delayed_acks++; + + /* Tiny-grams with PSH set artifically deflate our + * ato measurement, but with a lower bound. + */ + if(th->psh && (skb->len < (tp->mss_cache >> 1))) { + /* Preserve the quickack state. */ + if((tp->ato & 0x7fffffff) > HZ/50) + tp->ato = ((tp->ato & 0x80000000) | + (HZ/50)); + } +} + +/* Called to compute a smoothed rtt estimate. The data fed to this + * routine either comes from timestamps, or from segments that were + * known _not_ to have been retransmitted [see Karn/Partridge + * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 + * piece by Van Jacobson. + * NOTE: the next three routines used to be one big routine. 
+ * To save cycles in the RFC 1323 implementation it was better to break + * it up into three procedures. -- erics + */ + +static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) +{ + long m = mrtt; /* RTT */ + + /* The following amusing code comes from Jacobson's + * article in SIGCOMM '88. Note that rtt and mdev + * are scaled versions of rtt and mean deviation. + * This is designed to be as fast as possible + * m stands for "measurement". + * + * On a 1990 paper the rto value is changed to: + * RTO = rtt + 4 * mdev + */ + if(m == 0) + m = 1; + if (tp->srtt != 0) { + m -= (tp->srtt >> 3); /* m is now error in rtt est */ + tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (m < 0) + m = -m; /* m is now abs(error) */ + m -= (tp->mdev >> 2); /* similar update on mdev */ + tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ + } else { + /* no previous measure. */ + tp->srtt = m<<3; /* take the measured time to be rtt */ + tp->mdev = m<<2; /* make sure rto = 3*rtt */ + } +} + +/* Calculate rto without backoff. This is the second half of Van Jacobson's + * routine referred to above. + */ + +static __inline__ void tcp_set_rto(struct tcp_opt *tp) +{ + tp->rto = (tp->srtt >> 3) + tp->mdev; + tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1)); +} + + +/* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound + * on packet lifetime in the internet. We need the HZ/5 lower + * bound to behave correctly against BSD stacks with a fixed + * delayed ack. + * FIXME: It's not entirely clear this lower bound is the best + * way to avoid the problem. Is it possible to drop the lower + * bound and still avoid trouble with BSD stacks? Perhaps + * some modification to the RTO calculation that takes delayed + * ack bias into account? This needs serious thought. -- erics + */ +static __inline__ void tcp_bound_rto(struct tcp_opt *tp) +{ + if (tp->rto > 120*HZ) + tp->rto = 120*HZ; + if (tp->rto < HZ/5) + tp->rto = HZ/5; +} + +/* WARNING: this must not be called if tp->saw_timestamp was false. */ +extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, + __u32 start_seq, __u32 end_seq) +{ + /* It is start_seq <= last_ack_seq combined + with in window check. If start_seq<=last_ack_seq<=rcv_nxt, + then segment is in window if end_seq>=rcv_nxt. + */ + if (!after(start_seq, tp->last_ack_sent) && + !before(end_seq, tp->rcv_nxt)) { + /* PAWS bug workaround wrt. ACK frames, the PAWS discard + * extra check below makes sure this can only happen + * for pure ACK frames. -DaveM + * + * Plus: expired timestamps. + * + * Plus: resets failing PAWS. + */ + if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) { + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = tcp_time_stamp; + } + } +} + +#define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24) + +extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len) +{ + return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 && + (s32)(tcp_time_stamp - tp->ts_recent_stamp) < PAWS_24DAYS && + /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */ + len != (th->doff * 4)); +} + + +static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) +{ + u32 end_window = tp->rcv_wup + tp->rcv_wnd; + + if (tp->rcv_wnd && + after(end_seq, tp->rcv_nxt) && + before(seq, end_window)) + return 1; + if (seq != end_window) + return 0; + return (seq == end_seq); +} + +/* This functions checks to see if the tcp header is actually acceptable. 
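tcp_rtt_estimator() and tcp_set_rto() above keep srtt scaled by 8 and mdev scaled by 4 so that the Jacobson SIGCOMM '88 exponential averages reduce to shifts and adds. The standalone sketch below reproduces just the classic update (srtt = 7/8·srtt + 1/8·sample, mdev = 3/4·mdev + 1/4·|error|, rto = srtt + 4·mdev); it deliberately omits the extra backoff and cwnd-dependent terms the kernel folds into tcp_set_rto(), and the struct and function names are invented.

#include <stdio.h>

/* Illustrative only: fixed-point Jacobson RTT/RTO estimation. */
struct rtt_est {
	long srtt;	/* smoothed RTT, kept scaled << 3 */
	long mdev;	/* mean deviation, kept scaled << 2 */
};

static void rtt_sample(struct rtt_est *e, long m /* measured RTT, ticks */)
{
	if (m == 0)
		m = 1;
	if (e->srtt != 0) {
		m -= e->srtt >> 3;	/* error against the current estimate */
		e->srtt += m;		/* srtt = 7/8 srtt + 1/8 sample */
		if (m < 0)
			m = -m;
		m -= e->mdev >> 2;
		e->mdev += m;		/* mdev = 3/4 mdev + 1/4 |error| */
	} else {
		e->srtt = m << 3;	/* first sample primes the average */
		e->mdev = m << 2;	/* and the deviation estimate */
	}
}

static long rtt_rto(const struct rtt_est *e)
{
	return (e->srtt >> 3) + e->mdev;	/* srtt + 4*mdev (mdev already scaled by 4) */
}

int main(void)
{
	struct rtt_est e = { 0, 0 };
	long samples[] = { 100, 120, 80, 300 };
	unsigned i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		rtt_sample(&e, samples[i]);
		printf("sample=%ld rto=%ld\n", samples[i], rtt_rto(&e));
	}
	return 0;
}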
*/ +extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) +{ + if (seq == tp->rcv_nxt) + return (tp->rcv_wnd || (end_seq == seq)); + + return __tcp_sequence(tp, seq, end_seq); +} + +/* When we get a reset we do this. */ +static void tcp_reset(struct sock *sk) +{ + sk->zapped = 1; + + /* We want the right error as BSD sees it (and indeed as we do). */ + switch (sk->state) { + case TCP_SYN_SENT: + sk->err = ECONNREFUSED; + break; + case TCP_CLOSE_WAIT: + sk->err = EPIPE; + break; + default: + sk->err = ECONNRESET; + }; + tcp_set_state(sk, TCP_CLOSE); + sk->shutdown = SHUTDOWN_MASK; + if (!sk->dead) + sk->state_change(sk); +} + +/* This tags the retransmission queue when SACKs arrive. */ +static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int i = nsacks; + + while(i--) { + struct sk_buff *skb = skb_peek(&sk->write_queue); + __u32 start_seq = ntohl(sp->start_seq); + __u32 end_seq = ntohl(sp->end_seq); + int fack_count = 0; + + while((skb != NULL) && + (skb != tp->send_head) && + (skb != (struct sk_buff *)&sk->write_queue)) { + /* The retransmission queue is always in order, so + * we can short-circuit the walk early. + */ + if(after(TCP_SKB_CB(skb)->seq, end_seq)) + break; + + /* We play conservative, we don't allow SACKS to partially + * tag a sequence space. + */ + fack_count++; + if(!after(start_seq, TCP_SKB_CB(skb)->seq) && + !before(end_seq, TCP_SKB_CB(skb)->end_seq)) { + /* If this was a retransmitted frame, account for it. */ + if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) && + tp->retrans_out) + tp->retrans_out--; + TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; + + /* RULE: All new SACKs will either decrease retrans_out + * or advance fackets_out. + */ + if(fack_count > tp->fackets_out) + tp->fackets_out = fack_count; + } + skb = skb->next; + } + sp++; /* Move on to the next SACK block. */ + } +} + +/* Look for tcp options. Normally only called on SYN and SYNACK packets. + * But, this can also be called on packets in the established flow when + * the fast version below fails. 
+ */ +void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy) +{ + unsigned char *ptr; + int length=(th->doff*4)-sizeof(struct tcphdr); + int saw_mss = 0; + + ptr = (unsigned char *)(th + 1); + tp->saw_tstamp = 0; + + while(length>0) { + int opcode=*ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + switch(opcode) { + case TCPOPT_MSS: + if(opsize==TCPOLEN_MSS && th->syn) { + u16 in_mss = ntohs(*(__u16 *)ptr); + if (in_mss == 0) + in_mss = 536; + if (tp->mss_clamp > in_mss) + tp->mss_clamp = in_mss; + saw_mss = 1; + } + break; + case TCPOPT_WINDOW: + if(opsize==TCPOLEN_WINDOW && th->syn) + if (!no_fancy && sysctl_tcp_window_scaling) { + tp->wscale_ok = 1; + tp->snd_wscale = *(__u8 *)ptr; + if(tp->snd_wscale > 14) { + if(net_ratelimit()) + printk("tcp_parse_options: Illegal window " + "scaling value %d >14 received.", + tp->snd_wscale); + tp->snd_wscale = 14; + } + } + break; + case TCPOPT_TIMESTAMP: + if(opsize==TCPOLEN_TIMESTAMP) { + if (sysctl_tcp_timestamps && !no_fancy) { + tp->tstamp_ok = 1; + tp->saw_tstamp = 1; + tp->rcv_tsval = ntohl(*(__u32 *)ptr); + tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); + } + } + break; + case TCPOPT_SACK_PERM: + if(opsize==TCPOLEN_SACK_PERM && th->syn) { + if (sysctl_tcp_sack && !no_fancy) { + tp->sack_ok = 1; + tp->num_sacks = 0; + } + } + break; + + case TCPOPT_SACK: + if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && + sysctl_tcp_sack && (sk != NULL) && !th->syn) { + int sack_bytes = opsize - TCPOLEN_SACK_BASE; + + if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) { + int num_sacks = sack_bytes >> 3; + struct tcp_sack_block *sackp; + + sackp = (struct tcp_sack_block *)ptr; + tcp_sacktag_write_queue(sk, sackp, num_sacks); + } + } + }; + ptr+=opsize-2; + length-=opsize; + }; + } + if(th->syn && saw_mss == 0) + tp->mss_clamp = 536; +} + +/* Fast parse options. This hopes to only see timestamps. + * If it is wrong it falls back on tcp_parse_options(). + */ +static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp) +{ + /* If we didn't send out any options ignore them all. */ + if (tp->tcp_header_len == sizeof(struct tcphdr)) + return 0; + if (th->doff == sizeof(struct tcphdr)>>2) { + tp->saw_tstamp = 0; + return 0; + } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { + __u32 *ptr = (__u32 *)(th + 1); + if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { + tp->saw_tstamp = 1; + tp->rcv_tsval = ntohl(*++ptr); + tp->rcv_tsecr = ntohl(*++ptr); + return 1; + } + } + tcp_parse_options(sk, th, tp, 0); + return 1; +} + +#define FLAG_DATA 0x01 /* Incoming frame contained data. */ +#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ + +static __inline__ void clear_fast_retransmit(struct tcp_opt *tp) +{ + if (tp->dup_acks > 3) + tp->snd_cwnd = (tp->snd_ssthresh); + + tp->dup_acks = 0; +} + +/* NOTE: This code assumes that tp->dup_acks gets cleared when a + * retransmit timer fires. 
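tcp_fast_parse_options() above avoids the general option walk in the common case by comparing the first 32-bit option word against the "NOP, NOP, TIMESTAMP, length 10" pattern that a timestamp-only header carries. A small userland sketch of that single-word check; the option macros mirror the kernel's names, while is_aligned_timestamp_word() and the test harness are invented for illustration.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>	/* htonl */

#define TCPOPT_NOP		1
#define TCPOPT_TIMESTAMP	8
#define TCPOLEN_TIMESTAMP	10

/* Illustrative only: recognise the aligned RFC 1323 timestamp option word. */
static int is_aligned_timestamp_word(uint32_t first_option_word_net)
{
	uint32_t expected = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				  (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);

	return first_option_word_net == expected;
}

int main(void)
{
	/* Build the 12-byte option area the way a cooperating sender would;
	 * the remaining 8 bytes would hold TSval and TSecr.
	 */
	unsigned char opts[12] = { TCPOPT_NOP, TCPOPT_NOP,
				   TCPOPT_TIMESTAMP, TCPOLEN_TIMESTAMP };
	uint32_t word;

	memcpy(&word, opts, sizeof(word));
	printf("%d\n", is_aligned_timestamp_word(word));	/* 1 */
	return 0;
}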
+ */ +static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* Note: If not_dup is set this implies we got a + * data carrying packet or a window update. + * This carries no new information about possible + * lost packets, so we have to ignore it for the purposes + * of counting duplicate acks. Ideally this does not imply we + * should stop our fast retransmit phase, more acks may come + * later without data to help us. Unfortunately this would make + * the code below much more complex. For now if I see such + * a packet I clear the fast retransmit phase. + */ + if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) { + /* This is the standard reno style fast retransmit branch. */ + + /* 1. When the third duplicate ack is received, set ssthresh + * to one half the current congestion window, but no less + * than two segments. Retransmit the missing segment. + */ + if (tp->high_seq == 0 || after(ack, tp->high_seq)) { + tp->dup_acks++; + if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) { + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_cwnd = (tp->snd_ssthresh + 3); + tp->high_seq = tp->snd_nxt; + if(!tp->fackets_out) + tcp_retransmit_skb(sk, + skb_peek(&sk->write_queue)); + else + tcp_fack_retransmit(sk); + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } + } else if (++tp->dup_acks > 3) { + /* 2. Each time another duplicate ACK arrives, increment + * cwnd by the segment size. [...] Transmit a packet... + * + * Packet transmission will be done on normal flow processing + * since we're not in "retransmit mode". We do not use + * duplicate ACKs to artificially inflate the congestion + * window when doing FACK. + */ + if(!tp->fackets_out) { + tp->snd_cwnd++; + } else { + /* Fill any further holes which may have + * appeared. + * + * We may want to change this to run every + * further multiple-of-3 dup ack increments, + * to be more robust against out-of-order + * packet delivery. -DaveM + */ + tcp_fack_retransmit(sk); + } + } + } else if (tp->high_seq != 0) { + /* In this branch we deal with clearing the Floyd style + * block on duplicate fast retransmits, and if requested + * we do Hoe style secondary fast retransmits. + */ + if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) { + /* Once we have acked all the packets up to high_seq + * we are done this fast retransmit phase. + * Alternatively data arrived. In this case we + * Have to abort the fast retransmit attempt. + * Note that we do want to accept a window + * update since this is expected with Hoe's algorithm. + */ + clear_fast_retransmit(tp); + + /* After we have cleared up to high_seq we can + * clear the Floyd style block. + */ + if (!before(ack, tp->high_seq)) { + tp->high_seq = 0; + tp->fackets_out = 0; + } + } else if (tp->dup_acks >= 3) { + if (!tp->fackets_out) { + /* Hoe Style. We didn't ack the whole + * window. Take this as a cue that + * another packet was lost and retransmit it. + * Don't muck with the congestion window here. + * Note that we have to be careful not to + * act if this was a window update and it + * didn't ack new data, since this does + * not indicate a packet left the system. + * We can test this by just checking + * if ack changed from snd_una, since + * the only way to get here without advancing + * from snd_una is if this was a window update. 
+ */ + if (ack != tp->snd_una && before(ack, tp->high_seq)) { + tcp_retransmit_skb(sk, + skb_peek(&sk->write_queue)); + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } + } else { + /* FACK style, fill any remaining holes in + * receiver's queue. + */ + tcp_fack_retransmit(sk); + } + } + } +} + +/* This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. + */ +static __inline__ void tcp_cong_avoid(struct tcp_opt *tp) +{ + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + tp->snd_cwnd++; + } else { + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt=0; + } else + tp->snd_cwnd_cnt++; + } +} + +/* Remove acknowledged frames from the retransmission queue. */ +static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, + __u32 *seq, __u32 *seq_rtt) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + __u32 now = tcp_time_stamp; + int acked = 0; + + /* If we are retransmitting, and this ACK clears up to + * the retransmit head, or further, then clear our state. + */ + if (tp->retrans_head != NULL && + !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq)) + tp->retrans_head = NULL; + + while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + __u8 sacked = scb->sacked; + + /* If our packet is before the ack sequence we can + * discard it as it's confirmed to have arrived at + * the other end. + */ + if (after(scb->end_seq, ack)) + break; + + /* Initial outgoing SYN's get put onto the write_queue + * just like anything else we transmit. It is not + * true data, and if we misinform our callers that + * this ACK acks real data, we will erroneously exit + * connection startup slow start one packet too + * quickly. This is severely frowned upon behavior. + */ + if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out) + tp->retrans_out--; + if(!(scb->flags & TCPCB_FLAG_SYN)) { + acked |= FLAG_DATA_ACKED; + if(sacked & TCPCB_SACKED_RETRANS) + acked |= FLAG_RETRANS_DATA_ACKED; + if(tp->fackets_out) + tp->fackets_out--; + } else { + /* This is pure paranoia. */ + tp->retrans_head = NULL; + } + tp->packets_out--; + *seq = scb->seq; + *seq_rtt = now - scb->when; + __skb_unlink(skb, skb->list); + kfree_skb(skb); + } + return acked; +} + +static void tcp_ack_probe(struct sock *sk, __u32 ack) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* Our probe was answered. */ + tp->probes_out = 0; + + /* Was it a usable window open? */ + + /* should always be non-null */ + if (tp->send_head != NULL && + !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) { + tp->backoff = 0; + tp->pending = 0; + tcp_clear_xmit_timer(sk, TIME_PROBE0); + } else { + tcp_reset_xmit_timer(sk, TIME_PROBE0, + min(tp->rto << tp->backoff, 120*HZ)); + } +} + +/* Should we open up the congestion window? */ +static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag) +{ + /* Data must have been acked. */ + if ((flag & FLAG_DATA_ACKED) == 0) + return 0; + + /* Some of the data acked was retransmitted somehow? */ + if ((flag & FLAG_RETRANS_DATA_ACKED) != 0) { + /* We advance in all cases except during + * non-FACK fast retransmit/recovery. + */ + if (tp->fackets_out != 0 || + tp->retransmits != 0) + return 1; + + /* Non-FACK fast retransmit does it's own + * congestion window management, don't get + * in the way. 
+ */ + return 0; + } + + /* New non-retransmitted data acked, always advance. */ + return 1; +} + +/* Read draft-ietf-tcplw-high-performance before mucking + * with this code. (Superceeds RFC1323) + */ +static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, + u32 seq, u32 ack, int flag) +{ + __u32 seq_rtt; + + /* RTTM Rule: A TSecr value received in a segment is used to + * update the averaged RTT measurement only if the segment + * acknowledges some new data, i.e., only if it advances the + * left edge of the send window. + * + * See draft-ietf-tcplw-high-performance-00, section 3.3. + * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> + */ + if (!(flag & FLAG_DATA_ACKED)) + return; + + seq_rtt = tcp_time_stamp - tp->rcv_tsecr; + tcp_rtt_estimator(tp, seq_rtt); + if (tp->retransmits) { + if (tp->packets_out == 0) { + tp->retransmits = 0; + tp->fackets_out = 0; + tp->retrans_out = 0; + tp->backoff = 0; + tcp_set_rto(tp); + } else { + /* Still retransmitting, use backoff */ + tcp_set_rto(tp); + tp->rto = tp->rto << tp->backoff; + } + } else { + tcp_set_rto(tp); + } + + tcp_bound_rto(tp); +} + +static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) +{ + struct sk_buff *skb = skb_peek(&sk->write_queue); + + /* Some data was ACK'd, if still retransmitting (due to a + * timeout), resend more of the retransmit queue. The + * congestion window is handled properly by that code. + */ + if (tp->retransmits) { + tcp_xmit_retransmit_queue(sk); + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } else { + __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when); + if ((__s32)when < 0) + when = 1; + tcp_reset_xmit_timer(sk, TIME_RETRANS, when); + } +} + +/* This routine deals with incoming acks, but not outgoing ones. */ +static int tcp_ack(struct sock *sk, struct tcphdr *th, + u32 ack_seq, u32 ack, int len) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int flag = 0; + u32 seq = 0; + u32 seq_rtt = 0; + + if(sk->zapped) + return(1); /* Dead, can't ack any more so why bother */ + + if (tp->pending == TIME_KEEPOPEN) + tp->probes_out = 0; + + tp->rcv_tstamp = tcp_time_stamp; + + /* If the ack is newer than sent or older than previous acks + * then we can probably ignore it. + */ + if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una)) + goto uninteresting_ack; + + /* If there is data set flag 1 */ + if (len != th->doff*4) { + flag |= FLAG_DATA; + tcp_delack_estimator(tp); + } + + /* Update our send window. */ + + /* This is the window update code as per RFC 793 + * snd_wl{1,2} are used to prevent unordered + * segments from shrinking the window + */ + if (before(tp->snd_wl1, ack_seq) || + (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) { + u32 nwin = ntohs(th->window) << tp->snd_wscale; + + if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) { + flag |= FLAG_WIN_UPDATE; + tp->snd_wnd = nwin; + + tp->snd_wl1 = ack_seq; + tp->snd_wl2 = ack; + + if (nwin > tp->max_window) + tp->max_window = nwin; + } + } + + /* We passed data and got it acked, remove any soft error + * log. Something worked... + */ + sk->err_soft = 0; + + /* If this ack opens up a zero window, clear backoff. It was + * being used to time the probes, and is probably far higher than + * it needs to be for normal retransmission. + */ + if (tp->pending == TIME_PROBE0) + tcp_ack_probe(sk, ack); + + /* See if we can take anything off of the retransmit queue. 
*/ + flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); + + /* We must do this here, before code below clears out important + * state contained in tp->fackets_out and tp->retransmits. -DaveM + */ + if (should_advance_cwnd(tp, flag)) + tcp_cong_avoid(tp); + + /* If we have a timestamp, we always do rtt estimates. */ + if (tp->saw_tstamp) { + tcp_ack_saw_tstamp(sk, tp, seq, ack, flag); + } else { + /* If we were retransmiting don't count rtt estimate. */ + if (tp->retransmits) { + if (tp->packets_out == 0) { + tp->retransmits = 0; + tp->fackets_out = 0; + tp->retrans_out = 0; + } + } else { + /* We don't have a timestamp. Can only use + * packets that are not retransmitted to determine + * rtt estimates. Also, we must not reset the + * backoff for rto until we get a non-retransmitted + * packet. This allows us to deal with a situation + * where the network delay has increased suddenly. + * I.e. Karn's algorithm. (SIGCOMM '87, p5.) + */ + if (flag & FLAG_DATA_ACKED) { + if(!(flag & FLAG_RETRANS_DATA_ACKED)) { + tp->backoff = 0; + tcp_rtt_estimator(tp, seq_rtt); + tcp_set_rto(tp); + tcp_bound_rto(tp); + } + } + } + } + + if (tp->packets_out) { + if (flag & FLAG_DATA_ACKED) + tcp_ack_packets_out(sk, tp); + } else { + tcp_clear_xmit_timer(sk, TIME_RETRANS); + } + + flag &= (FLAG_DATA | FLAG_WIN_UPDATE); + if ((ack == tp->snd_una && tp->packets_out && flag == 0) || + (tp->high_seq != 0)) { + tcp_fast_retrans(sk, ack, flag); + } else { + /* Clear any aborted fast retransmit starts. */ + tp->dup_acks = 0; + } + /* It is not a brain fart, I thought a bit now. 8) + * + * Forward progress is indicated, if: + * 1. the ack acknowledges new data. + * 2. or the ack is duplicate, but it is caused by new segment + * arrival. This case is filtered by: + * - it contains no data, syn or fin. + * - it does not update window. + * 3. or new SACK. It is difficult to check, so that we ignore it. + * + * Forward progress is also indicated by arrival new data, + * which was caused by window open from our side. This case is more + * difficult and it is made (alas, incorrectly) in tcp_data_queue(). + * --ANK (990513) + */ + if (ack != tp->snd_una || (flag == 0 && !th->fin)) + dst_confirm(sk->dst_cache); + + /* Remember the highest ack received. */ + tp->snd_una = ack; + return 1; + +uninteresting_ack: + SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt); + return 0; +} + +/* New-style handling of TIME_WAIT sockets. */ +extern void tcp_tw_schedule(struct tcp_tw_bucket *tw); +extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw); +extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw); + +void tcp_timewait_kill(struct tcp_tw_bucket *tw) +{ + struct tcp_bind_bucket *tb = tw->tb; + + /* Disassociate with bind bucket. */ + if(tw->bind_next) + tw->bind_next->bind_pprev = tw->bind_pprev; + *(tw->bind_pprev) = tw->bind_next; + if (tb->owners == NULL) { + if (tb->next) + tb->next->pprev = tb->pprev; + *(tb->pprev) = tb->next; + kmem_cache_free(tcp_bucket_cachep, tb); + } + + /* Unlink from established hashes. */ + if(tw->next) + tw->next->pprev = tw->pprev; + *tw->pprev = tw->next; + + /* We decremented the prot->inuse count when we entered TIME_WAIT + * and the sock from which this came was destroyed. + */ + tw->sklist_next->sklist_prev = tw->sklist_prev; + tw->sklist_prev->sklist_next = tw->sklist_next; + + /* Ok, now free it up. */ + kmem_cache_free(tcp_timewait_cachep, tw); +} + +/* We come here as a special case from the AF specific TCP input processing, + * and the SKB has no owner. 
Essentially handling this is very simple, + * we just keep silently eating rx'd packets, acking them if necessary, + * until none show up for the entire timeout period. + * + * Return 0, TCP_TW_ACK, TCP_TW_RST + */ +enum tcp_tw_status +tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + /* RFC 1122: + * "When a connection is [...] on TIME-WAIT state [...] + * [a TCP] MAY accept a new SYN from the remote TCP to + * reopen the connection directly, if it: + * + * (1) assigns its initial sequence number for the new + * connection to be larger than the largest sequence + * number it used on the previous connection incarnation, + * and + * + * (2) returns to TIME-WAIT state if the SYN turns out + * to be an old duplicate". + */ + if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) { + struct sock *sk; + struct tcp_func *af_specific = tw->af_specific; + __u32 isn; + + isn = tw->snd_nxt + 128000; + if(isn == 0) + isn++; + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); + sk = af_specific->get_sock(skb, th); + if(sk == NULL || + !ipsec_sk_policy(sk,skb) || + atomic_read(&sk->sock_readers) != 0) + return 0; + skb_set_owner_r(skb, sk); + af_specific = sk->tp_pinfo.af_tcp.af_specific; + if(af_specific->conn_request(sk, skb, isn) < 0) + return TCP_TW_RST; /* Toss a reset back. */ + return 0; /* Discard the frame. */ + } + + /* Check RST or SYN */ + if(th->rst || th->syn) { + /* This is TIME_WAIT assasination, in two flavors. + * Oh well... nobody has a sufficient solution to this + * protocol bug yet. + */ + if(sysctl_tcp_rfc1337 == 0) { + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); + } + if(!th->rst) + return TCP_TW_RST; /* toss a reset back */ + return 0; + } else { + /* In this case we must reset the TIMEWAIT timer. */ + if(th->ack) + tcp_tw_reschedule(tw); + } + /* Ack old packets if necessary */ + if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt) && + (th->doff * 4) > len) + return TCP_TW_ACK; + return 0; +} + +/* Enter the time wait state. This is always called from BH + * context. Essentially we whip up a timewait bucket, copy the + * relevant info into it from the SK, and mess with hash chains + * and list linkage. + */ +static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) +{ + struct sock **head, *sktw; + + /* Step 1: Remove SK from established hash. */ + if(sk->next) + sk->next->pprev = sk->pprev; + *sk->pprev = sk->next; + sk->pprev = NULL; + tcp_reg_zap(sk); + + /* Step 2: Put TW into bind hash where SK was. */ + tw->tb = (struct tcp_bind_bucket *)sk->prev; + if((tw->bind_next = sk->bind_next) != NULL) + sk->bind_next->bind_pprev = &tw->bind_next; + tw->bind_pprev = sk->bind_pprev; + *sk->bind_pprev = (struct sock *)tw; + sk->prev = NULL; + + /* Step 3: Same for the protocol sklist. */ + (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw; + (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw; + sk->sklist_next = NULL; + sk->prot->inuse--; + + /* Step 4: Hash TW into TIMEWAIT half of established hash table. */ + head = &tcp_established_hash[sk->hashent + (TCP_HTABLE_SIZE/2)]; + sktw = (struct sock *)tw; + if((sktw->next = *head) != NULL) + (*head)->pprev = &sktw->next; + *head = sktw; + sktw->pprev = head; +} + +void tcp_time_wait(struct sock *sk) +{ + struct tcp_tw_bucket *tw; + + tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); + if(tw != NULL) { + /* Give us an identity. 
*/ + tw->daddr = sk->daddr; + tw->rcv_saddr = sk->rcv_saddr; + tw->bound_dev_if= sk->bound_dev_if; + tw->num = sk->num; + tw->state = TCP_TIME_WAIT; + tw->sport = sk->sport; + tw->dport = sk->dport; + tw->family = sk->family; + tw->reuse = sk->reuse; + tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt; + tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt; + tw->window = tcp_select_window(sk); + tw->af_specific = sk->tp_pinfo.af_tcp.af_specific; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if(tw->family == PF_INET6) { + memcpy(&tw->v6_daddr, + &sk->net_pinfo.af_inet6.daddr, + sizeof(struct in6_addr)); + memcpy(&tw->v6_rcv_saddr, + &sk->net_pinfo.af_inet6.rcv_saddr, + sizeof(struct in6_addr)); + } +#endif + /* Linkage updates. */ + tcp_tw_hashdance(sk, tw); + + /* Get the TIME_WAIT timeout firing. */ + tcp_tw_schedule(tw); + + /* CLOSE the SK. */ + if(sk->state == TCP_ESTABLISHED) + tcp_statistics.TcpCurrEstab--; + sk->state = TCP_CLOSE; + net_reset_timer(sk, TIME_DONE, + min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME)); + } else { + /* Sorry, we're out of memory, just CLOSE this + * socket up. We've got bigger problems than + * non-graceful socket closings. + */ + tcp_set_state(sk, TCP_CLOSE); + } + + /* Prevent rcvmsg/sndmsg calls, and wake people up. */ + sk->shutdown = SHUTDOWN_MASK; + if(!sk->dead) + sk->state_change(sk); +} + +/* + * Process the FIN bit. This now behaves as it is supposed to work + * and the FIN takes effect when it is validly part of sequence + * space. Not before when we get holes. + * + * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT + * (and thence onto LAST-ACK and finally, CLOSE, we never enter + * TIME-WAIT) + * + * If we are in FINWAIT-1, a received FIN indicates simultaneous + * close and we go into CLOSING (and later onto TIME-WAIT) + * + * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. + */ + +static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) +{ + sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq; + + tcp_send_ack(sk); + + if (!sk->dead) { + sk->state_change(sk); + sock_wake_async(sk->socket, 1); + } + + switch(sk->state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + /* Move to CLOSE_WAIT */ + tcp_set_state(sk, TCP_CLOSE_WAIT); + if (th->rst) + sk->shutdown = SHUTDOWN_MASK; + break; + + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + /* Received a retransmission of the FIN, do + * nothing. + */ + break; + case TCP_LAST_ACK: + /* RFC793: Remain in the LAST-ACK state. */ + break; + + case TCP_FIN_WAIT1: + /* This case occurs when a simultaneous close + * happens, we must ack the received FIN and + * enter the CLOSING state. + * + * This causes a WRITE timeout, which will either + * move on to TIME_WAIT when we timeout, or resend + * the FIN properly (maybe we get rid of that annoying + * FIN lost hang). The TIME_WRITE code is already + * correct for handling this timeout. + */ + tcp_set_state(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + /* Received a FIN -- send ACK and enter TIME_WAIT. */ + tcp_time_wait(sk); + break; + default: + /* Only TCP_LISTEN and TCP_CLOSE are left, in these + * cases we should never reach this piece of code. + */ + printk("tcp_fin: Impossible, sk->state=%d\n", sk->state); + break; + }; +} + +/* These routines update the SACK block as out-of-order packets arrive or + * in-order packets close up the sequence space. 
+ */ +static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp) +{ + int this_sack, num_sacks = tp->num_sacks; + struct tcp_sack_block *swalk = &tp->selective_acks[0]; + + /* If more than one SACK block, see if the recent change to SP eats into + * or hits the sequence space of other SACK blocks, if so coalesce. + */ + if(num_sacks != 1) { + for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) { + if(swalk == sp) + continue; + + /* First case, bottom of SP moves into top of the + * sequence space of SWALK. + */ + if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) { + sp->start_seq = swalk->start_seq; + goto coalesce; + } + /* Second case, top of SP moves into bottom of the + * sequence space of SWALK. + */ + if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) { + sp->end_seq = swalk->end_seq; + goto coalesce; + } + } + } + /* SP is the only SACK, or no coalescing cases found. */ + return; + +coalesce: + /* Zap SWALK, by moving every further SACK up by one slot. + * Decrease num_sacks. + */ + for(; this_sack < num_sacks-1; this_sack++, swalk++) { + struct tcp_sack_block *next = (swalk + 1); + swalk->start_seq = next->start_seq; + swalk->end_seq = next->end_seq; + } + tp->num_sacks--; +} + +static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) +{ + __u32 tmp; + + tmp = sack1->start_seq; + sack1->start_seq = sack2->start_seq; + sack2->start_seq = tmp; + + tmp = sack1->end_seq; + sack1->end_seq = sack2->end_seq; + sack2->end_seq = tmp; +} + +static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int cur_sacks = tp->num_sacks; + + if (!cur_sacks) + goto new_sack; + + /* Optimize for the common case, new ofo frames arrive + * "in order". ;-) This also satisfies the requirements + * of RFC2018 about ordering of SACKs. + */ + if(sp->end_seq == TCP_SKB_CB(skb)->seq) { + sp->end_seq = TCP_SKB_CB(skb)->end_seq; + tcp_sack_maybe_coalesce(tp, sp); + } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) { + /* Re-ordered arrival, in this case, can be optimized + * as well. + */ + sp->start_seq = TCP_SKB_CB(skb)->seq; + tcp_sack_maybe_coalesce(tp, sp); + } else { + struct tcp_sack_block *swap = sp + 1; + int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4); + + /* Oh well, we have to move things around. + * Try to find a SACK we can tack this onto. + */ + + for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) { + if((swap->end_seq == TCP_SKB_CB(skb)->seq) || + (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) { + if(swap->end_seq == TCP_SKB_CB(skb)->seq) + swap->end_seq = TCP_SKB_CB(skb)->end_seq; + else + swap->start_seq = TCP_SKB_CB(skb)->seq; + tcp_sack_swap(sp, swap); + tcp_sack_maybe_coalesce(tp, sp); + return; + } + } + + /* Could not find an adjacent existing SACK, build a new one, + * put it at the front, and shift everyone else down. We + * always know there is at least one SACK present already here. + * + * If the sack array is full, forget about the last one. + */ + if (cur_sacks >= max_sacks) { + cur_sacks--; + tp->num_sacks--; + } + while(cur_sacks >= 1) { + struct tcp_sack_block *this = &tp->selective_acks[cur_sacks]; + struct tcp_sack_block *prev = (this - 1); + this->start_seq = prev->start_seq; + this->end_seq = prev->end_seq; + cur_sacks--; + } + + new_sack: + /* Build the new head SACK, and we're done. 
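tcp_sack_new_ofo_skb() and tcp_sack_maybe_coalesce() above maintain the SACK blocks as a small set of sequence intervals: a new out-of-order segment usually extends an existing block, and blocks that come to touch each other are merged. The sketch below captures that interval bookkeeping in a deliberately simplified form: it always grows the head block, ignores sequence wraparound (the kernel uses before()/after()) and ignores RFC 2018's most-recent-first ordering. All names are invented.

#include <stdio.h>
#include <stdint.h>

#define MAX_BLOCKS 4

struct sack_block { uint32_t start, end; };

struct sack_state {
	struct sack_block blk[MAX_BLOCKS];
	int num;
};

/* Illustrative only: record an out-of-order segment [seq, end_seq). */
static void sack_note_segment(struct sack_state *s, uint32_t seq, uint32_t end_seq)
{
	int i;

	if (s->num && s->blk[0].end == seq) {
		s->blk[0].end = end_seq;		/* common case: grows the head block */
	} else if (s->num && s->blk[0].start == end_seq) {
		s->blk[0].start = seq;			/* reordered arrival just below it */
	} else if (s->num < MAX_BLOCKS) {
		s->blk[s->num].start = seq;		/* open a new block */
		s->blk[s->num].end = end_seq;
		s->num++;
	}

	/* Merge the head block with any block it now touches. */
	for (i = 1; i < s->num; i++) {
		if (s->blk[0].end >= s->blk[i].start && s->blk[0].start <= s->blk[i].end) {
			if (s->blk[i].start < s->blk[0].start)
				s->blk[0].start = s->blk[i].start;
			if (s->blk[i].end > s->blk[0].end)
				s->blk[0].end = s->blk[i].end;
			s->blk[i] = s->blk[--s->num];	/* drop the swallowed block */
			i--;
		}
	}
}

int main(void)
{
	struct sack_state s = { { { 0, 0 } }, 0 };
	int i;

	sack_note_segment(&s, 3000, 4000);	/* hole below 3000 */
	sack_note_segment(&s, 4000, 5000);	/* extends the head block */
	sack_note_segment(&s, 1000, 2000);	/* separate block */
	sack_note_segment(&s, 2000, 3000);	/* bridges them: a single block 1000-5000 */

	for (i = 0; i < s.num; i++)
		printf("block %d: %u-%u\n", i, s.blk[i].start, s.blk[i].end);
	return 0;
}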
*/ + sp->start_seq = TCP_SKB_CB(skb)->seq; + sp->end_seq = TCP_SKB_CB(skb)->end_seq; + tp->num_sacks++; + } +} + +static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb) +{ + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int num_sacks = tp->num_sacks; + int this_sack; + + /* This is an in order data segment _or_ an out-of-order SKB being + * moved to the receive queue, so we know this removed SKB will eat + * from the front of a SACK. + */ + for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { + /* Check if the start of the sack is covered by skb. */ + if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) && + before(sp->start_seq, TCP_SKB_CB(skb)->end_seq)) + break; + } + + /* This should only happen if so many SACKs get built that some get + * pushed out before we get here, or we eat some in sequence packets + * which are before the first SACK block. + */ + if(this_sack >= num_sacks) + return; + + sp->start_seq = TCP_SKB_CB(skb)->end_seq; + if(!before(sp->start_seq, sp->end_seq)) { + /* Zap this SACK, by moving forward any other SACKS. */ + for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) { + struct tcp_sack_block *next = (sp + 1); + sp->start_seq = next->start_seq; + sp->end_seq = next->end_seq; + } + tp->num_sacks--; + } +} + +static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb) +{ + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int num_sacks = tp->num_sacks; + int this_sack; + + for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { + if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq) + break; + } + if(this_sack >= num_sacks) + return; + sp->end_seq = TCP_SKB_CB(new_skb)->end_seq; +} + +/* This one checks to see if we can put data from the + * out_of_order queue into the receive_queue. + */ +static void tcp_ofo_queue(struct sock *sk) +{ + struct sk_buff *skb; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + while ((skb = skb_peek(&tp->out_of_order_queue))) { + if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + break; + + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + SOCK_DEBUG(sk, "ofo packet was already received \n"); + __skb_unlink(skb, skb->list); + kfree_skb(skb); + continue; + } + SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); + + if(tp->sack_ok) + tcp_sack_remove_skb(tp, skb); + __skb_unlink(skb, skb->list); + __skb_queue_tail(&sk->receive_queue, skb); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if(skb->h.th->fin) + tcp_fin(skb, sk, skb->h.th); + } +} + +static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *skb1; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* Queue data for delivery to the user. + * Packets in sequence go to the receive queue. + * Out of sequence packets to the out_of_order_queue. + */ + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + /* Ok. In sequence. */ + queue_and_out: + dst_confirm(sk->dst_cache); + __skb_queue_tail(&sk->receive_queue, skb); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if(skb->h.th->fin) { + tcp_fin(skb, sk, skb->h.th); + } else { + tcp_remember_ack(tp, skb->h.th, skb); + } + /* This may have eaten into a SACK block. */ + if(tp->sack_ok && tp->num_sacks) + tcp_sack_remove_skb(tp, skb); + tcp_ofo_queue(sk); + + /* Turn on fast path. 
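tcp_ofo_queue() above drains the out-of-order queue whenever rcv_nxt catches up with its first segment, advancing rcv_nxt past each segment it moves into the receive queue and discarding stale duplicates. A compact array-based sketch of that loop; sk_buff queues, SACK bookkeeping and sequence wraparound are omitted, and the names are invented.

#include <stdio.h>
#include <stdint.h>

struct seg { uint32_t seq, end_seq; };

/* Illustrative only: drain queued out-of-order segments that have become
 * contiguous with the in-order stream; returns the new rcv_nxt.
 * The ofo array is assumed sorted by sequence number.
 */
static uint32_t drain_ofo(uint32_t rcv_nxt, struct seg *ofo, int *n_ofo)
{
	while (*n_ofo > 0 && ofo[0].seq <= rcv_nxt) {
		int i;

		if (ofo[0].end_seq > rcv_nxt)	/* not a stale duplicate */
			rcv_nxt = ofo[0].end_seq;	/* deliver it, advance the edge */

		for (i = 1; i < *n_ofo; i++)	/* pop the head */
			ofo[i - 1] = ofo[i];
		(*n_ofo)--;
	}
	return rcv_nxt;
}

int main(void)
{
	struct seg ofo[] = { { 2000, 3000 }, { 3000, 4000 }, { 5000, 6000 } };
	int n = 3;
	uint32_t rcv_nxt = 2000;	/* the missing 1000-2000 segment just arrived */

	rcv_nxt = drain_ofo(rcv_nxt, ofo, &n);
	printf("rcv_nxt=%u, %d segment(s) still out of order\n", rcv_nxt, n);
	return 0;
}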
*/ + if (skb_queue_len(&tp->out_of_order_queue) == 0) + tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) | + (0x10 << 16) | + tp->snd_wnd); + return; + } + + /* An old packet, either a retransmit or some packet got lost. */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + /* A retransmit, 2nd most common case. Force an imediate ack. */ + SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq); + tcp_enter_quickack_mode(tp); + kfree_skb(skb); + return; + } + + if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + /* Partial packet, seq < rcv_next < end_seq */ + SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); + + goto queue_and_out; + } + + /* Ok. This is an out_of_order segment, force an ack. */ + tp->delayed_acks++; + tcp_enter_quickack_mode(tp); + + /* Disable header prediction. */ + tp->pred_flags = 0; + + SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + + if (skb_peek(&tp->out_of_order_queue) == NULL) { + /* Initial out of order segment, build 1 SACK. */ + if(tp->sack_ok) { + tp->num_sacks = 1; + tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; + tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq; + } + __skb_queue_head(&tp->out_of_order_queue,skb); + } else { + for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) { + /* Already there. */ + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) { + if (skb->len >= skb1->len) { + if(tp->sack_ok) + tcp_sack_extend(tp, skb1, skb); + __skb_append(skb1, skb); + __skb_unlink(skb1, skb1->list); + kfree_skb(skb1); + } else { + /* A duplicate, smaller than what is in the + * out-of-order queue right now, toss it. + */ + kfree_skb(skb); + } + break; + } + + if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) { + __skb_append(skb1, skb); + if(tp->sack_ok) + tcp_sack_new_ofo_skb(sk, skb); + break; + } + + /* See if we've hit the start. If so insert. */ + if (skb1 == skb_peek(&tp->out_of_order_queue)) { + __skb_queue_head(&tp->out_of_order_queue,skb); + if(tp->sack_ok) + tcp_sack_new_ofo_skb(sk, skb); + break; + } + } + } +} + + +/* + * This routine handles the data. If there is room in the buffer, + * it will be have already been moved into it. If there is no + * room, then we will just have to discard the packet. + */ + +static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) +{ + struct tcphdr *th; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + th = skb->h.th; + skb_pull(skb, th->doff*4); + skb_trim(skb, len - (th->doff*4)); + + if (skb->len == 0 && !th->fin) + return(0); + + /* + * If our receive queue has grown past its limits shrink it. + * Make sure to do this before moving snd_nxt, otherwise + * data might be acked for that we don't have enough room. + */ + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { + if (prune_queue(sk) < 0) { + /* Still not enough room. That can happen when + * skb->true_size differs significantly from skb->len. + */ + return 0; + } + } + + tcp_data_queue(sk, skb); + + if (before(tp->rcv_nxt, tp->copied_seq)) { + printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n"); + tp->rcv_nxt = tp->copied_seq; + } + + /* Above, tcp_data_queue() increments delayed_acks appropriately. + * Now tell the user we may have some data. 
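+	 *
+	 * All of the queueing decisions above rely on wraparound-safe 32-bit
+	 * sequence comparisons. The before()/after()/between() helpers used
+	 * here follow the usual signed-difference pattern; a standalone
+	 * sketch (renamed seq_* so as not to restate the kernel's macros):
+	 *
+	 *   #include <stdint.h>
+	 *
+	 *   // Ordering stays correct across the 2^32 wrap as long as the
+	 *   // two values are within 2^31 of each other.
+	 *   static int seq_before(uint32_t a, uint32_t b)
+	 *   {
+	 *           return (int32_t)(a - b) < 0;
+	 *   }
+	 *
+	 *   static int seq_after(uint32_t a, uint32_t b)
+	 *   {
+	 *           return (int32_t)(b - a) < 0;
+	 *   }
+	 *
+	 *   // True when lo <= x <= hi, modulo 2^32.
+	 *   static int seq_between(uint32_t x, uint32_t lo, uint32_t hi)
+	 *   {
+	 *           return hi - lo >= x - lo;
+	 *   }
+	 *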
+ */ + if (!sk->dead) { + sk->data_ready(sk,0); + } + return(1); +} + +static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) && + tcp_packets_in_flight(tp) < tp->snd_cwnd) { + /* Put more data onto the wire. */ + tcp_write_xmit(sk); + } else if (tp->packets_out == 0 && !tp->pending) { + /* Start probing the receivers window. */ + tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); + } +} + +static __inline__ void tcp_data_snd_check(struct sock *sk) +{ + struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head; + + if (skb != NULL) + __tcp_data_snd_check(sk, skb); +} + +/* + * Adapt the MSS value used to make delayed ack decision to the + * real world. + */ +static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + unsigned int len = skb->len, lss; + + if (len > tp->rcv_mss) + tp->rcv_mss = len; + lss = tp->last_seg_size; + tp->last_seg_size = 0; + if (len >= 536) { + if (len == lss) + tp->rcv_mss = len; + tp->last_seg_size = len; + } +} + +/* + * Check if sending an ack is needed. + */ +static __inline__ void __tcp_ack_snd_check(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* This also takes care of updating the window. + * This if statement needs to be simplified. + * + * Rules for delaying an ack: + * - delay time <= 0.5 HZ + * - we don't have a window update to send + * - must send at least every 2 full sized packets + * - must send an ACK if we have any out of order data + * + * With an extra heuristic to handle loss of packet + * situations and also helping the sender leave slow + * start in an expediant manner. + */ + + /* Two full frames received or... */ + if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) || + /* We will update the window "significantly" or... */ + tcp_raise_window(sk) || + /* We entered "quick ACK" mode or... */ + tcp_in_quickack_mode(tp) || + /* We have out of order data */ + (skb_peek(&tp->out_of_order_queue) != NULL)) { + /* Then ack it now */ + tcp_send_ack(sk); + } else { + /* Else, send delayed ack. */ + tcp_send_delayed_ack(tp, HZ/2); + } +} + +static __inline__ void tcp_ack_snd_check(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + if (tp->delayed_acks == 0) { + /* We sent a data segment already. */ + return; + } + __tcp_ack_snd_check(sk); +} + + +/* + * This routine is only called when we have urgent data + * signalled. Its the 'slow' part of tcp_urg. It could be + * moved inline now as tcp_urg is only called from one + * place. We handle URGent data wrong. We have to - as + * BSD still doesn't use the correction from RFC961. + * For 1003.1g we should support a new option TCP_STDURG to permit + * either form (or just set the sysctl tcp_stdurg). + */ + +static void tcp_check_urg(struct sock * sk, struct tcphdr * th) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + u32 ptr = ntohs(th->urg_ptr); + + if (ptr && !sysctl_tcp_stdurg) + ptr--; + ptr += ntohl(th->seq); + + /* Ignore urgent data that we've already seen and read. */ + if (after(tp->copied_seq, ptr)) + return; + + /* Do we already have a newer (or duplicate) urgent pointer? */ + if (tp->urg_data && !after(ptr, tp->urg_seq)) + return; + + /* Tell the world about our new urgent pointer. 
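+	 *
+	 * The pointer arithmetic above can be read in isolation as the
+	 * sketch below (host byte order; the helper name and the stdurg
+	 * flag parameter are made up for the example):
+	 *
+	 *   #include <stdint.h>
+	 *
+	 *   // Sequence number of the urgent byte. BSD historically points
+	 *   // one past the urgent byte, so the pointer is decremented
+	 *   // unless the RFC 1122 interpretation (tcp_stdurg) is requested.
+	 *   static uint32_t urgent_seq(uint32_t seg_seq, uint16_t urg_ptr,
+	 *                              int stdurg)
+	 *   {
+	 *           uint32_t ptr = urg_ptr;
+	 *
+	 *           if (ptr && !stdurg)
+	 *                   ptr--;  // compensate for the BSD off-by-one
+	 *           return seg_seq + ptr;
+	 *   }
+	 *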
*/ + if (sk->proc != 0) { + if (sk->proc > 0) + kill_proc(sk->proc, SIGURG, 1); + else + kill_pg(-sk->proc, SIGURG, 1); + } + + /* We may be adding urgent data when the last byte read was + * urgent. To do this requires some care. We cannot just ignore + * tp->copied_seq since we would read the last urgent byte again + * as data, nor can we alter copied_seq until this data arrives + * or we break the sematics of SIOCATMARK (and thus sockatmark()) + */ + if (tp->urg_seq == tp->copied_seq) + tp->copied_seq++; /* Move the copied sequence on correctly */ + tp->urg_data = URG_NOTYET; + tp->urg_seq = ptr; + + /* Disable header prediction. */ + tp->pred_flags = 0; +} + +/* This is the 'fast' part of urgent handling. */ +static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* Check if we get a new urgent pointer - normally not. */ + if (th->urg) + tcp_check_urg(sk,th); + + /* Do we wait for any urgent data? - normally not... */ + if (tp->urg_data == URG_NOTYET) { + u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4); + + /* Is the urgent pointer pointing into this packet? */ + if (ptr < len) { + tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th); + if (!sk->dead) + sk->data_ready(sk,0); + } + } +} + +/* Clean the out_of_order queue if we can, trying to get + * the socket within its memory limits again. + * + * Return less than zero if we should start dropping frames + * until the socket owning process reads some of the data + * to stabilize the situation. + */ +static int prune_queue(struct sock *sk) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct sk_buff * skb; + + SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); + + net_statistics.PruneCalled++; + + /* First, purge the out_of_order queue. */ + skb = __skb_dequeue_tail(&tp->out_of_order_queue); + if(skb != NULL) { + /* Free it all. */ + do { net_statistics.OfoPruned += skb->len; + kfree_skb(skb); + skb = __skb_dequeue_tail(&tp->out_of_order_queue); + } while(skb != NULL); + + /* Reset SACK state. A conforming SACK implementation will + * do the same at a timeout based retransmit. When a connection + * is in a sad state like this, we care only about integrity + * of the connection not performance. + */ + if(tp->sack_ok) + tp->num_sacks = 0; + } + + /* If we are really being abused, tell the caller to silently + * drop receive data on the floor. It will get retransmitted + * and hopefully then we'll have sufficient space. + * + * We used to try to purge the in-order packets too, but that + * turns out to be deadly and fraught with races. Consider: + * + * 1) If we acked the data, we absolutely cannot drop the + * packet. This data would then never be retransmitted. + * 2) It is possible, with a proper sequence of events involving + * delayed acks and backlog queue handling, to have the user + * read the data before it gets acked. The previous code + * here got this wrong, and it lead to data corruption. + * 3) Too much state changes happen when the FIN arrives, so once + * we've seen that we can't remove any in-order data safely. + * + * The net result is that removing in-order receive data is too + * complex for anyones sanity. So we don't do it anymore. But + * if we are really having our buffer space abused we stop accepting + * new receive data. + */ + if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1)) + return 0; + + /* Massive buffer overcommit. */ + return -1; +} + +/* + * TCP receive function for the ESTABLISHED state. 
+ * + * It is split into a fast path and a slow path. The fast path is + * disabled when: + * - A zero window was announced from us - zero window probing + * is only handled properly in the slow path. + * - Out of order segments arrived. + * - Urgent data is expected. + * - There is no buffer space left + * - Unexpected TCP flags/window values/header lengths are received + * (detected by checking the TCP header against pred_flags) + * - Data is sent in both directions. Fast path only supports pure senders + * or pure receivers (this means either the sequence number or the ack + * value must stay constant) + * + * When these conditions are not satisfied it drops into a standard + * receive procedure patterned after RFC793 to handle all cases. + * The first three cases are guaranteed by proper pred_flags setting, + * the rest is checked inline. Fast processing is turned on in + * tcp_data_queue when everything is OK. + */ +int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int queued; + u32 flg; + + /* + * Header prediction. + * The code follows the one in the famous + * "30 instruction TCP receive" Van Jacobson mail. + * + * Van's trick is to deposit buffers into socket queue + * on a device interrupt, to call tcp_recv function + * on the receive process context and checksum and copy + * the buffer to user space. smart... + * + * Our current scheme is not silly either but we take the + * extra cost of the net_bh soft interrupt processing... + * We do checksum and copy also but from device to kernel. + */ + + /* + * RFC1323: H1. Apply PAWS check first. + */ + if (tcp_fast_parse_options(sk, th, tp)) { + if (tp->saw_tstamp) { + if (tcp_paws_discard(tp, th, len)) { + tcp_statistics.TcpInErrs++; + if (!th->rst) { + tcp_send_ack(sk); + goto discard; + } + } + tcp_replace_ts_recent(sk, tp, + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); + } + } + + flg = *(((u32 *)th) + 3) & ~htonl(0xFC8 << 16); + + /* pred_flags is 0xS?10 << 16 + snd_wnd + * if header_predition is to be made + * 'S' will always be tp->tcp_header_len >> 2 + * '?' will be 0 else it will be !0 + * (when there are holes in the receive + * space for instance) + * PSH flag is ignored. + */ + + if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + if (len <= th->doff*4) { + /* Bulk data transfer: sender */ + if (len == th->doff*4) { + tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->ack_seq, len); + kfree_skb(skb); + tcp_data_snd_check(sk); + return 0; + } else { /* Header too small */ + tcp_statistics.TcpInErrs++; + goto discard; + } + } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una && + atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) { + /* Bulk data transfer: receiver */ + __skb_pull(skb,th->doff*4); + + tcp_measure_rcv_mss(sk, skb); + + /* DO NOT notify forward progress here. + * It saves dozen of CPU instructions in fast path. --ANK + */ + __skb_queue_tail(&sk->receive_queue, skb); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + + /* FIN bit check is not done since if FIN is set in + * this frame, the pred_flags won't match up. -DaveM + */ + sk->data_ready(sk, 0); + tcp_delack_estimator(tp); + + tcp_remember_ack(tp, th, skb); + + __tcp_ack_snd_check(sk); + return 0; + } + } + + /* + * Standard slow path. + */ + + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + /* RFC793, page 37: "In all states except SYN-SENT, all reset + * (RST) segments are validated by checking their SEQ-fields." 
+ * And page 69: "If an incoming segment is not acceptable, + * an acknowledgment should be sent in reply (unless the RST bit + * is set, if so drop the segment and return)". + */ + if (th->rst) + goto discard; + if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n", + TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + tp->rcv_wup, tp->rcv_wnd); + } + tcp_send_ack(sk); + goto discard; + } + + if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) { + SOCK_DEBUG(sk, "syn in established state\n"); + tcp_statistics.TcpInErrs++; + tcp_reset(sk); + return 1; + } + + if(th->rst) { + tcp_reset(sk); + goto discard; + } + + if(th->ack) + tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len); + + /* Process urgent data. */ + tcp_urg(sk, th, len); + + /* step 7: process the segment text */ + queued = tcp_data(skb, sk, len); + + /* This must be after tcp_data() does the skb_pull() to + * remove the header size from skb->len. + * + * Dave!!! Phrase above (and all about rcv_mss) has + * nothing to do with reality. rcv_mss must measure TOTAL + * size, including sacks, IP options etc. Hence, measure_rcv_mss + * must occure before pulling etc, otherwise it will flap + * like hell. Even putting it before tcp_data is wrong, + * it should use skb->tail - skb->nh.raw instead. + * --ANK (980805) + * + * BTW I broke it. Now all TCP options are handled equally + * in mss_clamp calculations (i.e. ignored, rfc1122), + * and mss_cache does include all of them (i.e. tstamps) + * except for sacks, to calulate effective mss faster. + * --ANK (980805) + */ + tcp_measure_rcv_mss(sk, skb); + + /* Be careful, tcp_data() may have put this into TIME_WAIT. */ + if(sk->state != TCP_CLOSE) { + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); + } + + if (!queued) { + discard: + kfree_skb(skb); + } + + return 0; +} + +/* + * Process an incoming SYN or SYN-ACK for SYN_RECV sockets represented + * as an open_request. + */ + +struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, + struct open_request *req) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + u32 flg; + + /* assumption: the socket is not in use. + * as we checked the user count on tcp_rcv and we're + * running from a soft interrupt. + */ + + /* Check for syn retransmission */ + flg = *(((u32 *)skb->h.th) + 3); + + flg &= __constant_htonl(0x00170000); + /* Only SYN set? */ + if (flg == __constant_htonl(0x00020000)) { + if (TCP_SKB_CB(skb)->seq == req->rcv_isn) { + /* retransmited syn. + */ + req->class->rtx_syn_ack(sk, req); + return NULL; + } else { + return sk; /* Pass new SYN to the listen socket. */ + } + } + + /* We know it's an ACK here */ + if (req->sk) { + /* socket already created but not + * yet accepted()... + */ + sk = req->sk; + } else { + /* In theory the packet could be for a cookie, but + * TIME_WAIT should guard us against this. + * XXX: Nevertheless check for cookies? + * This sequence number check is done again later, + * but we do it here to prevent syn flood attackers + * from creating big SYN_RECV sockets. 
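+	 *
+	 * The check performed just below can be summarised by the standalone
+	 * sketch that follows; the helper names are made up and the sequence
+	 * numbers are taken in host byte order:
+	 *
+	 *   #include <stdint.h>
+	 *
+	 *   // True when lo <= x <= hi, modulo 2^32.
+	 *   static int req_between(uint32_t x, uint32_t lo, uint32_t hi)
+	 *   {
+	 *           return hi - lo >= x - lo;
+	 *   }
+	 *
+	 *   // The completing ACK must acknowledge our SYN-ACK (snt_isn or
+	 *   // snt_isn + 1) and carry a sequence number inside the window we
+	 *   // advertised after the peer's ISN.
+	 *   static int ack_matches_request(uint32_t ack_seq, uint32_t seq,
+	 *                                  uint32_t snt_isn, uint32_t rcv_isn,
+	 *                                  uint32_t rcv_wnd)
+	 *   {
+	 *           return req_between(ack_seq, snt_isn, snt_isn + 1) &&
+	 *                  req_between(seq, rcv_isn, rcv_isn + 1 + rcv_wnd);
+	 *   }
+	 *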
+ */ + if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) || + !between(TCP_SKB_CB(skb)->seq, req->rcv_isn, + req->rcv_isn+1+req->rcv_wnd)) { + req->class->send_reset(skb); + return NULL; + } + + sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + tcp_dec_slow_timer(TCP_SLT_SYNACK); + if (sk == NULL) + return NULL; + + req->expires = 0UL; + req->sk = sk; + } + skb_orphan(skb); + skb_set_owner_r(skb, sk); + return sk; +} + +/* + * This function implements the receiving procedure of RFC 793 for + * all states except ESTABLISHED and TIME_WAIT. + * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be + * address independent. + */ + +int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int queued = 0; + + switch (sk->state) { + case TCP_CLOSE: + /* When state == CLOSED, hash lookup always fails. + * + * But, there is a back door, the backlog queue. + * If we have a sequence of packets in the backlog + * during __release_sock() which have a sequence such + * that: + * packet X causes entry to TCP_CLOSE state + * ... + * packet X + N has FIN bit set + * + * We report a (luckily) harmless error in this case. + * The issue is that backlog queue processing bypasses + * any hash lookups (we know which socket packets are for). + * The correct behavior here is what 2.0.x did, since + * a TCP_CLOSE socket does not exist. Drop the frame + * and send a RST back to the other end. + */ + return 1; + + case TCP_LISTEN: + /* These use the socket TOS.. + * might want to be the received TOS + */ + if(th->ack) + return 1; + + if(th->syn) { + if(tp->af_specific->conn_request(sk, skb, 0) < 0) + return 1; + + /* Now we have several options: In theory there is + * nothing else in the frame. KA9Q has an option to + * send data with the syn, BSD accepts data with the + * syn up to the [to be] advertised window and + * Solaris 2.1 gives you a protocol error. For now + * we just ignore it, that fits the spec precisely + * and avoids incompatibilities. It would be nice in + * future to drop through and process the data. + * + * Now that TTCP is starting to be used we ought to + * queue this data. + * But, this leaves one open to an easy denial of + * service attack, and SYN cookies can't defend + * against this problem. So, we drop the data + * in the interest of security over speed. + */ + goto discard; + } + + goto discard; + break; + + case TCP_SYN_SENT: + /* SYN sent means we have to look for a suitable ack and + * either reset for bad matches or go to connected. + * The SYN_SENT case is unusual and should + * not be in line code. [AC] + */ + if(th->ack) { + /* rfc793: + * "If the state is SYN-SENT then + * first check the ACK bit + * If the ACK bit is set + * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send + * a reset (unless the RST bit is set, if so drop + * the segment and return)" + * + * I cite this place to emphasize one essential + * detail, this check is different of one + * in established state: SND.UNA <= SEG.ACK <= SND.NXT. + * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT, + * because we have no previous data sent before SYN. + * --ANK(990513) + * + * We do not send data with SYN, so that RFC-correct + * test reduces to: + */ + if (sk->zapped || + TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) + return 1; + + /* Now ACK is acceptable. 
+ * + * "If the RST bit is set + * If the ACK was acceptable then signal the user "error: + * connection reset", drop the segment, enter CLOSED state, + * delete TCB, and return." + */ + + if (th->rst) { + tcp_reset(sk); + goto discard; + } + + /* rfc793: + * "fifth, if neither of the SYN or RST bits is set then + * drop the segment and return." + * + * See note below! + * --ANK(990513) + */ + + if (!th->syn) + goto discard; + + /* rfc793: + * "If the SYN bit is on ... + * are acceptable then ... + * (our SYN has been ACKed), change the connection + * state to ESTABLISHED..." + * + * Do you see? SYN-less ACKs in SYN-SENT state are + * completely ignored. + * + * The bug causing stalled SYN-SENT sockets + * was here: tcp_ack advanced snd_una and canceled + * retransmit timer, so that bare ACK received + * in SYN-SENT state (even with invalid ack==ISS, + * because tcp_ack check is too weak for SYN-SENT) + * causes moving socket to invalid semi-SYN-SENT, + * semi-ESTABLISHED state and connection hangs. + * + * There exist buggy stacks, which really send + * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp) + * Actually, if this host did not try to get something + * from ftp.inr.ac.ru I'd never find this bug 8) + * + * --ANK (990514) + */ + + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tcp_ack(sk,th, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->ack_seq, len); + + /* Ok.. it's good. Set up sequence numbers and + * move to established. + */ + tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1; + tp->rcv_wup = TCP_SKB_CB(skb)->seq+1; + + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. + */ + tp->snd_wnd = htons(th->window); + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; + tp->fin_seq = TCP_SKB_CB(skb)->seq; + + tcp_set_state(sk, TCP_ESTABLISHED); + tcp_parse_options(sk, th, tp, 0); + + if (tp->wscale_ok == 0) { + tp->snd_wscale = tp->rcv_wscale = 0; + tp->window_clamp = min(tp->window_clamp,65535); + } + + if (tp->tstamp_ok) { + tp->tcp_header_len = + sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else + tp->tcp_header_len = sizeof(struct tcphdr); + if (tp->saw_tstamp) { + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = tcp_time_stamp; + } + + /* Can't be earlier, doff would be wrong. */ + tcp_send_ack(sk); + + sk->dport = th->source; + tp->copied_seq = tp->rcv_nxt; + + if(!sk->dead) { + sk->state_change(sk); + sock_wake_async(sk->socket, 0); + } + } else { + if(th->syn && !th->rst) { + /* The previous version of the code + * checked for "connecting to self" + * here. that check is done now in + * tcp_connect. + */ + tcp_set_state(sk, TCP_SYN_RECV); + tcp_parse_options(sk, th, tp, 0); + if (tp->saw_tstamp) { + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = tcp_time_stamp; + } + + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. + */ + tp->snd_wnd = htons(th->window); + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + + tcp_send_synack(sk); + } else + break; + } + + /* tp->tcp_header_len and tp->mss_clamp + probably changed, synchronize mss. + */ + tcp_sync_mss(sk, tp->pmtu_cookie); + tp->rcv_mss = tp->mss_cache; + + if (sk->state == TCP_SYN_RECV) + goto discard; + + goto step6; + } + + /* Parse the tcp_options present on this header. + * By this point we really only expect timestamps. + * Note that this really has to be here and not later for PAWS + * (RFC1323) to work. 
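+	 *
+	 * The core of that PAWS test can be sketched standalone as below;
+	 * this is a simplification (the real check also special-cases RSTs),
+	 * the helper name is made up and the 24-day limit is expressed in
+	 * seconds for the example:
+	 *
+	 *   #include <stdint.h>
+	 *
+	 *   #define PAWS_IDLE_LIMIT (60 * 60 * 24 * 24)  // 24 days, seconds
+	 *
+	 *   // Reject a segment whose timestamp value is older than the most
+	 *   // recently seen one, provided the remembered timestamp is itself
+	 *   // fresh enough for the comparison to be meaningful.
+	 *   static int paws_reject(uint32_t rcv_tsval, uint32_t ts_recent,
+	 *                          long now, long ts_recent_stamp)
+	 *   {
+	 *           return (int32_t)(ts_recent - rcv_tsval) > 0 &&
+	 *                  now - ts_recent_stamp <= PAWS_IDLE_LIMIT;
+	 *   }
+	 *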
+ */ + if (tcp_fast_parse_options(sk, th, tp)) { + /* NOTE: assumes saw_tstamp is never set if we didn't + * negotiate the option. tcp_fast_parse_options() must + * guarantee this. + */ + if (tp->saw_tstamp) { + if (tcp_paws_discard(tp, th, len)) { + tcp_statistics.TcpInErrs++; + if (!th->rst) { + tcp_send_ack(sk); + goto discard; + } + } + tcp_replace_ts_recent(sk, tp, + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); + } + } + + /* The silly FIN test here is necessary to see an advancing ACK in + * retransmitted FIN frames properly. Consider the following sequence: + * + * host1 --> host2 FIN XSEQ:XSEQ(0) ack YSEQ + * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ + * host1 --> host2 XSEQ:XSEQ(0) ack YSEQ+1 + * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ+1 (fails tcp_sequence test) + * + * At this point the connection will deadlock with host1 believing + * that his FIN is never ACK'd, and thus it will retransmit it's FIN + * forever. The following fix is from Taral (taral@taral.net). + */ + + /* step 1: check sequence number */ + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq) && + !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)) { + if (!th->rst) { + tcp_send_ack(sk); + } + goto discard; + } + + /* step 2: check RST bit */ + if(th->rst) { + tcp_reset(sk); + goto discard; + } + + /* step 3: check security and precedence [ignored] */ + + /* step 4: + * + * Check for a SYN, and ensure it matches the SYN we were + * first sent. We have to handle the rather unusual (but valid) + * sequence that KA9Q derived products may generate of + * + * SYN + * SYN|ACK Data + * ACK (lost) + * SYN|ACK Data + More Data + * .. we must ACK not RST... + * + * We keep syn_seq as the sequence space occupied by the + * original syn. + */ + + if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) { + tcp_reset(sk); + return 1; + } + + /* step 5: check the ACK field */ + if (th->ack) { + int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->ack_seq, len); + + switch(sk->state) { + case TCP_SYN_RECV: + if (acceptable) { + tcp_set_state(sk, TCP_ESTABLISHED); + sk->dport = th->source; + tp->copied_seq = tp->rcv_nxt; + + if(!sk->dead) + sk->state_change(sk); + + tp->snd_una = TCP_SKB_CB(skb)->ack_seq; + tp->snd_wnd = htons(th->window) << tp->snd_wscale; + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; + + } else { + SOCK_DEBUG(sk, "bad ack\n"); + return 1; + } + break; + + case TCP_FIN_WAIT1: + if (tp->snd_una == tp->write_seq) { + sk->shutdown |= SEND_SHUTDOWN; + tcp_set_state(sk, TCP_FIN_WAIT2); + if (!sk->dead) + sk->state_change(sk); + else + tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); + } + break; + + case TCP_CLOSING: + if (tp->snd_una == tp->write_seq) { + tcp_time_wait(sk); + goto discard; + } + break; + + case TCP_LAST_ACK: + if (tp->snd_una == tp->write_seq) { + sk->shutdown = SHUTDOWN_MASK; + tcp_set_state(sk,TCP_CLOSE); + if (!sk->dead) + sk->state_change(sk); + goto discard; + } + break; + } + } else + goto discard; + +step6: + /* step 6: check the URG bit */ + tcp_urg(sk, th, len); + + /* step 7: process the segment text */ + switch (sk->state) { + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq)) + break; + + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + /* RFC 793 says to queue data in these states, + * RFC 1122 says we MUST send a reset. + * BSD 4.4 also does reset. 
+ */ + if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) { + if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + tcp_reset(sk); + return 1; + } + } + + case TCP_ESTABLISHED: + queued = tcp_data(skb, sk, len); + + /* This must be after tcp_data() does the skb_pull() to + * remove the header size from skb->len. + */ + tcp_measure_rcv_mss(sk, skb); + break; + } + + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); + + if (!queued) { +discard: + kfree_skb(skb); + } + return 0; +} diff --git a/pfinet/linux-src/net/ipv4/tcp_ipv4.c b/pfinet/linux-src/net/ipv4/tcp_ipv4.c new file mode 100644 index 00000000..c2c78365 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/tcp_ipv4.c @@ -0,0 +1,2044 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_ipv4.c,v 1.175.2.10 1999/08/13 16:14:35 davem Exp $ + * + * IPv4 specific functions + * + * + * code split from: + * linux/ipv4/tcp.c + * linux/ipv4/tcp_input.c + * linux/ipv4/tcp_output.c + * + * See tcp.c for author information + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * David S. Miller : New socket lookup architecture. + * This code is dedicated to John Dyson. + * David S. Miller : Change semantics of established hash, + * half is devoted to TIME_WAIT sockets + * and the rest go in the other half. + * Andi Kleen : Add support for syncookies and fixed + * some bugs: ip options weren't passed to + * the TCP layer, missed a check for an ACK bit. + * Andi Kleen : Implemented fast path mtu discovery. + * Fixed many serious bugs in the + * open_request handling and moved + * most of it into the af independent code. + * Added tail drop and some other bugfixes. + * Added new listen sematics. + * Mike McLagan : Routing by source + * Juan Jose Ciarlante: ip_dynaddr bits + * Andi Kleen: various fixes. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Andi Kleen : Fix new listen. + * Andi Kleen : Fix accept error reporting. + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/random.h> +#include <linux/init.h> +#include <linux/ipsec.h> + +#include <net/icmp.h> +#include <net/tcp.h> +#include <net/ipv6.h> + +#include <asm/segment.h> + +#include <linux/inet.h> +#include <linux/stddef.h> + +extern int sysctl_tcp_timestamps; +extern int sysctl_tcp_window_scaling; +extern int sysctl_tcp_sack; +extern int sysctl_tcp_syncookies; +extern int sysctl_ip_dynaddr; +extern __u32 sysctl_wmem_max; +extern __u32 sysctl_rmem_max; + +/* Check TCP sequence numbers in ICMP packets. */ +#define ICMP_MIN_LENGTH 8 + +/* Socket used for sending RSTs */ +struct inode tcp_inode; +struct socket *tcp_socket=&tcp_inode.u.socket_i; + +static void tcp_v4_send_reset(struct sk_buff *skb); + +void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, + struct sk_buff *skb); + +/* This is for sockets with full identity only. 
Sockets here will always + * be without wildcards and will have the following invariant: + * TCP_ESTABLISHED <= sk->state < TCP_CLOSE + * + * First half of the table is for sockets not in TIME_WAIT, second half + * is for TIME_WAIT sockets only. + */ +struct sock *tcp_established_hash[TCP_HTABLE_SIZE]; + +/* Ok, let's try this, I give up, we do need a local binding + * TCP hash as well as the others for fast bind/connect. + */ +struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE]; + +/* All sockets in TCP_LISTEN state will be in here. This is the only table + * where wildcard'd TCP sockets can exist. Hash function here is just local + * port number. + */ +struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE]; + +/* Register cache. */ +struct sock *tcp_regs[TCP_NUM_REGS]; + +/* + * This array holds the first and last local port number. + * For high-usage systems, use sysctl to change this to + * 32768-61000 + */ +int sysctl_local_port_range[2] = { 1024, 4999 }; +int tcp_port_rover = (1024 - 1); + +static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, + __u32 faddr, __u16 fport) +{ + return ((laddr ^ lport) ^ (faddr ^ fport)) & ((TCP_HTABLE_SIZE/2) - 1); +} + +static __inline__ int tcp_sk_hashfn(struct sock *sk) +{ + __u32 laddr = sk->rcv_saddr; + __u16 lport = sk->num; + __u32 faddr = sk->daddr; + __u16 fport = sk->dport; + + return tcp_hashfn(laddr, lport, faddr, fport); +} + +/* Allocate and initialize a new TCP local port bind bucket. + * Always runs inside the socket hashing lock. + */ +struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum) +{ + struct tcp_bind_bucket *tb; + + tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC); + if(tb != NULL) { + struct tcp_bind_bucket **head = + &tcp_bound_hash[tcp_bhashfn(snum)]; + tb->port = snum; + tb->fastreuse = 0; + tb->owners = NULL; + if((tb->next = *head) != NULL) + tb->next->pprev = &tb->next; + *head = tb; + tb->pprev = head; + } + return tb; +} + +#ifdef CONFIG_IP_TRANSPARENT_PROXY +/* Ensure that the bound bucket for the port exists. + * Return 0 and bump bucket reference count on success. + * + * Must run in a BH atomic section. + */ +static __inline__ int __tcp_bucket_check(unsigned short snum) +{ + struct tcp_bind_bucket *tb; + + tb = tcp_bound_hash[tcp_bhashfn(snum)]; + for( ; (tb && (tb->port != snum)); tb = tb->next) + ; + if (tb == NULL) { + if ((tb = tcp_bucket_create(snum)) == NULL) + return 1; + } + + return 0; +} +#endif + +static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) +{ + struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev; + +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (child->num != sk->num) { + unsigned short snum = ntohs(child->num); + for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; + tb && tb->port != snum; + tb = tb->next) + ; + if (tb == NULL) + tb = (struct tcp_bind_bucket *)sk->prev; + } +#endif + if ((child->bind_next = tb->owners) != NULL) + tb->owners->bind_pprev = &child->bind_next; + tb->owners = child; + child->bind_pprev = &tb->owners; + child->prev = (struct sock *) tb; +} + +__inline__ void tcp_inherit_port(struct sock *sk, struct sock *child) +{ + SOCKHASH_LOCK(); + __tcp_inherit_port(sk, child); + SOCKHASH_UNLOCK(); +} + +/* Obtain a reference to a local port for the given sock, + * if snum is zero it means select any available local port. 
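+ *
+ * The rover walk below can be read as the following standalone sketch;
+ * the helper names and the port_in_use() predicate (standing in for the
+ * bind-hash walk) are made up for the example:
+ *
+ *   // Walk a rover through [low, high], wrapping around, and take the
+ *   // first port nobody is bound to. Returns 0 when the whole range
+ *   // has been exhausted.
+ *   static unsigned short pick_local_port(int *rover, int low, int high,
+ *                                         int (*port_in_use)(int port))
+ *   {
+ *           int remaining = high - low + 1;
+ *
+ *           do {
+ *                   (*rover)++;
+ *                   if (*rover < low || *rover > high)
+ *                           *rover = low;
+ *                   if (!port_in_use(*rover))
+ *                           return (unsigned short)*rover;
+ *           } while (--remaining > 0);
+ *           return 0;
+ *   }
+ *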
+ */ +static int tcp_v4_get_port(struct sock *sk, unsigned short snum) +{ + struct tcp_bind_bucket *tb; + + SOCKHASH_LOCK(); + if (snum == 0) { + int rover = tcp_port_rover; + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + + do { rover++; + if ((rover < low) || (rover > high)) + rover = low; + tb = tcp_bound_hash[tcp_bhashfn(rover)]; + for ( ; tb; tb = tb->next) + if (tb->port == rover) + goto next; + break; + next: + } while (--remaining > 0); + tcp_port_rover = rover; + + /* Exhausted local port range during search? */ + if (remaining <= 0) + goto fail; + + /* OK, here is the one we will use. */ + snum = rover; + tb = NULL; + } else { + for (tb = tcp_bound_hash[tcp_bhashfn(snum)]; + tb != NULL; + tb = tb->next) + if (tb->port == snum) + break; + } + if (tb != NULL && tb->owners != NULL) { + if (tb->fastreuse != 0 && sk->reuse != 0) { + goto success; + } else { + struct sock *sk2 = tb->owners; + int sk_reuse = sk->reuse; + + for( ; sk2 != NULL; sk2 = sk2->bind_next) { + if (sk->bound_dev_if == sk2->bound_dev_if) { + if (!sk_reuse || + !sk2->reuse || + sk2->state == TCP_LISTEN) { + if (!sk2->rcv_saddr || + !sk->rcv_saddr || + (sk2->rcv_saddr == sk->rcv_saddr)) + break; + } + } + } + /* If we found a conflict, fail. */ + if (sk2 != NULL) + goto fail; + } + } + if (tb == NULL && + (tb = tcp_bucket_create(snum)) == NULL) + goto fail; + if (tb->owners == NULL) { + if (sk->reuse && sk->state != TCP_LISTEN) + tb->fastreuse = 1; + else + tb->fastreuse = 0; + } else if (tb->fastreuse && + ((sk->reuse == 0) || (sk->state == TCP_LISTEN))) + tb->fastreuse = 0; +success: + sk->num = snum; + if ((sk->bind_next = tb->owners) != NULL) + tb->owners->bind_pprev = &sk->bind_next; + tb->owners = sk; + sk->bind_pprev = &tb->owners; + sk->prev = (struct sock *) tb; + + SOCKHASH_UNLOCK(); + return 0; + +fail: + SOCKHASH_UNLOCK(); + return 1; +} + +/* Get rid of any references to a local port held by the + * given sock. + */ +__inline__ void __tcp_put_port(struct sock *sk) +{ + struct tcp_bind_bucket *tb; + + tb = (struct tcp_bind_bucket *) sk->prev; + if (sk->bind_next) + sk->bind_next->bind_pprev = sk->bind_pprev; + *(sk->bind_pprev) = sk->bind_next; + sk->prev = NULL; + if (tb->owners == NULL) { + if (tb->next) + tb->next->pprev = tb->pprev; + *(tb->pprev) = tb->next; + kmem_cache_free(tcp_bucket_cachep, tb); + } +} + +void tcp_put_port(struct sock *sk) +{ + SOCKHASH_LOCK(); + __tcp_put_port(sk); + SOCKHASH_UNLOCK(); +} + +static __inline__ void __tcp_v4_hash(struct sock *sk) +{ + struct sock **skp; + + if(sk->state == TCP_LISTEN) + skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; + else + skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))]; + + if((sk->next = *skp) != NULL) + (*skp)->pprev = &sk->next; + *skp = sk; + sk->pprev = skp; +} + +static void tcp_v4_hash(struct sock *sk) +{ + if (sk->state != TCP_CLOSE) { + SOCKHASH_LOCK(); + __tcp_v4_hash(sk); + SOCKHASH_UNLOCK(); + } +} + +static void tcp_v4_unhash(struct sock *sk) +{ + SOCKHASH_LOCK(); + if(sk->pprev) { + if(sk->next) + sk->next->pprev = sk->pprev; + *sk->pprev = sk->next; + sk->pprev = NULL; + tcp_reg_zap(sk); + __tcp_put_port(sk); + } + SOCKHASH_UNLOCK(); +} + +/* Don't inline this cruft. Here are some nice properties to + * exploit here. The BSD API does not allow a listening TCP + * to specify the remote port nor the remote address for the + * connection. So always assume those are both wildcarded + * during the search since they can never be otherwise. 
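+ *
+ * The scoring done below can be sketched standalone as follows; the
+ * struct and helper names are made up for the example:
+ *
+ *   #include <stdint.h>
+ *
+ *   struct listener { uint32_t bound_addr; int bound_ifindex; };
+ *
+ *   // One point for the (already matched) port, one more for a bound
+ *   // address that matches the destination, one more for a matching
+ *   // bound interface; a bound field that mismatches disqualifies the
+ *   // candidate. A score of 3 is an exact match and wins outright.
+ *   static int listener_score(const struct listener *l,
+ *                             uint32_t daddr, int dif)
+ *   {
+ *           int score = 1;
+ *
+ *           if (l->bound_addr) {
+ *                   if (l->bound_addr != daddr)
+ *                           return -1;
+ *                   score++;
+ *           }
+ *           if (l->bound_ifindex) {
+ *                   if (l->bound_ifindex != dif)
+ *                           return -1;
+ *                   score++;
+ *           }
+ *           return score;
+ *   }
+ *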
+ */ +static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif) +{ + struct sock *sk; + struct sock *result = NULL; + int score, hiscore; + + hiscore=0; + for(sk = tcp_listening_hash[tcp_lhashfn(hnum)]; sk; sk = sk->next) { + if(sk->num == hnum) { + __u32 rcv_saddr = sk->rcv_saddr; + + score = 1; + if(rcv_saddr) { + if (rcv_saddr != daddr) + continue; + score++; + } + if (sk->bound_dev_if) { + if (sk->bound_dev_if != dif) + continue; + score++; + } + if (score == 3) + return sk; + if (score > hiscore) { + hiscore = score; + result = sk; + } + } + } + return result; +} + +/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so + * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM + * It is assumed that this code only gets called from within NET_BH. + */ +static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, + u32 saddr, u16 sport, + u32 daddr, u16 dport, int dif) +{ + TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) + __u16 hnum = ntohs(dport); + __u32 ports = TCP_COMBINED_PORTS(sport, hnum); + struct sock *sk; + int hash; + + /* Check TCP register quick cache first. */ + sk = TCP_RHASH(sport); + if(sk && TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) + goto hit; + + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ + hash = tcp_hashfn(daddr, hnum, saddr, sport); + for(sk = tcp_established_hash[hash]; sk; sk = sk->next) { + if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) { + if (sk->state == TCP_ESTABLISHED) + TCP_RHASH(sport) = sk; + goto hit; /* You sunk my battleship! */ + } + } + /* Must check for a TIME_WAIT'er before going to listener hash. */ + for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) + if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) + goto hit; + sk = tcp_v4_lookup_listener(daddr, hnum, dif); +hit: + return sk; +} + +__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) +{ + return __tcp_v4_lookup(0, saddr, sport, daddr, dport, dif); +} + +#ifdef CONFIG_IP_TRANSPARENT_PROXY +/* Cleaned up a little and adapted to new bind bucket scheme. + * Oddly, this should increase performance here for + * transparent proxy, as tests within the inner loop have + * been eliminated. -DaveM + */ +static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr, + unsigned short rnum, unsigned long laddr, + struct device *dev, unsigned short pnum, + int dif) +{ + struct sock *s, *result = NULL; + int badness = -1; + u32 paddr = 0; + unsigned short hnum = ntohs(num); + unsigned short hpnum = ntohs(pnum); + int firstpass = 1; + + if(dev && dev->ip_ptr) { + struct in_device *idev = dev->ip_ptr; + + if(idev->ifa_list) + paddr = idev->ifa_list->ifa_local; + } + + /* This code must run only from NET_BH. 
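+ *
+ * For reference, the non-proxy demultiplexing above (tcp_hashfn and
+ * __tcp_v4_lookup) reduces the connection 4-tuple to a bucket roughly
+ * as sketched below; the table size constant is illustrative and must
+ * be a power of two:
+ *
+ *   #include <stdint.h>
+ *
+ *   #define EHASH_SIZE 512  // illustrative
+ *
+ *   // Fold the 4-tuple into the first half of the established hash;
+ *   // the second half of the table is reserved for TIME_WAIT sockets.
+ *   static unsigned int ehashfn(uint32_t laddr, uint16_t lport,
+ *                               uint32_t faddr, uint16_t fport)
+ *   {
+ *           return ((laddr ^ lport) ^ (faddr ^ fport)) &
+ *                  ((EHASH_SIZE / 2) - 1);
+ *   }
+ *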
*/ + { + struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hnum)]; + for( ; (tb && tb->port != hnum); tb = tb->next) + ; + if(tb == NULL) + goto next; + s = tb->owners; + } +pass2: + for(; s; s = s->bind_next) { + int score = 0; + if(s->rcv_saddr) { + if((s->num != hpnum || s->rcv_saddr != paddr) && + (s->num != hnum || s->rcv_saddr != laddr)) + continue; + score++; + } + if(s->daddr) { + if(s->daddr != raddr) + continue; + score++; + } + if(s->dport) { + if(s->dport != rnum) + continue; + score++; + } + if(s->bound_dev_if) { + if(s->bound_dev_if != dif) + continue; + score++; + } + if(score == 4 && s->num == hnum) { + result = s; + goto gotit; + } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) { + result = s; + badness = score; + } + } +next: + if(firstpass--) { + struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hpnum)]; + for( ; (tb && tb->port != hpnum); tb = tb->next) + ; + if(tb) { + s = tb->owners; + goto pass2; + } + } +gotit: + return result; +} +#endif /* CONFIG_IP_TRANSPARENT_PROXY */ + +static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) +{ + return secure_tcp_sequence_number(sk->saddr, sk->daddr, + skb->h.th->dest, + skb->h.th->source); +} + +/* Check that a TCP address is unique, don't allow multiple + * connects to/from the same address. Actually we can optimize + * quite a bit, since the socket about to connect is still + * in TCP_CLOSE, a tcp_bind_bucket for the local port he will + * use will exist, with a NULL owners list. So check for that. + * The good_socknum and verify_bind scheme we use makes this + * work. + */ +static int tcp_v4_unique_address(struct sock *sk) +{ + struct tcp_bind_bucket *tb; + unsigned short snum = sk->num; + int retval = 1; + + /* Freeze the hash while we snoop around. */ + SOCKHASH_LOCK(); + tb = tcp_bound_hash[tcp_bhashfn(snum)]; + for(; tb; tb = tb->next) { + if(tb->port == snum && tb->owners != NULL) { + /* Almost certainly the re-use port case, search the real hashes + * so it actually scales. + */ + sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dport, + sk->rcv_saddr, htons(snum), + sk->bound_dev_if); + if((sk != NULL) && (sk->state != TCP_LISTEN)) + retval = 0; + break; + } + } + SOCKHASH_UNLOCK(); + return retval; +} + +/* This will initiate an outgoing connection. */ +int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; + struct sk_buff *buff; + struct rtable *rt; + u32 daddr, nexthop; + int tmp; + + if (sk->state != TCP_CLOSE) + return(-EISCONN); + + /* Don't allow a double connect. 
*/ + if (sk->daddr) + return -EINVAL; + + if (addr_len < sizeof(struct sockaddr_in)) + return(-EINVAL); + + if (usin->sin_family != AF_INET) { + static int complained; + if (usin->sin_family) + return(-EAFNOSUPPORT); + if (!complained++) + printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm); + } + + nexthop = daddr = usin->sin_addr.s_addr; + if (sk->opt && sk->opt->srr) { + if (daddr == 0) + return -EINVAL; + nexthop = sk->opt->faddr; + } + + tmp = ip_route_connect(&rt, nexthop, sk->saddr, + RT_TOS(sk->ip_tos)|RTO_CONN|sk->localroute, sk->bound_dev_if); + if (tmp < 0) + return tmp; + + if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) { + ip_rt_put(rt); + return -ENETUNREACH; + } + + dst_release(xchg(&sk->dst_cache, rt)); + + buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), + 0, GFP_KERNEL); + + if (buff == NULL) + return -ENOBUFS; + + /* Socket has no identity, so lock_sock() is useless. Also + * since state==TCP_CLOSE (checked above) the socket cannot + * possibly be in the hashes. TCP hash locking is only + * needed while checking quickly for a unique address. + * However, the socket does need to be (and is) locked + * in tcp_connect(). + * Perhaps this addresses all of ANK's concerns. 8-) -DaveM + */ + sk->dport = usin->sin_port; + sk->daddr = rt->rt_dst; + if (sk->opt && sk->opt->srr) + sk->daddr = daddr; + if (!sk->saddr) + sk->saddr = rt->rt_src; + sk->rcv_saddr = sk->saddr; + + if (!tcp_v4_unique_address(sk)) { + kfree_skb(buff); + sk->daddr = 0; + return -EADDRNOTAVAIL; + } + + tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, + sk->sport, usin->sin_port); + + tp->ext_header_len = 0; + if (sk->opt) + tp->ext_header_len = sk->opt->optlen; + + /* Reset mss clamp */ + tp->mss_clamp = ~0; + + if (!ip_dont_fragment(sk, &rt->u.dst) && + rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) { + /* Clamp mss at maximum of 536 and user_mss. + Probably, user ordered to override tiny segment size + in gatewayed case. + */ + tp->mss_clamp = max(tp->user_mss, 536); + } + + tcp_connect(sk, buff, rt->u.dst.pmtu); + return 0; +} + +static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len) +{ + int retval = -EINVAL; + + /* Do sanity checking for sendmsg/sendto/send. */ + if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL)) + goto out; + if (msg->msg_name) { + struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name; + + if (msg->msg_namelen < sizeof(*addr)) + goto out; + if (addr->sin_family && addr->sin_family != AF_INET) + goto out; + retval = -ENOTCONN; + if(sk->state == TCP_CLOSE) + goto out; + retval = -EISCONN; + if (addr->sin_port != sk->dport) + goto out; + if (addr->sin_addr.s_addr != sk->daddr) + goto out; + } + retval = tcp_do_sendmsg(sk, msg); + +out: + return retval; +} + + +/* + * Do a linear search in the socket open_request list. + * This should be replaced with a global hash table. + */ +static struct open_request *tcp_v4_search_req(struct tcp_opt *tp, + struct iphdr *iph, + struct tcphdr *th, + struct open_request **prevp) +{ + struct open_request *req, *prev; + __u16 rport = th->source; + + /* assumption: the socket is not in use. + * as we checked the user count on tcp_rcv and we're + * running from a soft interrupt. 
+ */ + prev = (struct open_request *) (&tp->syn_wait_queue); + for (req = prev->dl_next; req; req = req->dl_next) { + if (req->af.v4_req.rmt_addr == iph->saddr && + req->af.v4_req.loc_addr == iph->daddr && + req->rmt_port == rport +#ifdef CONFIG_IP_TRANSPARENT_PROXY + && req->lcl_port == th->dest +#endif + ) { + *prevp = prev; + return req; + } + prev = req; + } + return NULL; +} + + +/* + * This routine does path mtu discovery as defined in RFC1191. + */ +static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + if (atomic_read(&sk->sock_readers)) + return; + + /* Don't interested in TCP_LISTEN and open_requests (SYN-ACKs + * send out by Linux are always <576bytes so they should go through + * unfragmented). + */ + if (sk->state == TCP_LISTEN) + return; + + /* We don't check in the destentry if pmtu discovery is forbidden + * on this route. We just assume that no packet_to_big packets + * are send back when pmtu discovery is not active. + * There is a small race when the user changes this flag in the + * route, but I think that's acceptable. + */ + if (sk->dst_cache == NULL) + return; + ip_rt_update_pmtu(sk->dst_cache, mtu); + if (sk->ip_pmtudisc != IP_PMTUDISC_DONT && + tp->pmtu_cookie > sk->dst_cache->pmtu) { + tcp_sync_mss(sk, sk->dst_cache->pmtu); + + /* Resend the TCP packet because it's + * clear that the old packet has been + * dropped. This is the new "fast" path mtu + * discovery. + */ + tcp_simple_retransmit(sk); + } /* else let the usual retransmit timer handle it */ +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. If err < 0 then the socket should + * be closed and the error returned to the user. If err > 0 + * it's just the icmp type << 8 | icmp code. After adjustment + * header points to the first 8 bytes of the tcp header. We need + * to find the appropriate port. + * + * The locking strategy used here is very "optimistic". When + * someone else accesses the socket the ICMP is just dropped + * and for some paths there is no check at all. + * A more general error queue to queue errors for later handling + * is probably better. + * + * sk->err and sk->err_soft should be atomic_t. 
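+ *
+ * One concrete piece of that handling, the fast-path MTU reaction in
+ * do_pmtu_discovery() above, can be sketched standalone as below; the
+ * callback parameters stand in for the real route and socket operations
+ * and are made up for the example:
+ *
+ *   // When an ICMP "fragmentation needed" reports an MTU smaller than
+ *   // the one in use, shrink the MSS and retransmit right away rather
+ *   // than waiting for the retransmit timer.
+ *   static void react_to_frag_needed(unsigned int reported_mtu,
+ *                                    unsigned int *cached_pmtu,
+ *                                    void (*sync_mss)(unsigned int mtu),
+ *                                    void (*retransmit_now)(void))
+ *   {
+ *           if (reported_mtu < *cached_pmtu) {
+ *                   *cached_pmtu = reported_mtu;
+ *                   sync_mss(reported_mtu);
+ *                   retransmit_now();  // dropped segment will not be acked
+ *           }
+ *           // else: let the usual retransmit timer handle it
+ *   }
+ *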
+ */ + +void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) +{ + struct iphdr *iph = (struct iphdr*)dp; + struct tcphdr *th; + struct tcp_opt *tp; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; +#if ICMP_MIN_LENGTH < 14 + int no_flags = 0; +#else +#define no_flags 0 +#endif + struct sock *sk; + __u32 seq; + int err; + + if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) { + icmp_statistics.IcmpInErrors++; + return; + } +#if ICMP_MIN_LENGTH < 14 + if (len < (iph->ihl << 2) + 14) + no_flags = 1; +#endif + + th = (struct tcphdr*)(dp+(iph->ihl<<2)); + + sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex); + if (sk == NULL || sk->state == TCP_TIME_WAIT) { + icmp_statistics.IcmpInErrors++; + return; + } + + tp = &sk->tp_pinfo.af_tcp; + seq = ntohl(th->seq); + if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { + net_statistics.OutOfWindowIcmps++; + return; + } + + switch (type) { + case ICMP_SOURCE_QUENCH: +#ifndef OLD_SOURCE_QUENCH /* This is deprecated */ + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd_cnt = 0; + tp->high_seq = tp->snd_nxt; +#endif + return; + case ICMP_PARAMETERPROB: + err = EPROTO; + break; + case ICMP_DEST_UNREACH: + if (code > NR_ICMP_UNREACH) + return; + + if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ + do_pmtu_discovery(sk, iph, ntohs(skb->h.icmph->un.frag.mtu)); + return; + } + + err = icmp_err_convert[code].errno; + break; + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + default: + return; + } + + switch (sk->state) { + struct open_request *req, *prev; + case TCP_LISTEN: + /* Prevent race conditions with accept() - + * ICMP is unreliable. + */ + if (atomic_read(&sk->sock_readers)) { + net_statistics.LockDroppedIcmps++; + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + */ + return; + } + + /* The final ACK of the handshake should be already + * handled in the new socket context, not here. + * Strictly speaking - an ICMP error for the final + * ACK should set the opening flag, but that is too + * complicated right now. + */ + if (!no_flags && !th->syn && !th->ack) + return; + + req = tcp_v4_search_req(tp, iph, th, &prev); + if (!req) + return; + if (seq != req->snt_isn) { + net_statistics.OutOfWindowIcmps++; + return; + } + if (req->sk) { + /* + * Already in ESTABLISHED and a big socket is created, + * set error code there. + * The error will _not_ be reported in the accept(), + * but only with the next operation on the socket after + * accept. + */ + sk = req->sk; + } else { + /* + * Still in SYN_RECV, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + tp->syn_backlog--; + tcp_synq_unlink(tp, req, prev); + req->class->destructor(req); + tcp_openreq_free(req); + return; + } + break; + case TCP_SYN_SENT: + case TCP_SYN_RECV: /* Cannot happen */ + if (!no_flags && !th->syn) + return; + tcp_statistics.TcpAttemptFails++; + sk->err = err; + sk->zapped = 1; + mb(); + sk->error_report(sk); + return; + } + + /* If we've already connected we will keep trying + * until we time out, or the user gives up. + * + * rfc1122 4.2.3.9 allows to consider as hard errors + * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, + * but it is obsoleted by pmtu discovery). 
+ * + * Note, that in modern internet, where routing is unreliable + * and in each dark corner broken firewalls sit, sending random + * errors ordered by their masters even this two messages finally lose + * their original sense (even Linux sends invalid PORT_UNREACHs) + * + * Now we are in compliance with RFCs. + * --ANK (980905) + */ + + if (sk->ip_recverr) { + /* This code isn't serialized with the socket code */ + /* ANK (980927) ... which is harmless now, + sk->err's may be safely lost. + */ + sk->err = err; + mb(); + sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ + } else { /* Only an error on timeout */ + sk->err_soft = err; + mb(); + } +} + +/* This routine computes an IPv4 TCP checksum. */ +void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, + struct sk_buff *skb) +{ + th->check = 0; + th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr, + csum_partial((char *)th, th->doff<<2, skb->csum)); +} + +/* + * This routine will send an RST to the other tcp. + * + * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) + * for reset. + * Answer: if a packet caused RST, it is not for a socket + * existing in our system, if it is matched to a socket, + * it is just duplicate segment or bug in other side's TCP. + * So that we build reply only basing on parameters + * arrived with segment. + * Exception: precedence violation. We do not implement it in any case. + */ + +static void tcp_v4_send_reset(struct sk_buff *skb) +{ + struct tcphdr *th = skb->h.th; + struct tcphdr rth; + struct ip_reply_arg arg; + + /* Never send a reset in response to a reset. */ + if (th->rst) + return; + + if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL) { +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (((struct rtable*)skb->dst)->rt_type == RTN_UNICAST) + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); +#endif + return; + } + + /* Swap the send and the receive. */ + memset(&rth, 0, sizeof(struct tcphdr)); + rth.dest = th->source; + rth.source = th->dest; + rth.doff = sizeof(struct tcphdr)/4; + rth.rst = 1; + + if (th->ack) { + rth.seq = th->ack_seq; + } else { + rth.ack = 1; + rth.ack_seq = th->syn ? htonl(ntohl(th->seq)+1) : th->seq; + } + + memset(&arg, 0, sizeof arg); + arg.iov[0].iov_base = (unsigned char *)&rth; + arg.iov[0].iov_len = sizeof rth; + arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, + skb->nh.iph->saddr, /*XXX*/ + sizeof(struct tcphdr), + IPPROTO_TCP, + 0); + arg.n_iov = 1; + arg.csumoffset = offsetof(struct tcphdr, check) / 2; + + ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth); + + tcp_statistics.TcpOutSegs++; + tcp_statistics.TcpOutRsts++; +} + +/* + * Send an ACK for a socket less packet (needed for time wait) + * + * FIXME: Does not echo timestamps yet. + * + * Assumes that the caller did basic address and flag checks. + */ +static void tcp_v4_send_ack(struct sk_buff *skb, __u32 seq, __u32 ack, __u16 window) +{ + struct tcphdr *th = skb->h.th; + struct tcphdr rth; + struct ip_reply_arg arg; + + /* Swap the send and the receive. 
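+	 *
+	 * For the reset path just above (tcp_v4_send_reset), the RFC 793
+	 * numbering rule it applies can be sketched standalone as follows;
+	 * struct and helper names are made up and sequence numbers are in
+	 * host byte order:
+	 *
+	 *   #include <stdint.h>
+	 *
+	 *   struct rst_numbers { uint32_t seq, ack_seq; int ack; };
+	 *
+	 *   // If the offending segment carried an ACK, the reset reuses
+	 *   // that acknowledgment number as its own sequence number and
+	 *   // carries no ACK; otherwise it acknowledges the segment's
+	 *   // sequence number (plus one for a SYN).
+	 *   static struct rst_numbers rst_for(int had_ack, uint32_t seg_ack,
+	 *                                     uint32_t seg_seq, int had_syn)
+	 *   {
+	 *           struct rst_numbers r = { 0, 0, 0 };
+	 *
+	 *           if (had_ack) {
+	 *                   r.seq = seg_ack;
+	 *           } else {
+	 *                   r.ack = 1;
+	 *                   r.ack_seq = seg_seq + (had_syn ? 1 : 0);
+	 *           }
+	 *           return r;
+	 *   }
+	 *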
*/ + memset(&rth, 0, sizeof(struct tcphdr)); + rth.dest = th->source; + rth.source = th->dest; + rth.doff = sizeof(struct tcphdr)/4; + + rth.seq = seq; + rth.ack_seq = ack; + rth.ack = 1; + + rth.window = htons(window); + + memset(&arg, 0, sizeof arg); + arg.iov[0].iov_base = (unsigned char *)&rth; + arg.iov[0].iov_len = sizeof rth; + arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, + skb->nh.iph->saddr, /*XXX*/ + sizeof(struct tcphdr), + IPPROTO_TCP, + 0); + arg.n_iov = 1; + arg.csumoffset = offsetof(struct tcphdr, check) / 2; + + ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth); + + tcp_statistics.TcpOutSegs++; +} + + +#ifdef CONFIG_IP_TRANSPARENT_PROXY + +/* + Seems, I never wrote nothing more stupid. + I hope Gods will forgive me, but I cannot forgive myself 8) + --ANK (981001) + */ + +static struct sock *tcp_v4_search_proxy_openreq(struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4); + struct sock *sk; + int i; + + for (i=0; i<TCP_LHTABLE_SIZE; i++) { + for(sk = tcp_listening_hash[i]; sk; sk = sk->next) { + struct open_request *dummy; + if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, iph, + th, &dummy) && + (!sk->bound_dev_if || + sk->bound_dev_if == skb->dev->ifindex)) + return sk; + } + } + return NULL; +} + +/* + * Check whether a received TCP packet might be for one of our + * connections. + */ + +int tcp_chkaddr(struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4); + struct sock *sk; + + sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr, + th->dest, skb->dev->ifindex); + + if (!sk) + return tcp_v4_search_proxy_openreq(skb) != NULL; + + if (sk->state == TCP_LISTEN) { + struct open_request *dummy; + if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, skb->nh.iph, + th, &dummy) && + (!sk->bound_dev_if || + sk->bound_dev_if == skb->dev->ifindex)) + return 1; + } + + /* 0 means accept all LOCAL addresses here, not all the world... */ + + if (sk->rcv_saddr == 0) + return 0; + + return 1; +} +#endif + +/* + * Send a SYN-ACK after having received an ACK. + * This still operates on a open_request only, not on a big + * socket. + */ +static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) +{ + struct rtable *rt; + struct ip_options *opt; + struct sk_buff * skb; + int mss; + + /* First, grab a route. */ + opt = req->af.v4_req.opt; + if(ip_route_output(&rt, ((opt && opt->srr) ? + opt->faddr : + req->af.v4_req.rmt_addr), + req->af.v4_req.loc_addr, + RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute, + sk->bound_dev_if)) { + ip_statistics.IpOutNoRoutes++; + return; + } + if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { + ip_rt_put(rt); + ip_statistics.IpOutNoRoutes++; + return; + } + + mss = rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr); + + skb = tcp_make_synack(sk, &rt->u.dst, req, mss); + if (skb) { + struct tcphdr *th = skb->h.th; + +#ifdef CONFIG_IP_TRANSPARENT_PROXY + th->source = req->lcl_port; /* LVE */ +#endif + + th->check = tcp_v4_check(th, skb->len, + req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr, + csum_partial((char *)th, skb->len, skb->csum)); + + ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, + req->af.v4_req.rmt_addr, req->af.v4_req.opt); + } + ip_rt_put(rt); +} + +/* + * IPv4 open_request destructor. 
+ */ +static void tcp_v4_or_free(struct open_request *req) +{ + if(!req->sk && req->af.v4_req.opt) + kfree_s(req->af.v4_req.opt, optlength(req->af.v4_req.opt)); +} + +static inline void syn_flood_warning(struct sk_buff *skb) +{ + static unsigned long warntime; + + if (jiffies - warntime > HZ*60) { + warntime = jiffies; + printk(KERN_INFO + "possible SYN flooding on port %d. Sending cookies.\n", + ntohs(skb->h.th->dest)); + } +} + +/* + * Save and compile IPv4 options into the open_request if needed. + */ +static inline struct ip_options * +tcp_v4_save_options(struct sock *sk, struct sk_buff *skb) +{ + struct ip_options *opt = &(IPCB(skb)->opt); + struct ip_options *dopt = NULL; + + if (opt && opt->optlen) { + int opt_size = optlength(opt); + dopt = kmalloc(opt_size, GFP_ATOMIC); + if (dopt) { + if (ip_options_echo(dopt, skb)) { + kfree_s(dopt, opt_size); + dopt = NULL; + } + } + } + return dopt; +} + +/* + * Maximum number of SYN_RECV sockets in queue per LISTEN socket. + * One SYN_RECV socket costs about 80bytes on a 32bit machine. + * It would be better to replace it with a global counter for all sockets + * but then some measure against one socket starving all other sockets + * would be needed. + */ +int sysctl_max_syn_backlog = 128; + +struct or_calltable or_ipv4 = { + tcp_v4_send_synack, + tcp_v4_or_free, + tcp_v4_send_reset +}; + +#define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */ +#define BACKLOGMAX(sk) sysctl_max_syn_backlog + +int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn) +{ + struct tcp_opt tp; + struct open_request *req; + struct tcphdr *th = skb->h.th; + __u32 saddr = skb->nh.iph->saddr; + __u32 daddr = skb->nh.iph->daddr; +#ifdef CONFIG_SYN_COOKIES + int want_cookie = 0; +#else +#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */ +#endif + + /* If the socket is dead, don't accept the connection. */ + if (sk->dead) + goto dead; + + /* Never answer to SYNs send to broadcast or multicast */ + if (((struct rtable *)skb->dst)->rt_flags & + (RTCF_BROADCAST|RTCF_MULTICAST)) + goto drop; + + /* XXX: Check against a global syn pool counter. */ + if (BACKLOG(sk) > BACKLOGMAX(sk)) { +#ifdef CONFIG_SYN_COOKIES + if (sysctl_tcp_syncookies) { + syn_flood_warning(skb); + want_cookie = 1; + } else +#endif + goto drop; + } else { + if (isn == 0) + isn = tcp_v4_init_sequence(sk, skb); + BACKLOG(sk)++; + } + + req = tcp_openreq_alloc(); + if (req == NULL) { + goto dropbacklog; + } + + req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ + + req->rcv_isn = TCP_SKB_CB(skb)->seq; + tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; + + tp.mss_clamp = 65535; + tcp_parse_options(NULL, th, &tp, want_cookie); + if (tp.mss_clamp == 65535) + tp.mss_clamp = 576 - sizeof(struct iphdr) - sizeof(struct iphdr); + + if (sk->tp_pinfo.af_tcp.user_mss && sk->tp_pinfo.af_tcp.user_mss < tp.mss_clamp) + tp.mss_clamp = sk->tp_pinfo.af_tcp.user_mss; + req->mss = tp.mss_clamp; + + if (tp.saw_tstamp) + req->ts_recent = tp.rcv_tsval; + req->tstamp_ok = tp.tstamp_ok; + req->sack_ok = tp.sack_ok; + req->snd_wscale = tp.snd_wscale; + req->wscale_ok = tp.wscale_ok; + req->rmt_port = th->source; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + req->lcl_port = th->dest ; /* LVE */ +#endif + req->af.v4_req.loc_addr = daddr; + req->af.v4_req.rmt_addr = saddr; + + /* Note that we ignore the isn passed from the TIME_WAIT + * state here. That's the price we pay for cookies. 
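The admission checks at the top of tcp_v4_conn_request() reduce to a small decision: a dead listener answers with a reset, SYNs to broadcast or multicast destinations are silently dropped, and a full per-socket SYN backlog either drops the SYN or falls back to SYN cookies when that option is compiled in. A hedged restatement as a standalone decision function; the enum and parameter names are invented for the example:

    enum syn_verdict { SYN_RESET, SYN_DROP, SYN_QUEUE, SYN_COOKIE };

    /* Condensed form of the checks in tcp_v4_conn_request().
     * backlog/max_backlog correspond to BACKLOG(sk)/BACKLOGMAX(sk). */
    static enum syn_verdict classify_syn(int listener_dead,
                                         int dst_is_bcast_or_mcast,
                                         int backlog, int max_backlog,
                                         int syncookies_on)
    {
        if (listener_dead)
            return SYN_RESET;             /* caller sends a reset */
        if (dst_is_bcast_or_mcast)
            return SYN_DROP;              /* never answer such SYNs */
        if (backlog > max_backlog)
            return syncookies_on ? SYN_COOKIE : SYN_DROP;
        return SYN_QUEUE;                 /* allocate an open_request and reply */
    }
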
+ */ + if (want_cookie) + isn = cookie_v4_init_sequence(sk, skb, &req->mss); + + req->snt_isn = isn; + + req->af.v4_req.opt = tcp_v4_save_options(sk, skb); + + req->class = &or_ipv4; + req->retrans = 0; + req->sk = NULL; + + tcp_v4_send_synack(sk, req); + + if (want_cookie) { + if (req->af.v4_req.opt) + kfree(req->af.v4_req.opt); + tcp_v4_or_free(req); + tcp_openreq_free(req); + } else { + req->expires = jiffies + TCP_TIMEOUT_INIT; + tcp_inc_slow_timer(TCP_SLT_SYNACK); + tcp_synq_queue(&sk->tp_pinfo.af_tcp, req); + } + + return 0; + +dead: + SOCK_DEBUG(sk, "Reset on %p: Connect on dead socket.\n",sk); + tcp_statistics.TcpAttemptFails++; + return -ENOTCONN; /* send reset */ + +dropbacklog: + if (!want_cookie) + BACKLOG(sk)--; +drop: + tcp_statistics.TcpAttemptFails++; + return 0; +} + +/* This is not only more efficient than what we used to do, it eliminates + * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM + * + * This function wants to be moved to a common for IPv[46] file. --ANK + */ +struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb) +{ + struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0); + + if(newsk != NULL) { + struct tcp_opt *newtp; +#ifdef CONFIG_FILTER + struct sk_filter *filter; +#endif + + memcpy(newsk, sk, sizeof(*newsk)); + newsk->sklist_next = NULL; + newsk->state = TCP_SYN_RECV; + + /* Clone the TCP header template */ + newsk->dport = req->rmt_port; + + atomic_set(&newsk->sock_readers, 0); + atomic_set(&newsk->rmem_alloc, 0); + skb_queue_head_init(&newsk->receive_queue); + atomic_set(&newsk->wmem_alloc, 0); + skb_queue_head_init(&newsk->write_queue); + atomic_set(&newsk->omem_alloc, 0); + + newsk->done = 0; + newsk->proc = 0; + skb_queue_head_init(&newsk->back_log); + skb_queue_head_init(&newsk->error_queue); +#ifdef CONFIG_FILTER + if ((filter = newsk->filter) != NULL) + sk_filter_charge(newsk, filter); +#endif + + /* Now setup tcp_opt */ + newtp = &(newsk->tp_pinfo.af_tcp); + newtp->pred_flags = 0; + newtp->rcv_nxt = req->rcv_isn + 1; + newtp->snd_nxt = req->snt_isn + 1; + newtp->snd_una = req->snt_isn + 1; + newtp->srtt = 0; + newtp->ato = 0; + newtp->snd_wl1 = req->rcv_isn; + newtp->snd_wl2 = req->snt_isn; + + /* RFC1323: The window in SYN & SYN/ACK segments + * is never scaled. + */ + newtp->snd_wnd = ntohs(skb->h.th->window); + + newtp->max_window = newtp->snd_wnd; + newtp->pending = 0; + newtp->retransmits = 0; + newtp->last_ack_sent = req->rcv_isn + 1; + newtp->backoff = 0; + newtp->mdev = TCP_TIMEOUT_INIT; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. 
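As the RFC 1323 comment in tcp_create_openreq_child() notes, window scaling never applies to the window field of a SYN or SYN-ACK; only segments after the handshake have their 16-bit window shifted by the agreed scale factor. A short illustrative helper showing how a received window field would be interpreted in each case (user-space C, names are not from this source):

    #include <stdint.h>
    #include <arpa/inet.h>

    /* Interpret the 16-bit window field of a received segment.
     * in_syn: segment has SYN set (window never scaled, RFC 1323).
     * snd_wscale: scale factor agreed during the handshake. */
    static uint32_t effective_window(uint16_t raw_window_net, int in_syn,
                                     int snd_wscale)
    {
        uint32_t w = ntohs(raw_window_net);
        return in_syn ? w : (w << snd_wscale);
    }
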
-DaveM + */ + newtp->snd_cwnd = 2; + + newtp->rto = TCP_TIMEOUT_INIT; + newtp->packets_out = 0; + newtp->fackets_out = 0; + newtp->retrans_out = 0; + newtp->high_seq = 0; + newtp->snd_ssthresh = 0x7fffffff; + newtp->snd_cwnd_cnt = 0; + newtp->dup_acks = 0; + newtp->delayed_acks = 0; + init_timer(&newtp->retransmit_timer); + newtp->retransmit_timer.function = &tcp_retransmit_timer; + newtp->retransmit_timer.data = (unsigned long) newsk; + init_timer(&newtp->delack_timer); + newtp->delack_timer.function = &tcp_delack_timer; + newtp->delack_timer.data = (unsigned long) newsk; + skb_queue_head_init(&newtp->out_of_order_queue); + newtp->send_head = newtp->retrans_head = NULL; + newtp->rcv_wup = req->rcv_isn + 1; + newtp->write_seq = req->snt_isn + 1; + newtp->copied_seq = req->rcv_isn + 1; + + newtp->saw_tstamp = 0; + newtp->mss_clamp = req->mss; + + init_timer(&newtp->probe_timer); + newtp->probe_timer.function = &tcp_probe_timer; + newtp->probe_timer.data = (unsigned long) newsk; + newtp->probes_out = 0; + newtp->syn_seq = req->rcv_isn; + newtp->fin_seq = req->rcv_isn; + newtp->urg_data = 0; + tcp_synq_init(newtp); + newtp->syn_backlog = 0; + if (skb->len >= 536) + newtp->last_seg_size = skb->len; + + /* Back to base struct sock members. */ + newsk->err = 0; + newsk->ack_backlog = 0; + newsk->max_ack_backlog = SOMAXCONN; + newsk->priority = 0; + + /* IP layer stuff */ + newsk->timeout = 0; + init_timer(&newsk->timer); + newsk->timer.function = &net_timer; + newsk->timer.data = (unsigned long) newsk; + newsk->socket = NULL; + + newtp->tstamp_ok = req->tstamp_ok; + if((newtp->sack_ok = req->sack_ok) != 0) + newtp->num_sacks = 0; + newtp->window_clamp = req->window_clamp; + newtp->rcv_wnd = req->rcv_wnd; + newtp->wscale_ok = req->wscale_ok; + if (newtp->wscale_ok) { + newtp->snd_wscale = req->snd_wscale; + newtp->rcv_wscale = req->rcv_wscale; + } else { + newtp->snd_wscale = newtp->rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp,65535); + } + if (newtp->tstamp_ok) { + newtp->ts_recent = req->ts_recent; + newtp->ts_recent_stamp = tcp_time_stamp; + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->tcp_header_len = sizeof(struct tcphdr); + } + } + return newsk; +} + +/* + * The three way handshake has completed - we got a valid synack - + * now create the new socket. + */ +struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct open_request *req, + struct dst_entry *dst) +{ + struct ip_options *opt = req->af.v4_req.opt; + struct tcp_opt *newtp; + struct sock *newsk; + + if (sk->ack_backlog > sk->max_ack_backlog) + goto exit; /* head drop */ + if (dst == NULL) { + struct rtable *rt; + + if (ip_route_output(&rt, + opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr, + req->af.v4_req.loc_addr, sk->ip_tos|RTO_CONN, 0)) + return NULL; + dst = &rt->u.dst; + } +#ifdef CONFIG_IP_TRANSPARENT_PROXY + /* The new socket created for transparent proxy may fall + * into a non-existed bind bucket because sk->num != newsk->num. + * Ensure existance of the bucket now. The placement of the check + * later will require to destroy just created newsk in the case of fail. + * 1998/04/22 Andrey V. 
Savochkin <saw@msu.ru> + */ + if (__tcp_bucket_check(ntohs(skb->h.th->dest))) + goto exit; +#endif + + newsk = tcp_create_openreq_child(sk, req, skb); + if (!newsk) + goto exit; + + sk->tp_pinfo.af_tcp.syn_backlog--; + sk->ack_backlog++; + + newsk->dst_cache = dst; + + newtp = &(newsk->tp_pinfo.af_tcp); + newsk->daddr = req->af.v4_req.rmt_addr; + newsk->saddr = req->af.v4_req.loc_addr; + newsk->rcv_saddr = req->af.v4_req.loc_addr; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + newsk->num = ntohs(skb->h.th->dest); + newsk->sport = req->lcl_port; +#endif + newsk->opt = req->af.v4_req.opt; + newtp->ext_header_len = 0; + if (newsk->opt) + newtp->ext_header_len = newsk->opt->optlen; + + tcp_sync_mss(newsk, dst->pmtu); + newtp->rcv_mss = newtp->mss_clamp; + + /* It would be better to use newtp->mss_clamp here */ + if (newsk->rcvbuf < (3 * newtp->pmtu_cookie)) + newsk->rcvbuf = min ((3 * newtp->pmtu_cookie), sysctl_rmem_max); + if (newsk->sndbuf < (3 * newtp->pmtu_cookie)) + newsk->sndbuf = min ((3 * newtp->pmtu_cookie), sysctl_wmem_max); + + /* We run in BH processing itself or within a BH atomic + * sequence (backlog) so no locking is needed. + */ + __tcp_v4_hash(newsk); + __tcp_inherit_port(sk, newsk); + __add_to_prot_sklist(newsk); + + sk->data_ready(sk, 0); /* Deliver SIGIO */ + + return newsk; + +exit: + dst_release(dst); + return NULL; +} + +static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct open_request *req, *prev; + + req = tcp_v4_search_req(tp,skb->nh.iph, skb->h.th, &prev); + if (!req) + return; + /* Sequence number check required by RFC793 */ + if (before(TCP_SKB_CB(skb)->seq, req->rcv_isn) || + after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1)) + return; + tcp_synq_unlink(tp, req, prev); + (req->sk ? sk->ack_backlog : tp->syn_backlog)--; + req->class->destructor(req); + tcp_openreq_free(req); + + net_statistics.EmbryonicRsts++; +} + +/* Check for embryonic sockets (open_requests) We check packets with + * only the SYN bit set against the open_request queue too: This + * increases connection latency a bit, but is required to detect + * retransmitted SYNs. + */ +static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) +{ + struct tcphdr *th = skb->h.th; + u32 flg = ((u32 *)th)[3]; + + /* Check for RST */ + if (flg & __constant_htonl(0x00040000)) { + tcp_v4_rst_req(sk, skb); + return NULL; + } + + /* Check for SYN|ACK */ + if (flg & __constant_htonl(0x00120000)) { + struct open_request *req, *dummy; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* Find possible connection requests. */ + req = tcp_v4_search_req(tp, skb->nh.iph, th, &dummy); + if (req) { + sk = tcp_check_req(sk, skb, req); + } +#ifdef CONFIG_SYN_COOKIES + else if (flg == __constant_htonl(0x00120000)) { + sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); + } +#endif + } + return sk; +} + +int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) +{ +#ifdef CONFIG_FILTER + struct sk_filter *filter = sk->filter; + if (filter && sk_filter(skb, filter)) + goto discard; +#endif /* CONFIG_FILTER */ + + /* + * This doesn't check if the socket has enough room for the packet. + * Either process the packet _without_ queueing it and then free it, + * or do the check later. 
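The magic constants in tcp_v4_hnd_req() come from reading the fourth 32-bit word of the TCP header (data offset, reserved bits, flag byte, window) as a single network-order value: 0x00040000 has only the RST bit set and 0x00120000 has SYN and ACK set. A small sketch that derives the same masks from the individual flag bits; the macro names here are illustrative, not taken from kernel headers:

    #include <stdint.h>
    #include <arpa/inet.h>

    /* Flag bits of the TCP flag byte (byte 13 of the header). */
    #define TH_FIN 0x01
    #define TH_SYN 0x02
    #define TH_RST 0x04
    #define TH_ACK 0x10

    /* Build the network-order mask used against ((u32 *)th)[3]: the flag
     * byte occupies bits 16..23 of that word in host order. */
    static uint32_t flag_mask(uint8_t flags)
    {
        return htonl((uint32_t)flags << 16);
    }

    /* flag_mask(TH_RST)          == htonl(0x00040000)
     * flag_mask(TH_SYN | TH_ACK) == htonl(0x00120000) */
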
+ */ + skb_set_owner_r(skb, sk); + + if (sk->state == TCP_ESTABLISHED) { /* Fast path */ + if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) + goto reset; + return 0; + } + + + if (sk->state == TCP_LISTEN) { + struct sock *nsk; + + nsk = tcp_v4_hnd_req(sk, skb); + if (!nsk) + goto discard; + + /* + * Queue it on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket.. + */ + if (atomic_read(&nsk->sock_readers)) { + skb_orphan(skb); + __skb_queue_tail(&nsk->back_log, skb); + return 0; + } + sk = nsk; + } + + if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) + goto reset; + return 0; + +reset: + tcp_v4_send_reset(skb); +discard: + kfree_skb(skb); + /* Be careful here. If this function gets more complicated and + * gcc suffers from register pressure on the x86, sk (in %ebx) + * might be destroyed here. This current version compiles correctly, + * but you have been warned. + */ + return 0; +} + +/* + * From tcp_input.c + */ + +int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) +{ + struct tcphdr *th; + struct sock *sk; + + if (skb->pkt_type!=PACKET_HOST) + goto discard_it; + + th = skb->h.th; + + /* Pull up the IP header. */ + __skb_pull(skb, skb->h.raw - skb->data); + + /* Count it even if it's bad */ + tcp_statistics.TcpInSegs++; + + len = skb->len; + if (len < sizeof(struct tcphdr)) + goto bad_packet; + + /* Try to use the device checksum if provided. */ + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = csum_partial((char *)th, len, 0); + case CHECKSUM_HW: + if (tcp_v4_check(th,len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) { + NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum " + "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, " + "len=%d/%d/%d\n", + NIPQUAD(skb->nh.iph->saddr), + ntohs(th->source), + NIPQUAD(skb->nh.iph->daddr), + ntohs(th->dest), + len, skb->len, + ntohs(skb->nh.iph->tot_len))); + bad_packet: + tcp_statistics.TcpInErrs++; + goto discard_it; + } + default: + /* CHECKSUM_UNNECESSARY */ + } + + if((th->doff * 4) < sizeof(struct tcphdr) || + len < (th->doff * 4)) + goto bad_packet; + +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (IPCB(skb)->redirport) + sk = tcp_v4_proxy_lookup(th->dest, skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, skb->dev, + IPCB(skb)->redirport, skb->dev->ifindex); + else { +#endif + sk = __tcp_v4_lookup(th, skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, th->dest, skb->dev->ifindex); +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (!sk) + sk = tcp_v4_search_proxy_openreq(skb); + } +#endif + if (!sk) + goto no_tcp_socket; + if(!ipsec_sk_policy(sk,skb)) + goto discard_it; + + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + len - th->doff*4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + + skb->used = 0; + + if (sk->state == TCP_TIME_WAIT) + goto do_time_wait; + if (!atomic_read(&sk->sock_readers)) + return tcp_v4_do_rcv(sk, skb); + + __skb_queue_tail(&sk->back_log, skb); + return 0; + +no_tcp_socket: + tcp_v4_send_reset(skb); + +discard_it: + /* Discard frame. */ + kfree_skb(skb); + return 0; + +do_time_wait: + /* Sorry for the ugly switch. 2.3 will have a better solution. 
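The sequence-space bookkeeping in tcp_v4_rcv() charges one sequence number each for SYN and FIN on top of the payload length, which is what the end_seq expression encodes. A restated helper, user-space C with illustrative names:

    #include <stdint.h>

    /* End of the sequence range a segment occupies, matching the end_seq
     * computation in tcp_v4_rcv(): payload bytes plus one for SYN and FIN. */
    static uint32_t segment_end_seq(uint32_t seq, int syn, int fin,
                                    int total_len, int doff_words)
    {
        int payload = total_len - doff_words * 4;   /* strip the TCP header */
        return seq + (syn ? 1 : 0) + (fin ? 1 : 0) + payload;
    }
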
*/ + switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk, + skb, th, skb->len)) { + case TCP_TW_ACK: + tcp_v4_send_ack(skb, + ((struct tcp_tw_bucket *)sk)->snd_nxt, + ((struct tcp_tw_bucket *)sk)->rcv_nxt, + ((struct tcp_tw_bucket *)sk)->window); + goto discard_it; + case TCP_TW_RST: + goto no_tcp_socket; + default: + goto discard_it; + } +} + +static void __tcp_v4_rehash(struct sock *sk) +{ + struct sock **skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))]; + + SOCKHASH_LOCK(); + if(sk->pprev) { + if(sk->next) + sk->next->pprev = sk->pprev; + *sk->pprev = sk->next; + sk->pprev = NULL; + tcp_reg_zap(sk); + } + if((sk->next = *skp) != NULL) + (*skp)->pprev = &sk->next; + *skp = sk; + sk->pprev = skp; + SOCKHASH_UNLOCK(); +} + +int tcp_v4_rebuild_header(struct sock *sk) +{ + struct rtable *rt = (struct rtable *)sk->dst_cache; + __u32 new_saddr; + int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT; + + if(rt == NULL) + return 0; + + /* Force route checking if want_rewrite. + * The idea is good, the implementation is disguisting. + * Well, if I made bind on this socket, you cannot randomly ovewrite + * its source address. --ANK + */ + if (want_rewrite) { + int tmp; + struct rtable *new_rt; + __u32 old_saddr = rt->rt_src; + + /* Query new route using another rt buffer */ + tmp = ip_route_connect(&new_rt, rt->rt_dst, 0, + RT_TOS(sk->ip_tos)|sk->localroute, + sk->bound_dev_if); + + /* Only useful if different source addrs */ + if (tmp == 0) { + /* + * Only useful if different source addrs + */ + if (new_rt->rt_src != old_saddr ) { + dst_release(sk->dst_cache); + sk->dst_cache = &new_rt->u.dst; + rt = new_rt; + goto do_rewrite; + } + dst_release(&new_rt->u.dst); + } + } + if (rt->u.dst.obsolete) { + int err; + err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif); + if (err) { + sk->err_soft=-err; + sk->error_report(sk); + return -1; + } + dst_release(xchg(&sk->dst_cache, &rt->u.dst)); + } + + return 0; + +do_rewrite: + new_saddr = rt->rt_src; + + /* Ouch!, this should not happen. */ + if (!sk->saddr || !sk->rcv_saddr) { + printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: " + "saddr=%08lX rcv_saddr=%08lX\n", + ntohl(sk->saddr), + ntohl(sk->rcv_saddr)); + return 0; + } + + if (new_saddr != sk->saddr) { + if (sysctl_ip_dynaddr > 1) { + printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr " + "from %d.%d.%d.%d to %d.%d.%d.%d\n", + NIPQUAD(sk->saddr), + NIPQUAD(new_saddr)); + } + + sk->saddr = new_saddr; + sk->rcv_saddr = new_saddr; + + /* XXX The only one ugly spot where we need to + * XXX really change the sockets identity after + * XXX it has entered the hashes. 
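The rehash above relies on the pprev back-pointer idiom used throughout these hash tables: every node stores the address of the pointer that points at it, so unlinking takes constant time with no chain walk. A generic sketch of the same pattern with types invented for the example:

    #include <stddef.h>

    struct node {
        struct node *next;
        struct node **pprev;   /* address of whatever points at this node */
    };

    /* Unlink in O(1): no need to know which bucket or predecessor we hang off. */
    static void node_unlink(struct node *n)
    {
        if (n->pprev) {
            if (n->next)
                n->next->pprev = n->pprev;
            *n->pprev = n->next;
            n->pprev = NULL;
        }
    }

    /* Insert at the head of a bucket, keeping pprev pointers consistent. */
    static void node_add_head(struct node *n, struct node **bucket)
    {
        if ((n->next = *bucket) != NULL)
            (*bucket)->pprev = &n->next;
        *bucket = n;
        n->pprev = bucket;
    }
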
-DaveM + */ + __tcp_v4_rehash(sk); + } + + return 0; +} + +static struct sock * tcp_v4_get_sock(struct sk_buff *skb, struct tcphdr *th) +{ + return tcp_v4_lookup(skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, th->dest, skb->dev->ifindex); +} + +static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +{ + struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = sk->daddr; + sin->sin_port = sk->dport; +} + +struct tcp_func ipv4_specific = { + ip_queue_xmit, + tcp_v4_send_check, + tcp_v4_rebuild_header, + tcp_v4_conn_request, + tcp_v4_syn_recv_sock, + tcp_v4_get_sock, + sizeof(struct iphdr), + + ip_setsockopt, + ip_getsockopt, + v4_addr2sockaddr, + sizeof(struct sockaddr_in) +}; + +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. + */ +static int tcp_v4_init_sock(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + skb_queue_head_init(&tp->out_of_order_queue); + tcp_init_xmit_timers(sk); + + tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/ + tp->mdev = TCP_TIMEOUT_INIT; + tp->mss_clamp = ~0; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + tp->snd_cwnd = 2; + + /* See draft-stevens-tcpca-spec-01 for discussion of the + * initialization of these values. + */ + tp->snd_cwnd_cnt = 0; + tp->snd_ssthresh = 0x7fffffff; /* Infinity */ + + sk->state = TCP_CLOSE; + sk->max_ack_backlog = SOMAXCONN; + tp->rcv_mss = 536; + + sk->write_space = tcp_write_space; + + /* Init SYN queue. */ + tcp_synq_init(tp); + + sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific; + + return 0; +} + +static int tcp_v4_destroy_sock(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + + tcp_clear_xmit_timers(sk); + + if (sk->keepopen) + tcp_dec_slow_timer(TCP_SLT_KEEPALIVE); + + /* Cleanup up the write buffer. */ + while((skb = __skb_dequeue(&sk->write_queue)) != NULL) + kfree_skb(skb); + + /* Cleans up our, hopefuly empty, out_of_order_queue. */ + while((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL) + kfree_skb(skb); + + /* Clean up a referenced TCP bind bucket, this only happens if a + * port is allocated for a socket, but it never fully connects. 
+ */ + if(sk->prev != NULL) + tcp_put_port(sk); + + return 0; +} + +struct proto tcp_prot = { + (struct sock *)&tcp_prot, /* sklist_next */ + (struct sock *)&tcp_prot, /* sklist_prev */ + tcp_close, /* close */ + tcp_v4_connect, /* connect */ + tcp_accept, /* accept */ + NULL, /* retransmit */ + tcp_write_wakeup, /* write_wakeup */ + tcp_read_wakeup, /* read_wakeup */ + tcp_poll, /* poll */ + tcp_ioctl, /* ioctl */ + tcp_v4_init_sock, /* init */ + tcp_v4_destroy_sock, /* destroy */ + tcp_shutdown, /* shutdown */ + tcp_setsockopt, /* setsockopt */ + tcp_getsockopt, /* getsockopt */ + tcp_v4_sendmsg, /* sendmsg */ + tcp_recvmsg, /* recvmsg */ + NULL, /* bind */ + tcp_v4_do_rcv, /* backlog_rcv */ + tcp_v4_hash, /* hash */ + tcp_v4_unhash, /* unhash */ + tcp_v4_get_port, /* get_port */ + 128, /* max_header */ + 0, /* retransmits */ + "TCP", /* name */ + 0, /* inuse */ + 0 /* highestinuse */ +}; + + + +__initfunc(void tcp_v4_init(struct net_proto_family *ops)) +{ + int err; + + tcp_inode.i_mode = S_IFSOCK; + tcp_inode.i_sock = 1; + tcp_inode.i_uid = 0; + tcp_inode.i_gid = 0; + + tcp_socket->inode = &tcp_inode; + tcp_socket->state = SS_UNCONNECTED; + tcp_socket->type=SOCK_RAW; + + if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0) + panic("Failed to create the TCP control socket.\n"); + tcp_socket->sk->allocation=GFP_ATOMIC; + tcp_socket->sk->num = 256; /* Don't receive any data */ + tcp_socket->sk->ip_ttl = MAXTTL; +} diff --git a/pfinet/linux-src/net/ipv4/tcp_output.c b/pfinet/linux-src/net/ipv4/tcp_output.c new file mode 100644 index 00000000..2ac5e8a2 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/tcp_output.c @@ -0,0 +1,1143 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_output.c,v 1.108.2.1 1999/05/14 23:07:36 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * Corey Minyard <wf-rch!minyard@relay.EU.net> + * Florian La Roche, <flla@stud.uni-sb.de> + * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> + * Linus Torvalds, <torvalds@cs.helsinki.fi> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Matthew Dillon, <dillon@apollo.west.oic.com> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Jorge Cwik, <jorge@laser.satlink.net> + */ + +/* + * Changes: Pedro Roque : Retransmit queue handled by TCP. + * : Fragmentation on mtu decrease + * : Segment collapse on retransmit + * : AF independence + * + * Linus Torvalds : send_delayed_ack + * David S. Miller : Charge memory using the right skb + * during syn/ack processing. + * David S. Miller : Output engine completely rewritten. + * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. + * + */ + +#include <net/tcp.h> + +extern int sysctl_tcp_timestamps; +extern int sysctl_tcp_window_scaling; +extern int sysctl_tcp_sack; + +/* People can turn this off for buggy TCP's found in printers etc. */ +int sysctl_tcp_retrans_collapse = 1; + +/* Get rid of any delayed acks, we sent one already.. 
*/ +static __inline__ void clear_delayed_acks(struct sock * sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + tp->delayed_acks = 0; + if(tcp_in_quickack_mode(tp)) + tcp_exit_quickack_mode(tp); + tcp_clear_xmit_timer(sk, TIME_DACK); +} + +static __inline__ void update_send_head(struct sock *sk) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + tp->send_head = tp->send_head->next; + if (tp->send_head == (struct sk_buff *) &sk->write_queue) + tp->send_head = NULL; +} + +/* This routine actually transmits TCP packets queued in by + * tcp_do_sendmsg(). This is used by both the initial + * transmission and possible later retransmissions. + * All SKB's seen here are completely headerless. It is our + * job to build the TCP header, and pass the packet down to + * IP so it can do the same plus pass the packet off to the + * device. + * + * We are working here with either a clone of the original + * SKB, or a fresh unique copy made by the retransmit engine. + */ +void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) +{ + if(skb != NULL) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + int tcp_header_size = tp->tcp_header_len; + struct tcphdr *th; + int sysctl_flags; + +#define SYSCTL_FLAG_TSTAMPS 0x1 +#define SYSCTL_FLAG_WSCALE 0x2 +#define SYSCTL_FLAG_SACK 0x4 + + sysctl_flags = 0; + if(tcb->flags & TCPCB_FLAG_SYN) { + tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; + if(sysctl_tcp_timestamps) { + tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_TSTAMPS; + } + if(sysctl_tcp_window_scaling) { + tcp_header_size += TCPOLEN_WSCALE_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_WSCALE; + } + if(sysctl_tcp_sack) { + sysctl_flags |= SYSCTL_FLAG_SACK; + if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) + tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; + } + } else if(tp->sack_ok && tp->num_sacks) { + /* A SACK is 2 pad bytes, a 2 byte header, plus + * 2 32-bit sequence numbers for each SACK block. + */ + tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + + (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); + } + th = (struct tcphdr *) skb_push(skb, tcp_header_size); + skb->h.th = th; + skb_set_owner_w(skb, sk); + + /* Build TCP header and checksum it. */ + th->source = sk->sport; + th->dest = sk->dport; + th->seq = htonl(TCP_SKB_CB(skb)->seq); + th->ack_seq = htonl(tp->rcv_nxt); + th->doff = (tcp_header_size >> 2); + th->res1 = 0; + *(((__u8 *)th) + 13) = tcb->flags; + if(!(tcb->flags & TCPCB_FLAG_SYN)) + th->window = htons(tcp_select_window(sk)); + th->check = 0; + th->urg_ptr = ntohs(tcb->urg_ptr); + if(tcb->flags & TCPCB_FLAG_SYN) { + /* RFC1323: The window in SYN & SYN/ACK segments + * is never scaled. + */ + th->window = htons(tp->rcv_wnd); + tcp_syn_build_options((__u32 *)(th + 1), tp->mss_clamp, + (sysctl_flags & SYSCTL_FLAG_TSTAMPS), + (sysctl_flags & SYSCTL_FLAG_SACK), + (sysctl_flags & SYSCTL_FLAG_WSCALE), + tp->rcv_wscale, + TCP_SKB_CB(skb)->when, + tp->ts_recent); + } else { + tcp_build_and_update_options((__u32 *)(th + 1), + tp, TCP_SKB_CB(skb)->when); + } + tp->af_specific->send_check(sk, th, skb->len, skb); + + clear_delayed_acks(sk); + tp->last_ack_sent = tp->rcv_nxt; + tcp_statistics.TcpOutSegs++; + tp->af_specific->queue_xmit(skb); + } +#undef SYSCTL_FLAG_TSTAMPS +#undef SYSCTL_FLAG_WSCALE +#undef SYSCTL_FLAG_SACK +} + +/* This is the main buffer sending routine. We queue the buffer + * and decide whether to queue or transmit now. 
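The header-size arithmetic in tcp_transmit_skb() assumes every option group is padded to a 32-bit boundary; the SYN's MSS option, for instance, is emitted as a single aligned word of kind, length and value. A sketch of that packing for the MSS option alone (the kernel's tcp_syn_build_options() handles the full set; this is only an illustration using the standard option codepoints):

    #include <stdint.h>
    #include <arpa/inet.h>

    #define TCPOPT_MSS   2
    #define TCPOLEN_MSS  4

    /* Emit the MSS option as one 32-bit word: kind, length, 16-bit value.
     * This is the layout the aligned TCPOLEN_* constants account for. */
    static void write_mss_option(uint32_t *opt_ptr, uint16_t mss)
    {
        *opt_ptr = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
    }
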
+ */ +void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* Advance write_seq and place onto the write_queue. */ + tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq); + __skb_queue_tail(&sk->write_queue, skb); + + if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) { + /* Send it out now. */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tp->packets_out++; + tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); + if(!tcp_timer_is_set(sk, TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } else { + /* Queue it, remembering where we must start sending. */ + if (tp->send_head == NULL) + tp->send_head = skb; + if (!force_queue && tp->packets_out == 0 && !tp->pending) { + tp->pending = TIME_PROBE0; + tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); + } + } +} + +/* Function to create two new TCP segments. Shrinks the given segment + * to the specified size and appends a new segment with the rest of the + * packet to the list. This won't be called frequently, I hope. + * Remember, these are still headerless SKBs at this point. + */ +static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) +{ + struct sk_buff *buff; + int nsize = skb->len - len; + u16 flags; + + /* Get a new skb... force flag on. */ + buff = sock_wmalloc(sk, + (nsize + MAX_HEADER + sk->prot->max_header), + 1, GFP_ATOMIC); + if (buff == NULL) + return -1; /* We'll just try again later. */ + + /* Reserve space for headers. */ + skb_reserve(buff, MAX_HEADER + sk->prot->max_header); + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); + if(flags & TCPCB_FLAG_URG) { + u16 old_urg_ptr = TCP_SKB_CB(skb)->urg_ptr; + + /* Urgent data is always a pain in the ass. */ + if(old_urg_ptr > len) { + TCP_SKB_CB(skb)->flags &= ~(TCPCB_FLAG_URG); + TCP_SKB_CB(skb)->urg_ptr = 0; + TCP_SKB_CB(buff)->urg_ptr = old_urg_ptr - len; + } else { + flags &= ~(TCPCB_FLAG_URG); + } + } + if(!(flags & TCPCB_FLAG_URG)) + TCP_SKB_CB(buff)->urg_ptr = 0; + TCP_SKB_CB(buff)->flags = flags; + TCP_SKB_CB(buff)->sacked = 0; + + /* Copy and checksum data tail into the new buffer. */ + buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize), + nsize, 0); + + /* This takes care of the FIN sequence number too. */ + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + skb_trim(skb, len); + + /* Rechecksum original buffer. */ + skb->csum = csum_partial(skb->data, skb->len, 0); + + /* Looks stupid, but our code really uses when of + * skbs, which it never sent before. --ANK + */ + TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; + + /* Link BUFF into the send queue. */ + __skb_append(skb, buff); + + return 0; +} + +/* This function synchronize snd mss to current pmtu/exthdr set. + + tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT counts + for TCP options, but includes only bare TCP header. + + tp->mss_clamp is mss negotiated at connection setup. + It is minumum of user_mss and mss received with SYN. + It also does not include TCP options. + + tp->pmtu_cookie is last pmtu, seen by this function. + + tp->mss_cache is current effective sending mss, including + all tcp options except for SACKs. 
It is evaluated, + taking into account current pmtu, but never exceeds + tp->mss_clamp. + + NOTE1. rfc1122 clearly states that advertised MSS + DOES NOT include either tcp or ip options. + + NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside + this function. --ANK (980731) + */ + +int tcp_sync_mss(struct sock *sk, u32 pmtu) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + int mss_now; + + /* Calculate base mss without TCP options: + It is MMS_S - sizeof(tcphdr) of rfc1122 + */ + mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); + + /* Clamp it (mss_clamp does not include tcp options) */ + if (mss_now > tp->mss_clamp) + mss_now = tp->mss_clamp; + + /* Now subtract TCP options size, not including SACKs */ + mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); + + /* Now subtract optional transport overhead */ + mss_now -= tp->ext_header_len; + + /* It we got too small (or even negative) value, + clamp it by 8 from below. Why 8 ? + Well, it could be 1 with the same success, + but if IP accepted segment of length 1, + it would love 8 even more 8) --ANK (980731) + */ + if (mss_now < 8) + mss_now = 8; + + /* And store cached results */ + tp->pmtu_cookie = pmtu; + tp->mss_cache = mss_now; + return mss_now; +} + + +/* This routine writes packets to the network. It advances the + * send_head. This happens as incoming acks open up the remote + * window for us. + */ +void tcp_write_xmit(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + unsigned int mss_now; + + /* Account for SACKS, we may need to fragment due to this. + * It is just like the real MSS changing on us midstream. + * We also handle things correctly when the user adds some + * IP options mid-stream. Silly to do, but cover it. + */ + mss_now = tcp_current_mss(sk); + + /* If we are zapped, the bytes will have to remain here. + * In time closedown will empty the write queue and all + * will be happy. + */ + if(!sk->zapped) { + struct sk_buff *skb; + int sent_pkts = 0; + + /* Anything on the transmit queue that fits the window can + * be added providing we are: + * + * a) following SWS avoidance [and Nagle algorithm] + * b) not exceeding our congestion window. + * c) not retransmitting [Nagle] + */ + while((skb = tp->send_head) && tcp_snd_test(sk, skb)) { + if (skb->len > mss_now) { + if (tcp_fragment(sk, skb, mss_now)) + break; + } + + /* Advance the send_head. This one is going out. */ + update_send_head(sk); + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tp->packets_out++; + tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + sent_pkts = 1; + } + + /* If we sent anything, make sure the retransmit + * timer is active. + */ + if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } +} + +/* This function returns the amount that we can raise the + * usable window based on the following constraints + * + * 1. The window can never be shrunk once it is offered (RFC 793) + * 2. We limit memory per socket + * + * RFC 1122: + * "the suggested [SWS] avoidance algorithm for the receiver is to keep + * RECV.NEXT + RCV.WIN fixed until: + * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" + * + * i.e. don't raise the right edge of the window until you can raise + * it at least MSS bytes. + * + * Unfortunately, the recommended algorithm breaks header prediction, + * since header prediction assumes th->window stays fixed. 
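Concretely, for an Ethernet path MTU of 1500 with timestamps negotiated and no IP options, the arithmetic in tcp_sync_mss() gives 1500 - 20 (IP) - 20 (TCP) - 12 (aligned timestamp option) = 1448 bytes per segment. A user-space restatement with the same clamping and floor; parameter names are illustrative:

    /* Effective sending MSS as computed by tcp_sync_mss(), ignoring SACK blocks.
     * net_hdr: network-layer header length (20 for IPv4 without options).
     * tcp_hdr_len: full TCP header incl. fixed options (32 with timestamps).
     * ext_hdr: extra transport overhead such as IP options, usually 0. */
    static int effective_mss(int pmtu, int net_hdr, int tcp_hdr_len,
                             int ext_hdr, int mss_clamp)
    {
        int mss = pmtu - net_hdr - 20;      /* bare TCP header is 20 bytes */
        if (mss > mss_clamp)
            mss = mss_clamp;
        mss -= tcp_hdr_len - 20;            /* fixed TCP options */
        mss -= ext_hdr;
        return mss < 8 ? 8 : mss;           /* same floor as the code above */
    }
    /* effective_mss(1500, 20, 32, 0, 65535) == 1448 */
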
+ * + * Strictly speaking, keeping th->window fixed violates the receiver + * side SWS prevention criteria. The problem is that under this rule + * a stream of single byte packets will cause the right side of the + * window to always advance by a single byte. + * + * Of course, if the sender implements sender side SWS prevention + * then this will not be a problem. + * + * BSD seems to make the following compromise: + * + * If the free space is less than the 1/4 of the maximum + * space available and the free space is less than 1/2 mss, + * then set the window to 0. + * Otherwise, just prevent the window from shrinking + * and from being larger than the largest representable value. + * + * This prevents incremental opening of the window in the regime + * where TCP is limited by the speed of the reader side taking + * data out of the TCP receive queue. It does nothing about + * those cases where the window is constrained on the sender side + * because the pipeline is full. + * + * BSD also seems to "accidentally" limit itself to windows that are a + * multiple of MSS, at least until the free space gets quite small. + * This would appear to be a side effect of the mbuf implementation. + * Combining these two algorithms results in the observed behavior + * of having a fixed window size at almost all times. + * + * Below we obtain similar behavior by forcing the offered window to + * a multiple of the mss when it is feasible to do so. + * + * Note, we don't "adjust" for TIMESTAMP or SACK option bytes. + */ +u32 __tcp_select_window(struct sock *sk) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + unsigned int mss = tp->mss_cache; + int free_space; + u32 window; + + /* Sometimes free_space can be < 0. */ + free_space = (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / 2; + if (tp->window_clamp) { + if (free_space > ((int) tp->window_clamp)) + free_space = tp->window_clamp; + mss = min(tp->window_clamp, mss); + } else { + printk("tcp_select_window: tp->window_clamp == 0.\n"); + } + + if (mss < 1) { + mss = 1; + printk("tcp_select_window: sk->mss fell to 0.\n"); + } + + if ((free_space < (sk->rcvbuf/4)) && (free_space < ((int) (mss/2)))) { + window = 0; + tp->pred_flags = 0; + } else { + /* Get the largest window that is a nice multiple of mss. + * Window clamp already applied above. + * If our current window offering is within 1 mss of the + * free space we just keep it. This prevents the divide + * and multiply from happening most of the time. + * We also don't do any window rounding when the free space + * is too small. + */ + window = tp->rcv_wnd; + if ((((int) window) <= (free_space - ((int) mss))) || + (((int) window) > free_space)) + window = (((unsigned int) free_space)/mss)*mss; + } + return window; +} + +/* Attempt to collapse two adjacent SKB's during retransmission. */ +static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now) +{ + struct sk_buff *next_skb = skb->next; + + /* The first test we must make is that neither of these two + * SKB's are still referenced by someone else. + */ + if(!skb_cloned(skb) && !skb_cloned(next_skb)) { + int skb_size = skb->len, next_skb_size = next_skb->len; + u16 flags = TCP_SKB_CB(skb)->flags; + + /* Punt if the first SKB has URG set. */ + if(flags & TCPCB_FLAG_URG) + return; + + /* Also punt if next skb has been SACK'd. 
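The BSD-style compromise described above reduces to: advertise zero when free space is below both a quarter of the receive buffer and half an MSS; otherwise keep the current offer if it is within one MSS of free space, and only then round free space down to a multiple of the MSS. A condensed user-space sketch of __tcp_select_window(); window clamping and pred_flags handling are omitted:

    /* free_space is half the unused receive buffer, window the value
     * currently advertised to the peer. */
    static unsigned int select_window(int free_space, int rcvbuf,
                                      unsigned int mss, unsigned int window)
    {
        if (free_space < rcvbuf / 4 && free_space < (int)(mss / 2))
            return 0;                                   /* shut the window */

        if ((int)window <= free_space - (int)mss || (int)window > free_space)
            window = ((unsigned int)free_space / mss) * mss;  /* round to MSS */

        return window;                                  /* else keep the offer */
    }
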
*/ + if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) + return; + + /* Punt if not enough space exists in the first SKB for + * the data in the second, or the total combined payload + * would exceed the MSS. + */ + if ((next_skb_size > skb_tailroom(skb)) || + ((skb_size + next_skb_size) > mss_now)) + return; + + /* Ok. We will be able to collapse the packet. */ + __skb_unlink(next_skb, next_skb->list); + + if(skb->len % 4) { + /* Must copy and rechecksum all data. */ + memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); + skb->csum = csum_partial(skb->data, skb->len, 0); + } else { + /* Optimize, actually we could also combine next_skb->csum + * to skb->csum using a single add w/carry operation too. + */ + skb->csum = csum_partial_copy(next_skb->data, + skb_put(skb, next_skb_size), + next_skb_size, skb->csum); + } + + /* Update sequence range on original skb. */ + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; + + /* Merge over control information. */ + flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ + if(flags & TCPCB_FLAG_URG) { + u16 urgptr = TCP_SKB_CB(next_skb)->urg_ptr; + TCP_SKB_CB(skb)->urg_ptr = urgptr + skb_size; + } + TCP_SKB_CB(skb)->flags = flags; + + /* All done, get rid of second SKB and account for it so + * packet counting does not break. + */ + kfree_skb(next_skb); + sk->tp_pinfo.af_tcp.packets_out--; + } +} + +/* Do a simple retransmit without using the backoff mechanisms in + * tcp_timer. This is used for path mtu discovery. + * The socket is already locked here. + */ +void tcp_simple_retransmit(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb, *old_next_skb; + unsigned int mss = tcp_current_mss(sk); + + /* Don't muck with the congestion window here. */ + tp->dup_acks = 0; + tp->high_seq = tp->snd_nxt; + tp->retrans_head = NULL; + + /* Input control flow will see that this was retransmitted + * and not use it for RTT calculation in the absence of + * the timestamp option. + */ + for (old_next_skb = skb = skb_peek(&sk->write_queue); + ((skb != tp->send_head) && + (skb != (struct sk_buff *)&sk->write_queue)); + skb = skb->next) { + int resend_skb = 0; + + /* Our goal is to push out the packets which we + * sent already, but are being chopped up now to + * account for the PMTU information we have. + * + * As we resend the queue, packets are fragmented + * into two pieces, and when we try to send the + * second piece it may be collapsed together with + * a subsequent packet, and so on. -DaveM + */ + if (old_next_skb != skb || skb->len > mss) + resend_skb = 1; + old_next_skb = skb->next; + if (resend_skb != 0) + tcp_retransmit_skb(sk, skb); + } +} + +static __inline__ void update_retrans_head(struct sock *sk) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + tp->retrans_head = tp->retrans_head->next; + if((tp->retrans_head == tp->send_head) || + (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) { + tp->retrans_head = NULL; + tp->rexmt_done = 1; + } +} + +/* This retransmits one SKB. Policy decisions and retransmit queue + * state updates are done by the caller. Returns non-zero if an + * error occurred which prevented the send. + */ +int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + unsigned int cur_mss = tcp_current_mss(sk); + + if(skb->len > cur_mss) { + if(tcp_fragment(sk, skb, cur_mss)) + return 1; /* We'll try again later. */ + + /* New SKB created, account for it. 
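Collapsing two retransmit-queue segments is only safe when both buffers are privately owned and the merge neither overruns the first buffer's tailroom nor exceeds the MSS; URG-bearing and already-SACK'd segments are left alone. A checklist-style restatement with invented predicate names:

    /* Conditions under which two adjacent retransmit-queue segments may be
     * merged, mirroring tcp_retrans_try_collapse(). */
    static int can_collapse(int a_is_cloned, int b_is_cloned,
                            int a_has_urg, int b_is_sacked,
                            int b_len, int a_tailroom,
                            int a_len, int mss_now)
    {
        if (a_is_cloned || b_is_cloned) return 0;  /* someone else holds a ref */
        if (a_has_urg)                  return 0;  /* urgent data: punt */
        if (b_is_sacked)                return 0;  /* receiver already has it */
        if (b_len > a_tailroom)         return 0;  /* no room to copy the data */
        if (a_len + b_len > mss_now)    return 0;  /* merge would exceed MSS */
        return 1;
    }
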
*/ + tp->packets_out++; + } + + /* Collapse two adjacent packets if worthwhile and we can. */ + if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && + (skb->len < (cur_mss >> 1)) && + (skb->next != tp->send_head) && + (skb->next != (struct sk_buff *)&sk->write_queue) && + (sysctl_tcp_retrans_collapse != 0)) + tcp_retrans_try_collapse(sk, skb, cur_mss); + + if(tp->af_specific->rebuild_header(sk)) + return 1; /* Routing failure or similar. */ + + /* Some Solaris stacks overoptimize and ignore the FIN on a + * retransmit when old data is attached. So strip it off + * since it is cheap to do so and saves bytes on the network. + */ + if(skb->len > 0 && + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { + TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; + skb_trim(skb, 0); + skb->csum = 0; + } + + /* Ok, we're gonna send it out, update state. */ + TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS; + tp->retrans_out++; + + /* Make a copy, if the first transmission SKB clone we made + * is still in somebody's hands, else make a clone. + */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + if(skb_cloned(skb)) + skb = skb_copy(skb, GFP_ATOMIC); + else + skb = skb_clone(skb, GFP_ATOMIC); + + tcp_transmit_skb(sk, skb); + + /* Update global TCP statistics and return success. */ + sk->prot->retransmits++; + tcp_statistics.TcpRetransSegs++; + + return 0; +} + +/* This gets called after a retransmit timeout, and the initially + * retransmitted data is acknowledged. It tries to continue + * resending the rest of the retransmit queue, until either + * we've sent it all or the congestion window limit is reached. + * If doing SACK, the first ACK which comes back for a timeout + * based retransmit packet might feed us FACK information again. + * If so, we use it to avoid unnecessarily retransmissions. + */ +void tcp_xmit_retransmit_queue(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + + if (tp->retrans_head == NULL && + tp->rexmt_done == 0) + tp->retrans_head = skb_peek(&sk->write_queue); + if (tp->retrans_head == tp->send_head) + tp->retrans_head = NULL; + + /* Each time, advance the retrans_head if we got + * a packet out or we skipped one because it was + * SACK'd. -DaveM + */ + while ((skb = tp->retrans_head) != NULL) { + /* If it has been ack'd by a SACK block, we don't + * retransmit it. + */ + if(!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + /* Send it out, punt if error occurred. */ + if(tcp_retransmit_skb(sk, skb)) + break; + + update_retrans_head(sk); + + /* Stop retransmitting if we've hit the congestion + * window limit. + */ + if (tp->retrans_out >= tp->snd_cwnd) + break; + } else { + update_retrans_head(sk); + } + } +} + +/* Using FACK information, retransmit all missing frames at the receiver + * up to the forward most SACK'd packet (tp->fackets_out) if the packet + * has not been retransmitted already. + */ +void tcp_fack_retransmit(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb = skb_peek(&sk->write_queue); + int packet_cnt = 0; + + while((skb != NULL) && + (skb != tp->send_head) && + (skb != (struct sk_buff *)&sk->write_queue)) { + __u8 sacked = TCP_SKB_CB(skb)->sacked; + + if(sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS)) + goto next_packet; + + /* Ok, retransmit it. 
*/ + if(tcp_retransmit_skb(sk, skb)) + break; + + if(tcp_packets_in_flight(tp) >= tp->snd_cwnd) + break; +next_packet: + packet_cnt++; + if(packet_cnt >= tp->fackets_out) + break; + skb = skb->next; + } +} + +/* Send a fin. The caller locks the socket for us. This cannot be + * allowed to fail queueing a FIN frame under any circumstances. + */ +void tcp_send_fin(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb = skb_peek_tail(&sk->write_queue); + unsigned int mss_now; + + /* Optimization, tack on the FIN if we have a queue of + * unsent frames. But be careful about outgoing SACKS + * and IP options. + */ + mss_now = tcp_current_mss(sk); + + if((tp->send_head != NULL) && (skb->len < mss_now)) { + /* tcp_write_xmit() takes care of the rest. */ + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; + TCP_SKB_CB(skb)->end_seq++; + tp->write_seq++; + + /* Special case to avoid Nagle bogosity. If this + * segment is the last segment, and it was queued + * due to Nagle/SWS-avoidance, send it out now. + */ + if(tp->send_head == skb && + !sk->nonagle && + skb->len < (tp->mss_cache >> 1) && + tp->packets_out && + !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) { + update_send_head(sk); + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tp->packets_out++; + tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + if(!tcp_timer_is_set(sk, TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } + } else { + /* Socket is locked, keep trying until memory is available. */ + do { + skb = sock_wmalloc(sk, + (MAX_HEADER + + sk->prot->max_header), + 1, GFP_KERNEL); + } while (skb == NULL); + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->urg_ptr = 0; + + /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */ + TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + tcp_send_skb(sk, skb, 0); + } +} + +/* We get here when a process closes a file descriptor (either due to + * an explicit close() or as a byproduct of exit()'ing) and there + * was unread data in the receive queue. This behavior is recommended + * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM + */ +void tcp_send_active_reset(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + + /* NOTE: No TCP options attached and we never retransmit this. */ + skb = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_KERNEL); + if (!skb) + return; + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->urg_ptr = 0; + + /* Send it off. */ + TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_transmit_skb(sk, skb); +} + +/* WARNING: This routine must only be called when we have already sent + * a SYN packet that crossed the incoming SYN that caused this routine + * to get called. If this assumption fails then the initial rcv_wnd + * and rcv_wscale values will not be correct. 
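tcp_send_fin() above has two paths: when an unsent segment is still queued and has room under the MSS, the FIN flag is folded into it; otherwise a dedicated FIN segment is allocated, looping until memory is available because queueing the FIN must not fail. Either way the FIN itself consumes one sequence number. A compact restatement of the choice, with invented names:

    /* Whether a FIN can be folded into the last queued, still-unsent
     * segment, as in tcp_send_fin(). */
    static int fin_piggybacks(int have_unsent_head, int tail_len, int mss_now)
    {
        return have_unsent_head && tail_len < mss_now;
    }
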
+ */ +int tcp_send_synack(struct sock *sk) +{ + struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff* skb; + + skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), + 1, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN); + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->urg_ptr = 0; + + /* SYN eats a sequence byte. */ + TCP_SKB_CB(skb)->seq = tp->snd_una; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + __skb_queue_tail(&sk->write_queue, skb); + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tp->packets_out++; + tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + return 0; +} + +/* + * Prepare a SYN-ACK. + */ +struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, + struct open_request *req, int mss) +{ + struct tcphdr *th; + int tcp_header_size; + struct sk_buff *skb; + + skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC); + if (skb == NULL) + return NULL; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + + skb->dst = dst_clone(dst); + + /* Don't offer more than they did. + * This way we don't have to memorize who said what. + * FIXME: maybe this should be changed for better performance + * with syncookies. + */ + req->mss = min(mss, req->mss); + if (req->mss < 8) { + printk(KERN_DEBUG "initial req->mss below 8\n"); + req->mss = 8; + } + + tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + + (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + + (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + + /* SACK_PERM is in the place of NOP NOP of TS */ + ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0)); + skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size); + + memset(th, 0, sizeof(struct tcphdr)); + th->syn = 1; + th->ack = 1; + th->source = sk->sport; + th->dest = req->rmt_port; + TCP_SKB_CB(skb)->seq = req->snt_isn; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + th->seq = htonl(TCP_SKB_CB(skb)->seq); + th->ack_seq = htonl(req->rcv_isn + 1); + if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ + __u8 rcv_wscale; + /* Set this up on the first call only */ + req->window_clamp = skb->dst->window; + tcp_select_initial_window(sock_rspace(sk)/2,req->mss, + &req->rcv_wnd, + &req->window_clamp, + req->wscale_ok, + &rcv_wscale); + req->rcv_wscale = rcv_wscale; + } + + /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ + th->window = htons(req->rcv_wnd); + + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_syn_build_options((__u32 *)(th + 1), req->mss, req->tstamp_ok, + req->sack_ok, req->wscale_ok, req->rcv_wscale, + TCP_SKB_CB(skb)->when, + req->ts_recent); + + skb->csum = 0; + th->doff = (tcp_header_size >> 2); + tcp_statistics.TcpOutSegs++; + return skb; +} + +void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu) +{ + struct dst_entry *dst = sk->dst_cache; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* Reserve space for headers. */ + skb_reserve(buff, MAX_HEADER + sk->prot->max_header); + + tp->snd_wnd = 0; + tp->snd_wl1 = 0; + tp->snd_wl2 = tp->write_seq; + tp->snd_una = tp->write_seq; + tp->rcv_nxt = 0; + + sk->err = 0; + + /* We'll fix this up when we get a response from the other end. + * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. 
+ */ + tp->tcp_header_len = sizeof(struct tcphdr) + + (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); + + /* If user gave his TCP_MAXSEG, record it to clamp */ + if (tp->user_mss) + tp->mss_clamp = tp->user_mss; + tcp_sync_mss(sk, mtu); + + /* Now unpleasant action: if initial pmtu is too low + set lower clamp. I am not sure that it is good. + To be more exact, I do not think that clamping at value, which + is apparently transient and may improve in future is good idea. + It would be better to wait until peer will returns its MSS + (probably 65535 too) and now advertise something sort of 65535 + or at least first hop device mtu. Is it clear, what I mean? + We should tell peer what maximal mss we expect to RECEIVE, + it has nothing to do with pmtu. + I am afraid someone will be confused by such huge value. + --ANK (980731) + */ + if (tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr) < tp->mss_clamp ) + tp->mss_clamp = tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr); + + TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; + TCP_SKB_CB(buff)->sacked = 0; + TCP_SKB_CB(buff)->urg_ptr = 0; + buff->csum = 0; + TCP_SKB_CB(buff)->seq = tp->write_seq++; + TCP_SKB_CB(buff)->end_seq = tp->write_seq; + tp->snd_nxt = TCP_SKB_CB(buff)->end_seq; + + tp->window_clamp = dst->window; + tcp_select_initial_window(sock_rspace(sk)/2,tp->mss_clamp, + &tp->rcv_wnd, + &tp->window_clamp, + sysctl_tcp_window_scaling, + &tp->rcv_wscale); + /* Ok, now lock the socket before we make it visible to + * the incoming packet engine. + */ + lock_sock(sk); + + /* Socket identity change complete, no longer + * in TCP_CLOSE, so enter ourselves into the + * hash tables. + */ + tcp_set_state(sk,TCP_SYN_SENT); + sk->prot->hash(sk); + + tp->rto = dst->rtt; + tcp_init_xmit_timers(sk); + tp->retransmits = 0; + tp->fackets_out = 0; + tp->retrans_out = 0; + + /* Send it off. */ + __skb_queue_tail(&sk->write_queue, buff); + TCP_SKB_CB(buff)->when = tcp_time_stamp; + tp->packets_out++; + tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); + tcp_statistics.TcpActiveOpens++; + + /* Timer for repeating the SYN until an answer. */ + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + + /* Now, it is safe to release the socket. */ + release_sock(sk); +} + +/* Send out a delayed ack, the caller does the policy checking + * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() + * for details. + */ +void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout) +{ + unsigned long timeout; + + /* Stay within the limit we were given */ + timeout = tp->ato; + if (timeout > max_timeout) + timeout = max_timeout; + timeout += jiffies; + + /* Use new timeout only if there wasn't a older one earlier. */ + if (!tp->delack_timer.prev) { + tp->delack_timer.expires = timeout; + add_timer(&tp->delack_timer); + } else { + if (time_before(timeout, tp->delack_timer.expires)) + mod_timer(&tp->delack_timer, timeout); + } +} + +/* This routine sends an ack and also updates the window. */ +void tcp_send_ack(struct sock *sk) +{ + /* If we have been reset, we may not send again. */ + if(!sk->zapped) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *buff; + + /* We are not putting this on the write queue, so + * tcp_transmit_skb() will set the ownership to this + * sock. + */ + buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC); + if (buff == NULL) { + /* Force it to send an ack. 
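The delayed-ACK scheduler above clamps the ack timeout to the caller's maximum and, if a timer is already pending, only ever moves the deadline earlier, never later. A sketch of that policy using plain integers for jiffies and ignoring tick wraparound; names are illustrative:

    /* Choose the expiry for the delayed-ACK timer, as in tcp_send_delayed_ack().
     * Returns the deadline to program; a pending timer is only moved up. */
    static unsigned long delack_expiry(unsigned long now, unsigned long ato,
                                       unsigned long max_timeout,
                                       int pending, unsigned long current_expiry)
    {
        unsigned long timeout = ato > max_timeout ? max_timeout : ato;
        timeout += now;

        if (pending && current_expiry <= timeout)
            return current_expiry;      /* keep the earlier deadline */
        return timeout;                 /* arm (or re-arm) for the new one */
    }
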
We don't have to do this + * (ACK is unreliable) but it's much better use of + * bandwidth on slow links to send a spare ack than + * resend packets. + * + * This is the one possible way that we can delay an + * ACK and have tp->ato indicate that we are in + * quick ack mode, so clear it. + */ + if(tcp_in_quickack_mode(tp)) + tcp_exit_quickack_mode(tp); + tcp_send_delayed_ack(tp, HZ/2); + return; + } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(buff, MAX_HEADER + sk->prot->max_header); + buff->csum = 0; + TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(buff)->sacked = 0; + TCP_SKB_CB(buff)->urg_ptr = 0; + + /* Send it off, this clears delayed acks for us. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt; + TCP_SKB_CB(buff)->when = tcp_time_stamp; + tcp_transmit_skb(sk, buff); + } +} + +/* This routine sends a packet with an out of date sequence + * number. It assumes the other end will try to ack it. + */ +void tcp_write_wakeup(struct sock *sk) +{ + /* After a valid reset we can send no more. */ + if (!sk->zapped) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + + /* Write data can still be transmitted/retransmitted in the + * following states. If any other state is encountered, return. + * [listen/close will never occur here anyway] + */ + if ((1 << sk->state) & + ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1| + TCPF_LAST_ACK|TCPF_CLOSING)) + return; + + if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) && + ((skb = tp->send_head) != NULL)) { + unsigned long win_size; + + /* We are probing the opening of a window + * but the window size is != 0 + * must have been a result SWS avoidance ( sender ) + */ + win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); + if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { + if (tcp_fragment(sk, skb, win_size)) + return; /* Let a retransmit get it. */ + } + update_send_head(sk); + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tp->packets_out++; + tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + if (!tcp_timer_is_set(sk, TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } else { + /* We don't queue it, tcp_transmit_skb() sets ownership. */ + skb = alloc_skb(MAX_HEADER + sk->prot->max_header, + GFP_ATOMIC); + if (skb == NULL) + return; + + /* Reserve space for headers and set control bits. */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->urg_ptr = 0; + + /* Use a previous sequence. This should cause the other + * end to send an ack. Don't queue or clone SKB, just + * send it. + */ + TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_transmit_skb(sk, skb); + } + } +} + +/* A window probe timeout has occurred. If window is not closed send + * a partial packet else a zero probe. 
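The zero-window probe built by tcp_write_wakeup() deliberately carries an old sequence number, one behind snd_nxt, with no data: the receiver discards it but is forced to reply with an ACK advertising its current window. tcp_send_probe0() below then backs the probe interval off exponentially up to a cap. A small sketch of both points; HZ is assumed to be 100 for the example and is not taken from this source:

    #include <stdint.h>

    #define HZ 100   /* assumed tick rate for the example */

    /* Sequence number carried by a zero-window probe: one byte behind
     * snd_nxt, so the segment is stale but still provokes an ACK. */
    static uint32_t probe_seq(uint32_t snd_nxt)
    {
        return snd_nxt - 1;
    }

    /* Interval until the next probe, as in tcp_send_probe0():
     * exponential backoff capped at 120 seconds. */
    static unsigned long probe_timeout(unsigned long rto, int backoff)
    {
        unsigned long t = rto << backoff;
        return t > 120 * HZ ? 120 * HZ : t;
    }
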
+ */ +void tcp_send_probe0(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + tcp_write_wakeup(sk); + tp->pending = TIME_PROBE0; + tp->backoff++; + tp->probes_out++; + tcp_reset_xmit_timer (sk, TIME_PROBE0, + min(tp->rto << tp->backoff, 120*HZ)); +} diff --git a/pfinet/linux-src/net/ipv4/tcp_timer.c b/pfinet/linux-src/net/ipv4/tcp_timer.c new file mode 100644 index 00000000..21029f8e --- /dev/null +++ b/pfinet/linux-src/net/ipv4/tcp_timer.c @@ -0,0 +1,595 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_timer.c,v 1.62.2.3 1999/06/20 20:14:30 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * Corey Minyard <wf-rch!minyard@relay.EU.net> + * Florian La Roche, <flla@stud.uni-sb.de> + * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> + * Linus Torvalds, <torvalds@cs.helsinki.fi> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Matthew Dillon, <dillon@apollo.west.oic.com> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Jorge Cwik, <jorge@laser.satlink.net> + */ + +#include <net/tcp.h> + +int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; +int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; +int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; +int sysctl_tcp_retries1 = TCP_RETR1; +int sysctl_tcp_retries2 = TCP_RETR2; + +static void tcp_sltimer_handler(unsigned long); +static void tcp_syn_recv_timer(unsigned long); +static void tcp_keepalive(unsigned long data); +static void tcp_twkill(unsigned long); + +struct timer_list tcp_slow_timer = { + NULL, NULL, + 0, 0, + tcp_sltimer_handler, +}; + + +struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = { + {ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */ + {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive}, /* KEEPALIVE */ + {ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill} /* TWKILL */ +}; + +const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; + +/* + * Using different timers for retransmit, delayed acks and probes + * We may wish use just one timer maintaining a list of expire jiffies + * to optimize. + */ + +void tcp_init_xmit_timers(struct sock *sk) +{ + init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer); + sk->tp_pinfo.af_tcp.retransmit_timer.function=&tcp_retransmit_timer; + sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk; + + init_timer(&sk->tp_pinfo.af_tcp.delack_timer); + sk->tp_pinfo.af_tcp.delack_timer.function=&tcp_delack_timer; + sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk; + + init_timer(&sk->tp_pinfo.af_tcp.probe_timer); + sk->tp_pinfo.af_tcp.probe_timer.function=&tcp_probe_timer; + sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk; +} + +/* + * Reset the retransmission timer + */ + +void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + switch (what) { + case TIME_RETRANS: + /* When seting the transmit timer the probe timer + * should not be set. + * The delayed ack timer can be set if we are changing the + * retransmit timer when removing acked frames. 
+ */ + if(tp->probe_timer.prev) + del_timer(&tp->probe_timer); + mod_timer(&tp->retransmit_timer, jiffies+when); + break; + + case TIME_DACK: + mod_timer(&tp->delack_timer, jiffies+when); + break; + + case TIME_PROBE0: + mod_timer(&tp->probe_timer, jiffies+when); + break; + + case TIME_WRITE: + printk(KERN_DEBUG "bug: tcp_reset_xmit_timer TIME_WRITE\n"); + break; + + default: + printk(KERN_DEBUG "bug: unknown timer value\n"); + }; +} + +void tcp_clear_xmit_timers(struct sock *sk) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + if(tp->retransmit_timer.prev) + del_timer(&tp->retransmit_timer); + if(tp->delack_timer.prev) + del_timer(&tp->delack_timer); + if(tp->probe_timer.prev) + del_timer(&tp->probe_timer); +} + +static int tcp_write_err(struct sock *sk, int force) +{ + sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT; + sk->error_report(sk); + + tcp_clear_xmit_timers(sk); + + /* Time wait the socket. */ + if (!force && ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING))) { + tcp_time_wait(sk); + } else { + /* Clean up time. */ + tcp_set_state(sk, TCP_CLOSE); + return 0; + } + return 1; +} + +/* A write timeout has occurred. Process the after effects. */ +static int tcp_write_timeout(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* Look for a 'soft' timeout. */ + if ((sk->state == TCP_ESTABLISHED && + tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) || + (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) { + dst_negative_advice(&sk->dst_cache); + } + + /* Have we tried to SYN too many times (repent repent 8)) */ + if(tp->retransmits > sysctl_tcp_syn_retries && sk->state==TCP_SYN_SENT) { + tcp_write_err(sk, 1); + /* Don't FIN, we got nothing back */ + return 0; + } + + /* Has it gone just too far? */ + if (tp->retransmits > sysctl_tcp_retries2) + return tcp_write_err(sk, 0); + + return 1; +} + +void tcp_delack_timer(unsigned long data) +{ + struct sock *sk = (struct sock*)data; + + if(!sk->zapped && + sk->tp_pinfo.af_tcp.delayed_acks && + sk->state != TCP_CLOSE) { + /* If socket is currently locked, defer the ACK. */ + if (!atomic_read(&sk->sock_readers)) + tcp_send_ack(sk); + else + tcp_send_delayed_ack(&(sk->tp_pinfo.af_tcp), HZ/10); + } +} + +void tcp_probe_timer(unsigned long data) +{ + struct sock *sk = (struct sock*)data; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + if(sk->zapped) + return; + + if (atomic_read(&sk->sock_readers)) { + /* Try again later. */ + tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5); + return; + } + + /* *WARNING* RFC 1122 forbids this + * It doesn't AFAIK, because we kill the retransmit timer -AK + * FIXME: We ought not to do it, Solaris 2.5 actually has fixing + * this behaviour in Solaris down as a bug fix. [AC] + */ + if (tp->probes_out > sysctl_tcp_retries2) { + if(sk->err_soft) + sk->err = sk->err_soft; + else + sk->err = ETIMEDOUT; + sk->error_report(sk); + + if ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) { + /* Time wait the socket. */ + tcp_time_wait(sk); + } else { + /* Clean up time. */ + tcp_set_state(sk, TCP_CLOSE); + } + } else { + /* Only send another probe if we didn't close things up. 
*/ + tcp_send_probe0(sk); + } +} + +static __inline__ int tcp_keepopen_proc(struct sock *sk) +{ + int res = 0; + + if ((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2)) { + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp; + + if (elapsed >= sysctl_tcp_keepalive_time) { + if (tp->probes_out > sysctl_tcp_keepalive_probes) { + if(sk->err_soft) + sk->err = sk->err_soft; + else + sk->err = ETIMEDOUT; + + tcp_set_state(sk, TCP_CLOSE); + sk->shutdown = SHUTDOWN_MASK; + if (!sk->dead) + sk->state_change(sk); + } else { + tp->probes_out++; + tp->pending = TIME_KEEPOPEN; + tcp_write_wakeup(sk); + res = 1; + } + } + } + return res; +} + +/* Kill off TIME_WAIT sockets once their lifetime has expired. */ +int tcp_tw_death_row_slot = 0; +static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] = + { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; + +extern void tcp_timewait_kill(struct tcp_tw_bucket *tw); + +static void tcp_twkill(unsigned long data) +{ + struct tcp_tw_bucket *tw; + int killed = 0; + + tw = tcp_tw_death_row[tcp_tw_death_row_slot]; + tcp_tw_death_row[tcp_tw_death_row_slot] = NULL; + while(tw != NULL) { + struct tcp_tw_bucket *next = tw->next_death; + + tcp_timewait_kill(tw); + killed++; + tw = next; + } + if(killed != 0) { + struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data; + atomic_sub(killed, &slt->count); + } + tcp_tw_death_row_slot = + ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); +} + +/* These are always called from BH context. See callers in + * tcp_input.c to verify this. + */ +void tcp_tw_schedule(struct tcp_tw_bucket *tw) +{ + int slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1); + struct tcp_tw_bucket **tpp = &tcp_tw_death_row[slot]; + + if((tw->next_death = *tpp) != NULL) + (*tpp)->pprev_death = &tw->next_death; + *tpp = tw; + tw->pprev_death = tpp; + + tw->death_slot = slot; + + tcp_inc_slow_timer(TCP_SLT_TWKILL); +} + +/* Happens rarely if at all, no care about scalability here. */ +void tcp_tw_reschedule(struct tcp_tw_bucket *tw) +{ + struct tcp_tw_bucket **tpp; + int slot; + + if(tw->next_death) + tw->next_death->pprev_death = tw->pprev_death; + *tw->pprev_death = tw->next_death; + tw->pprev_death = NULL; + + slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1); + tpp = &tcp_tw_death_row[slot]; + if((tw->next_death = *tpp) != NULL) + (*tpp)->pprev_death = &tw->next_death; + *tpp = tw; + tw->pprev_death = tpp; + + tw->death_slot = slot; + /* Timer was incremented when we first entered the table. */ +} + +/* This is for handling early-kills of TIME_WAIT sockets. */ +void tcp_tw_deschedule(struct tcp_tw_bucket *tw) +{ + if(tw->next_death) + tw->next_death->pprev_death = tw->pprev_death; + *tw->pprev_death = tw->next_death; + tw->pprev_death = NULL; + tcp_dec_slow_timer(TCP_SLT_TWKILL); +} + +/* + * Check all sockets for keepalive timer + * Called every 75 seconds + * This timer is started by af_inet init routine and is constantly + * running. + * + * It might be better to maintain a count of sockets that need it using + * setsockopt/tcp_destroy_sk and only set the timer when needed. + */ + +/* + * don't send over 5 keepopens at a time to avoid burstiness + * on big servers [AC] + */ +#define MAX_KA_PROBES 5 + +int sysctl_tcp_max_ka_probes = MAX_KA_PROBES; + +/* Keepopen's are only valid for "established" TCP's, nicely our listener + * hash gets rid of most of the useless testing, so we run through a couple + * of the established hash chains each clock tick. 
-DaveM + * + * And now, even more magic... TIME_WAIT TCP's cannot have keepalive probes + * going off for them, so we only need check the first half of the established + * hash table, even less testing under heavy load. + * + * I _really_ would rather do this by adding a new timer_struct to struct sock, + * and this way only those who set the keepalive option will get the overhead. + * The idea is you set it for 2 hours when the sock is first connected, when it + * does fire off (if at all, most sockets die earlier) you check for the keepalive + * option and also if the sock has been idle long enough to start probing. + */ +static void tcp_keepalive(unsigned long data) +{ + static int chain_start = 0; + int count = 0; + int i; + + for(i = chain_start; i < (chain_start + ((TCP_HTABLE_SIZE/2) >> 2)); i++) { + struct sock *sk = tcp_established_hash[i]; + while(sk) { + if(!atomic_read(&sk->sock_readers) && sk->keepopen) { + count += tcp_keepopen_proc(sk); + if(count == sysctl_tcp_max_ka_probes) + goto out; + } + sk = sk->next; + } + } +out: + chain_start = ((chain_start + ((TCP_HTABLE_SIZE/2)>>2)) & + ((TCP_HTABLE_SIZE/2) - 1)); +} + +/* + * The TCP retransmit timer. This lacks a few small details. + * + * 1. An initial rtt timeout on the probe0 should cause what we can + * of the first write queue buffer to be split and sent. + * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report + * ETIMEDOUT if we know an additional 'soft' error caused this. + * tcp_err should save a 'soft error' for us. + * [Unless someone has broken it then it does, except for one 2.0 + * broken case of a send when the route/device is directly unreachable, + * and we error but should retry! - FIXME] [AC] + */ + +void tcp_retransmit_timer(unsigned long data) +{ + struct sock *sk = (struct sock*)data; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + /* We are reset. We will send no more retransmits. */ + if(sk->zapped) { + tcp_clear_xmit_timer(sk, TIME_RETRANS); + return; + } + + if (atomic_read(&sk->sock_readers)) { + /* Try again later */ + tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20); + return; + } + + /* Clear delay ack timer. */ + tcp_clear_xmit_timer(sk, TIME_DACK); + + /* RFC 2018, clear all 'sacked' flags in retransmission queue, + * the sender may have dropped out of order frames and we must + * send them out should this timer fire on us. + */ + if(tp->sack_ok) { + struct sk_buff *skb = skb_peek(&sk->write_queue); + + while((skb != NULL) && + (skb != tp->send_head) && + (skb != (struct sk_buff *)&sk->write_queue)) { + TCP_SKB_CB(skb)->sacked &= + ~(TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS); + skb = skb->next; + } + } + + /* Retransmission. */ + tp->retrans_head = NULL; + tp->rexmt_done = 0; + tp->fackets_out = 0; + tp->retrans_out = 0; + if (tp->retransmits == 0) { + /* Remember window where we lost: + * "one half of the current window but at least 2 segments" + * + * Here "current window" means the effective one, which + * means it must be an accurate representation of our current + * sending rate _and_ the snd_wnd. + */ + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd = 1; + } + + tp->retransmits++; + + tp->dup_acks = 0; + tp->high_seq = tp->snd_nxt; + tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); + + /* Increase the timeout each time we retransmit. Note that + * we do not increase the rtt estimate. rto is initialized + * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests + * that doubling rto each time is the least we can get away with. 
+ * In KA9Q, Karn uses this for the first few times, and then + * goes to quadratic. netBSD doubles, but only goes up to *64, + * and clamps at 1 to 64 sec afterwards. Note that 120 sec is + * defined in the protocol as the maximum possible RTT. I guess + * we'll have to use something other than TCP to talk to the + * University of Mars. + * + * PAWS allows us longer timeouts and large windows, so once + * implemented ftp to mars will work nicely. We will have to fix + * the 120 second clamps though! + */ + tp->backoff++; + tp->rto = min(tp->rto << 1, 120*HZ); + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + + tcp_write_timeout(sk); +} + +/* + * Slow timer for SYN-RECV sockets + */ + +/* This now scales very nicely. -DaveM */ +static void tcp_syn_recv_timer(unsigned long data) +{ + struct sock *sk; + unsigned long now = jiffies; + int i; + + for(i = 0; i < TCP_LHTABLE_SIZE; i++) { + sk = tcp_listening_hash[i]; + + while(sk) { + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + /* TCP_LISTEN is implied. */ + if (!atomic_read(&sk->sock_readers) && tp->syn_wait_queue) { + struct open_request *prev = (struct open_request *)(&tp->syn_wait_queue); + struct open_request *req = tp->syn_wait_queue; + do { + struct open_request *conn; + + conn = req; + req = req->dl_next; + + if (conn->sk || + ((long)(now - conn->expires)) <= 0) { + prev = conn; + continue; + } + + tcp_synq_unlink(tp, conn, prev); + if (conn->retrans >= sysctl_tcp_retries1) { +#ifdef TCP_DEBUG + printk(KERN_DEBUG "syn_recv: " + "too many retransmits\n"); +#endif + (*conn->class->destructor)(conn); + tcp_dec_slow_timer(TCP_SLT_SYNACK); + tp->syn_backlog--; + tcp_openreq_free(conn); + + if (!tp->syn_wait_queue) + break; + } else { + unsigned long timeo; + struct open_request *op; + + (*conn->class->rtx_syn_ack)(sk, conn); + + conn->retrans++; +#ifdef TCP_DEBUG + printk(KERN_DEBUG "syn_ack rtx %d\n", + conn->retrans); +#endif + timeo = min((TCP_TIMEOUT_INIT + << conn->retrans), + 120*HZ); + conn->expires = now + timeo; + op = prev->dl_next; + tcp_synq_queue(tp, conn); + if (op != prev->dl_next) + prev = prev->dl_next; + } + /* old prev still valid here */ + } while (req); + } + sk = sk->next; + } + } +} + +void tcp_sltimer_handler(unsigned long data) +{ + struct tcp_sl_timer *slt = tcp_slt_array; + unsigned long next = ~0UL; + unsigned long now = jiffies; + int i; + + for (i=0; i < TCP_SLT_MAX; i++, slt++) { + if (atomic_read(&slt->count)) { + long trigger; + + trigger = slt->period - ((long)(now - slt->last)); + + if (trigger <= 0) { + (*slt->handler)((unsigned long) slt); + slt->last = now; + trigger = slt->period; + } + + /* Only reschedule if some events remain. */ + if (atomic_read(&slt->count)) + next = min(next, trigger); + } + } + if (next != ~0UL) + mod_timer(&tcp_slow_timer, (now + next)); +} + +void __tcp_inc_slow_timer(struct tcp_sl_timer *slt) +{ + unsigned long now = jiffies; + unsigned long when; + + slt->last = now; + + when = now + slt->period; + + if (tcp_slow_timer.prev) { + if ((long)(tcp_slow_timer.expires - when) >= 0) + mod_timer(&tcp_slow_timer, when); + } else { + tcp_slow_timer.expires = when; + add_timer(&tcp_slow_timer); + } +} diff --git a/pfinet/linux-src/net/ipv4/timer.c b/pfinet/linux-src/net/ipv4/timer.c new file mode 100644 index 00000000..3821a7c4 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/timer.c @@ -0,0 +1,127 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. 
INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * TIMER - implementation of software timers for IP. + * + * Version: $Id: timer.c,v 1.15 1999/02/22 13:54:29 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Corey Minyard <wf-rch!minyard@relay.EU.net> + * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de> + * Florian La Roche, <flla@stud.uni-sb.de> + * + * Fixes: + * Alan Cox : To avoid destroying a wait queue as we use it + * we defer destruction until the destroy timer goes + * off. + * Alan Cox : Destroy socket doesn't write a status value to the + * socket buffer _AFTER_ freeing it! Also sock ensures + * the socket will get removed BEFORE this is called + * otherwise if the timer TIME_DESTROY occurs inside + * of inet_bh() with this socket being handled it goes + * BOOM! Have to stop timer going off if net_bh is + * active or the destroy causes crashes. + * Alan Cox : Cleaned up unused code. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <asm/system.h> +#include <linux/interrupt.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/arp.h> + +void net_delete_timer (struct sock *t) +{ + if(t->timer.prev) + del_timer (&t->timer); + t->timeout = 0; +} + +void net_reset_timer (struct sock *t, int timeout, unsigned long len) +{ + t->timeout = timeout; + mod_timer(&t->timer, jiffies+len); +} + +/* Now we will only be called whenever we need to do + * something, but we must be sure to process all of the + * sockets that need it. + */ +void net_timer (unsigned long data) +{ + struct sock *sk = (struct sock*)data; + int why = sk->timeout; + + /* Only process if socket is not in use. */ + if (atomic_read(&sk->sock_readers)) { + /* Try again later. */ + mod_timer(&sk->timer, jiffies+HZ/20); + return; + } + + /* Always see if we need to send an ack. */ + if (sk->tp_pinfo.af_tcp.delayed_acks && !sk->zapped) { + sk->prot->read_wakeup (sk); + if (!sk->dead) + sk->data_ready(sk,0); + } + + /* Now we need to figure out why the socket was on the timer. */ + switch (why) { + case TIME_DONE: + /* If the socket hasn't been closed off, re-try a bit later. */ + if (!sk->dead) { + net_reset_timer(sk, TIME_DONE, TCP_DONE_TIME); + break; + } + + if (sk->state != TCP_CLOSE) { + printk (KERN_DEBUG "non CLOSE socket in time_done\n"); + break; + } + destroy_sock (sk); + break; + + case TIME_DESTROY: + /* We've waited for a while for all the memory associated with + * the socket to be freed. + */ + destroy_sock(sk); + break; + + case TIME_CLOSE: + /* We've waited long enough, close the socket. */ + tcp_set_state(sk, TCP_CLOSE); + sk->shutdown = SHUTDOWN_MASK; + if (!sk->dead) + sk->state_change(sk); + net_reset_timer (sk, TIME_DONE, TCP_DONE_TIME); + break; + + default: + /* I want to see these... 
*/ + printk ("net_timer: timer expired - reason %d is unknown\n", why); + break; + } +} + diff --git a/pfinet/linux-src/net/ipv4/udp.c b/pfinet/linux-src/net/ipv4/udp.c new file mode 100644 index 00000000..909e858f --- /dev/null +++ b/pfinet/linux-src/net/ipv4/udp.c @@ -0,0 +1,1191 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The User Datagram Protocol (UDP). + * + * Version: $Id: udp.c,v 1.66.2.3 1999/08/07 10:56:36 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Alan Cox, <Alan.Cox@linux.org> + * + * Fixes: + * Alan Cox : verify_area() calls + * Alan Cox : stopped close while in use off icmp + * messages. Not a fix but a botch that + * for udp at least is 'valid'. + * Alan Cox : Fixed icmp handling properly + * Alan Cox : Correct error for oversized datagrams + * Alan Cox : Tidied select() semantics. + * Alan Cox : udp_err() fixed properly, also now + * select and read wake correctly on errors + * Alan Cox : udp_send verify_area moved to avoid mem leak + * Alan Cox : UDP can count its memory + * Alan Cox : send to an unknown connection causes + * an ECONNREFUSED off the icmp, but + * does NOT close. + * Alan Cox : Switched to new sk_buff handlers. No more backlog! + * Alan Cox : Using generic datagram code. Even smaller and the PEEK + * bug no longer crashes it. + * Fred Van Kempen : Net2e support for sk->broadcast. + * Alan Cox : Uses skb_free_datagram + * Alan Cox : Added get/set sockopt support. + * Alan Cox : Broadcasting without option set returns EACCES. + * Alan Cox : No wakeup calls. Instead we now use the callbacks. + * Alan Cox : Use ip_tos and ip_ttl + * Alan Cox : SNMP Mibs + * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support. + * Matt Dillon : UDP length checks. + * Alan Cox : Smarter af_inet used properly. + * Alan Cox : Use new kernel side addressing. + * Alan Cox : Incorrect return on truncated datagram receive. + * Arnt Gulbrandsen : New udp_send and stuff + * Alan Cox : Cache last socket + * Alan Cox : Route cache + * Jon Peatfield : Minor efficiency fix to sendto(). + * Mike Shaver : RFC1122 checks. + * Alan Cox : Nonblocking error fix. + * Willy Konynenberg : Transparent proxying support. + * Mike McLagan : Routing by source + * David S. Miller : New socket lookup architecture. + * Last socket cache retained as it + * does have a high hit rate. + * Olaf Kirch : Don't linearise iovec on sendmsg. + * Andi Kleen : Some cleanups, cache destination entry + * for connect. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Melvin Smith : Check msg_name not msg_namelen in sendto(), + * return ENOTCONN for unconnected sockets (POSIX) + * Janos Farkas : don't deliver multi/broadcasts to a different + * bound-to-device socket + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* RFC1122 Status: + 4.1.3.1 (Ports): + SHOULD send ICMP_PORT_UNREACHABLE in response to datagrams to + an un-listened port. 
(OK) + 4.1.3.2 (IP Options) + MUST pass IP options from IP -> application (OK) + MUST allow application to specify IP options (OK) + 4.1.3.3 (ICMP Messages) + MUST pass ICMP error messages to application (OK -- except when SO_BSDCOMPAT is set) + 4.1.3.4 (UDP Checksums) + MUST provide facility for checksumming (OK) + MAY allow application to control checksumming (OK) + MUST default to checksumming on (OK) + MUST discard silently datagrams with bad csums (OK, except during debugging) + 4.1.3.5 (UDP Multihoming) + MUST allow application to specify source address (OK) + SHOULD be able to communicate the chosen src addr up to application + when application doesn't choose (DOES - use recvmsg cmsgs) + 4.1.3.6 (Invalid Addresses) + MUST discard invalid source addresses (OK -- done in the new routing code) + MUST only send datagrams with one of our addresses (OK) +*/ + +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/config.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/snmp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/udp.h> +#include <net/icmp.h> +#include <net/route.h> +#include <net/checksum.h> + +/* + * Snmp MIB for the UDP layer + */ + +struct udp_mib udp_statistics; + +struct sock *udp_hash[UDP_HTABLE_SIZE]; + +/* Shared by v4/v6 udp. */ +int udp_port_rover = 0; + +static int udp_v4_get_port(struct sock *sk, unsigned short snum) +{ + SOCKHASH_LOCK(); + if (snum == 0) { + int best_size_so_far, best, result, i; + + if (udp_port_rover > sysctl_local_port_range[1] || + udp_port_rover < sysctl_local_port_range[0]) + udp_port_rover = sysctl_local_port_range[0]; + best_size_so_far = 32767; + best = result = udp_port_rover; + for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { + struct sock *sk; + int size; + + sk = udp_hash[result & (UDP_HTABLE_SIZE - 1)]; + if (!sk) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & + (UDP_HTABLE_SIZE - 1)); + goto gotit; + } + size = 0; + do { + if (++size >= best_size_so_far) + goto next; + } while ((sk = sk->next) != NULL); + best_size_so_far = size; + best = result; + next: + } + result = best; + for(;; result += UDP_HTABLE_SIZE) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & + (UDP_HTABLE_SIZE - 1)); + if (!udp_lport_inuse(result)) + break; + } +gotit: + udp_port_rover = snum = result; + } else { + struct sock *sk2; + + for (sk2 = udp_hash[snum & (UDP_HTABLE_SIZE - 1)]; + sk2 != NULL; + sk2 = sk2->next) { + if (sk2->num == snum && + sk2 != sk && + sk2->bound_dev_if == sk->bound_dev_if && + (!sk2->rcv_saddr || + !sk->rcv_saddr || + sk2->rcv_saddr == sk->rcv_saddr) && + (!sk2->reuse || !sk->reuse)) + goto fail; + } + } + sk->num = snum; + SOCKHASH_UNLOCK(); + return 0; + +fail: + SOCKHASH_UNLOCK(); + return 1; +} + +/* Last hit UDP socket cache, this is ipv4 specific so make it static. 
*/ +static u32 uh_cache_saddr, uh_cache_daddr; +static u16 uh_cache_dport, uh_cache_sport; +static struct sock *uh_cache_sk = NULL; + +static void udp_v4_hash(struct sock *sk) +{ + struct sock **skp = &udp_hash[sk->num & (UDP_HTABLE_SIZE - 1)]; + + SOCKHASH_LOCK(); + if ((sk->next = *skp) != NULL) + (*skp)->pprev = &sk->next; + *skp = sk; + sk->pprev = skp; + SOCKHASH_UNLOCK(); +} + +static void udp_v4_unhash(struct sock *sk) +{ + SOCKHASH_LOCK(); + if (sk->pprev) { + if (sk->next) + sk->next->pprev = sk->pprev; + *sk->pprev = sk->next; + sk->pprev = NULL; + if(uh_cache_sk == sk) + uh_cache_sk = NULL; + } + SOCKHASH_UNLOCK(); +} + +/* UDP is nearly always wildcards out the wazoo, it makes no sense to try + * harder than this here plus the last hit cache. -DaveM + */ +struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) +{ + struct sock *sk, *result = NULL; + unsigned short hnum = ntohs(dport); + int badness = -1; + + for(sk = udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]; sk != NULL; sk = sk->next) { + if((sk->num == hnum) && !(sk->dead && (sk->state == TCP_CLOSE))) { + int score = 0; + if(sk->rcv_saddr) { + if(sk->rcv_saddr != daddr) + continue; + score++; + } + if(sk->daddr) { + if(sk->daddr != saddr) + continue; + score++; + } + if(sk->dport) { + if(sk->dport != sport) + continue; + score++; + } + if(sk->bound_dev_if) { + if(sk->bound_dev_if != dif) + continue; + score++; + } + if(score == 4) { + result = sk; + break; + } else if(score > badness) { + result = sk; + badness = score; + } + } + } + return result; +} + +__inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) +{ + struct sock *sk; + + if(!dif && uh_cache_sk && + uh_cache_saddr == saddr && + uh_cache_sport == sport && + uh_cache_dport == dport && + uh_cache_daddr == daddr) + return uh_cache_sk; + + sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif); + if(!dif) { + uh_cache_sk = sk; + uh_cache_saddr = saddr; + uh_cache_daddr = daddr; + uh_cache_sport = sport; + uh_cache_dport = dport; + } + return sk; +} + +#ifdef CONFIG_IP_TRANSPARENT_PROXY +#define secondlist(hpnum, sk, fpass) \ +({ struct sock *s1; if(!(sk) && (fpass)--) \ + s1 = udp_hash[(hpnum) & (UDP_HTABLE_SIZE - 1)]; \ + else \ + s1 = (sk); \ + s1; \ +}) + +#define udp_v4_proxy_loop_init(hnum, hpnum, sk, fpass) \ + secondlist((hpnum), udp_hash[(hnum)&(UDP_HTABLE_SIZE-1)],(fpass)) + +#define udp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \ + secondlist((hpnum),(sk)->next,(fpass)) + +static struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr, + unsigned short rnum, unsigned long laddr, + struct device *dev, unsigned short pnum, + int dif) +{ + struct sock *s, *result = NULL; + int badness = -1; + u32 paddr = 0; + unsigned short hnum = ntohs(num); + unsigned short hpnum = ntohs(pnum); + int firstpass = 1; + + if(dev && dev->ip_ptr) { + struct in_device *idev = dev->ip_ptr; + + if(idev->ifa_list) + paddr = idev->ifa_list->ifa_local; + } + + SOCKHASH_LOCK(); + for(s = udp_v4_proxy_loop_init(hnum, hpnum, s, firstpass); + s != NULL; + s = udp_v4_proxy_loop_next(hnum, hpnum, s, firstpass)) { + if(s->num == hnum || s->num == hpnum) { + int score = 0; + if(s->dead && (s->state == TCP_CLOSE)) + continue; + if(s->rcv_saddr) { + if((s->num != hpnum || s->rcv_saddr != paddr) && + (s->num != hnum || s->rcv_saddr != laddr)) + continue; + score++; + } + if(s->daddr) { + if(s->daddr != raddr) + continue; + score++; + } + if(s->dport) { + if(s->dport != rnum) + continue; + score++; + } + 
if(s->bound_dev_if) { + if(s->bound_dev_if != dif) + continue; + score++; + } + if(score == 4 && s->num == hnum) { + result = s; + break; + } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) { + result = s; + badness = score; + } + } + } + SOCKHASH_UNLOCK(); + return result; +} + +#undef secondlist +#undef udp_v4_proxy_loop_init +#undef udp_v4_proxy_loop_next + +#endif + +static inline struct sock *udp_v4_mcast_next(struct sock *sk, + unsigned short num, + unsigned long raddr, + unsigned short rnum, + unsigned long laddr, + int dif) +{ + struct sock *s = sk; + unsigned short hnum = ntohs(num); + for(; s; s = s->next) { + if ((s->num != hnum) || + (s->dead && (s->state == TCP_CLOSE)) || + (s->daddr && s->daddr!=raddr) || + (s->dport != rnum && s->dport != 0) || + (s->rcv_saddr && s->rcv_saddr != laddr) || + (s->bound_dev_if && s->bound_dev_if != dif)) + continue; + break; + } + return s; +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. If err < 0 then the socket should + * be closed and the error returned to the user. If err > 0 + * it's just the icmp type << 8 | icmp code. + * Header points to the ip header of the error packet. We move + * on past this. Then (as it used to claim before adjustment) + * header points to the first 8 bytes of the udp header. We need + * to find the appropriate port. + */ + +void udp_err(struct sk_buff *skb, unsigned char *dp, int len) +{ + struct iphdr *iph = (struct iphdr*)dp; + struct udphdr *uh = (struct udphdr*)(dp+(iph->ihl<<2)); + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct sock *sk; + int harderr; + u32 info; + int err; + + if (len < (iph->ihl<<2)+sizeof(struct udphdr)) { + icmp_statistics.IcmpInErrors++; + return; + } + + sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex); + if (sk == NULL) { + icmp_statistics.IcmpInErrors++; + return; /* No socket for error */ + } + + err = 0; + info = 0; + harderr = 0; + + switch (type) { + default: + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + case ICMP_SOURCE_QUENCH: + return; + case ICMP_PARAMETERPROB: + err = EPROTO; + info = ntohl(skb->h.icmph->un.gateway)>>24; + harderr = 1; + break; + case ICMP_DEST_UNREACH: + if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ + if (sk->ip_pmtudisc != IP_PMTUDISC_DONT) { + err = EMSGSIZE; + info = ntohs(skb->h.icmph->un.frag.mtu); + harderr = 1; + break; + } + return; + } + err = EHOSTUNREACH; + if (code <= NR_ICMP_UNREACH) { + harderr = icmp_err_convert[code].fatal; + err = icmp_err_convert[code].errno; + } + break; + } + + /* + * Various people wanted BSD UDP semantics. Well they've come + * back out because they slow down response to stuff like dead + * or unreachable name servers and they screw term users something + * chronic. Oh and it violates RFC1122. So basically fix your + * client code people. + */ + + /* + * RFC1122: OK. Passes ICMP errors back to application, as per + * 4.1.3.3. After the comment above, that should be no surprise. + */ + + if (!harderr && !sk->ip_recverr) + return; + + /* + * 4.x BSD compatibility item. Break RFC1122 to + * get BSD socket semantics. 
+ */ + if(sk->bsdism && sk->state!=TCP_ESTABLISHED) + return; + + if (sk->ip_recverr) + ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1)); + sk->err = err; + sk->error_report(sk); +} + + +static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base) +{ + return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base)); +} + +struct udpfakehdr +{ + struct udphdr uh; + u32 saddr; + u32 daddr; + struct iovec *iov; + u32 wcheck; +}; + +/* + * Copy and checksum a UDP packet from user space into a buffer. We still have + * to do the planning to get ip_build_xmit to spot direct transfer to network + * card and provide an additional callback mode for direct user->board I/O + * transfers. That one will be fun. + */ + +static int udp_getfrag(const void *p, char * to, unsigned int offset, unsigned int fraglen) +{ + struct udpfakehdr *ufh = (struct udpfakehdr *)p; + if (offset==0) { + if (csum_partial_copy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset, + fraglen-sizeof(struct udphdr), &ufh->wcheck)) + return -EFAULT; + ufh->wcheck = csum_partial((char *)ufh, sizeof(struct udphdr), + ufh->wcheck); + ufh->uh.check = csum_tcpudp_magic(ufh->saddr, ufh->daddr, + ntohs(ufh->uh.len), + IPPROTO_UDP, ufh->wcheck); + if (ufh->uh.check == 0) + ufh->uh.check = -1; + memcpy(to, ufh, sizeof(struct udphdr)); + return 0; + } + if (csum_partial_copy_fromiovecend(to, ufh->iov, offset-sizeof(struct udphdr), + fraglen, &ufh->wcheck)) + return -EFAULT; + return 0; +} + +/* + * Unchecksummed UDP is sufficiently critical to stuff like ATM video conferencing + * that we use two routines for this for speed. Probably we ought to have a + * CONFIG_FAST_NET set for >10Mb/second boards to activate this sort of coding. + * Timing needed to verify if this is a valid decision. + */ + +static int udp_getfrag_nosum(const void *p, char * to, unsigned int offset, unsigned int fraglen) +{ + struct udpfakehdr *ufh = (struct udpfakehdr *)p; + + if (offset==0) { + memcpy(to, ufh, sizeof(struct udphdr)); + return memcpy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset, + fraglen-sizeof(struct udphdr)); + } + return memcpy_fromiovecend(to, ufh->iov, offset-sizeof(struct udphdr), + fraglen); +} + +int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) +{ + int ulen = len + sizeof(struct udphdr); + struct ipcm_cookie ipc; + struct udpfakehdr ufh; + struct rtable *rt = NULL; + int free = 0; + int connected = 0; + u32 daddr; + u8 tos; + int err; + + /* This check is ONLY to check for arithmetic overflow + on integer(!) len. Not more! Real check will be made + in ip_build_xmit --ANK + + BTW socket.c -> af_*.c -> ... make multiple + invalid conversions size_t -> int. We MUST repair it f.e. + by replacing all of them with size_t and revise all + the places sort of len += sizeof(struct iphdr) + If len was ULONG_MAX-10 it would be cathastrophe --ANK + */ + + if (len < 0 || len > 0xFFFF) + return -EMSGSIZE; + + /* + * Check the flags. + */ + + if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */ + return -EOPNOTSUPP; + +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT|MSG_PROXY|MSG_NOSIGNAL)) + return -EINVAL; + if ((msg->msg_flags&MSG_PROXY) && !capable(CAP_NET_ADMIN)) + return -EPERM; +#else + if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL)) + return -EINVAL; +#endif + + /* + * Get and verify the address. 
+ */ + + if (msg->msg_name) { + struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name; + if (msg->msg_namelen < sizeof(*usin)) + return(-EINVAL); + if (usin->sin_family != AF_INET) { + static int complained; + if (!complained++) + printk(KERN_WARNING "%s forgot to set AF_INET in udp sendmsg. Fix it!\n", current->comm); + if (usin->sin_family) + return -EINVAL; + } + ufh.daddr = usin->sin_addr.s_addr; + ufh.uh.dest = usin->sin_port; + if (ufh.uh.dest == 0) + return -EINVAL; + } else { + if (sk->state != TCP_ESTABLISHED) + return -ENOTCONN; + ufh.daddr = sk->daddr; + ufh.uh.dest = sk->dport; + /* Open fast path for connected socket. + Route will not be used, if at least one option is set. + */ + connected = 1; + } +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (msg->msg_flags&MSG_PROXY) { + /* + * We map the first 8 bytes of a second sockaddr_in + * into the last 8 (unused) bytes of a sockaddr_in. + */ + struct sockaddr_in *from = (struct sockaddr_in *)msg->msg_name; + from = (struct sockaddr_in *)&from->sin_zero; + if (from->sin_family != AF_INET) + return -EINVAL; + ipc.addr = from->sin_addr.s_addr; + ufh.uh.source = from->sin_port; + if (ipc.addr == 0) + ipc.addr = sk->saddr; + connected = 0; + } else +#endif + { + ipc.addr = sk->saddr; + ufh.uh.source = sk->sport; + } + + ipc.opt = NULL; + ipc.oif = sk->bound_dev_if; + if (msg->msg_controllen) { + err = ip_cmsg_send(msg, &ipc); + if (err) + return err; + if (ipc.opt) + free = 1; + connected = 0; + } + if (!ipc.opt) + ipc.opt = sk->opt; + + ufh.saddr = ipc.addr; + ipc.addr = daddr = ufh.daddr; + + if (ipc.opt && ipc.opt->srr) { + if (!daddr) + return -EINVAL; + daddr = ipc.opt->faddr; + connected = 0; + } + tos = RT_TOS(sk->ip_tos); + if (sk->localroute || (msg->msg_flags&MSG_DONTROUTE) || + (ipc.opt && ipc.opt->is_strictroute)) { + tos |= RTO_ONLINK; + connected = 0; + } + + if (MULTICAST(daddr)) { + if (!ipc.oif) + ipc.oif = sk->ip_mc_index; + if (!ufh.saddr) + ufh.saddr = sk->ip_mc_addr; + connected = 0; + } + + if (connected && sk->dst_cache) { + rt = (struct rtable*)sk->dst_cache; + if (rt->u.dst.obsolete) { + sk->dst_cache = NULL; + dst_release(&rt->u.dst); + rt = NULL; + } else + dst_clone(&rt->u.dst); + } + + if (rt == NULL) { + err = ip_route_output(&rt, daddr, ufh.saddr, +#ifdef CONFIG_IP_TRANSPARENT_PROXY + (msg->msg_flags&MSG_PROXY ? RTO_TPROXY : 0) | +#endif + tos, ipc.oif); + if (err) + goto out; + + err = -EACCES; + if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast) + goto out; + if (connected && sk->dst_cache == NULL) + sk->dst_cache = dst_clone(&rt->u.dst); + } + + ufh.saddr = rt->rt_src; + if (!ipc.addr) + ufh.daddr = ipc.addr = rt->rt_dst; + ufh.uh.len = htons(ulen); + ufh.uh.check = 0; + ufh.iov = msg->msg_iov; + ufh.wcheck = 0; + + /* RFC1122: OK. Provides the checksumming facility (MUST) as per */ + /* 4.1.3.4. It's configurable by the application via setsockopt() */ + /* (MAY) and it defaults to on (MUST). */ + + err = ip_build_xmit(sk,sk->no_check ? udp_getfrag_nosum : udp_getfrag, + &ufh, ulen, &ipc, rt, msg->msg_flags); + +out: + ip_rt_put(rt); + if (free) + kfree(ipc.opt); + if (!err) { + udp_statistics.UdpOutDatagrams++; + return len; + } + return err; +} + +/* + * IOCTL requests applicable to the UDP protocol + */ + +int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch(cmd) + { + case TIOCOUTQ: + { + unsigned long amount; + + amount = sock_wspace(sk); + return put_user(amount, (int *)arg); + } + + case TIOCINQ: + { + struct sk_buff *skb; + unsigned long amount; + + amount = 0; + /* N.B. 
Is this interrupt safe?? + -> Yes. Interrupts do not remove skbs. --ANK (980725) + */ + skb = skb_peek(&sk->receive_queue); + if (skb != NULL) { + /* + * We will only return the amount + * of this packet since that is all + * that will be read. + */ + amount = skb->len - sizeof(struct udphdr); + } + return put_user(amount, (int *)arg); + } + + default: + return(-ENOIOCTLCMD); + } + return(0); +} + +#ifndef HAVE_CSUM_COPY_USER +#undef CONFIG_UDP_DELAY_CSUM +#endif + +/* + * This should be easy, if there is something there we + * return it, otherwise we block. + */ + +int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, + int noblock, int flags, int *addr_len) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + struct sk_buff *skb; + int copied, err; + + if (flags & MSG_ERRQUEUE) + return ip_recv_error(sk, msg, len); + + /* + * From here the generic datagram does a lot of the work. Come + * the finished NET3, it will do _ALL_ the work! + */ + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len - sizeof(struct udphdr); + if (copied > len) { + copied = len; + msg->msg_flags |= MSG_TRUNC; + } + +#ifndef CONFIG_UDP_DELAY_CSUM + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); +#else + if (skb->ip_summed==CHECKSUM_UNNECESSARY) { + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) { + if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) + goto csum_copy_err; + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else { + unsigned int csum; + + err = 0; + csum = csum_partial(skb->h.raw, sizeof(struct udphdr), skb->csum); + csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base, + copied, csum, &err); + if (err) + goto out_free; + if ((unsigned short)csum_fold(csum)) + goto csum_copy_err; + } +#endif + if (err) + goto out_free; + sk->stamp=skb->stamp; + + /* Copy the address. */ + if (sin) + { + /* + * Check any passed addresses + */ + if (addr_len) + *addr_len=sizeof(*sin); + + sin->sin_family = AF_INET; + sin->sin_port = skb->h.uh->source; + sin->sin_addr.s_addr = skb->nh.iph->saddr; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (flags&MSG_PROXY) + { + /* + * We map the first 8 bytes of a second sockaddr_in + * into the last 8 (unused) bytes of a sockaddr_in. + * This _is_ ugly, but it's the only way to do it + * easily, without adding system calls. + */ + struct sockaddr_in *sinto = + (struct sockaddr_in *) sin->sin_zero; + + sinto->sin_family = AF_INET; + sinto->sin_port = skb->h.uh->dest; + sinto->sin_addr.s_addr = skb->nh.iph->daddr; + } +#endif + } + if (sk->ip_cmsg_flags) + ip_cmsg_recv(msg, skb); + err = copied; + +out_free: + skb_free_datagram(sk, skb); +out: + return err; + +#ifdef CONFIG_UDP_DELAY_CSUM +csum_copy_err: + udp_statistics.UdpInErrors++; + skb_free_datagram(sk, skb); + + /* + * Error for blocking case is chosen to masquerade + * as some normal condition. + */ + return (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; +#endif +} + +int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; + struct rtable *rt; + int err; + + + if (addr_len < sizeof(*usin)) + return(-EINVAL); + + /* + * 1003.1g - break association. 
+ */ + + if (usin->sin_family==AF_UNSPEC) + { + sk->saddr=INADDR_ANY; + sk->rcv_saddr=INADDR_ANY; + sk->daddr=INADDR_ANY; + sk->state = TCP_CLOSE; + if(uh_cache_sk == sk) + uh_cache_sk = NULL; + return 0; + } + + if (usin->sin_family && usin->sin_family != AF_INET) + return(-EAFNOSUPPORT); + + dst_release(xchg(&sk->dst_cache, NULL)); + + err = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr, + sk->ip_tos|sk->localroute, sk->bound_dev_if); + if (err) + return err; + if ((rt->rt_flags&RTCF_BROADCAST) && !sk->broadcast) { + ip_rt_put(rt); + return -EACCES; + } + if(!sk->saddr) + sk->saddr = rt->rt_src; /* Update source address */ + if(!sk->rcv_saddr) + sk->rcv_saddr = rt->rt_src; + sk->daddr = rt->rt_dst; + sk->dport = usin->sin_port; + sk->state = TCP_ESTABLISHED; + + if(uh_cache_sk == sk) + uh_cache_sk = NULL; + + sk->dst_cache = &rt->u.dst; + return(0); +} + + +static void udp_close(struct sock *sk, long timeout) +{ + /* See for explanation: raw_close in ipv4/raw.c */ + sk->state = TCP_CLOSE; + udp_v4_unhash(sk); + sk->dead = 1; + destroy_sock(sk); +} + +static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) +{ + /* + * Charge it to the socket, dropping if the queue is full. + */ + +#if defined(CONFIG_FILTER) && defined(CONFIG_UDP_DELAY_CSUM) + if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) { + if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) { + udp_statistics.UdpInErrors++; + ip_statistics.IpInDiscards++; + ip_statistics.IpInDelivers--; + kfree_skb(skb); + return -1; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } +#endif + + if (sock_queue_rcv_skb(sk,skb)<0) { + udp_statistics.UdpInErrors++; + ip_statistics.IpInDiscards++; + ip_statistics.IpInDelivers--; + kfree_skb(skb); + return -1; + } + udp_statistics.UdpInDatagrams++; + return 0; +} + + +static inline void udp_deliver(struct sock *sk, struct sk_buff *skb) +{ + udp_queue_rcv_skb(sk, skb); +} + +/* + * Multicasts and broadcasts go to each listener. + * + * Note: called only from the BH handler context, + * so we don't need to lock the hashes. + */ +static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, + u32 saddr, u32 daddr) +{ + struct sock *sk; + int dif; + + sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]; + dif = skb->dev->ifindex; + sk = udp_v4_mcast_next(sk, uh->dest, saddr, uh->source, daddr, dif); + if (sk) { + struct sock *sknext = NULL; + + do { + struct sk_buff *skb1 = skb; + + sknext = udp_v4_mcast_next(sk->next, uh->dest, saddr, + uh->source, daddr, dif); + if(sknext) + skb1 = skb_clone(skb, GFP_ATOMIC); + + if(skb1) + udp_deliver(sk, skb1); + sk = sknext; + } while(sknext); + } else + kfree_skb(skb); + return 0; +} + +#ifdef CONFIG_IP_TRANSPARENT_PROXY +/* + * Check whether a received UDP packet might be for one of our + * sockets. + */ + +int udp_chkaddr(struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + struct udphdr *uh = (struct udphdr *)(skb->nh.raw + iph->ihl*4); + struct sock *sk; + + sk = udp_v4_lookup(iph->saddr, uh->source, iph->daddr, uh->dest, skb->dev->ifindex); + if (!sk) + return 0; + + /* 0 means accept all LOCAL addresses here, not all the world... */ + if (sk->rcv_saddr == 0) + return 0; + + return 1; +} +#endif + +/* + * All we need to do is get the socket, and then do a checksum. 
+ */ + +int udp_rcv(struct sk_buff *skb, unsigned short len) +{ + struct sock *sk; + struct udphdr *uh; + unsigned short ulen; + struct rtable *rt = (struct rtable*)skb->dst; + u32 saddr = skb->nh.iph->saddr; + u32 daddr = skb->nh.iph->daddr; + + /* + * First time through the loop.. Do all the setup stuff + * (including finding out the socket we go to etc) + */ + + /* + * Get the header. + */ + + uh = skb->h.uh; + __skb_pull(skb, skb->h.raw - skb->data); + + ip_statistics.IpInDelivers++; + + /* + * Validate the packet and the UDP length. + */ + + ulen = ntohs(uh->len); + + if (ulen > len || ulen < sizeof(*uh)) { + NETDEBUG(printk(KERN_DEBUG "UDP: short packet: %d/%d\n", ulen, len)); + udp_statistics.UdpInErrors++; + kfree_skb(skb); + return(0); + } + skb_trim(skb, ulen); + +#ifndef CONFIG_UDP_DELAY_CSUM + if (uh->check && + (((skb->ip_summed==CHECKSUM_HW)&&udp_check(uh,ulen,saddr,daddr,skb->csum)) || + ((skb->ip_summed==CHECKSUM_NONE) && + (udp_check(uh,ulen,saddr,daddr, csum_partial((char*)uh, ulen, 0)))))) + goto csum_error; +#else + if (uh->check==0) + skb->ip_summed = CHECKSUM_UNNECESSARY; + else if (skb->ip_summed==CHECKSUM_HW) { + if (udp_check(uh,ulen,saddr,daddr,skb->csum)) + goto csum_error; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); +#endif + + if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) + return udp_v4_mcast_deliver(skb, uh, saddr, daddr); + +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (IPCB(skb)->redirport) + sk = udp_v4_proxy_lookup(uh->dest, saddr, uh->source, + daddr, skb->dev, IPCB(skb)->redirport, + skb->dev->ifindex); + else +#endif + sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex); + + if (sk == NULL) { +#ifdef CONFIG_UDP_DELAY_CSUM + if (skb->ip_summed != CHECKSUM_UNNECESSARY && + (unsigned short)csum_fold(csum_partial((char*)uh, ulen, skb->csum))) + goto csum_error; +#endif + udp_statistics.UdpNoPorts++; + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + /* + * Hmm. We got an UDP broadcast to a port to which we + * don't wanna listen. Ignore it. + */ + kfree_skb(skb); + return(0); + } + udp_deliver(sk, skb); + return 0; + +csum_error: + /* + * RFC1122: OK. Discards the bad packet silently (as far as + * the network is concerned, anyway) as per 4.1.3.4 (MUST). + */ + NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. 
From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", + NIPQUAD(saddr), + ntohs(uh->source), + NIPQUAD(daddr), + ntohs(uh->dest), + ulen)); + udp_statistics.UdpInErrors++; + kfree_skb(skb); + return(0); +} + +struct proto udp_prot = { + (struct sock *)&udp_prot, /* sklist_next */ + (struct sock *)&udp_prot, /* sklist_prev */ + udp_close, /* close */ + udp_connect, /* connect */ + NULL, /* accept */ + NULL, /* retransmit */ + NULL, /* write_wakeup */ + NULL, /* read_wakeup */ + datagram_poll, /* poll */ + udp_ioctl, /* ioctl */ + NULL, /* init */ + NULL, /* destroy */ + NULL, /* shutdown */ + ip_setsockopt, /* setsockopt */ + ip_getsockopt, /* getsockopt */ + udp_sendmsg, /* sendmsg */ + udp_recvmsg, /* recvmsg */ + NULL, /* bind */ + udp_queue_rcv_skb, /* backlog_rcv */ + udp_v4_hash, /* hash */ + udp_v4_unhash, /* unhash */ + udp_v4_get_port, /* good_socknum */ + 128, /* max_header */ + 0, /* retransmits */ + "UDP", /* name */ + 0, /* inuse */ + 0 /* highestinuse */ +}; diff --git a/pfinet/linux-src/net/ipv4/utils.c b/pfinet/linux-src/net/ipv4/utils.c new file mode 100644 index 00000000..ce74ade2 --- /dev/null +++ b/pfinet/linux-src/net/ipv4/utils.c @@ -0,0 +1,91 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Various kernel-resident INET utility functions; mainly + * for format conversion and debugging output. + * + * Version: $Id: utils.c,v 1.6 1997/12/13 21:53:03 kuznet Exp $ + * + * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * + * Fixes: + * Alan Cox : verify_area check. + * Alan Cox : removed old debugging. + * Andi Kleen : add net_ratelimit() + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/stat.h> +#include <stdarg.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <linux/skbuff.h> + + +/* + * Display an IP address in readable format. + */ + +char *in_ntoa(__u32 in) +{ + static char buff[18]; + char *p; + + p = (char *) ∈ + sprintf(buff, "%d.%d.%d.%d", + (p[0] & 255), (p[1] & 255), (p[2] & 255), (p[3] & 255)); + return(buff); +} + + +/* + * Convert an ASCII string to binary IP. + */ + +__u32 in_aton(const char *str) +{ + unsigned long l; + unsigned int val; + int i; + + l = 0; + for (i = 0; i < 4; i++) + { + l <<= 8; + if (*str != '\0') + { + val = 0; + while (*str != '\0' && *str != '.') + { + val *= 10; + val += *str - '0'; + str++; + } + l |= val; + if (*str != '\0') + str++; + } + } + return(htonl(l)); +} + |
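
Illustrative note (not part of the imported source above): a minimal sketch of how the two helpers at the end of utils.c are meant to be used, assuming only the in_aton()/in_ntoa() definitions and the headers already included in that file. The caller name below is hypothetical; note that in_ntoa() formats into a static buffer, so its result must be consumed before the next call overwrites it.

/* Hypothetical caller, for illustration only -- not part of this commit. */
static void example_addr_roundtrip(void)
{
	__u32 addr = in_aton("10.0.0.1");	/* returns the address in network byte order */

	/* in_ntoa() returns a pointer to a static buffer, so use the string
	 * before calling it again.
	 */
	printk(KERN_DEBUG "parsed %s (0x%08x in host order)\n",
	       in_ntoa(addr), ntohl(addr));
}
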