Diffstat (limited to 'pfinet/linux-src/net')
-rw-r--r-- pfinet/linux-src/net/core/Makefile | 41
-rw-r--r-- pfinet/linux-src/net/core/datagram.c | 249
-rw-r--r-- pfinet/linux-src/net/core/dev.c | 2026
-rw-r--r-- pfinet/linux-src/net/core/dev_mcast.c | 252
-rw-r--r-- pfinet/linux-src/net/core/dst.c | 145
-rw-r--r-- pfinet/linux-src/net/core/filter.c | 454
-rw-r--r-- pfinet/linux-src/net/core/firewall.c | 160
-rw-r--r-- pfinet/linux-src/net/core/iovec.c | 278
-rw-r--r-- pfinet/linux-src/net/core/neighbour.c | 1394
-rw-r--r-- pfinet/linux-src/net/core/profile.c | 305
-rw-r--r-- pfinet/linux-src/net/core/rtnetlink.c | 512
-rw-r--r-- pfinet/linux-src/net/core/scm.c | 280
-rw-r--r-- pfinet/linux-src/net/core/skbuff.c | 385
-rw-r--r-- pfinet/linux-src/net/core/sock.c | 1051
-rw-r--r-- pfinet/linux-src/net/core/sysctl_net_core.c | 61
-rw-r--r-- pfinet/linux-src/net/core/utils.c | 66
-rw-r--r-- pfinet/linux-src/net/ethernet/Makefile | 33
-rw-r--r-- pfinet/linux-src/net/ethernet/eth.c | 298
-rw-r--r-- pfinet/linux-src/net/ethernet/pe2.c | 38
-rw-r--r-- pfinet/linux-src/net/ethernet/sysctl_net_ether.c | 13
-rw-r--r-- pfinet/linux-src/net/ipv4/Config.in | 88
-rw-r--r-- pfinet/linux-src/net/ipv4/Makefile | 116
-rw-r--r-- pfinet/linux-src/net/ipv4/af_inet.c | 1161
-rw-r--r-- pfinet/linux-src/net/ipv4/arp.c | 1154
-rw-r--r-- pfinet/linux-src/net/ipv4/devinet.c | 1034
-rw-r--r-- pfinet/linux-src/net/ipv4/fib_frontend.c | 628
-rw-r--r-- pfinet/linux-src/net/ipv4/fib_hash.c | 885
-rw-r--r-- pfinet/linux-src/net/ipv4/fib_rules.c | 419
-rw-r--r-- pfinet/linux-src/net/ipv4/fib_semantics.c | 991
-rw-r--r-- pfinet/linux-src/net/ipv4/icmp.c | 1155
-rw-r--r-- pfinet/linux-src/net/ipv4/igmp.c | 698
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_forward.c | 297
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_fragment.c | 593
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_fw.c | 1759
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_gre.c | 1223
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_input.c | 549
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_masq.c | 2545
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_masq_app.c | 603
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_masq_autofw.c | 448
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_masq_cuseeme.c | 264
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_masq_ftp.c | 393
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_masq_irc.c | 345
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_masq_mfw.c | 769
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_masq_mod.c | 322
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_masq_portfw.c | 508
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_masq_quake.c | 322
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_masq_raudio.c | 578
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_masq_user.c | 473
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_masq_vdolive.c | 294
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_nat_dumb.c | 158
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_options.c | 617
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_output.c | 992
-rw-r--r-- pfinet/linux-src/net/ipv4/ip_sockglue.c | 739
-rw-r--r-- pfinet/linux-src/net/ipv4/ipconfig.c | 970
-rw-r--r-- pfinet/linux-src/net/ipv4/ipip.c | 870
-rw-r--r-- pfinet/linux-src/net/ipv4/ipmr.c | 1609
-rw-r--r-- pfinet/linux-src/net/ipv4/proc.c | 387
-rw-r--r-- pfinet/linux-src/net/ipv4/protocol.c | 211
-rw-r--r-- pfinet/linux-src/net/ipv4/rarp.c | 606
-rw-r--r-- pfinet/linux-src/net/ipv4/raw.c | 573
-rw-r--r-- pfinet/linux-src/net/ipv4/route.c | 2048
-rw-r--r-- pfinet/linux-src/net/ipv4/syncookies.c | 201
-rw-r--r-- pfinet/linux-src/net/ipv4/sysctl_net_ipv4.c | 205
-rw-r--r-- pfinet/linux-src/net/ipv4/tcp.c | 1826
-rw-r--r-- pfinet/linux-src/net/ipv4/tcp_input.c | 2432
-rw-r--r-- pfinet/linux-src/net/ipv4/tcp_ipv4.c | 2044
-rw-r--r-- pfinet/linux-src/net/ipv4/tcp_output.c | 1143
-rw-r--r-- pfinet/linux-src/net/ipv4/tcp_timer.c | 595
-rw-r--r-- pfinet/linux-src/net/ipv4/timer.c | 127
-rw-r--r-- pfinet/linux-src/net/ipv4/udp.c | 1191
-rw-r--r-- pfinet/linux-src/net/ipv4/utils.c | 91
71 files changed, 48290 insertions, 0 deletions
diff --git a/pfinet/linux-src/net/core/Makefile b/pfinet/linux-src/net/core/Makefile
new file mode 100644
index 00000000..5df65cd2
--- /dev/null
+++ b/pfinet/linux-src/net/core/Makefile
@@ -0,0 +1,41 @@
+#
+# Makefile for the Linux networking core.
+#
+# Note! Dependencies are done automagically by 'make dep', which also
+# removes any old dependencies. DON'T put your own dependencies here
+# unless it's something special (ie not a .c file).
+#
+# Note 2! The CFLAGS definition is now in the main makefile...
+
+O_TARGET := core.o
+
+O_OBJS := sock.o skbuff.o iovec.o datagram.o scm.o
+
+ifeq ($(CONFIG_SYSCTL),y)
+ifeq ($(CONFIG_NET),y)
+O_OBJS += sysctl_net_core.o
+endif
+endif
+
+ifdef CONFIG_FILTER
+O_OBJS += filter.o
+endif
+
+ifdef CONFIG_NET
+
+O_OBJS += dev.o dev_mcast.o dst.o neighbour.o rtnetlink.o utils.o
+
+ifdef CONFIG_FIREWALL
+OX_OBJS += firewall.o
+endif
+
+endif
+
+ifdef CONFIG_NET_PROFILE
+OX_OBJS += profile.o
+endif
+
+include $(TOPDIR)/Rules.make
+
+tar:
+ tar -cvf /dev/f1 .
diff --git a/pfinet/linux-src/net/core/datagram.c b/pfinet/linux-src/net/core/datagram.c
new file mode 100644
index 00000000..9bb68fa4
--- /dev/null
+++ b/pfinet/linux-src/net/core/datagram.c
@@ -0,0 +1,249 @@
+/*
+ * SUCS NET3:
+ *
+ * Generic datagram handling routines. These are generic for all protocols. Possibly a generic IP version on top
+ * of these would make sense. Not tonight however 8-).
+ * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and NetROM layers all have identical poll code and mostly
+ * identical recvmsg() code. So we share it here. The poll was shared before but buried in udp.c so I moved it.
+ *
+ * Authors: Alan Cox <alan@redhat.com>. (datagram_poll() from old udp.c code)
+ *
+ * Fixes:
+ * Alan Cox : NULL return from skb_peek_copy() understood
+ * Alan Cox : Rewrote skb_read_datagram to avoid the skb_peek_copy stuff.
+ * Alan Cox : Added support for SOCK_SEQPACKET. IPX can no longer use the SO_TYPE hack but
+ * AX.25 now works right, and SPX is feasible.
+ * Alan Cox : Fixed write poll of non IP protocol crash.
+ * Florian La Roche: Changed for my new skbuff handling.
+ * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET.
+ * Linus Torvalds : BSD semantic fixes.
+ * Alan Cox : Datagram iovec handling
+ * Darryl Miles : Fixed non-blocking SOCK_STREAM.
+ * Alan Cox : POSIXisms
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/poll.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+
+
+/*
+ * Wait for a packet..
+ *
+ * Interrupts are off so that no packet arrives before we begin sleeping;
+ * otherwise we might miss our wakeup.
+ */
+
+static inline void wait_for_packet(struct sock * sk)
+{
+ struct wait_queue wait = { current, NULL };
+
+ add_wait_queue(sk->sleep, &wait);
+ current->state = TASK_INTERRUPTIBLE;
+
+ if (skb_peek(&sk->receive_queue) == NULL)
+ schedule();
+
+ current->state = TASK_RUNNING;
+ remove_wait_queue(sk->sleep, &wait);
+}
+
+/*
+ * Is a socket 'connection oriented' ?
+ */
+
+static inline int connection_based(struct sock *sk)
+{
+ return (sk->type==SOCK_SEQPACKET || sk->type==SOCK_STREAM);
+}
+
+/*
+ * Get a datagram skbuff; understands the peeking, nonblocking wakeups and possible
+ * races. This replaces identical code in packet, raw and udp, as well as the IPX,
+ * AX.25 and Appletalk layers. It also finally fixes the long-standing peek and read
+ * race for datagram sockets. If you alter this routine remember it must be
+ * re-entrant.
+ *
+ * (Historical note: this function used to lock the socket when it returned an
+ * skb, leaving the caller to unlock it, usually via skb_free_datagram().
+ * As ANK explains below, it no longer does.)
+ *
+ * * It no longer locks the socket. This function is
+ * * free of race conditions. This measure should/can improve
+ * * significantly datagram socket latencies at high loads,
+ * * when data copying to user space takes lots of time.
+ * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
+ * * 8) Great win.)
+ * * --ANK (980729)
+ *
+ * The order of the tests when we find no data waiting is specified
+ * quite explicitly by POSIX 1003.1g; don't change it without having
+ * the standard around please.
+ */
+
+struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err)
+{
+ int error;
+ struct sk_buff *skb;
+
+ /* Caller is allowed not to check sk->err before skb_recv_datagram() */
+ error = sock_error(sk);
+ if (error)
+ goto no_packet;
+
+restart:
+ while(skb_queue_empty(&sk->receive_queue)) /* No data */
+ {
+ /* Socket errors? */
+ error = sock_error(sk);
+ if (error)
+ goto no_packet;
+
+ /* Socket shut down? */
+ if (sk->shutdown & RCV_SHUTDOWN)
+ goto no_packet;
+
+ /* Sequenced packets can come disconnected. If so we report the problem */
+ error = -ENOTCONN;
+ if(connection_based(sk) && sk->state!=TCP_ESTABLISHED)
+ goto no_packet;
+
+ /* handle signals */
+ error = -ERESTARTSYS;
+ if (signal_pending(current))
+ goto no_packet;
+
+ /* User doesn't want to wait */
+ error = -EAGAIN;
+ if (noblock)
+ goto no_packet;
+
+ wait_for_packet(sk);
+ }
+
+ /* Again only user level code calls this function, so nothing interrupt level
+ will suddenly eat the receive_queue */
+ if (flags & MSG_PEEK)
+ {
+ unsigned long cpu_flags;
+
+ /* This is the only POTENTIAL race condition
+ in this function. The skb may be stolen by
+ another receiver after the peek, but before
+ the use count is incremented, provided the
+ kernel is reentrant (it is not) or this
+ function is called from interrupts.
+
+ Protect it with the global skb spinlock,
+ though for now even this is overkill.
+ --ANK (980728)
+ */
+ spin_lock_irqsave(&skb_queue_lock, cpu_flags);
+ skb = skb_peek(&sk->receive_queue);
+ if(skb!=NULL)
+ atomic_inc(&skb->users);
+ spin_unlock_irqrestore(&skb_queue_lock, cpu_flags);
+ } else
+ skb = skb_dequeue(&sk->receive_queue);
+
+ if (!skb) /* Avoid race if someone beats us to the data */
+ goto restart;
+ return skb;
+
+no_packet:
+ *err = error;
+ return NULL;
+}
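+
+/*
+ * Illustrative sketch (not part of the original source): the calling
+ * pattern udp.c and raw.c build around the helpers above -- receive,
+ * copy to the user's iovec, free. The function name and the zero
+ * offset are hypothetical; real protocols skip their header here.
+ */
+static int example_recvmsg(struct sock *sk, struct msghdr *msg, int len,
+ int noblock, int flags)
+{
+ struct sk_buff *skb;
+ int err, copied;
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (skb == NULL)
+ return err;
+
+ copied = skb->len;
+ if (copied > len) {
+ copied = len;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ skb_free_datagram(sk, skb);
+ return err ? err : copied;
+}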
+
+void skb_free_datagram(struct sock * sk, struct sk_buff *skb)
+{
+ kfree_skb(skb);
+}
+
+/*
+ * Copy a datagram to a linear buffer.
+ */
+
+int skb_copy_datagram(struct sk_buff *skb, int offset, char *to, int size)
+{
+ int err = -EFAULT;
+
+ if (!copy_to_user(to, skb->h.raw + offset, size))
+ err = 0;
+ return err;
+}
+
+
+/*
+ * Copy a datagram to an iovec.
+ * Note: the iovec is modified during the copy.
+ */
+
+int skb_copy_datagram_iovec(struct sk_buff *skb, int offset, struct iovec *to,
+ int size)
+{
+ return memcpy_toiovec(to, skb->h.raw + offset, size);
+}
+
+/*
+ * Datagram poll: Again totally generic. This also handles
+ * sequenced packet sockets providing the socket receive queue
+ * is only ever holding data ready to receive.
+ *
+ * Note: when you _don't_ use this routine for this protocol,
+ * and you use a different write policy from sock_writeable()
+ * then please supply your own write_space callback.
+ */
+
+unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ unsigned int mask;
+
+ poll_wait(file, sk->sleep, wait);
+ mask = 0;
+
+ /* exceptional events? */
+ if (sk->err || !skb_queue_empty(&sk->error_queue))
+ mask |= POLLERR;
+ if (sk->shutdown & RCV_SHUTDOWN)
+ mask |= POLLHUP;
+
+ /* readable? */
+ if (!skb_queue_empty(&sk->receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ /* Connection-based need to check for termination and startup */
+ if (connection_based(sk)) {
+ if (sk->state==TCP_CLOSE)
+ mask |= POLLHUP;
+ /* connection hasn't started yet? */
+ if (sk->state == TCP_SYN_SENT)
+ return mask;
+ }
+
+ /* writable? */
+ if (sock_writeable(sk))
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ else
+ sk->socket->flags |= SO_NOSPACE;
+
+ return mask;
+}
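+
+/*
+ * Illustrative note (not part of the original source): protocols export
+ * datagram_poll() straight through their proto_ops table. The 2.2
+ * initializers are positional; af_inet.c does, roughly:
+ *
+ * struct proto_ops inet_dgram_ops = {
+ * PF_INET,
+ * ...
+ * datagram_poll,
+ * ...
+ * };
+ */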
diff --git a/pfinet/linux-src/net/core/dev.c b/pfinet/linux-src/net/core/dev.c
new file mode 100644
index 00000000..cc9584a1
--- /dev/null
+++ b/pfinet/linux-src/net/core/dev.c
@@ -0,0 +1,2026 @@
+/*
+ * NET3 Protocol independent device support routines.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Derived from the non IP parts of dev.c 1.0.19
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *
+ * Additional Authors:
+ * Florian la Roche <rzsfl@rz.uni-sb.de>
+ * Alan Cox <gw4pts@gw4pts.ampr.org>
+ * David Hinds <dhinds@allegro.stanford.edu>
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ * Adam Sulmicki <adam@cfar.umd.edu>
+ *
+ * Changes:
+ * Marcelo Tosatti <marcelo@conectiva.com.br> : don't accept mtu 0 or less
+ * Alan Cox : device private ioctl copies fields back.
+ * Alan Cox : Transmit queue code does relevant stunts to
+ * keep the queue safe.
+ * Alan Cox : Fixed double lock.
+ * Alan Cox : Fixed promisc NULL pointer trap
+ * ???????? : Support the full private ioctl range
+ * Alan Cox : Moved ioctl permission check into drivers
+ * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
+ * Alan Cox : 100 backlog just doesn't cut it when
+ * you start doing multicast video 8)
+ * Alan Cox : Rewrote net_bh and list manager.
+ * Alan Cox : Fix ETH_P_ALL echoback lengths.
+ * Alan Cox : Took out transmit every packet pass
+ * Saved a few bytes in the ioctl handler
+ * Alan Cox : Network driver sets packet type before calling netif_rx. Saves
+ * a function call a packet.
+ * Alan Cox : Hashed net_bh()
+ * Richard Kooijman: Timestamp fixes.
+ * Alan Cox : Wrong field in SIOCGIFDSTADDR
+ * Alan Cox : Device lock protection.
+ * Alan Cox : Fixed nasty side effect of device close changes.
+ * Rudi Cilibrasi : Pass the right thing to set_mac_address()
+ * Dave Miller : 32bit quantity for the device lock to make it work out
+ * on a Sparc.
+ * Bjorn Ekwall : Added KERNELD hack.
+ * Alan Cox : Cleaned up the backlog initialise.
+ * Craig Metz : SIOCGIFCONF fix if space for under
+ * 1 device.
+ * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
+ * is no device open function.
+ * Andi Kleen : Fix error reporting for SIOCGIFCONF
+ * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
+ * Cyrus Durgin : Cleaned for KMOD
+ * Adam Sulmicki : Bug Fix : Network Device Unload
+ * A network device unload needs to purge
+ * the backlog queue.
+ * Paul Rusty Russel : SIOCSIFNAME
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/rtnetlink.h>
+#include <net/slhc.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <net/br.h>
+#include <net/dst.h>
+#include <net/pkt_sched.h>
+#include <net/profile.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#ifdef CONFIG_NET_RADIO
+#include <linux/wireless.h>
+#endif /* CONFIG_NET_RADIO */
+#ifdef CONFIG_PLIP
+extern int plip_init(void);
+#endif
+
+NET_PROFILE_DEFINE(dev_queue_xmit)
+NET_PROFILE_DEFINE(net_bh)
+NET_PROFILE_DEFINE(net_bh_skb)
+
+
+const char *if_port_text[] = {
+ "unknown",
+ "BNC",
+ "10baseT",
+ "AUI",
+ "100baseT",
+ "100baseTX",
+ "100baseFX"
+};
+
+/*
+ * The list of packet types we will receive (as opposed to discard)
+ * and the routines to invoke.
+ *
+ * Why 16. Because with 16 the only overlap we get on a hash of the
+ * low nibble of the protocol value is RARP/SNAP/X.25.
+ *
+ * 0800 IP
+ * 0001 802.3
+ * 0002 AX.25
+ * 0004 802.2
+ * 8035 RARP
+ * 0005 SNAP
+ * 0805 X.25
+ * 0806 ARP
+ * 8137 IPX
+ * 0009 Localtalk
+ * 86DD IPv6
+ */
+
+struct packet_type *ptype_base[16]; /* 16 way hashed list */
+struct packet_type *ptype_all = NULL; /* Taps */
+
+/*
+ * Device list lock. Taking it ensures that an interface
+ * will not disappear unexpectedly while the kernel sleeps.
+ */
+
+atomic_t dev_lockct = ATOMIC_INIT(0);
+
+/*
+ * Our notifier list
+ */
+
+static struct notifier_block *netdev_chain=NULL;
+
+/*
+ * Device drivers call our routines to queue packets here. We empty the
+ * queue in the bottom half handler.
+ */
+
+static struct sk_buff_head backlog;
+
+#ifdef CONFIG_NET_FASTROUTE
+int netdev_fastroute;
+int netdev_fastroute_obstacles;
+struct net_fastroute_stats dev_fastroute_stat;
+#endif
+
+static void dev_clear_backlog(struct device *dev);
+
+
+/******************************************************************************************
+
+ Protocol management and registration routines
+
+*******************************************************************************************/
+
+/*
+ * For efficiency
+ */
+
+int netdev_nit=0;
+
+/*
+ * Add a protocol ID to the list. Now that the input handler is
+ * smarter we can dispense with all the messy stuff that used to be
+ * here.
+ *
+ * BEWARE!!! Protocol handlers that mangle input packets
+ * MUST BE last in the hash buckets, and the checking of protocol
+ * handlers MUST start from the promiscuous ptype_all chain in net_bh.
+ * It is true now, do not change it.
+ * Explanation follows: if a protocol handler that mangles packets
+ * is first in the list, it cannot sense that the packet
+ * is cloned and should be copied-on-write, so it will
+ * change it and subsequent readers will get a broken packet.
+ * --ANK (980803)
+ */
+
+void dev_add_pack(struct packet_type *pt)
+{
+ int hash;
+#ifdef CONFIG_NET_FASTROUTE
+ /* Hack to detect packet socket */
+ if (pt->data) {
+ netdev_fastroute_obstacles++;
+ dev_clear_fastroute(pt->dev);
+ }
+#endif
+ if(pt->type==htons(ETH_P_ALL))
+ {
+ netdev_nit++;
+ pt->next=ptype_all;
+ ptype_all=pt;
+ }
+ else
+ {
+ hash=ntohs(pt->type)&15;
+ pt->next = ptype_base[hash];
+ ptype_base[hash] = pt;
+ }
+}
+
+
+/*
+ * Remove a protocol ID from the list.
+ */
+
+void dev_remove_pack(struct packet_type *pt)
+{
+ struct packet_type **pt1;
+ if(pt->type==htons(ETH_P_ALL))
+ {
+ netdev_nit--;
+ pt1=&ptype_all;
+ }
+ else
+ pt1=&ptype_base[ntohs(pt->type)&15];
+ for(; (*pt1)!=NULL; pt1=&((*pt1)->next))
+ {
+ if(pt==(*pt1))
+ {
+ *pt1=pt->next;
+ synchronize_bh();
+#ifdef CONFIG_NET_FASTROUTE
+ if (pt->data)
+ netdev_fastroute_obstacles--;
+#endif
+ return;
+ }
+ }
+ printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
+}
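+
+/*
+ * Illustrative sketch (not part of the original source): how a protocol
+ * hooks itself in. Fields are positional in 2.2: type, dev, func, data,
+ * next. example_rcv and example_packet_type are hypothetical; ipv4
+ * registers ip_rcv this way from af_inet.c.
+ */
+static int example_rcv(struct sk_buff *skb, struct device *dev,
+ struct packet_type *pt)
+{
+ /* A real handler would parse the packet at skb->nh here. */
+ kfree_skb(skb);
+ return 0;
+}
+
+static struct packet_type example_packet_type = {
+ __constant_htons(ETH_P_IP), /* protocol ID to match */
+ NULL, /* any device */
+ example_rcv, /* handler */
+ NULL, /* private data (packet sockets use this) */
+ NULL /* next: filled in by dev_add_pack() */
+};
+
+/* Registration is then simply: dev_add_pack(&example_packet_type); */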
+
+/*****************************************************************************************
+
+ Device Interface Subroutines
+
+******************************************************************************************/
+
+/*
+ * Find an interface by name.
+ */
+
+struct device *dev_get(const char *name)
+{
+ struct device *dev;
+
+ for (dev = dev_base; dev != NULL; dev = dev->next)
+ {
+ if (strcmp(dev->name, name) == 0)
+ return(dev);
+ }
+ return NULL;
+}
+
+struct device * dev_get_by_index(int ifindex)
+{
+ struct device *dev;
+
+ for (dev = dev_base; dev != NULL; dev = dev->next)
+ {
+ if (dev->ifindex == ifindex)
+ return(dev);
+ }
+ return NULL;
+}
+
+struct device *dev_getbyhwaddr(unsigned short type, char *ha)
+{
+ struct device *dev;
+
+ for (dev = dev_base; dev != NULL; dev = dev->next)
+ {
+ if (dev->type == type &&
+ memcmp(dev->dev_addr, ha, dev->addr_len) == 0)
+ return(dev);
+ }
+ return(NULL);
+}
+
+/*
+ * Passed a format string - eg "lt%d" - it will try to find a suitable
+ * id. Not efficient for many devices, but it is not called a lot..
+ */
+
+int dev_alloc_name(struct device *dev, const char *name)
+{
+ int i;
+ /*
+ * If you need over 100 please also fix the algorithm...
+ */
+ for(i=0;i<100;i++)
+ {
+ sprintf(dev->name,name,i);
+ if(dev_get(dev->name)==NULL)
+ return i;
+ }
+ return -ENFILE; /* Over 100 of the things .. bail out! */
+}
+
+struct device *dev_alloc(const char *name, int *err)
+{
+ struct device *dev=kmalloc(sizeof(struct device)+16, GFP_KERNEL);
+ if(dev==NULL)
+ {
+ *err=-ENOBUFS;
+ return NULL;
+ }
+ dev->name=(char *)(dev+1); /* Name string space */
+ *err=dev_alloc_name(dev,name);
+ if(*err<0)
+ {
+ kfree(dev);
+ return NULL;
+ }
+ return dev;
+}
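+
+/*
+ * Illustrative sketch (not part of the original source): a typical
+ * dev_alloc() caller; "mydev%d" is a hypothetical name template.
+ */
+static struct device *example_alloc_dev(void)
+{
+ int err;
+ struct device *dev = dev_alloc("mydev%d", &err);
+
+ if (dev == NULL)
+ printk(KERN_WARNING "example: dev_alloc failed, err %d\n", err);
+ return dev; /* dev->name is now e.g. "mydev0" */
+}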
+
+void netdev_state_change(struct device *dev)
+{
+ if (dev->flags&IFF_UP)
+ notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
+}
+
+
+/*
+ * Find and possibly load an interface.
+ */
+
+#ifdef CONFIG_KMOD
+
+void dev_load(const char *name)
+{
+ if(!dev_get(name) && capable(CAP_SYS_MODULE))
+ request_module(name);
+}
+
+#else
+
+extern inline void dev_load(const char *unused){;}
+
+#endif
+
+static int default_rebuild_header(struct sk_buff *skb)
+{
+ printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", skb->dev ? skb->dev->name : "NULL!!!");
+ kfree_skb(skb);
+ return 1;
+}
+
+/*
+ * Prepare an interface for use.
+ */
+
+int dev_open(struct device *dev)
+{
+ int ret = 0;
+
+ /*
+ * Is it already up?
+ */
+
+ if (dev->flags&IFF_UP)
+ return 0;
+
+ /*
+ * Call device private open method
+ */
+
+ if (dev->open)
+ ret = dev->open(dev);
+
+ /*
+ * If it went open OK then:
+ */
+
+ if (ret == 0)
+ {
+ /*
+ * Set a nil rebuild_header routine that should
+ * never be called; it is used just as a bug trap.
+ */
+
+ if (dev->rebuild_header == NULL)
+ dev->rebuild_header = default_rebuild_header;
+
+ /*
+ * Set the flags.
+ */
+ dev->flags |= (IFF_UP | IFF_RUNNING);
+
+ /*
+ * Initialize multicasting status
+ */
+ dev_mc_upload(dev);
+
+ /*
+ * Wakeup transmit queue engine
+ */
+ dev_activate(dev);
+
+ /*
+ * ... and announce new interface.
+ */
+ notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
+
+ }
+ return(ret);
+}
+
+#ifdef CONFIG_NET_FASTROUTE
+
+static __inline__ void dev_do_clear_fastroute(struct device *dev)
+{
+ if (dev->accept_fastpath) {
+ int i;
+
+ for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++)
+ dst_release_irqwait(xchg(dev->fastpath+i, NULL));
+ }
+}
+
+void dev_clear_fastroute(struct device *dev)
+{
+ if (dev) {
+ dev_do_clear_fastroute(dev);
+ } else {
+ for (dev = dev_base; dev; dev = dev->next)
+ dev_do_clear_fastroute(dev);
+ }
+}
+#endif
+
+/*
+ * Completely shutdown an interface.
+ */
+
+int dev_close(struct device *dev)
+{
+ if (!(dev->flags&IFF_UP))
+ return 0;
+
+ dev_deactivate(dev);
+
+ dev_lock_wait();
+
+ /*
+ * Call the device specific close. This cannot fail.
+ * Only if device is UP
+ */
+
+ if (dev->stop)
+ dev->stop(dev);
+
+ if (dev->start)
+ printk("dev_close: bug %s still running\n", dev->name);
+
+ /*
+ * Device is now down.
+ */
+ dev_clear_backlog(dev);
+
+ dev->flags&=~(IFF_UP|IFF_RUNNING);
+#ifdef CONFIG_NET_FASTROUTE
+ dev_clear_fastroute(dev);
+#endif
+
+ /*
+ * Tell people we are going down
+ */
+ notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
+
+ return(0);
+}
+
+
+/*
+ * Device change register/unregister. These are not inline or static
+ * as we export them to the world.
+ */
+
+int register_netdevice_notifier(struct notifier_block *nb)
+{
+ return notifier_chain_register(&netdev_chain, nb);
+}
+
+int unregister_netdevice_notifier(struct notifier_block *nb)
+{
+ return notifier_chain_unregister(&netdev_chain,nb);
+}
+
+/*
+ * Support routine. Sends outgoing frames to any network
+ * taps currently in use.
+ */
+
+void dev_queue_xmit_nit(struct sk_buff *skb, struct device *dev)
+{
+ struct packet_type *ptype;
+ get_fast_time(&skb->stamp);
+
+ for (ptype = ptype_all; ptype!=NULL; ptype = ptype->next)
+ {
+ /* Never send packets back to the socket
+ * they originated from - MvS (miquels@drinkel.ow.org)
+ */
+ if ((ptype->dev == dev || !ptype->dev) &&
+ ((struct sock *)ptype->data != skb->sk))
+ {
+ struct sk_buff *skb2;
+ if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL)
+ break;
+
+ /* The code following below is wrong.
+
+ The only reason it works is that
+ ONLY packet sockets receive outgoing
+ packets. If such a packet were (occasionally)
+ received by a normal protocol handler, which expects
+ the mac header to be pulled, it would break.
+ */
+
+ /* A more sensible variant: skb->nh should be correctly
+ set by the sender, so that the second statement below is
+ just protection against buggy protocols.
+ */
+ skb2->mac.raw = skb2->data;
+
+ if (skb2->nh.raw < skb2->data || skb2->nh.raw >= skb2->tail) {
+ if (net_ratelimit())
+ printk(KERN_DEBUG "protocol %04x is buggy, dev %s\n", skb2->protocol, dev->name);
+ skb2->nh.raw = skb2->data;
+ if (dev->hard_header)
+ skb2->nh.raw += dev->hard_header_len;
+ }
+
+ skb2->h.raw = skb2->nh.raw;
+ skb2->pkt_type = PACKET_OUTGOING;
+ ptype->func(skb2, skb->dev, ptype);
+ }
+ }
+}
+
+/*
+ * Fast path for loopback frames.
+ */
+
+void dev_loopback_xmit(struct sk_buff *skb)
+{
+ struct sk_buff *newskb=skb_clone(skb, GFP_ATOMIC);
+ if (newskb==NULL)
+ return;
+
+ newskb->mac.raw = newskb->data;
+ skb_pull(newskb, newskb->nh.raw - newskb->data);
+ newskb->pkt_type = PACKET_LOOPBACK;
+ newskb->ip_summed = CHECKSUM_UNNECESSARY;
+ if (newskb->dst==NULL)
+ printk(KERN_DEBUG "BUG: packet without dst looped back 1\n");
+ netif_rx(newskb);
+}
+
+int dev_queue_xmit(struct sk_buff *skb)
+{
+ struct device *dev = skb->dev;
+ struct Qdisc *q;
+
+#ifdef CONFIG_NET_PROFILE
+ start_bh_atomic();
+ NET_PROFILE_ENTER(dev_queue_xmit);
+#endif
+
+ start_bh_atomic();
+ q = dev->qdisc;
+ if (q->enqueue) {
+ q->enqueue(skb, q);
+ qdisc_wakeup(dev);
+ end_bh_atomic();
+
+#ifdef CONFIG_NET_PROFILE
+ NET_PROFILE_LEAVE(dev_queue_xmit);
+ end_bh_atomic();
+#endif
+
+ return 0;
+ }
+
+ /* The device has no queue. Common case for software devices:
+ loopback, all sorts of tunnels...
+
+ Really, it is unlikely that bh protection is necessary here:
+ virtual devices do not generate EOI events.
+ However, it is possible that they rely on the bh protection
+ we provide here.
+ */
+ if (dev->flags&IFF_UP) {
+ if (netdev_nit)
+ dev_queue_xmit_nit(skb,dev);
+ if (dev->hard_start_xmit(skb, dev) == 0) {
+ end_bh_atomic();
+
+#ifdef CONFIG_NET_PROFILE
+ NET_PROFILE_LEAVE(dev_queue_xmit);
+ end_bh_atomic();
+#endif
+
+ return 0;
+ }
+ if (net_ratelimit())
+ printk(KERN_DEBUG "Virtual device %s asks to queue packet!\n", dev->name);
+ }
+ end_bh_atomic();
+
+ kfree_skb(skb);
+
+#ifdef CONFIG_NET_PROFILE
+ NET_PROFILE_LEAVE(dev_queue_xmit);
+ end_bh_atomic();
+#endif
+
+ return 0;
+}
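+
+/*
+ * Illustrative sketch (not part of the original source): what a caller
+ * of dev_queue_xmit() looks like -- allocate, reserve room for the
+ * link-level header, fill, hand over. ETH_P_IP and the broadcast
+ * destination are arbitrary choices for the example.
+ */
+static void example_xmit(struct device *dev, void *data, unsigned len)
+{
+ struct sk_buff *skb = alloc_skb(dev->hard_header_len + len, GFP_ATOMIC);
+
+ if (skb == NULL)
+ return;
+ skb_reserve(skb, dev->hard_header_len);
+ memcpy(skb_put(skb, len), data, len);
+ skb->nh.raw = skb->data;
+ skb->dev = dev;
+ skb->protocol = __constant_htons(ETH_P_IP);
+ if (dev->hard_header)
+ dev->hard_header(skb, dev, ETH_P_IP, dev->broadcast, NULL, len);
+ dev_queue_xmit(skb); /* consumes the skb, even on failure */
+}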
+
+
+/*=======================================================================
+ Receiver routines
+ =======================================================================*/
+
+int netdev_dropping = 0;
+int netdev_max_backlog = 300;
+atomic_t netdev_rx_dropped;
+#ifdef CONFIG_CPU_IS_SLOW
+int net_cpu_congestion;
+#endif
+
+#ifdef CONFIG_NET_HW_FLOWCONTROL
+int netdev_throttle_events;
+static unsigned long netdev_fc_mask = 1;
+unsigned long netdev_fc_xoff = 0;
+
+static struct
+{
+ void (*stimul)(struct device *);
+ struct device *dev;
+} netdev_fc_slots[32];
+
+int netdev_register_fc(struct device *dev, void (*stimul)(struct device *dev))
+{
+ int bit = 0;
+ unsigned long flags;
+
+ save_flags(flags);
+ cli();
+ if (netdev_fc_mask != ~0UL) {
+ bit = ffz(netdev_fc_mask);
+ netdev_fc_slots[bit].stimul = stimul;
+ netdev_fc_slots[bit].dev = dev;
+ set_bit(bit, &netdev_fc_mask);
+ clear_bit(bit, &netdev_fc_xoff);
+ }
+ restore_flags(flags);
+ return bit;
+}
+
+void netdev_unregister_fc(int bit)
+{
+ unsigned long flags;
+
+ save_flags(flags);
+ cli();
+ if (bit > 0) {
+ netdev_fc_slots[bit].stimul = NULL;
+ netdev_fc_slots[bit].dev = NULL;
+ clear_bit(bit, &netdev_fc_mask);
+ clear_bit(bit, &netdev_fc_xoff);
+ }
+ restore_flags(flags);
+}
+
+static void netdev_wakeup(void)
+{
+ unsigned long xoff;
+
+ cli();
+ xoff = netdev_fc_xoff;
+ netdev_fc_xoff = 0;
+ netdev_dropping = 0;
+ netdev_throttle_events++;
+ while (xoff) {
+ int i = ffz(~xoff);
+ xoff &= ~(1<<i);
+ netdev_fc_slots[i].stimul(netdev_fc_slots[i].dev);
+ }
+ sti();
+}
+#endif
+
+static void dev_clear_backlog(struct device *dev)
+{
+ struct sk_buff *prev, *curr;
+
+ /*
+ *
+ * Now let us clear the backlog queue. -AS
+ *
+ * We are competing here both with netif_rx() and net_bh().
+ * We don't want either of those to mess with skb ptrs
+ * while we work on them, thus cli()/sti().
+ *
+ * It looks better to use the net_bh trick, at least
+ * to be sure that we keep interrupt latency really low. --ANK (980727)
+ */
+
+ if (backlog.qlen) {
+ start_bh_atomic();
+ curr = backlog.next;
+ while ( curr != (struct sk_buff *)(&backlog) ) {
+ unsigned long flags;
+ curr=curr->next;
+ if ( curr->prev->dev == dev ) {
+ prev = curr->prev;
+ spin_lock_irqsave(&skb_queue_lock, flags);
+ __skb_unlink(prev, &backlog);
+ spin_unlock_irqrestore(&skb_queue_lock, flags);
+ kfree_skb(prev);
+ }
+ }
+ end_bh_atomic();
+#ifdef CONFIG_NET_HW_FLOWCONTROL
+ if (netdev_dropping)
+ netdev_wakeup();
+#else
+ netdev_dropping = 0;
+#endif
+ }
+}
+
+/*
+ * Receive a packet from a device driver and queue it for the upper
+ * (protocol) levels. It always succeeds.
+ */
+
+void netif_rx(struct sk_buff *skb)
+{
+#ifndef CONFIG_CPU_IS_SLOW
+ if(skb->stamp.tv_sec==0)
+ get_fast_time(&skb->stamp);
+#else
+ skb->stamp = xtime;
+#endif
+
+ /* The code is rearranged so that the path is shortest
+ when the CPU is congested, but still operating.
+ */
+
+ if (backlog.qlen <= netdev_max_backlog) {
+ if (backlog.qlen) {
+ if (netdev_dropping == 0) {
+ skb_queue_tail(&backlog,skb);
+ mark_bh(NET_BH);
+ return;
+ }
+ atomic_inc(&netdev_rx_dropped);
+ kfree_skb(skb);
+ return;
+ }
+#ifdef CONFIG_NET_HW_FLOWCONTROL
+ if (netdev_dropping)
+ netdev_wakeup();
+#else
+ netdev_dropping = 0;
+#endif
+ skb_queue_tail(&backlog,skb);
+ mark_bh(NET_BH);
+ return;
+ }
+ netdev_dropping = 1;
+ atomic_inc(&netdev_rx_dropped);
+ kfree_skb(skb);
+}
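+
+/*
+ * Illustrative sketch (not part of the original source): the receive
+ * path of a typical ethernet driver as seen from netif_rx(); buf holds
+ * the complete frame, MAC header included. Cf. any driver in
+ * drivers/net.
+ */
+static void example_driver_rx(struct device *dev, unsigned char *buf, int len)
+{
+ struct sk_buff *skb = dev_alloc_skb(len + 2);
+
+ if (skb == NULL)
+ return;
+ skb_reserve(skb, 2); /* 16 byte align the IP header */
+ memcpy(skb_put(skb, len), buf, len);
+ skb->dev = dev;
+ skb->protocol = eth_type_trans(skb, dev); /* also sets skb->mac.raw */
+ netif_rx(skb); /* queue to backlog, mark NET_BH */
+}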
+
+#ifdef CONFIG_BRIDGE
+static inline void handle_bridge(struct sk_buff *skb, unsigned short type)
+{
+ if (br_stats.flags & BR_UP && br_protocol_ok(ntohs(type)))
+ {
+ /*
+ * We pass the bridge a complete frame. This means
+ * recovering the MAC header first.
+ */
+
+ int offset;
+
+ skb=skb_clone(skb, GFP_ATOMIC);
+ if(skb==NULL)
+ return;
+
+ offset=skb->data-skb->mac.raw;
+ skb_push(skb,offset); /* Put header back on for bridge */
+
+ if(br_receive_frame(skb))
+ return;
+ kfree_skb(skb);
+ }
+ return;
+}
+#endif
+
+
+/*
+ * When we are called the queue is ready to grab, the interrupts are
+ * on and hardware can interrupt and queue to the receive queue as we
+ * run with no problems.
+ * This is run as a bottom half after an interrupt handler that does
+ * mark_bh(NET_BH);
+ */
+
+void net_bh(void)
+{
+ struct packet_type *ptype;
+ struct packet_type *pt_prev;
+ unsigned short type;
+ unsigned long start_time = jiffies;
+#ifdef CONFIG_CPU_IS_SLOW
+ static unsigned long start_busy = 0;
+ static unsigned long ave_busy = 0;
+
+ if (start_busy == 0)
+ start_busy = start_time;
+ net_cpu_congestion = ave_busy>>8;
+#endif
+
+ NET_PROFILE_ENTER(net_bh);
+ /*
+ * Can we send anything now? We want to clear the
+ * decks for any more sends that get done as we
+ * process the input. This also minimises the
+ * latency on a transmit interrupt bh.
+ */
+
+ if (qdisc_head.forw != &qdisc_head)
+ qdisc_run_queues();
+
+ /*
+ * Is there any data left to process? This may occur because a
+ * mark_bh() is done after we empty the queue, including
+ * by the device, which does a mark_bh() just after we empty it.
+ */
+
+ /*
+ * While the queue is not empty..
+ *
+ * Note that the queue never shrinks due to
+ * an interrupt, so we can do this test without
+ * disabling interrupts.
+ */
+
+ while (!skb_queue_empty(&backlog))
+ {
+ struct sk_buff * skb;
+
+ /* Give chance to other bottom halves to run */
+ if (jiffies - start_time > 1)
+ goto net_bh_break;
+
+ /*
+ * We have a packet. Therefore the queue has shrunk
+ */
+ skb = skb_dequeue(&backlog);
+
+#ifdef CONFIG_CPU_IS_SLOW
+ if (ave_busy > 128*16) {
+ kfree_skb(skb);
+ while ((skb = skb_dequeue(&backlog)) != NULL)
+ kfree_skb(skb);
+ break;
+ }
+#endif
+
+
+#if 0
+ NET_PROFILE_SKB_PASSED(skb, net_bh_skb);
+#endif
+#ifdef CONFIG_NET_FASTROUTE
+ if (skb->pkt_type == PACKET_FASTROUTE) {
+ dev_queue_xmit(skb);
+ continue;
+ }
+#endif
+
+ /*
+ * Bump the pointer to the next structure.
+ *
+ * On entry to the protocol layer, skb->data and
+ * skb->nh.raw point to the MAC and encapsulated data
+ */
+
+ /* XXX until we figure out every place to modify.. */
+ skb->h.raw = skb->nh.raw = skb->data;
+
+ if (skb->mac.raw < skb->head || skb->mac.raw > skb->data) {
+ printk(KERN_CRIT "%s: wrong mac.raw ptr, proto=%04x\n", skb->dev->name, skb->protocol);
+ kfree_skb(skb);
+ continue;
+ }
+
+ /*
+ * Fetch the packet protocol ID.
+ */
+
+ type = skb->protocol;
+
+#ifdef CONFIG_BRIDGE
+ /*
+ * If we are bridging then pass the frame up to the
+ * bridging code (if this protocol is to be bridged).
+ * If it is bridged then move on
+ */
+ handle_bridge(skb, type);
+#endif
+
+ /*
+ * We got a packet ID. Now loop over the "known protocols"
+ * list. There are two lists. The ptype_all list of taps (normally empty)
+ * and the main protocol list which is hashed perfectly for normal protocols.
+ */
+
+ pt_prev = NULL;
+ for (ptype = ptype_all; ptype!=NULL; ptype=ptype->next)
+ {
+ if (!ptype->dev || ptype->dev == skb->dev) {
+ if(pt_prev)
+ {
+ struct sk_buff *skb2=skb_clone(skb, GFP_ATOMIC);
+ if(skb2)
+ pt_prev->func(skb2,skb->dev, pt_prev);
+ }
+ pt_prev=ptype;
+ }
+ }
+
+ for (ptype = ptype_base[ntohs(type)&15]; ptype != NULL; ptype = ptype->next)
+ {
+ if (ptype->type == type && (!ptype->dev || ptype->dev==skb->dev))
+ {
+ /*
+ * We already have a match queued. Deliver
+ * to it and then remember the new match
+ */
+ if(pt_prev)
+ {
+ struct sk_buff *skb2;
+
+ skb2=skb_clone(skb, GFP_ATOMIC);
+
+ /*
+ * Kick the protocol handler. This should be fast
+ * and efficient code.
+ */
+
+ if(skb2)
+ pt_prev->func(skb2, skb->dev, pt_prev);
+ }
+ /* Remember the current last to do */
+ pt_prev=ptype;
+ }
+ } /* End of protocol list loop */
+
+ /*
+ * Is there a last item to send to ?
+ */
+
+ if(pt_prev)
+ pt_prev->func(skb, skb->dev, pt_prev);
+ /*
+ * Has an unknown packet been received?
+ */
+
+ else {
+ kfree_skb(skb);
+ }
+ } /* End of queue loop */
+
+ /*
+ * We have emptied the queue
+ */
+
+ /*
+ * One last output flush.
+ */
+
+ if (qdisc_head.forw != &qdisc_head)
+ qdisc_run_queues();
+
+#ifdef CONFIG_CPU_IS_SLOW
+ if (1) {
+ unsigned long start_idle = jiffies;
+ ave_busy += ((start_idle - start_busy)<<3) - (ave_busy>>4);
+ start_busy = 0;
+ }
+#endif
+#ifdef CONFIG_NET_HW_FLOWCONTROL
+ if (netdev_dropping)
+ netdev_wakeup();
+#else
+ netdev_dropping = 0;
+#endif
+ NET_PROFILE_LEAVE(net_bh);
+ return;
+
+net_bh_break:
+ mark_bh(NET_BH);
+ NET_PROFILE_LEAVE(net_bh);
+ return;
+}
+
+/* Protocol dependent address dumping routines */
+
+static gifconf_func_t * gifconf_list [NPROTO];
+
+int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
+{
+ if (family>=NPROTO)
+ return -EINVAL;
+ gifconf_list[family] = gifconf;
+ return 0;
+}
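+
+/*
+ * Illustrative note (not part of the original source): ipv4 hooks in
+ * from devinet.c with
+ *
+ * register_gifconf(PF_INET, inet_gifconf);
+ *
+ * which is what lets dev_ifconf() below enumerate AF_INET addresses.
+ */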
+
+
+/*
+ * Map an interface index to its name (SIOCGIFNAME)
+ */
+
+/*
+ * This call is useful, but I'd remove it too.
+ *
+ * The reason is purely aesthetic: it is the only call
+ * from the SIOC* family using struct ifreq in a reversed manner.
+ * Besides that, it is pretty silly to put a "drawing" facility
+ * into the kernel; it is useful only for printing ifindices
+ * in readable form, is it not? --ANK
+ *
+ * We need this ioctl for efficient implementation of the
+ * if_indextoname() function required by the IPv6 API. Without
+ * it, we would have to search all the interfaces to find a
+ * match. --pb
+ */
+
+static int dev_ifname(struct ifreq *arg)
+{
+ struct device *dev;
+ struct ifreq ifr;
+ int err;
+
+ /*
+ * Fetch the caller's info block.
+ */
+
+ err = copy_from_user(&ifr, arg, sizeof(struct ifreq));
+ if (err)
+ return -EFAULT;
+
+ dev = dev_get_by_index(ifr.ifr_ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ strcpy(ifr.ifr_name, dev->name);
+
+ err = copy_to_user(arg, &ifr, sizeof(struct ifreq));
+ return (err)?-EFAULT:0;
+}
+
+/*
+ * Perform a SIOCGIFCONF call. This structure will change
+ * size eventually, and there is nothing I can do about it.
+ * Thus we will need a 'compatibility mode'.
+ */
+
+static int dev_ifconf(char *arg)
+{
+ struct ifconf ifc;
+ struct device *dev;
+ char *pos;
+ int len;
+ int total;
+ int i;
+
+ /*
+ * Fetch the caller's info block.
+ */
+
+ if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
+ return -EFAULT;
+
+ pos = ifc.ifc_buf;
+ len = ifc.ifc_len;
+
+ /*
+ * Loop over the interfaces, and write an info block for each.
+ */
+
+ total = 0;
+ for (dev = dev_base; dev != NULL; dev = dev->next) {
+ for (i=0; i<NPROTO; i++) {
+ if (gifconf_list[i]) {
+ int done;
+ if (pos==NULL) {
+ done = gifconf_list[i](dev, NULL, 0);
+ } else {
+ done = gifconf_list[i](dev, pos+total, len-total);
+ }
+ if (done<0)
+ return -EFAULT;
+ total += done;
+ }
+ }
+ }
+
+ /*
+ * All done. Write the updated control block back to the caller.
+ */
+ ifc.ifc_len = total;
+
+ if (copy_to_user(arg, &ifc, sizeof(struct ifconf)))
+ return -EFAULT;
+
+ /*
+ * Both BSD and Solaris return 0 here, so we do too.
+ */
+ return 0;
+}
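+
+/*
+ * Illustrative user-space caller (not part of the original source):
+ *
+ * struct ifconf ifc;
+ * char buf[1024];
+ *
+ * ifc.ifc_len = sizeof(buf);
+ * ifc.ifc_buf = buf;
+ * if (ioctl(fd, SIOCGIFCONF, &ifc) == 0)
+ * ... buf now holds ifc.ifc_len bytes of struct ifreq entries ...
+ */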
+
+/*
+ * This is invoked by the /proc filesystem handler to display a device
+ * in detail.
+ */
+
+#ifdef CONFIG_PROC_FS
+static int sprintf_stats(char *buffer, struct device *dev)
+{
+ struct net_device_stats *stats = (dev->get_stats ? dev->get_stats(dev): NULL);
+ int size;
+
+ if (stats)
+ size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu %8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
+ dev->name,
+ stats->rx_bytes,
+ stats->rx_packets, stats->rx_errors,
+ stats->rx_dropped + stats->rx_missed_errors,
+ stats->rx_fifo_errors,
+ stats->rx_length_errors + stats->rx_over_errors
+ + stats->rx_crc_errors + stats->rx_frame_errors,
+ stats->rx_compressed, stats->multicast,
+ stats->tx_bytes,
+ stats->tx_packets, stats->tx_errors, stats->tx_dropped,
+ stats->tx_fifo_errors, stats->collisions,
+ stats->tx_carrier_errors + stats->tx_aborted_errors
+ + stats->tx_window_errors + stats->tx_heartbeat_errors,
+ stats->tx_compressed);
+ else
+ size = sprintf(buffer, "%6s: No statistics available.\n", dev->name);
+
+ return size;
+}
+
+/*
+ * Called from the PROCfs module. This now uses the new arbitrary sized /proc/net interface
+ * to create /proc/net/dev
+ */
+
+int dev_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+ int len=0;
+ off_t begin=0;
+ off_t pos=0;
+ int size;
+
+ struct device *dev;
+
+
+ size = sprintf(buffer,
+ "Inter-| Receive | Transmit\n"
+ " face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n");
+
+ pos+=size;
+ len+=size;
+
+
+ for (dev = dev_base; dev != NULL; dev = dev->next)
+ {
+ size = sprintf_stats(buffer+len, dev);
+ len+=size;
+ pos=begin+len;
+
+ if(pos<offset)
+ {
+ len=0;
+ begin=pos;
+ }
+ if(pos>offset+length)
+ break;
+ }
+
+ *start=buffer+(offset-begin); /* Start of wanted data */
+ len-=(offset-begin); /* Start slop */
+ if(len>length)
+ len=length; /* Ending slop */
+ return len;
+}
+
+static int dev_proc_stats(char *buffer, char **start, off_t offset,
+ int length, int *eof, void *data)
+{
+ int len;
+
+ len = sprintf(buffer, "%08x %08x %08x %08x %08x\n",
+ atomic_read(&netdev_rx_dropped),
+#ifdef CONFIG_NET_HW_FLOWCONTROL
+ netdev_throttle_events,
+#else
+ 0,
+#endif
+#ifdef CONFIG_NET_FASTROUTE
+ dev_fastroute_stat.hits,
+ dev_fastroute_stat.succeed,
+ dev_fastroute_stat.deferred
+#else
+ 0, 0, 0
+#endif
+ );
+
+ len -= offset;
+
+ if (len > length)
+ len = length;
+ if(len < 0)
+ len = 0;
+
+ *start = buffer + offset;
+ *eof = 1;
+
+ return len;
+}
+
+#endif /* CONFIG_PROC_FS */
+
+
+#ifdef CONFIG_NET_RADIO
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Print one entry of /proc/net/wireless
+ * This is a clone of /proc/net/dev (just above)
+ */
+static int sprintf_wireless_stats(char *buffer, struct device *dev)
+{
+ /* Get stats from the driver */
+ struct iw_statistics *stats = (dev->get_wireless_stats ?
+ dev->get_wireless_stats(dev) :
+ (struct iw_statistics *) NULL);
+ int size;
+
+ if(stats != (struct iw_statistics *) NULL)
+ size = sprintf(buffer,
+ "%6s: %02x %3d%c %3d%c %3d%c %5d %5d %5d\n",
+ dev->name,
+ stats->status,
+ stats->qual.qual,
+ stats->qual.updated & 1 ? '.' : ' ',
+ stats->qual.level,
+ stats->qual.updated & 2 ? '.' : ' ',
+ stats->qual.noise,
+ stats->qual.updated & 3 ? '.' : ' ',
+ stats->discard.nwid,
+ stats->discard.code,
+ stats->discard.misc);
+ else
+ size = 0;
+
+ return size;
+}
+
+/*
+ * Print info for /proc/net/wireless (print all entries)
+ * This is a clone of /proc/net/dev (just above)
+ */
+int dev_get_wireless_info(char * buffer, char **start, off_t offset,
+ int length, int dummy)
+{
+ int len = 0;
+ off_t begin = 0;
+ off_t pos = 0;
+ int size;
+
+ struct device * dev;
+
+ size = sprintf(buffer,
+ "Inter-|sta| Quality | Discarded packets\n"
+ " face |tus|link level noise| nwid crypt misc\n");
+
+ pos+=size;
+ len+=size;
+
+ for(dev = dev_base; dev != NULL; dev = dev->next)
+ {
+ size = sprintf_wireless_stats(buffer+len, dev);
+ len+=size;
+ pos=begin+len;
+
+ if(pos < offset)
+ {
+ len=0;
+ begin=pos;
+ }
+ if(pos > offset + length)
+ break;
+ }
+
+ *start = buffer + (offset - begin); /* Start of wanted data */
+ len -= (offset - begin); /* Start slop */
+ if(len > length)
+ len = length; /* Ending slop */
+
+ return len;
+}
+#endif /* CONFIG_PROC_FS */
+#endif /* CONFIG_NET_RADIO */
+
+void dev_set_promiscuity(struct device *dev, int inc)
+{
+ unsigned short old_flags = dev->flags;
+
+ dev->flags |= IFF_PROMISC;
+ if ((dev->promiscuity += inc) == 0)
+ dev->flags &= ~IFF_PROMISC;
+ if (dev->flags^old_flags) {
+#ifdef CONFIG_NET_FASTROUTE
+ if (dev->flags&IFF_PROMISC) {
+ netdev_fastroute_obstacles++;
+ dev_clear_fastroute(dev);
+ } else
+ netdev_fastroute_obstacles--;
+#endif
+ dev_mc_upload(dev);
+ printk(KERN_INFO "device %s %s promiscuous mode\n",
+ dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "left");
+ }
+}
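+
+/*
+ * Illustrative note (not part of the original source): promiscuity is
+ * a reference count, so overlapping users are safe. A packet socket
+ * does, roughly,
+ *
+ * dev_set_promiscuity(dev, 1); when promiscuous mode is requested
+ * dev_set_promiscuity(dev, -1); when the socket goes away
+ */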
+
+void dev_set_allmulti(struct device *dev, int inc)
+{
+ unsigned short old_flags = dev->flags;
+
+ dev->flags |= IFF_ALLMULTI;
+ if ((dev->allmulti += inc) == 0)
+ dev->flags &= ~IFF_ALLMULTI;
+ if (dev->flags^old_flags)
+ dev_mc_upload(dev);
+}
+
+int dev_change_flags(struct device *dev, unsigned flags)
+{
+ int ret;
+ int old_flags = dev->flags;
+
+ /*
+ * Set the flags on our device.
+ */
+
+ dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_RUNNING|IFF_NOARP|
+ IFF_SLAVE|IFF_MASTER|IFF_DYNAMIC|
+ IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) |
+ (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI));
+
+ /*
+ * Load in the correct multicast list now the flags have changed.
+ */
+
+ dev_mc_upload(dev);
+
+ /*
+ * Have we downed the interface. We handle IFF_UP ourselves
+ * according to user attempts to set it, rather than blindly
+ * setting it.
+ */
+
+ ret = 0;
+ if ((old_flags^flags)&IFF_UP) /* Bit is different ? */
+ {
+ ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
+
+ if (ret == 0)
+ dev_mc_upload(dev);
+ }
+
+ if (dev->flags&IFF_UP &&
+ ((old_flags^dev->flags)&~(IFF_UP|IFF_RUNNING|IFF_PROMISC|IFF_ALLMULTI|IFF_VOLATILE)))
+ notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
+
+ if ((flags^dev->gflags)&IFF_PROMISC) {
+ int inc = (flags&IFF_PROMISC) ? +1 : -1;
+ dev->gflags ^= IFF_PROMISC;
+ dev_set_promiscuity(dev, inc);
+ }
+
+ /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
+ is important. Some (broken) drivers set IFF_PROMISC when
+ IFF_ALLMULTI is requested, without asking us and without reporting.
+ */
+ if ((flags^dev->gflags)&IFF_ALLMULTI) {
+ int inc = (flags&IFF_ALLMULTI) ? +1 : -1;
+ dev->gflags ^= IFF_ALLMULTI;
+ dev_set_allmulti(dev, inc);
+ }
+
+ return ret;
+}
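+
+/*
+ * Illustrative user-space sequence (not part of the original source):
+ * how ifconfig brings an interface up through SIOCSIFFLAGS, which ends
+ * up in dev_change_flags() above:
+ *
+ * struct ifreq ifr;
+ *
+ * strcpy(ifr.ifr_name, "eth0");
+ * ioctl(fd, SIOCGIFFLAGS, &ifr);
+ * ifr.ifr_flags |= IFF_UP;
+ * ioctl(fd, SIOCSIFFLAGS, &ifr);
+ */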
+
+/*
+ * Perform the SIOCxIFxxx calls.
+ */
+
+static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
+{
+ struct device *dev;
+ int err;
+
+ if ((dev = dev_get(ifr->ifr_name)) == NULL)
+ return -ENODEV;
+
+ switch(cmd)
+ {
+ case SIOCGIFFLAGS: /* Get interface flags */
+ ifr->ifr_flags = (dev->flags&~(IFF_PROMISC|IFF_ALLMULTI))
+ |(dev->gflags&(IFF_PROMISC|IFF_ALLMULTI));
+ return 0;
+
+ case SIOCSIFFLAGS: /* Set interface flags */
+ return dev_change_flags(dev, ifr->ifr_flags);
+
+ case SIOCGIFMETRIC: /* Get the metric on the interface (currently unused) */
+ ifr->ifr_metric = 0;
+ return 0;
+
+ case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */
+ return -EOPNOTSUPP;
+
+ case SIOCGIFMTU: /* Get the MTU of a device */
+ ifr->ifr_mtu = dev->mtu;
+ return 0;
+
+ case SIOCSIFMTU: /* Set the MTU of a device */
+ if (ifr->ifr_mtu == dev->mtu)
+ return 0;
+
+ /*
+ * MTU must be positive.
+ */
+
+ if (ifr->ifr_mtu<=0)
+ return -EINVAL;
+
+ if (dev->change_mtu)
+ err = dev->change_mtu(dev, ifr->ifr_mtu);
+ else {
+ dev->mtu = ifr->ifr_mtu;
+ err = 0;
+ }
+ if (!err && dev->flags&IFF_UP)
+ notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev);
+ return err;
+
+ case SIOCGIFHWADDR:
+ memcpy(ifr->ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN);
+ ifr->ifr_hwaddr.sa_family=dev->type;
+ return 0;
+
+ case SIOCSIFHWADDR:
+ if(dev->set_mac_address==NULL)
+ return -EOPNOTSUPP;
+ if(ifr->ifr_hwaddr.sa_family!=dev->type)
+ return -EINVAL;
+ err=dev->set_mac_address(dev,&ifr->ifr_hwaddr);
+ if (!err)
+ notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
+ return err;
+
+ case SIOCSIFHWBROADCAST:
+ if(ifr->ifr_hwaddr.sa_family!=dev->type)
+ return -EINVAL;
+ memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, MAX_ADDR_LEN);
+ notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
+ return 0;
+
+ case SIOCGIFMAP:
+ ifr->ifr_map.mem_start=dev->mem_start;
+ ifr->ifr_map.mem_end=dev->mem_end;
+ ifr->ifr_map.base_addr=dev->base_addr;
+ ifr->ifr_map.irq=dev->irq;
+ ifr->ifr_map.dma=dev->dma;
+ ifr->ifr_map.port=dev->if_port;
+ return 0;
+
+ case SIOCSIFMAP:
+ if (dev->set_config)
+ return dev->set_config(dev,&ifr->ifr_map);
+ return -EOPNOTSUPP;
+
+ case SIOCADDMULTI:
+ if(dev->set_multicast_list==NULL ||
+ ifr->ifr_hwaddr.sa_family!=AF_UNSPEC)
+ return -EINVAL;
+ dev_mc_add(dev,ifr->ifr_hwaddr.sa_data, dev->addr_len, 1);
+ return 0;
+
+ case SIOCDELMULTI:
+ if(dev->set_multicast_list==NULL ||
+ ifr->ifr_hwaddr.sa_family!=AF_UNSPEC)
+ return -EINVAL;
+ dev_mc_delete(dev,ifr->ifr_hwaddr.sa_data,dev->addr_len, 1);
+ return 0;
+
+ case SIOCGIFINDEX:
+ ifr->ifr_ifindex = dev->ifindex;
+ return 0;
+
+ case SIOCGIFTXQLEN:
+ ifr->ifr_qlen = dev->tx_queue_len;
+ return 0;
+
+ case SIOCSIFTXQLEN:
+ if(ifr->ifr_qlen<0)
+ return -EINVAL;
+ dev->tx_queue_len = ifr->ifr_qlen;
+ return 0;
+
+ case SIOCSIFNAME:
+ if (dev->flags&IFF_UP)
+ return -EBUSY;
+ if (dev_get(ifr->ifr_newname))
+ return -EEXIST;
+ memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ);
+ dev->name[IFNAMSIZ-1] = 0;
+ notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
+ return 0;
+
+ /*
+ * Unknown or private ioctl
+ */
+
+ default:
+ if(cmd >= SIOCDEVPRIVATE &&
+ cmd <= SIOCDEVPRIVATE + 15) {
+ if (dev->do_ioctl)
+ return dev->do_ioctl(dev, ifr, cmd);
+ return -EOPNOTSUPP;
+ }
+
+#ifdef CONFIG_NET_RADIO
+ if(cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
+ if (dev->do_ioctl)
+ return dev->do_ioctl(dev, ifr, cmd);
+ return -EOPNOTSUPP;
+ }
+#endif /* CONFIG_NET_RADIO */
+
+ }
+ return -EINVAL;
+}
+
+
+/*
+ * This function handles all "interface"-type I/O control requests. The actual
+ * 'doing' part of this is dev_ifsioc above.
+ */
+
+int dev_ioctl(unsigned int cmd, void *arg)
+{
+ struct ifreq ifr;
+ int ret;
+ char *colon;
+
+ /* One special case: SIOCGIFCONF takes an ifconf argument
+ and requires a shared lock, because it sleeps while writing
+ to user space.
+ */
+
+ if (cmd == SIOCGIFCONF) {
+ rtnl_shlock();
+ ret = dev_ifconf((char *) arg);
+ rtnl_shunlock();
+ return ret;
+ }
+ if (cmd == SIOCGIFNAME) {
+ return dev_ifname((struct ifreq *)arg);
+ }
+
+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+ return -EFAULT;
+
+ ifr.ifr_name[IFNAMSIZ-1] = 0;
+
+ colon = strchr(ifr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+
+ /*
+ * See which interface the caller is talking about.
+ */
+
+ switch(cmd)
+ {
+ /*
+ * These ioctl calls:
+ * - can be done by all.
+ * - atomic and do not require locking.
+ * - return a value
+ */
+
+ case SIOCGIFFLAGS:
+ case SIOCGIFMETRIC:
+ case SIOCGIFMTU:
+ case SIOCGIFHWADDR:
+ case SIOCGIFSLAVE:
+ case SIOCGIFMAP:
+ case SIOCGIFINDEX:
+ case SIOCGIFTXQLEN:
+ dev_load(ifr.ifr_name);
+ ret = dev_ifsioc(&ifr, cmd);
+ if (!ret) {
+ if (colon)
+ *colon = ':';
+ if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
+ return -EFAULT;
+ }
+ return ret;
+
+ /*
+ * These ioctl calls:
+ * - require superuser power.
+ * - require strict serialization.
+ * - do not return a value
+ */
+
+ case SIOCSIFFLAGS:
+ case SIOCSIFMETRIC:
+ case SIOCSIFMTU:
+ case SIOCSIFMAP:
+ case SIOCSIFHWADDR:
+ case SIOCSIFSLAVE:
+ case SIOCADDMULTI:
+ case SIOCDELMULTI:
+ case SIOCSIFHWBROADCAST:
+ case SIOCSIFTXQLEN:
+ case SIOCSIFNAME:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ dev_load(ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(&ifr, cmd);
+ rtnl_unlock();
+ return ret;
+
+ case SIOCGIFMEM:
+ /* Get the per device memory space. We can add this but currently
+ do not support it */
+ case SIOCSIFMEM:
+ /* Set the per device memory buffer space. Not applicable in our case */
+ case SIOCSIFLINK:
+ return -EINVAL;
+
+ /*
+ * Unknown or private ioctl.
+ */
+
+ default:
+ if (cmd >= SIOCDEVPRIVATE &&
+ cmd <= SIOCDEVPRIVATE + 15) {
+ dev_load(ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(&ifr, cmd);
+ rtnl_unlock();
+ if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq)))
+ return -EFAULT;
+ return ret;
+ }
+#ifdef CONFIG_NET_RADIO
+ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
+ dev_load(ifr.ifr_name);
+ if (IW_IS_SET(cmd)) {
+ if (!suser())
+ return -EPERM;
+ rtnl_lock();
+ }
+ ret = dev_ifsioc(&ifr, cmd);
+ if (IW_IS_SET(cmd))
+ rtnl_unlock();
+ if (!ret && IW_IS_GET(cmd) &&
+ copy_to_user(arg, &ifr, sizeof(struct ifreq)))
+ return -EFAULT;
+ return ret;
+ }
+#endif /* CONFIG_NET_RADIO */
+ return -EINVAL;
+ }
+}
+
+int dev_new_index(void)
+{
+ static int ifindex;
+ for (;;) {
+ if (++ifindex <= 0)
+ ifindex=1;
+ if (dev_get_by_index(ifindex) == NULL)
+ return ifindex;
+ }
+}
+
+static int dev_boot_phase = 1;
+
+
+int register_netdevice(struct device *dev)
+{
+ struct device *d, **dp;
+
+ if (dev_boot_phase) {
+ /* This is NOT a bug, but I am not sure that all the
+ devices initialized before the netdev module is started
+ are sane.
+
+ For now they are chained onto the device boot list
+ and probed later. If a module is initialized
+ before netdev, but assumes that dev->init
+ is really called by register_netdev(), it will fail.
+
+ So this message should be printed for a while.
+ */
+ printk(KERN_INFO "early initialization of device %s is deferred\n", dev->name);
+
+ /* Check for existence, and append to tail of chain */
+ for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) {
+ if (d == dev || strcmp(d->name, dev->name) == 0)
+ return -EEXIST;
+ }
+ dev->next = NULL;
+ *dp = dev;
+ return 0;
+ }
+
+ dev->iflink = -1;
+
+ /* Init, if this function is available */
+ if (dev->init && dev->init(dev) != 0)
+ return -EIO;
+
+ /* Check for existence, and append to tail of chain */
+ for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) {
+ if (d == dev || strcmp(d->name, dev->name) == 0)
+ return -EEXIST;
+ }
+ dev->next = NULL;
+ dev_init_scheduler(dev);
+ dev->ifindex = dev_new_index();
+ if (dev->iflink == -1)
+ dev->iflink = dev->ifindex;
+ *dp = dev;
+
+ /* Notify protocols that a new device has appeared. */
+ notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
+
+ return 0;
+}
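+
+/*
+ * Illustrative sketch (not part of the original source): a minimal
+ * driver-side caller; example_probe and "exmpl0" are hypothetical.
+ * Real drivers normally go through register_netdev(), which wraps
+ * this call in the rtnl lock.
+ */
+static int example_probe(struct device *dev)
+{
+ /* Detect hardware; fill in dev->open, dev->stop,
+ dev->hard_start_xmit and friends here. */
+ return 0;
+}
+
+static int example_register(struct device *dev)
+{
+ dev->name = "exmpl0"; /* or pick one with dev_alloc_name() */
+ dev->init = example_probe; /* called from register_netdevice() */
+ return register_netdevice(dev);
+}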
+
+int unregister_netdevice(struct device *dev)
+{
+ struct device *d, **dp;
+
+ if (dev_boot_phase == 0) {
+ /* If the device is running, close it.
+ It is a very bad idea; really, we should
+ complain loudly here, but random hackery
+ in linux/drivers/net likes it.
+ */
+ if (dev->flags & IFF_UP)
+ dev_close(dev);
+
+#ifdef CONFIG_NET_FASTROUTE
+ dev_clear_fastroute(dev);
+#endif
+
+ /* Shutdown queueing discipline. */
+ dev_shutdown(dev);
+
+ /* Notify protocols that we are about to destroy
+ this device. They should clean up all their state.
+ */
+ notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
+
+ /*
+ * Flush the multicast chain
+ */
+ dev_mc_discard(dev);
+
+ /* To avoid pointers pointing to nowhere,
+ we wait for the end of the critical section */
+ dev_lock_wait();
+ }
+
+ /* And unlink it from device chain. */
+ for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) {
+ if (d == dev) {
+ *dp = d->next;
+ synchronize_bh();
+ d->next = NULL;
+
+ if (dev->destructor)
+ dev->destructor(dev);
+ return 0;
+ }
+ }
+ return -ENODEV;
+}
+
+
+/*
+ * Initialize the DEV module. At boot time this walks the device list and
+ * unhooks any devices that fail to initialise (normally hardware not
+ * present) and leaves us with a valid list of present and active devices.
+ *
+ */
+extern int lance_init(void);
+extern int bpq_init(void);
+extern int scc_init(void);
+extern void sdla_setup(void);
+extern void dlci_setup(void);
+extern int dmascc_init(void);
+extern int sm_init(void);
+
+extern int baycom_ser_fdx_init(void);
+extern int baycom_ser_hdx_init(void);
+extern int baycom_par_init(void);
+
+extern int lapbeth_init(void);
+extern void arcnet_init(void);
+extern void ip_auto_config(void);
+#ifdef CONFIG_8xx
+extern int cpm_enet_init(void);
+#endif /* CONFIG_8xx */
+
+#ifdef CONFIG_PROC_FS
+static struct proc_dir_entry proc_net_dev = {
+ PROC_NET_DEV, 3, "dev",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ dev_get_info
+};
+#endif
+
+#ifdef CONFIG_NET_RADIO
+#ifdef CONFIG_PROC_FS
+static struct proc_dir_entry proc_net_wireless = {
+ PROC_NET_WIRELESS, 8, "wireless",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ dev_get_wireless_info
+};
+#endif /* CONFIG_PROC_FS */
+#endif /* CONFIG_NET_RADIO */
+
+__initfunc(int net_dev_init(void))
+{
+ struct device *dev, **dp;
+
+#ifdef CONFIG_NET_SCHED
+ pktsched_init();
+#endif
+
+ /*
+ * Initialise the packet receive queue.
+ */
+
+ skb_queue_head_init(&backlog);
+
+ /*
+ * The bridge has to be up before the devices
+ */
+
+#ifdef CONFIG_BRIDGE
+ br_init();
+#endif
+
+ /*
+ * This is Very Ugly(tm).
+ *
+ * Some devices want to be initialized early..
+ */
+
+#if defined(CONFIG_SCC)
+ scc_init();
+#endif
+#if defined(CONFIG_DMASCC)
+ dmascc_init();
+#endif
+#if defined(CONFIG_BPQETHER)
+ bpq_init();
+#endif
+#if defined(CONFIG_DLCI)
+ dlci_setup();
+#endif
+#if defined(CONFIG_SDLA)
+ sdla_setup();
+#endif
+#if defined(CONFIG_BAYCOM_PAR)
+ baycom_par_init();
+#endif
+#if defined(CONFIG_BAYCOM_SER_FDX)
+ baycom_ser_fdx_init();
+#endif
+#if defined(CONFIG_BAYCOM_SER_HDX)
+ baycom_ser_hdx_init();
+#endif
+#if defined(CONFIG_SOUNDMODEM)
+ sm_init();
+#endif
+#if defined(CONFIG_LAPBETHER)
+ lapbeth_init();
+#endif
+#if defined(CONFIG_PLIP)
+ plip_init();
+#endif
+#if defined(CONFIG_ARCNET)
+ arcnet_init();
+#endif
+#if defined(CONFIG_8xx)
+ cpm_enet_init();
+#endif
+ /*
+ * SLHC if present needs attaching so other people see it
+ * even if not opened.
+ */
+
+#ifdef CONFIG_INET
+#if (defined(CONFIG_SLIP) && defined(CONFIG_SLIP_COMPRESSED)) \
+ || defined(CONFIG_PPP) \
+ || (defined(CONFIG_ISDN) && defined(CONFIG_ISDN_PPP))
+ slhc_install();
+#endif
+#endif
+
+#ifdef CONFIG_NET_PROFILE
+ net_profile_init();
+ NET_PROFILE_REGISTER(dev_queue_xmit);
+ NET_PROFILE_REGISTER(net_bh);
+#if 0
+ NET_PROFILE_REGISTER(net_bh_skb);
+#endif
+#endif
+ /*
+ * Add the devices.
+ * If the call to dev->init fails, the dev is removed
+ * from the chain disconnecting the device until the
+ * next reboot.
+ */
+
+ dp = &dev_base;
+ while ((dev = *dp) != NULL)
+ {
+ dev->iflink = -1;
+ if (dev->init && dev->init(dev))
+ {
+ /*
+ * It failed to come up. Unhook it.
+ */
+ *dp = dev->next;
+ synchronize_bh();
+ }
+ else
+ {
+ dp = &dev->next;
+ dev->ifindex = dev_new_index();
+ if (dev->iflink == -1)
+ dev->iflink = dev->ifindex;
+ dev_init_scheduler(dev);
+ }
+ }
+
+#ifdef CONFIG_PROC_FS
+ proc_net_register(&proc_net_dev);
+ {
+ struct proc_dir_entry *ent = create_proc_entry("net/dev_stat", 0, 0);
+ ent->read_proc = dev_proc_stats;
+ }
+#endif
+
+#ifdef CONFIG_NET_RADIO
+#ifdef CONFIG_PROC_FS
+ proc_net_register(&proc_net_wireless);
+#endif /* CONFIG_PROC_FS */
+#endif /* CONFIG_NET_RADIO */
+
+ init_bh(NET_BH, net_bh);
+
+ dev_boot_phase = 0;
+
+ dev_mcast_init();
+
+#ifdef CONFIG_IP_PNP
+ ip_auto_config();
+#endif
+
+ return 0;
+}
diff --git a/pfinet/linux-src/net/core/dev_mcast.c b/pfinet/linux-src/net/core/dev_mcast.c
new file mode 100644
index 00000000..bce3f4a4
--- /dev/null
+++ b/pfinet/linux-src/net/core/dev_mcast.c
@@ -0,0 +1,252 @@
+/*
+ * Linux NET3: Multicast List maintenance.
+ *
+ * Authors:
+ * Tim Kordas <tjk@nostromo.eeap.cwru.edu>
+ * Richard Underwood <richard@wuzz.demon.co.uk>
+ *
+ * Stir fried together from the IP multicast and CAP patches above
+ * Alan Cox <Alan.Cox@linux.org>
+ *
+ * Fixes:
+ * Alan Cox : Update the device on a real delete
+ * rather than any time but...
+ * Alan Cox : IFF_ALLMULTI support.
+ * Alan Cox : New format set_multicast_list() calls.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+
+
+/*
+ * Device multicast list maintenance.
+ *
+ * This is used both by IP and by the user level maintenance functions.
+ * Unlike BSD we maintain a usage count on a given multicast address so
+ * that a casual user application can add/delete multicasts used by
+ * protocols without doing damage to the protocols when it deletes the
+ * entries. It also helps IP as it tracks overlapping maps.
+ *
+ * Device mc lists are changed by bh at least if IPv6 is enabled,
+ * so they must be bh protected.
+ */
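A small userspace model of the usage-count scheme just described: every address carries a reference count, so an add/delete pair from a casual user never removes an entry a protocol still needs. The structures below are hypothetical, not the kernel's dev_mc_list:

	#include <stdlib.h>
	#include <string.h>

	struct mc_entry {
		unsigned char addr[6];		/* link-level address */
		int users;			/* reference count */
		struct mc_entry *next;
	};

	/* Take a reference; allocate only when the address is new. */
	static int mc_add(struct mc_entry **list, const unsigned char *addr)
	{
		struct mc_entry *e;

		for (e = *list; e; e = e->next) {
			if (memcmp(e->addr, addr, 6) == 0) {
				e->users++;
				return 0;
			}
		}
		e = malloc(sizeof(*e));
		if (e == NULL)
			return -1;
		memcpy(e->addr, addr, 6);
		e->users = 1;
		e->next = *list;
		*list = e;
		return 0;
	}

	/* Drop a reference; free the entry only when the last user goes. */
	static void mc_delete(struct mc_entry **list, const unsigned char *addr)
	{
		struct mc_entry *e, **ep;

		for (ep = list; (e = *ep) != NULL; ep = &e->next) {
			if (memcmp(e->addr, addr, 6) == 0) {
				if (--e->users == 0) {
					*ep = e->next;
					free(e);
				}
				return;
			}
		}
	}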
+
+/*
+ * Update the multicast list into the physical NIC controller.
+ */
+
+void dev_mc_upload(struct device *dev)
+{
+ /* Don't do anything until the interface is up
+ [dev_open will call this function, so the list will
+ stay sane] */
+
+ if(!(dev->flags&IFF_UP))
+ return;
+
+ /*
+ * Devices with no set multicast don't get set
+ */
+
+ if(dev->set_multicast_list==NULL)
+ return;
+
+ start_bh_atomic();
+ dev->set_multicast_list(dev);
+ end_bh_atomic();
+}
+
+/*
+ * Delete a device level multicast
+ */
+
+int dev_mc_delete(struct device *dev, void *addr, int alen, int glbl)
+{
+ int err = 0;
+ struct dev_mc_list *dmi, **dmip;
+
+ start_bh_atomic();
+ for (dmip=&dev->mc_list; (dmi=*dmip)!=NULL; dmip=&dmi->next) {
+ /*
+ * Find the entry we want to delete. The device could
+ * have variable length entries so check these too.
+ */
+ if (memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && alen==dmi->dmi_addrlen) {
+ if (glbl) {
+ int old_glbl = dmi->dmi_gusers;
+ dmi->dmi_gusers = 0;
+ if (old_glbl == 0)
+ break;
+ }
+ if(--dmi->dmi_users)
+ goto done;
+
+ /*
+ * Last user. So delete the entry.
+ */
+ *dmip = dmi->next;
+ dev->mc_count--;
+ kfree_s(dmi,sizeof(*dmi));
+ /*
+ * We have altered the list, so the card
+ * loaded filter is now wrong. Fix it
+ */
+ end_bh_atomic();
+ dev_mc_upload(dev);
+ return 0;
+ }
+ }
+ err = -ENOENT;
+done:
+ end_bh_atomic();
+ return err;
+}
+
+/*
+ * Add a device level multicast
+ */
+
+int dev_mc_add(struct device *dev, void *addr, int alen, int glbl)
+{
+ int err = 0;
+ struct dev_mc_list *dmi, *dmi1;
+
+ dmi1 = (struct dev_mc_list *)kmalloc(sizeof(*dmi), gfp_any());
+
+ start_bh_atomic();
+ for(dmi=dev->mc_list; dmi!=NULL; dmi=dmi->next) {
+ if (memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && dmi->dmi_addrlen==alen) {
+ if (glbl) {
+ int old_glbl = dmi->dmi_gusers;
+ dmi->dmi_gusers = 1;
+ if (old_glbl)
+ goto done;
+ }
+ dmi->dmi_users++;
+ goto done;
+ }
+ }
+
+ if ((dmi=dmi1)==NULL) {
+ /* don't leave bh disabled on the error path */
+ end_bh_atomic();
+ return -ENOMEM;
+ }
+ memcpy(dmi->dmi_addr, addr, alen);
+ dmi->dmi_addrlen=alen;
+ dmi->next=dev->mc_list;
+ dmi->dmi_users=1;
+ dmi->dmi_gusers=glbl ? 1 : 0;
+ dev->mc_list=dmi;
+ dev->mc_count++;
+ end_bh_atomic();
+ dev_mc_upload(dev);
+ return 0;
+
+done:
+ end_bh_atomic();
+ if (dmi1)
+ kfree(dmi1);
+ return err;
+}
+
+/*
+ * Discard multicast list when a device is downed
+ */
+
+void dev_mc_discard(struct device *dev)
+{
+ start_bh_atomic();
+ while (dev->mc_list!=NULL) {
+ struct dev_mc_list *tmp=dev->mc_list;
+ dev->mc_list=tmp->next;
+ if (tmp->dmi_users > tmp->dmi_gusers)
+ printk("dev_mc_discard: multicast leakage! dmi_users=%d\n", tmp->dmi_users);
+ kfree_s(tmp,sizeof(*tmp));
+ }
+ dev->mc_count=0;
+ end_bh_atomic();
+}
+
+#ifdef CONFIG_PROC_FS
+static int dev_mc_read_proc(char *buffer, char **start, off_t offset,
+ int length, int *eof, void *data)
+{
+ off_t pos=0, begin=0;
+ struct dev_mc_list *m;
+ int len=0;
+ struct device *dev;
+
+ start_bh_atomic();
+
+ for (dev = dev_base; dev; dev = dev->next) {
+ for (m = dev->mc_list; m; m = m->next) {
+ int i;
+
+ len += sprintf(buffer+len,"%-4d %-15s %-5d %-5d ", dev->ifindex, dev->name,
+ m->dmi_users, m->dmi_gusers);
+
+ for (i=0; i<m->dmi_addrlen; i++)
+ len += sprintf(buffer+len, "%02x", m->dmi_addr[i]);
+
+ len+=sprintf(buffer+len, "\n");
+
+ pos=begin+len;
+ if (pos < offset) {
+ len=0;
+ begin=pos;
+ }
+ if (pos > offset+length)
+ goto done;
+ }
+ }
+ *eof = 1;
+
+done:
+ end_bh_atomic();
+ *start=buffer+(offset-begin);
+ len-=(offset-begin);
+ if(len>length)
+ len=length;
+ if(len<0)
+ len=0;
+ return len;
+}
+#endif
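The offset/begin arithmetic in dev_mc_read_proc above is the standard 2.2-era read_proc windowing: generate the text linearly, discard whatever lies before the requested offset, and stop once the window is filled. A userspace sketch of the same windowing (the function shape and item list are illustrative only):

	#include <stdio.h>

	/* Model of read_proc windowing: produce the whole listing
	   linearly, return only the slice [offset, offset+length). */
	static int window_read(char *buffer, char **start, long offset,
			       int length, const char **items, int n)
	{
		long pos = 0, begin = 0;
		int len = 0, i;

		for (i = 0; i < n; i++) {
			len += sprintf(buffer + len, "%s\n", items[i]);
			pos = begin + len;
			if (pos < offset) {	/* before the window: discard */
				len = 0;
				begin = pos;
			}
			if (pos > offset + length)
				break;		/* window filled */
		}
		*start = buffer + (offset - begin);
		len -= (offset - begin);
		if (len > length)
			len = length;
		if (len < 0)
			len = 0;
		return len;
	}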
+
+__initfunc(void dev_mcast_init(void))
+{
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *ent;
+
+ ent = create_proc_entry("net/dev_mcast", 0, 0);
+ ent->read_proc = dev_mc_read_proc;
+#endif
+}
+
diff --git a/pfinet/linux-src/net/core/dst.c b/pfinet/linux-src/net/core/dst.c
new file mode 100644
index 00000000..9007dde6
--- /dev/null
+++ b/pfinet/linux-src/net/core/dst.c
@@ -0,0 +1,145 @@
+/*
+ * net/dst.c Protocol independent destination cache.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <asm/segment.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+
+#include <net/dst.h>
+
+struct dst_entry * dst_garbage_list;
+atomic_t dst_total = ATOMIC_INIT(0);
+
+static unsigned long dst_gc_timer_expires;
+static unsigned long dst_gc_timer_inc = DST_GC_MAX;
+static void dst_run_gc(unsigned long);
+
+static struct timer_list dst_gc_timer =
+ { NULL, NULL, DST_GC_MIN, 0L, dst_run_gc };
+
+#if RT_CACHE_DEBUG >= 2
+atomic_t hh_count;
+#endif
+
+static void dst_run_gc(unsigned long dummy)
+{
+ int delayed = 0;
+ struct dst_entry * dst, **dstp;
+
+ del_timer(&dst_gc_timer);
+ dstp = &dst_garbage_list;
+ while ((dst = *dstp) != NULL) {
+ if (atomic_read(&dst->use)) {
+ dstp = &dst->next;
+ delayed++;
+ continue;
+ }
+ *dstp = dst->next;
+ dst_destroy(dst);
+ }
+ if (!dst_garbage_list) {
+ dst_gc_timer_inc = DST_GC_MAX;
+ return;
+ }
+ if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX)
+ dst_gc_timer_expires = DST_GC_MAX;
+ dst_gc_timer_inc += DST_GC_INC;
+ dst_gc_timer.expires = jiffies + dst_gc_timer_expires;
+#if RT_CACHE_DEBUG >= 2
+ printk("dst_total: %d/%d %ld\n",
+ atomic_read(&dst_total), delayed, dst_gc_timer_expires);
+#endif
+ add_timer(&dst_gc_timer);
+}
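The timer arithmetic above implements a ramping backoff: __dst_free arms the timer at DST_GC_MIN, and every pass that still finds referenced entries both lengthens the next interval and grows the increment itself, so the period rises quickly until it saturates at DST_GC_MAX. A standalone model of the schedule (HZ and the DST_GC_* values are stand-ins, not necessarily this tree's):

	#include <stdio.h>

	#define HZ		100
	#define DST_GC_MIN	(HZ/10)		/* stand-in values */
	#define DST_GC_INC	(HZ/2)
	#define DST_GC_MAX	(120*HZ)

	int main(void)
	{
		unsigned long expires = DST_GC_MIN, inc = DST_GC_INC;
		int pass;

		for (pass = 0; pass < 8; pass++) {
			printf("pass %d: next gc in %lu jiffies\n", pass, expires);
			if ((expires += inc) > DST_GC_MAX)	/* as in dst_run_gc */
				expires = DST_GC_MAX;
			inc += DST_GC_INC;	/* increment itself grows */
		}
		return 0;
	}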
+
+static int dst_discard(struct sk_buff *skb)
+{
+ kfree_skb(skb);
+ return 0;
+}
+
+static int dst_blackhole(struct sk_buff *skb)
+{
+ kfree_skb(skb);
+ return 0;
+}
+
+void * dst_alloc(int size, struct dst_ops * ops)
+{
+ struct dst_entry * dst;
+
+ if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {
+ if (ops->gc())
+ return NULL;
+ }
+ dst = kmalloc(size, GFP_ATOMIC);
+ if (!dst)
+ return NULL;
+ memset(dst, 0, size);
+ dst->ops = ops;
+ atomic_set(&dst->refcnt, 0);
+ dst->lastuse = jiffies;
+ dst->input = dst_discard;
+ dst->output = dst_blackhole;
+ atomic_inc(&dst_total);
+ atomic_inc(&ops->entries);
+ return dst;
+}
+
+void __dst_free(struct dst_entry * dst)
+{
+ start_bh_atomic();
+ /* The first case (dev==NULL) is required when a
+ protocol module is unloaded.
+ */
+ if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
+ dst->input = dst_discard;
+ dst->output = dst_blackhole;
+ dst->dev = &loopback_dev;
+ }
+ dst->obsolete = 2;
+ dst->next = dst_garbage_list;
+ dst_garbage_list = dst;
+ if (dst_gc_timer_inc > DST_GC_INC) {
+ del_timer(&dst_gc_timer);
+ dst_gc_timer_inc = DST_GC_INC;
+ dst_gc_timer_expires = DST_GC_MIN;
+ dst_gc_timer.expires = jiffies + dst_gc_timer_expires;
+ add_timer(&dst_gc_timer);
+ }
+ end_bh_atomic();
+}
+
+void dst_destroy(struct dst_entry * dst)
+{
+ struct neighbour *neigh = dst->neighbour;
+ struct hh_cache *hh = dst->hh;
+
+ dst->hh = NULL;
+ if (hh && atomic_dec_and_test(&hh->hh_refcnt))
+ kfree(hh);
+
+ if (neigh) {
+ dst->neighbour = NULL;
+ neigh_release(neigh);
+ }
+
+ atomic_dec(&dst->ops->entries);
+
+ if (dst->ops->destroy)
+ dst->ops->destroy(dst);
+ atomic_dec(&dst_total);
+ kfree(dst);
+}
diff --git a/pfinet/linux-src/net/core/filter.c b/pfinet/linux-src/net/core/filter.c
new file mode 100644
index 00000000..8e1ffb62
--- /dev/null
+++ b/pfinet/linux-src/net/core/filter.c
@@ -0,0 +1,454 @@
+/*
+ * Linux Socket Filter - Kernel level socket filtering
+ *
+ * Author:
+ * Jay Schulist <Jay.Schulist@spacs.k12.wi.us>
+ *
+ * Based on the design of:
+ * - The Berkeley Packet Filter
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Andi Kleen - Fix a few bad bugs and races.
+ */
+
+#include <linux/config.h>
+#if defined(CONFIG_FILTER)
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_packet.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/filter.h>
+
+/* No hurry in this branch */
+
+static u8 *load_pointer(struct sk_buff *skb, int k)
+{
+ u8 *ptr = NULL;
+
+ if (k>=SKF_NET_OFF)
+ ptr = skb->nh.raw + k - SKF_NET_OFF;
+ else if (k>=SKF_LL_OFF)
+ ptr = skb->mac.raw + k - SKF_LL_OFF;
+
+ if (ptr >= skb->head && ptr < skb->tail)
+ return ptr;
+ return NULL;
+}
+
+/*
+ * Decode and apply filter instructions to the skb->data.
+ * Return length to keep, 0 for none. skb is the data we are
+ * filtering, filter is the array of filter instructions, and
+ * len is the number of filter blocks in the array.
+ */
+
+int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
+{
+ unsigned char *data = skb->data;
+ /* len is UNSIGNED. Byte wide insns rely only on implicit
+ type casts to prevent reading arbitrary memory locations.
+ */
+ unsigned int len = skb->len;
+ struct sock_filter *fentry; /* We walk down these */
+ u32 A = 0; /* Accumulator */
+ u32 X = 0; /* Index Register */
+ u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
+ int k;
+ int pc;
+
+ /*
+ * Process array of filter instructions.
+ */
+
+ for(pc = 0; pc < flen; pc++)
+ {
+ fentry = &filter[pc];
+
+ switch(fentry->code)
+ {
+ case BPF_ALU|BPF_ADD|BPF_X:
+ A += X;
+ continue;
+
+ case BPF_ALU|BPF_ADD|BPF_K:
+ A += fentry->k;
+ continue;
+
+ case BPF_ALU|BPF_SUB|BPF_X:
+ A -= X;
+ continue;
+
+ case BPF_ALU|BPF_SUB|BPF_K:
+ A -= fentry->k;
+ continue;
+
+ case BPF_ALU|BPF_MUL|BPF_X:
+ A *= X;
+ continue;
+
+ case BPF_ALU|BPF_MUL|BPF_K:
+ A *= fentry->k;
+ continue;
+
+ case BPF_ALU|BPF_DIV|BPF_X:
+ if(X == 0)
+ return (0);
+ A /= X;
+ continue;
+
+ case BPF_ALU|BPF_DIV|BPF_K:
+ if(fentry->k == 0)
+ return (0);
+ A /= fentry->k;
+ continue;
+
+ case BPF_ALU|BPF_AND|BPF_X:
+ A &= X;
+ continue;
+
+ case BPF_ALU|BPF_AND|BPF_K:
+ A &= fentry->k;
+ continue;
+
+ case BPF_ALU|BPF_OR|BPF_X:
+ A |= X;
+ continue;
+
+ case BPF_ALU|BPF_OR|BPF_K:
+ A |= fentry->k;
+ continue;
+
+ case BPF_ALU|BPF_LSH|BPF_X:
+ A <<= X;
+ continue;
+
+ case BPF_ALU|BPF_LSH|BPF_K:
+ A <<= fentry->k;
+ continue;
+
+ case BPF_ALU|BPF_RSH|BPF_X:
+ A >>= X;
+ continue;
+
+ case BPF_ALU|BPF_RSH|BPF_K:
+ A >>= fentry->k;
+ continue;
+
+ case BPF_ALU|BPF_NEG:
+ A = -A;
+ continue;
+
+ case BPF_JMP|BPF_JA:
+ pc += fentry->k;
+ continue;
+
+ case BPF_JMP|BPF_JGT|BPF_K:
+ pc += (A > fentry->k) ? fentry->jt : fentry->jf;
+ continue;
+
+ case BPF_JMP|BPF_JGE|BPF_K:
+ pc += (A >= fentry->k) ? fentry->jt : fentry->jf;
+ continue;
+
+ case BPF_JMP|BPF_JEQ|BPF_K:
+ pc += (A == fentry->k) ? fentry->jt : fentry->jf;
+ continue;
+
+ case BPF_JMP|BPF_JSET|BPF_K:
+ pc += (A & fentry->k) ? fentry->jt : fentry->jf;
+ continue;
+
+ case BPF_JMP|BPF_JGT|BPF_X:
+ pc += (A > X) ? fentry->jt : fentry->jf;
+ continue;
+
+ case BPF_JMP|BPF_JGE|BPF_X:
+ pc += (A >= X) ? fentry->jt : fentry->jf;
+ continue;
+
+ case BPF_JMP|BPF_JEQ|BPF_X:
+ pc += (A == X) ? fentry->jt : fentry->jf;
+ continue;
+
+ case BPF_JMP|BPF_JSET|BPF_X:
+ pc += (A & X) ? fentry->jt : fentry->jf;
+ continue;
+
+ case BPF_LD|BPF_W|BPF_ABS:
+ k = fentry->k;
+load_w:
+ if(k+sizeof(u32) <= len) {
+ A = ntohl(*(u32*)&data[k]);
+ continue;
+ }
+ if (k<0) {
+ u8 *ptr;
+
+ if (k>=SKF_AD_OFF)
+ break;
+ if ((ptr = load_pointer(skb, k)) != NULL) {
+ A = ntohl(*(u32*)ptr);
+ continue;
+ }
+ }
+ return 0;
+
+ case BPF_LD|BPF_H|BPF_ABS:
+ k = fentry->k;
+load_h:
+ if(k + sizeof(u16) <= len) {
+ A = ntohs(*(u16*)&data[k]);
+ continue;
+ }
+ if (k<0) {
+ u8 *ptr;
+
+ if (k>=SKF_AD_OFF)
+ break;
+ if ((ptr = load_pointer(skb, k)) != NULL) {
+ A = ntohs(*(u16*)ptr);
+ continue;
+ }
+ }
+ return 0;
+
+ case BPF_LD|BPF_B|BPF_ABS:
+ k = fentry->k;
+load_b:
+ if(k < len) {
+ A = data[k];
+ continue;
+ }
+ if (k<0) {
+ u8 *ptr;
+
+ if (k>=SKF_AD_OFF)
+ break;
+ if ((ptr = load_pointer(skb, k)) != NULL) {
+ A = *ptr;
+ continue;
+ }
+ }
+ return 0;
+
+ case BPF_LD|BPF_W|BPF_LEN:
+ A = len;
+ continue;
+
+ case BPF_LDX|BPF_W|BPF_LEN:
+ X = len;
+ continue;
+
+ case BPF_LD|BPF_W|BPF_IND:
+ k = X + fentry->k;
+ goto load_w;
+
+ case BPF_LD|BPF_H|BPF_IND:
+ k = X + fentry->k;
+ goto load_h;
+
+ case BPF_LD|BPF_B|BPF_IND:
+ k = X + fentry->k;
+ goto load_b;
+
+ case BPF_LDX|BPF_B|BPF_MSH:
+ k = fentry->k;
+ if(k >= len)
+ return (0);
+ X = (data[k] & 0xf) << 2;
+ continue;
+
+ case BPF_LD|BPF_IMM:
+ A = fentry->k;
+ continue;
+
+ case BPF_LDX|BPF_IMM:
+ X = fentry->k;
+ continue;
+
+ case BPF_LD|BPF_MEM:
+ A = mem[fentry->k];
+ continue;
+
+ case BPF_LDX|BPF_MEM:
+ X = mem[fentry->k];
+ continue;
+
+ case BPF_MISC|BPF_TAX:
+ X = A;
+ continue;
+
+ case BPF_MISC|BPF_TXA:
+ A = X;
+ continue;
+
+ case BPF_RET|BPF_K:
+ return ((unsigned int)fentry->k);
+
+ case BPF_RET|BPF_A:
+ return ((unsigned int)A);
+
+ case BPF_ST:
+ mem[fentry->k] = A;
+ continue;
+
+ case BPF_STX:
+ mem[fentry->k] = X;
+ continue;
+
+ default:
+ /* Invalid instruction counts as RET */
+ return (0);
+ }
+
+ /* Handle ancillary data, which is impossible
+ (or very difficult) to get by parsing packet contents.
+ */
+ switch (k-SKF_AD_OFF) {
+ case SKF_AD_PROTOCOL:
+ A = htons(skb->protocol);
+ continue;
+ case SKF_AD_PKTTYPE:
+ A = skb->pkt_type;
+ continue;
+ case SKF_AD_IFINDEX:
+ A = skb->dev->ifindex;
+ continue;
+ default:
+ return 0;
+ }
+ }
+
+ return (0);
+}
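As a concrete example of what this interpreter executes, here is the classic four-instruction program that keeps IPv4 frames and drops everything else. The struct sock_filter layout (code, jt, jf, k) and the BPF_* constants are the ones from <linux/filter.h>; only the array name is made up:

	#include <linux/filter.h>

	/* ldh [12]; jeq #0x0800, L1, L2; L1: ret #0xffff; L2: ret #0 */
	static struct sock_filter ipv4_only[] = {
		{ BPF_LD  | BPF_H   | BPF_ABS, 0, 0, 12     },	/* A = ethertype   */
		{ BPF_JMP | BPF_JEQ | BPF_K,   0, 1, 0x0800 },	/* IPv4?           */
		{ BPF_RET | BPF_K,             0, 0, 0xffff },	/* yes: keep frame */
		{ BPF_RET | BPF_K,             0, 0, 0      },	/* no: drop        */
	};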
+
+/*
+ * Check the user's filter code. If we let some ugly
+ * filter code slip through, kaboom!
+ */
+
+int sk_chk_filter(struct sock_filter *filter, int flen)
+{
+ struct sock_filter *ftest;
+ int pc;
+
+ /*
+ * Check the filter code now.
+ */
+ for(pc = 0; pc < flen; pc++)
+ {
+ /*
+ * All jumps are forward as they are not signed
+ */
+
+ ftest = &filter[pc];
+ if(BPF_CLASS(ftest->code) == BPF_JMP)
+ {
+ /*
+ * But they mustn't jump off the end.
+ */
+ if(BPF_OP(ftest->code) == BPF_JA)
+ {
+ /* Note: a large ftest->k might cause
+ loops. Compare this with the conditional
+ jumps below, where offsets are limited. --ANK (981016)
+ */
+ if (ftest->k >= (unsigned)(flen-pc-1))
+ return (-EINVAL);
+ }
+ else
+ {
+ /*
+ * For conditionals both must be safe
+ */
+ if(pc + ftest->jt +1 >= flen || pc + ftest->jf +1 >= flen)
+ return (-EINVAL);
+ }
+ }
+
+ /*
+ * Check that memory operations use valid addresses.
+ */
+
+ if (ftest->k >= BPF_MEMWORDS)
+ {
+ /*
+ * But it might not be a memory operation...
+ */
+ switch (ftest->code) {
+ case BPF_ST:
+ case BPF_STX:
+ case BPF_LD|BPF_MEM:
+ case BPF_LDX|BPF_MEM:
+ return -EINVAL;
+ }
+ }
+ }
+
+ /*
+ * The program must end with a return. We don't care where they
+ * jumped within the script (it's always forwards) but in the
+ * end they _will_ hit this.
+ */
+
+ return (BPF_CLASS(filter[flen - 1].code) == BPF_RET)?0:-EINVAL;
+}
+
+/*
+ * Attach the user's filter code. We first run some sanity checks on
+ * it to make sure it does not explode on us later.
+ */
+
+int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+{
+ struct sk_filter *fp;
+ unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
+ int err;
+
+ /* Make sure the new filter is there and its length is within bounds. */
+ if (fprog->filter == NULL || fprog->len > BPF_MAXINSNS)
+ return (-EINVAL);
+
+ fp = (struct sk_filter *)sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
+ if(fp == NULL)
+ return (-ENOMEM);
+
+ if (copy_from_user(fp->insns, fprog->filter, fsize)) {
+ sock_kfree_s(sk, fp, fsize+sizeof(*fp));
+ return -EFAULT;
+ }
+
+ atomic_set(&fp->refcnt, 1);
+ fp->len = fprog->len;
+
+ if ((err = sk_chk_filter(fp->insns, fp->len))==0) {
+ struct sk_filter *old_fp = sk->filter;
+ sk->filter = fp;
+ synchronize_bh();
+ fp = old_fp;
+ }
+
+ if (fp)
+ sk_filter_release(sk, fp);
+
+ return (err);
+}
+#endif /* CONFIG_FILTER */
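For context, user space hands such a program to sk_attach_filter through setsockopt(SO_ATTACH_FILTER). A minimal sketch, assuming an already-open packet socket fd and a program such as the ipv4_only array from the note above:

	#include <sys/socket.h>
	#include <linux/filter.h>

	/* Attach a BPF program of `count` instructions to socket fd. */
	static int attach_filter(int fd, struct sock_filter *insns,
				 unsigned short count)
	{
		struct sock_fprog prog = { .len = count, .filter = insns };

		return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
				  &prog, sizeof(prog));
	}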
diff --git a/pfinet/linux-src/net/core/firewall.c b/pfinet/linux-src/net/core/firewall.c
new file mode 100644
index 00000000..fc7b1a51
--- /dev/null
+++ b/pfinet/linux-src/net/core/firewall.c
@@ -0,0 +1,160 @@
+/*
+ * Generic loadable firewalls. At the moment only IP will actually
+ * use these, but people can add the others as they are needed.
+ *
+ * Authors: Dave Bonn (for IP)
+ * much hacked by: Alan Cox
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/firewall.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <asm/semaphore.h>
+
+struct semaphore firewall_sem = MUTEX;
+static int firewall_policy[NPROTO];
+static struct firewall_ops *firewall_chain[NPROTO];
+
+/*
+ * Register a firewall
+ */
+
+int register_firewall(int pf, struct firewall_ops *fw)
+{
+ struct firewall_ops **p;
+
+ if(pf<0||pf>=NPROTO)
+ return -EINVAL;
+
+ /*
+ * Don't allow two people to adjust at once.
+ */
+
+ down(&firewall_sem);
+
+ p=&firewall_chain[pf];
+
+ while(*p)
+ {
+ if(fw->fw_priority > (*p)->fw_priority)
+ break;
+ p=&((*p)->next);
+ }
+
+ /*
+ * We need to use a memory barrier to make sure that this
+ * works correctly even in SMP with weakly ordered writes.
+ *
+ * This is atomic wrt interrupts (and generally walking the
+ * chain), but not wrt itself (so you can't call this from
+ * an interrupt. Not that you'd want to).
+ */
+
+ fw->next=*p;
+ mb();
+ *p = fw;
+
+ /*
+ * And release the sleep lock
+ */
+
+ up(&firewall_sem);
+ return 0;
+}
+
+/*
+ * Unregister a firewall
+ */
+
+int unregister_firewall(int pf, struct firewall_ops *fw)
+{
+ struct firewall_ops **nl;
+
+ if(pf<0||pf>=NPROTO)
+ return -EINVAL;
+
+ /*
+ * Don't allow two people to adjust at once.
+ */
+
+ down(&firewall_sem);
+
+ nl=&firewall_chain[pf];
+
+ while(*nl!=NULL)
+ {
+ if(*nl==fw)
+ {
+ struct firewall_ops *f=fw->next;
+ *nl = f;
+ up(&firewall_sem);
+ synchronize_bh();
+ return 0;
+ }
+ nl=&((*nl)->next);
+ }
+ up(&firewall_sem);
+ return -ENOENT;
+}
+
+int call_fw_firewall(int pf, struct device *dev, void *phdr, void *arg, struct sk_buff **skb)
+{
+ struct firewall_ops *fw=firewall_chain[pf];
+
+ while(fw!=NULL)
+ {
+ int rc=fw->fw_forward(fw,pf,dev,phdr,arg,skb);
+ if(rc!=FW_SKIP)
+ return rc;
+ fw=fw->next;
+ }
+ return firewall_policy[pf];
+}
+
+/*
+ * Actual invocation of the chains
+ */
+
+int call_in_firewall(int pf, struct device *dev, void *phdr, void *arg, struct sk_buff **skb)
+{
+ struct firewall_ops *fw=firewall_chain[pf];
+
+ while(fw!=NULL)
+ {
+ int rc=fw->fw_input(fw,pf,dev,phdr,arg,skb);
+ if(rc!=FW_SKIP)
+ return rc;
+ fw=fw->next;
+ }
+ return firewall_policy[pf];
+}
+
+int call_out_firewall(int pf, struct device *dev, void *phdr, void *arg, struct sk_buff **skb)
+{
+ struct firewall_ops *fw=firewall_chain[pf];
+
+ while(fw!=NULL)
+ {
+ int rc=fw->fw_output(fw,pf,dev,phdr,arg,skb);
+ if(rc!=FW_SKIP)
+ return rc;
+ fw=fw->next;
+ }
+ /* alan, is this right? */
+ return firewall_policy[pf];
+}
+
+EXPORT_SYMBOL(register_firewall);
+EXPORT_SYMBOL(unregister_firewall);
+EXPORT_SYMBOL(call_in_firewall);
+EXPORT_SYMBOL(call_out_firewall);
+EXPORT_SYMBOL(call_fw_firewall);
+
+__initfunc(void fwchain_init(void))
+{
+ int i;
+ for(i=0;i<NPROTO;i++)
+ firewall_policy[i]=FW_ACCEPT;
+}
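All three call_*_firewall routines share one dispatch pattern: walk the chain in priority order, let the first hook that returns anything but FW_SKIP decide, and otherwise fall back to the per-protocol default policy installed by fwchain_init. A standalone model of that dispatch (the hook signature is simplified and the constant values are illustrative, not necessarily the kernel's):

	#define FW_SKIP   2	/* illustrative values */
	#define FW_ACCEPT 1

	struct hook {
		int (*fn)(void *pkt);
		struct hook *next;
	};

	/* First hook returning anything but FW_SKIP decides; otherwise
	   the per-protocol default policy applies. */
	static int run_chain(struct hook *chain, void *pkt, int policy)
	{
		struct hook *h;

		for (h = chain; h != NULL; h = h->next) {
			int verdict = h->fn(pkt);

			if (verdict != FW_SKIP)
				return verdict;
		}
		return policy;
	}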
diff --git a/pfinet/linux-src/net/core/iovec.c b/pfinet/linux-src/net/core/iovec.c
new file mode 100644
index 00000000..c20f8530
--- /dev/null
+++ b/pfinet/linux-src/net/core/iovec.c
@@ -0,0 +1,278 @@
+/*
+ * iovec manipulation routines.
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Andrew Lunn : Errors in iovec copying.
+ * Pedro Roque : Added memcpy_fromiovecend and
+ * csum_..._fromiovecend.
+ * Andi Kleen : fixed error handling for 2.1
+ * Alexey Kuznetsov: 2.1 optimisations
+ * Andi Kleen : Fix csum*fromiovecend for IPv6.
+ */
+
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/malloc.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <asm/uaccess.h>
+#include <asm/byteorder.h>
+#include <net/checksum.h>
+
+/*
+ * Verify iovec. The caller must ensure that the iovec is big enough
+ * to hold the message iovec.
+ *
+ * Save time by not doing verify_area; copy_*_user will make this work
+ * in any case.
+ */
+
+int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode)
+{
+ int size, err, ct;
+
+ if(m->msg_namelen)
+ {
+ if(mode==VERIFY_READ)
+ {
+ err=move_addr_to_kernel(m->msg_name, m->msg_namelen, address);
+ if(err<0)
+ goto out;
+ }
+
+ m->msg_name = address;
+ } else
+ m->msg_name = NULL;
+
+ err = -EFAULT;
+ size = m->msg_iovlen * sizeof(struct iovec);
+ if (copy_from_user(iov, m->msg_iov, size))
+ goto out;
+ m->msg_iov=iov;
+
+ for (err = 0, ct = 0; ct < m->msg_iovlen; ct++) {
+ err += iov[ct].iov_len;
+ /* The goal is not to verify user data, but to prevent returning
+ a negative value, which would be interpreted as an errno.
+ Overflow is still possible, but it is harmless.
+ */
+ if (err < 0)
+ return -EMSGSIZE;
+ }
+out:
+ return err;
+}
+
+/*
+ * Copy kernel to iovec. Returns -EFAULT on error.
+ *
+ * Note: this modifies the original iovec.
+ */
+
+int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
+{
+ int err = -EFAULT;
+
+ while(len>0)
+ {
+ if(iov->iov_len)
+ {
+ int copy = min(iov->iov_len, len);
+ if (copy_to_user(iov->iov_base, kdata, copy))
+ goto out;
+ kdata+=copy;
+ len-=copy;
+ iov->iov_len-=copy;
+ iov->iov_base+=copy;
+ }
+ iov++;
+ }
+ err = 0;
+out:
+ return err;
+}
+
+/*
+ * In-kernel copy to iovec. Cannot fault, so there is no error return.
+ *
+ * Note: this modifies the original iovec.
+ */
+
+void memcpy_tokerneliovec(struct iovec *iov, unsigned char *kdata, int len)
+{
+ while(len>0)
+ {
+ if(iov->iov_len)
+ {
+ int copy = min(iov->iov_len, len);
+ memcpy(iov->iov_base, kdata, copy);
+ kdata+=copy;
+ len-=copy;
+ iov->iov_len-=copy;
+ iov->iov_base+=copy;
+ }
+ iov++;
+ }
+}
+
+
+/*
+ * Copy iovec to kernel. Returns -EFAULT on error.
+ *
+ * Note: this modifies the original iovec.
+ */
+
+int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
+{
+ int err = -EFAULT;
+
+ while(len>0)
+ {
+ if(iov->iov_len)
+ {
+ int copy = min(len, iov->iov_len);
+ if (copy_from_user(kdata, iov->iov_base, copy))
+ goto out;
+ len-=copy;
+ kdata+=copy;
+ iov->iov_base+=copy;
+ iov->iov_len-=copy;
+ }
+ iov++;
+ }
+ err = 0;
+out:
+ return err;
+}
+
+
+/*
+ * For use with ip_build_xmit
+ */
+
+int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset,
+ int len)
+{
+ int err = -EFAULT;
+
+ /* Skip over the finished iovecs */
+ while(offset >= iov->iov_len)
+ {
+ offset -= iov->iov_len;
+ iov++;
+ }
+
+ while (len > 0)
+ {
+ u8 *base = iov->iov_base + offset;
+ int copy = min(len, iov->iov_len - offset);
+
+ offset = 0;
+ if (copy_from_user(kdata, base, copy))
+ goto out;
+ len -= copy;
+ kdata += copy;
+ iov++;
+ }
+ err = 0;
+out:
+ return err;
+}
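A userspace model of the scan performed by memcpy_fromiovecend may help: skip whole iovecs until the starting offset is consumed, then copy across element boundaries. The function below is a hypothetical illustration, not kernel code, and assumes the caller has verified that the gather list covers offset+len bytes:

	#include <string.h>
	#include <sys/uio.h>

	static void gather_from(char *kdata, const struct iovec *iov,
				size_t offset, size_t len)
	{
		while (offset >= iov->iov_len) {	/* skip finished iovecs */
			offset -= iov->iov_len;
			iov++;
		}
		while (len > 0) {
			size_t copy = iov->iov_len - offset;

			if (copy > len)
				copy = len;
			memcpy(kdata, (char *)iov->iov_base + offset, copy);
			kdata += copy;
			len -= copy;
			offset = 0;
			iov++;
		}
	}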
+
+/*
+ * And now for the all-in-one: copy and checksum from a user iovec
+ * directly to a datagram.
+ * All calls to csum_partial except the last must pass data in
+ * 32-bit chunks.
+ *
+ * ip_build_xmit must ensure that when fragmenting only the last
+ * call to this function will be unaligned also.
+ */
+
+int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov,
+ int offset, unsigned int len, int *csump)
+{
+ int csum = *csump;
+ int partial_cnt = 0, err = 0;
+
+ /* Skip over the finished iovecs */
+ while (offset >= iov->iov_len)
+ {
+ offset -= iov->iov_len;
+ iov++;
+ }
+
+ while (len > 0)
+ {
+ u8 *base = iov->iov_base + offset;
+ unsigned int copy = min(len, iov->iov_len - offset);
+
+ offset = 0;
+ /* There is a remnant from previous iov. */
+ if (partial_cnt)
+ {
+ int par_len = 4 - partial_cnt;
+
+ /* iov component is too short ... */
+ if (par_len > copy) {
+ if (copy_from_user(kdata, base, copy))
+ goto out_fault;
+ kdata += copy;
+ base += copy;
+ partial_cnt += copy;
+ len -= copy;
+ iov++;
+ if (len)
+ continue;
+ *csump = csum_partial(kdata - partial_cnt,
+ partial_cnt, csum);
+ goto out;
+ }
+ if (copy_from_user(kdata, base, par_len))
+ goto out_fault;
+ csum = csum_partial(kdata - partial_cnt, 4, csum);
+ kdata += par_len;
+ base += par_len;
+ copy -= par_len;
+ len -= par_len;
+ partial_cnt = 0;
+ }
+
+ if (len > copy)
+ {
+ partial_cnt = copy % 4;
+ if (partial_cnt)
+ {
+ copy -= partial_cnt;
+ if (copy_from_user(kdata + copy, base + copy,
+ partial_cnt))
+ goto out_fault;
+ }
+ }
+
+ if (copy) {
+ csum = csum_and_copy_from_user(base, kdata, copy,
+ csum, &err);
+ if (err)
+ goto out;
+ }
+ len -= copy + partial_cnt;
+ kdata += copy + partial_cnt;
+ iov++;
+ }
+ *csump = csum;
+out:
+ return err;
+
+out_fault:
+ err = -EFAULT;
+ goto out;
+}
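The csum_partial/csum_and_copy_from_user calls above accumulate the standard ones'-complement Internet checksum (RFC 1071); the kernel folds the running 32-bit sum to 16 bits separately in csum_fold. A self-contained reference version over a flat buffer, showing the whole computation including the folding:

	#include <stddef.h>
	#include <stdint.h>

	/* RFC 1071 ones'-complement checksum over a flat buffer. */
	static uint16_t inet_csum(const uint8_t *data, size_t len)
	{
		uint32_t sum = 0;

		while (len > 1) {
			sum += (uint32_t)data[0] << 8 | data[1];
			data += 2;
			len -= 2;
		}
		if (len)			/* odd trailing byte */
			sum += (uint32_t)data[0] << 8;
		while (sum >> 16)		/* fold carries back in */
			sum = (sum & 0xffff) + (sum >> 16);
		return (uint16_t)~sum;
	}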
diff --git a/pfinet/linux-src/net/core/neighbour.c b/pfinet/linux-src/net/core/neighbour.c
new file mode 100644
index 00000000..6afbfdcc
--- /dev/null
+++ b/pfinet/linux-src/net/core/neighbour.c
@@ -0,0 +1,1394 @@
+/*
+ * Generic address resolution entity
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Vitaly E. Lavrov releasing NULL neighbor in neigh_add.
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/socket.h>
+#include <linux/sched.h>
+#include <linux/netdevice.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <linux/rtnetlink.h>
+
+/*
+ NOTE. The most unpleasant question is serialization of
+ accesses to resolved addresses. The problem is that addresses
+ are modified by bh, but they are referenced from a normal
+ kernel thread. Before today no locking was done.
+ My reasoning was that a corrupted address token would be copied
+ to a packet with cosmologically small probability
+ (it is even difficult to estimate such a small number)
+ and it is very silly to waste cycles in the fast path to lock them.
+
+ But now I have changed my mind, though not because the previous
+ statement is wrong. Actually, neigh->ha MAY BE not an opaque byte
+ array, but a reference to some private data. In that case even a
+ negligible corruption probability becomes a bug.
+
+ - hh cache is protected by rwlock. It assumes that
+ hh cache update procedure is short and fast, and that
+ read_lock is cheaper than start_bh_atomic().
+ - ha tokens, saved in neighbour entries, are protected
+ by bh_atomic().
+ - no protection is made in /proc reading. It is OK, because
+ /proc is broken by design in any case, and
+ corrupted output is normal behaviour there.
+
+ --ANK (981025)
+ */
+
+#define NEIGH_DEBUG 1
+
+#define NEIGH_PRINTK(x...) printk(x)
+#define NEIGH_NOPRINTK(x...) do { ; } while(0)
+#define NEIGH_PRINTK0 NEIGH_PRINTK
+#define NEIGH_PRINTK1 NEIGH_NOPRINTK
+#define NEIGH_PRINTK2 NEIGH_NOPRINTK
+
+#if NEIGH_DEBUG >= 1
+#undef NEIGH_PRINTK1
+#define NEIGH_PRINTK1 NEIGH_PRINTK
+#endif
+#if NEIGH_DEBUG >= 2
+#undef NEIGH_PRINTK2
+#define NEIGH_PRINTK2 NEIGH_PRINTK
+#endif
+
+static void neigh_timer_handler(unsigned long arg);
+#ifdef CONFIG_ARPD
+static void neigh_app_notify(struct neighbour *n);
+#endif
+static int pneigh_ifdown(struct neigh_table *tbl, struct device *dev);
+
+static int neigh_glbl_allocs;
+static struct neigh_table *neigh_tables;
+
+static int neigh_blackhole(struct sk_buff *skb)
+{
+ kfree_skb(skb);
+ return -ENETDOWN;
+}
+
+/*
+ * It is a random distribution in the interval (1/2)*base...(3/2)*base;
+ * e.g. base = 30*HZ yields a reachable time spread uniformly over
+ * 15..45 seconds. It corresponds to the default IPv6 settings and is
+ * not overridable, because it is a really reasonable choice.
+ */
+
+unsigned long neigh_rand_reach_time(unsigned long base)
+{
+ return (net_random() % base) + (base>>1);
+}
+
+
+static int neigh_forced_gc(struct neigh_table *tbl)
+{
+ int shrunk = 0;
+ int i;
+
+ if (atomic_read(&tbl->lock))
+ return 0;
+
+ for (i=0; i<=NEIGH_HASHMASK; i++) {
+ struct neighbour *n, **np;
+
+ np = &tbl->hash_buckets[i];
+ while ((n = *np) != NULL) {
+ /* Neighbour record may be discarded if:
+ - nobody refers to it.
+ - it is not permanent
+ - (NEW and probably wrong)
+ INCOMPLETE entries are kept at least for
+ n->parms->retrans_time, otherwise we could
+ flood the network with resolution requests.
+ It is not clear which is worse: table overflow
+ or flooding.
+ */
+ if (atomic_read(&n->refcnt) == 0 &&
+ !(n->nud_state&NUD_PERMANENT) &&
+ (n->nud_state != NUD_INCOMPLETE ||
+ jiffies - n->used > n->parms->retrans_time)) {
+ *np = n->next;
+ n->tbl = NULL;
+ tbl->entries--;
+ shrunk = 1;
+ neigh_destroy(n);
+ continue;
+ }
+ np = &n->next;
+ }
+ }
+
+ tbl->last_flush = jiffies;
+ return shrunk;
+}
+
+int neigh_ifdown(struct neigh_table *tbl, struct device *dev)
+{
+ int i;
+
+ if (atomic_read(&tbl->lock)) {
+ NEIGH_PRINTK1("neigh_ifdown: impossible event 1763\n");
+ return -EBUSY;
+ }
+
+ start_bh_atomic();
+ for (i=0; i<=NEIGH_HASHMASK; i++) {
+ struct neighbour *n, **np;
+
+ np = &tbl->hash_buckets[i];
+ while ((n = *np) != NULL) {
+ if (dev && n->dev != dev) {
+ np = &n->next;
+ continue;
+ }
+ *np = n->next;
+ n->tbl = NULL;
+ tbl->entries--;
+ if (atomic_read(&n->refcnt)) {
+ /* The most unpleasant situation.
+ We must destroy the neighbour entry,
+ but someone still uses it.
+
+ The destroy will be delayed until
+ the last user releases us, but
+ we must kill timers etc. and move
+ it to safe state.
+ */
+ if (n->nud_state & NUD_IN_TIMER)
+ del_timer(&n->timer);
+ n->parms = &tbl->parms;
+ skb_queue_purge(&n->arp_queue);
+ n->output = neigh_blackhole;
+ if (n->nud_state&NUD_VALID)
+ n->nud_state = NUD_NOARP;
+ else
+ n->nud_state = NUD_NONE;
+ NEIGH_PRINTK2("neigh %p is stray.\n", n);
+ } else
+ neigh_destroy(n);
+ }
+ }
+
+ del_timer(&tbl->proxy_timer);
+ skb_queue_purge(&tbl->proxy_queue);
+ pneigh_ifdown(tbl, dev);
+ end_bh_atomic();
+ return 0;
+}
+
+static struct neighbour *neigh_alloc(struct neigh_table *tbl, int creat)
+{
+ struct neighbour *n;
+ unsigned long now = jiffies;
+
+ if (tbl->entries > tbl->gc_thresh1) {
+ if (creat < 0)
+ return NULL;
+ if (tbl->entries > tbl->gc_thresh3 ||
+ (tbl->entries > tbl->gc_thresh2 &&
+ now - tbl->last_flush > 5*HZ)) {
+ if (neigh_forced_gc(tbl) == 0 &&
+ tbl->entries > tbl->gc_thresh3)
+ return NULL;
+ }
+ }
+
+ n = kmalloc(tbl->entry_size, GFP_ATOMIC);
+ if (n == NULL)
+ return NULL;
+
+ memset(n, 0, tbl->entry_size);
+
+ skb_queue_head_init(&n->arp_queue);
+ n->updated = n->used = now;
+ n->nud_state = NUD_NONE;
+ n->output = neigh_blackhole;
+ n->parms = &tbl->parms;
+ init_timer(&n->timer);
+ n->timer.function = neigh_timer_handler;
+ n->timer.data = (unsigned long)n;
+ tbl->stats.allocs++;
+ neigh_glbl_allocs++;
+ return n;
+}
+
+
+struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey,
+ struct device *dev, int creat)
+{
+ struct neighbour *n;
+ u32 hash_val;
+ int key_len = tbl->key_len;
+
+ hash_val = *(u32*)(pkey + key_len - 4);
+ hash_val ^= (hash_val>>16);
+ hash_val ^= hash_val>>8;
+ hash_val ^= hash_val>>3;
+ hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK;
+
+ for (n = tbl->hash_buckets[hash_val]; n; n = n->next) {
+ if (dev == n->dev &&
+ memcmp(n->primary_key, pkey, key_len) == 0) {
+ atomic_inc(&n->refcnt);
+ return n;
+ }
+ }
+ if (!creat)
+ return NULL;
+
+ n = neigh_alloc(tbl, creat);
+ if (n == NULL)
+ return NULL;
+
+ memcpy(n->primary_key, pkey, key_len);
+ n->dev = dev;
+
+ /* Protocol specific setup. */
+ if (tbl->constructor && tbl->constructor(n) < 0) {
+ neigh_destroy(n);
+ return NULL;
+ }
+
+ /* Device specific setup. */
+ if (n->parms && n->parms->neigh_setup && n->parms->neigh_setup(n) < 0) {
+ neigh_destroy(n);
+ return NULL;
+ }
+
+ n->confirmed = jiffies - (n->parms->base_reachable_time<<1);
+ atomic_set(&n->refcnt, 1);
+ tbl->entries++;
+ n->next = tbl->hash_buckets[hash_val];
+ tbl->hash_buckets[hash_val] = n;
+ n->tbl = tbl;
+ NEIGH_PRINTK2("neigh %p is created.\n", n);
+ return n;
+}
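The bucket computation above xor-folds the last four key bytes and mixes in the interface index. In isolation (NEIGH_HASHMASK here is an assumed 32-bucket mask; the tree's real value lives in <net/neighbour.h>):

	#include <stdint.h>

	#define NEIGH_HASHMASK 0x1F	/* assumed 32 buckets */

	/* Fold the tail of the primary key into a bucket index. */
	static unsigned neigh_hash(uint32_t key_tail, int ifindex)
	{
		uint32_t h = key_tail;	/* last 4 bytes of the key */

		h ^= h >> 16;
		h ^= h >> 8;
		h ^= h >> 3;
		return (h ^ (uint32_t)ifindex) & NEIGH_HASHMASK;
	}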
+
+struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey,
+ struct device *dev, int creat)
+{
+ struct pneigh_entry *n;
+ u32 hash_val;
+ int key_len = tbl->key_len;
+
+ hash_val = *(u32*)(pkey + key_len - 4);
+ hash_val ^= (hash_val>>16);
+ hash_val ^= hash_val>>8;
+ hash_val ^= hash_val>>4;
+ hash_val &= PNEIGH_HASHMASK;
+
+ for (n = tbl->phash_buckets[hash_val]; n; n = n->next) {
+ if (memcmp(n->key, pkey, key_len) == 0 &&
+ (n->dev == dev || !n->dev))
+ return n;
+ }
+ if (!creat)
+ return NULL;
+
+ n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL);
+ if (n == NULL)
+ return NULL;
+
+ memcpy(n->key, pkey, key_len);
+ n->dev = dev;
+
+ if (tbl->pconstructor && tbl->pconstructor(n)) {
+ kfree(n);
+ return NULL;
+ }
+
+ n->next = tbl->phash_buckets[hash_val];
+ tbl->phash_buckets[hash_val] = n;
+ return n;
+}
+
+
+int pneigh_delete(struct neigh_table *tbl, const void *pkey, struct device *dev)
+{
+ struct pneigh_entry *n, **np;
+ u32 hash_val;
+ int key_len = tbl->key_len;
+
+ hash_val = *(u32*)(pkey + key_len - 4);
+ hash_val ^= (hash_val>>16);
+ hash_val ^= hash_val>>8;
+ hash_val ^= hash_val>>4;
+ hash_val &= PNEIGH_HASHMASK;
+
+ for (np = &tbl->phash_buckets[hash_val]; (n=*np) != NULL; np = &n->next) {
+ if (memcmp(n->key, pkey, key_len) == 0 && n->dev == dev) {
+ *np = n->next;
+ synchronize_bh();
+ if (tbl->pdestructor)
+ tbl->pdestructor(n);
+ kfree(n);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static int pneigh_ifdown(struct neigh_table *tbl, struct device *dev)
+{
+ struct pneigh_entry *n, **np;
+ u32 h;
+
+ for (h=0; h<=PNEIGH_HASHMASK; h++) {
+ np = &tbl->phash_buckets[h];
+ while ((n=*np) != NULL) {
+ if (n->dev == dev || dev == NULL) {
+ *np = n->next;
+ synchronize_bh();
+ if (tbl->pdestructor)
+ tbl->pdestructor(n);
+ kfree(n);
+ continue;
+ }
+ np = &n->next;
+ }
+ }
+ return -ENOENT;
+}
+
+
+/*
+ * neighbour must already be out of the table;
+ *
+ */
+void neigh_destroy(struct neighbour *neigh)
+{
+ struct hh_cache *hh;
+
+ if (neigh->tbl || atomic_read(&neigh->refcnt)) {
+ NEIGH_PRINTK1("neigh_destroy: neighbour still in use, tbl=%p, ref=%d: "
+ "called from %p\n", neigh->tbl, atomic_read(&neigh->refcnt), __builtin_return_address(0));
+ return;
+ }
+
+ if (neigh->nud_state&NUD_IN_TIMER)
+ del_timer(&neigh->timer);
+
+ while ((hh = neigh->hh) != NULL) {
+ neigh->hh = hh->hh_next;
+ hh->hh_next = NULL;
+ hh->hh_output = neigh_blackhole;
+ if (atomic_dec_and_test(&hh->hh_refcnt))
+ kfree(hh);
+ }
+
+ if (neigh->ops && neigh->ops->destructor)
+ (neigh->ops->destructor)(neigh);
+
+ skb_queue_purge(&neigh->arp_queue);
+
+ NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
+
+ neigh_glbl_allocs--;
+ kfree(neigh);
+}
+
+/* Neighbour state is suspicious;
+ disable fast path.
+ */
+static void neigh_suspect(struct neighbour *neigh)
+{
+ struct hh_cache *hh;
+
+ NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);
+
+ neigh->output = neigh->ops->output;
+
+ for (hh = neigh->hh; hh; hh = hh->hh_next)
+ hh->hh_output = neigh->ops->output;
+}
+
+/* Neighbour state is OK;
+ enable fast path.
+ */
+static void neigh_connect(struct neighbour *neigh)
+{
+ struct hh_cache *hh;
+
+ NEIGH_PRINTK2("neigh %p is connected.\n", neigh);
+
+ neigh->output = neigh->ops->connected_output;
+
+ for (hh = neigh->hh; hh; hh = hh->hh_next)
+ hh->hh_output = neigh->ops->hh_output;
+}
+
+/*
+ Transitions NUD_STALE <-> NUD_REACHABLE do not occur
+ while the fast path is built: we have no timers associated with
+ these states, and we have no time to check state when sending.
+ neigh_periodic_timer periodically checks the neigh->confirmed
+ time and moves NUD_REACHABLE -> NUD_STALE.
+
+ If a routine wants to know TRUE entry state, it calls
+ neigh_sync before checking state.
+ */
+
+static void neigh_sync(struct neighbour *n)
+{
+ unsigned long now = jiffies;
+ u8 state = n->nud_state;
+
+ if (state&(NUD_NOARP|NUD_PERMANENT))
+ return;
+ if (state&NUD_REACHABLE) {
+ if (now - n->confirmed > n->parms->reachable_time) {
+ n->nud_state = NUD_STALE;
+ neigh_suspect(n);
+ }
+ } else if (state&NUD_VALID) {
+ if (now - n->confirmed < n->parms->reachable_time) {
+ if (state&NUD_IN_TIMER)
+ del_timer(&n->timer);
+ n->nud_state = NUD_REACHABLE;
+ neigh_connect(n);
+ }
+ }
+}
+
+static void neigh_periodic_timer(unsigned long arg)
+{
+ struct neigh_table *tbl = (struct neigh_table*)arg;
+ unsigned long now = jiffies;
+ int i;
+
+ if (atomic_read(&tbl->lock)) {
+ tbl->gc_timer.expires = now + 1*HZ;
+ add_timer(&tbl->gc_timer);
+ return;
+ }
+
+ /*
+ * periodically recompute ReachableTime from the random function
+ */
+
+ if (now - tbl->last_rand > 300*HZ) {
+ struct neigh_parms *p;
+ tbl->last_rand = now;
+ for (p=&tbl->parms; p; p = p->next)
+ p->reachable_time = neigh_rand_reach_time(p->base_reachable_time);
+ }
+
+ for (i=0; i <= NEIGH_HASHMASK; i++) {
+ struct neighbour *n, **np;
+
+ np = &tbl->hash_buckets[i];
+ while ((n = *np) != NULL) {
+ unsigned state = n->nud_state;
+
+ if (state&(NUD_PERMANENT|NUD_IN_TIMER))
+ goto next_elt;
+
+ if ((long)(n->used - n->confirmed) < 0)
+ n->used = n->confirmed;
+
+ if (atomic_read(&n->refcnt) == 0 &&
+ (state == NUD_FAILED || now - n->used > n->parms->gc_staletime)) {
+ *np = n->next;
+ n->tbl = NULL;
+ n->next = NULL;
+ tbl->entries--;
+ neigh_destroy(n);
+ continue;
+ }
+
+ if (n->nud_state&NUD_REACHABLE &&
+ now - n->confirmed > n->parms->reachable_time) {
+ n->nud_state = NUD_STALE;
+ neigh_suspect(n);
+ }
+
+next_elt:
+ np = &n->next;
+ }
+ }
+
+ tbl->gc_timer.expires = now + tbl->gc_interval;
+ add_timer(&tbl->gc_timer);
+}
+
+static __inline__ int neigh_max_probes(struct neighbour *n)
+{
+ struct neigh_parms *p = n->parms;
+ return p->ucast_probes + p->app_probes + p->mcast_probes;
+}
+
+
+/* Called when a timer expires for a neighbour entry. */
+
+static void neigh_timer_handler(unsigned long arg)
+{
+ unsigned long now = jiffies;
+ struct neighbour *neigh = (struct neighbour*)arg;
+ unsigned state = neigh->nud_state;
+
+ if (!(state&NUD_IN_TIMER)) {
+ NEIGH_PRINTK1("neigh: timer & !nud_in_timer\n");
+ return;
+ }
+
+ if ((state&NUD_VALID) &&
+ now - neigh->confirmed < neigh->parms->reachable_time) {
+ neigh->nud_state = NUD_REACHABLE;
+ NEIGH_PRINTK2("neigh %p is still alive.\n", neigh);
+ neigh_connect(neigh);
+ return;
+ }
+ if (state == NUD_DELAY) {
+ NEIGH_PRINTK2("neigh %p is probed.\n", neigh);
+ neigh->nud_state = NUD_PROBE;
+ neigh->probes = 0;
+ }
+
+ if (neigh->probes >= neigh_max_probes(neigh)) {
+ struct sk_buff *skb;
+
+ neigh->nud_state = NUD_FAILED;
+ neigh->tbl->stats.res_failed++;
+ NEIGH_PRINTK2("neigh %p is failed.\n", neigh);
+
+ /* This is a very delicate place. report_unreachable is a very
+ complicated routine. In particular, it can hit the same
+ neighbour entry!
+
+ So we try to be careful and avoid a dead loop. --ANK
+ */
+ while(neigh->nud_state==NUD_FAILED && (skb=__skb_dequeue(&neigh->arp_queue)) != NULL)
+ neigh->ops->error_report(neigh, skb);
+ skb_queue_purge(&neigh->arp_queue);
+ return;
+ }
+
+ neigh->timer.expires = now + neigh->parms->retrans_time;
+ add_timer(&neigh->timer);
+
+ neigh->ops->solicit(neigh, skb_peek(&neigh->arp_queue));
+ neigh->probes++;
+}
+
+int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
+{
+ start_bh_atomic();
+ if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) {
+ if (!(neigh->nud_state&(NUD_STALE|NUD_INCOMPLETE))) {
+ if (neigh->tbl == NULL) {
+ NEIGH_PRINTK2("neigh %p used after death.\n", neigh);
+ if (skb)
+ kfree_skb(skb);
+ end_bh_atomic();
+ return 1;
+ }
+ if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
+ neigh->probes = neigh->parms->ucast_probes;
+ neigh->nud_state = NUD_INCOMPLETE;
+ neigh->timer.expires = jiffies + neigh->parms->retrans_time;
+ add_timer(&neigh->timer);
+
+ neigh->ops->solicit(neigh, skb);
+ neigh->probes++;
+ } else {
+ neigh->nud_state = NUD_FAILED;
+ if (skb)
+ kfree_skb(skb);
+ end_bh_atomic();
+ return 1;
+ }
+ }
+ if (neigh->nud_state == NUD_INCOMPLETE) {
+ if (skb) {
+ if (skb_queue_len(&neigh->arp_queue) >= neigh->parms->queue_len) {
+ struct sk_buff *buff;
+ buff = neigh->arp_queue.prev;
+ __skb_unlink(buff, &neigh->arp_queue);
+ kfree_skb(buff);
+ }
+ __skb_queue_head(&neigh->arp_queue, skb);
+ }
+ end_bh_atomic();
+ return 1;
+ }
+ if (neigh->nud_state == NUD_STALE) {
+ NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
+ neigh->nud_state = NUD_DELAY;
+ neigh->timer.expires = jiffies + neigh->parms->delay_probe_time;
+ add_timer(&neigh->timer);
+ }
+ }
+ end_bh_atomic();
+ return 0;
+}
+
+static __inline__ void neigh_update_hhs(struct neighbour *neigh)
+{
+ struct hh_cache *hh;
+ void (*update)(struct hh_cache*, struct device*, unsigned char*) =
+ neigh->dev->header_cache_update;
+
+ if (update) {
+ for (hh=neigh->hh; hh; hh=hh->hh_next) {
+ write_lock_irq(&hh->hh_lock);
+ update(hh, neigh->dev, neigh->ha);
+ write_unlock_irq(&hh->hh_lock);
+ }
+ }
+}
+
+
+
+/* Generic update routine.
+ -- lladdr is new lladdr or NULL, if it is not supplied.
+ -- new is new state.
+ -- override==1 allows overriding an existing lladdr, if it is different.
+ -- arp==0 means that the change is administrative.
+ */
+
+int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int arp)
+{
+ u8 old = neigh->nud_state;
+ struct device *dev = neigh->dev;
+
+ if (arp && (old&(NUD_NOARP|NUD_PERMANENT)))
+ return -EPERM;
+
+ if (!(new&NUD_VALID)) {
+ if (old&NUD_IN_TIMER)
+ del_timer(&neigh->timer);
+ if (old&NUD_CONNECTED)
+ neigh_suspect(neigh);
+ neigh->nud_state = new;
+ return 0;
+ }
+
+ /* Compare new lladdr with cached one */
+ if (dev->addr_len == 0) {
+ /* First case: device needs no address. */
+ lladdr = neigh->ha;
+ } else if (lladdr) {
+ /* The second case: if something is already cached
+ and a new address is proposed:
+ - compare new & old
+ - if they are different, check override flag
+ */
+ if (old&NUD_VALID) {
+ if (memcmp(lladdr, neigh->ha, dev->addr_len) == 0)
+ lladdr = neigh->ha;
+ else if (!override)
+ return -EPERM;
+ }
+ } else {
+ /* No address is supplied; if we know something,
+ use it, otherwise discard the request.
+ */
+ if (!(old&NUD_VALID))
+ return -EINVAL;
+ lladdr = neigh->ha;
+ }
+
+ neigh_sync(neigh);
+ old = neigh->nud_state;
+ if (new&NUD_CONNECTED)
+ neigh->confirmed = jiffies;
+ neigh->updated = jiffies;
+
+ /* If entry was valid and address is not changed,
+ do not change entry state, if new one is STALE.
+ */
+ if (old&NUD_VALID) {
+ if (lladdr == neigh->ha)
+ if (new == old || (new == NUD_STALE && (old&NUD_CONNECTED)))
+ return 0;
+ }
+ if (old&NUD_IN_TIMER)
+ del_timer(&neigh->timer);
+ neigh->nud_state = new;
+ if (lladdr != neigh->ha) {
+ memcpy(&neigh->ha, lladdr, dev->addr_len);
+ neigh_update_hhs(neigh);
+ neigh->confirmed = jiffies - (neigh->parms->base_reachable_time<<1);
+#ifdef CONFIG_ARPD
+ if (neigh->parms->app_probes)
+ neigh_app_notify(neigh);
+#endif
+ }
+ if (new == old)
+ return 0;
+ if (new&NUD_CONNECTED)
+ neigh_connect(neigh);
+ else
+ neigh_suspect(neigh);
+ if (!(old&NUD_VALID)) {
+ struct sk_buff *skb;
+
+ /* Again: avoid dead loop if something went wrong */
+
+ while (neigh->nud_state&NUD_VALID &&
+ (skb=__skb_dequeue(&neigh->arp_queue)) != NULL) {
+ struct neighbour *n1 = neigh;
+ /* On shaper/eql skb->dst->neighbour != neigh :( */
+ if (skb->dst && skb->dst->neighbour)
+ n1 = skb->dst->neighbour;
+ n1->output(skb);
+ }
+ skb_queue_purge(&neigh->arp_queue);
+ }
+ return 0;
+}
+
+struct neighbour * neigh_event_ns(struct neigh_table *tbl,
+ u8 *lladdr, void *saddr,
+ struct device *dev)
+{
+ struct neighbour *neigh;
+
+ neigh = __neigh_lookup(tbl, saddr, dev, lladdr || !dev->addr_len);
+ if (neigh)
+ neigh_update(neigh, lladdr, NUD_STALE, 1, 1);
+ return neigh;
+}
+
+static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, u16 protocol)
+{
+ struct hh_cache *hh = NULL;
+ struct device *dev = dst->dev;
+
+ for (hh=n->hh; hh; hh = hh->hh_next)
+ if (hh->hh_type == protocol)
+ break;
+
+ if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {
+ memset(hh, 0, sizeof(struct hh_cache));
+ hh->hh_type = protocol;
+ atomic_set(&hh->hh_refcnt, 0);
+ hh->hh_next = NULL;
+ if (dev->hard_header_cache(n, hh)) {
+ kfree(hh);
+ hh = NULL;
+ } else {
+ atomic_inc(&hh->hh_refcnt);
+ hh->hh_next = n->hh;
+ n->hh = hh;
+ if (n->nud_state&NUD_CONNECTED)
+ hh->hh_output = n->ops->hh_output;
+ else
+ hh->hh_output = n->ops->output;
+ }
+ }
+ if (hh) {
+ atomic_inc(&hh->hh_refcnt);
+ dst->hh = hh;
+ }
+}
+
+/* This function can be used in contexts where only the old dev_queue_xmit
+ worked, e.g. if you want to override the normal output path (eql, shaper),
+ but resolution has not been made yet.
+ */
+
+int neigh_compat_output(struct sk_buff *skb)
+{
+ struct device *dev = skb->dev;
+
+ __skb_pull(skb, skb->nh.raw - skb->data);
+
+ if (dev->hard_header &&
+ dev->hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, skb->len) < 0 &&
+ dev->rebuild_header(skb))
+ return 0;
+
+ return dev_queue_xmit(skb);
+}
+
+/* Slow and careful. */
+
+int neigh_resolve_output(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb->dst;
+ struct neighbour *neigh;
+
+ if (!dst || !(neigh = dst->neighbour))
+ goto discard;
+
+ __skb_pull(skb, skb->nh.raw - skb->data);
+
+ if (neigh_event_send(neigh, skb) == 0) {
+ int err;
+ struct device *dev = neigh->dev;
+ if (dev->hard_header_cache && dst->hh == NULL) {
+ start_bh_atomic();
+ if (dst->hh == NULL)
+ neigh_hh_init(neigh, dst, dst->ops->protocol);
+ err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
+ end_bh_atomic();
+ } else {
+ start_bh_atomic();
+ err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
+ end_bh_atomic();
+ }
+ if (err >= 0)
+ return neigh->ops->queue_xmit(skb);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ return 0;
+
+discard:
+ NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", dst, dst ? dst->neighbour : NULL);
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+/* As fast as possible without hh cache */
+
+int neigh_connected_output(struct sk_buff *skb)
+{
+ int err;
+ struct dst_entry *dst = skb->dst;
+ struct neighbour *neigh = dst->neighbour;
+ struct device *dev = neigh->dev;
+
+ __skb_pull(skb, skb->nh.raw - skb->data);
+
+ start_bh_atomic();
+ err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
+ end_bh_atomic();
+ if (err >= 0)
+ return neigh->ops->queue_xmit(skb);
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static void neigh_proxy_process(unsigned long arg)
+{
+ struct neigh_table *tbl = (struct neigh_table *)arg;
+ long sched_next = 0;
+ unsigned long now = jiffies;
+ struct sk_buff *skb = tbl->proxy_queue.next;
+
+ while (skb != (struct sk_buff*)&tbl->proxy_queue) {
+ struct sk_buff *back = skb;
+ long tdif = back->stamp.tv_usec - now;
+
+ skb = skb->next;
+ if (tdif <= 0) {
+ __skb_unlink(back, &tbl->proxy_queue);
+ if (tbl->proxy_redo)
+ tbl->proxy_redo(back);
+ else
+ kfree_skb(back);
+ } else if (!sched_next || tdif < sched_next)
+ sched_next = tdif;
+ }
+ del_timer(&tbl->proxy_timer);
+ if (sched_next) {
+ tbl->proxy_timer.expires = jiffies + sched_next;
+ add_timer(&tbl->proxy_timer);
+ }
+}
+
+void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
+ struct sk_buff *skb)
+{
+ unsigned long now = jiffies;
+ long sched_next = net_random()%p->proxy_delay;
+
+ if (tbl->proxy_queue.qlen > p->proxy_qlen) {
+ kfree_skb(skb);
+ return;
+ }
+ skb->stamp.tv_sec = 0;
+ skb->stamp.tv_usec = now + sched_next;
+ if (del_timer(&tbl->proxy_timer)) {
+ long tval = tbl->proxy_timer.expires - now;
+ if (tval < sched_next)
+ sched_next = tval;
+ }
+ tbl->proxy_timer.expires = now + sched_next;
+ dst_release(skb->dst);
+ skb->dst = NULL;
+ __skb_queue_tail(&tbl->proxy_queue, skb);
+ add_timer(&tbl->proxy_timer);
+}
+
+
+struct neigh_parms *neigh_parms_alloc(struct device *dev, struct neigh_table *tbl)
+{
+ struct neigh_parms *p;
+ p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (p) {
+ memcpy(p, &tbl->parms, sizeof(*p));
+ p->tbl = tbl;
+ p->reachable_time = neigh_rand_reach_time(p->base_reachable_time);
+ if (dev && dev->neigh_setup) {
+ if (dev->neigh_setup(dev, p)) {
+ kfree(p);
+ return NULL;
+ }
+ }
+ p->next = tbl->parms.next;
+ tbl->parms.next = p;
+ }
+ return p;
+}
+
+void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
+{
+ struct neigh_parms **p;
+
+ if (parms == NULL || parms == &tbl->parms)
+ return;
+ for (p = &tbl->parms.next; *p; p = &(*p)->next) {
+ if (*p == parms) {
+ *p = parms->next;
+ synchronize_bh();
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_unregister(parms);
+#endif
+ kfree(parms);
+ return;
+ }
+ }
+ NEIGH_PRINTK1("neigh_release_parms: not found\n");
+}
+
+
+void neigh_table_init(struct neigh_table *tbl)
+{
+ unsigned long now = jiffies;
+
+ tbl->parms.reachable_time = neigh_rand_reach_time(tbl->parms.base_reachable_time);
+
+ init_timer(&tbl->gc_timer);
+ tbl->gc_timer.data = (unsigned long)tbl;
+ tbl->gc_timer.function = neigh_periodic_timer;
+ tbl->gc_timer.expires = now + tbl->gc_interval + tbl->parms.reachable_time;
+ add_timer(&tbl->gc_timer);
+
+ init_timer(&tbl->proxy_timer);
+ tbl->proxy_timer.data = (unsigned long)tbl;
+ tbl->proxy_timer.function = neigh_proxy_process;
+ skb_queue_head_init(&tbl->proxy_queue);
+
+ tbl->last_flush = now;
+ tbl->last_rand = now + tbl->parms.reachable_time*20;
+ tbl->next = neigh_tables;
+ neigh_tables = tbl;
+}
+
+int neigh_table_clear(struct neigh_table *tbl)
+{
+ struct neigh_table **tp;
+
+ start_bh_atomic();
+ del_timer(&tbl->gc_timer);
+ del_timer(&tbl->proxy_timer);
+ skb_queue_purge(&tbl->proxy_queue);
+ neigh_ifdown(tbl, NULL);
+ end_bh_atomic();
+ if (tbl->entries)
+ printk(KERN_CRIT "neighbour leakage\n");
+ for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
+ if (*tp == tbl) {
+ *tp = tbl->next;
+ synchronize_bh();
+ break;
+ }
+ }
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_unregister(&tbl->parms);
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_RTNETLINK
+
+
+int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+ struct ndmsg *ndm = NLMSG_DATA(nlh);
+ struct rtattr **nda = arg;
+ struct neigh_table *tbl;
+ struct device *dev = NULL;
+
+ if (ndm->ndm_ifindex) {
+ if ((dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL)
+ return -ENODEV;
+ }
+
+ for (tbl=neigh_tables; tbl; tbl = tbl->next) {
+ int err = 0;
+ struct neighbour *n;
+
+ if (tbl->family != ndm->ndm_family)
+ continue;
+
+ if (nda[NDA_DST-1] == NULL ||
+ nda[NDA_DST-1]->rta_len != RTA_LENGTH(tbl->key_len))
+ return -EINVAL;
+
+ if (ndm->ndm_flags&NTF_PROXY)
+ return pneigh_delete(tbl, RTA_DATA(nda[NDA_DST-1]), dev);
+
+ if (dev == NULL)
+ return -EINVAL;
+
+ start_bh_atomic();
+ n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 0);
+ if (n) {
+ err = neigh_update(n, NULL, NUD_FAILED, 1, 0);
+ neigh_release(n);
+ }
+ end_bh_atomic();
+ return err;
+ }
+
+ return -EADDRNOTAVAIL;
+}
+
+int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+ struct ndmsg *ndm = NLMSG_DATA(nlh);
+ struct rtattr **nda = arg;
+ struct neigh_table *tbl;
+ struct device *dev = NULL;
+
+ if (ndm->ndm_ifindex) {
+ if ((dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL)
+ return -ENODEV;
+ }
+
+ for (tbl=neigh_tables; tbl; tbl = tbl->next) {
+ int err = 0;
+ struct neighbour *n;
+
+ if (tbl->family != ndm->ndm_family)
+ continue;
+ if (nda[NDA_DST-1] == NULL ||
+ nda[NDA_DST-1]->rta_len != RTA_LENGTH(tbl->key_len))
+ return -EINVAL;
+ if (ndm->ndm_flags&NTF_PROXY) {
+ if (pneigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 1))
+ return 0;
+ return -ENOBUFS;
+ }
+ if (dev == NULL)
+ return -EINVAL;
+ if (nda[NDA_LLADDR-1] != NULL &&
+ nda[NDA_LLADDR-1]->rta_len != RTA_LENGTH(dev->addr_len))
+ return -EINVAL;
+ start_bh_atomic();
+ n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 0);
+ if (n) {
+ if (nlh->nlmsg_flags&NLM_F_EXCL)
+ err = -EEXIST;
+ } else if (!(nlh->nlmsg_flags&NLM_F_CREATE))
+ err = -ENOENT;
+ else {
+ n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 1);
+ if (n == NULL)
+ err = -ENOBUFS;
+ }
+ if (err == 0) {
+ err = neigh_update(n, nda[NDA_LLADDR-1] ? RTA_DATA(nda[NDA_LLADDR-1]) : NULL,
+ ndm->ndm_state,
+ nlh->nlmsg_flags&NLM_F_REPLACE, 0);
+ }
+ if (n)
+ neigh_release(n);
+ end_bh_atomic();
+ return err;
+ }
+
+ return -EADDRNOTAVAIL;
+}
+
+
+static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
+ u32 pid, u32 seq, int event)
+{
+ unsigned long now = jiffies;
+ struct ndmsg *ndm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb->tail;
+ struct nda_cacheinfo ci;
+
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ndm));
+ ndm = NLMSG_DATA(nlh);
+ ndm->ndm_family = n->ops->family;
+ ndm->ndm_flags = n->flags;
+ ndm->ndm_type = n->type;
+ ndm->ndm_state = n->nud_state;
+ ndm->ndm_ifindex = n->dev->ifindex;
+ RTA_PUT(skb, NDA_DST, n->tbl->key_len, n->primary_key);
+ if (n->nud_state&NUD_VALID)
+ RTA_PUT(skb, NDA_LLADDR, n->dev->addr_len, n->ha);
+ ci.ndm_used = now - n->used;
+ ci.ndm_confirmed = now - n->confirmed;
+ ci.ndm_updated = now - n->updated;
+ ci.ndm_refcnt = atomic_read(&n->refcnt);
+ RTA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci);
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+
+static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct neighbour *n;
+ int h, s_h;
+ int idx, s_idx;
+
+ s_h = cb->args[1];
+ s_idx = idx = cb->args[2];
+ for (h=0; h <= NEIGH_HASHMASK; h++) {
+ if (h < s_h) continue;
+ if (h > s_h)
+ s_idx = 0;
+ start_bh_atomic();
+ for (n = tbl->hash_buckets[h], idx = 0; n;
+ n = n->next, idx++) {
+ if (idx < s_idx)
+ continue;
+ if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, RTM_NEWNEIGH) <= 0) {
+ end_bh_atomic();
+ cb->args[1] = h;
+ cb->args[2] = idx;
+ return -1;
+ }
+ }
+ end_bh_atomic();
+ }
+
+ cb->args[1] = h;
+ cb->args[2] = idx;
+ return skb->len;
+}
+
+int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int t;
+ int s_t;
+ struct neigh_table *tbl;
+ int family = ((struct rtgenmsg*)NLMSG_DATA(cb->nlh))->rtgen_family;
+
+ s_t = cb->args[0];
+
+ for (tbl=neigh_tables, t=0; tbl; tbl = tbl->next, t++) {
+ if (t < s_t) continue;
+ if (family && tbl->family != family)
+ continue;
+ if (t > s_t)
+ memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
+ if (neigh_dump_table(tbl, skb, cb) < 0)
+ break;
+ }
+
+ cb->args[0] = t;
+
+ return skb->len;
+}
+
+#ifdef CONFIG_ARPD
+void neigh_app_ns(struct neighbour *n)
+{
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ int size = NLMSG_SPACE(sizeof(struct ndmsg)+256);
+
+ skb = alloc_skb(size, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH) < 0) {
+ kfree_skb(skb);
+ return;
+ }
+ nlh = (struct nlmsghdr*)skb->data;
+ nlh->nlmsg_flags = NLM_F_REQUEST;
+ NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH;
+ netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC);
+}
+
+static void neigh_app_notify(struct neighbour *n)
+{
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ int size = NLMSG_SPACE(sizeof(struct ndmsg)+256);
+
+ skb = alloc_skb(size, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH) < 0) {
+ kfree_skb(skb);
+ return;
+ }
+ nlh = (struct nlmsghdr*)skb->data;
+ NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH;
+ netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC);
+}
+
+
+
+#endif
+
+
+#endif
+
+#ifdef CONFIG_SYSCTL
+
+struct neigh_sysctl_table
+{
+ struct ctl_table_header *sysctl_header;
+ ctl_table neigh_vars[17];
+ ctl_table neigh_dev[2];
+ ctl_table neigh_neigh_dir[2];
+ ctl_table neigh_proto_dir[2];
+ ctl_table neigh_root_dir[2];
+} neigh_sysctl_template = {
+ NULL,
+ {{NET_NEIGH_MCAST_SOLICIT, "mcast_solicit",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_NEIGH_UCAST_SOLICIT, "ucast_solicit",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_NEIGH_APP_SOLICIT, "app_solicit",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_NEIGH_RETRANS_TIME, "retrans_time",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_NEIGH_REACHABLE_TIME, "base_reachable_time",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies},
+ {NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies},
+ {NET_NEIGH_GC_STALE_TIME, "gc_stale_time",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies},
+ {NET_NEIGH_UNRES_QLEN, "unres_qlen",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_NEIGH_PROXY_QLEN, "proxy_qlen",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_NEIGH_ANYCAST_DELAY, "anycast_delay",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_NEIGH_PROXY_DELAY, "proxy_delay",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_NEIGH_LOCKTIME, "locktime",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_NEIGH_GC_INTERVAL, "gc_interval",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies},
+ {NET_NEIGH_GC_THRESH1, "gc_thresh1",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_NEIGH_GC_THRESH2, "gc_thresh2",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_NEIGH_GC_THRESH3, "gc_thresh3",
+ NULL, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {0}},
+
+ {{NET_PROTO_CONF_DEFAULT, "default", NULL, 0, 0555, NULL},{0}},
+ {{0, "neigh", NULL, 0, 0555, NULL},{0}},
+ {{0, NULL, NULL, 0, 0555, NULL},{0}},
+ {{CTL_NET, "net", NULL, 0, 0555, NULL},{0}}
+};
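+
+/*
+ * The four directory tables above chain into one sysctl subtree per
+ * protocol and per device, so the parameters surface as, for example,
+ * /proc/sys/net/ipv4/neigh/default/gc_stale_time (and under
+ * .../neigh/<ifname>/ for per-device parameter sets).
+ */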
+
+int neigh_sysctl_register(struct device *dev, struct neigh_parms *p,
+ int p_id, int pdev_id, char *p_name)
+{
+ struct neigh_sysctl_table *t;
+
+ t = kmalloc(sizeof(*t), GFP_KERNEL);
+ if (t == NULL)
+ return -ENOBUFS;
+ memcpy(t, &neigh_sysctl_template, sizeof(*t));
+ t->neigh_vars[0].data = &p->mcast_probes;
+ t->neigh_vars[1].data = &p->ucast_probes;
+ t->neigh_vars[2].data = &p->app_probes;
+ t->neigh_vars[3].data = &p->retrans_time;
+ t->neigh_vars[4].data = &p->base_reachable_time;
+ t->neigh_vars[5].data = &p->delay_probe_time;
+ t->neigh_vars[6].data = &p->gc_staletime;
+ t->neigh_vars[7].data = &p->queue_len;
+ t->neigh_vars[8].data = &p->proxy_qlen;
+ t->neigh_vars[9].data = &p->anycast_delay;
+ t->neigh_vars[10].data = &p->proxy_delay;
+ t->neigh_vars[11].data = &p->locktime;
+ if (dev) {
+ t->neigh_dev[0].procname = dev->name;
+ t->neigh_dev[0].ctl_name = dev->ifindex;
+ memset(&t->neigh_vars[12], 0, sizeof(ctl_table));
+ } else {
+ t->neigh_vars[12].data = (int*)(p+1);
+ t->neigh_vars[13].data = (int*)(p+1) + 1;
+ t->neigh_vars[14].data = (int*)(p+1) + 2;
+ t->neigh_vars[15].data = (int*)(p+1) + 3;
+ }
+ t->neigh_neigh_dir[0].ctl_name = pdev_id;
+
+ t->neigh_proto_dir[0].procname = p_name;
+ t->neigh_proto_dir[0].ctl_name = p_id;
+
+ t->neigh_dev[0].child = t->neigh_vars;
+ t->neigh_neigh_dir[0].child = t->neigh_dev;
+ t->neigh_proto_dir[0].child = t->neigh_neigh_dir;
+ t->neigh_root_dir[0].child = t->neigh_proto_dir;
+
+ t->sysctl_header = register_sysctl_table(t->neigh_root_dir, 0);
+ if (t->sysctl_header == NULL) {
+ kfree(t);
+ return -ENOBUFS;
+ }
+ p->sysctl_table = t;
+ return 0;
+}
+
+void neigh_sysctl_unregister(struct neigh_parms *p)
+{
+ if (p->sysctl_table) {
+ struct neigh_sysctl_table *t = p->sysctl_table;
+ p->sysctl_table = NULL;
+ unregister_sysctl_table(t->sysctl_header);
+ kfree(t);
+ }
+}
+
+#endif /* CONFIG_SYSCTL */
diff --git a/pfinet/linux-src/net/core/profile.c b/pfinet/linux-src/net/core/profile.c
new file mode 100644
index 00000000..fc7464b7
--- /dev/null
+++ b/pfinet/linux-src/net/core/profile.c
@@ -0,0 +1,305 @@
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/inet.h>
+#include <net/checksum.h>
+
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+#include <net/profile.h>
+
+#ifdef CONFIG_NET_PROFILE
+
+atomic_t net_profile_active;
+struct timeval net_profile_adjust;
+
+NET_PROFILE_DEFINE(total);
+
+struct net_profile_slot *net_profile_chain = &net_prof_total;
+
+#ifdef __alpha__
+__u32 alpha_lo;
+long alpha_hi;
+
+static void alpha_tick(unsigned long);
+
+static struct timer_list alpha_timer =
+ { NULL, NULL, 0, 0L, alpha_tick };
+
+void alpha_tick(unsigned long dummy)
+{
+ struct timeval dummy_stamp;
+ net_profile_stamp(&dummy_stamp);
+ alpha_timer.expires = jiffies + 4*HZ;
+ add_timer(&alpha_timer);
+}
+
+#endif
+
+void net_profile_irq_adjust(struct timeval *entered, struct timeval* leaved)
+{
+ struct net_profile_slot *s;
+
+ net_profile_sub(entered, leaved);
+ for (s = net_profile_chain; s; s = s->next) {
+ if (s->active)
+ net_profile_add(leaved, &s->irq);
+ }
+}
+
+
+#ifdef CONFIG_PROC_FS
+static int profile_read_proc(char *buffer, char **start, off_t offset,
+ int length, int *eof, void *data)
+{
+ off_t pos=0;
+ off_t begin=0;
+ int len=0;
+ struct net_profile_slot *s;
+
+ len+= sprintf(buffer, "Slot Hits Hi Lo OnIrqHi OnIrqLo Ufl\n");
+
+ if (offset == 0) {
+ cli();
+ net_prof_total.active = 1;
+ atomic_inc(&net_profile_active);
+ NET_PROFILE_LEAVE(total);
+ sti();
+ }
+ for (s = net_profile_chain; s; s = s->next) {
+ struct net_profile_slot tmp;
+
+ cli();
+ tmp = *s;
+
+ /* Wrong, but pretty close to truth */
+
+ s->accumulator.tv_sec = 0;
+ s->accumulator.tv_usec = 0;
+ s->irq.tv_sec = 0;
+ s->irq.tv_usec = 0;
+ s->hits = 0;
+ s->underflow = 0;
+ /* Repair the active count; this can only happen if the code has a bug */
+ if (s->active) {
+ s->active = 0;
+ atomic_dec(&net_profile_active);
+ }
+ sti();
+
+ net_profile_sub(&tmp.irq, &tmp.accumulator);
+
+ len += sprintf(buffer+len,"%-15s %-10d %-10ld %-10lu %-10lu %-10lu %d/%d",
+ tmp.id,
+ tmp.hits,
+ tmp.accumulator.tv_sec,
+ tmp.accumulator.tv_usec,
+ tmp.irq.tv_sec,
+ tmp.irq.tv_usec,
+ tmp.underflow, tmp.active);
+
+ buffer[len++]='\n';
+
+ pos=begin+len;
+ if(pos<offset) {
+ len=0;
+ begin=pos;
+ }
+ if(pos>offset+length)
+ goto done;
+ }
+ *eof = 1;
+
+done:
+ *start=buffer+(offset-begin);
+ len-=(offset-begin);
+ if(len>length)
+ len=length;
+ if (len < 0) {
+ len = 0;
+ printk(KERN_CRIT "Yep, guys... our template for proc_*_read is crappy :-)\n");
+ }
+ if (offset == 0) {
+ cli();
+ net_prof_total.active = 0;
+ net_prof_total.hits = 0;
+ net_profile_stamp(&net_prof_total.entered);
+ sti();
+ }
+ return len;
+}
+#endif
+
+struct iphdr whitehole_iph;
+int whitehole_count;
+
+static int whitehole_xmit(struct sk_buff *skb, struct device *dev)
+{
+ struct net_device_stats *stats;
+
+ /* Update the counters before freeing the skb: skb->len must not be
+ read after dev_kfree_skb() has released the buffer. */
+ stats = (struct net_device_stats *)dev->priv;
+ stats->tx_packets++;
+ stats->tx_bytes += skb->len;
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+static void whitehole_inject(unsigned long);
+int whitehole_init(struct device *dev);
+
+static struct timer_list whitehole_timer =
+ { NULL, NULL, 0, 0L, whitehole_inject };
+
+static struct device whitehole_dev = {
+ "whitehole", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, whitehole_init, };
+
+static int whitehole_open(struct device *dev)
+{
+ whitehole_count = 100000;
+ whitehole_timer.expires = jiffies + 5*HZ;
+ add_timer(&whitehole_timer);
+ return 0;
+}
+
+static int whitehole_close(struct device *dev)
+{
+ del_timer(&whitehole_timer);
+ return 0;
+}
+
+static void whitehole_inject(unsigned long dummy)
+{
+ struct net_device_stats *stats = (struct net_device_stats *)whitehole_dev.priv;
+ extern int netdev_dropping;
+
+ do {
+ struct iphdr *iph;
+ struct sk_buff *skb = alloc_skb(128, GFP_ATOMIC);
+ if (!skb)
+ break;
+ skb_reserve(skb, 32);
+ iph = (struct iphdr*)skb_put(skb, sizeof(*iph));
+ skb->mac.raw = ((u8*)iph) - 14;
+ memcpy(iph, &whitehole_iph, sizeof(*iph));
+ skb->protocol = __constant_htons(ETH_P_IP);
+ skb->dev = &whitehole_dev;
+ skb->pkt_type = PACKET_HOST;
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len;
+ netif_rx(skb);
+ whitehole_count--;
+ } while (netdev_dropping == 0 && whitehole_count>0);
+ if (whitehole_count > 0) {
+ whitehole_timer.expires = jiffies + 1;
+ add_timer(&whitehole_timer);
+ }
+}
+
+static struct net_device_stats *whitehole_get_stats(struct device *dev)
+{
+ struct net_device_stats *stats = (struct net_device_stats *) dev->priv;
+ return stats;
+}
+
+__initfunc(int whitehole_init(struct device *dev))
+{
+ dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL);
+ if (dev->priv == NULL)
+ return -ENOBUFS;
+ memset(dev->priv, 0, sizeof(struct net_device_stats));
+ dev->get_stats = whitehole_get_stats;
+ dev->hard_start_xmit = whitehole_xmit;
+ dev->open = whitehole_open;
+ dev->stop = whitehole_close;
+ ether_setup(dev);
+ dev->tx_queue_len = 0;
+ dev->flags |= IFF_NOARP;
+ dev->flags &= ~(IFF_BROADCAST|IFF_MULTICAST);
+ dev->iflink = 0;
+ whitehole_iph.ihl = 5;
+ whitehole_iph.version = 4;
+ whitehole_iph.ttl = 2;
+ whitehole_iph.saddr = in_aton("193.233.7.21");
+ whitehole_iph.daddr = in_aton("193.233.7.10");
+ whitehole_iph.tot_len = htons(20);
+ whitehole_iph.check = ip_compute_csum((void *)&whitehole_iph, 20);
+ return 0;
+}
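+
+/*
+ * "whitehole" is a synthetic source device used for profiling: once
+ * opened, a timer fabricates minimal IP packets from the template
+ * initialized above and feeds them to netif_rx(), so the receive path
+ * can be timed without any real hardware.
+ */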
+
+int net_profile_register(struct net_profile_slot *slot)
+{
+ cli();
+ slot->next = net_profile_chain;
+ net_profile_chain = slot;
+ sti();
+ return 0;
+}
+
+int net_profile_unregister(struct net_profile_slot *slot)
+{
+ struct net_profile_slot **sp, *s;
+
+ for (sp = &net_profile_chain; (s = *sp) != NULL; sp = &s->next) {
+ if (s == slot) {
+ cli();
+ *sp = s->next;
+ sti();
+ return 0;
+ }
+ }
+ return -ESRCH;
+}
+
+
+__initfunc(int net_profile_init(void))
+{
+ int i;
+
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *ent;
+
+ ent = create_proc_entry("net/profile", 0, 0);
+ if (ent)
+ ent->read_proc = profile_read_proc;
+#endif
+
+ register_netdevice(&whitehole_dev);
+
+ printk("Evaluating net profiler cost ...");
+#if CPU == 586 || CPU == 686
+ if (!(boot_cpu_data.x86_capability & X86_FEATURE_TSC)) {
+ printk(KERN_ERR "Sorry, your CPU does not support TSC. Net profiler disabled.\n");
+ return -1;
+ }
+#endif
+ start_bh_atomic();
+#ifdef __alpha__
+ alpha_tick(0);
+#endif
+ for (i=0; i<1024; i++) {
+ NET_PROFILE_ENTER(total);
+ NET_PROFILE_LEAVE(total);
+ }
+ if (net_prof_total.accumulator.tv_sec) {
+ printk(" too high!\n");
+ } else {
+ net_profile_adjust.tv_usec = net_prof_total.accumulator.tv_usec>>10;
+ printk("%ld units\n", net_profile_adjust.tv_usec);
+ }
+ net_prof_total.hits = 0;
+ net_profile_stamp(&net_prof_total.entered);
+ end_bh_atomic();
+ return 0;
+}
+
+#endif
diff --git a/pfinet/linux-src/net/core/rtnetlink.c b/pfinet/linux-src/net/core/rtnetlink.c
new file mode 100644
index 00000000..7f89e54a
--- /dev/null
+++ b/pfinet/linux-src/net/core/rtnetlink.c
@@ -0,0 +1,512 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Routing netlink socket interface: protocol independent part.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Vitaly E. Lavrov RTA_OK arithmetics was wrong.
+ * Alexey Zhuravlev ifi_change does something useful
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/string.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+atomic_t rtnl_rlockct;
+struct wait_queue *rtnl_wait;
+
+
+void rtnl_lock()
+{
+ rtnl_shlock();
+ rtnl_exlock();
+}
+
+void rtnl_unlock()
+{
+ rtnl_exunlock();
+ rtnl_shunlock();
+}
+
+int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len)
+{
+ memset(tb, 0, sizeof(struct rtattr*)*maxattr);
+
+ while (RTA_OK(rta, len)) {
+ unsigned flavor = rta->rta_type;
+ if (flavor && flavor <= maxattr)
+ tb[flavor-1] = rta;
+ rta = RTA_NEXT(rta, len);
+ }
+ return 0;
+}
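+
+/*
+ * A typical doit handler consumes the parsed table like this (sketch
+ * only; MY_MAX and MY_ATTR are placeholder names):
+ *
+ * struct rtattr *tb[MY_MAX];
+ * rtattr_parse(tb, MY_MAX, RTM_RTA(r), len);
+ * if (tb[MY_ATTR-1])
+ * data = RTA_DATA(tb[MY_ATTR-1]);
+ *
+ * Note the off-by-one convention: attribute type N lands in tb[N-1].
+ */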
+
+#ifdef CONFIG_RTNETLINK
+struct sock *rtnl;
+
+unsigned long rtnl_wlockct;
+
+struct rtnetlink_link * rtnetlink_links[NPROTO];
+
+#define _S 1 /* superuser privileges required */
+#define _X 2 /* exclusive access to tables required */
+#define _G 4 /* GET request */
+
+static const int rtm_min[(RTM_MAX+1-RTM_BASE)/4] =
+{
+ NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+ NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
+ NLMSG_LENGTH(sizeof(struct rtmsg)),
+ NLMSG_LENGTH(sizeof(struct ndmsg)),
+ NLMSG_LENGTH(sizeof(struct rtmsg)),
+ NLMSG_LENGTH(sizeof(struct tcmsg)),
+ NLMSG_LENGTH(sizeof(struct tcmsg)),
+ NLMSG_LENGTH(sizeof(struct tcmsg))
+};
+
+static const int rta_max[(RTM_MAX+1-RTM_BASE)/4] =
+{
+ IFLA_MAX,
+ IFA_MAX,
+ RTA_MAX,
+ NDA_MAX,
+ RTA_MAX,
+ TCA_MAX,
+ TCA_MAX,
+ TCA_MAX
+};
+
+void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
+{
+ struct rtattr *rta;
+ int size = RTA_LENGTH(attrlen);
+
+ rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size));
+ rta->rta_type = attrtype;
+ rta->rta_len = size;
+ memcpy(RTA_DATA(rta), data, attrlen);
+}
+
+int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
+{
+ int err = 0;
+
+ NETLINK_CB(skb).dst_groups = group;
+ if (echo)
+ atomic_inc(&skb->users);
+ netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
+ if (echo)
+ err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
+ return err;
+}
+
+static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev,
+ int type, u32 pid, u32 seq, u32 change)
+{
+ struct ifinfomsg *r;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb->tail;
+
+ nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r));
+ if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+ r = NLMSG_DATA(nlh);
+ r->ifi_family = AF_UNSPEC;
+ r->ifi_type = dev->type;
+ r->ifi_index = dev->ifindex;
+ r->ifi_flags = dev->flags;
+ r->ifi_change = change;
+
+ RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name);
+ if (dev->addr_len) {
+ RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
+ RTA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast);
+ }
+ if (1) {
+ unsigned mtu = dev->mtu;
+ RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu);
+ }
+ if (dev->ifindex != dev->iflink)
+ RTA_PUT(skb, IFLA_LINK, sizeof(int), &dev->iflink);
+ if (dev->qdisc_sleeping)
+ RTA_PUT(skb, IFLA_QDISC,
+ strlen(dev->qdisc_sleeping->ops->id) + 1,
+ dev->qdisc_sleeping->ops->id);
+ if (dev->get_stats) {
+ struct net_device_stats *stats = dev->get_stats(dev);
+ if (stats)
+ RTA_PUT(skb, IFLA_STATS, sizeof(*stats), stats);
+ }
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx;
+ int s_idx = cb->args[0];
+ struct device *dev;
+
+ for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
+ if (idx < s_idx)
+ continue;
+ if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0) <= 0)
+ break;
+ }
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+int rtnetlink_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx;
+ int s_idx = cb->family;
+
+ if (s_idx == 0)
+ s_idx = 1;
+ for (idx=1; idx<NPROTO; idx++) {
+ int type = cb->nlh->nlmsg_type-RTM_BASE;
+ if (idx < s_idx || idx == PF_PACKET)
+ continue;
+ if (rtnetlink_links[idx] == NULL ||
+ rtnetlink_links[idx][type].dumpit == NULL)
+ continue;
+ if (idx > s_idx)
+ memset(&cb->args[0], 0, sizeof(cb->args));
+ if (rtnetlink_links[idx][type].dumpit(skb, cb) == 0)
+ continue;
+ if (skb_tailroom(skb) < 256)
+ break;
+ }
+ cb->family = idx;
+
+ return skb->len;
+}
+
+void rtmsg_ifinfo(int type, struct device *dev)
+{
+ struct sk_buff *skb;
+ int size = NLMSG_GOODSIZE;
+
+ skb = alloc_skb(size, GFP_KERNEL);
+ if (!skb)
+ return;
+
+ if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0, ~0U) < 0) {
+ kfree_skb(skb);
+ return;
+ }
+ NETLINK_CB(skb).dst_groups = RTMGRP_LINK;
+ netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_KERNEL);
+}
+
+static int rtnetlink_done(struct netlink_callback *cb)
+{
+ if (cap_raised(NETLINK_CB(cb->skb).eff_cap, CAP_NET_ADMIN) && cb->nlh->nlmsg_flags&NLM_F_ATOMIC)
+ rtnl_shunlock();
+ return 0;
+}
+
+/* Process one rtnetlink message. */
+
+extern __inline__ int
+rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
+{
+ struct rtnetlink_link *link;
+ struct rtnetlink_link *link_tab;
+ struct rtattr *rta[RTATTR_MAX];
+
+ int exclusive = 0;
+ int sz_idx, kind;
+ int min_len;
+ int family;
+ int type;
+ int err;
+
+ /* Only requests are handled by the kernel now */
+ if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
+ return 0;
+
+ type = nlh->nlmsg_type;
+
+ /* Control messages: ignore them */
+ if (type < RTM_BASE)
+ return 0;
+
+ /* Unknown message: reply with EINVAL */
+ if (type > RTM_MAX)
+ goto err_inval;
+
+ type -= RTM_BASE;
+
+ /* All the messages must have at least 1 byte length */
+ if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg)))
+ return 0;
+
+ family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family;
+ /* >= : rtnetlink_links[] has exactly NPROTO entries */
+ if (family >= NPROTO) {
+ *errp = -EAFNOSUPPORT;
+ return -1;
+ }
+
+ link_tab = rtnetlink_links[family];
+ if (link_tab == NULL)
+ link_tab = rtnetlink_links[PF_UNSPEC];
+ link = &link_tab[type];
+
+ sz_idx = type>>2;
+ kind = type&3;
+
+ if (kind != 2 && !cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) {
+ *errp = -EPERM;
+ return -1;
+ }
+
+ if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
+ u32 rlen;
+
+ if (link->dumpit == NULL)
+ link = &(rtnetlink_links[PF_UNSPEC][type]);
+
+ if (link->dumpit == NULL)
+ goto err_inval;
+
+ /* The super-user locks all the tables to get an atomic snapshot */
+ if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)
+ && nlh->nlmsg_flags&NLM_F_ATOMIC)
+ atomic_inc(&rtnl_rlockct);
+ if ((*errp = netlink_dump_start(rtnl, skb, nlh,
+ link->dumpit,
+ rtnetlink_done)) != 0) {
+ if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN) && nlh->nlmsg_flags&NLM_F_ATOMIC)
+ atomic_dec(&rtnl_rlockct);
+ return -1;
+ }
+ rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (rlen > skb->len)
+ rlen = skb->len;
+ skb_pull(skb, rlen);
+ return -1;
+ }
+
+ if (kind != 2) {
+ if (rtnl_exlock_nowait()) {
+ *errp = 0;
+ return -1;
+ }
+ exclusive = 1;
+ }
+
+ memset(&rta, 0, sizeof(rta));
+
+ min_len = rtm_min[sz_idx];
+ if (nlh->nlmsg_len < min_len)
+ goto err_inval;
+
+ if (nlh->nlmsg_len > min_len) {
+ int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
+ struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len);
+
+ while (RTA_OK(attr, attrlen)) {
+ unsigned flavor = attr->rta_type;
+ if (flavor) {
+ if (flavor > rta_max[sz_idx])
+ goto err_inval;
+ rta[flavor-1] = attr;
+ }
+ attr = RTA_NEXT(attr, attrlen);
+ }
+ }
+
+ if (link->doit == NULL)
+ link = &(rtnetlink_links[PF_UNSPEC][type]);
+ if (link->doit == NULL)
+ goto err_inval;
+ err = link->doit(skb, nlh, (void *)&rta);
+
+ if (exclusive)
+ rtnl_exunlock();
+ *errp = err;
+ return err;
+
+err_inval:
+ if (exclusive)
+ rtnl_exunlock();
+ *errp = -EINVAL;
+ return -1;
+}
+
+/*
+ * Process one packet of messages.
+ * Malformed skbs with wrong lengths of messages are discarded silently.
+ */
+
+extern __inline__ int rtnetlink_rcv_skb(struct sk_buff *skb)
+{
+ int err;
+ struct nlmsghdr * nlh;
+
+ while (skb->len >= NLMSG_SPACE(0)) {
+ u32 rlen;
+
+ nlh = (struct nlmsghdr *)skb->data;
+ if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+ return 0;
+ rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (rlen > skb->len)
+ rlen = skb->len;
+ if (rtnetlink_rcv_msg(skb, nlh, &err)) {
+ /* Not an error, but we must interrupt processing here.
+ * Note that in this case we do not pull the message
+ * from the skb; it will be processed later.
+ */
+ if (err == 0)
+ return -1;
+ netlink_ack(skb, nlh, err);
+ } else if (nlh->nlmsg_flags&NLM_F_ACK)
+ netlink_ack(skb, nlh, 0);
+ skb_pull(skb, rlen);
+ }
+
+ return 0;
+}
+
+/*
+ * rtnetlink input queue processing routine:
+ * - try to acquire the shared lock; if that fails, defer processing.
+ * - feed skbs to rtnetlink_rcv_skb until it refuses a message, which
+ * happens when a dump has been started and/or acquisition of the
+ * exclusive lock has failed.
+ */
+
+static void rtnetlink_rcv(struct sock *sk, int len)
+{
+ struct sk_buff *skb;
+
+ if (rtnl_shlock_nowait())
+ return;
+
+ while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
+ if (rtnetlink_rcv_skb(skb)) {
+ if (skb->len)
+ skb_queue_head(&sk->receive_queue, skb);
+ else
+ kfree_skb(skb);
+ break;
+ }
+ kfree_skb(skb);
+ }
+
+ rtnl_shunlock();
+}
+
+static struct rtnetlink_link link_rtnetlink_table[RTM_MAX-RTM_BASE+1] =
+{
+ { NULL, NULL, },
+ { NULL, NULL, },
+ { NULL, rtnetlink_dump_ifinfo, },
+ { NULL, NULL, },
+
+ { NULL, NULL, },
+ { NULL, NULL, },
+ { NULL, rtnetlink_dump_all, },
+ { NULL, NULL, },
+
+ { NULL, NULL, },
+ { NULL, NULL, },
+ { NULL, rtnetlink_dump_all, },
+ { NULL, NULL, },
+
+ { neigh_add, NULL, },
+ { neigh_delete, NULL, },
+ { NULL, neigh_dump_info, },
+ { NULL, NULL, },
+
+ { NULL, NULL, },
+ { NULL, NULL, },
+ { NULL, NULL, },
+ { NULL, NULL, },
+};
+
+
+static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct device *dev = ptr;
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ rtmsg_ifinfo(RTM_DELLINK, dev);
+ break;
+ default:
+ rtmsg_ifinfo(RTM_NEWLINK, dev);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+struct notifier_block rtnetlink_dev_notifier = {
+ rtnetlink_event,
+ NULL,
+ 0
+};
+
+
+__initfunc(void rtnetlink_init(void))
+{
+#ifdef RTNL_DEBUG
+ printk("Initializing RT netlink socket\n");
+#endif
+ rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv);
+ if (rtnl == NULL)
+ panic("rtnetlink_init: cannot initialize rtnetlink\n");
+ register_netdevice_notifier(&rtnetlink_dev_notifier);
+ rtnetlink_links[PF_UNSPEC] = link_rtnetlink_table;
+ rtnetlink_links[PF_PACKET] = link_rtnetlink_table;
+}
+
+
+
+#endif
diff --git a/pfinet/linux-src/net/core/scm.c b/pfinet/linux-src/net/core/scm.c
new file mode 100644
index 00000000..cdb5f3d0
--- /dev/null
+++ b/pfinet/linux-src/net/core/scm.c
@@ -0,0 +1,280 @@
+/* scm.c - Socket level control messages processing.
+ *
+ * Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Alignment and value checking mods by Craig Metz
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/file.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/inet.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/rarp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/scm.h>
+
+
+/*
+ * Only allow a user to send credentials that they could have set
+ * with setu(g)id.
+ */
+
+static __inline__ int scm_check_creds(struct ucred *creds)
+{
+ if ((creds->pid == current->pid || capable(CAP_SYS_ADMIN)) &&
+ ((creds->uid == current->uid || creds->uid == current->euid ||
+ creds->uid == current->suid) || capable(CAP_SETUID)) &&
+ ((creds->gid == current->gid || creds->gid == current->egid ||
+ creds->gid == current->sgid) || capable(CAP_SETGID))) {
+ return 0;
+ }
+ return -EPERM;
+}
+
+static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
+{
+ int *fdp = (int*)CMSG_DATA(cmsg);
+ struct scm_fp_list *fpl = *fplp;
+ struct file **fpp;
+ int i, num;
+
+ num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int);
+
+ if (num <= 0)
+ return 0;
+
+ if (num > SCM_MAX_FD)
+ return -EINVAL;
+
+ if (!fpl)
+ {
+ fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
+ if (!fpl)
+ return -ENOMEM;
+ *fplp = fpl;
+ fpl->count = 0;
+ }
+ fpp = &fpl->fp[fpl->count];
+
+ if (fpl->count + num > SCM_MAX_FD)
+ return -EINVAL;
+
+ /*
+ * Verify the descriptors and increment the usage count.
+ */
+
+ for (i=0; i< num; i++)
+ {
+ int fd = fdp[i];
+ struct file *file;
+
+ if (fd < 0 || !(file = fget(fd)))
+ return -EBADF;
+ *fpp++ = file;
+ fpl->count++;
+ }
+ return num;
+}
+
+void __scm_destroy(struct scm_cookie *scm)
+{
+ struct scm_fp_list *fpl = scm->fp;
+ int i;
+
+ if (fpl) {
+ scm->fp = NULL;
+ for (i=fpl->count-1; i>=0; i--)
+ fput(fpl->fp[i]);
+ kfree(fpl);
+ }
+}
+
+int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
+{
+ struct cmsghdr *cmsg;
+ int err;
+
+ for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg))
+ {
+ err = -EINVAL;
+
+ /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */
+ /* The first check was omitted in <= 2.2.5. The reasoning was
+ that the parser checks cmsg_len in any case, so the
+ additional check would duplicate work.
+ But if cmsg_level is not SOL_SOCKET, we would not check
+ for a too-short ancillary data object at all! Oops.
+ OK, let's add it...
+ */
+ if (cmsg->cmsg_len < sizeof(struct cmsghdr) ||
+ (unsigned long)(((char*)cmsg - (char*)msg->msg_control)
+ + cmsg->cmsg_len) > msg->msg_controllen)
+ goto error;
+
+ if (cmsg->cmsg_level != SOL_SOCKET)
+ continue;
+
+ switch (cmsg->cmsg_type)
+ {
+ case SCM_RIGHTS:
+ err=scm_fp_copy(cmsg, &p->fp);
+ if (err<0)
+ goto error;
+ break;
+ case SCM_CREDENTIALS:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
+ goto error;
+ memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred));
+ err = scm_check_creds(&p->creds);
+ if (err)
+ goto error;
+ break;
+ default:
+ goto error;
+ }
+ }
+
+ if (p->fp && !p->fp->count)
+ {
+ kfree(p->fp);
+ p->fp = NULL;
+ }
+
+ err = -EINVAL;
+ if (msg->msg_flags & MSG_CTLFLAGS)
+ goto error;
+
+ return 0;
+
+error:
+ scm_destroy(p);
+ return err;
+}
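+
+/*
+ * Userspace counterpart sketch (illustrative only, never compiled
+ * here): passing one file descriptor over an AF_UNIX socket with
+ * SCM_RIGHTS, which __scm_send() above unpacks via scm_fp_copy().
+ */
+#if 0
+#include <string.h>
+#include <sys/socket.h>
+
+static int send_fd_example(int sock, int fd)
+{
+ struct msghdr msg;
+ struct iovec iov;
+ struct cmsghdr *cmsg;
+ char cbuf[CMSG_SPACE(sizeof(int))];
+ char dummy = 0;
+
+ memset(&msg, 0, sizeof(msg));
+ iov.iov_base = &dummy; /* at least one byte of real data */
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cbuf;
+ msg.msg_controllen = sizeof(cbuf);
+
+ cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
+
+ return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
+}
+#endif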
+
+int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
+{
+ struct cmsghdr *cm = (struct cmsghdr*)msg->msg_control;
+ struct cmsghdr cmhdr;
+ int cmlen = CMSG_LEN(len);
+ int err;
+
+ if (cm==NULL || msg->msg_controllen < sizeof(*cm)) {
+ msg->msg_flags |= MSG_CTRUNC;
+ return 0; /* XXX: return error? check spec. */
+ }
+ if (msg->msg_controllen < cmlen) {
+ msg->msg_flags |= MSG_CTRUNC;
+ cmlen = msg->msg_controllen;
+ }
+ cmhdr.cmsg_level = level;
+ cmhdr.cmsg_type = type;
+ cmhdr.cmsg_len = cmlen;
+
+ err = -EFAULT;
+ if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
+ goto out;
+ if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)))
+ goto out;
+ cmlen = CMSG_SPACE(len);
+ msg->msg_control += cmlen;
+ msg->msg_controllen -= cmlen;
+ err = 0;
+out:
+ return err;
+}
+
+void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct cmsghdr *cm = (struct cmsghdr*)msg->msg_control;
+
+ int fdmax = (msg->msg_controllen - sizeof(struct cmsghdr))/sizeof(int);
+ int fdnum = scm->fp->count;
+ struct file **fp = scm->fp->fp;
+ int *cmfptr;
+ int err = 0, i;
+
+ if (fdnum < fdmax)
+ fdmax = fdnum;
+
+ for (i=0, cmfptr=(int*)CMSG_DATA(cm); i<fdmax; i++, cmfptr++)
+ {
+ int new_fd;
+ err = get_unused_fd();
+ if (err < 0)
+ break;
+ new_fd = err;
+ err = put_user(new_fd, cmfptr);
+ if (err) {
+ put_unused_fd(new_fd);
+ break;
+ }
+ /* Bump the usage count and install the file. */
+ fp[i]->f_count++;
+ current->files->fd[new_fd] = fp[i];
+ }
+
+ if (i > 0)
+ {
+ int cmlen = CMSG_LEN(i*sizeof(int));
+ if (!err)
+ err = put_user(SOL_SOCKET, &cm->cmsg_level);
+ if (!err)
+ err = put_user(SCM_RIGHTS, &cm->cmsg_type);
+ if (!err)
+ err = put_user(cmlen, &cm->cmsg_len);
+ if (!err) {
+ cmlen = CMSG_SPACE(i*sizeof(int));
+ msg->msg_control += cmlen;
+ msg->msg_controllen -= cmlen;
+ }
+ }
+ if (i < fdnum)
+ msg->msg_flags |= MSG_CTRUNC;
+
+ /*
+ * All of the files that fit in the message have had their
+ * usage counts incremented, so we just free the list.
+ */
+ __scm_destroy(scm);
+}
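+
+/*
+ * Receiving-side sketch (illustrative only): scm_detach_fds() above is
+ * what fills in the SCM_RIGHTS control message that recvmsg() returns.
+ */
+#if 0
+static int recv_fd_example(int sock)
+{
+ struct msghdr msg;
+ struct iovec iov;
+ struct cmsghdr *cmsg;
+ char cbuf[CMSG_SPACE(sizeof(int))];
+ char dummy;
+ int fd = -1;
+
+ memset(&msg, 0, sizeof(msg));
+ iov.iov_base = &dummy;
+ iov.iov_len = 1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cbuf;
+ msg.msg_controllen = sizeof(cbuf);
+
+ if (recvmsg(sock, &msg, 0) < 0)
+ return -1;
+ for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
+ if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS)
+ memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
+ /* MSG_CTRUNC in msg.msg_flags means some descriptors did not fit. */
+ return fd;
+}
+#endif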
+
+struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
+{
+ struct scm_fp_list *new_fpl;
+ int i;
+
+ if (!fpl)
+ return NULL;
+
+ new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL);
+ if (new_fpl) {
+ memcpy(new_fpl, fpl, sizeof(*fpl));
+
+ for (i=fpl->count-1; i>=0; i--)
+ fpl->fp[i]->f_count++;
+ }
+ return new_fpl;
+}
diff --git a/pfinet/linux-src/net/core/skbuff.c b/pfinet/linux-src/net/core/skbuff.c
new file mode 100644
index 00000000..b7636437
--- /dev/null
+++ b/pfinet/linux-src/net/core/skbuff.c
@@ -0,0 +1,385 @@
+/*
+ * Routines having to do with the 'struct sk_buff' memory handlers.
+ *
+ * Authors: Alan Cox <iiitac@pyr.swan.ac.uk>
+ * Florian La Roche <rzsfl@rz.uni-sb.de>
+ *
+ * Version: $Id: skbuff.c,v 1.55 1999/02/23 08:12:27 davem Exp $
+ *
+ * Fixes:
+ * Alan Cox : Fixed the worst of the load balancer bugs.
+ * Dave Platt : Interrupt stacking fix.
+ * Richard Kooijman : Timestamp fixes.
+ * Alan Cox : Changed buffer format.
+ * Alan Cox : destructor hook for AF_UNIX etc.
+ * Linus Torvalds : Better skb_clone.
+ * Alan Cox : Added skb_copy.
+ * Alan Cox : Added all the changed routines Linus
+ * only put in the headers
+ * Ray VanTassle : Fixed --skb->lock in free
+ * Alan Cox : skb_copy copy arp field
+ * Andi Kleen : slabified it.
+ *
+ * NOTE:
+ * The __skb_ routines should be called with interrupts
+ * disabled, or you better be *real* sure that the operation is atomic
+ * with respect to whatever list is being frobbed (e.g. via lock_sock()
+ * or via disabling bottom half handlers, etc).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * The functions in this file will not compile correctly with gcc 2.4.x
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/malloc.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/sock.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+/*
+ * Skb list spinlock
+ */
+spinlock_t skb_queue_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Resource tracking variables
+ */
+
+static atomic_t net_skbcount = ATOMIC_INIT(0);
+static atomic_t net_allocs = ATOMIC_INIT(0);
+static atomic_t net_fails = ATOMIC_INIT(0);
+
+extern atomic_t ip_frag_mem;
+
+static kmem_cache_t *skbuff_head_cache;
+
+/*
+ * Keep out-of-line to prevent kernel bloat.
+ * __builtin_return_address is not used because it is not always
+ * reliable.
+ */
+
+void skb_over_panic(struct sk_buff *skb, int sz, void *here)
+{
+ panic("skput:over: %p:%d put:%d dev:%s",
+ here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>");
+}
+
+void skb_under_panic(struct sk_buff *skb, int sz, void *here)
+{
+ panic("skput:under: %p:%d put:%d dev:%s",
+ here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>");
+}
+
+void show_net_buffers(void)
+{
+ printk("Networking buffers in use : %u\n",
+ atomic_read(&net_skbcount));
+ printk("Total network buffer allocations : %u\n",
+ atomic_read(&net_allocs));
+ printk("Total failed network buffer allocs : %u\n",
+ atomic_read(&net_fails));
+#ifdef CONFIG_INET
+ printk("IP fragment buffer size : %u\n",
+ atomic_read(&ip_frag_mem));
+#endif
+}
+
+/* Allocate a new skbuff. We do this ourselves so we can fill in a few
+ * 'private' fields and also do memory statistics to find all the
+ * [BEEP] leaks.
+ *
+ */
+
+struct sk_buff *alloc_skb(unsigned int size,int gfp_mask)
+{
+ struct sk_buff *skb;
+ u8 *data;
+
+ if (in_interrupt() && (gfp_mask & __GFP_WAIT)) {
+ static int count = 0;
+ if (++count < 5) {
+ printk(KERN_ERR "alloc_skb called nonatomically "
+ "from interrupt %p\n", __builtin_return_address(0));
+ }
+ gfp_mask &= ~__GFP_WAIT;
+ }
+
+ /* Get the HEAD */
+ skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+ if (skb == NULL)
+ goto nohead;
+
+ /* Get the DATA. Size must match skb_add_mtu(). */
+ size = ((size + 15) & ~15);
+ data = kmalloc(size + sizeof(atomic_t), gfp_mask);
+ if (data == NULL)
+ goto nodata;
+
+ /* Note that this counter is useless now - you can just look in the
+ * skbuff_head entry in /proc/slabinfo. We keep it only for emergency
+ * cases.
+ */
+ atomic_inc(&net_allocs);
+
+ skb->truesize = size;
+
+ atomic_inc(&net_skbcount);
+
+ /* Load the data pointers. */
+ skb->head = data;
+ skb->data = data;
+ skb->tail = data;
+ skb->end = data + size;
+
+ /* Set up other state */
+ skb->len = 0;
+ skb->is_clone = 0;
+ skb->cloned = 0;
+
+ atomic_set(&skb->users, 1);
+ atomic_set(skb_datarefp(skb), 1);
+ return skb;
+
+nodata:
+ kmem_cache_free(skbuff_head_cache, skb);
+nohead:
+ atomic_inc(&net_fails);
+ return NULL;
+}
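+
+/*
+ * Buffer layout after alloc_skb(): head == data == tail, end lies
+ * size bytes further on, and the clone reference count sits just past
+ * 'end' (hence the kmalloc of size + sizeof(atomic_t)):
+ *
+ * head = data = tail end
+ * |------------------ size -----------------|[atomic_t]
+ *
+ * skb_reserve() opens headroom by advancing data and tail together;
+ * skb_put() grows the used area by moving tail toward end.
+ */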
+
+
+/*
+ * Slab constructor for a skb head.
+ */
+static inline void skb_headerinit(void *p, kmem_cache_t *cache,
+ unsigned long flags)
+{
+ struct sk_buff *skb = p;
+
+ skb->destructor = NULL;
+ skb->pkt_type = PACKET_HOST; /* Default type */
+ skb->pkt_bridged = 0; /* Not bridged */
+ skb->prev = skb->next = NULL;
+ skb->list = NULL;
+ skb->sk = NULL;
+ skb->stamp.tv_sec=0; /* No idea about time */
+ skb->ip_summed = 0;
+ skb->security = 0; /* By default packets are insecure */
+ skb->dst = NULL;
+#ifdef CONFIG_IP_FIREWALL
+ skb->fwmark = 0;
+#endif
+ memset(skb->cb, 0, sizeof(skb->cb));
+ skb->priority = 0;
+}
+
+/*
+ * Free an skbuff by memory without cleaning the state.
+ */
+void kfree_skbmem(struct sk_buff *skb)
+{
+ if (!skb->cloned || atomic_dec_and_test(skb_datarefp(skb)))
+ kfree(skb->head);
+
+ kmem_cache_free(skbuff_head_cache, skb);
+ atomic_dec(&net_skbcount);
+}
+
+/*
+ * Free an sk_buff. Release anything attached to the buffer. Clean the state.
+ */
+
+void __kfree_skb(struct sk_buff *skb)
+{
+ if (skb->list)
+ printk(KERN_WARNING "Warning: kfree_skb passed an skb still "
+ "on a list (from %p).\n", __builtin_return_address(0));
+
+ dst_release(skb->dst);
+ if(skb->destructor)
+ skb->destructor(skb);
+ skb_headerinit(skb, NULL, 0); /* clean state */
+ kfree_skbmem(skb);
+}
+
+/*
+ * Duplicate an sk_buff. The new one is not owned by a socket.
+ */
+
+struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
+{
+ struct sk_buff *n;
+
+ n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+ if (!n)
+ return NULL;
+
+ memcpy(n, skb, sizeof(*n));
+ atomic_inc(skb_datarefp(skb));
+ skb->cloned = 1;
+
+ atomic_inc(&net_allocs);
+ atomic_inc(&net_skbcount);
+ dst_clone(n->dst);
+ n->cloned = 1;
+ n->next = n->prev = NULL;
+ n->list = NULL;
+ n->sk = NULL;
+ n->is_clone = 1;
+ atomic_set(&n->users, 1);
+ n->destructor = NULL;
+ return n;
+}
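+
+/*
+ * Note the asymmetry: skb_clone() duplicates only the header, and the
+ * shared data area's reference count (the atomic_t past 'end') is
+ * bumped so kfree_skbmem() frees it exactly once. A caller that needs
+ * to modify the payload must use skb_copy() below instead.
+ */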
+
+/*
+ * This is slower, and copies the whole data area
+ */
+
+struct sk_buff *skb_copy(struct sk_buff *skb, int gfp_mask)
+{
+ struct sk_buff *n;
+ unsigned long offset;
+
+ /*
+ * Allocate the copy buffer
+ */
+
+ n=alloc_skb(skb->end - skb->head, gfp_mask);
+ if(n==NULL)
+ return NULL;
+
+ /*
+ * Shift between the two data areas in bytes
+ */
+
+ offset=n->head-skb->head;
+
+ /* Set the data pointer */
+ skb_reserve(n,skb->data-skb->head);
+ /* Set the tail pointer and length */
+ skb_put(n,skb->len);
+ /* Copy the bytes */
+ memcpy(n->head,skb->head,skb->end-skb->head);
+ n->csum = skb->csum;
+ n->list=NULL;
+ n->sk=NULL;
+ n->dev=skb->dev;
+ n->priority=skb->priority;
+ n->protocol=skb->protocol;
+ n->dst=dst_clone(skb->dst);
+ n->h.raw=skb->h.raw+offset;
+ n->nh.raw=skb->nh.raw+offset;
+ n->mac.raw=skb->mac.raw+offset;
+ memcpy(n->cb, skb->cb, sizeof(skb->cb));
+ n->used=skb->used;
+ n->is_clone=0;
+ atomic_set(&n->users, 1);
+ n->pkt_type=skb->pkt_type;
+ n->stamp=skb->stamp;
+ n->destructor = NULL;
+ n->security=skb->security;
+#ifdef CONFIG_IP_FIREWALL
+ n->fwmark = skb->fwmark;
+#endif
+ return n;
+}
+
+struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, int newheadroom)
+{
+ struct sk_buff *n;
+ unsigned long offset;
+ int headroom = skb_headroom(skb);
+
+ /*
+ * Allocate the copy buffer
+ */
+
+ n=alloc_skb(skb->truesize+newheadroom-headroom, GFP_ATOMIC);
+ if(n==NULL)
+ return NULL;
+
+ skb_reserve(n,newheadroom);
+
+ /*
+ * Shift between the two data areas in bytes
+ */
+
+ offset=n->data-skb->data;
+
+ /* Set the tail pointer and length */
+ skb_put(n,skb->len);
+ /* Copy the bytes */
+ memcpy(n->data,skb->data,skb->len);
+ n->list=NULL;
+ n->sk=NULL;
+ n->priority=skb->priority;
+ n->protocol=skb->protocol;
+ n->dev=skb->dev;
+ n->dst=dst_clone(skb->dst);
+ n->h.raw=skb->h.raw+offset;
+ n->nh.raw=skb->nh.raw+offset;
+ n->mac.raw=skb->mac.raw+offset;
+ memcpy(n->cb, skb->cb, sizeof(skb->cb));
+ n->used=skb->used;
+ n->is_clone=0;
+ atomic_set(&n->users, 1);
+ n->pkt_type=skb->pkt_type;
+ n->stamp=skb->stamp;
+ n->destructor = NULL;
+ n->security=skb->security;
+#ifdef CONFIG_IP_FIREWALL
+ n->fwmark = skb->fwmark;
+#endif
+
+ return n;
+}
+
+#if 0
+/*
+ * Tune the memory allocator for a new MTU size.
+ */
+void skb_add_mtu(int mtu)
+{
+ /* Must match allocation in alloc_skb */
+ mtu = ((mtu + 15) & ~15) + sizeof(atomic_t);
+
+ kmem_add_cache_size(mtu);
+}
+#endif
+
+void __init skb_init(void)
+{
+ skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
+ sizeof(struct sk_buff),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ skb_headerinit, NULL);
+ if (!skbuff_head_cache)
+ panic("cannot create skbuff cache");
+}
diff --git a/pfinet/linux-src/net/core/sock.c b/pfinet/linux-src/net/core/sock.c
new file mode 100644
index 00000000..e0eb41a0
--- /dev/null
+++ b/pfinet/linux-src/net/core/sock.c
@@ -0,0 +1,1051 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic socket support routines. Memory allocators, socket lock/release
+ * handler for protocols to use and generic option handler.
+ *
+ *
+ * Version: $Id: sock.c,v 1.80 1999/05/08 03:04:34 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Alan Cox, <A.Cox@swansea.ac.uk>
+ *
+ * Fixes:
+ * Alan Cox : Numerous verify_area() problems
+ * Alan Cox : Connecting on a connecting socket
+ * now returns an error for tcp.
+ * Alan Cox : sock->protocol is set correctly.
+ * and is not sometimes left as 0.
+ * Alan Cox : connect handles icmp errors on a
+ * connect properly. Unfortunately there
+ * is a restart syscall nasty there. I
+ * can't match BSD without hacking the C
+ * library. Ideas urgently sought!
+ * Alan Cox : Disallow bind() to addresses that are
+ * not ours - especially broadcast ones!!
+ * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
+ * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
+ * instead they leave that for the DESTROY timer.
+ * Alan Cox : Clean up error flag in accept
+ * Alan Cox : TCP ack handling is buggy, the DESTROY timer
+ * was buggy. Put a remove_sock() in the handler
+ * for memory when we hit 0. Also altered the timer
+ * code. The ACK stuff can wait and needs major
+ * TCP layer surgery.
+ * Alan Cox : Fixed TCP ack bug, removed remove sock
+ * and fixed timer/inet_bh race.
+ * Alan Cox : Added zapped flag for TCP
+ * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
+ * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
+ * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
+ * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
+ * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
+ * Rick Sladkey : Relaxed UDP rules for matching packets.
+ * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
+ * Pauline Middelink : identd support
+ * Alan Cox : Fixed connect() taking signals I think.
+ * Alan Cox : SO_LINGER supported
+ * Alan Cox : Error reporting fixes
+ * Anonymous : inet_create tidied up (sk->reuse setting)
+ * Alan Cox : inet sockets don't set sk->type!
+ * Alan Cox : Split socket option code
+ * Alan Cox : Callbacks
+ * Alan Cox : Nagle flag for Charles & Johannes stuff
+ * Alex : Removed restriction on inet fioctl
+ * Alan Cox : Splitting INET from NET core
+ * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
+ * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
+ * Alan Cox : Split IP from generic code
+ * Alan Cox : New kfree_skbmem()
+ * Alan Cox : Make SO_DEBUG superuser only.
+ * Alan Cox : Allow anyone to clear SO_DEBUG
+ * (compatibility fix)
+ * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
+ * Alan Cox : Allocator for a socket is settable.
+ * Alan Cox : SO_ERROR includes soft errors.
+ * Alan Cox : Allow NULL arguments on some SO_ opts
+ * Alan Cox : Generic socket allocation to make hooks
+ * easier (suggested by Craig Metz).
+ * Michael Pall : SO_ERROR returns positive errno again
+ * Steve Whitehouse: Added default destructor to free
+ * protocol private data.
+ * Steve Whitehouse: Added various other default routines
+ * common to several socket families.
+ * Chris Evans : Call suser() check last on F_SETOWN
+ * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
+ * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
+ * Andi Kleen : Fix write_space callback
+ *
+ * To Fix:
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/arp.h>
+#include <net/rarp.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/raw.h>
+#include <net/icmp.h>
+#include <linux/ipsec.h>
+
+#ifdef CONFIG_FILTER
+#include <linux/filter.h>
+#endif
+
+#define min(a,b) ((a)<(b)?(a):(b))
+
+/* Run time adjustable parameters. */
+__u32 sysctl_wmem_max = SK_WMEM_MAX;
+__u32 sysctl_rmem_max = SK_RMEM_MAX;
+__u32 sysctl_wmem_default = SK_WMEM_MAX;
+__u32 sysctl_rmem_default = SK_RMEM_MAX;
+
+/* Maximal space eaten by iovec or ancillary data plus some space */
+int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
+
+/*
+ * This is meant for all protocols to use and covers goings on
+ * at the socket level. Everything here is generic.
+ */
+
+int sock_setsockopt(struct socket *sock, int level, int optname,
+ char *optval, int optlen)
+{
+ struct sock *sk=sock->sk;
+#ifdef CONFIG_FILTER
+ struct sk_filter *filter;
+#endif
+ int val;
+ int valbool;
+ int err;
+ struct linger ling;
+ int ret = 0;
+
+ /*
+ * Options without arguments
+ */
+
+#ifdef SO_DONTLINGER /* Compatibility item... */
+ switch(optname)
+ {
+ case SO_DONTLINGER:
+ sk->linger=0;
+ return 0;
+ }
+#endif
+
+ if(optlen<sizeof(int))
+ return(-EINVAL);
+
+ err = get_user(val, (int *)optval);
+ if (err)
+ return err;
+
+ valbool = val?1:0;
+
+ switch(optname)
+ {
+ case SO_DEBUG:
+ if(val && !capable(CAP_NET_ADMIN))
+ {
+ ret = -EACCES;
+ }
+ else
+ sk->debug=valbool;
+ break;
+ case SO_REUSEADDR:
+ sk->reuse = valbool;
+ break;
+ case SO_TYPE:
+ case SO_ERROR:
+ ret = -ENOPROTOOPT;
+ break;
+ case SO_DONTROUTE:
+ sk->localroute=valbool;
+ break;
+ case SO_BROADCAST:
+ sk->broadcast=valbool;
+ break;
+ case SO_SNDBUF:
+ /* Don't error on this; BSD doesn't, and if you think
+ about it, this is right. Otherwise apps have to
+ play 'guess the biggest size' games. RCVBUF/SNDBUF
+ are treated in BSD as hints */
+
+ if (val > sysctl_wmem_max)
+ val = sysctl_wmem_max;
+
+ sk->sndbuf = max(val*2,2048);
+
+ /*
+ * Wake up sending tasks if we
+ * upped the value.
+ */
+ sk->write_space(sk);
+ break;
+
+ case SO_RCVBUF:
+ /* Don't error on this; BSD doesn't, and if you think
+ about it, this is right. Otherwise apps have to
+ play 'guess the biggest size' games. RCVBUF/SNDBUF
+ are treated in BSD as hints */
+
+ if (val > sysctl_rmem_max)
+ val = sysctl_rmem_max;
+
+ /* FIXME: is this lower bound the right one? */
+ sk->rcvbuf = max(val*2,256);
+ break;
+
+ case SO_KEEPALIVE:
+#ifdef CONFIG_INET
+ if (sk->protocol == IPPROTO_TCP)
+ {
+ tcp_set_keepalive(sk, valbool);
+ }
+#endif
+ sk->keepopen = valbool;
+ break;
+
+ case SO_OOBINLINE:
+ sk->urginline = valbool;
+ break;
+
+ case SO_NO_CHECK:
+ sk->no_check = valbool;
+ break;
+
+ case SO_PRIORITY:
+ if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
+ sk->priority = val;
+ else
+ return(-EPERM);
+ break;
+
+ case SO_LINGER:
+ if(optlen<sizeof(ling))
+ return -EINVAL; /* 1003.1g */
+ err = copy_from_user(&ling,optval,sizeof(ling));
+ if (err)
+ {
+ ret = -EFAULT;
+ break;
+ }
+ if(ling.l_onoff==0)
+ sk->linger=0;
+ else
+ {
+ sk->lingertime=ling.l_linger;
+ sk->linger=1;
+ }
+ break;
+
+ case SO_BSDCOMPAT:
+ sk->bsdism = valbool;
+ break;
+
+ case SO_PASSCRED:
+ sock->passcred = valbool;
+ break;
+
+
+#ifdef CONFIG_NETDEVICES
+ case SO_BINDTODEVICE:
+ {
+ char devname[IFNAMSIZ];
+
+ /* Sorry... */
+ if (!capable(CAP_NET_RAW))
+ return -EPERM;
+
+ /* Bind this socket to a particular device like "eth0",
+ * as specified in the passed interface name. If the
+ * name is "" or the option length is zero the socket
+ * is not bound.
+ */
+
+ if (!valbool) {
+ sk->bound_dev_if = 0;
+ } else {
+ if (optlen > IFNAMSIZ)
+ optlen = IFNAMSIZ;
+ if (copy_from_user(devname, optval, optlen))
+ return -EFAULT;
+
+ /* Remove any cached route for this socket. */
+ lock_sock(sk);
+ dst_release(xchg(&sk->dst_cache, NULL));
+ release_sock(sk);
+
+ if (devname[0] == '\0') {
+ sk->bound_dev_if = 0;
+ } else {
+ struct device *dev = dev_get(devname);
+ if (!dev)
+ return -EINVAL;
+ sk->bound_dev_if = dev->ifindex;
+ }
+ return 0;
+ }
+ }
+#endif
+
+
+#ifdef CONFIG_FILTER
+ case SO_ATTACH_FILTER:
+ ret = -EINVAL;
+ if (optlen == sizeof(struct sock_fprog)) {
+ struct sock_fprog fprog;
+
+ ret = -EFAULT;
+ if (copy_from_user(&fprog, optval, sizeof(fprog)))
+ break;
+
+ ret = sk_attach_filter(&fprog, sk);
+ }
+ break;
+
+ case SO_DETACH_FILTER:
+ filter = sk->filter;
+ if(filter) {
+ sk->filter = NULL;
+ synchronize_bh();
+ sk_filter_release(sk, filter);
+ return 0;
+ }
+ return -ENOENT;
+#endif
+ /* We implement SO_SNDLOWAT etc. as not settable (1003.1g 5.3) */
+ default:
+ return(-ENOPROTOOPT);
+ }
+ return ret;
+}
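+
+/*
+ * Illustrative userspace check (values are assumptions): the kernel
+ * doubles SO_SNDBUF/SO_RCVBUF internally to cover bookkeeping
+ * overhead, so reading the option back returns about twice the
+ * requested size.
+ */
+#if 0
+#include <stdio.h>
+#include <sys/socket.h>
+
+static void sndbuf_example(int sock)
+{
+ int set = 16384, got = 0;
+ socklen_t len = sizeof(got);
+
+ setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &set, sizeof(set));
+ getsockopt(sock, SOL_SOCKET, SO_SNDBUF, &got, &len);
+ printf("asked for %d, kernel accounts %d\n", set, got); /* ~32768 */
+}
+#endif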
+
+
+int sock_getsockopt(struct socket *sock, int level, int optname,
+ char *optval, int *optlen)
+{
+ struct sock *sk = sock->sk;
+
+ union
+ {
+ int val;
+ struct linger ling;
+ struct timeval tm;
+ } v;
+
+ int lv=sizeof(int),len;
+
+ if(get_user(len,optlen))
+ return -EFAULT;
+
+ switch(optname)
+ {
+ case SO_DEBUG:
+ v.val = sk->debug;
+ break;
+
+ case SO_DONTROUTE:
+ v.val = sk->localroute;
+ break;
+
+ case SO_BROADCAST:
+ v.val= sk->broadcast;
+ break;
+
+ case SO_SNDBUF:
+ v.val=sk->sndbuf;
+ break;
+
+ case SO_RCVBUF:
+ v.val =sk->rcvbuf;
+ break;
+
+ case SO_REUSEADDR:
+ v.val = sk->reuse;
+ break;
+
+ case SO_KEEPALIVE:
+ v.val = sk->keepopen;
+ break;
+
+ case SO_TYPE:
+ v.val = sk->type;
+ break;
+
+ case SO_ERROR:
+ v.val = -sock_error(sk);
+ if(v.val==0)
+ v.val=xchg(&sk->err_soft,0);
+ break;
+
+ case SO_OOBINLINE:
+ v.val = sk->urginline;
+ break;
+
+ case SO_NO_CHECK:
+ v.val = sk->no_check;
+ break;
+
+ case SO_PRIORITY:
+ v.val = sk->priority;
+ break;
+
+ case SO_LINGER:
+ lv=sizeof(v.ling);
+ v.ling.l_onoff=sk->linger;
+ v.ling.l_linger=sk->lingertime;
+ break;
+
+ case SO_BSDCOMPAT:
+ v.val = sk->bsdism;
+ break;
+
+ case SO_RCVTIMEO:
+ case SO_SNDTIMEO:
+ lv=sizeof(struct timeval);
+ v.tm.tv_sec=0;
+ v.tm.tv_usec=0;
+ break;
+
+ case SO_RCVLOWAT:
+ case SO_SNDLOWAT:
+ v.val=1;
+ break;
+
+ case SO_PASSCRED:
+ v.val = sock->passcred;
+ break;
+
+ case SO_PEERCRED:
+ lv=sizeof(sk->peercred);
+ len=min(len, lv);
+ if(copy_to_user((void*)optval, &sk->peercred, len))
+ return -EFAULT;
+ goto lenout;
+
+ default:
+ return(-ENOPROTOOPT);
+ }
+ len=min(len,lv);
+ if(copy_to_user(optval,&v,len))
+ return -EFAULT;
+lenout:
+ if(put_user(len, optlen))
+ return -EFAULT;
+ return 0;
+}
+
+static kmem_cache_t *sk_cachep;
+
+/*
+ * All socket objects are allocated here. This is for future
+ * usage.
+ */
+
+struct sock *sk_alloc(int family, int priority, int zero_it)
+{
+ struct sock *sk = kmem_cache_alloc(sk_cachep, priority);
+
+ if(sk) {
+ if (zero_it)
+ memset(sk, 0, sizeof(struct sock));
+ sk->family = family;
+ }
+
+ return sk;
+}
+
+void sk_free(struct sock *sk)
+{
+#ifdef CONFIG_FILTER
+ struct sk_filter *filter;
+#endif
+ if (sk->destruct)
+ sk->destruct(sk);
+
+#ifdef CONFIG_FILTER
+ filter = sk->filter;
+ if (filter) {
+ sk_filter_release(sk, filter);
+ sk->filter = NULL;
+ }
+#endif
+
+ if (atomic_read(&sk->omem_alloc))
+ printk(KERN_DEBUG "sk_free: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc));
+
+ kmem_cache_free(sk_cachep, sk);
+}
+
+void __init sk_init(void)
+{
+ sk_cachep = kmem_cache_create("sock", sizeof(struct sock), 0,
+ SLAB_HWCACHE_ALIGN, 0, 0);
+
+}
+
+/*
+ * Simple resource managers for sockets.
+ */
+
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ */
+void sock_wfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ /* In case it might be waiting for more memory. */
+ atomic_sub(skb->truesize, &sk->wmem_alloc);
+ sk->write_space(sk);
+}
+
+/*
+ * Read buffer destructor automatically called from kfree_skb.
+ */
+void sock_rfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ atomic_sub(skb->truesize, &sk->rmem_alloc);
+}
+
+
+/*
+ * Allocate a skb from the socket's send buffer.
+ */
+struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority)
+{
+ if (force || atomic_read(&sk->wmem_alloc) < sk->sndbuf) {
+ struct sk_buff * skb = alloc_skb(size, priority);
+ if (skb) {
+ atomic_add(skb->truesize, &sk->wmem_alloc);
+ skb->destructor = sock_wfree;
+ skb->sk = sk;
+ return skb;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * Allocate a skb from the socket's receive buffer.
+ */
+struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority)
+{
+ if (force || atomic_read(&sk->rmem_alloc) < sk->rcvbuf) {
+ struct sk_buff *skb = alloc_skb(size, priority);
+ if (skb) {
+ atomic_add(skb->truesize, &sk->rmem_alloc);
+ skb->destructor = sock_rfree;
+ skb->sk = sk;
+ return skb;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * Allocate a memory block from the socket's option memory buffer.
+ */
+void *sock_kmalloc(struct sock *sk, int size, int priority)
+{
+ if (atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) {
+ void *mem;
+ /* First do the add, to avoid the race if kmalloc
+ * might sleep.
+ */
+ atomic_add(size, &sk->omem_alloc);
+ mem = kmalloc(size, priority);
+ if (mem)
+ return mem;
+ atomic_sub(size, &sk->omem_alloc);
+ }
+ return NULL;
+}
+
+/*
+ * Free an option memory block.
+ */
+void sock_kfree_s(struct sock *sk, void *mem, int size)
+{
+ kfree_s(mem, size);
+ atomic_sub(size, &sk->omem_alloc);
+}
+
+/* FIXME: this is insane. We are supposed to be controlling how much
+ * space we have for data bytes, not packet headers.
+ * This really points out that we need a better system for doing the
+ * receive buffer. -- erics
+ * WARNING: This is currently ONLY used in tcp. If you need it elsewhere
+ * this will probably not be what you want. Possibly these two routines
+ * should move over to the ipv4 directory.
+ */
+unsigned long sock_rspace(struct sock *sk)
+{
+ int amt = 0;
+
+ if (sk != NULL) {
+ /* This used to have some bizarre complications that
+ * to attempt to reserve some amount of space. This doesn't
+ * make sense, since the number returned here does not
+ * actually reflect allocated space, but rather the amount
+ * of space we committed to. We gamble that we won't
+ * run out of memory, and returning a smaller number does
+ * not change the gamble. If we lose the gamble tcp still
+ * works, it may just slow down for retransmissions.
+ */
+ amt = sk->rcvbuf - atomic_read(&sk->rmem_alloc);
+ if (amt < 0)
+ amt = 0;
+ }
+ return amt;
+}
+
+
+/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
+ I think these locks should be removed for datagram sockets.
+ */
+static void sock_wait_for_wmem(struct sock * sk)
+{
+ struct wait_queue wait = { current, NULL };
+
+ sk->socket->flags &= ~SO_NOSPACE;
+ add_wait_queue(sk->sleep, &wait);
+ for (;;) {
+ if (signal_pending(current))
+ break;
+ current->state = TASK_INTERRUPTIBLE;
+ if (atomic_read(&sk->wmem_alloc) < sk->sndbuf)
+ break;
+ if (sk->shutdown & SEND_SHUTDOWN)
+ break;
+ if (sk->err)
+ break;
+ schedule();
+ }
+ current->state = TASK_RUNNING;
+ remove_wait_queue(sk->sleep, &wait);
+}
+
+
+/*
+ * Generic send/receive buffer handlers
+ */
+
+struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
+ unsigned long fallback, int noblock, int *errcode)
+{
+ int err;
+ struct sk_buff *skb;
+
+ while (1) {
+ unsigned long try_size = size;
+
+ err = sock_error(sk);
+ if (err != 0)
+ goto failure;
+
+ /*
+ * We should send SIGPIPE in these cases according to
+ * 1003.1g draft 6.4. If we (the user) did a shutdown()
+ * call however we should not.
+ *
+ * Note: This routine isn't just used for datagrams,
+ * and in any case some datagram protocols have a
+ * notion of close down.
+ */
+
+ err = -EPIPE;
+ if (sk->shutdown&SEND_SHUTDOWN)
+ goto failure;
+
+ if (fallback) {
+ /* The buffer get won't block, or use the atomic queue.
+ * It does produce annoying no free page messages still.
+ */
+ skb = sock_wmalloc(sk, size, 0, GFP_BUFFER);
+ if (skb)
+ break;
+ try_size = fallback;
+ }
+ skb = sock_wmalloc(sk, try_size, 0, sk->allocation);
+ if (skb)
+ break;
+
+ /*
+ * This means we have too many buffers for this socket already.
+ */
+
+ sk->socket->flags |= SO_NOSPACE;
+ err = -EAGAIN;
+ if (noblock)
+ goto failure;
+ err = -ERESTARTSYS;
+ if (signal_pending(current))
+ goto failure;
+ sock_wait_for_wmem(sk);
+ }
+
+ return skb;
+
+failure:
+ *errcode = err;
+ return NULL;
+}
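+
+/* Illustrative sketch (not part of the original import): a datagram
+ * sendmsg() would typically call this as follows, letting it handle
+ * blocking, shutdown/SIGPIPE checks and sndbuf accounting. The header
+ * headroom used here is a placeholder.
+ *
+ *   struct sk_buff *skb;
+ *   int err;
+ *
+ *   skb = sock_alloc_send_skb(sk, len + dev->hard_header_len + 15, 0,
+ *                             msg->msg_flags & MSG_DONTWAIT, &err);
+ *   if (skb == NULL)
+ *       return err;
+ *   ...build headers, copy the payload, hand the skb to the device...
+ */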
+
+
+void __release_sock(struct sock *sk)
+{
+#ifdef CONFIG_INET
+ if (!sk->prot || !sk->backlog_rcv)
+ return;
+
+ /* See if we have any packets built up. */
+ start_bh_atomic();
+ while (!skb_queue_empty(&sk->back_log)) {
+ struct sk_buff * skb = sk->back_log.next;
+ __skb_unlink(skb, &sk->back_log);
+ sk->backlog_rcv(sk, skb);
+ }
+ end_bh_atomic();
+#endif
+}
+
+
+/*
+ * Generic socket manager library. Most of the simpler socket families
+ * use this to manage their socket lists. At some point we should
+ * hash these. By making this generic we get the lot hashed for free.
+ */
+
+void sklist_remove_socket(struct sock **list, struct sock *sk)
+{
+ struct sock *s;
+
+ start_bh_atomic();
+
+ s= *list;
+ if(s==sk)
+ {
+ *list = s->next;
+ end_bh_atomic();
+ return;
+ }
+ while(s && s->next)
+ {
+ if(s->next==sk)
+ {
+ s->next=sk->next;
+ break;
+ }
+ s=s->next;
+ }
+ end_bh_atomic();
+}
+
+void sklist_insert_socket(struct sock **list, struct sock *sk)
+{
+ start_bh_atomic();
+ sk->next= *list;
+ *list=sk;
+ end_bh_atomic();
+}
+
+/*
+ * This is only called from user mode. Thus it protects itself against
+ * interrupt users but doesn't worry about being called during work.
+ * Once it is removed from the queue no interrupt or bottom half will
+ * touch it and we are (fairly 8-) ) safe.
+ */
+
+void sklist_destroy_socket(struct sock **list, struct sock *sk);
+
+/*
+ * Handler for deferred kills.
+ */
+
+static void sklist_destroy_timer(unsigned long data)
+{
+ struct sock *sk=(struct sock *)data;
+ sklist_destroy_socket(NULL,sk);
+}
+
+/*
+ * Destroy a socket. We pass NULL for a list if we know the
+ * socket is not on a list.
+ */
+
+void sklist_destroy_socket(struct sock **list,struct sock *sk)
+{
+ struct sk_buff *skb;
+ if(list)
+ sklist_remove_socket(list, sk);
+
+ while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
+ {
+ kfree_skb(skb);
+ }
+
+ if(atomic_read(&sk->wmem_alloc) == 0 &&
+ atomic_read(&sk->rmem_alloc) == 0 &&
+ sk->dead)
+ {
+ sk_free(sk);
+ }
+ else
+ {
+ /*
+ * Someone is using our buffers still.. defer
+ */
+ init_timer(&sk->timer);
+ sk->timer.expires=jiffies+SOCK_DESTROY_TIME;
+ sk->timer.function=sklist_destroy_timer;
+ sk->timer.data = (unsigned long)sk;
+ add_timer(&sk->timer);
+ }
+}
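+
+/* Illustrative sketch (not part of the original import): a minimal
+ * protocol keeps all of its sockets on one list with these helpers;
+ * "my_sklist" is a hypothetical name.
+ *
+ *   static struct sock *my_sklist = NULL;
+ *
+ *   on create:  sklist_insert_socket(&my_sklist, sk);
+ *   on close:   sk->dead = 1;
+ *               sklist_destroy_socket(&my_sklist, sk);
+ *
+ * Note that sklist_destroy_socket() frees the sock at once only when
+ * rmem_alloc and wmem_alloc have both drained to zero; otherwise it
+ * re-arms sk->timer and retries after SOCK_DESTROY_TIME.
+ */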
+
+/*
+ * Set of default routines for initialising struct proto_ops when
+ * the protocol does not support a particular function. In certain
+ * cases where it makes no sense for a protocol to have a "do nothing"
+ * function, some default processing is provided.
+ */
+
+int sock_no_dup(struct socket *newsock, struct socket *oldsock)
+{
+ struct sock *sk = oldsock->sk;
+
+ return net_families[sk->family]->create(newsock, sk->protocol);
+}
+
+int sock_no_release(struct socket *sock, struct socket *peersock)
+{
+ return 0;
+}
+
+int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
+{
+ return -EOPNOTSUPP;
+}
+
+int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
+ int len, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
+{
+ return -EOPNOTSUPP;
+}
+
+int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
+ int *len, int peer)
+{
+ return -EOPNOTSUPP;
+}
+
+unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
+{
+ return 0;
+}
+
+int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return -EOPNOTSUPP;
+}
+
+int sock_no_listen(struct socket *sock, int backlog)
+{
+ return -EOPNOTSUPP;
+}
+
+int sock_no_shutdown(struct socket *sock, int how)
+{
+ return -EOPNOTSUPP;
+}
+
+int sock_no_setsockopt(struct socket *sock, int level, int optname,
+ char *optval, int optlen)
+{
+ return -EOPNOTSUPP;
+}
+
+int sock_no_getsockopt(struct socket *sock, int level, int optname,
+ char *optval, int *optlen)
+{
+ return -EOPNOTSUPP;
+}
+
+/*
+ * Note: if you add something that sleeps here then change sock_fcntl()
+ * to do proper fd locking.
+ */
+int sock_no_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+
+ switch(cmd)
+ {
+ case F_SETOWN:
+ /*
+ * This is a little restrictive, but it's the only
+ * way to make sure that you can't send a sigurg to
+ * another process.
+ */
+ if (current->pgrp != -arg &&
+ current->pid != arg &&
+ !capable(CAP_KILL)) return(-EPERM);
+ sk->proc = arg;
+ return(0);
+ case F_GETOWN:
+ return(sk->proc);
+ default:
+ return(-EINVAL);
+ }
+}
+
+int sock_no_sendmsg(struct socket *sock, struct msghdr *m, int flags,
+ struct scm_cookie *scm)
+{
+ return -EOPNOTSUPP;
+}
+
+int sock_no_recvmsg(struct socket *sock, struct msghdr *m, int flags,
+ struct scm_cookie *scm)
+{
+ return -EOPNOTSUPP;
+}
+
+
+
+/*
+ * Default Socket Callbacks
+ */
+
+void sock_def_wakeup(struct sock *sk)
+{
+ if(!sk->dead)
+ wake_up_interruptible(sk->sleep);
+}
+
+void sock_def_error_report(struct sock *sk)
+{
+ if (!sk->dead) {
+ wake_up_interruptible(sk->sleep);
+ sock_wake_async(sk->socket,0);
+ }
+}
+
+void sock_def_readable(struct sock *sk, int len)
+{
+ if(!sk->dead) {
+ wake_up_interruptible(sk->sleep);
+ sock_wake_async(sk->socket,1);
+ }
+}
+
+void sock_def_write_space(struct sock *sk)
+{
+ /* Do not wake up a writer until he can make "significant"
+ * progress. --DaveM
+ */
+ if(!sk->dead &&
+ ((atomic_read(&sk->wmem_alloc) << 1) <= sk->sndbuf)) {
+ wake_up_interruptible(sk->sleep);
+
+ /* Should agree with poll, otherwise some programs break */
+ if (sock_writeable(sk))
+ sock_wake_async(sk->socket, 2);
+ }
+}
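+
+/* Worked example (not part of the original import): with the test
+ * above, a socket with sndbuf = 65536 wakes its writer only once
+ * wmem_alloc has fallen to 32768 or less, i.e. only when at least
+ * half of the send buffer has drained.
+ */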
+
+void sock_def_destruct(struct sock *sk)
+{
+ if (sk->protinfo.destruct_hook)
+ kfree(sk->protinfo.destruct_hook);
+}
+
+void sock_init_data(struct socket *sock, struct sock *sk)
+{
+ skb_queue_head_init(&sk->receive_queue);
+ skb_queue_head_init(&sk->write_queue);
+ skb_queue_head_init(&sk->back_log);
+ skb_queue_head_init(&sk->error_queue);
+
+ init_timer(&sk->timer);
+
+ sk->allocation = GFP_KERNEL;
+ sk->rcvbuf = sysctl_rmem_default;
+ sk->sndbuf = sysctl_wmem_default;
+ sk->state = TCP_CLOSE;
+ sk->zapped = 1;
+ sk->socket = sock;
+
+ if(sock)
+ {
+ sk->type = sock->type;
+ sk->sleep = &sock->wait;
+ sock->sk = sk;
+ }
+
+ sk->state_change = sock_def_wakeup;
+ sk->data_ready = sock_def_readable;
+ sk->write_space = sock_def_write_space;
+ sk->error_report = sock_def_error_report;
+ sk->destruct = sock_def_destruct;
+
+ sk->peercred.pid = 0;
+ sk->peercred.uid = -1;
+ sk->peercred.gid = -1;
+
+}
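+
+/* Illustrative sketch (not part of the original import): a protocol
+ * family's create() usually calls sock_init_data() and then overrides
+ * whichever defaults it needs; PF_MYPROTO and my_data_ready are
+ * hypothetical names.
+ *
+ *   static int my_create(struct socket *sock, int protocol)
+ *   {
+ *       struct sock *sk = sk_alloc(PF_MYPROTO, GFP_KERNEL, 1);
+ *       if (sk == NULL)
+ *           return -ENOBUFS;
+ *       sock_init_data(sock, sk);
+ *       sk->data_ready = my_data_ready;
+ *       return 0;
+ *   }
+ *
+ * inet_create() in af_inet.c below follows this same pattern.
+ */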
diff --git a/pfinet/linux-src/net/core/sysctl_net_core.c b/pfinet/linux-src/net/core/sysctl_net_core.c
new file mode 100644
index 00000000..446ca145
--- /dev/null
+++ b/pfinet/linux-src/net/core/sysctl_net_core.c
@@ -0,0 +1,61 @@
+/* -*- linux-c -*-
+ * sysctl_net_core.c: sysctl interface to net core subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/core directory entry (empty =) ). [MS]
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/config.h>
+
+#ifdef CONFIG_SYSCTL
+
+extern int netdev_max_backlog;
+extern int netdev_fastroute;
+extern int net_msg_cost;
+extern int net_msg_burst;
+
+extern __u32 sysctl_wmem_max;
+extern __u32 sysctl_rmem_max;
+extern __u32 sysctl_wmem_default;
+extern __u32 sysctl_rmem_default;
+
+extern int sysctl_core_destroy_delay;
+extern int sysctl_optmem_max;
+
+ctl_table core_table[] = {
+#ifdef CONFIG_NET
+ {NET_CORE_WMEM_MAX, "wmem_max",
+ &sysctl_wmem_max, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_CORE_RMEM_MAX, "rmem_max",
+ &sysctl_rmem_max, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_CORE_WMEM_DEFAULT, "wmem_default",
+ &sysctl_wmem_default, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_CORE_RMEM_DEFAULT, "rmem_default",
+ &sysctl_rmem_default, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_CORE_MAX_BACKLOG, "netdev_max_backlog",
+ &netdev_max_backlog, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+#ifdef CONFIG_NET_FASTROUTE
+ {NET_CORE_FASTROUTE, "netdev_fastroute",
+ &netdev_fastroute, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+#endif
+ {NET_CORE_MSG_COST, "message_cost",
+ &net_msg_cost, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies},
+ {NET_CORE_MSG_BURST, "message_burst",
+ &net_msg_burst, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies},
+ {NET_CORE_OPTMEM_MAX, "optmem_max",
+ &sysctl_optmem_max, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+#endif /* CONFIG_NET */
+ { 0 }
+};
+#endif
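+
+/* Note (not part of the original import): under Linux each entry above
+ * surfaces as /proc/sys/net/core/<name>, so for example
+ *
+ *   echo 262144 > /proc/sys/net/core/rmem_max
+ *
+ * updates sysctl_rmem_max at run time through proc_dointvec().
+ */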
diff --git a/pfinet/linux-src/net/core/utils.c b/pfinet/linux-src/net/core/utils.c
new file mode 100644
index 00000000..415926b8
--- /dev/null
+++ b/pfinet/linux-src/net/core/utils.c
@@ -0,0 +1,66 @@
+/*
+ * Generic address resolution entity
+ *
+ * Authors:
+ * net_random Alan Cox
+ * net_ratelimit Andi Kleen
+ *
+ * Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+
+static unsigned long net_rand_seed = 152L;
+
+unsigned long net_random(void)
+{
+ net_rand_seed=net_rand_seed*69069L+1;
+ return net_rand_seed^jiffies;
+}
+
+void net_srandom(unsigned long entropy)
+{
+ net_rand_seed ^= entropy;
+ net_random();
+}
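+
+/* Note (not part of the original import): this is the classic LCG step
+ * seed = seed * 69069 + 1 (modulo the word size), xor'ed with jiffies
+ * for a little extra variation. It is not cryptographically strong and
+ * is only meant for things like jittering timers.
+ */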
+
+int net_msg_cost = 5*HZ;
+int net_msg_burst = 10*5*HZ;
+
+/*
+ * This enforces a rate limit: not more than one kernel message
+ * every five seconds, to make a denial-of-service attack impossible.
+ *
+ * All warning printk()s should be guarded by this function.
+ */
+int net_ratelimit(void)
+{
+ static unsigned long toks = 10*5*HZ;
+ static unsigned long last_msg;
+ static int missed;
+ unsigned long now = jiffies;
+
+ toks += now - xchg(&last_msg, now);
+ if (toks > net_msg_burst)
+ toks = net_msg_burst;
+ if (toks >= net_msg_cost) {
+ toks -= net_msg_cost;
+ if (missed)
+ printk(KERN_WARNING "NET: %d messages suppressed.\n", missed);
+ missed = 0;
+ return 1;
+ }
+ missed++;
+ return 0;
+}
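+
+/* Illustrative sketch (not part of the original import): callers guard
+ * their diagnostics like this so a packet flood cannot flood the log:
+ *
+ *   if (net_ratelimit())
+ *       printk(KERN_WARNING "IP: bogus packet from %08x\n", saddr);
+ *
+ * The bucket refills at one token per net_msg_cost jiffies and holds
+ * at most net_msg_burst, i.e. bursts of up to ten messages get through
+ * before the one-per-five-seconds steady rate kicks in.
+ */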
diff --git a/pfinet/linux-src/net/ethernet/Makefile b/pfinet/linux-src/net/ethernet/Makefile
new file mode 100644
index 00000000..193d6af8
--- /dev/null
+++ b/pfinet/linux-src/net/ethernet/Makefile
@@ -0,0 +1,33 @@
+#
+# Makefile for the Linux Ethernet layer.
+#
+# Note! Dependencies are done automagically by 'make dep', which also
+# removes any old dependencies. DON'T put your own dependencies here
+# unless it's something special (ie not a .c file).
+#
+# Note 2! The CFLAGS definition is now in the main makefile...
+
+O_TARGET := ethernet.o
+
+OBJS := eth.o
+
+ifeq ($(CONFIG_SYSCTL),y)
+OBJS += sysctl_net_ether.o
+endif
+
+ifdef CONFIG_IPX
+OBJ2 := pe2.o
+endif
+
+ifdef CONFIG_ATALK
+OBJ2 := pe2.o
+endif
+
+ifdef CONFIG_NET
+O_OBJS := $(OBJS) $(OBJ2)
+endif
+
+include $(TOPDIR)/Rules.make
+
+tar:
+ tar -cvf /dev/f1 .
diff --git a/pfinet/linux-src/net/ethernet/eth.c b/pfinet/linux-src/net/ethernet/eth.c
new file mode 100644
index 00000000..bce35d48
--- /dev/null
+++ b/pfinet/linux-src/net/ethernet/eth.c
@@ -0,0 +1,298 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Ethernet-type device handling.
+ *
+ * Version: @(#)eth.c 1.0.7 05/25/93
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Florian La Roche, <rzsfl@rz.uni-sb.de>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *
+ * Fixes:
+ * Mr Linux : Arp problems
+ * Alan Cox : Generic queue tidyup (very tiny here)
+ * Alan Cox : eth_header ntohs should be htons
+ * Alan Cox : eth_rebuild_header missing an htons and
+ * minor other things.
+ * Tegge : Arp bug fixes.
+ * Florian : Removed many unnecessary functions, code cleanup
+ * and changes for new arp and skbuff.
+ * Alan Cox : Redid header building to reflect new format.
+ * Alan Cox : ARP only when compiled with CONFIG_INET
+ * Greg Page : 802.2 and SNAP stuff.
+ * Alan Cox : MAC layer pointers/new format.
+ * Paul Gortmaker : eth_copy_and_sum shouldn't csum padding.
+ * Alan Cox : Protect against forwarding explosions with
+ * older network drivers and IFF_ALLMULTI.
+ * Christer Weinigel : Better rebuild header message.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/config.h>
+#include <linux/init.h>
+#include <net/dst.h>
+#include <net/arp.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/checksum.h>
+
+
+__initfunc(void eth_setup(char *str, int *ints))
+{
+ struct device *d = dev_base;
+
+ if (!str || !*str)
+ return;
+ while (d)
+ {
+ if (!strcmp(str,d->name))
+ {
+ if (ints[0] > 0)
+ d->irq=ints[1];
+ if (ints[0] > 1)
+ d->base_addr=ints[2];
+ if (ints[0] > 2)
+ d->mem_start=ints[3];
+ if (ints[0] > 3)
+ d->mem_end=ints[4];
+ break;
+ }
+ d=d->next;
+ }
+}
+
+
+/*
+ * Create the Ethernet MAC header for an arbitrary protocol layer
+ *
+ * saddr=NULL means use device source address
+ * daddr=NULL means leave destination address (eg unresolved arp)
+ */
+
+int eth_header(struct sk_buff *skb, struct device *dev, unsigned short type,
+ void *daddr, void *saddr, unsigned len)
+{
+ struct ethhdr *eth = (struct ethhdr *)skb_push(skb,ETH_HLEN);
+
+ /*
+ * Set the protocol type. For a packet of type ETH_P_802_3 we put the length
+ * in here instead. It is up to the 802.2 layer to carry protocol information.
+ */
+
+ if(type!=ETH_P_802_3)
+ eth->h_proto = htons(type);
+ else
+ eth->h_proto = htons(len);
+
+ /*
+ * Set the source hardware address.
+ */
+
+ if(saddr)
+ memcpy(eth->h_source,saddr,dev->addr_len);
+ else
+ memcpy(eth->h_source,dev->dev_addr,dev->addr_len);
+
+ /*
+ * Anyway, the loopback-device should never use this function...
+ */
+
+ if (dev->flags & (IFF_LOOPBACK|IFF_NOARP))
+ {
+ memset(eth->h_dest, 0, dev->addr_len);
+ return(dev->hard_header_len);
+ }
+
+ if(daddr)
+ {
+ memcpy(eth->h_dest,daddr,dev->addr_len);
+ return dev->hard_header_len;
+ }
+
+ return -dev->hard_header_len;
+}
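+
+/* Illustrative sketch (not part of the original import): callers reach
+ * this routine through dev->hard_header rather than by name, e.g.
+ *
+ *   if (dev->hard_header)
+ *       dev->hard_header(skb, dev, ETH_P_IP, daddr, NULL, skb->len);
+ *
+ * A negative return (-hard_header_len) signals that the destination
+ * address is still unresolved and ARP must fill it in later.
+ */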
+
+
+/*
+ * Rebuild the Ethernet MAC header. This is called after an ARP
+ * (or in future other address resolution) has completed on this
+ * sk_buff. We now let ARP fill in the other fields.
+ *
+ * This routine CANNOT use cached dst->neigh!
+ * Really, it is used only when dst->neigh is wrong.
+ */
+
+int eth_rebuild_header(struct sk_buff *skb)
+{
+ struct ethhdr *eth = (struct ethhdr *)skb->data;
+ struct device *dev = skb->dev;
+
+ switch (eth->h_proto)
+ {
+#ifdef CONFIG_INET
+ case __constant_htons(ETH_P_IP):
+ return arp_find(eth->h_dest, skb);
+#endif
+ default:
+ printk(KERN_DEBUG
+ "%s: unable to resolve type %X addresses.\n",
+ dev->name, (int)eth->h_proto);
+
+ memcpy(eth->h_source, dev->dev_addr, dev->addr_len);
+ break;
+ }
+
+ return 0;
+}
+
+
+/*
+ * Determine the packet's protocol ID. The rule here is that we
+ * assume 802.3 if the type field is short enough to be a length.
+ * This is normal practice and works for any 'now in use' protocol.
+ */
+
+unsigned short eth_type_trans(struct sk_buff *skb, struct device *dev)
+{
+ struct ethhdr *eth;
+ unsigned char *rawp;
+
+ skb->mac.raw=skb->data;
+ skb_pull(skb,dev->hard_header_len);
+ eth= skb->mac.ethernet;
+
+ if(*eth->h_dest&1)
+ {
+ if(memcmp(eth->h_dest,dev->broadcast, ETH_ALEN)==0)
+ skb->pkt_type=PACKET_BROADCAST;
+ else
+ skb->pkt_type=PACKET_MULTICAST;
+ }
+
+ /*
+ * This ALLMULTI check should be redundant by 1.4
+ * so don't forget to remove it.
+ *
+ * Seems you forgot to remove it. All silly devices
+ * seem to set IFF_PROMISC.
+ */
+
+ else if(dev->flags&(IFF_PROMISC/*|IFF_ALLMULTI*/))
+ {
+ if(memcmp(eth->h_dest,dev->dev_addr, ETH_ALEN))
+ skb->pkt_type=PACKET_OTHERHOST;
+ }
+
+ if (ntohs(eth->h_proto) >= 1536)
+ return eth->h_proto;
+
+ rawp = skb->data;
+
+ /*
+ * This is a magic hack to spot IPX packets. Older Novell breaks
+ * the protocol design and runs IPX over 802.3 without an 802.2 LLC
+ * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
+ * won't work for fault-tolerant NetWare but does for the rest.
+ */
+ if (*(unsigned short *)rawp == 0xFFFF)
+ return htons(ETH_P_802_3);
+
+ /*
+ * Real 802.2 LLC
+ */
+ return htons(ETH_P_802_2);
+}
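+
+/* Illustrative sketch (not part of the original import): an Ethernet
+ * driver's receive path uses this routine to classify each frame
+ * before handing it to the network layer:
+ *
+ *   skb->dev = dev;
+ *   skb->protocol = eth_type_trans(skb, dev);
+ *   netif_rx(skb);
+ *
+ * Note the skb_pull() above means skb->data points at the network
+ * layer payload afterwards.
+ */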
+
+int eth_header_parse(struct sk_buff *skb, unsigned char *haddr)
+{
+ struct ethhdr *eth = skb->mac.ethernet;
+ memcpy(haddr, eth->h_source, ETH_ALEN);
+ return ETH_ALEN;
+}
+
+int eth_header_cache(struct neighbour *neigh, struct hh_cache *hh)
+{
+ unsigned short type = hh->hh_type;
+ struct ethhdr *eth = (struct ethhdr*)(((u8*)hh->hh_data) + 2);
+ struct device *dev = neigh->dev;
+
+ if (type == __constant_htons(ETH_P_802_3))
+ return -1;
+
+ eth->h_proto = type;
+ memcpy(eth->h_source, dev->dev_addr, dev->addr_len);
+ memcpy(eth->h_dest, neigh->ha, dev->addr_len);
+ return 0;
+}
+
+/*
+ * Called by Address Resolution module to notify changes in address.
+ */
+
+void eth_header_cache_update(struct hh_cache *hh, struct device *dev, unsigned char * haddr)
+{
+ memcpy(((u8*)hh->hh_data) + 2, haddr, dev->addr_len);
+}
+
+#ifndef CONFIG_IP_ROUTER
+
+/*
+ * Copy from an ethernet device memory space to an sk_buff while checksumming if IP
+ */
+
+void eth_copy_and_sum(struct sk_buff *dest, unsigned char *src, int length, int base)
+{
+ struct ethhdr *eth;
+ struct iphdr *iph;
+ int ip_length;
+
+ eth=(struct ethhdr *)src;
+ if(eth->h_proto!=htons(ETH_P_IP))
+ {
+ memcpy(dest->data,src,length);
+ return;
+ }
+ /*
+ * We have to watch for padded packets. The csum doesn't include the
+ * padding, and there is no point in copying the padding anyway.
+ * We have to use the smaller of length and ip_length because it
+ * can happen that ip_length > length.
+ */
+ memcpy(dest->data,src,sizeof(struct iphdr)+ETH_HLEN); /* ethernet is always >= 34 */
+ length -= sizeof(struct iphdr) + ETH_HLEN;
+ iph=(struct iphdr*)(src+ETH_HLEN);
+ ip_length = ntohs(iph->tot_len) - sizeof(struct iphdr);
+
+ /* Also watch out for bogons - min IP size is 8 (rfc-1042) */
+ if ((ip_length <= length) && (ip_length > 7))
+ length=ip_length;
+
+ dest->csum=csum_partial_copy(src+sizeof(struct iphdr)+ETH_HLEN,dest->data+sizeof(struct iphdr)+ETH_HLEN,length,base);
+ dest->ip_summed=1;
+}
+
+#endif /* !(CONFIG_IP_ROUTER) */
diff --git a/pfinet/linux-src/net/ethernet/pe2.c b/pfinet/linux-src/net/ethernet/pe2.c
new file mode 100644
index 00000000..4915f070
--- /dev/null
+++ b/pfinet/linux-src/net/ethernet/pe2.c
@@ -0,0 +1,38 @@
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/datalink.h>
+#include <linux/mm.h>
+#include <linux/in.h>
+
+static void
+pEII_datalink_header(struct datalink_proto *dl,
+ struct sk_buff *skb, unsigned char *dest_node)
+{
+ struct device *dev = skb->dev;
+
+ skb->protocol = htons (ETH_P_IPX);
+ if(dev->hard_header)
+ dev->hard_header(skb, dev, ETH_P_IPX, dest_node, NULL, skb->len);
+}
+
+struct datalink_proto *
+make_EII_client(void)
+{
+ struct datalink_proto *proto;
+
+ proto = (struct datalink_proto *) kmalloc(sizeof(*proto), GFP_ATOMIC);
+ if (proto != NULL) {
+ proto->type_len = 0;
+ proto->header_length = 0;
+ proto->datalink_header = pEII_datalink_header;
+ proto->string_name = "EtherII";
+ }
+
+ return proto;
+}
+
+void destroy_EII_client(struct datalink_proto *dl)
+{
+ if (dl)
+ kfree_s(dl, sizeof(struct datalink_proto));
+}
diff --git a/pfinet/linux-src/net/ethernet/sysctl_net_ether.c b/pfinet/linux-src/net/ethernet/sysctl_net_ether.c
new file mode 100644
index 00000000..b81a6d53
--- /dev/null
+++ b/pfinet/linux-src/net/ethernet/sysctl_net_ether.c
@@ -0,0 +1,13 @@
+/* -*- linux-c -*-
+ * sysctl_net_ether.c: sysctl interface to net Ethernet subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/ether directory entry (empty =) ). [MS]
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+
+ctl_table ether_table[] = {
+ {0}
+};
diff --git a/pfinet/linux-src/net/ipv4/Config.in b/pfinet/linux-src/net/ipv4/Config.in
new file mode 100644
index 00000000..29786da5
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/Config.in
@@ -0,0 +1,88 @@
+#
+# IP configuration
+#
+bool 'IP: multicasting' CONFIG_IP_MULTICAST
+bool 'IP: advanced router' CONFIG_IP_ADVANCED_ROUTER
+if [ "$CONFIG_IP_ADVANCED_ROUTER" = "y" ]; then
+ define_bool CONFIG_RTNETLINK y
+ define_bool CONFIG_NETLINK y
+ bool 'IP: policy routing' CONFIG_IP_MULTIPLE_TABLES
+ bool 'IP: equal cost multipath' CONFIG_IP_ROUTE_MULTIPATH
+ bool 'IP: use TOS value as routing key' CONFIG_IP_ROUTE_TOS
+ bool 'IP: verbose route monitoring' CONFIG_IP_ROUTE_VERBOSE
+ bool 'IP: large routing tables' CONFIG_IP_ROUTE_LARGE_TABLES
+ if [ "$CONFIG_IP_MULTIPLE_TABLES" = "y" ]; then
+ bool 'IP: fast network address translation' CONFIG_IP_ROUTE_NAT
+ fi
+fi
+bool 'IP: kernel level autoconfiguration' CONFIG_IP_PNP
+if [ "$CONFIG_IP_PNP" = "y" ]; then
+ bool ' BOOTP support' CONFIG_IP_PNP_BOOTP
+ bool ' RARP support' CONFIG_IP_PNP_RARP
+# not yet ready..
+# bool ' ARP support' CONFIG_IP_PNP_ARP
+fi
+if [ "$CONFIG_FIREWALL" = "y" ]; then
+ bool 'IP: firewalling' CONFIG_IP_FIREWALL
+ if [ "$CONFIG_IP_FIREWALL" = "y" ]; then
+ if [ "$CONFIG_NETLINK" = "y" ]; then
+ bool 'IP: firewall packet netlink device' CONFIG_IP_FIREWALL_NETLINK
+ if [ "$CONFIG_IP_FIREWALL_NETLINK" = "y" ]; then
+ define_bool CONFIG_NETLINK_DEV y
+ fi
+ fi
+ bool 'IP: always defragment (required for masquerading)' CONFIG_IP_ALWAYS_DEFRAG
+ if [ "$CONFIG_IP_MULTIPLE_TABLES" = "y" ]; then
+ bool 'IP: use FWMARK value as routing key' CONFIG_IP_ROUTE_FWMARK
+ fi
+ fi
+fi
+if [ "$CONFIG_IP_FIREWALL" = "y" ]; then
+ if [ "$CONFIG_IP_ALWAYS_DEFRAG" != "n" ]; then
+ bool 'IP: transparent proxy support' CONFIG_IP_TRANSPARENT_PROXY
+ bool 'IP: masquerading' CONFIG_IP_MASQUERADE
+ if [ "$CONFIG_IP_MASQUERADE" != "n" ]; then
+ comment 'Protocol-specific masquerading support will be built as modules.'
+ bool 'IP: ICMP masquerading' CONFIG_IP_MASQUERADE_ICMP
+ comment 'Protocol-specific masquerading support will be built as modules.'
+ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
+ bool 'IP: masquerading special modules support' CONFIG_IP_MASQUERADE_MOD
+ if [ "$CONFIG_IP_MASQUERADE_MOD" = "y" ]; then
+ tristate 'IP: ipautofw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPAUTOFW
+ tristate 'IP: ipportfw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPPORTFW
+ tristate 'IP: ip fwmark masq-forwarding support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_MFW
+ fi
+ fi
+ fi
+ fi
+fi
+bool 'IP: optimize as router not host' CONFIG_IP_ROUTER
+tristate 'IP: tunneling' CONFIG_NET_IPIP
+tristate 'IP: GRE tunnels over IP' CONFIG_NET_IPGRE
+if [ "$CONFIG_IP_MULTICAST" = "y" ]; then
+ if [ "$CONFIG_NET_IPGRE" != "n" ]; then
+ bool 'IP: broadcast GRE over IP' CONFIG_NET_IPGRE_BROADCAST
+ fi
+ bool 'IP: multicast routing' CONFIG_IP_MROUTE
+ if [ "$CONFIG_IP_MROUTE" = "y" ]; then
+ bool 'IP: PIM-SM version 1 support' CONFIG_IP_PIMSM_V1
+ bool 'IP: PIM-SM version 2 support' CONFIG_IP_PIMSM_V2
+ fi
+fi
+bool 'IP: aliasing support' CONFIG_IP_ALIAS
+if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
+ if [ "$CONFIG_RTNETLINK" = "y" ]; then
+ bool 'IP: ARP daemon support (EXPERIMENTAL)' CONFIG_ARPD
+ fi
+fi
+bool 'IP: TCP syncookie support (not enabled per default)' CONFIG_SYN_COOKIES
+comment '(it is safe to leave these untouched)'
+#bool 'IP: PC/TCP compatibility mode' CONFIG_INET_PCTCP
+tristate 'IP: Reverse ARP' CONFIG_INET_RARP
+#bool 'IP: Path MTU Discovery (normally enabled)' CONFIG_PATH_MTU_DISCOVERY
+#bool 'IP: Disable NAGLE algorithm (normally enabled)' CONFIG_TCP_NAGLE_OFF
+bool 'IP: Allow large windows (not recommended if <16Mb of memory)' CONFIG_SKB_LARGE
+#if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
+#bool 'IP: support experimental checksum copy to user for UDP' CONFIG_UDP_DELAY_CSUM
+#fi
+
diff --git a/pfinet/linux-src/net/ipv4/Makefile b/pfinet/linux-src/net/ipv4/Makefile
new file mode 100644
index 00000000..8ab280de
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/Makefile
@@ -0,0 +1,116 @@
+#
+# Makefile for the Linux TCP/IP (INET) layer.
+#
+# Note! Dependencies are done automagically by 'make dep', which also
+# removes any old dependencies. DON'T put your own dependencies here
+# unless it's something special (ie not a .c file).
+#
+# Note 2! The CFLAGS definition is now in the main makefile...
+
+O_TARGET := ipv4.o
+IPV4_OBJS := utils.o route.o proc.o timer.o protocol.o \
+ ip_input.o ip_fragment.o ip_forward.o ip_options.o \
+ ip_output.o ip_sockglue.o \
+ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o\
+ raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
+ sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o
+IPV4X_OBJS :=
+
+MOD_LIST_NAME := IPV4_MODULES
+M_OBJS :=
+
+ifeq ($(CONFIG_IP_FIREWALL),y)
+IPV4_OBJS += ip_fw.o
+endif
+
+ifeq ($(CONFIG_IP_MULTIPLE_TABLES),y)
+IPV4_OBJS += fib_rules.o
+endif
+
+ifeq ($(CONFIG_IP_ROUTE_NAT),y)
+IPV4_OBJS += ip_nat_dumb.o
+endif
+
+ifeq ($(CONFIG_IP_MROUTE),y)
+IPV4_OBJS += ipmr.o
+endif
+
+ifeq ($(CONFIG_INET_RARP),y)
+IPV4_OBJS += rarp.o
+else
+ ifeq ($(CONFIG_INET_RARP),m)
+ M_OBJS += rarp.o
+ endif
+endif
+
+ifeq ($(CONFIG_NET_IPIP),y)
+IPV4X_OBJS += ipip.o
+else
+ ifeq ($(CONFIG_NET_IPIP),m)
+ MX_OBJS += ipip.o
+ endif
+endif
+
+ifeq ($(CONFIG_NET_IPGRE),y)
+IPV4X_OBJS += ip_gre.o
+else
+ ifeq ($(CONFIG_NET_IPGRE),m)
+ MX_OBJS += ip_gre.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_MASQUERADE),y)
+IPV4X_OBJS += ip_masq.o ip_masq_app.o
+
+ifeq ($(CONFIG_IP_MASQUERADE_MOD),y)
+ IPV4X_OBJS += ip_masq_mod.o
+
+ ifeq ($(CONFIG_IP_MASQUERADE_IPAUTOFW),y)
+ IPV4_OBJS += ip_masq_autofw.o
+ else
+ ifeq ($(CONFIG_IP_MASQUERADE_IPAUTOFW),m)
+ M_OBJS += ip_masq_autofw.o
+ endif
+ endif
+
+ ifeq ($(CONFIG_IP_MASQUERADE_IPPORTFW),y)
+ IPV4_OBJS += ip_masq_portfw.o
+ else
+ ifeq ($(CONFIG_IP_MASQUERADE_IPPORTFW),m)
+ M_OBJS += ip_masq_portfw.o
+ endif
+ endif
+
+ ifeq ($(CONFIG_IP_MASQUERADE_MFW),y)
+ IPV4_OBJS += ip_masq_mfw.o
+ else
+ ifeq ($(CONFIG_IP_MASQUERADE_MFW),m)
+ M_OBJS += ip_masq_mfw.o
+ endif
+ endif
+
+endif
+
+M_OBJS += ip_masq_user.o
+M_OBJS += ip_masq_ftp.o ip_masq_irc.o ip_masq_raudio.o ip_masq_quake.o
+M_OBJS += ip_masq_vdolive.o ip_masq_cuseeme.o
+endif
+
+ifeq ($(CONFIG_SYN_COOKIES),y)
+IPV4_OBJS += syncookies.o
+# module not supported, because it would be too messy.
+endif
+
+ifeq ($(CONFIG_IP_PNP),y)
+IPV4_OBJS += ipconfig.o
+endif
+
+ifdef CONFIG_INET
+O_OBJS := $(IPV4_OBJS)
+OX_OBJS := $(IPV4X_OBJS)
+endif
+
+include $(TOPDIR)/Rules.make
+
+tar:
+ tar -cvf /dev/f1 .
diff --git a/pfinet/linux-src/net/ipv4/af_inet.c b/pfinet/linux-src/net/ipv4/af_inet.c
new file mode 100644
index 00000000..e37eb6bd
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/af_inet.c
@@ -0,0 +1,1161 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * PF_INET protocol family socket handler.
+ *
+ * Version: $Id: af_inet.c,v 1.87.2.5 1999/08/08 08:43:10 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Alan Cox, <A.Cox@swansea.ac.uk>
+ *
+ * Changes (see also sock.c)
+ *
+ * A.N.Kuznetsov : Socket death error in accept().
+ * John Richardson : Fix non blocking error in connect()
+ * so sockets that fail to connect
+ * don't return -EINPROGRESS.
+ * Alan Cox : Asynchronous I/O support
+ * Alan Cox : Keep correct socket pointer on sock structures
+ * when accept() ed
+ * Alan Cox : Semantics of SO_LINGER aren't state moved
+ * to close when you look carefully. With
+ * this fixed and the accept bug fixed
+ * some RPC stuff seems happier.
+ * Niibe Yutaka : 4.4BSD style write async I/O
+ * Alan Cox,
+ * Tony Gale : Fixed reuse semantics.
+ * Alan Cox : bind() shouldn't abort existing but dead
+ * sockets. Stops FTP netin:.. I hope.
+ * Alan Cox : bind() works correctly for RAW sockets. Note
+ * that FreeBSD at least was broken in this respect
+ * so be careful with compatibility tests...
+ * Alan Cox : routing cache support
+ * Alan Cox : memzero the socket structure for compactness.
+ * Matt Day : nonblock connect error handler
+ * Alan Cox : Allow large numbers of pending sockets
+ * (eg for big web sites), but only if
+ * the application specifically requested it.
+ * Alan Cox : New buffering throughout IP. Used dumbly.
+ * Alan Cox : New buffering now used smartly.
+ * Alan Cox : BSD rather than common sense interpretation of
+ * listen.
+ * Germano Caronni : Assorted small races.
+ * Alan Cox : sendmsg/recvmsg basic support.
+ * Alan Cox : Only sendmsg/recvmsg now supported.
+ * Alan Cox : Locked down bind (see security list).
+ * Alan Cox : Loosened bind a little.
+ * Mike McLagan : ADD/DEL DLCI Ioctls
+ * Willy Konynenberg : Transparent proxying support.
+ * David S. Miller : New socket lookup architecture.
+ * Some other random speedups.
+ * Cyrus Durgin : Cleaned up file for kmod hacks.
+ * Andi Kleen : Fix inet_stream_connect TCP race.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/arp.h>
+#include <net/rarp.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/raw.h>
+#include <net/icmp.h>
+#include <net/ipip.h>
+#include <net/inet_common.h>
+#include <linux/ip_fw.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+#ifdef CONFIG_IP_MASQUERADE
+#include <net/ip_masq.h>
+#endif
+#ifdef CONFIG_BRIDGE
+#include <net/br.h>
+#endif
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+#ifdef CONFIG_NET_RADIO
+#include <linux/wireless.h>
+#endif /* CONFIG_NET_RADIO */
+
+#define min(a,b) ((a)<(b)?(a):(b))
+
+struct linux_mib net_statistics;
+
+extern int raw_get_info(char *, char **, off_t, int, int);
+extern int snmp_get_info(char *, char **, off_t, int, int);
+extern int netstat_get_info(char *, char **, off_t, int, int);
+extern int afinet_get_info(char *, char **, off_t, int, int);
+extern int tcp_get_info(char *, char **, off_t, int, int);
+extern int udp_get_info(char *, char **, off_t, int, int);
+extern void ip_mc_drop_socket(struct sock *sk);
+
+#ifdef CONFIG_DLCI
+extern int dlci_ioctl(unsigned int, void*);
+#endif
+
+#ifdef CONFIG_DLCI_MODULE
+int (*dlci_ioctl_hook)(unsigned int, void *) = NULL;
+#endif
+
+int (*rarp_ioctl_hook)(unsigned int,void*) = NULL;
+
+/*
+ * Destroy an AF_INET socket
+ */
+
+static __inline__ void kill_sk_queues(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ /* First the read buffer. */
+ while((skb = skb_dequeue(&sk->receive_queue)) != NULL)
+ kfree_skb(skb);
+
+ /* Next, the error queue. */
+ while((skb = skb_dequeue(&sk->error_queue)) != NULL)
+ kfree_skb(skb);
+
+ /* Now the backlog. */
+ while((skb=skb_dequeue(&sk->back_log)) != NULL)
+ kfree_skb(skb);
+}
+
+static __inline__ void kill_sk_now(struct sock *sk)
+{
+ /* No longer exists. */
+ del_from_prot_sklist(sk);
+
+ /* Remove from protocol hash chains. */
+ sk->prot->unhash(sk);
+
+ if(sk->opt)
+ kfree(sk->opt);
+ dst_release(sk->dst_cache);
+ sk_free(sk);
+}
+
+static __inline__ void kill_sk_later(struct sock *sk)
+{
+ /* this should never happen. */
+ /* actually it can if an ack has just been sent. */
+ /*
+ * It's more normal than that...
+ * It can happen because a skb is still in the device queues
+ * [PR]
+ */
+
+ NETDEBUG(printk(KERN_DEBUG "Socket destroy delayed (r=%d w=%d)\n",
+ atomic_read(&sk->rmem_alloc),
+ atomic_read(&sk->wmem_alloc)));
+
+ sk->ack_backlog = 0;
+ release_sock(sk);
+ net_reset_timer(sk, TIME_DESTROY, SOCK_DESTROY_TIME);
+}
+
+void destroy_sock(struct sock *sk)
+{
+ lock_sock(sk); /* just to be safe. */
+
+ /* Now we can no longer get new packets or once the
+ * timers are killed, send them.
+ */
+ net_delete_timer(sk);
+
+ if (sk->prot->destroy && !sk->destroy)
+ sk->prot->destroy(sk);
+
+ sk->destroy = 1;
+
+ kill_sk_queues(sk);
+
+ /* Now if everything is gone we can free the socket
+ * structure, otherwise we need to keep it around until
+ * everything is gone.
+ */
+ if (atomic_read(&sk->rmem_alloc) == 0 && atomic_read(&sk->wmem_alloc) == 0)
+ kill_sk_now(sk);
+ else
+ kill_sk_later(sk);
+}
+
+/*
+ * The routines beyond this point handle the behaviour of an AF_INET
+ * socket object. Mostly it punts to the subprotocols of IP to do
+ * the work.
+ */
+
+
+/*
+ * Set socket options on an inet socket.
+ */
+
+int inet_setsockopt(struct socket *sock, int level, int optname,
+ char *optval, int optlen)
+{
+ struct sock *sk=sock->sk;
+ if (sk->prot->setsockopt==NULL)
+ return(-EOPNOTSUPP);
+ return sk->prot->setsockopt(sk,level,optname,optval,optlen);
+}
+
+/*
+ * Get a socket option on an AF_INET socket.
+ *
+ * FIX: POSIX 1003.1g is very ambiguous here. It states that
+ * asynchronous errors should be reported by getsockopt. We assume
+ * this means if you specify SO_ERROR (otherwise what's the point of it).
+ */
+
+int inet_getsockopt(struct socket *sock, int level, int optname,
+ char *optval, int *optlen)
+{
+ struct sock *sk=sock->sk;
+ if (sk->prot->getsockopt==NULL)
+ return(-EOPNOTSUPP);
+ return sk->prot->getsockopt(sk,level,optname,optval,optlen);
+}
+
+/*
+ * Automatically bind an unbound socket.
+ */
+
+static int inet_autobind(struct sock *sk)
+{
+ /* We may need to bind the socket. */
+ if (sk->num == 0) {
+ if (sk->prot->get_port(sk, 0) != 0)
+ return(-EAGAIN);
+ sk->sport = htons(sk->num);
+ sk->prot->hash(sk);
+ add_to_prot_sklist(sk);
+ }
+ return 0;
+}
+
+/*
+ * Move a socket into listening state.
+ */
+
+int inet_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ unsigned char old_state;
+
+ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
+ return(-EINVAL);
+
+ if ((unsigned) backlog == 0) /* BSDism */
+ backlog = 1;
+ if ((unsigned) backlog > SOMAXCONN)
+ backlog = SOMAXCONN;
+ sk->max_ack_backlog = backlog;
+
+ /* Really, if the socket is already in listen state
+ * we can only allow the backlog to be adjusted.
+ */
+ old_state = sk->state;
+ if (old_state != TCP_LISTEN) {
+ sk->state = TCP_LISTEN;
+ sk->ack_backlog = 0;
+ if (sk->num == 0) {
+ if (sk->prot->get_port(sk, 0) != 0) {
+ sk->state = old_state;
+ return -EAGAIN;
+ }
+ sk->sport = htons(sk->num);
+ add_to_prot_sklist(sk);
+ } else {
+ if (sk->prev)
+ ((struct tcp_bind_bucket*)sk->prev)->fastreuse = 0;
+ }
+
+ dst_release(xchg(&sk->dst_cache, NULL));
+ sk->prot->hash(sk);
+ sk->socket->flags |= SO_ACCEPTCON;
+ }
+ return 0;
+}
+
+/*
+ * Create an inet socket.
+ *
+ * FIXME: Gcc would generate much better code if we set the parameters
+ * up in in-memory structure order. Gcc68K even more so.
+ */
+
+static int inet_create(struct socket *sock, int protocol)
+{
+ struct sock *sk;
+ struct proto *prot;
+
+ /* Compatibility */
+ if (sock->type == SOCK_PACKET) {
+ static int warned;
+ if (net_families[PF_PACKET]==NULL)
+ {
+#if defined(CONFIG_KMOD) && defined(CONFIG_PACKET_MODULE)
+ char module_name[30];
+ sprintf(module_name,"net-pf-%d", PF_PACKET);
+ request_module(module_name);
+ if (net_families[PF_PACKET] == NULL)
+#endif
+ return -ESOCKTNOSUPPORT;
+ }
+ if (!warned++)
+ printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n", current->comm);
+ return net_families[PF_PACKET]->create(sock, protocol);
+ }
+
+ sock->state = SS_UNCONNECTED;
+ sk = sk_alloc(PF_INET, GFP_KERNEL, 1);
+ if (sk == NULL)
+ goto do_oom;
+
+ switch (sock->type) {
+ case SOCK_STREAM:
+ if (protocol && protocol != IPPROTO_TCP)
+ goto free_and_noproto;
+ protocol = IPPROTO_TCP;
+ if (ipv4_config.no_pmtu_disc)
+ sk->ip_pmtudisc = IP_PMTUDISC_DONT;
+ else
+ sk->ip_pmtudisc = IP_PMTUDISC_WANT;
+ prot = &tcp_prot;
+ sock->ops = &inet_stream_ops;
+ break;
+ case SOCK_SEQPACKET:
+ goto free_and_badtype;
+ case SOCK_DGRAM:
+ if (protocol && protocol != IPPROTO_UDP)
+ goto free_and_noproto;
+ protocol = IPPROTO_UDP;
+ sk->no_check = UDP_NO_CHECK;
+ sk->ip_pmtudisc = IP_PMTUDISC_DONT;
+ prot=&udp_prot;
+ sock->ops = &inet_dgram_ops;
+ break;
+ case SOCK_RAW:
+ if (!capable(CAP_NET_RAW))
+ goto free_and_badperm;
+ if (!protocol)
+ goto free_and_noproto;
+ prot = &raw_prot;
+ sk->reuse = 1;
+ sk->ip_pmtudisc = IP_PMTUDISC_DONT;
+ sk->num = protocol;
+ sock->ops = &inet_dgram_ops;
+ if (protocol == IPPROTO_RAW)
+ sk->ip_hdrincl = 1;
+ break;
+ default:
+ goto free_and_badtype;
+ }
+
+ sock_init_data(sock,sk);
+
+ sk->destruct = NULL;
+
+ sk->zapped=0;
+#ifdef CONFIG_TCP_NAGLE_OFF
+ sk->nonagle = 1;
+#endif
+ sk->family = PF_INET;
+ sk->protocol = protocol;
+
+ sk->prot = prot;
+ sk->backlog_rcv = prot->backlog_rcv;
+
+ sk->timer.data = (unsigned long)sk;
+ sk->timer.function = &net_timer;
+
+ sk->ip_ttl=ip_statistics.IpDefaultTTL;
+
+ sk->ip_mc_loop=1;
+ sk->ip_mc_ttl=1;
+ sk->ip_mc_index=0;
+ sk->ip_mc_list=NULL;
+
+ if (sk->num) {
+ /* It is assumed that any protocol which allows
+ * the user to assign a number at socket
+ * creation time automatically
+ * shares that number.
+ */
+ sk->sport = htons(sk->num);
+
+ /* Add to protocol hash chains. */
+ sk->prot->hash(sk);
+ add_to_prot_sklist(sk);
+ }
+
+ if (sk->prot->init) {
+ int err = sk->prot->init(sk);
+ if (err != 0) {
+ destroy_sock(sk);
+ return(err);
+ }
+ }
+ return(0);
+
+free_and_badtype:
+ sk_free(sk);
+ return -ESOCKTNOSUPPORT;
+
+free_and_badperm:
+ sk_free(sk);
+ return -EPERM;
+
+free_and_noproto:
+ sk_free(sk);
+ return -EPROTONOSUPPORT;
+
+do_oom:
+ return -ENOBUFS;
+}
+
+
+/*
+ * The peer socket should always be NULL (or else). When we call this
+ * function we are destroying the object and from then on nobody
+ * should refer to it.
+ */
+
+int inet_release(struct socket *sock, struct socket *peersock)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk) {
+ long timeout;
+
+ /* Begin closedown and wake up sleepers. */
+ if (sock->state != SS_UNCONNECTED)
+ sock->state = SS_DISCONNECTING;
+ sk->state_change(sk);
+
+ /* Applications forget to leave groups before exiting */
+ ip_mc_drop_socket(sk);
+
+ /* If linger is set, we don't return until the close
+ * is complete. Otherwise we return immediately. The
+ * actually closing is done the same either way.
+ *
+ * If the close is due to the process exiting, we never
+ * linger..
+ */
+ timeout = 0;
+ if (sk->linger && !(current->flags & PF_EXITING)) {
+ timeout = HZ * sk->lingertime;
+ if (!timeout)
+ timeout = MAX_SCHEDULE_TIMEOUT;
+ }
+ sock->sk = NULL;
+ sk->socket = NULL;
+ sk->prot->close(sk, timeout);
+ }
+ return(0);
+}
+
+static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sockaddr_in *addr=(struct sockaddr_in *)uaddr;
+ struct sock *sk=sock->sk;
+ unsigned short snum;
+ int chk_addr_ret;
+
+ /* If the socket has its own bind function then use it. (RAW) */
+ if(sk->prot->bind)
+ return sk->prot->bind(sk, uaddr, addr_len);
+
+ /* Check these errors (active socket, bad address length, double bind). */
+ if ((sk->state != TCP_CLOSE) ||
+ (addr_len < sizeof(struct sockaddr_in)) ||
+ (sk->num != 0))
+ return -EINVAL;
+
+ chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
+ if (addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL &&
+ chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) {
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ /* Superuser may bind to any address to allow transparent proxying. */
+ if(chk_addr_ret != RTN_UNICAST || !capable(CAP_NET_ADMIN))
+#endif
+ return -EADDRNOTAVAIL; /* Source address MUST be ours! */
+ }
+
+ /* We keep a pair of addresses. rcv_saddr is the one
+ * used by hash lookups, and saddr is used for transmit.
+ *
+ * In the BSD API these are the same except where it
+ * would be illegal to use them (multicast/broadcast) in
+ * which case the sending device address is used.
+ */
+ sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;
+ if(chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+ sk->saddr = 0; /* Use device */
+
+ snum = ntohs(addr->sin_port);
+#ifdef CONFIG_IP_MASQUERADE
+ /* The kernel masquerader needs some ports. */
+ if((snum >= PORT_MASQ_BEGIN) && (snum <= PORT_MASQ_END))
+ return -EADDRINUSE;
+#endif
+ if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ return(-EACCES);
+
+ /* Make sure we are allowed to bind here. */
+ if (sk->prot->get_port(sk, snum) != 0)
+ return -EADDRINUSE;
+
+ sk->sport = htons(sk->num);
+ sk->daddr = 0;
+ sk->dport = 0;
+ sk->prot->hash(sk);
+ add_to_prot_sklist(sk);
+ dst_release(sk->dst_cache);
+ sk->dst_cache=NULL;
+ return(0);
+}
+
+int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk=sock->sk;
+ int err;
+
+ if (inet_autobind(sk) != 0)
+ return(-EAGAIN);
+ if (sk->prot->connect == NULL)
+ return(-EOPNOTSUPP);
+ err = sk->prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+ if (err < 0)
+ return(err);
+ return(0);
+}
+
+static void inet_wait_for_connect(struct sock *sk)
+{
+ struct wait_queue wait = { current, NULL };
+
+ add_wait_queue(sk->sleep, &wait);
+ current->state = TASK_INTERRUPTIBLE;
+ while (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) {
+ if (signal_pending(current))
+ break;
+ if (sk->err)
+ break;
+ schedule();
+ current->state = TASK_INTERRUPTIBLE;
+ }
+ current->state = TASK_RUNNING;
+ remove_wait_queue(sk->sleep, &wait);
+}
+
+/*
+ * Connect to a remote host. There is regrettably still a little
+ * TCP 'magic' in here.
+ */
+
+int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk=sock->sk;
+ int err;
+
+ if(sock->state != SS_UNCONNECTED && sock->state != SS_CONNECTING) {
+ if(sock->state == SS_CONNECTED)
+ return -EISCONN;
+ return -EINVAL;
+ }
+
+ if(sock->state == SS_CONNECTING) {
+ /* Note: tcp_connected contains SYN_RECV, which may cause
+ bogus results here. -AK */
+ if(tcp_connected(sk->state)) {
+ sock->state = SS_CONNECTED;
+ return 0;
+ }
+ if (sk->zapped || sk->err)
+ goto sock_error;
+ if (flags & O_NONBLOCK)
+ return -EALREADY;
+ } else {
+ if (sk->prot->connect == NULL)
+ return(-EOPNOTSUPP);
+
+ /* We may need to bind the socket. */
+ if (inet_autobind(sk) != 0)
+ return(-EAGAIN);
+
+ err = sk->prot->connect(sk, uaddr, addr_len);
+ /* Note: there is a theoretical race here when a wakeup
+ occurs before inet_wait_for_connect is entered. In 2.3
+ the wait queue setup should be moved before the low-level
+ connect call. -AK*/
+ if (err < 0)
+ return(err);
+ sock->state = SS_CONNECTING;
+ }
+
+ if (sk->state > TCP_FIN_WAIT2 && sock->state == SS_CONNECTING)
+ goto sock_error;
+
+ if (sk->state != TCP_ESTABLISHED && (flags & O_NONBLOCK))
+ return (-EINPROGRESS);
+
+ if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) {
+ inet_wait_for_connect(sk);
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+ }
+
+ sock->state = SS_CONNECTED;
+ if ((sk->state != TCP_ESTABLISHED) && sk->err)
+ goto sock_error;
+ return(0);
+
+sock_error:
+ /* This is ugly but needed to fix a race in the ICMP error handler */
+ if (sk->zapped && sk->state != TCP_CLOSE) {
+ lock_sock(sk);
+ tcp_set_state(sk, TCP_CLOSE);
+ release_sock(sk);
+ sk->zapped = 0;
+ }
+ sock->state = SS_UNCONNECTED;
+ return sock_error(sk);
+}
+
+/*
+ * Accept a pending connection. The TCP layer now gives BSD semantics.
+ */
+
+int inet_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sock *sk1 = sock->sk, *sk2;
+ struct sock *newsk = newsock->sk;
+ int err = -EINVAL;
+
+ if (sock->state != SS_UNCONNECTED || !(sock->flags & SO_ACCEPTCON))
+ goto do_err;
+
+ err = -EOPNOTSUPP;
+ if (sk1->prot->accept == NULL)
+ goto do_err;
+
+ if((sk2 = sk1->prot->accept(sk1,flags)) == NULL)
+ goto do_sk1_err;
+
+ /*
+ * We've been passed an extra socket.
+ * We need to free it up because the tcp module creates
+ * its own when it accepts one.
+ */
+ sk2->sleep = newsk->sleep;
+
+ newsock->sk = sk2;
+ sk2->socket = newsock;
+ newsk->socket = NULL;
+
+ if (flags & O_NONBLOCK)
+ goto do_half_success;
+
+ if(sk2->state == TCP_ESTABLISHED)
+ goto do_full_success;
+ if(sk2->err > 0)
+ goto do_connect_err;
+ err = -ECONNABORTED;
+ if (sk2->state == TCP_CLOSE)
+ goto do_bad_connection;
+do_full_success:
+ destroy_sock(newsk);
+ newsock->state = SS_CONNECTED;
+ return 0;
+
+do_half_success:
+ destroy_sock(newsk);
+ return(0);
+
+do_connect_err:
+ err = sock_error(sk2);
+do_bad_connection:
+ sk2->sleep = NULL;
+ sk2->socket = NULL;
+ destroy_sock(sk2);
+ newsock->sk = newsk;
+ newsk->socket = newsock;
+ return err;
+
+do_sk1_err:
+ err = sock_error(sk1);
+do_err:
+ return err;
+}
+
+
+/*
+ * This does both peername and sockname.
+ */
+
+static int inet_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+
+ sin->sin_family = AF_INET;
+ if (peer) {
+ if (!tcp_connected(sk->state))
+ return(-ENOTCONN);
+ sin->sin_port = sk->dport;
+ sin->sin_addr.s_addr = sk->daddr;
+ } else {
+ __u32 addr = sk->rcv_saddr;
+ if (!addr)
+ addr = sk->saddr;
+ sin->sin_port = sk->sport;
+ sin->sin_addr.s_addr = addr;
+ }
+ *uaddr_len = sizeof(*sin);
+ return(0);
+}
+
+
+
+int inet_recvmsg(struct socket *sock, struct msghdr *msg, int size,
+ int flags, struct scm_cookie *scm)
+{
+ struct sock *sk = sock->sk;
+ int addr_len = 0;
+ int err;
+
+ if (sock->flags & SO_ACCEPTCON)
+ return(-EINVAL);
+ if (sk->prot->recvmsg == NULL)
+ return(-EOPNOTSUPP);
+ /* We may need to bind the socket. */
+ if (inet_autobind(sk) != 0)
+ return(-EAGAIN);
+ err = sk->prot->recvmsg(sk, msg, size, flags&MSG_DONTWAIT,
+ flags&~MSG_DONTWAIT, &addr_len);
+ if (err >= 0)
+ msg->msg_namelen = addr_len;
+ return err;
+}
+
+
+int inet_sendmsg(struct socket *sock, struct msghdr *msg, int size,
+ struct scm_cookie *scm)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk->shutdown & SEND_SHUTDOWN) {
+ if (!(msg->msg_flags&MSG_NOSIGNAL))
+ send_sig(SIGPIPE, current, 1);
+ return(-EPIPE);
+ }
+ if (sk->prot->sendmsg == NULL)
+ return(-EOPNOTSUPP);
+ if(sk->err)
+ return sock_error(sk);
+
+ /* We may need to bind the socket. */
+ if (inet_autobind(sk) != 0)
+ return -EAGAIN;
+
+ return sk->prot->sendmsg(sk, msg, size);
+}
+
+
+int inet_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+
+ /* This should really check to make sure
+ * the socket is a TCP socket. (WHY AC...)
+ */
+ how++; /* maps 0->1, 1->2, 2->3; this has the advantage of making
+ bit 1 mean receives and bit 2 mean sends. */
+ if ((how & ~SHUTDOWN_MASK) || how==0) /* MAXINT->0 */
+ return(-EINVAL);
+ if (!sk)
+ return(-ENOTCONN);
+ if (sock->state == SS_CONNECTING && sk->state == TCP_ESTABLISHED)
+ sock->state = SS_CONNECTED;
+ if (!tcp_connected(sk->state))
+ return(-ENOTCONN);
+ sk->shutdown |= how;
+ if (sk->prot->shutdown)
+ sk->prot->shutdown(sk, how);
+ /* Wake up anyone sleeping in poll. */
+ sk->state_change(sk);
+ return(0);
+}
+
+
+unsigned int inet_poll(struct file * file, struct socket *sock, poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk->prot->poll == NULL)
+ return(0);
+ return sk->prot->poll(file, sock, wait);
+}
+
+/*
+ * ioctl() calls you can issue on an INET socket. Most of these are
+ * device configuration and stuff and very rarely used. Some ioctls
+ * pass on to the socket itself.
+ *
+ * NOTE: I like the idea of a module for the config stuff, i.e. ifconfig
+ * loads the devconfigure module, which does its configuring and unloads it.
+ * There's a good 20K of config code hanging around the kernel.
+ */
+
+static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ int err;
+ int pid;
+
+ switch(cmd)
+ {
+ case FIOSETOWN:
+ case SIOCSPGRP:
+ err = get_user(pid, (int *) arg);
+ if (err)
+ return err;
+ if (current->pid != pid && current->pgrp != -pid &&
+ !capable(CAP_NET_ADMIN))
+ return -EPERM;
+ sk->proc = pid;
+ return(0);
+ case FIOGETOWN:
+ case SIOCGPGRP:
+ return put_user(sk->proc, (int *)arg);
+ case SIOCGSTAMP:
+ if(sk->stamp.tv_sec==0)
+ return -ENOENT;
+ err = copy_to_user((void *)arg,&sk->stamp,sizeof(struct timeval));
+ if (err)
+ err = -EFAULT;
+ return err;
+ case SIOCADDRT:
+ case SIOCDELRT:
+ case SIOCRTMSG:
+ return(ip_rt_ioctl(cmd,(void *) arg));
+ case SIOCDARP:
+ case SIOCGARP:
+ case SIOCSARP:
+ return(arp_ioctl(cmd,(void *) arg));
+ case SIOCDRARP:
+ case SIOCGRARP:
+ case SIOCSRARP:
+#ifdef CONFIG_KMOD
+ if (rarp_ioctl_hook == NULL)
+ request_module("rarp");
+#endif
+ if (rarp_ioctl_hook != NULL)
+ return(rarp_ioctl_hook(cmd,(void *) arg));
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCSIFPFLAGS:
+ case SIOCGIFPFLAGS:
+ case SIOCSIFFLAGS:
+ return(devinet_ioctl(cmd,(void *) arg));
+ case SIOCGIFBR:
+ case SIOCSIFBR:
+#ifdef CONFIG_BRIDGE
+ return(br_ioctl(cmd,(void *) arg));
+#else
+ return -ENOPKG;
+#endif
+
+ case SIOCADDDLCI:
+ case SIOCDELDLCI:
+#ifdef CONFIG_DLCI
+ return(dlci_ioctl(cmd, (void *) arg));
+#endif
+
+#ifdef CONFIG_DLCI_MODULE
+
+#ifdef CONFIG_KMOD
+ if (dlci_ioctl_hook == NULL)
+ request_module("dlci");
+#endif
+
+ if (dlci_ioctl_hook)
+ return((*dlci_ioctl_hook)(cmd, (void *) arg));
+#endif
+ return -ENOPKG;
+
+ default:
+ if ((cmd >= SIOCDEVPRIVATE) &&
+ (cmd <= (SIOCDEVPRIVATE + 15)))
+ return(dev_ioctl(cmd,(void *) arg));
+
+#ifdef CONFIG_NET_RADIO
+ if((cmd >= SIOCIWFIRST) && (cmd <= SIOCIWLAST))
+ return(dev_ioctl(cmd,(void *) arg));
+#endif
+
+ if (sk->prot->ioctl==NULL || (err=sk->prot->ioctl(sk, cmd, arg))==-ENOIOCTLCMD)
+ return(dev_ioctl(cmd,(void *) arg));
+ return err;
+ }
+ /*NOTREACHED*/
+ return(0);
+}
+
+struct proto_ops inet_stream_ops = {
+ PF_INET,
+
+ sock_no_dup,
+ inet_release,
+ inet_bind,
+ inet_stream_connect,
+ sock_no_socketpair,
+ inet_accept,
+ inet_getname,
+ inet_poll,
+ inet_ioctl,
+ inet_listen,
+ inet_shutdown,
+ inet_setsockopt,
+ inet_getsockopt,
+ sock_no_fcntl,
+ inet_sendmsg,
+ inet_recvmsg
+};
+
+struct proto_ops inet_dgram_ops = {
+ PF_INET,
+
+ sock_no_dup,
+ inet_release,
+ inet_bind,
+ inet_dgram_connect,
+ sock_no_socketpair,
+ sock_no_accept,
+ inet_getname,
+ datagram_poll,
+ inet_ioctl,
+ sock_no_listen,
+ inet_shutdown,
+ inet_setsockopt,
+ inet_getsockopt,
+ sock_no_fcntl,
+ inet_sendmsg,
+ inet_recvmsg
+};
+
+struct net_proto_family inet_family_ops = {
+ PF_INET,
+ inet_create
+};
+
+
+#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_INET_RARP
+static struct proc_dir_entry proc_net_rarp = {
+ PROC_NET_RARP, 4, "rarp",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ rarp_get_info
+};
+#endif /* RARP */
+static struct proc_dir_entry proc_net_raw = {
+ PROC_NET_RAW, 3, "raw",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ raw_get_info
+};
+static struct proc_dir_entry proc_net_netstat = {
+ PROC_NET_NETSTAT, 7, "netstat",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ netstat_get_info
+};
+static struct proc_dir_entry proc_net_snmp = {
+ PROC_NET_SNMP, 4, "snmp",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ snmp_get_info
+};
+static struct proc_dir_entry proc_net_sockstat = {
+ PROC_NET_SOCKSTAT, 8, "sockstat",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ afinet_get_info
+};
+static struct proc_dir_entry proc_net_tcp = {
+ PROC_NET_TCP, 3, "tcp",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ tcp_get_info
+};
+static struct proc_dir_entry proc_net_udp = {
+ PROC_NET_UDP, 3, "udp",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ udp_get_info
+};
+#endif /* CONFIG_PROC_FS */
+
+extern void tcp_init(void);
+extern void tcp_v4_init(struct net_proto_family *);
+
+
+/*
+ * Called by socket.c on kernel startup.
+ */
+
+__initfunc(void inet_proto_init(struct net_proto *pro))
+{
+ struct sk_buff *dummy_skb;
+ struct inet_protocol *p;
+
+ printk(KERN_INFO "NET4: Linux TCP/IP 1.0 for NET4.0\n");
+
+ if (sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb))
+ {
+ printk(KERN_CRIT "inet_proto_init: panic\n");
+ return;
+ }
+
+ /*
+ * Tell SOCKET that we are alive...
+ */
+
+ (void) sock_register(&inet_family_ops);
+
+ /*
+ * Add all the protocols.
+ */
+
+ printk(KERN_INFO "IP Protocols: ");
+ for(p = inet_protocol_base; p != NULL;)
+ {
+ struct inet_protocol *tmp = (struct inet_protocol *) p->next;
+ inet_add_protocol(p);
+ printk("%s%s",p->name,tmp?", ":"\n");
+ p = tmp;
+ }
+
+ /*
+ * Set the ARP module up
+ */
+
+ arp_init();
+
+ /*
+ * Set the IP module up
+ */
+
+ ip_init();
+
+ tcp_v4_init(&inet_family_ops);
+
+ /* Setup TCP slab cache for open requests. */
+ tcp_init();
+
+
+ /*
+ * Set the ICMP layer up
+ */
+
+ icmp_init(&inet_family_ops);
+
+ /* I wish inet_add_protocol had no constructor hook...
+ I had to move IPIP from net/ipv4/protocol.c :-( --ANK
+ */
+#ifdef CONFIG_NET_IPIP
+ ipip_init();
+#endif
+#ifdef CONFIG_NET_IPGRE
+ ipgre_init();
+#endif
+
+ /*
+ * Set the firewalling up
+ */
+#if defined(CONFIG_IP_FIREWALL)
+ ip_fw_init();
+#endif
+
+#ifdef CONFIG_IP_MASQUERADE
+ ip_masq_init();
+#endif
+
+ /*
+ * Initialise the multicast router
+ */
+#if defined(CONFIG_IP_MROUTE)
+ ip_mr_init();
+#endif
+
+#ifdef CONFIG_INET_RARP
+ rarp_ioctl_hook = rarp_ioctl;
+#endif
+ /*
+ * Create all the /proc entries.
+ */
+
+#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_INET_RARP
+ proc_net_register(&proc_net_rarp);
+#endif /* RARP */
+ proc_net_register(&proc_net_raw);
+ proc_net_register(&proc_net_snmp);
+ proc_net_register(&proc_net_netstat);
+ proc_net_register(&proc_net_sockstat);
+ proc_net_register(&proc_net_tcp);
+ proc_net_register(&proc_net_udp);
+#endif /* CONFIG_PROC_FS */
+}
diff --git a/pfinet/linux-src/net/ipv4/arp.c b/pfinet/linux-src/net/ipv4/arp.c
new file mode 100644
index 00000000..27d2f802
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/arp.c
@@ -0,0 +1,1154 @@
+/* linux/net/inet/arp.c
+ *
+ * Version: $Id: arp.c,v 1.77.2.1 1999/06/28 10:39:23 davem Exp $
+ *
+ * Copyright (C) 1994 by Florian La Roche
+ *
+ * This module implements the Address Resolution Protocol ARP (RFC 826),
+ * which is used to convert IP addresses (or in the future maybe other
+ * high-level addresses) into a low-level hardware address (like an Ethernet
+ * address).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Alan Cox : Removed the Ethernet assumptions in
+ * Florian's code
+ * Alan Cox : Fixed some small errors in the ARP
+ * logic
+ * Alan Cox : Allow >4K in /proc
+ * Alan Cox : Make ARP add its own protocol entry
+ * Ross Martin : Rewrote arp_rcv() and arp_get_info()
+ * Stephen Henson : Add AX25 support to arp_get_info()
+ * Alan Cox : Drop data when a device is downed.
+ * Alan Cox : Use init_timer().
+ * Alan Cox : Double lock fixes.
+ * Martin Seine : Move the arphdr structure
+ * to if_arp.h for compatibility
+ * with BSD-based programs.
+ * Andrew Tridgell : Added ARP netmask code and
+ * re-arranged proxy handling.
+ * Alan Cox : Changed to use notifiers.
+ * Niibe Yutaka : Reply for this device or proxies only.
+ * Alan Cox : Don't proxy across hardware types!
+ * Jonathan Naylor : Added support for NET/ROM.
+ * Mike Shaver : RFC1122 checks.
+ * Jonathan Naylor : Only lookup the hardware address for
+ * the correct hardware type.
+ * Germano Caronni : Assorted subtle races.
+ * Craig Schlenter : Don't modify permanent entry
+ * during arp_rcv.
+ * Russ Nelson : Tidied up a few bits.
+ * Alexey Kuznetsov: Major changes to caching and behaviour,
+ * eg intelligent arp probing and
+ * generation
+ * of host down events.
+ * Alan Cox : Missing unlock in device events.
+ * Eckes : ARP ioctl control errors.
+ * Alexey Kuznetsov: Arp free fix.
+ * Manuel Rodriguez: Gratuitous ARP.
+ * Jonathan Layes : Added arpd support through kerneld
+ * message queue (960314)
+ * Mike Shaver : /proc/sys/net/ipv4/arp_* support
+ * Mike McLagan : Routing by source
+ * Stuart Cheshire : Metricom and grat arp fixes
+ * *** FOR 2.1 clean this up ***
+ * Lawrence V. Stefani: (08/12/96) Added FDDI support.
+ * Alan Cox : Took the AP1000 nasty FDDI hack and
+ * folded into the mainstream FDDI code.
+ * Ack spit, Linus how did you allow that
+ * one in...
+ * Jes Sorensen : Make FDDI work again in 2.1.x and
+ * clean up the APFDDI & gen. FDDI bits.
+ * Alexey Kuznetsov: new arp state machine;
+ * now it is in net/core/neighbour.c.
+ */
+
+/* RFC1122 Status:
+ 2.3.2.1 (ARP Cache Validation):
+ MUST provide mechanism to flush stale cache entries (OK)
+ SHOULD be able to configure cache timeout (OK)
+ MUST throttle ARP retransmits (OK)
+ 2.3.2.2 (ARP Packet Queue):
+ SHOULD save at least one packet from each "conversation" with an
+ unresolved IP address. (OK)
+ 950727 -- MS
+*/
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/config.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/fddidevice.h>
+#include <linux/if_arp.h>
+#include <linux/trdevice.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#include <net/ax25.h>
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+#include <net/netrom.h>
+#endif
+#endif
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+/*
+ * Interface to generic neighbour cache.
+ */
+static int arp_constructor(struct neighbour *neigh);
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
+static void parp_redo(struct sk_buff *skb);
+
+static struct neigh_ops arp_generic_ops =
+{
+ AF_INET,
+ NULL,
+ arp_solicit,
+ arp_error_report,
+ neigh_resolve_output,
+ neigh_connected_output,
+ dev_queue_xmit,
+ dev_queue_xmit
+};
+
+static struct neigh_ops arp_hh_ops =
+{
+ AF_INET,
+ NULL,
+ arp_solicit,
+ arp_error_report,
+ neigh_resolve_output,
+ neigh_resolve_output,
+ dev_queue_xmit,
+ dev_queue_xmit
+};
+
+static struct neigh_ops arp_direct_ops =
+{
+ AF_INET,
+ NULL,
+ NULL,
+ NULL,
+ dev_queue_xmit,
+ dev_queue_xmit,
+ dev_queue_xmit,
+ dev_queue_xmit
+};
+
+struct neigh_ops arp_broken_ops =
+{
+ AF_INET,
+ NULL,
+ arp_solicit,
+ arp_error_report,
+ neigh_compat_output,
+ neigh_compat_output,
+ dev_queue_xmit,
+ dev_queue_xmit,
+};
+
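+/* A reading aid, not authoritative: the positional initializers below
+ * follow the declaration order of struct neigh_table and the embedded
+ * struct neigh_parms in this tree's <net/neighbour.h>. On that
+ * assumption, the parms block holds the ARP timing knobs in jiffies
+ * (base_reachable_time 30s, retrans_time 1s, gc_staletime 60s, ...),
+ * then the queue length, the ucast/app/mcast probe counts, and the
+ * anycast/proxy delays, proxy queue length and locktime; the trailing
+ * four table values are gc_interval (30s) and the gc_thresh1/2/3
+ * watermarks 128/512/1024.
+ */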
+struct neigh_table arp_tbl =
+{
+ NULL,
+ AF_INET,
+ sizeof(struct neighbour) + 4,
+ 4,
+ arp_constructor,
+ NULL,
+ NULL,
+ parp_redo,
+ { NULL, NULL, &arp_tbl, 0, NULL, NULL,
+ 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 64, 1*HZ },
+ 30*HZ, 128, 512, 1024,
+};
+
+int arp_mc_map(u32 addr, u8 *haddr, struct device *dev, int dir)
+{
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ case ARPHRD_IEEE802:
+ case ARPHRD_FDDI:
+ ip_eth_mc_map(addr, haddr);
+ return 0;
+ default:
+ if (dir) {
+ memcpy(haddr, dev->broadcast, dev->addr_len);
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+
+
+static int arp_constructor(struct neighbour *neigh)
+{
+ u32 addr = *(u32*)neigh->primary_key;
+ struct device *dev = neigh->dev;
+ struct in_device *in_dev = dev->ip_ptr;
+
+ if (in_dev == NULL)
+ return -EINVAL;
+
+ neigh->type = inet_addr_type(addr);
+ if (in_dev->arp_parms)
+ neigh->parms = in_dev->arp_parms;
+
+ if (dev->hard_header == NULL) {
+ neigh->nud_state = NUD_NOARP;
+ neigh->ops = &arp_direct_ops;
+ neigh->output = neigh->ops->queue_xmit;
+ } else {
+		/* Good devices (checked only by reading sources; only
+		   Ethernet is actually tested)
+
+ ARPHRD_ETHER: (ethernet, apfddi)
+ ARPHRD_FDDI: (fddi)
+ ARPHRD_IEEE802: (tr)
+ ARPHRD_METRICOM: (strip)
+ ARPHRD_ARCNET:
+ etc. etc. etc.
+
+	   ARPHRD_IPDDP will also work, if its author repairs it.
+	   I have not done it myself, because this driver does not work
+	   even in the old paradigm.
+ */
+
+#if 1
+	/* So... these "amateur" devices are hopeless.
+	   The only thing I can say now:
+	   it is very sad that we need to keep ugly obsolete
+	   code to make them happy.
+
+	   They should be moved to a more reasonable state: right now
+	   they use rebuild_header INSTEAD OF hard_start_xmit!!!
+	   Besides that, they are sort of out of date
+	   (a lot of redundant clones/copies, useless in 2.1),
+	   and I wonder why people believe that they work at all.
+	*/
+ switch (dev->type) {
+ default:
+ break;
+ case ARPHRD_ROSE:
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ case ARPHRD_AX25:
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+ case ARPHRD_NETROM:
+#endif
+ neigh->ops = &arp_broken_ops;
+ neigh->output = neigh->ops->output;
+ return 0;
+#endif
+ }
+#endif
+ if (neigh->type == RTN_MULTICAST) {
+ neigh->nud_state = NUD_NOARP;
+ arp_mc_map(addr, neigh->ha, dev, 1);
+ } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+ neigh->nud_state = NUD_NOARP;
+ memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
+ } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
+ neigh->nud_state = NUD_NOARP;
+ memcpy(neigh->ha, dev->broadcast, dev->addr_len);
+ }
+ if (dev->hard_header_cache)
+ neigh->ops = &arp_hh_ops;
+ else
+ neigh->ops = &arp_generic_ops;
+ if (neigh->nud_state&NUD_VALID)
+ neigh->output = neigh->ops->connected_output;
+ else
+ neigh->output = neigh->ops->output;
+ }
+
+ return 0;
+}
+
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
+{
+ dst_link_failure(skb);
+ kfree_skb(skb);
+}
+
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
+{
+ u32 saddr;
+ u8 *dst_ha = NULL;
+ struct device *dev = neigh->dev;
+ u32 target = *(u32*)neigh->primary_key;
+ int probes = neigh->probes;
+
+ if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL)
+ saddr = skb->nh.iph->saddr;
+ else
+ saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
+
+ if ((probes -= neigh->parms->ucast_probes) < 0) {
+ if (!(neigh->nud_state&NUD_VALID))
+ printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
+ dst_ha = neigh->ha;
+ } else if ((probes -= neigh->parms->app_probes) < 0) {
+#ifdef CONFIG_ARPD
+ neigh_app_ns(neigh);
+#endif
+ return;
+ }
+
+ arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
+ dst_ha, dev->dev_addr, NULL);
+}
+
+/* OBSOLETE FUNCTIONS */
+
+/*
+ * Find an arp mapping in the cache. If not found, post a request.
+ *
+ * This is a very UGLY routine: it DOES NOT use skb->dst->neighbour,
+ * even if one exists. It assumes that skb->dev was mangled
+ * by a virtual device (eql, shaper). Nothing but broken devices
+ * is allowed to use this function; it is scheduled to be removed. --ANK
+ */
+
+static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, struct device * dev)
+{
+ switch (addr_hint) {
+ case RTN_LOCAL:
+ printk(KERN_DEBUG "ARP: arp called for own IP address\n");
+ memcpy(haddr, dev->dev_addr, dev->addr_len);
+ return 1;
+ case RTN_MULTICAST:
+ arp_mc_map(paddr, haddr, dev, 1);
+ return 1;
+ case RTN_BROADCAST:
+ memcpy(haddr, dev->broadcast, dev->addr_len);
+ return 1;
+ }
+ return 0;
+}
+
+
+int arp_find(unsigned char *haddr, struct sk_buff *skb)
+{
+ struct device *dev = skb->dev;
+ u32 paddr;
+ struct neighbour *n;
+
+ if (!skb->dst) {
+ printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
+ kfree_skb(skb);
+ return 1;
+ }
+
+ paddr = ((struct rtable*)skb->dst)->rt_gateway;
+
+ if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev))
+ return 0;
+
+ start_bh_atomic();
+ n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
+
+ if (n) {
+ n->used = jiffies;
+ if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
+ memcpy(haddr, n->ha, dev->addr_len);
+ neigh_release(n);
+ end_bh_atomic();
+ return 0;
+ }
+ neigh_release(n);
+ } else
+ kfree_skb(skb);
+ end_bh_atomic();
+ return 1;
+}
+
+/* END OF OBSOLETE FUNCTIONS */
+
+/*
+ * Note: requires bh_atomic locking.
+ */
+int arp_bind_neighbour(struct dst_entry *dst)
+{
+ struct device *dev = dst->dev;
+
+ if (dev == NULL)
+ return 0;
+ if (dst->neighbour == NULL) {
+ u32 nexthop = ((struct rtable*)dst)->rt_gateway;
+ if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
+ nexthop = 0;
+ dst->neighbour = __neigh_lookup(&arp_tbl, &nexthop, dev, 1);
+ }
+ return (dst->neighbour != NULL);
+}
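+
+/* A minimal sketch of the expected calling pattern (hypothetical caller;
+ * the real user is the routing cache). Per the note above, the
+ * bh_atomic bracket is the caller's responsibility:
+ *
+ *	start_bh_atomic();
+ *	if (!arp_bind_neighbour(&rt->u.dst))
+ *		... treat the route as unusable and drop it ...
+ *	end_bh_atomic();
+ */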
+
+/*
+ * Interface to link layer: send routine and receive handler.
+ */
+
+/*
+ * Create and send an arp packet. If (dest_hw == NULL), we create a broadcast
+ * message.
+ */
+
+void arp_send(int type, int ptype, u32 dest_ip,
+ struct device *dev, u32 src_ip,
+ unsigned char *dest_hw, unsigned char *src_hw,
+ unsigned char *target_hw)
+{
+ struct sk_buff *skb;
+ struct arphdr *arp;
+ unsigned char *arp_ptr;
+
+ /*
+ * No arp on this interface.
+ */
+
+ if (dev->flags&IFF_NOARP)
+ return;
+
+ /*
+ * Allocate a buffer
+ */
+
+ skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4)
+ + dev->hard_header_len + 15, GFP_ATOMIC);
+ if (skb == NULL)
+ return;
+
+ skb_reserve(skb, (dev->hard_header_len+15)&~15);
+ skb->nh.raw = skb->data;
+ arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4));
+ skb->dev = dev;
+ skb->protocol = __constant_htons (ETH_P_ARP);
+ if (src_hw == NULL)
+ src_hw = dev->dev_addr;
+ if (dest_hw == NULL)
+ dest_hw = dev->broadcast;
+
+ /*
+ * Fill the device header for the ARP frame
+ */
+ dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len);
+
+ /*
+ * Fill out the arp protocol part.
+ *
+ * The arp hardware type should match the device type, except for FDDI,
+ * which (according to RFC 1390) should always equal 1 (Ethernet).
+ */
+ /*
+ * Exceptions everywhere. AX.25 uses the AX.25 PID value not the
+ * DIX code for the protocol. Make these device structure fields.
+ */
+ switch (dev->type) {
+ default:
+ arp->ar_hrd = htons(dev->type);
+ arp->ar_pro = __constant_htons(ETH_P_IP);
+ break;
+
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ case ARPHRD_AX25:
+ arp->ar_hrd = __constant_htons(ARPHRD_AX25);
+ arp->ar_pro = __constant_htons(AX25_P_IP);
+ break;
+
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+ case ARPHRD_NETROM:
+ arp->ar_hrd = __constant_htons(ARPHRD_NETROM);
+ arp->ar_pro = __constant_htons(AX25_P_IP);
+ break;
+#endif
+#endif
+
+#ifdef CONFIG_FDDI
+ case ARPHRD_FDDI:
+ arp->ar_hrd = __constant_htons(ARPHRD_ETHER);
+ arp->ar_pro = __constant_htons(ETH_P_IP);
+ break;
+#endif
+ }
+
+ arp->ar_hln = dev->addr_len;
+ arp->ar_pln = 4;
+ arp->ar_op = htons(type);
+
+ arp_ptr=(unsigned char *)(arp+1);
+
+ memcpy(arp_ptr, src_hw, dev->addr_len);
+ arp_ptr+=dev->addr_len;
+ memcpy(arp_ptr, &src_ip,4);
+ arp_ptr+=4;
+ if (target_hw != NULL)
+ memcpy(arp_ptr, target_hw, dev->addr_len);
+ else
+ memset(arp_ptr, 0, dev->addr_len);
+ arp_ptr+=dev->addr_len;
+ memcpy(arp_ptr, &dest_ip, 4);
+ skb->dev = dev;
+
+ dev_queue_xmit(skb);
+}
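+
+/* For orientation, the payload assembled above has the classic RFC 826
+ * layout; on Ethernet (ar_hln=6, ar_pln=4) it is 28 bytes:
+ *
+ *	struct arphdr  (8 bytes: ar_hrd, ar_pro, ar_hln, ar_pln, ar_op)
+ *	sender hardware address  (ar_hln bytes)
+ *	sender IP address        (4 bytes)
+ *	target hardware address  (ar_hln bytes; zeroed when target_hw==NULL)
+ *	target IP address        (4 bytes)
+ */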
+
+static void parp_redo(struct sk_buff *skb)
+{
+ arp_rcv(skb, skb->dev, NULL);
+}
+
+/*
+ * Receive an arp request by the device layer.
+ */
+
+int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
+{
+ struct arphdr *arp = skb->nh.arph;
+ unsigned char *arp_ptr= (unsigned char *)(arp+1);
+ struct rtable *rt;
+ unsigned char *sha, *tha;
+ u32 sip, tip;
+ u16 dev_type = dev->type;
+ int addr_type;
+ struct in_device *in_dev = dev->ip_ptr;
+ struct neighbour *n;
+
+/*
+ * The hardware length of the packet should match the hardware length
+ * of the device. Similarly, the hardware types should match. The
+ * device should be ARP-able. Also, if pln is not 4, then the lookup
+ * is not from an IP number. We can't currently handle this, so toss
+ * it.
+ */
+ if (in_dev == NULL ||
+ arp->ar_hln != dev->addr_len ||
+ dev->flags & IFF_NOARP ||
+ skb->pkt_type == PACKET_OTHERHOST ||
+ skb->pkt_type == PACKET_LOOPBACK ||
+ arp->ar_pln != 4)
+ goto out;
+
+ switch (dev_type) {
+ default:
+ if (arp->ar_pro != __constant_htons(ETH_P_IP))
+ goto out;
+ if (htons(dev_type) != arp->ar_hrd)
+ goto out;
+ break;
+#ifdef CONFIG_NET_ETHERNET
+ case ARPHRD_ETHER:
+ /*
+ * ETHERNET devices will accept ARP hardware types of either
+ * 1 (Ethernet) or 6 (IEEE 802.2).
+ */
+ if (arp->ar_hrd != __constant_htons(ARPHRD_ETHER) &&
+ arp->ar_hrd != __constant_htons(ARPHRD_IEEE802))
+ goto out;
+ if (arp->ar_pro != __constant_htons(ETH_P_IP))
+ goto out;
+ break;
+#endif
+#ifdef CONFIG_FDDI
+ case ARPHRD_FDDI:
+ /*
+ * According to RFC 1390, FDDI devices should accept ARP hardware types
+ * of 1 (Ethernet). However, to be more robust, we'll accept hardware
+ * types of either 1 (Ethernet) or 6 (IEEE 802.2).
+ */
+ if (arp->ar_hrd != __constant_htons(ARPHRD_ETHER) &&
+ arp->ar_hrd != __constant_htons(ARPHRD_IEEE802))
+ goto out;
+ if (arp->ar_pro != __constant_htons(ETH_P_IP))
+ goto out;
+ break;
+#endif
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ case ARPHRD_AX25:
+ if (arp->ar_pro != __constant_htons(AX25_P_IP))
+ goto out;
+ if (arp->ar_hrd != __constant_htons(ARPHRD_AX25))
+ goto out;
+ break;
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+ case ARPHRD_NETROM:
+ if (arp->ar_pro != __constant_htons(AX25_P_IP))
+ goto out;
+ if (arp->ar_hrd != __constant_htons(ARPHRD_NETROM))
+ goto out;
+ break;
+#endif
+#endif
+ }
+
+	/* Understand only these message types */
+
+ if (arp->ar_op != __constant_htons(ARPOP_REPLY) &&
+ arp->ar_op != __constant_htons(ARPOP_REQUEST))
+ goto out;
+
+/*
+ * Extract fields
+ */
+ sha=arp_ptr;
+ arp_ptr += dev->addr_len;
+ memcpy(&sip, arp_ptr, 4);
+ arp_ptr += 4;
+ tha=arp_ptr;
+ arp_ptr += dev->addr_len;
+ memcpy(&tip, arp_ptr, 4);
+/*
+ * Check for bad requests for 127.x.x.x and requests for multicast
+ * addresses. If this is one such, delete it.
+ */
+ if (LOOPBACK(tip) || MULTICAST(tip))
+ goto out;
+
+/*
+ * Process entry. The idea here is we want to send a reply if it is a
+ * request for us or if it is a request for someone else that we hold
+ * a proxy for. We want to add an entry to our cache if it is a reply
+ * to us or if it is a request for our address.
+ * (The assumption for this last is that if someone is requesting our
+ * address, they are probably intending to talk to us, so it saves time
+ * if we cache their address. Their address is also probably not in
+ * our cache, since ours is not in their cache.)
+ *
+ * Putting this another way, we only care about replies if they are to
+ * us, in which case we add them to the cache. For requests, we care
+ * about those for us and those for our proxies. We reply to both,
+ * and in the case of requests for us we add the requester to the arp
+ * cache.
+ */
+
+ /* Special case: IPv4 duplicate address detection packet (RFC2131) */
+ if (sip == 0) {
+ if (arp->ar_op == __constant_htons(ARPOP_REQUEST) &&
+ inet_addr_type(tip) == RTN_LOCAL)
+ arp_send(ARPOP_REPLY,ETH_P_ARP,tip,dev,tip,sha,dev->dev_addr,dev->dev_addr);
+ goto out;
+ }
+
+ if (arp->ar_op == __constant_htons(ARPOP_REQUEST) &&
+ ip_route_input(skb, tip, sip, 0, dev) == 0) {
+
+ rt = (struct rtable*)skb->dst;
+ addr_type = rt->rt_type;
+
+ if (addr_type == RTN_LOCAL) {
+ n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+ if (n) {
+ arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+ neigh_release(n);
+ }
+ goto out;
+ } else if (IN_DEV_FORWARD(in_dev)) {
+ if ((rt->rt_flags&RTCF_DNAT) ||
+ (addr_type == RTN_UNICAST && rt->u.dst.dev != dev &&
+ (IN_DEV_PROXY_ARP(in_dev) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) {
+ n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+ neigh_release(n);
+
+ if (skb->stamp.tv_sec == 0 ||
+ skb->pkt_type == PACKET_HOST ||
+ in_dev->arp_parms->proxy_delay == 0) {
+ arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+ } else {
+ pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
+ return 0;
+ }
+ goto out;
+ }
+ }
+ }
+
+ /* Update our ARP tables */
+
+ n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
+
+#ifdef CONFIG_IP_ACCEPT_UNSOLICITED_ARP
+ /* Unsolicited ARP is not accepted by default.
+	   It is possible that this option should be enabled for some
+	   devices (strip is a candidate).
+	 */
+ if (n == NULL &&
+ arp->ar_op == __constant_htons(ARPOP_REPLY) &&
+ inet_addr_type(sip) == RTN_UNICAST)
+ n = __neigh_lookup(&arp_tbl, &sip, dev, -1);
+#endif
+
+ if (n) {
+ int state = NUD_REACHABLE;
+ int override = 0;
+
+		/* If several different ARP replies follow back-to-back,
+		   use the FIRST one. This can happen if several proxy
+		   agents are active. Taking the first reply prevents
+		   ARP thrashing and chooses the fastest router.
+ */
+ if (jiffies - n->updated >= n->parms->locktime)
+ override = 1;
+
+ /* Broadcast replies and request packets
+ do not assert neighbour reachability.
+ */
+ if (arp->ar_op != __constant_htons(ARPOP_REPLY) ||
+ skb->pkt_type != PACKET_HOST)
+ state = NUD_STALE;
+ neigh_update(n, sha, state, override, 1);
+ neigh_release(n);
+ }
+
+out:
+ kfree_skb(skb);
+ return 0;
+}
+
+
+
+/*
+ * User level interface (ioctl, /proc)
+ */
+
+/*
+ * Set (create) an ARP cache entry.
+ */
+
+int arp_req_set(struct arpreq *r, struct device * dev)
+{
+ u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+ struct neighbour *neigh;
+ int err;
+
+ if (r->arp_flags&ATF_PUBL) {
+ u32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr;
+ if (mask && mask != 0xFFFFFFFF)
+ return -EINVAL;
+ if (!dev && (r->arp_flags & ATF_COM)) {
+ dev = dev_getbyhwaddr(r->arp_ha.sa_family, r->arp_ha.sa_data);
+ if (!dev)
+ return -ENODEV;
+ }
+ if (mask) {
+ if (pneigh_lookup(&arp_tbl, &ip, dev, 1) == NULL)
+ return -ENOBUFS;
+ return 0;
+ }
+ if (dev == NULL) {
+ ipv4_devconf.proxy_arp = 1;
+ return 0;
+ }
+ if (dev->ip_ptr) {
+ ((struct in_device*)dev->ip_ptr)->cnf.proxy_arp = 1;
+ return 0;
+ }
+ return -ENXIO;
+ }
+
+ if (r->arp_flags & ATF_PERM)
+ r->arp_flags |= ATF_COM;
+ if (dev == NULL) {
+ struct rtable * rt;
+ if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0)) != 0)
+ return err;
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ if (!dev)
+ return -EINVAL;
+ }
+ if (r->arp_ha.sa_family != dev->type)
+ return -EINVAL;
+
+ err = -ENOBUFS;
+ start_bh_atomic();
+ neigh = __neigh_lookup(&arp_tbl, &ip, dev, 1);
+ if (neigh) {
+ unsigned state = NUD_STALE;
+ if (r->arp_flags & ATF_PERM)
+ state = NUD_PERMANENT;
+ err = neigh_update(neigh, (r->arp_flags&ATF_COM) ?
+ r->arp_ha.sa_data : NULL, state, 1, 0);
+ neigh_release(neigh);
+ }
+ end_bh_atomic();
+ return err;
+}
+
+static unsigned arp_state_to_flags(struct neighbour *neigh)
+{
+ unsigned flags = 0;
+ if (neigh->nud_state&NUD_PERMANENT)
+ flags = ATF_PERM|ATF_COM;
+ else if (neigh->nud_state&NUD_VALID)
+ flags = ATF_COM;
+ return flags;
+}
+
+/*
+ * Get an ARP cache entry.
+ */
+
+static int arp_req_get(struct arpreq *r, struct device *dev)
+{
+ u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+ struct neighbour *neigh;
+ int err = -ENXIO;
+
+ start_bh_atomic();
+ neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0);
+ if (neigh) {
+ memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
+ r->arp_ha.sa_family = dev->type;
+ strncpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+ r->arp_flags = arp_state_to_flags(neigh);
+ neigh_release(neigh);
+ err = 0;
+ }
+ end_bh_atomic();
+ return err;
+}
+
+int arp_req_delete(struct arpreq *r, struct device * dev)
+{
+ int err;
+ u32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+ struct neighbour *neigh;
+
+ if (r->arp_flags & ATF_PUBL) {
+ u32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr;
+ if (mask == 0xFFFFFFFF)
+ return pneigh_delete(&arp_tbl, &ip, dev);
+ if (mask == 0) {
+ if (dev == NULL) {
+ ipv4_devconf.proxy_arp = 0;
+ return 0;
+ }
+ if (dev->ip_ptr) {
+ ((struct in_device*)dev->ip_ptr)->cnf.proxy_arp = 0;
+ return 0;
+ }
+ return -ENXIO;
+ }
+ return -EINVAL;
+ }
+
+ if (dev == NULL) {
+ struct rtable * rt;
+ if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0)) != 0)
+ return err;
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ if (!dev)
+ return -EINVAL;
+ }
+ err = -ENXIO;
+ start_bh_atomic();
+ neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0);
+ if (neigh) {
+ if (neigh->nud_state&~NUD_NOARP)
+ err = neigh_update(neigh, NULL, NUD_FAILED, 1, 0);
+ neigh_release(neigh);
+ }
+ end_bh_atomic();
+ return err;
+}
+
+/*
+ * Handle an ARP layer I/O control request.
+ */
+
+int arp_ioctl(unsigned int cmd, void *arg)
+{
+ int err;
+ struct arpreq r;
+ struct device * dev = NULL;
+
+ switch(cmd) {
+ case SIOCDARP:
+ case SIOCSARP:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ case SIOCGARP:
+ err = copy_from_user(&r, arg, sizeof(struct arpreq));
+ if (err)
+ return -EFAULT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (r.arp_pa.sa_family != AF_INET)
+ return -EPFNOSUPPORT;
+
+ if (!(r.arp_flags & ATF_PUBL) &&
+ (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB)))
+ return -EINVAL;
+ if (!(r.arp_flags & ATF_NETMASK))
+ ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr=__constant_htonl(0xFFFFFFFFUL);
+
+ rtnl_lock();
+ if (r.arp_dev[0]) {
+ err = -ENODEV;
+ if ((dev = dev_get(r.arp_dev)) == NULL)
+ goto out;
+
+		/* Mmmm... this is wrong: ARPHRD_NETROM==0, so an unset
+		   sa_family cannot be told apart from NET/ROM */
+ if (!r.arp_ha.sa_family)
+ r.arp_ha.sa_family = dev->type;
+ err = -EINVAL;
+ if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
+ goto out;
+ } else if (cmd == SIOCGARP) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ switch(cmd) {
+ case SIOCDARP:
+ err = arp_req_delete(&r, dev);
+ break;
+ case SIOCSARP:
+ err = arp_req_set(&r, dev);
+ break;
+ case SIOCGARP:
+ err = arp_req_get(&r, dev);
+ if (!err && copy_to_user(arg, &r, sizeof(r)))
+ err = -EFAULT;
+ break;
+ }
+out:
+ rtnl_unlock();
+ return err;
+}
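+
+/* A hedged user-space sketch (not part of this file) of how these
+ * requests are issued, in the style of arp(8) adding a permanent entry;
+ * "eth0" and the MAC are illustrative:
+ *
+ *	struct arpreq r;
+ *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
+ *	memset(&r, 0, sizeof(r));
+ *	((struct sockaddr_in *)&r.arp_pa)->sin_family = AF_INET;
+ *	((struct sockaddr_in *)&r.arp_pa)->sin_addr.s_addr = inet_addr("10.0.0.2");
+ *	r.arp_ha.sa_family = ARPHRD_ETHER;
+ *	memcpy(r.arp_ha.sa_data, mac, 6);	(a 6-byte MAC, hypothetical)
+ *	r.arp_flags = ATF_PERM | ATF_COM;
+ *	strcpy(r.arp_dev, "eth0");
+ *	ioctl(fd, SIOCSARP, &r);
+ */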
+
+/*
+ * Write the contents of the ARP cache to a PROCfs file.
+ */
+#ifdef CONFIG_PROC_FS
+
+#define HBUFFERLEN 30
+
+int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+ int len=0;
+ off_t pos=0;
+ int size;
+ char hbuffer[HBUFFERLEN];
+ int i,j,k;
+ const char hexbuf[] = "0123456789ABCDEF";
+
+ size = sprintf(buffer,"IP address HW type Flags HW address Mask Device\n");
+
+ pos+=size;
+ len+=size;
+
+ neigh_table_lock(&arp_tbl);
+
+ for(i=0; i<=NEIGH_HASHMASK; i++) {
+ struct neighbour *n;
+ for (n=arp_tbl.hash_buckets[i]; n; n=n->next) {
+ struct device *dev = n->dev;
+ int hatype = dev->type;
+
+			/* Do not confuse users of "arp -a" with magic entries */
+ if (!(n->nud_state&~NUD_NOARP))
+ continue;
+
+			/* I would take great pleasure in deleting
+			   this ugly code. Let's output it in hexadecimal format;
+			   the "arp" utility will eventually be repaired --ANK
+ */
+#if 1 /* UGLY CODE */
+/*
+ * Convert hardware address to XX:XX:XX:XX ... form.
+ */
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
+ strcpy(hbuffer,ax2asc((ax25_address *)n->ha));
+ else {
+#endif
+ for (k=0,j=0;k<HBUFFERLEN-3 && j<dev->addr_len;j++) {
+ hbuffer[k++]=hexbuf[(n->ha[j]>>4)&15 ];
+ hbuffer[k++]=hexbuf[n->ha[j]&15 ];
+ hbuffer[k++]=':';
+ }
+ hbuffer[--k]=0;
+
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ }
+#endif
+#else
+			if ((n->nud_state&NUD_VALID) && dev->addr_len) {
+				int j;
+				for (j=0; j < dev->addr_len; j++)
+					sprintf(hbuffer+2*j, "%02x", n->ha[j]);
+			} else
+				sprintf(hbuffer, "0");
+#endif
+
+ size = sprintf(buffer+len,
+ "%-17s0x%-10x0x%-10x%s",
+ in_ntoa(*(u32*)n->primary_key),
+ hatype,
+ arp_state_to_flags(n),
+ hbuffer);
+ size += sprintf(buffer+len+size,
+ " %-17s %s\n",
+ "*", dev->name);
+
+ len += size;
+ pos += size;
+
+ if (pos <= offset)
+ len=0;
+ if (pos >= offset+length)
+ goto done;
+ }
+ }
+
+ for (i=0; i<=PNEIGH_HASHMASK; i++) {
+ struct pneigh_entry *n;
+ for (n=arp_tbl.phash_buckets[i]; n; n=n->next) {
+ struct device *dev = n->dev;
+ int hatype = dev ? dev->type : 0;
+
+ size = sprintf(buffer+len,
+ "%-17s0x%-10x0x%-10x%s",
+ in_ntoa(*(u32*)n->key),
+ hatype,
+ ATF_PUBL|ATF_PERM,
+ "00:00:00:00:00:00");
+ size += sprintf(buffer+len+size,
+ " %-17s %s\n",
+ "*", dev ? dev->name : "*");
+
+ len += size;
+ pos += size;
+
+ if (pos <= offset)
+ len=0;
+ if (pos >= offset+length)
+ goto done;
+ }
+ }
+
+done:
+ neigh_table_unlock(&arp_tbl);
+
+ *start = buffer+len-(pos-offset); /* Start of wanted data */
+ len = pos-offset; /* Start slop */
+ if (len>length)
+ len = length; /* Ending slop */
+ if (len<0)
+ len = 0;
+ return len;
+}
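+
+/* The offset/length dance above follows the classic get_info contract:
+ * the proc layer asks for at most `length' bytes at file position
+ * `offset', but the table is regenerated from scratch on every call.
+ * So we count generated bytes in `pos', discard output until pos
+ * passes offset, point *start at where the wanted window begins inside
+ * `buffer', and return how many bytes of that window are valid
+ * (a sketch of the invariant: buffer + len - (pos - offset) is the
+ * first wanted byte).
+ */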
+#endif
+
+/* Note that this handler is not on the notifier chain.
+   It is necessary that this routine be called only after the route cache
+   has been flushed.
+ */
+void arp_ifdown(struct device *dev)
+{
+ neigh_ifdown(&arp_tbl, dev);
+}
+
+
+/*
+ * Called once on startup.
+ */
+
+static struct packet_type arp_packet_type =
+{
+ __constant_htons(ETH_P_ARP),
+ NULL, /* All devices */
+ arp_rcv,
+ NULL,
+ NULL
+};
+
+#ifdef CONFIG_PROC_FS
+static struct proc_dir_entry proc_net_arp = {
+ PROC_NET_ARP, 3, "arp",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ arp_get_info
+};
+#endif
+
+__initfunc(void arp_init (void))
+{
+ neigh_table_init(&arp_tbl);
+
+ dev_add_pack(&arp_packet_type);
+
+#ifdef CONFIG_PROC_FS
+ proc_net_register(&proc_net_arp);
+#endif
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4, NET_IPV4_NEIGH, "ipv4");
+#endif
+}
+
+
+#ifdef CONFIG_AX25_MODULE
+
+/*
+ * ax25 -> ASCII conversion
+ */
+char *ax2asc(ax25_address *a)
+{
+ static char buf[11];
+ char c, *s;
+ int n;
+
+ for (n = 0, s = buf; n < 6; n++) {
+ c = (a->ax25_call[n] >> 1) & 0x7F;
+
+ if (c != ' ') *s++ = c;
+ }
+
+ *s++ = '-';
+
+ if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) {
+ *s++ = '1';
+ n -= 10;
+ }
+
+ *s++ = n + '0';
+ *s++ = '\0';
+
+ if (*buf == '\0' || *buf == '-')
+ return "*";
+
+ return buf;
+
+}
+
+#endif
diff --git a/pfinet/linux-src/net/ipv4/devinet.c b/pfinet/linux-src/net/ipv4/devinet.c
new file mode 100644
index 00000000..a50ee3bd
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/devinet.c
@@ -0,0 +1,1034 @@
+/*
+ * NET3 IP device support routines.
+ *
+ * Version: $Id: devinet.c,v 1.28.2.2 1999/08/07 10:56:18 davem Exp $
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Derived from the IP parts of dev.c 1.0.19
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *
+ * Additional Authors:
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * Alexey Kuznetsov: pa_* fields are replaced with ifaddr lists.
+ * Cyrus Durgin: updated for kmod
+ */
+
+#include <linux/config.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+
+struct ipv4_devconf ipv4_devconf = { 1, 1, 1, 1, 0, };
+static struct ipv4_devconf ipv4_devconf_dflt = { 1, 1, 1, 1, 1, };
+
+#ifdef CONFIG_RTNETLINK
+static void rtmsg_ifa(int event, struct in_ifaddr *);
+#else
+#define rtmsg_ifa(a,b) do { } while(0)
+#endif
+
+static struct notifier_block *inetaddr_chain;
+static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy);
+#ifdef CONFIG_SYSCTL
+static void devinet_sysctl_register(struct in_device *in_dev, struct ipv4_devconf *p);
+static void devinet_sysctl_unregister(struct ipv4_devconf *p);
+#endif
+
+int inet_ifa_count;
+int inet_dev_count;
+
+static struct in_ifaddr * inet_alloc_ifa(void)
+{
+ struct in_ifaddr *ifa;
+
+ ifa = kmalloc(sizeof(*ifa), GFP_KERNEL);
+ if (ifa) {
+ memset(ifa, 0, sizeof(*ifa));
+ inet_ifa_count++;
+ }
+
+ return ifa;
+}
+
+static __inline__ void inet_free_ifa(struct in_ifaddr *ifa)
+{
+ kfree_s(ifa, sizeof(*ifa));
+ inet_ifa_count--;
+}
+
+struct in_device *inetdev_init(struct device *dev)
+{
+ struct in_device *in_dev;
+
+ if (dev->mtu < 68)
+ return NULL;
+
+ in_dev = kmalloc(sizeof(*in_dev), GFP_KERNEL);
+ if (!in_dev)
+ return NULL;
+ inet_dev_count++;
+ memset(in_dev, 0, sizeof(*in_dev));
+ memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf));
+ in_dev->cnf.sysctl = NULL;
+ in_dev->dev = dev;
+ if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL) {
+ kfree(in_dev);
+ return NULL;
+ }
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4, NET_IPV4_NEIGH, "ipv4");
+#endif
+ dev->ip_ptr = in_dev;
+#ifdef CONFIG_SYSCTL
+ devinet_sysctl_register(in_dev, &in_dev->cnf);
+#endif
+ if (dev->flags&IFF_UP)
+ ip_mc_up(in_dev);
+ return in_dev;
+}
+
+static void inetdev_destroy(struct in_device *in_dev)
+{
+ struct in_ifaddr *ifa;
+
+ ip_mc_destroy_dev(in_dev);
+
+ while ((ifa = in_dev->ifa_list) != NULL) {
+ inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
+ inet_free_ifa(ifa);
+ }
+
+#ifdef CONFIG_SYSCTL
+ devinet_sysctl_unregister(&in_dev->cnf);
+#endif
+ in_dev->dev->ip_ptr = NULL;
+ synchronize_bh();
+ neigh_parms_release(&arp_tbl, in_dev->arp_parms);
+ kfree(in_dev);
+}
+
+struct in_ifaddr * inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
+{
+ for_primary_ifa(in_dev) {
+ if (inet_ifa_match(a, ifa)) {
+ if (!b || inet_ifa_match(b, ifa))
+ return ifa;
+ }
+ } endfor_ifa(in_dev);
+ return NULL;
+}
+
+static void
+inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy)
+{
+ struct in_ifaddr *ifa1 = *ifap;
+
+	/* 1. Deleting a primary ifaddr forces deletion of all secondaries */
+
+ if (!(ifa1->ifa_flags&IFA_F_SECONDARY)) {
+ struct in_ifaddr *ifa;
+ struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+
+ while ((ifa=*ifap1) != NULL) {
+ if (!(ifa->ifa_flags&IFA_F_SECONDARY) ||
+ ifa1->ifa_mask != ifa->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, ifa)) {
+ ifap1 = &ifa->ifa_next;
+ continue;
+ }
+ *ifap1 = ifa->ifa_next;
+ synchronize_bh();
+
+ rtmsg_ifa(RTM_DELADDR, ifa);
+ notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
+ inet_free_ifa(ifa);
+ }
+ }
+
+ /* 2. Unlink it */
+
+ *ifap = ifa1->ifa_next;
+ synchronize_bh();
+
+ /* 3. Announce address deletion */
+
+ /* Send message first, then call notifier.
+	   At first sight, the FIB update triggered by the notifier
+	   will refer to an already deleted ifaddr, which could confuse
+	   netlink listeners. It is not true: look, gated sees
+	   that the route was deleted, and if it still thinks the ifaddr
+	   is valid, it will try to restore the deleted routes... Grr.
+	   So this order is correct.
+ */
+ rtmsg_ifa(RTM_DELADDR, ifa1);
+ notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
+ if (destroy) {
+ inet_free_ifa(ifa1);
+ if (in_dev->ifa_list == NULL)
+ inetdev_destroy(in_dev);
+ }
+}
+
+static int
+inet_insert_ifa(struct in_device *in_dev, struct in_ifaddr *ifa)
+{
+ struct in_ifaddr *ifa1, **ifap, **last_primary;
+
+ if (ifa->ifa_local == 0) {
+ inet_free_ifa(ifa);
+ return 0;
+ }
+
+ ifa->ifa_flags &= ~IFA_F_SECONDARY;
+ last_primary = &in_dev->ifa_list;
+
+ for (ifap=&in_dev->ifa_list; (ifa1=*ifap)!=NULL; ifap=&ifa1->ifa_next) {
+ if (!(ifa1->ifa_flags&IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope)
+ last_primary = &ifa1->ifa_next;
+ if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) {
+ if (ifa1->ifa_local == ifa->ifa_local) {
+ inet_free_ifa(ifa);
+ return -EEXIST;
+ }
+ if (ifa1->ifa_scope != ifa->ifa_scope) {
+ inet_free_ifa(ifa);
+ return -EINVAL;
+ }
+ ifa->ifa_flags |= IFA_F_SECONDARY;
+ }
+ }
+
+ if (!(ifa->ifa_flags&IFA_F_SECONDARY)) {
+ net_srandom(ifa->ifa_local);
+ ifap = last_primary;
+ }
+
+ ifa->ifa_next = *ifap;
+ wmb();
+ *ifap = ifa;
+
+ /* Send message first, then call notifier.
+	   The notifier will trigger a FIB update, so that
+	   netlink listeners will know about the new ifaddr */
+ rtmsg_ifa(RTM_NEWADDR, ifa);
+ notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+
+ return 0;
+}
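+
+/* A concrete example of the primary/secondary logic above (addresses
+ * illustrative): if eth0 already has 192.168.1.1/24, adding
+ * 192.168.1.2/24 matches an existing primary's mask and prefix and is
+ * therefore flagged IFA_F_SECONDARY; adding 10.0.0.1/8 matches nothing
+ * and becomes a new primary, linked in at the last_primary position
+ * computed during the scan.
+ */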
+
+static int
+inet_set_ifa(struct device *dev, struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = dev->ip_ptr;
+
+ if (in_dev == NULL) {
+ in_dev = inetdev_init(dev);
+ if (in_dev == NULL) {
+ inet_free_ifa(ifa);
+ return -ENOBUFS;
+ }
+ }
+ ifa->ifa_dev = in_dev;
+ if (LOOPBACK(ifa->ifa_local))
+ ifa->ifa_scope = RT_SCOPE_HOST;
+ return inet_insert_ifa(in_dev, ifa);
+}
+
+struct in_device *inetdev_by_index(int ifindex)
+{
+ struct device *dev;
+ dev = dev_get_by_index(ifindex);
+ if (dev)
+ return dev->ip_ptr;
+ return NULL;
+}
+
+struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, u32 mask)
+{
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
+ return ifa;
+ } endfor_ifa(in_dev);
+ return NULL;
+}
+
+#ifdef CONFIG_RTNETLINK
+
+/* The rtm_{add|del} functions are not reentrant, so
+   this structure can be made static
+ */
+
+int
+inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+ struct rtattr **rta = arg;
+ struct in_device *in_dev;
+ struct ifaddrmsg *ifm = NLMSG_DATA(nlh);
+ struct in_ifaddr *ifa, **ifap;
+
+ if ((in_dev = inetdev_by_index(ifm->ifa_index)) == NULL)
+ return -EADDRNOTAVAIL;
+
+ for (ifap=&in_dev->ifa_list; (ifa=*ifap)!=NULL; ifap=&ifa->ifa_next) {
+ if ((rta[IFA_LOCAL-1] && memcmp(RTA_DATA(rta[IFA_LOCAL-1]), &ifa->ifa_local, 4)) ||
+ (rta[IFA_LABEL-1] && strcmp(RTA_DATA(rta[IFA_LABEL-1]), ifa->ifa_label)) ||
+ (rta[IFA_ADDRESS-1] &&
+ (ifm->ifa_prefixlen != ifa->ifa_prefixlen ||
+ !inet_ifa_match(*(u32*)RTA_DATA(rta[IFA_ADDRESS-1]), ifa))))
+ continue;
+ inet_del_ifa(in_dev, ifap, 1);
+ return 0;
+ }
+
+ return -EADDRNOTAVAIL;
+}
+
+int
+inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+ struct rtattr **rta = arg;
+ struct device *dev;
+ struct in_device *in_dev;
+ struct ifaddrmsg *ifm = NLMSG_DATA(nlh);
+ struct in_ifaddr *ifa;
+
+ if (ifm->ifa_prefixlen > 32 || rta[IFA_LOCAL-1] == NULL)
+ return -EINVAL;
+
+ if ((dev = dev_get_by_index(ifm->ifa_index)) == NULL)
+ return -ENODEV;
+
+ if ((in_dev = dev->ip_ptr) == NULL) {
+ in_dev = inetdev_init(dev);
+ if (!in_dev)
+ return -ENOBUFS;
+ }
+
+ if ((ifa = inet_alloc_ifa()) == NULL)
+ return -ENOBUFS;
+
+ if (rta[IFA_ADDRESS-1] == NULL)
+ rta[IFA_ADDRESS-1] = rta[IFA_LOCAL-1];
+ memcpy(&ifa->ifa_local, RTA_DATA(rta[IFA_LOCAL-1]), 4);
+ memcpy(&ifa->ifa_address, RTA_DATA(rta[IFA_ADDRESS-1]), 4);
+ ifa->ifa_prefixlen = ifm->ifa_prefixlen;
+ ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
+ if (rta[IFA_BROADCAST-1])
+ memcpy(&ifa->ifa_broadcast, RTA_DATA(rta[IFA_BROADCAST-1]), 4);
+ if (rta[IFA_ANYCAST-1])
+ memcpy(&ifa->ifa_anycast, RTA_DATA(rta[IFA_ANYCAST-1]), 4);
+ ifa->ifa_flags = ifm->ifa_flags;
+ ifa->ifa_scope = ifm->ifa_scope;
+ ifa->ifa_dev = in_dev;
+ if (rta[IFA_LABEL-1])
+ memcpy(ifa->ifa_label, RTA_DATA(rta[IFA_LABEL-1]), IFNAMSIZ);
+ else
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+
+ return inet_insert_ifa(in_dev, ifa);
+}
+
+#endif
+
+/*
+ * Determine a default network mask based on the IP address.
+ */
+
+static __inline__ int inet_abc_len(u32 addr)
+{
+ if (ZERONET(addr))
+ return 0;
+
+ addr = ntohl(addr);
+ if (IN_CLASSA(addr))
+ return 8;
+ if (IN_CLASSB(addr))
+ return 16;
+ if (IN_CLASSC(addr))
+ return 24;
+
+ /*
+ * Something else, probably a multicast.
+ */
+
+ return -1;
+}
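+
+/* Illustrative values for the classful defaults above (standard class
+ * ranges, not specific to this file): 10.1.2.3 is class A -> 8;
+ * 172.16.0.1 is class B -> 16; 192.168.1.1 is class C -> 24;
+ * 224.0.0.1 is class D (multicast) -> -1; and anything in 0.x.x.x
+ * (ZERONET) -> 0.
+ */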
+
+
+int devinet_ioctl(unsigned int cmd, void *arg)
+{
+ struct ifreq ifr;
+ struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+ struct in_device *in_dev;
+ struct in_ifaddr **ifap = NULL;
+ struct in_ifaddr *ifa = NULL;
+ struct device *dev;
+#ifdef CONFIG_IP_ALIAS
+ char *colon;
+#endif
+ int exclusive = 0;
+ int ret = 0;
+
+ /*
+ * Fetch the caller's info block into kernel space
+ */
+
+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+ return -EFAULT;
+ ifr.ifr_name[IFNAMSIZ-1] = 0;
+
+#ifdef CONFIG_IP_ALIAS
+ colon = strchr(ifr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+#endif
+
+#ifdef CONFIG_KMOD
+ dev_load(ifr.ifr_name);
+#endif
+
+ switch(cmd) {
+ case SIOCGIFADDR: /* Get interface address */
+ case SIOCGIFBRDADDR: /* Get the broadcast address */
+ case SIOCGIFDSTADDR: /* Get the destination address */
+ case SIOCGIFNETMASK: /* Get the netmask for the interface */
+		/* Note that these ioctls will not sleep,
+		   so we do not impose a lock.
+		   One day we will be forced to put a shared lock here (I mean SMP)
+ */
+ memset(sin, 0, sizeof(*sin));
+ sin->sin_family = AF_INET;
+ break;
+
+ case SIOCSIFFLAGS:
+ if (!capable(CAP_NET_ADMIN))
+ return -EACCES;
+ rtnl_lock();
+ exclusive = 1;
+ break;
+ case SIOCSIFADDR: /* Set interface address (and family) */
+ case SIOCSIFBRDADDR: /* Set the broadcast address */
+ case SIOCSIFDSTADDR: /* Set the destination address */
+ case SIOCSIFNETMASK: /* Set the netmask for the interface */
+ if (!capable(CAP_NET_ADMIN))
+ return -EACCES;
+ if (sin->sin_family != AF_INET)
+ return -EINVAL;
+ rtnl_lock();
+ exclusive = 1;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+
+ if ((dev = dev_get(ifr.ifr_name)) == NULL) {
+ ret = -ENODEV;
+ goto done;
+ }
+
+#ifdef CONFIG_IP_ALIAS
+ if (colon)
+ *colon = ':';
+#endif
+
+ if ((in_dev=dev->ip_ptr) != NULL) {
+ for (ifap=&in_dev->ifa_list; (ifa=*ifap) != NULL; ifap=&ifa->ifa_next)
+ if (strcmp(ifr.ifr_name, ifa->ifa_label) == 0)
+ break;
+ }
+
+ if (ifa == NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) {
+ ret = -EADDRNOTAVAIL;
+ goto done;
+ }
+
+ switch(cmd) {
+ case SIOCGIFADDR: /* Get interface address */
+ sin->sin_addr.s_addr = ifa->ifa_local;
+ goto rarok;
+
+ case SIOCGIFBRDADDR: /* Get the broadcast address */
+ sin->sin_addr.s_addr = ifa->ifa_broadcast;
+ goto rarok;
+
+ case SIOCGIFDSTADDR: /* Get the destination address */
+ sin->sin_addr.s_addr = ifa->ifa_address;
+ goto rarok;
+
+ case SIOCGIFNETMASK: /* Get the netmask for the interface */
+ sin->sin_addr.s_addr = ifa->ifa_mask;
+ goto rarok;
+
+ case SIOCSIFFLAGS:
+#ifdef CONFIG_IP_ALIAS
+ if (colon) {
+ if (ifa == NULL) {
+ ret = -EADDRNOTAVAIL;
+ break;
+ }
+ if (!(ifr.ifr_flags&IFF_UP))
+ inet_del_ifa(in_dev, ifap, 1);
+ break;
+ }
+#endif
+ ret = dev_change_flags(dev, ifr.ifr_flags);
+ break;
+
+ case SIOCSIFADDR: /* Set interface address (and family) */
+ if (inet_abc_len(sin->sin_addr.s_addr) < 0) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (!ifa) {
+ if ((ifa = inet_alloc_ifa()) == NULL) {
+ ret = -ENOBUFS;
+ break;
+ }
+#ifdef CONFIG_IP_ALIAS
+ if (colon)
+ memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
+ else
+#endif
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ } else {
+ ret = 0;
+ if (ifa->ifa_local == sin->sin_addr.s_addr)
+ break;
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_broadcast = 0;
+ ifa->ifa_anycast = 0;
+ }
+
+ ifa->ifa_address =
+ ifa->ifa_local = sin->sin_addr.s_addr;
+
+ if (!(dev->flags&IFF_POINTOPOINT)) {
+ ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address);
+ ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
+ if ((dev->flags&IFF_BROADCAST) && ifa->ifa_prefixlen < 31)
+ ifa->ifa_broadcast = ifa->ifa_address|~ifa->ifa_mask;
+ } else {
+ ifa->ifa_prefixlen = 32;
+ ifa->ifa_mask = inet_make_mask(32);
+ }
+ ret = inet_set_ifa(dev, ifa);
+ break;
+
+ case SIOCSIFBRDADDR: /* Set the broadcast address */
+ if (ifa->ifa_broadcast != sin->sin_addr.s_addr) {
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_broadcast = sin->sin_addr.s_addr;
+ inet_insert_ifa(in_dev, ifa);
+ }
+ break;
+
+ case SIOCSIFDSTADDR: /* Set the destination address */
+ if (ifa->ifa_address != sin->sin_addr.s_addr) {
+ if (inet_abc_len(sin->sin_addr.s_addr) < 0) {
+ ret = -EINVAL;
+ break;
+ }
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_address = sin->sin_addr.s_addr;
+ inet_insert_ifa(in_dev, ifa);
+ }
+ break;
+
+ case SIOCSIFNETMASK: /* Set the netmask for the interface */
+
+ /*
+ * The mask we set must be legal.
+ */
+ if (bad_mask(sin->sin_addr.s_addr, 0)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (ifa->ifa_mask != sin->sin_addr.s_addr) {
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_mask = sin->sin_addr.s_addr;
+ ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);
+ inet_set_ifa(dev, ifa);
+ }
+ break;
+ }
+done:
+ if (exclusive)
+ rtnl_unlock();
+ return ret;
+
+rarok:
+ if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
+ return -EFAULT;
+ return 0;
+}
+
+static int
+inet_gifconf(struct device *dev, char *buf, int len)
+{
+ struct in_device *in_dev = dev->ip_ptr;
+ struct in_ifaddr *ifa;
+ struct ifreq ifr;
+ int done=0;
+
+ if (in_dev==NULL || (ifa=in_dev->ifa_list)==NULL)
+ return 0;
+
+ for ( ; ifa; ifa = ifa->ifa_next) {
+ if (!buf) {
+ done += sizeof(ifr);
+ continue;
+ }
+ if (len < (int) sizeof(ifr))
+ return done;
+ memset(&ifr, 0, sizeof(struct ifreq));
+ if (ifa->ifa_label)
+ strcpy(ifr.ifr_name, ifa->ifa_label);
+ else
+ strcpy(ifr.ifr_name, dev->name);
+
+ (*(struct sockaddr_in *) &ifr.ifr_addr).sin_family = AF_INET;
+ (*(struct sockaddr_in *) &ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local;
+
+ if (copy_to_user(buf, &ifr, sizeof(struct ifreq)))
+ return -EFAULT;
+ buf += sizeof(struct ifreq);
+ len -= sizeof(struct ifreq);
+ done += sizeof(struct ifreq);
+ }
+ return done;
+}
+
+u32 inet_select_addr(struct device *dev, u32 dst, int scope)
+{
+ u32 addr = 0;
+ struct in_device *in_dev = dev->ip_ptr;
+
+ if (in_dev == NULL)
+ return 0;
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope > scope)
+ continue;
+ if (!dst || inet_ifa_match(dst, ifa))
+ return ifa->ifa_local;
+ if (!addr)
+ addr = ifa->ifa_local;
+ } endfor_ifa(in_dev);
+
+ if (addr || scope >= RT_SCOPE_LINK)
+ return addr;
+
+	/* Non-loopback addresses on the loopback device should be preferred
+	   in this case. It is important that lo is the first interface
+	   in the dev_base list.
+ */
+ for (dev=dev_base; dev; dev=dev->next) {
+ if ((in_dev=dev->ip_ptr) == NULL)
+ continue;
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope <= scope)
+ return ifa->ifa_local;
+ } endfor_ifa(in_dev);
+ }
+
+ return 0;
+}
+
+/*
+ * Device notifier
+ */
+
+int register_inetaddr_notifier(struct notifier_block *nb)
+{
+ return notifier_chain_register(&inetaddr_chain, nb);
+}
+
+int unregister_inetaddr_notifier(struct notifier_block *nb)
+{
+ return notifier_chain_unregister(&inetaddr_chain,nb);
+}
+
+static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct device *dev = ptr;
+ struct in_device *in_dev = dev->ip_ptr;
+
+ if (in_dev == NULL)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ if (in_dev)
+ printk(KERN_DEBUG "inetdev_event: bug\n");
+ dev->ip_ptr = NULL;
+ break;
+ case NETDEV_UP:
+ if (dev == &loopback_dev) {
+ struct in_ifaddr *ifa;
+ if ((ifa = inet_alloc_ifa()) != NULL) {
+ ifa->ifa_local =
+ ifa->ifa_address = htonl(INADDR_LOOPBACK);
+ ifa->ifa_prefixlen = 8;
+ ifa->ifa_mask = inet_make_mask(8);
+ ifa->ifa_dev = in_dev;
+ ifa->ifa_scope = RT_SCOPE_HOST;
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ inet_insert_ifa(in_dev, ifa);
+ }
+ }
+ ip_mc_up(in_dev);
+ break;
+ case NETDEV_DOWN:
+ ip_mc_down(in_dev);
+ break;
+ case NETDEV_CHANGEMTU:
+ if (dev->mtu >= 68)
+ break;
+		/* MTU fell below the minimal IP MTU. Disable IP. */
+ case NETDEV_UNREGISTER:
+ inetdev_destroy(in_dev);
+ break;
+ case NETDEV_CHANGENAME:
+ if (in_dev->ifa_list) {
+ struct in_ifaddr *ifa;
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+			/* Do not notify about the label change; this event is
+			   not interesting to applications using netlink.
+ */
+ }
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+struct notifier_block ip_netdev_notifier={
+ inetdev_event,
+ NULL,
+ 0
+};
+
+#ifdef CONFIG_RTNETLINK
+
+static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
+ u32 pid, u32 seq, int event)
+{
+ struct ifaddrmsg *ifm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb->tail;
+
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
+ ifm = NLMSG_DATA(nlh);
+ ifm->ifa_family = AF_INET;
+ ifm->ifa_prefixlen = ifa->ifa_prefixlen;
+ ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT;
+ ifm->ifa_scope = ifa->ifa_scope;
+ ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
+ if (ifa->ifa_address)
+ RTA_PUT(skb, IFA_ADDRESS, 4, &ifa->ifa_address);
+ if (ifa->ifa_local)
+ RTA_PUT(skb, IFA_LOCAL, 4, &ifa->ifa_local);
+ if (ifa->ifa_broadcast)
+ RTA_PUT(skb, IFA_BROADCAST, 4, &ifa->ifa_broadcast);
+ if (ifa->ifa_anycast)
+ RTA_PUT(skb, IFA_ANYCAST, 4, &ifa->ifa_anycast);
+ if (ifa->ifa_label[0])
+ RTA_PUT(skb, IFA_LABEL, IFNAMSIZ, &ifa->ifa_label);
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx, ip_idx;
+ int s_idx, s_ip_idx;
+ struct device *dev;
+ struct in_device *in_dev;
+ struct in_ifaddr *ifa;
+
+ s_idx = cb->args[0];
+ s_ip_idx = ip_idx = cb->args[1];
+ for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
+ if (idx < s_idx)
+ continue;
+ if (idx > s_idx)
+ s_ip_idx = 0;
+ if ((in_dev = dev->ip_ptr) == NULL)
+ continue;
+ for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
+ ifa = ifa->ifa_next, ip_idx++) {
+ if (ip_idx < s_ip_idx)
+ continue;
+ if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, RTM_NEWADDR) <= 0)
+ goto done;
+ }
+ }
+done:
+ cb->args[0] = idx;
+ cb->args[1] = ip_idx;
+
+ return skb->len;
+}
+
+static void rtmsg_ifa(int event, struct in_ifaddr * ifa)
+{
+ struct sk_buff *skb;
+ int size = NLMSG_SPACE(sizeof(struct ifaddrmsg)+128);
+
+ skb = alloc_skb(size, GFP_KERNEL);
+ if (!skb) {
+ netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
+ return;
+ }
+ if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) {
+ kfree_skb(skb);
+ netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
+ return;
+ }
+ NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR;
+ netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL);
+}
+
+
+static struct rtnetlink_link inet_rtnetlink_table[RTM_MAX-RTM_BASE+1] =
+{
+ { NULL, NULL, },
+ { NULL, NULL, },
+ { NULL, NULL, },
+ { NULL, NULL, },
+
+ { inet_rtm_newaddr, NULL, },
+ { inet_rtm_deladdr, NULL, },
+ { NULL, inet_dump_ifaddr, },
+ { NULL, NULL, },
+
+ { inet_rtm_newroute, NULL, },
+ { inet_rtm_delroute, NULL, },
+ { inet_rtm_getroute, inet_dump_fib, },
+ { NULL, NULL, },
+
+ { NULL, NULL, },
+ { NULL, NULL, },
+ { NULL, NULL, },
+ { NULL, NULL, },
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ { inet_rtm_newrule, NULL, },
+ { inet_rtm_delrule, NULL, },
+ { NULL, inet_dump_rules, },
+ { NULL, NULL, },
+#else
+ { NULL, NULL, },
+ { NULL, NULL, },
+ { NULL, NULL, },
+ { NULL, NULL, },
+#endif
+};
+
+#endif /* CONFIG_RTNETLINK */
+
+
+#ifdef CONFIG_SYSCTL
+
+void inet_forward_change()
+{
+ struct device *dev;
+ int on = ipv4_devconf.forwarding;
+
+ ipv4_devconf.accept_redirects = !on;
+ ipv4_devconf_dflt.forwarding = on;
+
+ for (dev = dev_base; dev; dev = dev->next) {
+ struct in_device *in_dev = dev->ip_ptr;
+ if (in_dev)
+ in_dev->cnf.forwarding = on;
+ }
+
+ rt_cache_flush(0);
+
+ ip_statistics.IpForwarding = on ? 1 : 2;
+}
+
+static
+int devinet_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
+ void *buffer, size_t *lenp)
+{
+ int *valp = ctl->data;
+ int val = *valp;
+ int ret;
+
+ ret = proc_dointvec(ctl, write, filp, buffer, lenp);
+
+ if (write && *valp != val) {
+ if (valp == &ipv4_devconf.forwarding)
+ inet_forward_change();
+ else if (valp != &ipv4_devconf_dflt.forwarding)
+ rt_cache_flush(0);
+ }
+
+ return ret;
+}
+
+static struct devinet_sysctl_table
+{
+ struct ctl_table_header *sysctl_header;
+ ctl_table devinet_vars[12];
+ ctl_table devinet_dev[2];
+ ctl_table devinet_conf_dir[2];
+ ctl_table devinet_proto_dir[2];
+ ctl_table devinet_root_dir[2];
+} devinet_sysctl = {
+ NULL,
+ {{NET_IPV4_CONF_FORWARDING, "forwarding",
+ &ipv4_devconf.forwarding, sizeof(int), 0644, NULL,
+ &devinet_sysctl_forward},
+ {NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding",
+ &ipv4_devconf.mc_forwarding, sizeof(int), 0444, NULL,
+ &proc_dointvec},
+ {NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects",
+ &ipv4_devconf.accept_redirects, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects",
+ &ipv4_devconf.secure_redirects, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_CONF_SHARED_MEDIA, "shared_media",
+ &ipv4_devconf.shared_media, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_CONF_RP_FILTER, "rp_filter",
+ &ipv4_devconf.rp_filter, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects",
+ &ipv4_devconf.send_redirects, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route",
+ &ipv4_devconf.accept_source_route, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_CONF_PROXY_ARP, "proxy_arp",
+ &ipv4_devconf.proxy_arp, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay",
+ &ipv4_devconf.bootp_relay, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_CONF_LOG_MARTIANS, "log_martians",
+ &ipv4_devconf.log_martians, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {0}},
+
+ {{NET_PROTO_CONF_ALL, "all", NULL, 0, 0555, devinet_sysctl.devinet_vars},{0}},
+ {{NET_IPV4_CONF, "conf", NULL, 0, 0555, devinet_sysctl.devinet_dev},{0}},
+ {{NET_IPV4, "ipv4", NULL, 0, 0555, devinet_sysctl.devinet_conf_dir},{0}},
+ {{CTL_NET, "net", NULL, 0, 0555, devinet_sysctl.devinet_proto_dir},{0}}
+};
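+
+/* The chain of single-entry directory tables above only spells out a
+ * path: "net" -> "ipv4" -> "conf" -> one device directory ("all" in
+ * the static template) -> the variables. Once registered, the knobs
+ * appear as /proc/sys/net/ipv4/conf/{all,default,<ifname>}/forwarding
+ * and friends; devinet_sysctl_register() below clones this template
+ * and rebinds the data pointers for each device.
+ */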
+
+static void devinet_sysctl_register(struct in_device *in_dev, struct ipv4_devconf *p)
+{
+ int i;
+ struct device *dev = in_dev ? in_dev->dev : NULL;
+ struct devinet_sysctl_table *t;
+
+ t = kmalloc(sizeof(*t), GFP_KERNEL);
+ if (t == NULL)
+ return;
+ memcpy(t, &devinet_sysctl, sizeof(*t));
+ for (i=0; i<sizeof(t->devinet_vars)/sizeof(t->devinet_vars[0])-1; i++) {
+ t->devinet_vars[i].data += (char*)p - (char*)&ipv4_devconf;
+ t->devinet_vars[i].de = NULL;
+ }
+ if (dev) {
+ t->devinet_dev[0].procname = dev->name;
+ t->devinet_dev[0].ctl_name = dev->ifindex;
+ } else {
+ t->devinet_dev[0].procname = "default";
+ t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT;
+ }
+ t->devinet_dev[0].child = t->devinet_vars;
+ t->devinet_dev[0].de = NULL;
+ t->devinet_conf_dir[0].child = t->devinet_dev;
+ t->devinet_conf_dir[0].de = NULL;
+ t->devinet_proto_dir[0].child = t->devinet_conf_dir;
+ t->devinet_proto_dir[0].de = NULL;
+ t->devinet_root_dir[0].child = t->devinet_proto_dir;
+ t->devinet_root_dir[0].de = NULL;
+
+ t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0);
+ if (t->sysctl_header == NULL)
+ kfree(t);
+ else
+ p->sysctl = t;
+}
+
+static void devinet_sysctl_unregister(struct ipv4_devconf *p)
+{
+ if (p->sysctl) {
+ struct devinet_sysctl_table *t = p->sysctl;
+ p->sysctl = NULL;
+ unregister_sysctl_table(t->sysctl_header);
+ kfree(t);
+ }
+}
+#endif
+
+__initfunc(void devinet_init(void))
+{
+ register_gifconf(PF_INET, inet_gifconf);
+ register_netdevice_notifier(&ip_netdev_notifier);
+#ifdef CONFIG_RTNETLINK
+ rtnetlink_links[PF_INET] = inet_rtnetlink_table;
+#endif
+#ifdef CONFIG_SYSCTL
+ devinet_sysctl.sysctl_header =
+ register_sysctl_table(devinet_sysctl.devinet_root_dir, 0);
+ devinet_sysctl_register(NULL, &ipv4_devconf_dflt);
+#endif
+}
diff --git a/pfinet/linux-src/net/ipv4/fib_frontend.c b/pfinet/linux-src/net/ipv4/fib_frontend.c
new file mode 100644
index 00000000..a1747048
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/fib_frontend.c
@@ -0,0 +1,628 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: FIB frontend.
+ *
+ * Version: $Id: fib_frontend.c,v 1.15 1999/03/21 05:22:31 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/arp.h>
+#include <net/ip_fib.h>
+
+#define FFprint(a...) printk(KERN_DEBUG a)
+
+#ifndef CONFIG_IP_MULTIPLE_TABLES
+
+#define RT_TABLE_MIN RT_TABLE_MAIN
+
+struct fib_table *local_table;
+struct fib_table *main_table;
+
+#else
+
+#define RT_TABLE_MIN 1
+
+struct fib_table *fib_tables[RT_TABLE_MAX+1];
+
+struct fib_table *__fib_new_table(int id)
+{
+ struct fib_table *tb;
+
+ tb = fib_hash_init(id);
+ if (!tb)
+ return NULL;
+ fib_tables[id] = tb;
+ return tb;
+}
+
+
+#endif /* CONFIG_IP_MULTIPLE_TABLES */
+
+
+void fib_flush(void)
+{
+ int flushed = 0;
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ struct fib_table *tb;
+ int id;
+
+ for (id = RT_TABLE_MAX; id>0; id--) {
+ if ((tb = fib_get_table(id))==NULL)
+ continue;
+ flushed += tb->tb_flush(tb);
+ }
+#else /* CONFIG_IP_MULTIPLE_TABLES */
+ flushed += main_table->tb_flush(main_table);
+ flushed += local_table->tb_flush(local_table);
+#endif /* CONFIG_IP_MULTIPLE_TABLES */
+
+ if (flushed)
+ rt_cache_flush(-1);
+}
+
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Called from the PROCfs module. This outputs /proc/net/route.
+ *
+ * It always works in backward compatibility mode.
+ * The format of the file is not supposed to be changed.
+ */
+
+static int
+fib_get_procinfo(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+ int first = offset/128;
+ char *ptr = buffer;
+ int count = (length+127)/128;
+ int len;
+
+ *start = buffer + offset%128;
+
+ if (--first < 0) {
+ sprintf(buffer, "%-127s\n", "Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT");
+ --count;
+ ptr += 128;
+ first = 0;
+ }
+
+ /* rtnl_shlock(); -- it is pointless at the moment --ANK */
+ if (main_table && count > 0) {
+ int n = main_table->tb_get_info(main_table, ptr, first, count);
+ count -= n;
+ ptr += n*128;
+ }
+ /* rtnl_shunlock(); */
+ len = ptr - *start;
+ if (len >= length)
+ return length;
+ if (len >= 0)
+ return len;
+ return 0;
+}
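+
+/* A worked example of the fixed-record arithmetic (numbers illustrative):
+ * every line, header included, is padded to exactly 128 bytes by the
+ * "%-127s\n" format. For a read with offset=300, length=256: first=2
+ * (the header and route #0 were consumed by earlier reads), *start =
+ * buffer+44 (300 mod 128), and count=(256+127)/128=2 records are
+ * regenerated starting at route #1, so 2*128-44 = 212 bytes are
+ * returned and the proc layer simply issues another read for the rest.
+ */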
+
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Find the first device with a given source address.
+ */
+
+struct device * ip_dev_find(u32 addr)
+{
+ struct rt_key key;
+ struct fib_result res;
+
+ memset(&key, 0, sizeof(key));
+ key.dst = addr;
+
+ if (!local_table || local_table->tb_lookup(local_table, &key, &res)
+ || res.type != RTN_LOCAL)
+ return NULL;
+
+ return FIB_RES_DEV(res);
+}
+
+unsigned inet_addr_type(u32 addr)
+{
+ struct rt_key key;
+ struct fib_result res;
+
+ if (ZERONET(addr) || BADCLASS(addr))
+ return RTN_BROADCAST;
+ if (MULTICAST(addr))
+ return RTN_MULTICAST;
+
+ memset(&key, 0, sizeof(key));
+ key.dst = addr;
+
+ if (local_table) {
+ if (local_table->tb_lookup(local_table, &key, &res) == 0)
+ return res.type;
+ return RTN_UNICAST;
+ }
+ return RTN_BROADCAST;
+}
+
+/* Given (packet source, input interface) and optional (dst, oif, tos):
+   - (mainly) check that the source is valid, i.e. not broadcast and not
+     one of our local addresses;
+   - figure out which "logical" interface this packet arrived on
+     and calculate the "specific destination" address;
+   - check that the packet arrived from the expected physical interface.
+ */
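+
+/* Note: on success the result is 0 or 1 rather than an error code; 1
+   means the route back to the source has host scope, i.e. the source
+   looks directly reachable, which callers may flag specially. */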
+
+int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
+ struct device *dev, u32 *spec_dst, u32 *itag)
+{
+ struct in_device *in_dev = dev->ip_ptr;
+ struct rt_key key;
+ struct fib_result res;
+
+ key.dst = src;
+ key.src = dst;
+ key.tos = tos;
+ key.oif = 0;
+ key.iif = oif;
+ key.scope = RT_SCOPE_UNIVERSE;
+
+ if (in_dev == NULL)
+ return -EINVAL;
+ if (fib_lookup(&key, &res))
+ goto last_resort;
+ if (res.type != RTN_UNICAST)
+ return -EINVAL;
+ *spec_dst = FIB_RES_PREFSRC(res);
+ if (itag)
+ fib_combine_itag(itag, &res);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
+#else
+ if (FIB_RES_DEV(res) == dev)
+#endif
+ return FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+
+ if (in_dev->ifa_list == NULL)
+ goto last_resort;
+ if (IN_DEV_RPFILTER(in_dev))
+ return -EINVAL;
+ key.oif = dev->ifindex;
+ if (fib_lookup(&key, &res) == 0 && res.type == RTN_UNICAST) {
+ *spec_dst = FIB_RES_PREFSRC(res);
+ return FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+ }
+ return 0;
+
+last_resort:
+ if (IN_DEV_RPFILTER(in_dev))
+ return -EINVAL;
+ *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+ *itag = 0;
+ return 0;
+}
+
+#ifndef CONFIG_IP_NOSIOCRT
+
+/*
+ *	Handle IP routing ioctl calls. These are used to manipulate the
+ *	routing tables.
+ */
+
+int ip_rt_ioctl(unsigned int cmd, void *arg)
+{
+ int err;
+ struct kern_rta rta;
+ struct rtentry r;
+ struct {
+ struct nlmsghdr nlh;
+ struct rtmsg rtm;
+ } req;
+
+ switch (cmd) {
+ case SIOCADDRT: /* Add a route */
+ case SIOCDELRT: /* Delete a route */
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&r, arg, sizeof(struct rtentry)))
+ return -EFAULT;
+ rtnl_lock();
+ err = fib_convert_rtentry(cmd, &req.nlh, &req.rtm, &rta, &r);
+ if (err == 0) {
+ if (cmd == SIOCDELRT) {
+ struct fib_table *tb = fib_get_table(req.rtm.rtm_table);
+ err = -ESRCH;
+ if (tb)
+ err = tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
+ } else {
+ struct fib_table *tb = fib_new_table(req.rtm.rtm_table);
+ err = -ENOBUFS;
+ if (tb)
+ err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
+ }
+ if (rta.rta_mx)
+ kfree(rta.rta_mx);
+ }
+ rtnl_unlock();
+ return err;
+ }
+ return -EINVAL;
+}
+
+#else
+
+int ip_rt_ioctl(unsigned int cmd, void *arg)
+{
+ return -EINVAL;
+}
+
+#endif
+
+#ifdef CONFIG_RTNETLINK
+
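+/* Validate the attribute vector and, except for RTA_MULTIPATH and
+   RTA_METRICS, replace each rtattr pointer by a pointer to its payload,
+   so that the vector can be handed around as a struct kern_rta. */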
+static int inet_check_attr(struct rtmsg *r, struct rtattr **rta)
+{
+ int i;
+
+ for (i=1; i<=RTA_MAX; i++) {
+ struct rtattr *attr = rta[i-1];
+ if (attr) {
+ if (RTA_PAYLOAD(attr) < 4)
+ return -EINVAL;
+ if (i != RTA_MULTIPATH && i != RTA_METRICS)
+ rta[i-1] = (struct rtattr*)RTA_DATA(attr);
+ }
+ }
+ return 0;
+}
+
+int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct fib_table * tb;
+ struct rtattr **rta = arg;
+ struct rtmsg *r = NLMSG_DATA(nlh);
+
+ if (inet_check_attr(r, rta))
+ return -EINVAL;
+
+ tb = fib_get_table(r->rtm_table);
+ if (tb)
+ return tb->tb_delete(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
+ return -ESRCH;
+}
+
+int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct fib_table * tb;
+ struct rtattr **rta = arg;
+ struct rtmsg *r = NLMSG_DATA(nlh);
+
+ if (inet_check_attr(r, rta))
+ return -EINVAL;
+
+ tb = fib_new_table(r->rtm_table);
+ if (tb)
+ return tb->tb_insert(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
+ return -ENOBUFS;
+}
+
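+/* Dump all tables in one pass; cb->args[0] remembers the table id
+   between callbacks, while the deeper args[] slots belong to the
+   per-table dump routines. */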
+int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int t;
+ int s_t;
+ struct fib_table *tb;
+
+ if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
+ ((struct rtmsg*)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)
+ return ip_rt_dump(skb, cb);
+
+ s_t = cb->args[0];
+ if (s_t == 0)
+ s_t = cb->args[0] = RT_TABLE_MIN;
+
+ for (t=s_t; t<=RT_TABLE_MAX; t++) {
+ if (t < s_t) continue;
+ if (t > s_t)
+ memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
+ if ((tb = fib_get_table(t))==NULL)
+ continue;
+ if (tb->tb_dump(tb, skb, cb) < 0)
+ break;
+ }
+
+ cb->args[0] = t;
+
+ return skb->len;
+}
+
+#endif
+
+/* Prepare and feed an intra-kernel routing request.
+   Really, it should be a netlink message, but :-( netlink
+   may not be configured, so we feed the request directly
+   to the fib engine. This is legal, because all such events
+   occur only while netlink is already locked.
+ */
+
+static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr *ifa)
+{
+ struct fib_table * tb;
+ struct {
+ struct nlmsghdr nlh;
+ struct rtmsg rtm;
+ } req;
+ struct kern_rta rta;
+
+ memset(&req.rtm, 0, sizeof(req.rtm));
+ memset(&rta, 0, sizeof(rta));
+
+ if (type == RTN_UNICAST)
+ tb = fib_new_table(RT_TABLE_MAIN);
+ else
+ tb = fib_new_table(RT_TABLE_LOCAL);
+
+ if (tb == NULL)
+ return;
+
+ req.nlh.nlmsg_len = sizeof(req);
+ req.nlh.nlmsg_type = cmd;
+ req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND;
+ req.nlh.nlmsg_pid = 0;
+ req.nlh.nlmsg_seq = 0;
+
+ req.rtm.rtm_dst_len = dst_len;
+ req.rtm.rtm_table = tb->tb_id;
+ req.rtm.rtm_protocol = RTPROT_KERNEL;
+ req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST);
+ req.rtm.rtm_type = type;
+
+ rta.rta_dst = &dst;
+ rta.rta_prefsrc = &ifa->ifa_local;
+ rta.rta_oif = &ifa->ifa_dev->dev->ifindex;
+
+ if (cmd == RTM_NEWROUTE)
+ tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
+ else
+ tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
+}
+
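+/* For example, the next function uses
+   fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim)
+   to install the local /32 route for a newly added address. */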
+static void fib_add_ifaddr(struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct device *dev = in_dev->dev;
+ struct in_ifaddr *prim = ifa;
+ u32 mask = ifa->ifa_mask;
+ u32 addr = ifa->ifa_local;
+ u32 prefix = ifa->ifa_address&mask;
+
+ if (ifa->ifa_flags&IFA_F_SECONDARY) {
+ prim = inet_ifa_byprefix(in_dev, prefix, mask);
+ if (prim == NULL) {
+ printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n");
+ return;
+ }
+ }
+
+ fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
+
+ if (!(dev->flags&IFF_UP))
+ return;
+
+ /* Add broadcast address, if it is explicitly assigned. */
+ if (ifa->ifa_broadcast && ifa->ifa_broadcast != 0xFFFFFFFF)
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+
+ if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
+ (prefix != addr || ifa->ifa_prefixlen < 32)) {
+ fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
+ RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
+
+		/* Add network-specific broadcasts, when it makes sense */
+ if (ifa->ifa_prefixlen < 31) {
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
+ }
+ }
+}
+
+static void fib_del_ifaddr(struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct device *dev = in_dev->dev;
+ struct in_ifaddr *ifa1;
+ struct in_ifaddr *prim = ifa;
+ u32 brd = ifa->ifa_address|~ifa->ifa_mask;
+ u32 any = ifa->ifa_address&ifa->ifa_mask;
+#define LOCAL_OK 1
+#define BRD_OK 2
+#define BRD0_OK 4
+#define BRD1_OK 8
+ unsigned ok = 0;
+
+ if (!(ifa->ifa_flags&IFA_F_SECONDARY))
+ fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
+ RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
+ else {
+ prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
+ if (prim == NULL) {
+ printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n");
+ return;
+ }
+ }
+
+	/* Deletion is more complicated than addition.
+	   We must take care not to delete too much :-)
+
+	   Scan the address list to make sure the addresses are really gone.
+	 */
+
+ for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+ if (ifa->ifa_local == ifa1->ifa_local)
+ ok |= LOCAL_OK;
+ if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
+ ok |= BRD_OK;
+ if (brd == ifa1->ifa_broadcast)
+ ok |= BRD1_OK;
+ if (any == ifa1->ifa_broadcast)
+ ok |= BRD0_OK;
+ }
+
+ if (!(ok&BRD_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+ if (!(ok&BRD1_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+ if (!(ok&BRD0_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+ if (!(ok&LOCAL_OK)) {
+ fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
+
+		/* Check that this local address has finally disappeared. */
+ if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) {
+			/* Last, but not least: we must flush stray FIB
+			   entries.
+
+			   First scan the fib_info list for stray nexthop
+			   entries, then trigger fib_flush.
+			 */
+ if (fib_sync_down(ifa->ifa_local, NULL, 0))
+ fib_flush();
+ }
+ }
+#undef LOCAL_OK
+#undef BRD_OK
+#undef BRD0_OK
+#undef BRD1_OK
+}
+
+static void fib_disable_ip(struct device *dev, int force)
+{
+ if (fib_sync_down(0, dev, force))
+ fib_flush();
+ rt_cache_flush(0);
+ arp_ifdown(dev);
+}
+
+static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr*)ptr;
+
+ switch (event) {
+ case NETDEV_UP:
+ fib_add_ifaddr(ifa);
+ rt_cache_flush(-1);
+ break;
+ case NETDEV_DOWN:
+ if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) {
+ /* Last address was deleted from this interface.
+ Disable IP.
+ */
+ fib_disable_ip(ifa->ifa_dev->dev, 1);
+ } else {
+ fib_del_ifaddr(ifa);
+ rt_cache_flush(-1);
+ }
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct device *dev = ptr;
+ struct in_device *in_dev = dev->ip_ptr;
+
+ if (!in_dev)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UP:
+ for_ifa(in_dev) {
+ fib_add_ifaddr(ifa);
+ } endfor_ifa(in_dev);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ fib_sync_up(dev);
+#endif
+ rt_cache_flush(-1);
+ break;
+ case NETDEV_DOWN:
+ fib_disable_ip(dev, 0);
+ break;
+ case NETDEV_UNREGISTER:
+ fib_disable_ip(dev, 1);
+ break;
+ case NETDEV_CHANGEMTU:
+ case NETDEV_CHANGE:
+ rt_cache_flush(0);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+struct notifier_block fib_inetaddr_notifier = {
+ fib_inetaddr_event,
+ NULL,
+ 0
+};
+
+struct notifier_block fib_netdev_notifier = {
+ fib_netdev_event,
+ NULL,
+ 0
+};
+
+__initfunc(void ip_fib_init(void))
+{
+#ifdef CONFIG_PROC_FS
+ proc_net_register(&(struct proc_dir_entry) {
+ PROC_NET_ROUTE, 5, "route",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ fib_get_procinfo
+ });
+#endif /* CONFIG_PROC_FS */
+
+#ifndef CONFIG_IP_MULTIPLE_TABLES
+ local_table = fib_hash_init(RT_TABLE_LOCAL);
+ main_table = fib_hash_init(RT_TABLE_MAIN);
+#else
+ fib_rules_init();
+#endif
+
+ register_netdevice_notifier(&fib_netdev_notifier);
+ register_inetaddr_notifier(&fib_inetaddr_notifier);
+}
+
diff --git a/pfinet/linux-src/net/ipv4/fib_hash.c b/pfinet/linux-src/net/ipv4/fib_hash.c
new file mode 100644
index 00000000..d9e029ce
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/fib_hash.c
@@ -0,0 +1,885 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 FIB: lookup engine and maintenance routines.
+ *
+ * Version: $Id: fib_hash.c,v 1.8 1999/03/25 10:04:17 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+
+#define FTprint(a...)
+/*
+ printk(KERN_DEBUG a)
+ */
+
+/*
+   These bizarre types exist only to force strict type checking.
+   When I reversed the byte order and changed to natural mask lengths,
+   I forgot to fix several places. Now I am too lazy to change it back.
+ */
+
+typedef struct {
+ u32 datum;
+} fn_key_t;
+
+typedef struct {
+ u32 datum;
+} fn_hash_idx_t;
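+
+/* E.g. passing a raw u32 where an fn_key_t is expected now fails to
+   compile instead of silently mixing keys with hash indices. */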
+
+struct fib_node
+{
+ struct fib_node *fn_next;
+ struct fib_info *fn_info;
+#define FIB_INFO(f) ((f)->fn_info)
+ fn_key_t fn_key;
+ u8 fn_tos;
+ u8 fn_type;
+ u8 fn_scope;
+ u8 fn_state;
+};
+
+#define FN_S_ZOMBIE 1
+#define FN_S_ACCESSED 2
+
+static int fib_hash_zombies;
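+
+/* A node is not always freed when its route is deleted: fn_hash_delete()
+   may only mark it FN_S_ZOMBIE; lookups skip zombies and the nodes are
+   reclaimed on the next flush. */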
+
+struct fn_zone
+{
+	struct fn_zone	*fz_next;	/* Next non-empty zone	*/
+ struct fib_node **fz_hash; /* Hash table pointer */
+ int fz_nent; /* Number of entries */
+
+ int fz_divisor; /* Hash divisor */
+ u32 fz_hashmask; /* (1<<fz_divisor) - 1 */
+#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
+
+ int fz_order; /* Zone order */
+ u32 fz_mask;
+#define FZ_MASK(fz) ((fz)->fz_mask)
+};
+
+/* NOTE. On fast machines, recomputing fz_hashmask and fz_mask may be
+   cheaper than a memory lookup; the FZ_* macros hide that choice.
+ */
+
+struct fn_hash
+{
+ struct fn_zone *fn_zones[33];
+ struct fn_zone *fn_zone_list;
+};
+
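+/* Hash a key into a chain index: take the top fz_order bits of the
+   host-order address, XOR-fold them, and mask with the table size. */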
+static __inline__ fn_hash_idx_t fn_hash(fn_key_t key, struct fn_zone *fz)
+{
+ u32 h = ntohl(key.datum)>>(32 - fz->fz_order);
+ h ^= (h>>20);
+ h ^= (h>>10);
+ h ^= (h>>5);
+ h &= FZ_HASHMASK(fz);
+ return *(fn_hash_idx_t*)&h;
+}
+
+#define fz_key_0(key) ((key).datum = 0)
+#define fz_prefix(key,fz) ((key).datum)
+
+static __inline__ fn_key_t fz_key(u32 dst, struct fn_zone *fz)
+{
+ fn_key_t k;
+ k.datum = dst & FZ_MASK(fz);
+ return k;
+}
+
+static __inline__ struct fib_node ** fz_chain_p(fn_key_t key, struct fn_zone *fz)
+{
+ return &fz->fz_hash[fn_hash(key, fz).datum];
+}
+
+static __inline__ struct fib_node * fz_chain(fn_key_t key, struct fn_zone *fz)
+{
+ return fz->fz_hash[fn_hash(key, fz).datum];
+}
+
+extern __inline__ int fn_key_eq(fn_key_t a, fn_key_t b)
+{
+ return a.datum == b.datum;
+}
+
+extern __inline__ int fn_key_leq(fn_key_t a, fn_key_t b)
+{
+ return a.datum <= b.datum;
+}
+
+#define FZ_MAX_DIVISOR 1024
+
+#ifdef CONFIG_IP_ROUTE_LARGE_TABLES
+
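+/* Re-insert every node from the old hash table into the grown one,
+   keeping each chain sorted by key; lookups rely on this ordering to
+   stop early (see the fn_key_leq checks below). */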
+static __inline__ void fn_rebuild_zone(struct fn_zone *fz,
+ struct fib_node **old_ht,
+ int old_divisor)
+{
+ int i;
+ struct fib_node *f, **fp, *next;
+
+ for (i=0; i<old_divisor; i++) {
+ for (f=old_ht[i]; f; f=next) {
+ next = f->fn_next;
+ for (fp = fz_chain_p(f->fn_key, fz);
+ *fp && fn_key_leq((*fp)->fn_key, f->fn_key);
+ fp = &(*fp)->fn_next)
+ /* NONE */;
+ f->fn_next = *fp;
+ *fp = f;
+ }
+ }
+}
+
+static void fn_rehash_zone(struct fn_zone *fz)
+{
+ struct fib_node **ht, **old_ht;
+ int old_divisor, new_divisor;
+ u32 new_hashmask;
+
+ old_divisor = fz->fz_divisor;
+
+ switch (old_divisor) {
+ case 16:
+ new_divisor = 256;
+ new_hashmask = 0xFF;
+ break;
+ case 256:
+ new_divisor = 1024;
+ new_hashmask = 0x3FF;
+ break;
+ default:
+ printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
+ return;
+ }
+#if RT_CACHE_DEBUG >= 2
+ printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor);
+#endif
+
+ ht = kmalloc(new_divisor*sizeof(struct fib_node*), GFP_KERNEL);
+
+ if (ht) {
+ memset(ht, 0, new_divisor*sizeof(struct fib_node*));
+ start_bh_atomic();
+ old_ht = fz->fz_hash;
+ fz->fz_hash = ht;
+ fz->fz_hashmask = new_hashmask;
+ fz->fz_divisor = new_divisor;
+ fn_rebuild_zone(fz, old_ht, old_divisor);
+ end_bh_atomic();
+ kfree(old_ht);
+ }
+}
+#endif /* CONFIG_IP_ROUTE_LARGE_TABLES */
+
+static void fn_free_node(struct fib_node * f)
+{
+ fib_release_info(FIB_INFO(f));
+ kfree_s(f, sizeof(struct fib_node));
+}
+
+
+static struct fn_zone *
+fn_new_zone(struct fn_hash *table, int z)
+{
+ int i;
+ struct fn_zone *fz = kmalloc(sizeof(struct fn_zone), GFP_KERNEL);
+ if (!fz)
+ return NULL;
+
+ memset(fz, 0, sizeof(struct fn_zone));
+ if (z) {
+ fz->fz_divisor = 16;
+ fz->fz_hashmask = 0xF;
+ } else {
+ fz->fz_divisor = 1;
+ fz->fz_hashmask = 0;
+ }
+ fz->fz_hash = kmalloc(fz->fz_divisor*sizeof(struct fib_node*), GFP_KERNEL);
+ if (!fz->fz_hash) {
+ kfree(fz);
+ return NULL;
+ }
+ memset(fz->fz_hash, 0, fz->fz_divisor*sizeof(struct fib_node*));
+ fz->fz_order = z;
+ fz->fz_mask = inet_make_mask(z);
+
+	/* Find the first non-empty zone with a more specific mask */
+ for (i=z+1; i<=32; i++)
+ if (table->fn_zones[i])
+ break;
+ if (i>32) {
+ /* No more specific masks, we are the first. */
+ fz->fz_next = table->fn_zone_list;
+ table->fn_zone_list = fz;
+ } else {
+ fz->fz_next = table->fn_zones[i]->fz_next;
+ table->fn_zones[i]->fz_next = fz;
+ }
+ table->fn_zones[z] = fz;
+ return fz;
+}
+
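+/* Returns 0 on a match (res is filled in), 1 if no zone matched, or a
+   negative error propagated from fib_semantic_match() for route types
+   like blackhole or prohibit. */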
+static int
+fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result *res)
+{
+ int err;
+ struct fn_zone *fz;
+ struct fn_hash *t = (struct fn_hash*)tb->tb_data;
+
+ for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
+ struct fib_node *f;
+ fn_key_t k = fz_key(key->dst, fz);
+
+ for (f = fz_chain(k, fz); f; f = f->fn_next) {
+ if (!fn_key_eq(k, f->fn_key)) {
+ if (fn_key_leq(k, f->fn_key))
+ break;
+ else
+ continue;
+ }
+#ifdef CONFIG_IP_ROUTE_TOS
+ if (f->fn_tos && f->fn_tos != key->tos)
+ continue;
+#endif
+ f->fn_state |= FN_S_ACCESSED;
+
+ if (f->fn_state&FN_S_ZOMBIE)
+ continue;
+ if (f->fn_scope < key->scope)
+ continue;
+
+ err = fib_semantic_match(f->fn_type, FIB_INFO(f), key, res);
+ if (err == 0) {
+ res->type = f->fn_type;
+ res->scope = f->fn_scope;
+ res->prefixlen = fz->fz_order;
+ res->prefix = &fz_prefix(f->fn_key, fz);
+ return 0;
+ }
+ if (err < 0)
+ return err;
+ }
+ }
+ return 1;
+}
+
+static int fn_hash_last_dflt=-1;
+
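+/* Guess whether a default gateway is dead from its ARP (neighbour)
+   state, remembering the best candidate seen so far as a last resort. */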
+static int fib_detect_death(struct fib_info *fi, int order,
+ struct fib_info **last_resort, int *last_idx)
+{
+ struct neighbour *n;
+ int state = NUD_NONE;
+
+ n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
+ if (n) {
+ state = n->nud_state;
+ neigh_release(n);
+ }
+ if (state==NUD_REACHABLE)
+ return 0;
+ if ((state&NUD_VALID) && order != fn_hash_last_dflt)
+ return 0;
+ if ((state&NUD_VALID) ||
+ (*last_idx<0 && order > fn_hash_last_dflt)) {
+ *last_resort = fi;
+ *last_idx = order;
+ }
+ return 1;
+}
+
+static void
+fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fib_result *res)
+{
+ int order, last_idx;
+ struct fib_node *f;
+ struct fib_info *fi = NULL;
+ struct fib_info *last_resort;
+ struct fn_hash *t = (struct fn_hash*)tb->tb_data;
+ struct fn_zone *fz = t->fn_zones[0];
+
+ if (fz == NULL)
+ return;
+
+ last_idx = -1;
+ last_resort = NULL;
+ order = -1;
+
+ for (f = fz->fz_hash[0]; f; f = f->fn_next) {
+ struct fib_info *next_fi = FIB_INFO(f);
+
+ if ((f->fn_state&FN_S_ZOMBIE) ||
+ f->fn_scope != res->scope ||
+ f->fn_type != RTN_UNICAST)
+ continue;
+
+ if (next_fi->fib_priority > res->fi->fib_priority)
+ break;
+ if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+ continue;
+ f->fn_state |= FN_S_ACCESSED;
+
+ if (fi == NULL) {
+ if (next_fi != res->fi)
+ break;
+ } else if (!fib_detect_death(fi, order, &last_resort, &last_idx)) {
+ res->fi = fi;
+ fn_hash_last_dflt = order;
+ return;
+ }
+ fi = next_fi;
+ order++;
+ }
+
+ if (order<=0 || fi==NULL) {
+ fn_hash_last_dflt = -1;
+ return;
+ }
+
+ if (!fib_detect_death(fi, order, &last_resort, &last_idx)) {
+ res->fi = fi;
+ fn_hash_last_dflt = order;
+ return;
+ }
+
+ if (last_idx >= 0)
+ res->fi = last_resort;
+ fn_hash_last_dflt = last_idx;
+}
+
+#define FIB_SCAN(f, fp) \
+for ( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next)
+
+#define FIB_SCAN_KEY(f, fp, key) \
+for ( ; ((f) = *(fp)) != NULL && fn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_next)
+
+#ifndef CONFIG_IP_ROUTE_TOS
+#define FIB_SCAN_TOS(f, fp, key, tos) FIB_SCAN_KEY(f, fp, key)
+#else
+#define FIB_SCAN_TOS(f, fp, key, tos) \
+for ( ; ((f) = *(fp)) != NULL && fn_key_eq((f)->fn_key, (key)) && \
+ (f)->fn_tos == (tos) ; (fp) = &(f)->fn_next)
+#endif
+
+
+#ifdef CONFIG_RTNETLINK
+static void rtmsg_fib(int, struct fib_node*, int, int,
+ struct nlmsghdr *n,
+ struct netlink_skb_parms *);
+#else
+#define rtmsg_fib(a, b, c, d, e, f)
+#endif
+
+
+static int
+fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
+ struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+ struct fn_hash *table = (struct fn_hash*)tb->tb_data;
+ struct fib_node *new_f, *f, **fp, **del_fp;
+ struct fn_zone *fz;
+ struct fib_info *fi;
+
+ int z = r->rtm_dst_len;
+ int type = r->rtm_type;
+#ifdef CONFIG_IP_ROUTE_TOS
+ u8 tos = r->rtm_tos;
+#endif
+ fn_key_t key;
+ int err;
+
+	FTprint("tb(%d)_insert: %d %08x/%d %d %08x\n", tb->tb_id, r->rtm_type,
+		rta->rta_dst ? *(u32*)rta->rta_dst : 0, z,
+		rta->rta_oif ? *rta->rta_oif : -1,
+		rta->rta_prefsrc ? *(u32*)rta->rta_prefsrc : 0);
+ if (z > 32)
+ return -EINVAL;
+ fz = table->fn_zones[z];
+ if (!fz && !(fz = fn_new_zone(table, z)))
+ return -ENOBUFS;
+
+ fz_key_0(key);
+ if (rta->rta_dst) {
+ u32 dst;
+ memcpy(&dst, rta->rta_dst, 4);
+ if (dst & ~FZ_MASK(fz))
+ return -EINVAL;
+ key = fz_key(dst, fz);
+ }
+
+ if ((fi = fib_create_info(r, rta, n, &err)) == NULL)
+ return err;
+
+#ifdef CONFIG_IP_ROUTE_LARGE_TABLES
+ if (fz->fz_nent > (fz->fz_divisor<<2) &&
+ fz->fz_divisor < FZ_MAX_DIVISOR &&
+ (z==32 || (1<<z) > fz->fz_divisor))
+ fn_rehash_zone(fz);
+#endif
+
+ fp = fz_chain_p(key, fz);
+
+ /*
+ * Scan list to find the first route with the same destination
+ */
+ FIB_SCAN(f, fp) {
+ if (fn_key_leq(key,f->fn_key))
+ break;
+ }
+
+#ifdef CONFIG_IP_ROUTE_TOS
+ /*
+ * Find route with the same destination and tos.
+ */
+ FIB_SCAN_KEY(f, fp, key) {
+ if (f->fn_tos <= tos)
+ break;
+ }
+#endif
+
+ del_fp = NULL;
+
+ if (f && (f->fn_state&FN_S_ZOMBIE) &&
+#ifdef CONFIG_IP_ROUTE_TOS
+ f->fn_tos == tos &&
+#endif
+ fn_key_eq(f->fn_key, key)) {
+ del_fp = fp;
+ fp = &f->fn_next;
+ f = *fp;
+ goto create;
+ }
+
+ FIB_SCAN_TOS(f, fp, key, tos) {
+ if (fi->fib_priority <= FIB_INFO(f)->fib_priority)
+ break;
+ }
+
+	/* Now f == *fp points to the first node with the same
+	   key [prefix,tos,priority], if such a key already exists,
+	   or to the node before which we will insert the new one.
+	 */
+
+ if (f &&
+#ifdef CONFIG_IP_ROUTE_TOS
+ f->fn_tos == tos &&
+#endif
+ fn_key_eq(f->fn_key, key) &&
+ fi->fib_priority == FIB_INFO(f)->fib_priority) {
+ struct fib_node **ins_fp;
+
+ err = -EEXIST;
+ if (n->nlmsg_flags&NLM_F_EXCL)
+ goto out;
+
+ if (n->nlmsg_flags&NLM_F_REPLACE) {
+ del_fp = fp;
+ fp = &f->fn_next;
+ f = *fp;
+ goto replace;
+ }
+
+ ins_fp = fp;
+ err = -EEXIST;
+
+ FIB_SCAN_TOS(f, fp, key, tos) {
+ if (fi->fib_priority != FIB_INFO(f)->fib_priority)
+ break;
+ if (f->fn_type == type && f->fn_scope == r->rtm_scope
+ && FIB_INFO(f) == fi)
+ goto out;
+ }
+
+ if (!(n->nlmsg_flags&NLM_F_APPEND)) {
+ fp = ins_fp;
+ f = *fp;
+ }
+ }
+
+create:
+ err = -ENOENT;
+ if (!(n->nlmsg_flags&NLM_F_CREATE))
+ goto out;
+
+replace:
+ err = -ENOBUFS;
+ new_f = (struct fib_node *) kmalloc(sizeof(struct fib_node), GFP_KERNEL);
+ if (new_f == NULL)
+ goto out;
+
+ memset(new_f, 0, sizeof(struct fib_node));
+
+ new_f->fn_key = key;
+#ifdef CONFIG_IP_ROUTE_TOS
+ new_f->fn_tos = tos;
+#endif
+ new_f->fn_type = type;
+ new_f->fn_scope = r->rtm_scope;
+ FIB_INFO(new_f) = fi;
+
+ /*
+ * Insert new entry to the list.
+ */
+
+ new_f->fn_next = f;
+ *fp = new_f;
+ fz->fz_nent++;
+
+ if (del_fp) {
+ f = *del_fp;
+ /* Unlink replaced node */
+ *del_fp = f->fn_next;
+ synchronize_bh();
+
+ if (!(f->fn_state&FN_S_ZOMBIE))
+ rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req);
+ if (f->fn_state&FN_S_ACCESSED)
+ rt_cache_flush(-1);
+ fn_free_node(f);
+ fz->fz_nent--;
+ } else {
+ rt_cache_flush(-1);
+ }
+ rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->tb_id, n, req);
+ return 0;
+
+out:
+ fib_release_info(fi);
+ return err;
+}
+
+
+static int
+fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
+ struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+ struct fn_hash *table = (struct fn_hash*)tb->tb_data;
+ struct fib_node **fp, **del_fp, *f;
+ int z = r->rtm_dst_len;
+ struct fn_zone *fz;
+ fn_key_t key;
+ int matched;
+#ifdef CONFIG_IP_ROUTE_TOS
+ u8 tos = r->rtm_tos;
+#endif
+
+	FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type,
+		rta->rta_dst ? *(u32*)rta->rta_dst : 0, z,
+		rta->rta_oif ? *rta->rta_oif : -1);
+ if (z > 32)
+ return -EINVAL;
+ if ((fz = table->fn_zones[z]) == NULL)
+ return -ESRCH;
+
+ fz_key_0(key);
+ if (rta->rta_dst) {
+ u32 dst;
+ memcpy(&dst, rta->rta_dst, 4);
+ if (dst & ~FZ_MASK(fz))
+ return -EINVAL;
+ key = fz_key(dst, fz);
+ }
+
+ fp = fz_chain_p(key, fz);
+
+ FIB_SCAN(f, fp) {
+ if (fn_key_eq(f->fn_key, key))
+ break;
+ if (fn_key_leq(key, f->fn_key))
+ return -ESRCH;
+ }
+#ifdef CONFIG_IP_ROUTE_TOS
+ FIB_SCAN_KEY(f, fp, key) {
+ if (f->fn_tos == tos)
+ break;
+ }
+#endif
+
+ matched = 0;
+ del_fp = NULL;
+ FIB_SCAN_TOS(f, fp, key, tos) {
+ struct fib_info * fi = FIB_INFO(f);
+
+ if (f->fn_state&FN_S_ZOMBIE)
+ return -ESRCH;
+
+ matched++;
+
+ if (del_fp == NULL &&
+ (!r->rtm_type || f->fn_type == r->rtm_type) &&
+ (r->rtm_scope == RT_SCOPE_NOWHERE || f->fn_scope == r->rtm_scope) &&
+ (!r->rtm_protocol || fi->fib_protocol == r->rtm_protocol) &&
+ fib_nh_match(r, n, rta, fi) == 0)
+ del_fp = fp;
+ }
+
+ if (del_fp) {
+ f = *del_fp;
+ rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req);
+
+ if (matched != 1) {
+ *del_fp = f->fn_next;
+ synchronize_bh();
+
+ if (f->fn_state&FN_S_ACCESSED)
+ rt_cache_flush(-1);
+ fn_free_node(f);
+ fz->fz_nent--;
+ } else {
+ f->fn_state |= FN_S_ZOMBIE;
+ if (f->fn_state&FN_S_ACCESSED) {
+ f->fn_state &= ~FN_S_ACCESSED;
+ rt_cache_flush(-1);
+ }
+ if (++fib_hash_zombies > 128)
+ fib_flush();
+ }
+
+ return 0;
+ }
+ return -ESRCH;
+}
+
+extern __inline__ int
+fn_flush_list(struct fib_node ** fp, int z, struct fn_hash *table)
+{
+ int found = 0;
+ struct fib_node *f;
+
+ while ((f = *fp) != NULL) {
+ struct fib_info *fi = FIB_INFO(f);
+
+ if (fi && ((f->fn_state&FN_S_ZOMBIE) || (fi->fib_flags&RTNH_F_DEAD))) {
+ *fp = f->fn_next;
+ synchronize_bh();
+
+ fn_free_node(f);
+ found++;
+ continue;
+ }
+ fp = &f->fn_next;
+ }
+ return found;
+}
+
+static int fn_hash_flush(struct fib_table *tb)
+{
+ struct fn_hash *table = (struct fn_hash*)tb->tb_data;
+ struct fn_zone *fz;
+ int found = 0;
+
+ fib_hash_zombies = 0;
+ for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
+ int i;
+ int tmp = 0;
+ for (i=fz->fz_divisor-1; i>=0; i--)
+ tmp += fn_flush_list(&fz->fz_hash[i], fz->fz_order, table);
+ fz->fz_nent -= tmp;
+ found += tmp;
+ }
+ return found;
+}
+
+
+#ifdef CONFIG_PROC_FS
+
+static int fn_hash_get_info(struct fib_table *tb, char *buffer, int first, int count)
+{
+ struct fn_hash *table = (struct fn_hash*)tb->tb_data;
+ struct fn_zone *fz;
+ int pos = 0;
+ int n = 0;
+
+ for (fz=table->fn_zone_list; fz; fz = fz->fz_next) {
+ int i;
+ struct fib_node *f;
+ int maxslot = fz->fz_divisor;
+ struct fib_node **fp = fz->fz_hash;
+
+ if (fz->fz_nent == 0)
+ continue;
+
+ if (pos + fz->fz_nent <= first) {
+ pos += fz->fz_nent;
+ continue;
+ }
+
+ for (i=0; i < maxslot; i++, fp++) {
+ for (f = *fp; f; f = f->fn_next) {
+ if (++pos <= first)
+ continue;
+ fib_node_get_info(f->fn_type,
+ f->fn_state&FN_S_ZOMBIE,
+ FIB_INFO(f),
+ fz_prefix(f->fn_key, fz),
+ FZ_MASK(fz), buffer);
+ buffer += 128;
+ if (++n >= count)
+ return n;
+ }
+ }
+ }
+ return n;
+}
+#endif
+
+
+#ifdef CONFIG_RTNETLINK
+
+extern __inline__ int
+fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
+ struct fib_table *tb,
+ struct fn_zone *fz,
+ struct fib_node *f)
+{
+ int i, s_i;
+
+ s_i = cb->args[3];
+ for (i=0; f; i++, f=f->fn_next) {
+ if (i < s_i) continue;
+ if (f->fn_state&FN_S_ZOMBIE) continue;
+ if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+ RTM_NEWROUTE,
+ tb->tb_id, (f->fn_state&FN_S_ZOMBIE) ? 0 : f->fn_type, f->fn_scope,
+ &f->fn_key, fz->fz_order, f->fn_tos,
+ f->fn_info) < 0) {
+ cb->args[3] = i;
+ return -1;
+ }
+ }
+ cb->args[3] = i;
+ return skb->len;
+}
+
+extern __inline__ int
+fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
+ struct fib_table *tb,
+ struct fn_zone *fz)
+{
+ int h, s_h;
+
+ s_h = cb->args[2];
+ for (h=0; h < fz->fz_divisor; h++) {
+ if (h < s_h) continue;
+ if (h > s_h)
+ memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0]));
+ if (fz->fz_hash == NULL || fz->fz_hash[h] == NULL)
+ continue;
+ if (fn_hash_dump_bucket(skb, cb, tb, fz, fz->fz_hash[h]) < 0) {
+ cb->args[2] = h;
+ return -1;
+ }
+ }
+ cb->args[2] = h;
+ return skb->len;
+}
+
+static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int m, s_m;
+ struct fn_zone *fz;
+ struct fn_hash *table = (struct fn_hash*)tb->tb_data;
+
+ s_m = cb->args[1];
+ for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
+ if (m < s_m) continue;
+ if (m > s_m)
+ memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(cb->args[0]));
+ if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
+ cb->args[1] = m;
+ return -1;
+ }
+ }
+ cb->args[1] = m;
+ return skb->len;
+}
+
+static void rtmsg_fib(int event, struct fib_node* f, int z, int tb_id,
+ struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+ struct sk_buff *skb;
+ u32 pid = req ? req->pid : 0;
+ int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
+
+ skb = alloc_skb(size, GFP_KERNEL);
+ if (!skb)
+ return;
+
+ if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
+ f->fn_type, f->fn_scope, &f->fn_key, z, f->fn_tos,
+ FIB_INFO(f)) < 0) {
+ kfree_skb(skb);
+ return;
+ }
+ NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
+ if (n->nlmsg_flags&NLM_F_ECHO)
+ atomic_inc(&skb->users);
+ netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
+ if (n->nlmsg_flags&NLM_F_ECHO)
+ netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
+}
+
+#endif /* CONFIG_RTNETLINK */
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+struct fib_table * fib_hash_init(int id)
+#else
+__initfunc(struct fib_table * fib_hash_init(int id))
+#endif
+{
+ struct fib_table *tb;
+ tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), GFP_KERNEL);
+ if (tb == NULL)
+ return NULL;
+ tb->tb_id = id;
+ tb->tb_lookup = fn_hash_lookup;
+ tb->tb_insert = fn_hash_insert;
+ tb->tb_delete = fn_hash_delete;
+ tb->tb_flush = fn_hash_flush;
+ tb->tb_select_default = fn_hash_select_default;
+#ifdef CONFIG_RTNETLINK
+ tb->tb_dump = fn_hash_dump;
+#endif
+#ifdef CONFIG_PROC_FS
+ tb->tb_get_info = fn_hash_get_info;
+#endif
+ memset(tb->tb_data, 0, sizeof(struct fn_hash));
+ return tb;
+}
diff --git a/pfinet/linux-src/net/ipv4/fib_rules.c b/pfinet/linux-src/net/ipv4/fib_rules.c
new file mode 100644
index 00000000..868c44c3
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/fib_rules.c
@@ -0,0 +1,419 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: policy rules.
+ *
+ * Version: $Id: fib_rules.c,v 1.9 1999/03/25 10:04:23 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Rani Assaf : local_rule cannot be deleted
+ * Marc Boucher : routing by fwmark
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+
+#define FRprintk(a...)
+
+struct fib_rule
+{
+ struct fib_rule *r_next;
+ u32 r_preference;
+ unsigned char r_table;
+ unsigned char r_action;
+ unsigned char r_dst_len;
+ unsigned char r_src_len;
+ u32 r_src;
+ u32 r_srcmask;
+ u32 r_dst;
+ u32 r_dstmask;
+ u32 r_srcmap;
+ u8 r_flags;
+ u8 r_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ u32 r_fwmark;
+#endif
+ int r_ifindex;
+#ifdef CONFIG_NET_CLS_ROUTE
+ __u32 r_tclassid;
+#endif
+ char r_ifname[IFNAMSIZ];
+};
+
+static struct fib_rule default_rule = { NULL, 0x7FFF, RT_TABLE_DEFAULT, RTN_UNICAST, };
+static struct fib_rule main_rule = { &default_rule, 0x7FFE, RT_TABLE_MAIN, RTN_UNICAST, };
+static struct fib_rule local_rule = { &main_rule, 0, RT_TABLE_LOCAL, RTN_UNICAST, };
+
+static struct fib_rule *fib_rules = &local_rule;
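+
+/* The initial chain, in preference order: local (0) -> main (0x7FFE) ->
+   default (0x7FFF). inet_rtm_newrule() splices new rules in by
+   preference below. */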
+
+int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct rtattr **rta = arg;
+ struct rtmsg *rtm = NLMSG_DATA(nlh);
+ struct fib_rule *r, **rp;
+
+ for (rp=&fib_rules; (r=*rp) != NULL; rp=&r->r_next) {
+ if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 4) == 0) &&
+ rtm->rtm_src_len == r->r_src_len &&
+ rtm->rtm_dst_len == r->r_dst_len &&
+ (!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 4) == 0) &&
+ rtm->rtm_tos == r->r_tos &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ (!rta[RTA_PROTOINFO-1] || memcmp(RTA_DATA(rta[RTA_PROTOINFO-1]), &r->r_fwmark, 4) == 0) &&
+#endif
+ (!rtm->rtm_type || rtm->rtm_type == r->r_action) &&
+ (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) &&
+ (!rta[RTA_IIF-1] || strcmp(RTA_DATA(rta[RTA_IIF-1]), r->r_ifname) == 0) &&
+ (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) {
+ if (r == &local_rule)
+ return -EPERM;
+
+ *rp = r->r_next;
+ synchronize_bh();
+
+ if (r != &default_rule && r != &main_rule)
+ kfree(r);
+ return 0;
+ }
+ }
+ return -ESRCH;
+}
+
+/* Allocate a table with a new, unused id */
+
+static struct fib_table *fib_empty_table(void)
+{
+ int id;
+
+ for (id = 1; id <= RT_TABLE_MAX; id++)
+ if (fib_tables[id] == NULL)
+ return __fib_new_table(id);
+ return NULL;
+}
+
+
+int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct rtattr **rta = arg;
+ struct rtmsg *rtm = NLMSG_DATA(nlh);
+ struct fib_rule *r, *new_r, **rp;
+ unsigned char table_id;
+
+ if (rtm->rtm_src_len > 32 || rtm->rtm_dst_len > 32 ||
+ (rtm->rtm_tos & ~IPTOS_TOS_MASK))
+ return -EINVAL;
+
+ if (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) > IFNAMSIZ)
+ return -EINVAL;
+
+ table_id = rtm->rtm_table;
+ if (table_id == RT_TABLE_UNSPEC) {
+ struct fib_table *table;
+ if (rtm->rtm_type == RTN_UNICAST || rtm->rtm_type == RTN_NAT) {
+ if ((table = fib_empty_table()) == NULL)
+ return -ENOBUFS;
+ table_id = table->tb_id;
+ }
+ }
+
+ new_r = kmalloc(sizeof(*new_r), GFP_KERNEL);
+ if (!new_r)
+ return -ENOMEM;
+ memset(new_r, 0, sizeof(*new_r));
+ if (rta[RTA_SRC-1])
+ memcpy(&new_r->r_src, RTA_DATA(rta[RTA_SRC-1]), 4);
+ if (rta[RTA_DST-1])
+ memcpy(&new_r->r_dst, RTA_DATA(rta[RTA_DST-1]), 4);
+ if (rta[RTA_GATEWAY-1])
+ memcpy(&new_r->r_srcmap, RTA_DATA(rta[RTA_GATEWAY-1]), 4);
+ new_r->r_src_len = rtm->rtm_src_len;
+ new_r->r_dst_len = rtm->rtm_dst_len;
+ new_r->r_srcmask = inet_make_mask(rtm->rtm_src_len);
+ new_r->r_dstmask = inet_make_mask(rtm->rtm_dst_len);
+ new_r->r_tos = rtm->rtm_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ if (rta[RTA_PROTOINFO-1])
+ memcpy(&new_r->r_fwmark, RTA_DATA(rta[RTA_PROTOINFO-1]), 4);
+#endif
+ new_r->r_action = rtm->rtm_type;
+ new_r->r_flags = rtm->rtm_flags;
+ if (rta[RTA_PRIORITY-1])
+ memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
+ new_r->r_table = table_id;
+ if (rta[RTA_IIF-1]) {
+ struct device *dev;
+ memcpy(new_r->r_ifname, RTA_DATA(rta[RTA_IIF-1]), IFNAMSIZ);
+ new_r->r_ifname[IFNAMSIZ-1] = 0;
+ new_r->r_ifindex = -1;
+ dev = dev_get(new_r->r_ifname);
+ if (dev)
+ new_r->r_ifindex = dev->ifindex;
+ }
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (rta[RTA_FLOW-1])
+ memcpy(&new_r->r_tclassid, RTA_DATA(rta[RTA_FLOW-1]), 4);
+#endif
+
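+	/* No preference given: insert right behind the local rule and, if
+	   there is a second rule, take its preference minus one. */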
+ rp = &fib_rules;
+ if (!new_r->r_preference) {
+ r = fib_rules;
+ if (r && (r = r->r_next) != NULL) {
+ rp = &fib_rules->r_next;
+ if (r->r_preference)
+ new_r->r_preference = r->r_preference - 1;
+ }
+ }
+
+ while ( (r = *rp) != NULL ) {
+ if (r->r_preference > new_r->r_preference)
+ break;
+ rp = &r->r_next;
+ }
+
+ new_r->r_next = r;
+ *rp = new_r;
+ return 0;
+}
+
+u32 fib_rules_map_destination(u32 daddr, struct fib_result *res)
+{
+ u32 mask = inet_make_mask(res->prefixlen);
+ return (daddr&~mask)|res->fi->fib_nh->nh_gw;
+}
+
+u32 fib_rules_policy(u32 saddr, struct fib_result *res, unsigned *flags)
+{
+ struct fib_rule *r = res->r;
+
+ if (r->r_action == RTN_NAT) {
+ int addrtype = inet_addr_type(r->r_srcmap);
+
+ if (addrtype == RTN_NAT) {
+ /* Packet is from translated source; remember it */
+ saddr = (saddr&~r->r_srcmask)|r->r_srcmap;
+ *flags |= RTCF_SNAT;
+ } else if (addrtype == RTN_LOCAL || r->r_srcmap == 0) {
+ /* Packet is from masqueraded source; remember it */
+ saddr = r->r_srcmap;
+ *flags |= RTCF_MASQ;
+ }
+ }
+ return saddr;
+}
+
+#ifdef CONFIG_NET_CLS_ROUTE
+u32 fib_rules_tclass(struct fib_result *res)
+{
+ if (res->r)
+ return res->r->r_tclassid;
+ return 0;
+}
+#endif
+
+
+static void fib_rules_detach(struct device *dev)
+{
+ struct fib_rule *r;
+
+ for (r=fib_rules; r; r=r->r_next) {
+ if (r->r_ifindex == dev->ifindex)
+ r->r_ifindex = -1;
+ }
+}
+
+static void fib_rules_attach(struct device *dev)
+{
+ struct fib_rule *r;
+
+ for (r=fib_rules; r; r=r->r_next) {
+ if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0)
+ r->r_ifindex = dev->ifindex;
+ }
+}
+
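+/* Walk the rules in preference order. The first rule whose selectors
+   (src, dst, tos, fwmark, iif) all match decides the action; for
+   RTN_UNICAST/RTN_NAT the lookup proceeds in that rule's table, and a
+   miss there (or -EAGAIN from a throw route) falls through to the next
+   rule. */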
+int fib_lookup(const struct rt_key *key, struct fib_result *res)
+{
+ int err;
+ struct fib_rule *r, *policy;
+ struct fib_table *tb;
+
+ u32 daddr = key->dst;
+ u32 saddr = key->src;
+
+	FRprintk("Lookup: %08x <- %08x ", key->dst, key->src);
+ for (r = fib_rules; r; r=r->r_next) {
+ if (((saddr^r->r_src) & r->r_srcmask) ||
+ ((daddr^r->r_dst) & r->r_dstmask) ||
+#ifdef CONFIG_IP_ROUTE_TOS
+ (r->r_tos && r->r_tos != key->tos) ||
+#endif
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ (r->r_fwmark && r->r_fwmark != key->fwmark) ||
+#endif
+ (r->r_ifindex && r->r_ifindex != key->iif))
+ continue;
+
+		FRprintk("tb %d r %d ", r->r_table, r->r_action);
+ switch (r->r_action) {
+ case RTN_UNICAST:
+ case RTN_NAT:
+ policy = r;
+ break;
+ case RTN_UNREACHABLE:
+ return -ENETUNREACH;
+ default:
+ case RTN_BLACKHOLE:
+ return -EINVAL;
+ case RTN_PROHIBIT:
+ return -EACCES;
+ }
+
+ if ((tb = fib_get_table(r->r_table)) == NULL)
+ continue;
+ err = tb->tb_lookup(tb, key, res);
+ if (err == 0) {
+			FRprintk("ok\n");
+ res->r = policy;
+ return 0;
+ }
+ if (err < 0 && err != -EAGAIN)
+ return err;
+ }
+	FRprintk("FAILURE\n");
+ return -ENETUNREACH;
+}
+
+void fib_select_default(const struct rt_key *key, struct fib_result *res)
+{
+ if (res->r && res->r->r_action == RTN_UNICAST &&
+ FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) {
+ struct fib_table *tb;
+ if ((tb = fib_get_table(res->r->r_table)) != NULL)
+ tb->tb_select_default(tb, key, res);
+ }
+}
+
+static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct device *dev = ptr;
+
+ if (event == NETDEV_UNREGISTER)
+ fib_rules_detach(dev);
+ else if (event == NETDEV_REGISTER)
+ fib_rules_attach(dev);
+ return NOTIFY_DONE;
+}
+
+
+struct notifier_block fib_rules_notifier = {
+ fib_rules_event,
+ NULL,
+ 0
+};
+
+#ifdef CONFIG_RTNETLINK
+
+extern __inline__ int inet_fill_rule(struct sk_buff *skb,
+ struct fib_rule *r,
+ struct netlink_callback *cb)
+{
+ struct rtmsg *rtm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb->tail;
+
+ nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm));
+ rtm = NLMSG_DATA(nlh);
+ rtm->rtm_family = AF_INET;
+ rtm->rtm_dst_len = r->r_dst_len;
+ rtm->rtm_src_len = r->r_src_len;
+ rtm->rtm_tos = r->r_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ if (r->r_fwmark)
+ RTA_PUT(skb, RTA_PROTOINFO, 4, &r->r_fwmark);
+#endif
+ rtm->rtm_table = r->r_table;
+ rtm->rtm_protocol = 0;
+ rtm->rtm_scope = 0;
+ rtm->rtm_type = r->r_action;
+ rtm->rtm_flags = r->r_flags;
+
+ if (r->r_dst_len)
+ RTA_PUT(skb, RTA_DST, 4, &r->r_dst);
+ if (r->r_src_len)
+ RTA_PUT(skb, RTA_SRC, 4, &r->r_src);
+ if (r->r_ifname[0])
+ RTA_PUT(skb, RTA_IIF, IFNAMSIZ, &r->r_ifname);
+ if (r->r_preference)
+ RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference);
+ if (r->r_srcmap)
+ RTA_PUT(skb, RTA_GATEWAY, 4, &r->r_srcmap);
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (r->r_tclassid)
+ RTA_PUT(skb, RTA_FLOW, 4, &r->r_tclassid);
+#endif
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+ skb_put(skb, b - skb->tail);
+ return -1;
+}
+
+int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx;
+ int s_idx = cb->args[0];
+ struct fib_rule *r;
+
+ for (r=fib_rules, idx=0; r; r = r->r_next, idx++) {
+ if (idx < s_idx)
+ continue;
+ if (inet_fill_rule(skb, r, cb) < 0)
+ break;
+ }
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+#endif /* CONFIG_RTNETLINK */
+
+__initfunc(void fib_rules_init(void))
+{
+ register_netdevice_notifier(&fib_rules_notifier);
+}
diff --git a/pfinet/linux-src/net/ipv4/fib_semantics.c b/pfinet/linux-src/net/ipv4/fib_semantics.c
new file mode 100644
index 00000000..b78f7eba
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/fib_semantics.c
@@ -0,0 +1,991 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: semantics.
+ *
+ * Version: $Id: fib_semantics.c,v 1.13 1999/03/21 05:22:34 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+
+#define FSprintk(a...)
+
+static struct fib_info *fib_info_list;
+
+#define for_fib_info() { struct fib_info *fi; \
+ for (fi = fib_info_list; fi; fi = fi->fib_next)
+
+#define endfor_fib_info() }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
+for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
+
+#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
+for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
+
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+/* Hopefully gcc will optimize away the dummy loop */
+
+#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
+for (nhsel=0; nhsel < 1; nhsel++)
+
+#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
+for (nhsel=0; nhsel < 1; nhsel++)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
+#define endfor_nexthops(fi) }
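+
+/* Note: these iterator macros open a brace, so every use must be closed
+   with the matching endfor_*() macro. */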
+
+
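+/* Per route type (RTN_*): the error fib_semantic_match() should return
+   and the widest scope a route of that type may carry. */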
+static struct
+{
+ int error;
+ u8 scope;
+} fib_props[RTA_MAX+1] = {
+ { 0, RT_SCOPE_NOWHERE}, /* RTN_UNSPEC */
+ { 0, RT_SCOPE_UNIVERSE}, /* RTN_UNICAST */
+ { 0, RT_SCOPE_HOST}, /* RTN_LOCAL */
+ { 0, RT_SCOPE_LINK}, /* RTN_BROADCAST */
+ { 0, RT_SCOPE_LINK}, /* RTN_ANYCAST */
+ { 0, RT_SCOPE_UNIVERSE}, /* RTN_MULTICAST */
+ { -EINVAL, RT_SCOPE_UNIVERSE}, /* RTN_BLACKHOLE */
+ { -EHOSTUNREACH, RT_SCOPE_UNIVERSE},/* RTN_UNREACHABLE */
+ { -EACCES, RT_SCOPE_UNIVERSE}, /* RTN_PROHIBIT */
+ { -EAGAIN, RT_SCOPE_UNIVERSE}, /* RTN_THROW */
+#ifdef CONFIG_IP_ROUTE_NAT
+ { 0, RT_SCOPE_HOST}, /* RTN_NAT */
+#else
+ { -EINVAL, RT_SCOPE_NOWHERE}, /* RTN_NAT */
+#endif
+ { -EINVAL, RT_SCOPE_NOWHERE} /* RTN_XRESOLVE */
+};
+
+/* Release a nexthop info record */
+
+void fib_release_info(struct fib_info *fi)
+{
+ if (fi && !--fi->fib_refcnt) {
+ if (fi->fib_next)
+ fi->fib_next->fib_prev = fi->fib_prev;
+ if (fi->fib_prev)
+ fi->fib_prev->fib_next = fi->fib_next;
+ if (fi == fib_info_list)
+ fib_info_list = fi->fib_next;
+ kfree(fi);
+ }
+}
+
+extern __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+{
+ const struct fib_nh *onh = ofi->fib_nh;
+
+ for_nexthops(fi) {
+ if (nh->nh_oif != onh->nh_oif ||
+ nh->nh_gw != onh->nh_gw ||
+ nh->nh_scope != onh->nh_scope ||
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ nh->nh_weight != onh->nh_weight ||
+#endif
+#ifdef CONFIG_NET_CLS_ROUTE
+ nh->nh_tclassid != onh->nh_tclassid ||
+#endif
+ ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
+ return -1;
+ onh++;
+ } endfor_nexthops(fi);
+ return 0;
+}
+
+extern __inline__ struct fib_info * fib_find_info(const struct fib_info *nfi)
+{
+ for_fib_info() {
+ if (fi->fib_nhs != nfi->fib_nhs)
+ continue;
+ if (nfi->fib_protocol == fi->fib_protocol &&
+ nfi->fib_prefsrc == fi->fib_prefsrc &&
+ nfi->fib_priority == fi->fib_priority &&
+ nfi->fib_mtu == fi->fib_mtu &&
+ nfi->fib_rtt == fi->fib_rtt &&
+ nfi->fib_window == fi->fib_window &&
+ ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
+ (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
+ return fi;
+ } endfor_fib_info();
+ return NULL;
+}
+
+/* Check that the gateway is already configured.
+   Used only by the redirect-accept routine.
+ */
+
+int ip_fib_check_default(u32 gw, struct device *dev)
+{
+ for_fib_info() {
+ if (fi->fib_flags & RTNH_F_DEAD)
+ continue;
+ for_nexthops(fi) {
+ if (nh->nh_dev == dev && nh->nh_gw == gw &&
+ !(nh->nh_flags&RTNH_F_DEAD))
+ return 0;
+ } endfor_nexthops(fi);
+ } endfor_fib_info();
+ return -1;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
+{
+ while (RTA_OK(attr,attrlen)) {
+ if (attr->rta_type == type)
+ return *(u32*)RTA_DATA(attr);
+ attr = RTA_NEXT(attr, attrlen);
+ }
+ return 0;
+}
+
+static int
+fib_count_nexthops(struct rtattr *rta)
+{
+ int nhs = 0;
+ struct rtnexthop *nhp = RTA_DATA(rta);
+ int nhlen = RTA_PAYLOAD(rta);
+
+ while (nhlen >= (int)sizeof(struct rtnexthop)) {
+ if ((nhlen -= nhp->rtnh_len) < 0)
+ return 0;
+ nhs++;
+ nhp = RTNH_NEXT(nhp);
+ };
+ return nhs;
+}
+
+static int
+fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
+{
+ struct rtnexthop *nhp = RTA_DATA(rta);
+ int nhlen = RTA_PAYLOAD(rta);
+
+ change_nexthops(fi) {
+ int attrlen = nhlen - sizeof(struct rtnexthop);
+ if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+ return -EINVAL;
+ nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
+ nh->nh_oif = nhp->rtnh_ifindex;
+ nh->nh_weight = nhp->rtnh_hops + 1;
+ if (attrlen) {
+ nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
+#ifdef CONFIG_NET_CLS_ROUTE
+ nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
+#endif
+ }
+ nhp = RTNH_NEXT(nhp);
+ } endfor_nexthops(fi);
+ return 0;
+}
+
+#endif
+
+int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
+ struct fib_info *fi)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ struct rtnexthop *nhp;
+ int nhlen;
+#endif
+
+ if (rta->rta_priority &&
+ *rta->rta_priority != fi->fib_priority)
+ return 1;
+
+ if (rta->rta_oif || rta->rta_gw) {
+ if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
+ (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
+ return 0;
+ return 1;
+ }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (rta->rta_mp == NULL)
+ return 0;
+ nhp = RTA_DATA(rta->rta_mp);
+ nhlen = RTA_PAYLOAD(rta->rta_mp);
+
+ for_nexthops(fi) {
+ int attrlen = nhlen - sizeof(struct rtnexthop);
+ u32 gw;
+
+ if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+ return -EINVAL;
+ if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
+ return 1;
+ if (attrlen) {
+ gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
+ if (gw && gw != nh->nh_gw)
+ return 1;
+#ifdef CONFIG_NET_CLS_ROUTE
+ gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
+ if (gw && gw != nh->nh_tclassid)
+ return 1;
+#endif
+ }
+ nhp = RTNH_NEXT(nhp);
+ } endfor_nexthops(fi);
+#endif
+ return 0;
+}
+
+
+/*
+   Picture
+   -------
+
+   The semantics of nexthops are very messy, for historical reasons.
+   We have to take into account that:
+   a) the gateway can actually be a local interface address,
+      so that a gatewayed route is direct.
+   b) the gateway must be an on-link address, possibly
+      described not by an ifaddr but by a direct route.
+   c) if both gateway and interface are specified, they must not
+      contradict each other.
+   d) if we use tunnel routes, the gateway may not be on-link.
+
+   Attempting to reconcile all of these (alas, self-contradictory)
+   conditions results in pretty ugly and hairy code with obscure logic.
+
+   I chose to generalize it instead, so that the size of the code
+   barely increases, but the result is much more general.
+   Every prefix is assigned a "scope" value: "host" is a local address,
+   "link" is a direct route,
+   [ ... "site" ... "interior" ... ]
+   and "universe" is a true gateway route with global meaning.
+
+   Every prefix refers to a set of "nexthop"s (gw, oif),
+   where each gw must have a narrower scope. This recursion stops
+   when the gw has LOCAL scope, or when the "nexthop" is declared
+   ONLINK, which forces the gw to be on-link.
+
+   The code is still hairy, but now it is apparently logically
+   consistent and very flexible. E.g. as a by-product it allows
+   independent exterior and interior routing processes
+   to coexist in peace.
+
+   Normally it looks like the following:
+
+   {universe prefix} -> (gw, oif) [scope link]
+          |
+          |-> {link prefix} -> (gw, oif) [scope local]
+                  |
+                  |-> {local prefix} (terminal node)
+ */
+
+static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
+{
+ int err;
+
+ if (nh->nh_gw) {
+ struct rt_key key;
+ struct fib_result res;
+
+#ifdef CONFIG_IP_ROUTE_PERVASIVE
+ if (nh->nh_flags&RTNH_F_PERVASIVE)
+ return 0;
+#endif
+ if (nh->nh_flags&RTNH_F_ONLINK) {
+ struct device *dev;
+
+ if (r->rtm_scope >= RT_SCOPE_LINK)
+ return -EINVAL;
+ if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
+ return -EINVAL;
+ if ((dev = dev_get_by_index(nh->nh_oif)) == NULL)
+ return -ENODEV;
+ if (!(dev->flags&IFF_UP))
+ return -ENETDOWN;
+ nh->nh_dev = dev;
+ nh->nh_scope = RT_SCOPE_LINK;
+ return 0;
+ }
+ memset(&key, 0, sizeof(key));
+ key.dst = nh->nh_gw;
+ key.oif = nh->nh_oif;
+ key.scope = r->rtm_scope + 1;
+
+	/* Not strictly necessary, but it takes a bit of thought to see why */
+ if (key.scope < RT_SCOPE_LINK)
+ key.scope = RT_SCOPE_LINK;
+
+ if ((err = fib_lookup(&key, &res)) != 0)
+ return err;
+ nh->nh_scope = res.scope;
+ nh->nh_oif = FIB_RES_OIF(res);
+ nh->nh_dev = FIB_RES_DEV(res);
+ } else {
+ struct in_device *in_dev;
+
+ if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
+ return -EINVAL;
+
+ in_dev = inetdev_by_index(nh->nh_oif);
+ if (in_dev == NULL)
+ return -ENODEV;
+ if (!(in_dev->dev->flags&IFF_UP))
+ return -ENETDOWN;
+ nh->nh_dev = in_dev->dev;
+ nh->nh_scope = RT_SCOPE_HOST;
+ }
+ return 0;
+}
+
+struct fib_info *
+fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
+ const struct nlmsghdr *nlh, int *errp)
+{
+ int err;
+ struct fib_info *fi = NULL;
+ struct fib_info *ofi;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ int nhs = 1;
+#else
+ const int nhs = 1;
+#endif
+
+	/* Fast check to catch the weirdest cases */
+ if (fib_props[r->rtm_type].scope > r->rtm_scope)
+ goto err_inval;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (rta->rta_mp) {
+ nhs = fib_count_nexthops(rta->rta_mp);
+ if (nhs == 0)
+ goto err_inval;
+ }
+#endif
+
+ fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
+ err = -ENOBUFS;
+ if (fi == NULL)
+ goto failure;
+ memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));
+
+ fi->fib_protocol = r->rtm_protocol;
+ fi->fib_nhs = nhs;
+ fi->fib_flags = r->rtm_flags;
+ if (rta->rta_priority)
+ fi->fib_priority = *rta->rta_priority;
+ if (rta->rta_mx) {
+ int attrlen = RTA_PAYLOAD(rta->rta_mx);
+ struct rtattr *attr = RTA_DATA(rta->rta_mx);
+
+ while (RTA_OK(attr, attrlen)) {
+ unsigned flavor = attr->rta_type;
+ if (flavor) {
+ if (flavor > FIB_MAX_METRICS)
+ goto err_inval;
+ fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
+ }
+ attr = RTA_NEXT(attr, attrlen);
+ }
+ }
+ if (rta->rta_prefsrc)
+ memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
+
+ if (rta->rta_mp) {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
+ goto failure;
+ if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
+ goto err_inval;
+ if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
+ goto err_inval;
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
+ goto err_inval;
+#endif
+#else
+ goto err_inval;
+#endif
+ } else {
+ struct fib_nh *nh = fi->fib_nh;
+ if (rta->rta_oif)
+ nh->nh_oif = *rta->rta_oif;
+ if (rta->rta_gw)
+ memcpy(&nh->nh_gw, rta->rta_gw, 4);
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (rta->rta_flow)
+ memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
+#endif
+ nh->nh_flags = r->rtm_flags;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ nh->nh_weight = 1;
+#endif
+ }
+
+#ifdef CONFIG_IP_ROUTE_NAT
+ if (r->rtm_type == RTN_NAT) {
+ if (rta->rta_gw == NULL || nhs != 1 || rta->rta_oif)
+ goto err_inval;
+ memcpy(&fi->fib_nh->nh_gw, rta->rta_gw, 4);
+ goto link_it;
+ }
+#endif
+
+ if (fib_props[r->rtm_type].error) {
+ if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
+ goto err_inval;
+ goto link_it;
+ }
+
+ if (r->rtm_scope > RT_SCOPE_HOST)
+ goto err_inval;
+
+ if (r->rtm_scope == RT_SCOPE_HOST) {
+ struct fib_nh *nh = fi->fib_nh;
+
+ /* Local address is added. */
+ if (nhs != 1 || nh->nh_gw)
+ goto err_inval;
+ nh->nh_scope = RT_SCOPE_NOWHERE;
+ nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
+ err = -ENODEV;
+ if (nh->nh_dev == NULL)
+ goto failure;
+ } else {
+ change_nexthops(fi) {
+ if ((err = fib_check_nh(r, fi, nh)) != 0)
+ goto failure;
+ } endfor_nexthops(fi)
+ }
+
+ if (fi->fib_prefsrc) {
+ if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
+ memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
+ if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
+ goto err_inval;
+ }
+
+link_it:
+ if ((ofi = fib_find_info(fi)) != NULL) {
+ kfree(fi);
+ ofi->fib_refcnt++;
+ return ofi;
+ }
+
+ fi->fib_refcnt++;
+ fi->fib_next = fib_info_list;
+ fi->fib_prev = NULL;
+ if (fib_info_list)
+ fib_info_list->fib_prev = fi;
+ fib_info_list = fi;
+ return fi;
+
+err_inval:
+ err = -EINVAL;
+
+failure:
+ *errp = err;
+ if (fi)
+ kfree(fi);
+ return NULL;
+}
+
+int
+fib_semantic_match(int type, struct fib_info *fi, const struct rt_key *key, struct fib_result *res)
+{
+ int err = fib_props[type].error;
+
+ if (err == 0) {
+ if (fi->fib_flags&RTNH_F_DEAD)
+ return 1;
+
+ res->fi = fi;
+
+ switch (type) {
+#ifdef CONFIG_IP_ROUTE_NAT
+ case RTN_NAT:
+ FIB_RES_RESET(*res);
+ return 0;
+#endif
+ case RTN_UNICAST:
+ case RTN_LOCAL:
+ case RTN_BROADCAST:
+ case RTN_ANYCAST:
+ case RTN_MULTICAST:
+ for_nexthops(fi) {
+ if (nh->nh_flags&RTNH_F_DEAD)
+ continue;
+ if (!key->oif || key->oif == nh->nh_oif)
+ break;
+ }
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (nhsel < fi->fib_nhs) {
+ res->nh_sel = nhsel;
+ return 0;
+ }
+#else
+ if (nhsel < 1)
+ return 0;
+#endif
+ endfor_nexthops(fi);
+ return 1;
+ default:
+ printk(KERN_DEBUG "impossible 102\n");
+ return -EINVAL;
+ }
+ }
+ return err;
+}
+
+/* Find appropriate source address to this destination */
+
+u32 __fib_res_prefsrc(struct fib_result *res)
+{
+ return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
+}
+
+#ifdef CONFIG_RTNETLINK
+
+int
+fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+ u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
+ struct fib_info *fi)
+{
+ struct rtmsg *rtm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb->tail;
+
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
+ rtm = NLMSG_DATA(nlh);
+ rtm->rtm_family = AF_INET;
+ rtm->rtm_dst_len = dst_len;
+ rtm->rtm_src_len = 0;
+ rtm->rtm_tos = tos;
+ rtm->rtm_table = tb_id;
+ rtm->rtm_type = type;
+ rtm->rtm_flags = fi->fib_flags;
+ rtm->rtm_scope = scope;
+ if (rtm->rtm_dst_len)
+ RTA_PUT(skb, RTA_DST, 4, dst);
+ rtm->rtm_protocol = fi->fib_protocol;
+ if (fi->fib_priority)
+ RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (fi->fib_nh[0].nh_tclassid)
+ RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
+#endif
+ if (fi->fib_mtu || fi->fib_window || fi->fib_rtt) {
+ int i;
+ struct rtattr *mx = (struct rtattr *)skb->tail;
+ RTA_PUT(skb, RTA_METRICS, 0, NULL);
+ for (i=0; i<FIB_MAX_METRICS; i++) {
+ if (fi->fib_metrics[i])
+ RTA_PUT(skb, i+1, sizeof(unsigned), fi->fib_metrics + i);
+ }
+ mx->rta_len = skb->tail - (u8*)mx;
+ }
+ if (fi->fib_prefsrc)
+ RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
+ if (fi->fib_nhs == 1) {
+ if (fi->fib_nh->nh_gw)
+ RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
+ if (fi->fib_nh->nh_oif)
+ RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
+ }
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (fi->fib_nhs > 1) {
+ struct rtnexthop *nhp;
+ struct rtattr *mp_head;
+ if (skb_tailroom(skb) <= RTA_SPACE(0))
+ goto rtattr_failure;
+ mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));
+
+ for_nexthops(fi) {
+ if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
+ goto rtattr_failure;
+ nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
+ nhp->rtnh_flags = nh->nh_flags & 0xFF;
+ nhp->rtnh_hops = nh->nh_weight-1;
+ nhp->rtnh_ifindex = nh->nh_oif;
+ if (nh->nh_gw)
+ RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
+ nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
+ } endfor_nexthops(fi);
+ mp_head->rta_type = RTA_MULTIPATH;
+ mp_head->rta_len = skb->tail - (u8*)mp_head;
+ }
+#endif
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+#endif /* CONFIG_RTNETLINK */
+
+#ifndef CONFIG_IP_NOSIOCRT
+
+int
+fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
+ struct kern_rta *rta, struct rtentry *r)
+{
+ int plen;
+ u32 *ptr;
+
+ memset(rtm, 0, sizeof(*rtm));
+ memset(rta, 0, sizeof(*rta));
+
+ if (r->rt_dst.sa_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ /* Check mask for validity:
+ a) it must be contiguous.
+ b) destination must have all host bits clear.
+ c) if application forgot to set correct family (AF_INET),
+ reject request unless it is absolutely clear i.e.
+ both family and mask are zero.
+ */
+ plen = 32;
+ ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
+ if (!(r->rt_flags&RTF_HOST)) {
+ u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
+ if (r->rt_genmask.sa_family != AF_INET) {
+ if (mask || r->rt_genmask.sa_family)
+ return -EAFNOSUPPORT;
+ }
+ if (bad_mask(mask, *ptr))
+ return -EINVAL;
+ plen = inet_mask_len(mask);
+ }
+
+ nl->nlmsg_flags = NLM_F_REQUEST;
+ nl->nlmsg_pid = 0;
+ nl->nlmsg_seq = 0;
+ nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
+ if (cmd == SIOCDELRT) {
+ nl->nlmsg_type = RTM_DELROUTE;
+ nl->nlmsg_flags = 0;
+ } else {
+ nl->nlmsg_type = RTM_NEWROUTE;
+ nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
+ rtm->rtm_protocol = RTPROT_BOOT;
+ }
+
+ rtm->rtm_dst_len = plen;
+ rta->rta_dst = ptr;
+
+ if (r->rt_metric) {
+ *(u32*)&r->rt_pad3 = r->rt_metric - 1;
+ rta->rta_priority = (u32*)&r->rt_pad3;
+ }
+ if (r->rt_flags&RTF_REJECT) {
+ rtm->rtm_scope = RT_SCOPE_HOST;
+ rtm->rtm_type = RTN_UNREACHABLE;
+ return 0;
+ }
+ rtm->rtm_scope = RT_SCOPE_NOWHERE;
+ rtm->rtm_type = RTN_UNICAST;
+
+ if (r->rt_dev) {
+#ifdef CONFIG_IP_ALIAS
+ char *colon;
+#endif
+ struct device *dev;
+ char devname[IFNAMSIZ];
+
+ if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
+ return -EFAULT;
+ devname[IFNAMSIZ-1] = 0;
+#ifdef CONFIG_IP_ALIAS
+ colon = strchr(devname, ':');
+ if (colon)
+ *colon = 0;
+#endif
+ dev = dev_get(devname);
+ if (!dev)
+ return -ENODEV;
+ rta->rta_oif = &dev->ifindex;
+#ifdef CONFIG_IP_ALIAS
+ if (colon) {
+ struct in_ifaddr *ifa;
+ struct in_device *in_dev = dev->ip_ptr;
+ if (!in_dev)
+ return -ENODEV;
+ *colon = ':';
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+ if (strcmp(ifa->ifa_label, devname) == 0)
+ break;
+ if (ifa == NULL)
+ return -ENODEV;
+ rta->rta_prefsrc = &ifa->ifa_local;
+ }
+#endif
+ }
+
+ ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
+ if (r->rt_gateway.sa_family == AF_INET && *ptr) {
+ rta->rta_gw = ptr;
+ if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
+ rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+ }
+
+ if (cmd == SIOCDELRT)
+ return 0;
+
+ if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
+ return -EINVAL;
+
+ if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
+ rtm->rtm_scope = RT_SCOPE_LINK;
+
+ if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
+ struct rtattr *rec;
+ struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
+ if (mx == NULL)
+ return -ENOMEM;
+ rta->rta_mx = mx;
+ mx->rta_type = RTA_METRICS;
+ mx->rta_len = RTA_LENGTH(0);
+ if (r->rt_flags&RTF_MTU) {
+ rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
+ rec->rta_type = RTAX_MTU;
+ rec->rta_len = RTA_LENGTH(4);
+ mx->rta_len += RTA_LENGTH(4);
+ *(u32*)RTA_DATA(rec) = r->rt_mtu;
+ }
+ if (r->rt_flags&RTF_WINDOW) {
+ rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
+ rec->rta_type = RTAX_WINDOW;
+ rec->rta_len = RTA_LENGTH(4);
+ mx->rta_len += RTA_LENGTH(4);
+ *(u32*)RTA_DATA(rec) = r->rt_window;
+ }
+ if (r->rt_flags&RTF_IRTT) {
+ rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
+ rec->rta_type = RTAX_RTT;
+ rec->rta_len = RTA_LENGTH(4);
+ mx->rta_len += RTA_LENGTH(4);
+ *(u32*)RTA_DATA(rec) = r->rt_irtt;
+ }
+ }
+ return 0;
+}
+
+#endif
+
+/*
+ Update FIB if:
+ - local address disappeared -> we must delete all the entries
+ referring to it.
+   - device went down -> we must shut down all nexthops going via it.
+ */
+
+int fib_sync_down(u32 local, struct device *dev, int force)
+{
+ int ret = 0;
+ int scope = RT_SCOPE_NOWHERE;
+
+ if (force)
+ scope = -1;
+
+ for_fib_info() {
+ if (local && fi->fib_prefsrc == local) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ ret++;
+ } else if (dev && fi->fib_nhs) {
+ int dead = 0;
+
+ change_nexthops(fi) {
+ if (nh->nh_flags&RTNH_F_DEAD)
+ dead++;
+ else if (nh->nh_dev == dev &&
+ nh->nh_scope != scope) {
+ nh->nh_flags |= RTNH_F_DEAD;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ fi->fib_power -= nh->nh_power;
+ nh->nh_power = 0;
+#endif
+ dead++;
+ }
+ } endfor_nexthops(fi)
+ if (dead == fi->fib_nhs) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ ret++;
+ }
+ }
+ } endfor_fib_info();
+ return ret;
+}
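+
+/* Illustrative usage sketch (not part of the original source): the two
+   cases above map onto two call shapes. When a local address is
+   removed, a caller would pass the address and no device; when a
+   device goes down, the device and no address:
+
+	fib_sync_down(ifa->ifa_local, NULL, 0);	 (address disappeared)
+	fib_sync_down(0, dev, 0);		 (device went down)
+
+   The return value counts fib_info entries newly marked RTNH_F_DEAD.
+ */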
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/*
+   A dead device goes up. We wake up dead nexthops.
+   This makes sense only on multipath routes.
+ */
+
+int fib_sync_up(struct device *dev)
+{
+ int ret = 0;
+
+ if (!(dev->flags&IFF_UP))
+ return 0;
+
+ for_fib_info() {
+ int alive = 0;
+
+ change_nexthops(fi) {
+ if (!(nh->nh_flags&RTNH_F_DEAD)) {
+ alive++;
+ continue;
+ }
+ if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
+ continue;
+ if (nh->nh_dev != dev || dev->ip_ptr == NULL)
+ continue;
+ alive++;
+ nh->nh_power = 0;
+ nh->nh_flags &= ~RTNH_F_DEAD;
+ } endfor_nexthops(fi)
+
+ if (alive == fi->fib_nhs) {
+ fi->fib_flags &= ~RTNH_F_DEAD;
+ ret++;
+ }
+ } endfor_fib_info();
+ return ret;
+}
+
+/*
+ The algorithm is suboptimal, but it provides really
+ fair weighted route distribution.
+ */
+
+void fib_select_multipath(const struct rt_key *key, struct fib_result *res)
+{
+ struct fib_info *fi = res->fi;
+ int w;
+
+ if (fi->fib_power <= 0) {
+ int power = 0;
+ change_nexthops(fi) {
+ if (!(nh->nh_flags&RTNH_F_DEAD)) {
+ power += nh->nh_weight;
+ nh->nh_power = nh->nh_weight;
+ }
+ } endfor_nexthops(fi);
+ fi->fib_power = power;
+#if 1
+ if (power <= 0) {
+ printk(KERN_CRIT "impossible 777\n");
+ return;
+ }
+#endif
+ }
+
+
+ /* w should be a random number in [0..fi->fib_power-1];
+    deriving it from jiffies is a pretty bad approximation.
+ */
+
+ w = jiffies % fi->fib_power;
+
+ change_nexthops(fi) {
+ if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
+ if ((w -= nh->nh_power) <= 0) {
+ nh->nh_power--;
+ fi->fib_power--;
+ res->nh_sel = nhsel;
+ return;
+ }
+ }
+ } endfor_nexthops(fi);
+
+#if 1
+ printk(KERN_CRIT "impossible 888\n");
+#endif
+ return;
+}
+#endif
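+
+/*
+   Worked example (illustrative, not part of the original source):
+   with two live nexthops of weight 3 and 1, the recharge above sets
+   nh_power to 3 and 1 and fib_power to 4. Each selection drains one
+   token from the chosen hop, so any four consecutive selections pick
+   the first hop three times and the second once. The same token
+   arithmetic, reduced to a standalone sketch (hypothetical helper,
+   compile-guarded so it is never built):
+ */
+#if 0
+static int fib_mp_example_pick(int *power, int n, int *total, int w)
+{
+	int i;
+	for (i = 0; i < n; i++) {
+		if (power[i] > 0 && (w -= power[i]) <= 0) {
+			power[i]--;	/* drain one token from this hop */
+			(*total)--;	/* ... and from the route total */
+			return i;	/* index of the selected nexthop */
+		}
+	}
+	return -1;		/* unreachable while *total > 0 and w < *total */
+}
+#endif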
+
+
+#ifdef CONFIG_PROC_FS
+
+static unsigned fib_flag_trans(int type, int dead, u32 mask, struct fib_info *fi)
+{
+ static unsigned type2flags[RTN_MAX+1] = {
+ 0, 0, 0, 0, 0, 0, 0, RTF_REJECT, RTF_REJECT, 0, 0, 0
+ };
+ unsigned flags = type2flags[type];
+
+ if (fi && fi->fib_nh->nh_gw)
+ flags |= RTF_GATEWAY;
+ if (mask == 0xFFFFFFFF)
+ flags |= RTF_HOST;
+ if (!dead)
+ flags |= RTF_UP;
+ return flags;
+}
+
+void fib_node_get_info(int type, int dead, struct fib_info *fi, u32 prefix, u32 mask, char *buffer)
+{
+ int len;
+ unsigned flags = fib_flag_trans(type, dead, mask, fi);
+
+ if (fi) {
+ len = sprintf(buffer, "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+ fi->fib_dev ? fi->fib_dev->name : "*", prefix,
+ fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
+ mask, fi->fib_mtu, fi->fib_window, fi->fib_rtt);
+ } else {
+ len = sprintf(buffer, "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+ prefix, 0,
+ flags, 0, 0, 0,
+ mask, 0, 0, 0);
+ }
+ memset(buffer+len, ' ', 127-len);
+ buffer[127] = '\n';
+}
+
+#endif
diff --git a/pfinet/linux-src/net/ipv4/icmp.c b/pfinet/linux-src/net/ipv4/icmp.c
new file mode 100644
index 00000000..34b48a93
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/icmp.c
@@ -0,0 +1,1155 @@
+/*
+ * NET3: Implementation of the ICMP protocol layer.
+ *
+ * Alan Cox, <alan@redhat.com>
+ *
+ * Version: $Id: icmp.c,v 1.52.2.2 1999/06/20 21:27:39 davem Exp $
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Some of the function names and the icmp unreach table for this
+ * module were derived from [icmp.c 1.0.11 06/02/93] by
+ * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting.
+ * Other than that this module is a complete rewrite.
+ *
+ * Fixes:
+ * Mike Shaver : RFC1122 checks.
+ * Alan Cox : Multicast ping reply as self.
+ * Alan Cox : Fix atomicity lockup in ip_build_xmit
+ * call.
+ * Alan Cox : Added 216,128 byte paths to the MTU
+ * code.
+ * Martin Mares : RFC1812 checks.
+ * Martin Mares : Can be configured to follow redirects
+ * if acting as a router _without_ a
+ * routing protocol (RFC 1812).
+ * Martin Mares : Echo requests may be configured to
+ * be ignored (RFC 1812).
+ * Martin Mares : Limitation of ICMP error message
+ * transmit rate (RFC 1812).
+ * Martin Mares : TOS and Precedence set correctly
+ * (RFC 1812).
+ * Martin Mares : Now copying as much data from the
+ * original packet as we can without
+ * exceeding 576 bytes (RFC 1812).
+ * Willy Konynenberg : Transparent proxying support.
+ * Keith Owens : RFC1191 correction for 4.2BSD based
+ * path MTU bug.
+ * Thomas Quinot : ICMP Dest Unreach codes up to 15 are
+ * valid (RFC 1812).
+ * Andi Kleen : Check all packet lengths properly
+ * and moved all kfree_skb() up to
+ * icmp_rcv.
+ * Andi Kleen : Move the rate limit bookkeeping
+ * into the dest entry and use a token
+ * bucket filter (thanks to ANK). Make
+ * the rates sysctl configurable.
+ * Yu Tianli : Fixed two ugly bugs in icmp_send
+ * - IP option length was accounted wrongly
+ * - ICMP header length was not accounted at all.
+ * Tristan Greaves : Added sysctl option to ignore bogus broadcast
+ * responses from broken routers.
+ *
+ * To Fix:
+ *
+ * - Should use skb_pull() instead of all the manual checking.
+ * This would also greatly simplify some upper layer error handlers. --AK
+ *
+ * RFC1122 (Host Requirements -- Comm. Layer) Status:
+ * (boy, are there a lot of rules for ICMP)
+ * 3.2.2 (Generic ICMP stuff)
+ * MUST discard messages of unknown type. (OK)
+ * MUST copy at least the first 8 bytes from the offending packet
+ * when sending ICMP errors. (OBSOLETE -- see RFC1812)
+ * MUST pass received ICMP errors up to protocol level. (OK)
+ * SHOULD send ICMP errors with TOS == 0. (OBSOLETE -- see RFC1812)
+ * MUST NOT send ICMP errors in reply to:
+ * ICMP errors (OK)
+ * Broadcast/multicast datagrams (OK)
+ * MAC broadcasts (OK)
+ * Non-initial fragments (OK)
+ * Datagram with a source address that isn't a single host. (OK)
+ * 3.2.2.1 (Destination Unreachable)
+ * All the rules govern the IP layer, and are dealt with in ip.c, not here.
+ * 3.2.2.2 (Redirect)
+ * Host SHOULD NOT send ICMP_REDIRECTs. (OK)
+ * MUST update routing table in response to host or network redirects.
+ * (host OK, network OBSOLETE)
+ * SHOULD drop redirects if they're not from directly connected gateway
+ * (OK -- we drop it if it's not from our old gateway, which is close
+ * enough)
+ * 3.2.2.3 (Source Quench)
+ * MUST pass incoming SOURCE_QUENCHs to transport layer (OK)
+ * Other requirements are dealt with at the transport layer.
+ * 3.2.2.4 (Time Exceeded)
+ * MUST pass TIME_EXCEEDED to transport layer (OK)
+ * Other requirements dealt with at IP (generating TIME_EXCEEDED).
+ * 3.2.2.5 (Parameter Problem)
+ * SHOULD generate these (OK)
+ * MUST pass received PARAMPROBLEM to transport layer (NOT YET)
+ * [Solaris 2.X seems to assert EPROTO when this occurs] -- AC
+ * 3.2.2.6 (Echo Request/Reply)
+ * MUST reply to ECHO_REQUEST, and give app to do ECHO stuff (OK, OK)
+ * MAY discard broadcast ECHO_REQUESTs. (Configurable with a sysctl.)
+ * MUST reply using same source address as the request was sent to.
+ * We're OK for unicast ECHOs, and it doesn't say anything about
+ * how to handle broadcast ones, since it's optional.
+ * MUST copy data from REQUEST to REPLY (OK)
+ * unless it would require illegal fragmentation (OK)
+ * MUST pass REPLYs to transport/user layer (OK)
+ * MUST use any provided source route (reversed) for REPLY. (NOT YET)
+ * 3.2.2.7 (Information Request/Reply)
+ * MUST NOT implement this. (I guess that means silently discard...?) (OK)
+ * 3.2.2.8 (Timestamp Request/Reply)
+ * MAY implement (OK)
+ * SHOULD be in-kernel for "minimum variability" (OK)
+ * MAY discard broadcast REQUESTs. (OK, but see source for inconsistency)
+ * MUST reply using same source address as the request was sent to. (OK)
+ * MUST reverse source route, as per ECHO (NOT YET)
+ * MUST pass REPLYs to transport/user layer (requires RAW, just like
+ * ECHO) (OK)
+ * MUST update clock for timestamp at least 15 times/sec (OK)
+ * MUST be "correct within a few minutes" (OK)
+ * 3.2.2.9 (Address Mask Request/Reply)
+ * MAY implement (OK)
+ * MUST send a broadcast REQUEST if using this system to set netmask
+ * (OK... we don't use it)
+ * MUST discard received REPLYs if not using this system (OK)
+ * MUST NOT send replies unless specifically made agent for this sort
+ * of thing. (OK)
+ *
+ *
+ * RFC 1812 (IPv4 Router Requirements) Status (even longer):
+ * 4.3.2.1 (Unknown Message Types)
+ * MUST pass messages of unknown type to ICMP user iface or silently discard
+ * them (OK)
+ * 4.3.2.2 (ICMP Message TTL)
+ * MUST initialize TTL when originating an ICMP message (OK)
+ * 4.3.2.3 (Original Message Header)
+ * SHOULD copy as much data from the offending packet as possible without
+ * the length of the ICMP datagram exceeding 576 bytes (OK)
+ * MUST leave original IP header of the offending packet, but we're not
+ * required to undo modifications made (OK)
+ * 4.3.2.4 (Original Message Source Address)
+ * MUST use one of addresses for the interface the orig. packet arrived as
+ * source address (OK)
+ * 4.3.2.5 (TOS and Precedence)
+ * SHOULD leave TOS set to the same value unless the packet would be
+ * discarded for that reason (OK)
+ * MUST use TOS=0 if not possible to leave original value (OK)
+ * MUST leave IP Precedence for Source Quench messages (OK -- not sent
+ * at all)
+ * SHOULD use IP Precedence = 6 (Internetwork Control) or 7 (Network Control)
+ * for all other error messages (OK, we use 6)
+ * MAY allow configuration of IP Precedence (OK -- not done)
+ * MUST leave IP Precedence and TOS for reply messages (OK)
+ * 4.3.2.6 (Source Route)
+ * SHOULD use reverse source route UNLESS sending Parameter Problem on source
+ * routing and UNLESS the packet would be immediately discarded (NOT YET)
+ * 4.3.2.7 (When Not to Send ICMP Errors)
+ * MUST NOT send ICMP errors in reply to:
+ * ICMP errors (OK)
+ * Packets failing IP header validation tests unless otherwise noted (OK)
+ * Broadcast/multicast datagrams (OK)
+ * MAC broadcasts (OK)
+ * Non-initial fragments (OK)
+ * Datagram with a source address that isn't a single host. (OK)
+ * 4.3.2.8 (Rate Limiting)
+ * SHOULD be able to limit error message rate (OK)
+ * SHOULD allow setting of rate limits (OK, in the source)
+ * 4.3.3.1 (Destination Unreachable)
+ * All the rules govern the IP layer, and are dealt with in ip.c, not here.
+ * 4.3.3.2 (Redirect)
+ * MAY ignore ICMP Redirects if running a routing protocol or if forwarding
+ * is enabled on the interface (OK -- ignores)
+ * 4.3.3.3 (Source Quench)
+ * SHOULD NOT originate SQ messages (OK)
+ * MUST be able to limit SQ rate if originates them (OK as we don't
+ * send them)
+ * MAY ignore SQ messages it receives (OK -- we don't)
+ * 4.3.3.4 (Time Exceeded)
+ * Requirements dealt with at IP (generating TIME_EXCEEDED).
+ * 4.3.3.5 (Parameter Problem)
+ * MUST generate these for all errors not covered by other messages (OK)
+ * MUST include original value of the value pointed by (OK)
+ * 4.3.3.6 (Echo Request)
+ * MUST implement echo server function (OK)
+ * MUST process at ER of at least max(576, MTU) (OK)
+ * MAY reject broadcast/multicast ER's (We don't, but that's OK)
+ * SHOULD have a config option for silently ignoring ER's (OK)
+ * MUST have a default value for the above switch = NO (OK)
+ * MUST have application layer interface for Echo Request/Reply (OK)
+ * MUST reply using same source address as the request was sent to.
+ * We're OK for unicast ECHOs, and it doesn't say anything about
+ * how to handle broadcast ones, since it's optional.
+ * MUST copy data from Request to Reply (OK)
+ * SHOULD update Record Route / Timestamp options (??)
+ * MUST use reversed Source Route for Reply if possible (NOT YET)
+ * 4.3.3.7 (Information Request/Reply)
+ * SHOULD NOT originate or respond to these (OK)
+ * 4.3.3.8 (Timestamp / Timestamp Reply)
+ * MAY implement (OK)
+ * MUST reply to every Timestamp message received (OK)
+ * MAY discard broadcast REQUESTs. (OK, but see source for inconsistency)
+ * MUST reply using same source address as the request was sent to. (OK)
+ * MUST use reversed Source Route if possible (NOT YET)
+ * SHOULD update Record Route / Timestamp options (??)
+ * MUST pass REPLYs to transport/user layer (requires RAW, just like
+ * ECHO) (OK)
+ * MUST update clock for timestamp at least 16 times/sec (OK)
+ * MUST be "correct within a few minutes" (OK)
+ * 4.3.3.9 (Address Mask Request/Reply)
+ * MUST have support for receiving AMRq and responding with AMRe (OK,
+ * but only as a compile-time option)
+ * SHOULD have option for each interface for AMRe's, MUST default to
+ * NO (NOT YET)
+ * MUST NOT reply to AMRq before knows the correct AM (OK)
+ * MUST NOT respond to AMRq with source address 0.0.0.0 on physical
+ * interfaces having multiple logical i-faces with different masks
+ * (NOT YET)
+ * SHOULD examine all AMRe's it receives and check them (NOT YET)
+ * SHOULD log invalid AMRe's (AM+sender) (NOT YET)
+ * MUST NOT use contents of AMRe to determine correct AM (OK)
+ * MAY broadcast AMRe's after having configured address masks (OK -- doesn't)
+ * MUST NOT do broadcast AMRe's if not set by extra option (OK, no option)
+ * MUST use the { <NetPrefix>, -1 } form of broadcast addresses (OK)
+ * 4.3.3.10 (Router Advertisement and Solicitations)
+ * MUST support router part of Router Discovery Protocol on all networks we
+ * support broadcast or multicast addressing. (OK -- done by gated)
+ * MUST have all config parameters with the respective defaults (OK)
+ * 5.2.7.1 (Destination Unreachable)
+ * MUST generate DU's (OK)
+ * SHOULD choose a best-match response code (OK)
+ * SHOULD NOT generate Host Isolated codes (OK)
+ * SHOULD use Communication Administratively Prohibited when administratively
+ * filtering packets (NOT YET -- bug-to-bug compatibility)
+ * MAY include config option for not generating the above and silently
+ * discard the packets instead (OK)
+ * MAY include config option for not generating Precedence Violation and
+ * Precedence Cutoff messages (OK as we don't generate them at all)
+ * MUST use Host Unreachable or Dest. Host Unknown codes whenever other hosts
+ * on the same network might be reachable (OK -- no net unreach's at all)
+ * MUST use new form of Fragmentation Needed and DF Set messages (OK)
+ * 5.2.7.2 (Redirect)
+ * MUST NOT generate network redirects (OK)
+ * MUST be able to generate host redirects (OK)
+ * SHOULD be able to generate Host+TOS redirects (NO as we don't use TOS)
+ * MUST have an option to use Host redirects instead of Host+TOS ones (OK as
+ * no Host+TOS Redirects are used)
+ * MUST NOT generate redirects unless forwarding to the same i-face and the
+ * dest. address is on the same subnet as the src. address and no source
+ * routing is in use. (OK)
+ * MUST NOT follow redirects when using a routing protocol (OK)
+ * MAY use redirects if not using a routing protocol (OK, compile-time option)
+ * MUST comply to Host Requirements when not acting as a router (OK)
+ * 5.2.7.3 (Time Exceeded)
+ * MUST generate Time Exceeded Code 0 when discarding packet due to TTL=0 (OK)
+ * MAY have a per-interface option to disable origination of TE messages, but
+ * it MUST default to "originate" (OK -- we don't support it)
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <net/snmp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/init.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <net/checksum.h>
+
+#ifdef CONFIG_IP_MASQUERADE
+#include <net/ip_masq.h>
+#endif
+
+#define min(a,b) ((a)<(b)?(a):(b))
+
+/*
+ * Statistics
+ */
+
+struct icmp_mib icmp_statistics;
+
+/* An array of errno for error messages from dest unreach. */
+/* RFC 1122: 3.2.2.1 states that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errors'. */
+
+struct icmp_err icmp_err_convert[] = {
+ { ENETUNREACH, 0 }, /* ICMP_NET_UNREACH */
+ { EHOSTUNREACH, 0 }, /* ICMP_HOST_UNREACH */
+ { ENOPROTOOPT, 1 }, /* ICMP_PROT_UNREACH */
+ { ECONNREFUSED, 1 }, /* ICMP_PORT_UNREACH */
+ { EMSGSIZE, 0 }, /* ICMP_FRAG_NEEDED */
+ { EOPNOTSUPP, 0 }, /* ICMP_SR_FAILED */
+ { ENETUNREACH, 1 }, /* ICMP_NET_UNKNOWN */
+ { EHOSTDOWN, 1 }, /* ICMP_HOST_UNKNOWN */
+ { ENONET, 1 }, /* ICMP_HOST_ISOLATED */
+ { ENETUNREACH, 1 }, /* ICMP_NET_ANO */
+ { EHOSTUNREACH, 1 }, /* ICMP_HOST_ANO */
+ { ENETUNREACH, 0 }, /* ICMP_NET_UNR_TOS */
+ { EHOSTUNREACH, 0 }, /* ICMP_HOST_UNR_TOS */
+ { EHOSTUNREACH, 1 }, /* ICMP_PKT_FILTERED */
+ { EHOSTUNREACH, 1 }, /* ICMP_PREC_VIOLATION */
+ { EHOSTUNREACH, 1 } /* ICMP_PREC_CUTOFF */
+};
+
+/* Control parameters for ECHO replies. */
+int sysctl_icmp_echo_ignore_all = 0;
+int sysctl_icmp_echo_ignore_broadcasts = 0;
+
+/* Control parameter - ignore bogus broadcast responses? */
+int sysctl_icmp_ignore_bogus_error_responses = 0;
+
+/*
+ * ICMP control array. This specifies what to do with each ICMP.
+ */
+
+struct icmp_control
+{
+ unsigned long *output; /* Address to increment on output */
+ unsigned long *input; /* Address to increment on input */
+ void (*handler)(struct icmphdr *icmph, struct sk_buff *skb, int len);
+ short error; /* This ICMP is classed as an error message */
+ int *timeout; /* Rate limit */
+};
+
+static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
+
+/*
+ * Build xmit assembly blocks
+ */
+
+struct icmp_bxm
+{
+ void *data_ptr;
+ int data_len;
+ struct icmphdr icmph;
+ unsigned long csum;
+ struct ip_options replyopts;
+ unsigned char optbuf[40];
+};
+
+/*
+ * The ICMP socket. This is the most convenient way to flow control
+ * our ICMP output as well as maintain a clean interface throughout
+ * all layers. All Socketless IP sends will soon be gone.
+ */
+
+struct inode icmp_inode;
+struct socket *icmp_socket=&icmp_inode.u.socket_i;
+
+/*
+ * Send an ICMP frame.
+ */
+
+/*
+ * Check transmit rate limitation for given message.
+ * The rate information is held in the destination cache now.
+ * This function is generic and could be used for other purposes
+ * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ * Note that the same dst_entry fields are modified by functions in
+ * route.c too, but these work for packet destinations while xrlim_allow
+ * works for icmp destinations. This means the rate limiting information
+ * for one "ip object" is shared - and these ICMPs are twice limited:
+ * by source and by destination.
+ *
+ * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ * SHOULD allow setting of rate limits
+ *
+ * Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
+int xrlim_allow(struct dst_entry *dst, int timeout)
+{
+ unsigned long now;
+
+ now = jiffies;
+ dst->rate_tokens += now - dst->rate_last;
+ dst->rate_last = now;
+ if (dst->rate_tokens > XRLIM_BURST_FACTOR*timeout)
+ dst->rate_tokens = XRLIM_BURST_FACTOR*timeout;
+ if (dst->rate_tokens >= timeout) {
+ dst->rate_tokens -= timeout;
+ return 1;
+ }
+ return 0;
+}
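+
+/* Worked example (illustrative, not part of the original source):
+ * with timeout == 1*HZ the bucket holds at most 6*HZ tokens, so after
+ * an idle period a burst of six messages passes back to back and then
+ * one more is admitted per second as jiffies refill the bucket. The
+ * same filter reduced to its arithmetic (hypothetical standalone
+ * form, compile-guarded so it is never built):
+ */
+#if 0
+static int xrlim_example(unsigned long now, unsigned long *last,
+			 unsigned long *tokens, int timeout)
+{
+	*tokens += now - *last;				/* refill: one token per jiffy */
+	*last = now;
+	if (*tokens > XRLIM_BURST_FACTOR*timeout)
+		*tokens = XRLIM_BURST_FACTOR*timeout;	/* cap the burst */
+	if (*tokens >= timeout) {
+		*tokens -= timeout;			/* spend one message's worth */
+		return 1;
+	}
+	return 0;
+}
+#endif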
+
+static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code)
+{
+ struct dst_entry *dst = &rt->u.dst;
+
+ if (type > NR_ICMP_TYPES || !icmp_pointers[type].timeout)
+ return 1;
+
+ /* Don't limit PMTU discovery. */
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ return 1;
+
+ /* Redirect has its own rate limit mechanism */
+ if (type == ICMP_REDIRECT)
+ return 1;
+
+ /* No rate limit on loopback */
+ if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
+ return 1;
+
+ return xrlim_allow(dst, *(icmp_pointers[type].timeout));
+}
+
+/*
+ * Maintain the counters used in the SNMP statistics for outgoing ICMP
+ */
+
+static void icmp_out_count(int type)
+{
+ if (type>NR_ICMP_TYPES)
+ return;
+ (*icmp_pointers[type].output)++;
+ icmp_statistics.IcmpOutMsgs++;
+}
+
+/*
+ * Checksum each fragment, and on the first include the headers and final checksum.
+ */
+
+static int icmp_glue_bits(const void *p, char *to, unsigned int offset, unsigned int fraglen)
+{
+ struct icmp_bxm *icmp_param = (struct icmp_bxm *)p;
+ struct icmphdr *icmph;
+ unsigned long csum;
+
+ if (offset) {
+ icmp_param->csum=csum_partial_copy(icmp_param->data_ptr+offset-sizeof(struct icmphdr),
+ to, fraglen,icmp_param->csum);
+ return 0;
+ }
+
+ /*
+ * First fragment includes header. Note that we've done
+ * the other fragments first, so that we get the checksum
+ * for the whole packet here.
+ */
+ csum = csum_partial_copy((void *)&icmp_param->icmph,
+ to, sizeof(struct icmphdr),
+ icmp_param->csum);
+ csum = csum_partial_copy(icmp_param->data_ptr,
+ to+sizeof(struct icmphdr),
+ fraglen-sizeof(struct icmphdr), csum);
+ icmph=(struct icmphdr *)to;
+ icmph->checksum = csum_fold(csum);
+ return 0;
+}
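+
+/* Note on the ordering above (illustrative, not part of the original
+ * source): the non-first fragments are summed first, so by the time
+ * the first fragment is built icmp_param->csum already covers the
+ * rest of the message and a single csum_fold() yields the checksum of
+ * the whole ICMP packet. Folding a 32-bit partial sum down to the
+ * 16-bit ones' complement checksum works like this (compile-guarded
+ * sketch, not the arch-specific kernel csum_fold):
+ */
+#if 0
+static unsigned short csum_fold_example(unsigned long csum)
+{
+	while (csum >> 16)		/* fold the carries back in */
+		csum = (csum & 0xffff) + (csum >> 16);
+	return ~csum & 0xffff;		/* ones' complement of the sum */
+}
+#endif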
+
+/*
+ * Driving logic for building and sending ICMP messages.
+ */
+
+static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
+{
+ struct sock *sk=icmp_socket->sk;
+ struct ipcm_cookie ipc;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ u32 daddr;
+
+ if (ip_options_echo(&icmp_param->replyopts, skb))
+ return;
+
+ icmp_param->icmph.checksum=0;
+ icmp_param->csum=0;
+ icmp_out_count(icmp_param->icmph.type);
+
+ sk->ip_tos = skb->nh.iph->tos;
+ daddr = ipc.addr = rt->rt_src;
+ ipc.opt = &icmp_param->replyopts;
+ if (ipc.opt->srr)
+ daddr = icmp_param->replyopts.faddr;
+ if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
+ return;
+ ip_build_xmit(sk, icmp_glue_bits, icmp_param,
+ icmp_param->data_len+sizeof(struct icmphdr),
+ &ipc, rt, MSG_DONTWAIT);
+ ip_rt_put(rt);
+}
+
+
+/*
+ * Send an ICMP message in response to a situation
+ *
+ * RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of the offending datagram's data. MAY send more (we do).
+ * MUST NOT change this header information.
+ * MUST NOT reply to a multicast/broadcast IP address.
+ * MUST NOT reply to a multicast/broadcast MAC address.
+ * MUST reply to only the first fragment.
+ */
+
+void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info)
+{
+ struct iphdr *iph;
+ struct icmphdr *icmph;
+ int room;
+ struct icmp_bxm icmp_param;
+ struct rtable *rt = (struct rtable*)skb_in->dst;
+ struct ipcm_cookie ipc;
+ u32 saddr;
+ u8 tos;
+
+ /*
+ * Find the original header
+ */
+
+ iph = skb_in->nh.iph;
+
+ /*
+ * No replies to physical multicast/broadcast
+ */
+
+ if (skb_in->pkt_type!=PACKET_HOST)
+ return;
+
+ /*
+ * Now check at the protocol level
+ */
+ if (!rt) {
+#ifndef CONFIG_IP_ALWAYS_DEFRAG
+ if (net_ratelimit())
+ printk(KERN_DEBUG "icmp_send: destinationless packet\n");
+#endif
+ return;
+ }
+ if (rt->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
+ return;
+
+
+ /*
+ * Only reply to fragment 0. We byte re-order the constant
+ * mask for efficiency.
+ */
+
+ if (iph->frag_off&htons(IP_OFFSET))
+ return;
+
+ /*
+ * If we send an ICMP error in reply to an ICMP error, a mess would result.
+ */
+
+ if (icmp_pointers[type].error) {
+ /*
+ * We are an error, check if we are replying to an ICMP error
+ */
+
+ if (iph->protocol==IPPROTO_ICMP) {
+ icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2));
+ /*
+ * Assume any unknown ICMP type is an error. This isn't
+ * specified by the RFC, but think about it..
+ */
+ if (icmph->type>NR_ICMP_TYPES || icmp_pointers[icmph->type].error)
+ return;
+ }
+ }
+
+
+ /*
+ * Construct source address and options.
+ */
+
+#ifdef CONFIG_IP_ROUTE_NAT
+ /*
+ * Restore original addresses if packet has been translated.
+ */
+ if (rt->rt_flags&RTCF_NAT && IPCB(skb_in)->flags&IPSKB_TRANSLATED) {
+ iph->daddr = rt->key.dst;
+ iph->saddr = rt->key.src;
+ }
+#endif
+#ifdef CONFIG_IP_MASQUERADE
+ if (type==ICMP_DEST_UNREACH && IPCB(skb_in)->flags&IPSKB_MASQUERADED) {
+ ip_fw_unmasq_icmp(skb_in);
+ }
+#endif
+
+ saddr = iph->daddr;
+ if (!(rt->rt_flags & RTCF_LOCAL))
+ saddr = 0;
+
+ tos = icmp_pointers[type].error ?
+ ((iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL) :
+ iph->tos;
+
+ /* XXX: use a more aggressive expire for routes created by
+ * this call (not longer than the rate limit timeout).
+ * It could be also worthwhile to not put them into ipv4
+ * fast routing cache at first. Otherwise an attacker can
+ * grow the routing table.
+ */
+ if (ip_route_output(&rt, iph->saddr, saddr, RT_TOS(tos), 0))
+ return;
+
+ if (ip_options_echo(&icmp_param.replyopts, skb_in))
+ goto ende;
+
+
+ /*
+ * Prepare data for ICMP header.
+ */
+
+ icmp_param.icmph.type=type;
+ icmp_param.icmph.code=code;
+ icmp_param.icmph.un.gateway = info;
+ icmp_param.icmph.checksum=0;
+ icmp_param.csum=0;
+ icmp_param.data_ptr=iph;
+ icmp_out_count(icmp_param.icmph.type);
+ icmp_socket->sk->ip_tos = tos;
+ ipc.addr = iph->saddr;
+ ipc.opt = &icmp_param.replyopts;
+ if (icmp_param.replyopts.srr) {
+ ip_rt_put(rt);
+ if (ip_route_output(&rt, icmp_param.replyopts.faddr, saddr, RT_TOS(tos), 0))
+ return;
+ }
+
+ if (!icmpv4_xrlim_allow(rt, type, code))
+ goto ende;
+
+ /* RFC says return as much as we can without exceeding 576 bytes. */
+
+ room = rt->u.dst.pmtu;
+ if (room > 576)
+ room = 576;
+ room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
+ room -= sizeof(struct icmphdr);
+
+ icmp_param.data_len=(iph->ihl<<2)+skb_in->len;
+ if (icmp_param.data_len > room)
+ icmp_param.data_len = room;
+
+ ip_build_xmit(icmp_socket->sk, icmp_glue_bits, &icmp_param,
+ icmp_param.data_len+sizeof(struct icmphdr),
+ &ipc, rt, MSG_DONTWAIT);
+
+ende:
+ ip_rt_put(rt);
+}
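+
+/* Worked example of the "room" computation above (illustrative only):
+ * with a path MTU of 1500 the budget is clamped to 576 bytes; with no
+ * IP options that leaves 576 - 20 (IP header) - 8 (ICMP header) = 548
+ * bytes for the quoted original header plus payload, per RFC 1812.
+ */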
+
+
+/*
+ * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
+ */
+
+static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len)
+{
+ struct iphdr *iph;
+ int hash;
+ struct inet_protocol *ipprot;
+ unsigned char *dp;
+ struct sock *raw_sk;
+
+ /*
+ * Incomplete header?
+ * We only check the IP header here; upper levels should
+ * perform an additional check for longer headers.
+ */
+
+ if(len<sizeof(struct iphdr)) {
+ icmp_statistics.IcmpInErrors++;
+ return;
+ }
+
+ iph = (struct iphdr *) (icmph + 1);
+ dp = (unsigned char*)iph;
+
+ if(icmph->type==ICMP_DEST_UNREACH) {
+ switch(icmph->code & 15) {
+ case ICMP_NET_UNREACH:
+ break;
+ case ICMP_HOST_UNREACH:
+ break;
+ case ICMP_PROT_UNREACH:
+ break;
+ case ICMP_PORT_UNREACH:
+ break;
+ case ICMP_FRAG_NEEDED:
+ if (ipv4_config.no_pmtu_disc) {
+ if (net_ratelimit())
+ printk(KERN_INFO "ICMP: %d.%d.%d.%d: fragmentation needed and DF set.\n",
+ NIPQUAD(iph->daddr));
+ } else {
+ unsigned short new_mtu;
+ new_mtu = ip_rt_frag_needed(iph, ntohs(icmph->un.frag.mtu));
+ if (!new_mtu)
+ return;
+ icmph->un.frag.mtu = htons(new_mtu);
+ }
+ break;
+ case ICMP_SR_FAILED:
+ if (net_ratelimit())
+ printk(KERN_INFO "ICMP: %d.%d.%d.%d: Source Route Failed.\n", NIPQUAD(iph->daddr));
+ break;
+ default:
+ break;
+ }
+ if (icmph->code>NR_ICMP_UNREACH)
+ return;
+ }
+
+ /*
+ * Throw it at our lower layers
+ *
+ * RFC 1122: 3.2.2 MUST extract the protocol ID from the passed header.
+ * RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the transport layer.
+ * RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to transport layer.
+ */
+
+ /*
+ * Check that the other end isn't violating RFC 1122. Some routers
+ * send bogus responses to broadcast frames. If you see this message,
+ * first check that your netmask matches at both ends; if it does,
+ * get the other vendor to fix their kit.
+ */
+
+ if (!sysctl_icmp_ignore_bogus_error_responses)
+ {
+
+ if (inet_addr_type(iph->daddr) == RTN_BROADCAST)
+ {
+ if (net_ratelimit())
+ printk(KERN_WARNING "%d.%d.%d.%d sent an invalid ICMP error to a broadcast.\n",
+ NIPQUAD(skb->nh.iph->saddr));
+ return;
+ }
+ }
+
+ /*
+ * Deliver ICMP message to raw sockets. Pretty useless feature?
+ */
+
+ /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
+ hash = iph->protocol & (MAX_INET_PROTOS - 1);
+ if ((raw_sk = raw_v4_htable[hash]) != NULL)
+ {
+ while ((raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr,
+ iph->daddr, skb->dev->ifindex)) != NULL) {
+ raw_err(raw_sk, skb);
+ raw_sk = raw_sk->next;
+ }
+ }
+
+ /*
+ * This can't change while we are doing it.
+ */
+
+ ipprot = (struct inet_protocol *) inet_protos[hash];
+ while(ipprot != NULL) {
+ struct inet_protocol *nextip;
+
+ nextip = (struct inet_protocol *) ipprot->next;
+
+ /*
+ * Pass it off to everyone who wants it.
+ */
+
+ /* RFC1122: OK. Passes appropriate ICMP errors to the */
+ /* appropriate protocol layer (MUST), as per 3.2.2. */
+
+ if (iph->protocol == ipprot->protocol && ipprot->err_handler)
+ ipprot->err_handler(skb, dp, len);
+
+ ipprot = nextip;
+ }
+}
+
+
+/*
+ * Handle ICMP_REDIRECT.
+ */
+
+static void icmp_redirect(struct icmphdr *icmph, struct sk_buff *skb, int len)
+{
+ struct iphdr *iph;
+ unsigned long ip;
+
+ if (len < sizeof(struct iphdr)) {
+ icmp_statistics.IcmpInErrors++;
+ return;
+ }
+
+ /*
+ * Get the copied header of the packet that caused the redirect
+ */
+
+ iph = (struct iphdr *) (icmph + 1);
+ ip = iph->daddr;
+
+ switch(icmph->code & 7) {
+ case ICMP_REDIR_NET:
+ case ICMP_REDIR_NETTOS:
+ /*
+ * As per RFC recommendations, we now handle it as
+ * a host redirect.
+ */
+
+ case ICMP_REDIR_HOST:
+ case ICMP_REDIR_HOSTTOS:
+ ip_rt_redirect(skb->nh.iph->saddr, ip, icmph->un.gateway, iph->saddr, iph->tos, skb->dev);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * Handle ICMP_ECHO ("ping") requests.
+ *
+ * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo requests.
+ * RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be included in the reply.
+ * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring echo requests, MUST have default=NOT.
+ * See also WRT handling of options once they are done and working.
+ */
+
+static void icmp_echo(struct icmphdr *icmph, struct sk_buff *skb, int len)
+{
+ if (!sysctl_icmp_echo_ignore_all) {
+ struct icmp_bxm icmp_param;
+
+ icmp_param.icmph=*icmph;
+ icmp_param.icmph.type=ICMP_ECHOREPLY;
+ icmp_param.data_ptr=(icmph+1);
+ icmp_param.data_len=len;
+ icmp_reply(&icmp_param, skb);
+ }
+}
+
+/*
+ * Handle ICMP Timestamp requests.
+ * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests.
+ * SHOULD be in the kernel for minimum random latency.
+ * MUST be accurate to a few minutes.
+ * MUST be updated at least at 15Hz.
+ */
+
+static void icmp_timestamp(struct icmphdr *icmph, struct sk_buff *skb, int len)
+{
+ struct timeval tv;
+ __u32 times[3]; /* So the new timestamp works on Alphas. */
+ struct icmp_bxm icmp_param;
+
+ /*
+ * Too short.
+ */
+
+ if(len<12) {
+ icmp_statistics.IcmpInErrors++;
+ return;
+ }
+
+ /*
+ * Fill in the current time as ms since midnight UT:
+ */
+
+ do_gettimeofday(&tv);
+ times[1] = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
+ times[2] = times[1];
+ memcpy((void *)&times[0], icmph+1, 4); /* Incoming stamp */
+ icmp_param.icmph=*icmph;
+ icmp_param.icmph.type=ICMP_TIMESTAMPREPLY;
+ icmp_param.icmph.code=0;
+ icmp_param.data_ptr=&times;
+ icmp_param.data_len=12;
+ icmp_reply(&icmp_param, skb);
+}
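+
+/* Worked example of the timestamp arithmetic above (illustrative
+ * only): at 12:00:00.500 UT, tv.tv_sec % 86400 == 43200 and
+ * tv.tv_usec / 1000 == 500, so times[1] = htonl(43200*1000 + 500) =
+ * htonl(43200500) -- milliseconds since midnight UT, as RFC 792
+ * requires.
+ */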
+
+
+/*
+ * Handle ICMP_ADDRESS_MASK requests. (RFC950)
+ *
+ * RFC1122 (3.2.2.9). A host MUST only send replies to
+ * ADDRESS_MASK requests if it's been configured as an address mask
+ * agent. Receiving a request doesn't constitute implicit permission to
+ * act as one. Of course, implementing this correctly requires (SHOULD)
+ * a way to turn the functionality on and off. Another one for sysctl(),
+ * I guess. -- MS
+ *
+ * RFC1812 (4.3.3.9). A router MUST implement it.
+ * A router SHOULD have switch turning it on/off.
+ * This switch MUST be ON by default.
+ *
+ * Gratuitous replies and zero-source replies are not implemented;
+ * that complies with the RFC. DO NOT implement them!!! The whole idea
+ * of broadcast addrmask replies as specified in RFC950 is broken.
+ * The problem is that it is not uncommon to have several prefixes
+ * on one physical interface. Moreover, an addrmask agent may not
+ * even be aware that other prefixes exist.
+ * If the source is zero, the addrmask agent cannot choose the correct
+ * prefix. Gratuitous mask announcements suffer from the same problem.
+ * RFC1812 explains this, but still allows the use of ADDRMASK,
+ * which is pretty silly. --ANK
+ *
+ * All these rules are so bizarre that I removed kernel addrmask
+ * support entirely. It is wrong, it is obsolete, and nobody uses it
+ * in any case. --ANK
+ *
+ * Furthermore, you can do it with a usermode address agent program
+ * anyway...
+ */
+
+static void icmp_address(struct icmphdr *icmph, struct sk_buff *skb, int len)
+{
+#if 0
+ if (net_ratelimit())
+ printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
+#endif
+}
+
+/*
+ * RFC1812 (4.3.3.9). A router SHOULD listen to all replies, and complain
+ * loudly if an inconsistency is found.
+ */
+
+static void icmp_address_reply(struct icmphdr *icmph, struct sk_buff *skb, int len)
+{
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct device *dev = skb->dev;
+ struct in_device *in_dev = dev->ip_ptr;
+ struct in_ifaddr *ifa;
+ u32 mask;
+
+ if (!in_dev || !in_dev->ifa_list ||
+ !IN_DEV_LOG_MARTIANS(in_dev) ||
+ !IN_DEV_FORWARD(in_dev) ||
+ len < 4 ||
+ !(rt->rt_flags&RTCF_DIRECTSRC))
+ return;
+
+ mask = *(u32*)&icmph[1];
+ for (ifa=in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ if (mask == ifa->ifa_mask && inet_ifa_match(rt->rt_src, ifa))
+ return;
+ }
+ if (net_ratelimit())
+ printk(KERN_INFO "Wrong address mask %08lX from %08lX/%s\n",
+ ntohl(mask), ntohl(rt->rt_src), dev->name);
+}
+
+static void icmp_discard(struct icmphdr *icmph, struct sk_buff *skb, int len)
+{
+}
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+/*
+ * Check incoming icmp packets not addressed locally, to check whether
+ * they relate to a (proxying) socket on our system.
+ * Needed for transparent proxying.
+ *
+ * This code is presently ugly and needs cleanup.
+ * Probably should add a chkaddr entry to ipprot to call a chk routine
+ * in udp.c or tcp.c...
+ */
+
+/* This should work with the new hashes now. -DaveM */
+extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif);
+extern struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif);
+
+int icmp_chkaddr(struct sk_buff *skb)
+{
+ struct icmphdr *icmph=(struct icmphdr *)(skb->nh.raw + skb->nh.iph->ihl*4);
+ struct iphdr *iph = (struct iphdr *) (icmph + 1);
+ void (*handler)(struct icmphdr *icmph, struct sk_buff *skb, int len) = icmp_pointers[icmph->type].handler;
+
+ if (handler == icmp_unreach || handler == icmp_redirect) {
+ struct sock *sk;
+
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ {
+ struct tcphdr *th = (struct tcphdr *)(((unsigned char *)iph)+(iph->ihl<<2));
+
+ sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
+ if (!sk || (sk->state == TCP_LISTEN))
+ return 0;
+ /*
+ * This packet came from us.
+ */
+ return 1;
+ }
+ case IPPROTO_UDP:
+ {
+ struct udphdr *uh = (struct udphdr *)(((unsigned char *)iph)+(iph->ihl<<2));
+
+ sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex);
+ if (!sk) return 0;
+ if (sk->saddr != iph->saddr && inet_addr_type(iph->saddr) != RTN_LOCAL)
+ return 0;
+ /*
+ * This packet may have come from us.
+ * Assume it did.
+ */
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+#endif
+
+/*
+ * Deal with incoming ICMP packets.
+ */
+
+int icmp_rcv(struct sk_buff *skb, unsigned short len)
+{
+ struct icmphdr *icmph = skb->h.icmph;
+ struct rtable *rt = (struct rtable*)skb->dst;
+
+ icmp_statistics.IcmpInMsgs++;
+
+ /*
+ * 18 is the highest 'known' ICMP type. Anything else is a mystery.
+ *
+ * RFC 1122: 3.2.2 Unknown ICMP message types MUST be silently discarded.
+ */
+ if(len < sizeof(struct icmphdr) ||
+ ip_compute_csum((unsigned char *) icmph, len) ||
+ icmph->type > NR_ICMP_TYPES)
+ goto error;
+
+ /*
+ * Parse the ICMP message
+ */
+
+ if (rt->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
+ /*
+ * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
+ * silently ignored (we let user decide with a sysctl).
+ * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
+ * discarded if to broadcast/multicast.
+ */
+ if (icmph->type == ICMP_ECHO &&
+ sysctl_icmp_echo_ignore_broadcasts) {
+ goto error;
+ }
+ if (icmph->type != ICMP_ECHO &&
+ icmph->type != ICMP_TIMESTAMP &&
+ icmph->type != ICMP_ADDRESS &&
+ icmph->type != ICMP_ADDRESSREPLY) {
+ goto error;
+ }
+ }
+
+ len -= sizeof(struct icmphdr);
+ (*icmp_pointers[icmph->type].input)++;
+ (icmp_pointers[icmph->type].handler)(icmph, skb, len);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+error:
+ icmp_statistics.IcmpInErrors++;
+ goto drop;
+}
+
+/*
+ * A spare long used to speed up statistics updating
+ */
+
+static unsigned long dummy;
+
+/*
+ * Configurable rate limits.
+ * Someone should check if these default values are correct.
+ * Note that these values interact with the routing cache GC timeout.
+ * If you choose them too high they won't take effect, because the
+ * dst_entry gets expired too early. The same should happen when
+ * the cache grows too big.
+ */
+int sysctl_icmp_destunreach_time = 1*HZ;
+int sysctl_icmp_timeexceed_time = 1*HZ;
+int sysctl_icmp_paramprob_time = 1*HZ;
+int sysctl_icmp_echoreply_time = 0; /* don't limit it per default. */
+
+/*
+ * This table is the definition of how we handle ICMP.
+ */
+
+static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1] = {
+/* ECHO REPLY (0) */
+ { &icmp_statistics.IcmpOutEchoReps, &icmp_statistics.IcmpInEchoReps, icmp_discard, 0, &sysctl_icmp_echoreply_time},
+ { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, },
+ { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, },
+/* DEST UNREACH (3) */
+ { &icmp_statistics.IcmpOutDestUnreachs, &icmp_statistics.IcmpInDestUnreachs, icmp_unreach, 1, &sysctl_icmp_destunreach_time },
+/* SOURCE QUENCH (4) */
+ { &icmp_statistics.IcmpOutSrcQuenchs, &icmp_statistics.IcmpInSrcQuenchs, icmp_unreach, 1, },
+/* REDIRECT (5) */
+ { &icmp_statistics.IcmpOutRedirects, &icmp_statistics.IcmpInRedirects, icmp_redirect, 1, },
+ { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, },
+ { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, },
+/* ECHO (8) */
+ { &icmp_statistics.IcmpOutEchos, &icmp_statistics.IcmpInEchos, icmp_echo, 0, },
+ { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, },
+ { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, },
+/* TIME EXCEEDED (11) */
+ { &icmp_statistics.IcmpOutTimeExcds, &icmp_statistics.IcmpInTimeExcds, icmp_unreach, 1, &sysctl_icmp_timeexceed_time },
+/* PARAMETER PROBLEM (12) */
+ { &icmp_statistics.IcmpOutParmProbs, &icmp_statistics.IcmpInParmProbs, icmp_unreach, 1, &sysctl_icmp_paramprob_time },
+/* TIMESTAMP (13) */
+ { &icmp_statistics.IcmpOutTimestamps, &icmp_statistics.IcmpInTimestamps, icmp_timestamp, 0, },
+/* TIMESTAMP REPLY (14) */
+ { &icmp_statistics.IcmpOutTimestampReps, &icmp_statistics.IcmpInTimestampReps, icmp_discard, 0, },
+/* INFO (15) */
+ { &dummy, &dummy, icmp_discard, 0, },
+/* INFO REPLY (16) */
+ { &dummy, &dummy, icmp_discard, 0, },
+/* ADDR MASK (17) */
+ { &icmp_statistics.IcmpOutAddrMasks, &icmp_statistics.IcmpInAddrMasks, icmp_address, 0, },
+/* ADDR MASK REPLY (18) */
+ { &icmp_statistics.IcmpOutAddrMaskReps, &icmp_statistics.IcmpInAddrMaskReps, icmp_address_reply, 0, }
+};
+
+__initfunc(void icmp_init(struct net_proto_family *ops))
+{
+ int err;
+
+ icmp_inode.i_mode = S_IFSOCK;
+ icmp_inode.i_sock = 1;
+ icmp_inode.i_uid = 0;
+ icmp_inode.i_gid = 0;
+
+ icmp_socket->inode = &icmp_inode;
+ icmp_socket->state = SS_UNCONNECTED;
+ icmp_socket->type=SOCK_RAW;
+
+ if ((err=ops->create(icmp_socket, IPPROTO_ICMP))<0)
+ panic("Failed to create the ICMP control socket.\n");
+ icmp_socket->sk->allocation=GFP_ATOMIC;
+ icmp_socket->sk->num = 256; /* Don't receive any data */
+ icmp_socket->sk->ip_ttl = MAXTTL;
+}
diff --git a/pfinet/linux-src/net/ipv4/igmp.c b/pfinet/linux-src/net/ipv4/igmp.c
new file mode 100644
index 00000000..934e8601
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/igmp.c
@@ -0,0 +1,698 @@
+/*
+ * Linux NET3: Internet Group Management Protocol [IGMP]
+ *
+ * This code implements the IGMP protocol as defined in RFC1112. There has
+ * since been a further revision of this protocol, which is now supported.
+ *
+ * If you have trouble with this module, be careful which gcc you have used;
+ * the older version didn't come out right using gcc 2.5.8, and the newer one
+ * seems to fall out with gcc 2.6.2.
+ *
+ * Version: $Id: igmp.c,v 1.30.2.1 1999/07/23 15:29:22 davem Exp $
+ *
+ * Authors:
+ * Alan Cox <Alan.Cox@linux.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ *
+ * Alan Cox : Added lots of __inline__ to optimise
+ * the memory usage of all the tiny little
+ * functions.
+ * Alan Cox : Dumped the header building experiment.
+ * Alan Cox : Minor tweaks ready for multicast routing
+ * and extended IGMP protocol.
+ * Alan Cox : Removed a load of inline directives. Gcc 2.5.8
+ * writes utterly bogus code otherwise (sigh)
+ * fixed IGMP loopback to behave in the manner
+ * desired by mrouted, fixed the fact it has been
+ * broken since 1.3.6 and cleaned up a few minor
+ * points.
+ *
+ * Chih-Jen Chang : Tried to revise IGMP to Version 2
+ * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu
+ * The enhancements are mainly based on Steve Deering's
+ * ipmulti-3.5 source code.
+ * Chih-Jen Chang : Added the igmp_get_mrouter_info and
+ * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of
+ * the mrouted version on that device.
+ * Chih-Jen Chang : Added the max_resp_time parameter to
+ * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter
+ * to identify the multicast router version
+ * and do what the IGMP version 2 specified.
+ * Chih-Jen Chang : Added a timer to revert to IGMP V2 router
+ * Tsu-Sheng Tsao if the specified time expired.
+ * Alan Cox : Stop IGMP from 0.0.0.0 being accepted.
+ * Alan Cox : Use GFP_ATOMIC in the right places.
+ * Christian Daudt : igmp timer wasn't set for local group
+ * memberships but was being deleted,
+ * which caused a "del_timer() called
+ * from %p with timer not initialized\n"
+ * message (960131).
+ * Christian Daudt : removed del_timer from
+ * igmp_timer_expire function (960205).
+ * Christian Daudt : igmp_heard_report now only calls
+ * igmp_timer_expire if tm->running is
+ * true (960216).
+ * Malcolm Beattie : ttl comparison wrong in igmp_rcv made
+ * igmp_heard_query never trigger. Expiry
+ * miscalculation fixed in igmp_heard_query
+ * and random() made to return unsigned to
+ * prevent negative expiry times.
+ * Alexey Kuznetsov: Wrong group leaving behaviour, backport
+ * fix from pending 2.1.x patches.
+ * Alan Cox: Forget to enable FDDI support earlier.
+ * Alexey Kuznetsov: Fixed leaving groups on device down.
+ * Alexey Kuznetsov: Accordance to igmp-v2-06 draft.
+ */
+
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+
+#define IP_MAX_MEMBERSHIPS 20
+
+#ifdef CONFIG_IP_MULTICAST
+
+/* Parameter names and values are taken from igmp-v2-06 draft */
+
+#define IGMP_V1_Router_Present_Timeout (400*HZ)
+#define IGMP_Unsolicited_Report_Interval (10*HZ)
+#define IGMP_Query_Response_Interval (10*HZ)
+#define IGMP_Unsolicited_Report_Count 2
+
+
+#define IGMP_Initial_Report_Delay (1*HZ)
+
+/* IGMP_Initial_Report_Delay is not from IGMP specs!
+ * The IGMP specs require reporting membership immediately after
+ * joining a group, but we delay the first report by a
+ * small interval. It seems more natural and still does not
+ * contradict the specs, provided this delay is small enough.
+ */
+
+#define IGMP_V1_SEEN(in_dev) ((in_dev)->mr_v1_seen && (long)(jiffies - (in_dev)->mr_v1_seen) < 0)
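+
+/* The signed (long) subtraction above is the usual jiffies trick
+ * (explanatory note, not part of the original source): mr_v1_seen is
+ * set to jiffies + 400*HZ when a v1 query is heard, and the test
+ * reads as "that deadline has not yet passed", remaining correct
+ * across jiffies wraparound as long as the window is well under half
+ * the counter range.
+ */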
+
+/*
+ * Timer management
+ */
+
+static __inline__ void igmp_stop_timer(struct ip_mc_list *im)
+{
+ if (im->tm_running) {
+ del_timer(&im->timer);
+ im->tm_running=0;
+ }
+}
+
+static __inline__ void igmp_start_timer(struct ip_mc_list *im, int max_delay)
+{
+ int tv;
+ if (im->tm_running)
+ return;
+ tv=net_random() % max_delay;
+ im->timer.expires=jiffies+tv+2;
+ im->tm_running=1;
+ add_timer(&im->timer);
+}
+
+/*
+ * Send an IGMP report.
+ */
+
+#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
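+
+/* Size check (illustrative only): sizeof(struct igmphdr) == 8 and
+ * sizeof(struct iphdr) == 20, plus 4 bytes of Router Alert option,
+ * so IGMP_SIZE is 32 bytes on the wire.
+ */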
+
+static int igmp_send_report(struct device *dev, u32 group, int type)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct igmphdr *ih;
+ struct rtable *rt;
+ u32 dst;
+
+ /* According to IGMPv2 specs, LEAVE messages are
+ * sent to the all-routers group.
+ */
+ dst = group;
+ if (type == IGMP_HOST_LEAVE_MESSAGE)
+ dst = IGMP_ALL_ROUTER;
+
+ if (ip_route_output(&rt, dst, 0, 0, dev->ifindex))
+ return -1;
+ if (rt->rt_src == 0) {
+ ip_rt_put(rt);
+ return -1;
+ }
+
+ skb=alloc_skb(IGMP_SIZE+dev->hard_header_len+15, GFP_ATOMIC);
+ if (skb == NULL) {
+ ip_rt_put(rt);
+ return -1;
+ }
+
+ skb->dst = &rt->u.dst;
+
+ skb_reserve(skb, (dev->hard_header_len+15)&~15);
+
+ skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
+
+ iph->version = 4;
+ iph->ihl = (sizeof(struct iphdr)+4)>>2;
+ iph->tos = 0;
+ iph->frag_off = 0;
+ iph->ttl = 1;
+ iph->daddr = dst;
+ iph->saddr = rt->rt_src;
+ iph->protocol = IPPROTO_IGMP;
+ iph->tot_len = htons(IGMP_SIZE);
+ iph->id = htons(ip_id_count++);
+ ((u8*)&iph[1])[0] = IPOPT_RA;
+ ((u8*)&iph[1])[1] = 4;
+ ((u8*)&iph[1])[2] = 0;
+ ((u8*)&iph[1])[3] = 0;
+ ip_send_check(iph);
+
+ ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+ ih->type=type;
+ ih->code=0;
+ ih->csum=0;
+ ih->group=group;
+ ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr));
+
+ return skb->dst->output(skb);
+}
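+
+/* The four bytes written after the basic header above form the IP
+ * Router Alert option of RFC 2113 (explanatory note, not part of the
+ * original source): type IPOPT_RA (148), length 4, and a zero 16-bit
+ * value meaning "routers should examine this packet" -- which is how
+ * reports get noticed by multicast routers along the path.
+ */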
+
+
+static void igmp_timer_expire(unsigned long data)
+{
+ struct ip_mc_list *im=(struct ip_mc_list *)data;
+ struct in_device *in_dev = im->interface;
+ int err;
+
+ im->tm_running=0;
+
+ if (IGMP_V1_SEEN(in_dev))
+ err = igmp_send_report(in_dev->dev, im->multiaddr, IGMP_HOST_MEMBERSHIP_REPORT);
+ else
+ err = igmp_send_report(in_dev->dev, im->multiaddr, IGMP_HOST_NEW_MEMBERSHIP_REPORT);
+
+ /* Failed. Retry later. */
+ if (err) {
+ igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+ return;
+ }
+
+ if (im->unsolicit_count) {
+ im->unsolicit_count--;
+ igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+ }
+ im->reporter = 1;
+}
+
+static void igmp_heard_report(struct in_device *in_dev, u32 group)
+{
+ struct ip_mc_list *im;
+
+ /* Timers are only set for non-local groups */
+
+ if (group == IGMP_ALL_HOSTS)
+ return;
+
+ for (im=in_dev->mc_list; im!=NULL; im=im->next) {
+ if (im->multiaddr == group) {
+ igmp_stop_timer(im);
+ im->reporter = 0;
+ im->unsolicit_count = 0;
+ return;
+ }
+ }
+}
+
+static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_time,
+ u32 group)
+{
+ struct ip_mc_list *im;
+ int max_delay;
+
+ max_delay = max_resp_time*(HZ/IGMP_TIMER_SCALE);
+
+ if (max_resp_time == 0) {
+ /* Alas, an old v1 router is present here. */
+
+ max_delay = IGMP_Query_Response_Interval;
+ in_dev->mr_v1_seen = jiffies + IGMP_V1_Router_Present_Timeout;
+ group = 0;
+ }
+
+ /*
+ * - Start the timers in all of our membership records
+ * that the query applies to for the interface on
+ * which the query arrived excl. those that belong
+ * to a "local" group (224.0.0.X)
+ * - For timers already running check if they need to
+ * be reset.
+ * - Use the igmp->igmp_code field as the maximum
+ * delay possible
+ */
+ for (im=in_dev->mc_list; im!=NULL; im=im->next) {
+ if (group && group != im->multiaddr)
+ continue;
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ continue;
+ im->unsolicit_count = 0;
+ if (im->tm_running && (long)(im->timer.expires-jiffies) > max_delay)
+ igmp_stop_timer(im);
+ igmp_start_timer(im, max_delay);
+ }
+}
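+
+/* Worked example (illustrative only): max_resp_time is in units of
+ * 1/IGMP_TIMER_SCALE seconds (1/10 s in IGMPv2), so a query code of
+ * 100 gives max_delay = 100*(HZ/10) = 10*HZ jiffies, i.e. the report
+ * timer fires after a random delay of up to ten seconds.
+ */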
+
+int igmp_rcv(struct sk_buff *skb, unsigned short len)
+{
+ /* This basically follows the spec line by line -- see RFC1112 */
+ struct igmphdr *ih = skb->h.igmph;
+ struct in_device *in_dev = skb->dev->ip_ptr;
+
+ if (len < sizeof(struct igmphdr) || ip_compute_csum((void *)ih, len)
+ || in_dev==NULL) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ switch (ih->type) {
+ case IGMP_HOST_MEMBERSHIP_QUERY:
+ igmp_heard_query(in_dev, ih->code, ih->group);
+ break;
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ case IGMP_HOST_NEW_MEMBERSHIP_REPORT:
+ /* Is it our report looped back? */
+ if (((struct rtable*)skb->dst)->key.iif == 0)
+ break;
+ igmp_heard_report(in_dev, ih->group);
+ break;
+ case IGMP_PIM:
+#ifdef CONFIG_IP_PIMSM_V1
+ return pim_rcv_v1(skb, len);
+#endif
+ case IGMP_DVMRP:
+ case IGMP_TRACE:
+ case IGMP_HOST_LEAVE_MESSAGE:
+ case IGMP_MTRACE:
+ case IGMP_MTRACE_RESP:
+ break;
+ default:
+ NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why don't we know about it?\n", ih->type));
+ }
+ kfree_skb(skb);
+ return 0;
+}
+
+#endif
+
+
+/*
+ * Add a filter to a device
+ */
+
+static void ip_mc_filter_add(struct in_device *in_dev, u32 addr)
+{
+ char buf[MAX_ADDR_LEN];
+ struct device *dev = in_dev->dev;
+
+ /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
+ We will get multicast token leakage when IFF_MULTICAST
+ is changed. This check should be done in the dev->set_multicast_list
+ routine. Something along the lines of:
+ if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
+ --ANK
+ */
+ if (arp_mc_map(addr, buf, dev, 0) == 0)
+ dev_mc_add(dev,buf,dev->addr_len,0);
+}
+
+/*
+ * Remove a filter from a device
+ */
+
+static void ip_mc_filter_del(struct in_device *in_dev, u32 addr)
+{
+ char buf[MAX_ADDR_LEN];
+ struct device *dev = in_dev->dev;
+
+ if (arp_mc_map(addr, buf, dev, 0) == 0)
+ dev_mc_delete(dev,buf,dev->addr_len,0);
+}
+
+static void igmp_group_dropped(struct ip_mc_list *im)
+{
+ if (im->loaded) {
+ im->loaded = 0;
+ ip_mc_filter_del(im->interface, im->multiaddr);
+ }
+
+#ifdef CONFIG_IP_MULTICAST
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ return;
+
+ start_bh_atomic();
+ igmp_stop_timer(im);
+ end_bh_atomic();
+
+ if (im->reporter && !IGMP_V1_SEEN(im->interface))
+ igmp_send_report(im->interface->dev, im->multiaddr, IGMP_HOST_LEAVE_MESSAGE);
+#endif
+}
+
+static void igmp_group_added(struct ip_mc_list *im)
+{
+ if (im->loaded == 0) {
+ im->loaded = 1;
+ ip_mc_filter_add(im->interface, im->multiaddr);
+ }
+
+#ifdef CONFIG_IP_MULTICAST
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ return;
+
+ start_bh_atomic();
+ igmp_start_timer(im, IGMP_Initial_Report_Delay);
+ end_bh_atomic();
+#endif
+}
+
+
+/*
+ * Multicast list managers
+ */
+
+
+/*
+ * A socket has joined a multicast group on device dev.
+ */
+
+void ip_mc_inc_group(struct in_device *in_dev, u32 addr)
+{
+ struct ip_mc_list *i, *im;
+
+ im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL);
+
+ for (i=in_dev->mc_list; i; i=i->next) {
+ if (i->multiaddr == addr) {
+ i->users++;
+ if (im)
+ kfree(im);
+ return;
+ }
+ }
+ if (!im)
+ return;
+ im->users=1;
+ im->interface=in_dev;
+ im->multiaddr=addr;
+#ifdef CONFIG_IP_MULTICAST
+ im->tm_running=0;
+ init_timer(&im->timer);
+ im->timer.data=(unsigned long)im;
+ im->timer.function=&igmp_timer_expire;
+ im->unsolicit_count = IGMP_Unsolicited_Report_Count;
+ im->reporter = 0;
+ im->loaded = 0;
+#endif
+ im->next=in_dev->mc_list;
+ in_dev->mc_list=im;
+ igmp_group_added(im);
+ if (in_dev->dev->flags & IFF_UP)
+ ip_rt_multicast_event(in_dev);
+ return;
+}
+
+/*
+ * A socket has left a multicast group on device dev
+ */
+
+int ip_mc_dec_group(struct in_device *in_dev, u32 addr)
+{
+ struct ip_mc_list *i, **ip;
+
+ for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
+ if (i->multiaddr==addr) {
+ if (--i->users == 0) {
+ *ip = i->next;
+ synchronize_bh();
+
+ igmp_group_dropped(i);
+ if (in_dev->dev->flags & IFF_UP)
+ ip_rt_multicast_event(in_dev);
+ kfree_s(i, sizeof(*i));
+ }
+ return 0;
+ }
+ }
+ return -ESRCH;
+}
+
+/* Device going down */
+
+void ip_mc_down(struct in_device *in_dev)
+{
+ struct ip_mc_list *i;
+
+ for (i=in_dev->mc_list; i; i=i->next)
+ igmp_group_dropped(i);
+
+ ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
+}
+
+/* Device going up */
+
+void ip_mc_up(struct in_device *in_dev)
+{
+ struct ip_mc_list *i;
+
+ ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
+
+ for (i=in_dev->mc_list; i; i=i->next)
+ igmp_group_added(i);
+}
+
+/*
+ * Device is about to be destroyed: clean up.
+ */
+
+void ip_mc_destroy_dev(struct in_device *in_dev)
+{
+ struct ip_mc_list *i;
+
+ while ((i = in_dev->mc_list) != NULL) {
+ in_dev->mc_list = i->next;
+ igmp_group_dropped(i);
+ kfree_s(i, sizeof(*i));
+ }
+}
+
+static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
+{
+ struct rtable *rt;
+ struct device *dev = NULL;
+
+ if (imr->imr_address.s_addr) {
+ dev = ip_dev_find(imr->imr_address.s_addr);
+ if (!dev)
+ return NULL;
+ }
+
+ if (!dev && !ip_route_output(&rt, imr->imr_multiaddr.s_addr, 0, 0, 0)) {
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ }
+ if (dev) {
+ imr->imr_ifindex = dev->ifindex;
+ return dev->ip_ptr;
+ }
+ return NULL;
+}
+
+/*
+ * Join a socket to a group
+ */
+int sysctl_igmp_max_memberships = IP_MAX_MEMBERSHIPS;
+
+int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
+{
+ int err;
+ u32 addr = imr->imr_multiaddr.s_addr;
+ struct ip_mc_socklist *iml, *i;
+ struct in_device *in_dev;
+ int count = 0;
+
+ if (!MULTICAST(addr))
+ return -EINVAL;
+
+ rtnl_shlock();
+
+ if (!imr->imr_ifindex)
+ in_dev = ip_mc_find_dev(imr);
+ else
+ in_dev = inetdev_by_index(imr->imr_ifindex);
+
+ if (!in_dev) {
+ iml = NULL;
+ err = -ENODEV;
+ goto done;
+ }
+
+ iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
+
+ err = -EADDRINUSE;
+ for (i=sk->ip_mc_list; i; i=i->next) {
+ if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) {
+ /* New style additions are reference counted */
+ if (imr->imr_address.s_addr == 0) {
+ i->count++;
+ err = 0;
+ }
+ goto done;
+ }
+ count++;
+ }
+ err = -ENOBUFS;
+ if (iml == NULL || count >= sysctl_igmp_max_memberships)
+ goto done;
+ memcpy(&iml->multi, imr, sizeof(*imr));
+ iml->next = sk->ip_mc_list;
+ iml->count = 1;
+ sk->ip_mc_list = iml;
+ ip_mc_inc_group(in_dev, addr);
+ iml = NULL;
+ err = 0;
+done:
+ rtnl_shunlock();
+ if (iml)
+ sock_kfree_s(sk, iml, sizeof(*iml));
+ return err;
+}
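+
+/* For illustration, the usual path into ip_mc_join_group() is the
+ * IP_ADD_MEMBERSHIP socket option.  A user program would do roughly
+ * (sketch only -- the group address is an arbitrary example):
+ *
+ *	struct ip_mreqn mreq;
+ *
+ *	memset(&mreq, 0, sizeof(mreq));
+ *	mreq.imr_multiaddr.s_addr = inet_addr("224.1.2.3");
+ *	mreq.imr_ifindex = 0;	(0 lets ip_mc_find_dev() pick a device)
+ *	setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
+ */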
+
+/*
+ * Ask a socket to leave a group.
+ */
+
+int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
+{
+ struct ip_mc_socklist *iml, **imlp;
+
+ for (imlp=&sk->ip_mc_list; (iml=*imlp)!=NULL; imlp=&iml->next) {
+ if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr &&
+ iml->multi.imr_address.s_addr==imr->imr_address.s_addr &&
+ (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) {
+ struct in_device *in_dev;
+ if (--iml->count)
+ return 0;
+
+ *imlp = iml->next;
+ synchronize_bh();
+
+ in_dev = inetdev_by_index(iml->multi.imr_ifindex);
+ if (in_dev)
+ ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
+ sock_kfree_s(sk, iml, sizeof(*iml));
+ return 0;
+ }
+ }
+ return -EADDRNOTAVAIL;
+}
+
+/*
+ * A socket is closing.
+ */
+
+void ip_mc_drop_socket(struct sock *sk)
+{
+ struct ip_mc_socklist *iml;
+
+ while ((iml=sk->ip_mc_list) != NULL) {
+ struct in_device *in_dev;
+ sk->ip_mc_list = iml->next;
+ if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL)
+ ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
+ sock_kfree_s(sk, iml, sizeof(*iml));
+ }
+}
+
+
+#ifdef CONFIG_IP_MULTICAST
+
+int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+ off_t pos=0, begin=0;
+ struct ip_mc_list *im;
+ int len=0;
+ struct device *dev;
+
+ len=sprintf(buffer,"Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n");
+
+ for(dev = dev_base; dev; dev = dev->next)
+ {
+ struct in_device *in_dev = dev->ip_ptr;
+ char *querier = "NONE";
+
+ if (in_dev == NULL)
+ continue;
+
+ querier = IGMP_V1_SEEN(in_dev) ? "V1" : "V2";
+
+ len+=sprintf(buffer+len,"%d\t%-10s: %5d %7s\n",
+ dev->ifindex, dev->name, dev->mc_count, querier);
+
+ for (im = in_dev->mc_list; im; im = im->next) {
+ len+=sprintf(buffer+len,
+ "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n",
+ im->multiaddr, im->users,
+ im->tm_running, im->timer.expires-jiffies, im->reporter);
+
+ pos=begin+len;
+ if(pos<offset)
+ {
+ len=0;
+ begin=pos;
+ }
+ if(pos>offset+length)
+ goto done;
+ }
+ }
+done:
+ *start=buffer+(offset-begin);
+ len-=(offset-begin);
+ if(len>length)
+ len=length;
+ if(len<0)
+ len=0;
+ return len;
+}
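+
+/* The offset/begin juggling above follows the classic procfs get_info
+ * convention: the caller wants `length' bytes starting at `offset';
+ * we format entries sequentially, dropping whole chunks that end
+ * before the window, then return *start pointing into the buffer and
+ * a length clipped to the window.
+ */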
+#endif
+
diff --git a/pfinet/linux-src/net/ipv4/ip_forward.c b/pfinet/linux-src/net/ipv4/ip_forward.c
new file mode 100644
index 00000000..08ebbc2f
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_forward.c
@@ -0,0 +1,297 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The IP forwarding functionality.
+ *
+ * Version: $Id: ip_forward.c,v 1.43 1999/03/21 05:22:37 davem Exp $
+ *
+ * Authors: see ip.c
+ *
+ * Fixes:
+ * Many : Split from ip.c , see ip_input.c for
+ * history.
+ * Dave Gregorich : NULL ip_rt_put fix for multicast
+ * routing.
+ * Jos Vos : Add call_out_firewall before sending,
+ * use output device for accounting.
+ * Jos Vos : Call forward firewall after routing
+ * (always use output device).
+ * Mike McLagan : Routing by source
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/firewall.h>
+#include <linux/ip_fw.h>
+#ifdef CONFIG_IP_MASQUERADE
+#include <net/ip_masq.h>
+#endif
+#include <net/checksum.h>
+#include <linux/route.h>
+#include <net/route.h>
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+/*
+ * Check the packet against our socket administration to see
+ * if it is related to a connection on our system.
+ * Needed for transparent proxying.
+ */
+
+int ip_chksock(struct sk_buff *skb)
+{
+ switch (skb->nh.iph->protocol) {
+ case IPPROTO_ICMP:
+ return icmp_chkaddr(skb);
+ case IPPROTO_TCP:
+ return tcp_chkaddr(skb);
+ case IPPROTO_UDP:
+ return udp_chkaddr(skb);
+ default:
+ return 0;
+ }
+}
+#endif
+
+
+int ip_forward(struct sk_buff *skb)
+{
+ struct device *dev2; /* Output device */
+ struct iphdr *iph; /* Our header */
+ struct rtable *rt; /* Route we use */
+ struct ip_options * opt = &(IPCB(skb)->opt);
+ unsigned short mtu;
+#if defined(CONFIG_FIREWALL) || defined(CONFIG_IP_MASQUERADE)
+ int fw_res = 0;
+#endif
+
+ if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
+ return 0;
+
+ if (skb->pkt_type != PACKET_HOST)
+ goto drop;
+
+ /*
+ * According to the RFC, we must first decrease the TTL field. If
+ * that reaches zero, we must reply with an ICMP control message telling
+ * that the packet's lifetime expired.
+ */
+
+ iph = skb->nh.iph;
+ rt = (struct rtable*)skb->dst;
+
+#ifdef CONFIG_CPU_IS_SLOW
+ if (net_cpu_congestion > 1 && !(iph->tos&IPTOS_RELIABILITY) &&
+ IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) {
+ if (((xtime.tv_usec&0xF)<<net_cpu_congestion) > 0x1C)
+ goto drop;
+ }
+#endif
+
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ if (ip_chksock(skb))
+ goto local_pkt;
+#endif
+
+ if (iph->ttl <= 1)
+ goto too_many_hops;
+
+ if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+ goto sr_failed;
+
+ /*
+ * Having picked a route we can now send the frame out
+ * after asking the firewall permission to do so.
+ */
+
+ skb->priority = rt_tos2priority(iph->tos);
+ dev2 = rt->u.dst.dev;
+ mtu = rt->u.dst.pmtu;
+
+#ifdef CONFIG_NET_SECURITY
+ call_fw_firewall(PF_SECURITY, dev2, NULL, &mtu, NULL);
+#endif
+
+ /*
+ * We now generate an ICMP HOST REDIRECT giving the route
+ * we calculated.
+ */
+ if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr)
+ ip_rt_send_redirect(skb);
+
+ /* We are about to mangle packet. Copy it! */
+ if ((skb = skb_cow(skb, dev2->hard_header_len)) == NULL)
+ return -1;
+ iph = skb->nh.iph;
+ opt = &(IPCB(skb)->opt);
+
+ /* Decrease ttl after skb cow done */
+ ip_decrease_ttl(iph);
+
+ /*
+ * If the frame does not fit the outgoing MTU and the sender set the
+ * DF bit, we cannot forward: bounce an ICMP "fragmentation needed"
+ * back instead.
+ */
+
+ if (skb->len > mtu && (ntohs(iph->frag_off) & IP_DF))
+ goto frag_needed;
+
+#ifdef CONFIG_IP_ROUTE_NAT
+ if (rt->rt_flags & RTCF_NAT) {
+ if (ip_do_nat(skb)) {
+ kfree_skb(skb);
+ return -1;
+ }
+ }
+#endif
+
+#ifdef CONFIG_IP_MASQUERADE
+ if(!(IPCB(skb)->flags&IPSKB_MASQUERADED)) {
+ /*
+ * Check that any ICMP packets are not for a
+ * masqueraded connection. If so rewrite them
+ * and skip the firewall checks
+ */
+ if (iph->protocol == IPPROTO_ICMP) {
+ __u32 maddr;
+#ifdef CONFIG_IP_MASQUERADE_ICMP
+ struct icmphdr *icmph = (struct icmphdr *)((char*)iph + (iph->ihl << 2));
+ if ((icmph->type==ICMP_DEST_UNREACH)||
+ (icmph->type==ICMP_SOURCE_QUENCH)||
+ (icmph->type==ICMP_TIME_EXCEEDED))
+ {
+#endif
+ maddr = inet_select_addr(dev2, rt->rt_gateway, RT_SCOPE_UNIVERSE);
+ fw_res = ip_fw_masq_icmp(&skb, maddr);
+ if (fw_res < 0) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ if (fw_res)
+ /* ICMP matched - skip firewall */
+ goto skip_call_fw_firewall;
+#ifdef CONFIG_IP_MASQUERADE_ICMP
+ }
+#endif
+ }
+ if (rt->rt_flags&RTCF_MASQ)
+ goto skip_call_fw_firewall;
+#endif /* CONFIG_IP_MASQUERADE */
+
+#ifdef CONFIG_FIREWALL
+ fw_res=call_fw_firewall(PF_INET, dev2, iph, NULL, &skb);
+ switch (fw_res) {
+ case FW_ACCEPT:
+ case FW_MASQUERADE:
+ break;
+ case FW_REJECT:
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+ /* fall thru */
+ default:
+ kfree_skb(skb);
+ return -1;
+ }
+#endif
+
+#ifdef CONFIG_IP_MASQUERADE
+ }
+
+skip_call_fw_firewall:
+ /*
+ * If this fragment needs masquerading, make it so...
+ * (Don't masquerade de-masqueraded fragments)
+ */
+ if (!(IPCB(skb)->flags&IPSKB_MASQUERADED) &&
+ (fw_res==FW_MASQUERADE || rt->rt_flags&RTCF_MASQ)) {
+ u32 maddr;
+
+#ifdef CONFIG_IP_ROUTE_NAT
+ maddr = (rt->rt_flags&RTCF_MASQ) ? rt->rt_src_map : 0;
+
+ if (maddr == 0)
+#endif
+ maddr = inet_select_addr(dev2, rt->rt_gateway, RT_SCOPE_UNIVERSE);
+
+ if (ip_fw_masquerade(&skb, maddr) < 0) {
+ kfree_skb(skb);
+ return -1;
+ } else {
+ /*
+ * Masquerader may have changed skb
+ */
+ iph = skb->nh.iph;
+ opt = &(IPCB(skb)->opt);
+ }
+ }
+#endif
+
+
+#ifdef CONFIG_FIREWALL
+ if ((fw_res = call_out_firewall(PF_INET, dev2, iph, NULL,&skb)) < FW_ACCEPT) {
+ /* FW_ACCEPT and FW_MASQUERADE are treated equal:
+ masquerading is only supported via forward rules */
+ if (fw_res == FW_REJECT)
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+ kfree_skb(skb);
+ return -1;
+ }
+#endif
+
+ ip_statistics.IpForwDatagrams++;
+
+ if (opt->optlen == 0) {
+#ifdef CONFIG_NET_FASTROUTE
+ if (rt->rt_flags&RTCF_FAST && !netdev_fastroute_obstacles) {
+ unsigned h = ((*(u8*)&rt->key.dst)^(*(u8*)&rt->key.src))&NETDEV_FASTROUTE_HMASK;
+ /* Time to switch to functional programming :-) */
+ dst_release_irqwait(xchg(&skb->dev->fastpath[h], dst_clone(&rt->u.dst)));
+ }
+#endif
+ ip_send(skb);
+ return 0;
+ }
+
+ ip_forward_options(skb);
+ ip_send(skb);
+ return 0;
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+local_pkt:
+ return ip_local_deliver(skb);
+#endif
+
+frag_needed:
+ ip_statistics.IpFragFails++;
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ goto drop;
+
+sr_failed:
+ /*
+ * Strict routing permits no gatewaying
+ */
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
+ goto drop;
+
+too_many_hops:
+ /* Tell the sender its packet died... */
+ icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+drop:
+ kfree_skb(skb);
+ return -1;
+}
diff --git a/pfinet/linux-src/net/ipv4/ip_fragment.c b/pfinet/linux-src/net/ipv4/ip_fragment.c
new file mode 100644
index 00000000..f066e607
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_fragment.c
@@ -0,0 +1,593 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The IP fragmentation functionality.
+ *
+ * Version: $Id: ip_fragment.c,v 1.40 1999/03/20 23:58:34 davem Exp $
+ *
+ * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
+ * Alan Cox <Alan.Cox@linux.org>
+ *
+ * Fixes:
+ * Alan Cox : Split from ip.c , see ip_input.c for history.
+ * David S. Miller : Begin massive cleanup...
+ * Andi Kleen : Add sysctls.
+ * xxxx : Overlapfrag bug.
+ * Ultima : ip_expire() kernel panic.
+ * Bill Hawes : Frag accounting and evictor fixes.
+ * John McDonald : 0 length frag bug.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/inet.h>
+#include <linux/firewall.h>
+#include <linux/ip_fw.h>
+
+/* Fragment cache limits. We will commit 256K at one time. Should we
+ * cross that limit we will prune down to 192K. This should cope with
+ * even the most extreme cases without allowing an attacker to measurably
+ * harm machine performance.
+ */
+int sysctl_ipfrag_high_thresh = 256*1024;
+int sysctl_ipfrag_low_thresh = 192*1024;
+
+int sysctl_ipfrag_time = IP_FRAG_TIME;
+
+/* Describe an IP fragment. */
+struct ipfrag {
+ int offset; /* offset of fragment in IP datagram */
+ int end; /* last byte of data in datagram */
+ int len; /* length of this fragment */
+ struct sk_buff *skb; /* complete received fragment */
+ unsigned char *ptr; /* pointer into real fragment data */
+ struct ipfrag *next; /* linked list pointers */
+ struct ipfrag *prev;
+};
+
+/* Describe an entry in the "incomplete datagrams" queue. */
+struct ipq {
+ struct iphdr *iph; /* pointer to IP header */
+ struct ipq *next; /* linked list pointers */
+ struct ipfrag *fragments; /* linked list of received fragments */
+ int len; /* total length of original datagram */
+ short ihlen; /* length of the IP header */
+ struct timer_list timer; /* when will this queue expire? */
+ struct ipq **pprev;
+ struct device *dev; /* Device - for icmp replies */
+};
+
+#define IPQ_HASHSZ 64
+
+struct ipq *ipq_hash[IPQ_HASHSZ];
+
+#define ipqhashfn(id, saddr, daddr, prot) \
+ ((((id) >> 1) ^ (saddr) ^ (daddr) ^ (prot)) & (IPQ_HASHSZ - 1))
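+
+/* The hash folds together exactly the tuple that identifies a datagram
+ * under reassembly -- id, source, destination and protocol (RFC 791) --
+ * and masks the result, so IPQ_HASHSZ must stay a power of two.
+ */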
+
+atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */
+
+/* Memory Tracking Functions. */
+extern __inline__ void frag_kfree_skb(struct sk_buff *skb)
+{
+ atomic_sub(skb->truesize, &ip_frag_mem);
+ kfree_skb(skb);
+}
+
+extern __inline__ void frag_kfree_s(void *ptr, int len)
+{
+ atomic_sub(len, &ip_frag_mem);
+ kfree(ptr);
+}
+
+extern __inline__ void *frag_kmalloc(int size, int pri)
+{
+ void *vp = kmalloc(size, pri);
+
+ if(!vp)
+ return NULL;
+ atomic_add(size, &ip_frag_mem);
+ return vp;
+}
+
+/* Create a new fragment entry. */
+static struct ipfrag *ip_frag_create(int offset, int end,
+ struct sk_buff *skb, unsigned char *ptr)
+{
+ struct ipfrag *fp;
+
+ fp = (struct ipfrag *) frag_kmalloc(sizeof(struct ipfrag), GFP_ATOMIC);
+ if (fp == NULL)
+ goto out_nomem;
+
+ /* Fill in the structure. */
+ fp->offset = offset;
+ fp->end = end;
+ fp->len = end - offset;
+ fp->skb = skb;
+ fp->ptr = ptr;
+ fp->next = fp->prev = NULL;
+
+ /* Charge for the SKB as well. */
+ atomic_add(skb->truesize, &ip_frag_mem);
+
+ return(fp);
+
+out_nomem:
+ NETDEBUG(printk(KERN_ERR "IP: frag_create: no memory left !\n"));
+ return(NULL);
+}
+
+/* Find the correct entry in the "incomplete datagrams" queue for
+ * this IP datagram, and return the queue entry address if found.
+ */
+static inline struct ipq *ip_find(struct iphdr *iph, struct dst_entry *dst)
+{
+ __u16 id = iph->id;
+ __u32 saddr = iph->saddr;
+ __u32 daddr = iph->daddr;
+ __u8 protocol = iph->protocol;
+ unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
+ struct ipq *qp;
+
+ /* Always, we are in a BH context, so no locking. -DaveM */
+ for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+ if(qp->iph->id == id &&
+ qp->iph->saddr == saddr &&
+ qp->iph->daddr == daddr &&
+ qp->iph->protocol == protocol) {
+ del_timer(&qp->timer);
+ break;
+ }
+ }
+ return qp;
+}
+
+/* Remove an entry from the "incomplete datagrams" queue, either
+ * because we completed, reassembled and processed it, or because
+ * it timed out.
+ *
+ * This is called _only_ from BH contexts, on packet reception
+ * processing and from frag queue expiration timers. -DaveM
+ */
+static void ip_free(struct ipq *qp)
+{
+ struct ipfrag *fp;
+
+ /* Stop the timer for this entry. */
+ del_timer(&qp->timer);
+
+ /* Remove this entry from the "incomplete datagrams" queue. */
+ if(qp->next)
+ qp->next->pprev = qp->pprev;
+ *qp->pprev = qp->next;
+
+ /* Release all fragment data. */
+ fp = qp->fragments;
+ while (fp) {
+ struct ipfrag *xp = fp->next;
+
+ frag_kfree_skb(fp->skb);
+ frag_kfree_s(fp, sizeof(struct ipfrag));
+ fp = xp;
+ }
+
+ /* Release the IP header. */
+ frag_kfree_s(qp->iph, 64 + 8);
+
+ /* Finally, release the queue descriptor itself. */
+ frag_kfree_s(qp, sizeof(struct ipq));
+}
+
+/*
+ * Oops, a fragment queue timed out. Kill it and send an ICMP reply.
+ */
+static void ip_expire(unsigned long arg)
+{
+ struct ipq *qp = (struct ipq *) arg;
+
+ if(!qp->fragments)
+ {
+#ifdef IP_EXPIRE_DEBUG
+ printk("warning: possible ip-expire attack\n");
+#endif
+ goto out;
+ }
+
+ /* Send an ICMP "Fragment Reassembly Timeout" message. */
+ ip_statistics.IpReasmTimeout++;
+ ip_statistics.IpReasmFails++;
+ icmp_send(qp->fragments->skb, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
+
+out:
+ /* Nuke the fragment queue. */
+ ip_free(qp);
+}
+
+/* Memory limiting on fragments. Evictor trashes the oldest
+ * fragment queue until we are back under the low threshold.
+ */
+static void ip_evictor(void)
+{
+ int i, progress;
+
+restart:
+ progress = 0;
+ /* FIXME: Make LRU queue of frag heads. -DaveM */
+ for (i = 0; i < IPQ_HASHSZ; i++) {
+ struct ipq *qp;
+ if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh)
+ return;
+ /* We are in a BH context, so these queue
+ * accesses are safe. -DaveM
+ */
+ qp = ipq_hash[i];
+ if (qp) {
+ /* find the oldest queue for this hash bucket */
+ while (qp->next)
+ qp = qp->next;
+ ip_free(qp);
+ progress = 1;
+ }
+ }
+ if (progress)
+ goto restart;
+ panic("ip_evictor: memcount");
+}
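+
+/* Reaching the panic above means a full pass over the hash freed no
+ * queue while ip_frag_mem still read above the low threshold: the
+ * byte accounting in frag_kmalloc()/frag_kfree_s() must have been
+ * corrupted, and there is no way to recover the count.
+ */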
+
+/* Add an entry to the 'ipq' queue for a newly received IP datagram.
+ * We will (hopefully :-) receive all other fragments of this datagram
+ * in time, so we just create a queue for this datagram, in which we
+ * will insert the received fragments at their respective positions.
+ */
+static struct ipq *ip_create(struct sk_buff *skb, struct iphdr *iph)
+{
+ struct ipq *qp;
+ unsigned int hash;
+ int ihlen;
+
+ qp = (struct ipq *) frag_kmalloc(sizeof(struct ipq), GFP_ATOMIC);
+ if (qp == NULL)
+ goto out_nomem;
+
+ /* Allocate memory for the IP header (plus 8 octets for ICMP). */
+ ihlen = iph->ihl * 4;
+
+ qp->iph = (struct iphdr *) frag_kmalloc(64 + 8, GFP_ATOMIC);
+ if (qp->iph == NULL)
+ goto out_free;
+
+ memcpy(qp->iph, iph, ihlen + 8);
+ qp->len = 0;
+ qp->ihlen = ihlen;
+ qp->fragments = NULL;
+ qp->dev = skb->dev;
+
+ /* Initialize a timer for this entry. */
+ init_timer(&qp->timer);
+ qp->timer.expires = 0; /* (to be set later) */
+ qp->timer.data = (unsigned long) qp; /* pointer to queue */
+ qp->timer.function = ip_expire; /* expire function */
+
+ /* Add this entry to the queue. */
+ hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
+
+ /* We are in a BH context, no locking necessary. -DaveM */
+ if((qp->next = ipq_hash[hash]) != NULL)
+ qp->next->pprev = &qp->next;
+ ipq_hash[hash] = qp;
+ qp->pprev = &ipq_hash[hash];
+
+ return qp;
+
+out_free:
+ frag_kfree_s(qp, sizeof(struct ipq));
+out_nomem:
+ NETDEBUG(printk(KERN_ERR "IP: create: no memory left !\n"));
+ return(NULL);
+}
+
+/* See if a fragment queue is complete. */
+static int ip_done(struct ipq *qp)
+{
+ struct ipfrag *fp;
+ int offset;
+
+ /* Only possible if we received the final fragment. */
+ if (qp->len == 0)
+ return 0;
+
+ /* Check all fragment offsets to see if they connect. */
+ fp = qp->fragments;
+ offset = 0;
+ while (fp) {
+ if (fp->offset > offset)
+ return(0); /* fragment(s) missing */
+ offset = fp->end;
+ fp = fp->next;
+ }
+
+ /* All fragments are present. */
+ return 1;
+}
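+
+/* Worked example: a 4200-byte payload split into [0,1400), [1400,2800)
+ * and [2800,4200).  qp->len becomes 4200 once the final (MF=0) piece
+ * arrives.  With all three present the walk sees each offset meet the
+ * previous end and reports completion; with the middle piece missing,
+ * fp->offset (2800) exceeds the running offset (1400) and we return 0.
+ */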
+
+/* Build a new IP datagram from all its fragments.
+ *
+ * FIXME: We copy here because we lack an effective way of handling lists
+ * of bits on input. Until the new skb data handling is in I'm not going
+ * to touch this with a bargepole.
+ */
+static struct sk_buff *ip_glue(struct ipq *qp)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct ipfrag *fp;
+ unsigned char *ptr;
+ int count, len;
+
+ /* Allocate a new buffer for the datagram. */
+ len = qp->ihlen + qp->len;
+
+ if(len > 65535)
+ goto out_oversize;
+
+ skb = dev_alloc_skb(len);
+ if (!skb)
+ goto out_nomem;
+
+ /* Fill in the basic details. */
+ skb->mac.raw = ptr = skb->data;
+ skb->nh.iph = iph = (struct iphdr *) skb_put(skb, len);
+
+ /* Copy the original IP headers into the new buffer. */
+ memcpy(ptr, qp->iph, qp->ihlen);
+ ptr += qp->ihlen;
+
+ /* Copy the data portions of all fragments into the new buffer. */
+ fp = qp->fragments;
+ count = qp->ihlen;
+ while(fp) {
+ if ((fp->len <= 0) || ((count + fp->len) > skb->len))
+ goto out_invalid;
+ memcpy((ptr + fp->offset), fp->ptr, fp->len);
+ if (count == qp->ihlen) {
+ skb->dst = dst_clone(fp->skb->dst);
+ skb->dev = fp->skb->dev;
+ }
+ count += fp->len;
+ fp = fp->next;
+ }
+
+ skb->pkt_type = qp->fragments->skb->pkt_type;
+ skb->protocol = qp->fragments->skb->protocol;
+ /*
+ * Clearly bogus, because security markings of the individual
+ * fragments should have been checked for consistency before
+ * gluing, and intermediate coalescing of fragments may have
+ * taken place in ip_defrag() before ip_glue() ever got called.
+ * If we're not going to do the consistency checking, we might
+ * as well take the value associated with the first fragment.
+ * --rct
+ */
+ skb->security = qp->fragments->skb->security;
+
+ /* Done with all fragments. Fixup the new IP header. */
+ iph = skb->nh.iph;
+ iph->frag_off = 0;
+ iph->tot_len = htons(count);
+ ip_statistics.IpReasmOKs++;
+ return skb;
+
+out_invalid:
+ NETDEBUG(printk(KERN_ERR
+ "Invalid fragment list: Fragment over size.\n"));
+ kfree_skb(skb);
+ goto out_fail;
+out_nomem:
+ NETDEBUG(printk(KERN_ERR
+ "IP: queue_glue: no memory for gluing queue %p\n",
+ qp));
+ goto out_fail;
+out_oversize:
+ if (net_ratelimit())
+ printk(KERN_INFO
+ "Oversized IP packet from %d.%d.%d.%d.\n",
+ NIPQUAD(qp->iph->saddr));
+out_fail:
+ ip_statistics.IpReasmFails++;
+ return NULL;
+}
+
+/* Process an incoming IP datagram fragment. */
+struct sk_buff *ip_defrag(struct sk_buff *skb)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct ipfrag *prev, *next, *tmp, *tfp;
+ struct ipq *qp;
+ unsigned char *ptr;
+ int flags, offset;
+ int i, ihl, end;
+
+ ip_statistics.IpReasmReqds++;
+
+ /* Start by cleaning up the memory. */
+ if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
+ ip_evictor();
+
+ /*
+ * Look for the entry for this IP datagram in the
+ * "incomplete datagrams" queue. If found, the
+ * timer is removed.
+ */
+ qp = ip_find(iph, skb->dst);
+
+ /* Is this a non-fragmented datagram? */
+ offset = ntohs(iph->frag_off);
+ flags = offset & ~IP_OFFSET;
+ offset &= IP_OFFSET;
+
+ offset <<= 3; /* offset is in 8-byte chunks */
+ ihl = iph->ihl * 4;
+
+ /*
+ * Check whether to create a fresh queue entry. If the
+ * queue already exists, its timer will be restarted as
+ * long as we continue to receive fragments.
+ */
+ if (qp) {
+ /* ANK. If the first fragment is received,
+ * we should remember the correct IP header (with options)
+ */
+ if (offset == 0) {
+ /* Fragmented frame replaced by unfragmented copy? */
+ if ((flags & IP_MF) == 0)
+ goto out_freequeue;
+ qp->ihlen = ihl;
+ memcpy(qp->iph, iph, (ihl + 8));
+ }
+ } else {
+ /* Fragmented frame replaced by unfragmented copy? */
+ if ((offset == 0) && ((flags & IP_MF) == 0))
+ goto out_skb;
+
+ /* If we failed to create it, then discard the frame. */
+ qp = ip_create(skb, iph);
+ if (!qp)
+ goto out_freeskb;
+ }
+
+ /* Attempt to construct an oversize packet. */
+ if((ntohs(iph->tot_len) + ((int) offset)) > 65535)
+ goto out_oversize;
+
+ /* Determine the position of this fragment. */
+ end = offset + ntohs(iph->tot_len) - ihl;
+
+ /* Is this the final fragment? */
+ if ((flags & IP_MF) == 0)
+ qp->len = end;
+
+ /* Find out which fragments are in front and at the back of us
+ * in the chain of fragments so far. We must know where to put
+ * this fragment, right?
+ */
+ prev = NULL;
+ for(next = qp->fragments; next != NULL; next = next->next) {
+ if (next->offset >= offset)
+ break; /* bingo! */
+ prev = next;
+ }
+
+ /* Point into the IP datagram 'data' part. */
+ ptr = skb->data + ihl;
+
+ /* We found where to put this one. Check for overlap with
+ * preceding fragment, and, if needed, align things so that
+ * any overlaps are eliminated.
+ */
+ if ((prev != NULL) && (offset < prev->end)) {
+ i = prev->end - offset;
+ offset += i; /* ptr into datagram */
+ ptr += i; /* ptr into fragment data */
+ }
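+ /* Example: if the previous fragment covers [0,1400) and this one
+ * claims [1200,2600), i is 200: the start moves up to 1400 and
+ * ptr skips the 200 duplicate bytes, keeping the data we already
+ * hold.
+ */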
+
+ /* Look for overlap with succeeding segments.
+ * If we can merge fragments, do it.
+ */
+ for (tmp = next; tmp != NULL; tmp = tfp) {
+ tfp = tmp->next;
+ if (tmp->offset >= end)
+ break; /* no overlaps at all */
+
+ i = end - tmp->offset; /* overlap is 'i' bytes */
+ tmp->len -= i; /* so reduce size of */
+ tmp->offset += i; /* next fragment */
+ tmp->ptr += i;
+
+ /* If we get a frag size of <= 0, remove it and the packet
+ * that it goes with.
+ */
+ if (tmp->len <= 0) {
+ if (tmp->prev != NULL)
+ tmp->prev->next = tmp->next;
+ else
+ qp->fragments = tmp->next;
+
+ if (tmp->next != NULL)
+ tmp->next->prev = tmp->prev;
+
+ /* We have killed the original next frame. */
+ next = tfp;
+
+ frag_kfree_skb(tmp->skb);
+ frag_kfree_s(tmp, sizeof(struct ipfrag));
+ }
+ }
+
+ /*
+ * Create a fragment to hold this skb.
+ * No memory to save the fragment? throw the lot ...
+ */
+ tfp = ip_frag_create(offset, end, skb, ptr);
+ if (!tfp)
+ goto out_freeskb;
+
+ /* Insert this fragment in the chain of fragments. */
+ tfp->prev = prev;
+ tfp->next = next;
+ if (prev != NULL)
+ prev->next = tfp;
+ else
+ qp->fragments = tfp;
+
+ if (next != NULL)
+ next->prev = tfp;
+
+ /* OK, so we inserted this new fragment into the chain.
+ * Check if we now have a full IP datagram which we can
+ * bump up to the IP layer...
+ */
+ if (ip_done(qp)) {
+ /* Glue together the fragments. */
+ skb = ip_glue(qp);
+ /* Free the queue entry. */
+out_freequeue:
+ ip_free(qp);
+out_skb:
+ return skb;
+ }
+
+ /*
+ * The queue is still active ... reset its timer.
+ */
+out_timer:
+ mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time); /* ~ 30 seconds */
+out:
+ return NULL;
+
+ /*
+ * Error exits ... we need to reset the timer if there's a queue.
+ */
+out_oversize:
+ if (net_ratelimit())
+ printk(KERN_INFO "Oversized packet received from %d.%d.%d.%d\n",
+ NIPQUAD(iph->saddr));
+ /* the skb isn't held by any fragment queue, so fall through to free it */
+out_freeskb:
+ kfree_skb(skb);
+ ip_statistics.IpReasmFails++;
+ if (qp)
+ goto out_timer;
+ goto out;
+}
diff --git a/pfinet/linux-src/net/ipv4/ip_fw.c b/pfinet/linux-src/net/ipv4/ip_fw.c
new file mode 100644
index 00000000..99a91d53
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_fw.c
@@ -0,0 +1,1759 @@
+/*
+ * This code is heavily based on the old ip_fw.c code; see below for
+ * copyrights and attributions of the old code. This code is basically GPL.
+ *
+ * 15-Aug-1997: Major changes to allow graphs for firewall rules.
+ * Paul Russell <Paul.Russell@rustcorp.com.au> and
+ * Michael Neuling <Michael.Neuling@rustcorp.com.au>
+ * 24-Aug-1997: Generalised protocol handling (not just TCP/UDP/ICMP).
+ * Added explicit RETURN from chains.
+ * Removed TOS mangling (done in ipchains 1.0.1).
+ * Fixed read & reset bug by reworking proc handling.
+ * Paul Russell <Paul.Russell@rustcorp.com.au>
+ * 28-Sep-1997: Added packet marking for net sched code.
+ * Removed fw_via comparisons: all done on device name now,
+ * similar to changes in ip_fw.c in DaveM's CVS970924 tree.
+ * Paul Russell <Paul.Russell@rustcorp.com.au>
+ * 2-Nov-1997: Moved types across to __u16, etc.
+ * Added inverse flags.
+ * Fixed fragment bug (in args to port_match).
+ * Changed mark to only one flag (MARKABS).
+ * 21-Nov-1997: Added ability to test ICMP code.
+ * 19-Jan-1998: Added wildcard interfaces.
+ * 6-Feb-1998: Merged 2.0 and 2.1 versions.
+ * Initialised ip_masq for 2.0.x version.
+ * Added explicit NETLINK option for 2.1.x version.
+ * Added packet and byte counters for policy matches.
+ * 26-Feb-1998: Fixed race conditions, added SMP support.
+ * 18-Mar-1998: Fix SMP, fix race condition fix.
+ * 1-May-1998: Remove caching of device pointer.
+ * 12-May-1998: Allow tiny fragment case for TCP/UDP.
+ * 15-May-1998: Treat short packets as fragments, don't just block.
+ * 3-Jan-1999: Fixed serious procfs security hole -- users should never
+ * be allowed to view the chains!
+ * Marc Santoro <ultima@snicker.emoti.com>
+ * 29-Jan-1999: Locally generated bogus IPs dealt with, rather than crash
+ * during dump_packet. --RR.
+ * 19-May-1999: Star Wars: The Phantom Menace opened. Rule num
+ * printed in log (modified from Michael Hasenstein's patch).
+ * Added SYN in log message. --RR
+ * 23-Jul-1999: Fixed small fragment security exposure opened on 15-May-1998.
+ * John McDonald <jm@dataprotect.com>
+ * Thomas Lopatic <tl@dataprotect.com>
+ */
+
+/*
+ *
+ * The original Linux port was done by Alan Cox, with changes/fixes from
+ * Pauline Middlelink, Jos Vos, Thomas Quinot, Wouter Gadeyne, Juan
+ * Jose Ciarlante, Bernd Eckenfels, Keith Owens and others.
+ *
+ * Copyright from the original FreeBSD version follows:
+ *
+ * Copyright (c) 1993 Daniel Boulet
+ * Copyright (c) 1994 Ugen J.S.Antsilevich
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind. */
+
+
+#include <linux/config.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/icmp.h>
+#include <linux/udp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+#include <linux/firewall.h>
+#include <linux/ip_fw.h>
+
+#ifdef CONFIG_IP_MASQUERADE
+#include <net/ip_masq.h>
+#endif
+
+#include <net/checksum.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+
+/* Understanding locking in this code: (thanks to Alan Cox for using
+ * little words to explain this to me). -- PR
+ *
+ * In UP, there can be two packets traversing the chains:
+ * 1) A packet from the current userspace context
+ * 2) A packet off the bh handlers (timer or net).
+ *
+ * For SMP (kernel v2.1+), multiply this by # CPUs.
+ *
+ * [Note that this is not correct for 2.2 - because the socket code always
+ * uses lock_kernel() to serialize, and bottom halves (timers and net_bhs)
+ * only run on one CPU at a time. This will probably change for 2.3.
+ * It is still good to use spinlocks because that avoids the global cli()
+ * for updating the tables, which is rather costly in SMP kernels -AK]
+ *
+ * This means counters and backchains can get corrupted if no precautions
+ * are taken.
+ *
+ * To actually alter a chain on UP, we need only do a cli(), as this will
+ * stop a bh handler firing, as we are in the current userspace context
+ * (coming from a setsockopt()).
+ *
+ * On SMP, we need a write_lock_irqsave(), which is a simple cli() in
+ * UP.
+ *
+ * For backchains and counters, we use an array, indexed by
+ * [cpu_number_map[smp_processor_id()]*2 + !in_interrupt()]; the array is of
+ * size [smp_num_cpus*2]. For v2.0, smp_num_cpus is effectively 1. So,
+ * confident of uniqueness, we modify counters even though we only
+ * have a read lock (to read the counters, you need a write lock,
+ * though). */
+
+/* Why I didn't use straight locking... -- PR
+ *
+ * The backchains can be separated out of the ip_chains structure, and
+ * allocated as needed inside ip_fw_check().
+ *
+ * The counters, however, can't. Trying to lock these means blocking
+ * interrupts every time we want to access them. This would suck HARD
+ * performance-wise. Not locking them leads to possible corruption,
+ * made worse on 32-bit machines (counters are 64-bit). */
+
+/*#define DEBUG_IP_FIREWALL*/
+/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
+/*#define DEBUG_IP_FIREWALL_USER*/
+/*#define DEBUG_IP_FIREWALL_LOCKING*/
+
+#ifdef CONFIG_IP_FIREWALL_NETLINK
+static struct sock *ipfwsk;
+#endif
+
+#ifdef __SMP__
+#define SLOT_NUMBER() (cpu_number_map[smp_processor_id()]*2 + !in_interrupt())
+#else
+#define SLOT_NUMBER() (!in_interrupt())
+#endif
+#define NUM_SLOTS (smp_num_cpus*2)
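+
+/* SLOT_NUMBER() picks the per-CPU, per-context counter slot described
+ * above: on UP it is just !in_interrupt(), i.e. slot 0 in bh/interrupt
+ * context and slot 1 in syscall context; on SMP every CPU gets its own
+ * pair.  Distinct slots are what make it safe to bump rule counters
+ * while holding only a read lock.
+ */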
+
+#define SIZEOF_STRUCT_IP_CHAIN (sizeof(struct ip_chain) \
+ + NUM_SLOTS*sizeof(struct ip_reent))
+#define SIZEOF_STRUCT_IP_FW_KERNEL (sizeof(struct ip_fwkernel) \
+ + NUM_SLOTS*sizeof(struct ip_counters))
+
+#ifdef DEBUG_IP_FIREWALL_LOCKING
+static unsigned int fwc_rlocks, fwc_wlocks;
+#define FWC_DEBUG_LOCK(d) \
+do { \
+ FWC_DONT_HAVE_LOCK(d); \
+ d |= (1 << SLOT_NUMBER()); \
+} while (0)
+
+#define FWC_DEBUG_UNLOCK(d) \
+do { \
+ FWC_HAVE_LOCK(d); \
+ d &= ~(1 << SLOT_NUMBER()); \
+} while (0)
+
+#define FWC_DONT_HAVE_LOCK(d) \
+do { \
+ if ((d) & (1 << SLOT_NUMBER())) \
+ printk("%s:%i: Got lock on %i already!\n", \
+ __FILE__, __LINE__, SLOT_NUMBER()); \
+} while(0)
+
+#define FWC_HAVE_LOCK(d) \
+do { \
+ if (!((d) & (1 << SLOT_NUMBER()))) \
+ printk("%s:%i:No lock on %i!\n", \
+ __FILE__, __LINE__, SLOT_NUMBER()); \
+} while (0)
+
+#else
+#define FWC_DEBUG_LOCK(d) do { } while(0)
+#define FWC_DEBUG_UNLOCK(d) do { } while(0)
+#define FWC_DONT_HAVE_LOCK(d) do { } while(0)
+#define FWC_HAVE_LOCK(d) do { } while(0)
+#endif /*DEBUG_IP_FIREWALL_LOCKING*/
+
+#define FWC_READ_LOCK(l) do { FWC_DEBUG_LOCK(fwc_rlocks); read_lock(l); } while (0)
+#define FWC_WRITE_LOCK(l) do { FWC_DEBUG_LOCK(fwc_wlocks); write_lock(l); } while (0)
+#define FWC_READ_LOCK_IRQ(l,f) do { FWC_DEBUG_LOCK(fwc_rlocks); read_lock_irqsave(l,f); } while (0)
+#define FWC_WRITE_LOCK_IRQ(l,f) do { FWC_DEBUG_LOCK(fwc_wlocks); write_lock_irqsave(l,f); } while (0)
+#define FWC_READ_UNLOCK(l) do { FWC_DEBUG_UNLOCK(fwc_rlocks); read_unlock(l); } while (0)
+#define FWC_WRITE_UNLOCK(l) do { FWC_DEBUG_UNLOCK(fwc_wlocks); write_unlock(l); } while (0)
+#define FWC_READ_UNLOCK_IRQ(l,f) do { FWC_DEBUG_UNLOCK(fwc_rlocks); read_unlock_irqrestore(l,f); } while (0)
+#define FWC_WRITE_UNLOCK_IRQ(l,f) do { FWC_DEBUG_UNLOCK(fwc_wlocks); write_unlock_irqrestore(l,f); } while (0)
+
+struct ip_chain;
+
+struct ip_counters
+{
+ __u64 pcnt, bcnt; /* Packet and byte counters */
+};
+
+struct ip_fwkernel
+{
+ struct ip_fw ipfw;
+ struct ip_fwkernel *next; /* where to go next if current
+ * rule doesn't match */
+ struct ip_chain *branch; /* which branch to jump to if
+ * current rule matches */
+ int simplebranch; /* Use this if branch == NULL */
+ struct ip_counters counters[0]; /* Actually several of these */
+};
+
+struct ip_reent
+{
+ struct ip_chain *prevchain; /* Pointer to referencing chain */
+ struct ip_fwkernel *prevrule; /* Pointer to referencing rule */
+ struct ip_counters counters;
+};
+
+struct ip_chain
+{
+ ip_chainlabel label; /* Defines the label for each block */
+ struct ip_chain *next; /* Pointer to next block */
+ struct ip_fwkernel *chain; /* Pointer to first rule in block */
+ __u32 refcount; /* Number of references to block */
+ int policy; /* Default rule for chain. Only *
+ * used in built in chains */
+ struct ip_reent reent[0]; /* Actually several of these */
+};
+
+/*
+ * Implement IP packet firewall
+ */
+
+#ifdef DEBUG_IP_FIREWALL
+#define dprintf(format, args...) printk(format , ## args)
+#else
+#define dprintf(format, args...)
+#endif
+
+#ifdef DEBUG_IP_FIREWALL_USER
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+/* Lock around ip_fw_chains linked list structure */
+rwlock_t ip_fw_lock = RW_LOCK_UNLOCKED;
+
+/* Head of linked list of fw rules */
+static struct ip_chain *ip_fw_chains;
+
+#define IP_FW_INPUT_CHAIN ip_fw_chains
+#define IP_FW_FORWARD_CHAIN (ip_fw_chains->next)
+#define IP_FW_OUTPUT_CHAIN (ip_fw_chains->next->next)
+
+/* Returns 1 if the port is matched by the range, 0 otherwise */
+extern inline int port_match(__u16 min, __u16 max, __u16 port,
+ int frag, int invert)
+{
+ if (frag) /* Fragments fail ANY port test. */
+ return (min == 0 && max == 0xFFFF);
+ else return (port >= min && port <= max) ^ invert;
+}
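+
+/* Examples: a range of 0:65535 matches any port and is also the only
+ * range a non-first fragment can match, since its ports are unknown.
+ * With invert set, a range of 80:80 matches everything except port 80
+ * -- but note that inversion is ignored for fragments, which see only
+ * the wildcard test.
+ */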
+
+/* Returns whether the packet matches the rule or not. */
+static int ip_rule_match(struct ip_fwkernel *f,
+ const char *ifname,
+ struct iphdr *ip,
+ char tcpsyn,
+ __u16 src_port, __u16 dst_port,
+ char isfrag)
+{
+#define FWINV(bool,invflg) ((bool) ^ !!(f->ipfw.fw_invflg & invflg))
+ /*
+ * This is a bit simpler as we don't have to walk
+ * an interface chain as you do in BSD - same logic
+ * however.
+ */
+
+ if (FWINV((ip->saddr&f->ipfw.fw_smsk.s_addr) != f->ipfw.fw_src.s_addr,
+ IP_FW_INV_SRCIP)
+ || FWINV((ip->daddr&f->ipfw.fw_dmsk.s_addr)!=f->ipfw.fw_dst.s_addr,
+ IP_FW_INV_DSTIP)) {
+ dprintf("Source or dest mismatch.\n");
+
+ dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr,
+ f->ipfw.fw_smsk.s_addr, f->ipfw.fw_src.s_addr,
+ f->ipfw.fw_invflg & IP_FW_INV_SRCIP ? " (INV)" : "");
+ dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr,
+ f->ipfw.fw_dmsk.s_addr, f->ipfw.fw_dst.s_addr,
+ f->ipfw.fw_invflg & IP_FW_INV_DSTIP ? " (INV)" : "");
+ return 0;
+ }
+
+ /*
+ * Look for a VIA device match
+ */
+ if (f->ipfw.fw_flg & IP_FW_F_WILDIF) {
+ if (FWINV(strncmp(ifname, f->ipfw.fw_vianame,
+ strlen(f->ipfw.fw_vianame)) != 0,
+ IP_FW_INV_VIA)) {
+ dprintf("Wildcard interface mismatch.%s\n",
+ f->ipfw.fw_invflg & IP_FW_INV_VIA ? " (INV)" : "");
+ return 0; /* Mismatch */
+ }
+ }
+ else if (FWINV(strcmp(ifname, f->ipfw.fw_vianame) != 0,
+ IP_FW_INV_VIA)) {
+ dprintf("Interface name does not match.%s\n",
+ f->ipfw.fw_invflg & IP_FW_INV_VIA
+ ? " (INV)" : "");
+ return 0; /* Mismatch */
+ }
+
+ /*
+ * Ok the chain addresses match.
+ */
+
+ /* If we have a fragment rule but the packet is not a fragment
+ * then we return zero */
+ if (FWINV((f->ipfw.fw_flg&IP_FW_F_FRAG) && !isfrag, IP_FW_INV_FRAG)) {
+ dprintf("Fragment rule but not fragment.%s\n",
+ f->ipfw.fw_invflg & IP_FW_INV_FRAG ? " (INV)" : "");
+ return 0;
+ }
+
+ /* Fragment NEVER passes a SYN test, even an inverted one. */
+ if (FWINV((f->ipfw.fw_flg&IP_FW_F_TCPSYN) && !tcpsyn, IP_FW_INV_SYN)
+ || (isfrag && (f->ipfw.fw_flg&IP_FW_F_TCPSYN))) {
+ dprintf("Rule requires SYN and packet has no SYN.%s\n",
+ f->ipfw.fw_invflg & IP_FW_INV_SYN ? " (INV)" : "");
+ return 0;
+ }
+
+ if (f->ipfw.fw_proto) {
+ /*
+ * Specific firewall - packet's protocol
+ * must match firewall's.
+ */
+
+ if (FWINV(ip->protocol!=f->ipfw.fw_proto, IP_FW_INV_PROTO)) {
+ dprintf("Packet protocol %hi does not match %hi.%s\n",
+ ip->protocol, f->ipfw.fw_proto,
+ f->ipfw.fw_invflg&IP_FW_INV_PROTO ? " (INV)":"");
+ return 0;
+ }
+
+ /* For non TCP/UDP/ICMP, port range is max anyway. */
+ if (!port_match(f->ipfw.fw_spts[0],
+ f->ipfw.fw_spts[1],
+ src_port, isfrag,
+ !!(f->ipfw.fw_invflg&IP_FW_INV_SRCPT))
+ || !port_match(f->ipfw.fw_dpts[0],
+ f->ipfw.fw_dpts[1],
+ dst_port, isfrag,
+ !!(f->ipfw.fw_invflg
+ &IP_FW_INV_DSTPT))) {
+ dprintf("Port match failed.\n");
+ return 0;
+ }
+ }
+
+ dprintf("Match succeeded.\n");
+ return 1;
+}
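+
+/* The FWINV() macro above XORs each test with its inversion flag, so a
+ * single comparison implements both "matches X" and "matches anything
+ * but X": FWINV(cond, IP_FW_INV_SRCIP) evaluates to cond with the flag
+ * clear and to !cond with it set.
+ */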
+
+static const char *branchname(struct ip_chain *branch,int simplebranch)
+{
+ if (branch)
+ return branch->label;
+ switch (simplebranch)
+ {
+ case FW_BLOCK: return IP_FW_LABEL_BLOCK;
+ case FW_ACCEPT: return IP_FW_LABEL_ACCEPT;
+ case FW_REJECT: return IP_FW_LABEL_REJECT;
+ case FW_REDIRECT: return IP_FW_LABEL_REDIRECT;
+ case FW_MASQUERADE: return IP_FW_LABEL_MASQUERADE;
+ case FW_SKIP: return "-";
+ case FW_SKIP+1: return IP_FW_LABEL_RETURN;
+ default:
+ return "UNKNOWN";
+ }
+}
+
+/*
+ * VERY ugly piece of code which actually
+ * makes kernel printf for matching packets...
+ */
+static void dump_packet(const struct iphdr *ip,
+ const char *ifname,
+ struct ip_fwkernel *f,
+ const ip_chainlabel chainlabel,
+ __u16 src_port,
+ __u16 dst_port,
+ unsigned int count,
+ int syn)
+{
+ __u32 *opt = (__u32 *) (ip + 1);
+ int opti;
+
+ if (f)
+ {
+ printk(KERN_INFO "Packet log: %s ",chainlabel);
+
+ printk("%s ",branchname(f->branch,f->simplebranch));
+ if (f->simplebranch==FW_REDIRECT)
+ printk("%d ",f->ipfw.fw_redirpt);
+ }
+
+ printk("%s PROTO=%d %ld.%ld.%ld.%ld:%hu %ld.%ld.%ld.%ld:%hu"
+ " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu",
+ ifname, ip->protocol,
+ (ntohl(ip->saddr)>>24)&0xFF,
+ (ntohl(ip->saddr)>>16)&0xFF,
+ (ntohl(ip->saddr)>>8)&0xFF,
+ (ntohl(ip->saddr))&0xFF,
+ src_port,
+ (ntohl(ip->daddr)>>24)&0xFF,
+ (ntohl(ip->daddr)>>16)&0xFF,
+ (ntohl(ip->daddr)>>8)&0xFF,
+ (ntohl(ip->daddr))&0xFF,
+ dst_port,
+ ntohs(ip->tot_len), ip->tos, ntohs(ip->id),
+ ntohs(ip->frag_off), ip->ttl);
+
+ for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++)
+ printk(" O=0x%8.8X", *opt++);
+ printk(" %s(#%d)\n", syn ? "SYN " : /* "PENANCE" */ "", count);
+}
+
+/* function for checking chain labels for user space. */
+static int check_label(ip_chainlabel label)
+{
+ unsigned int i;
+ /* strlen must be < IP_FW_MAX_LABEL_LENGTH. */
+ for (i = 0; i < IP_FW_MAX_LABEL_LENGTH + 1; i++)
+ if (label[i] == '\0') return 1;
+
+ return 0;
+}
+
+/* This function returns a pointer to the first chain with a label
+ * that matches the one given. */
+static struct ip_chain *find_label(ip_chainlabel label)
+{
+ struct ip_chain *tmp;
+ FWC_HAVE_LOCK(fwc_rlocks | fwc_wlocks);
+ for (tmp = ip_fw_chains; tmp; tmp = tmp->next)
+ if (strcmp(tmp->label,label) == 0)
+ break;
+ return tmp;
+}
+
+/* This function returns a boolean; when it returns true it has also
+ set *answer to one of the FW_* values. */
+static int find_special(ip_chainlabel label, int *answer)
+{
+ if (label[0] == '\0') {
+ *answer = FW_SKIP; /* => pass-through rule */
+ return 1;
+ } else if (strcmp(label,IP_FW_LABEL_ACCEPT) == 0) {
+ *answer = FW_ACCEPT;
+ return 1;
+ } else if (strcmp(label,IP_FW_LABEL_BLOCK) == 0) {
+ *answer = FW_BLOCK;
+ return 1;
+ } else if (strcmp(label,IP_FW_LABEL_REJECT) == 0) {
+ *answer = FW_REJECT;
+ return 1;
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ } else if (strcmp(label,IP_FW_LABEL_REDIRECT) == 0) {
+ *answer = FW_REDIRECT;
+ return 1;
+#endif
+#ifdef CONFIG_IP_MASQUERADE
+ } else if (strcmp(label,IP_FW_LABEL_MASQUERADE) == 0) {
+ *answer = FW_MASQUERADE;
+ return 1;
+#endif
+ } else if (strcmp(label, IP_FW_LABEL_RETURN) == 0) {
+ *answer = FW_SKIP+1;
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+/* This function cleans up the prevchain and prevrule. If the verbose
+ * flag is set then the names of the chains will be printed as it
+ * cleans up. */
+static void cleanup(struct ip_chain *chain,
+ const int verbose,
+ unsigned int slot)
+{
+ struct ip_chain *tmpchain = chain->reent[slot].prevchain;
+ if (verbose)
+ printk(KERN_ERR "Chain backtrace: ");
+ while (tmpchain) {
+ if (verbose)
+ printk("%s<-",chain->label);
+ chain->reent[slot].prevchain = NULL;
+ chain = tmpchain;
+ tmpchain = chain->reent[slot].prevchain;
+ }
+ if (verbose)
+ printk("%s\n",chain->label);
+}
+
+static inline int
+ip_fw_domatch(struct ip_fwkernel *f,
+ struct iphdr *ip,
+ const char *rif,
+ const ip_chainlabel label,
+ struct sk_buff *skb,
+ unsigned int slot,
+ __u16 src_port, __u16 dst_port,
+ unsigned int count,
+ int tcpsyn)
+{
+ f->counters[slot].bcnt+=ntohs(ip->tot_len);
+ f->counters[slot].pcnt++;
+ if (f->ipfw.fw_flg & IP_FW_F_PRN) {
+ dump_packet(ip,rif,f,label,src_port,dst_port,count,tcpsyn);
+ }
+ ip->tos = (ip->tos & f->ipfw.fw_tosand) ^ f->ipfw.fw_tosxor;
+
+/* This functionality is useless in stock 2.0.x series, but we don't
+ * discard the mark thing altogether, to avoid breaking ipchains (and,
+ * more importantly, the ipfwadm wrapper) --PR */
+ if (f->ipfw.fw_flg & IP_FW_F_MARKABS)
+ skb->fwmark = f->ipfw.fw_mark;
+ else
+ skb->fwmark+=f->ipfw.fw_mark;
+#ifdef CONFIG_IP_FIREWALL_NETLINK
+ if (f->ipfw.fw_flg & IP_FW_F_NETLINK) {
+ size_t len = min(f->ipfw.fw_outputsize, ntohs(ip->tot_len))
+ + sizeof(__u32) + sizeof(skb->fwmark) + IFNAMSIZ;
+ struct sk_buff *outskb=alloc_skb(len, GFP_ATOMIC);
+
+ duprintf("Sending packet out NETLINK (length = %u).\n",
+ (unsigned int)len);
+ if (outskb) {
+ /* Prepend length, mark & interface */
+ skb_put(outskb, len);
+ *((__u32 *)outskb->data) = (__u32)len;
+ *((__u32 *)(outskb->data+sizeof(__u32))) = skb->fwmark;
+ strcpy(outskb->data+sizeof(__u32)*2, rif);
+ memcpy(outskb->data+sizeof(__u32)*2+IFNAMSIZ, ip,
+ len-(sizeof(__u32)*2+IFNAMSIZ));
+ netlink_broadcast(ipfwsk, outskb, 0, ~0, GFP_KERNEL);
+ }
+ else {
+ if (net_ratelimit())
+ printk(KERN_WARNING "ip_fw: packet drop due to "
+ "netlink failure\n");
+ return 0;
+ }
+ }
+#endif
+ return 1;
+}
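+
+/* For userspace readers of IP_FW_F_NETLINK messages, the buffer built
+ * above is laid out as:
+ *
+ *	__u32 len;		  total length of this message
+ *	__u32 fwmark;		  the packet's firewall mark
+ *	char  ifname[IFNAMSIZ];	  receiving interface name
+ *	...			  up to fw_outputsize bytes of packet
+ */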
+
+/*
+ * Returns one of the generic firewall policies, like FW_ACCEPT.
+ *
+ * The testing is either false for normal firewall mode or true for
+ * user checking mode (counters are not updated, TOS & mark not done).
+ */
+static int
+ip_fw_check(struct iphdr *ip,
+ const char *rif,
+ __u16 *redirport,
+ struct ip_chain *chain,
+ struct sk_buff *skb,
+ unsigned int slot,
+ int testing)
+{
+ struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl);
+ struct udphdr *udp=(struct udphdr *)((__u32 *)ip+ip->ihl);
+ struct icmphdr *icmp=(struct icmphdr *)((__u32 *)ip+ip->ihl);
+ __u32 src, dst;
+ __u16 src_port = 0xFFFF, dst_port = 0xFFFF;
+ char tcpsyn=0;
+ __u16 offset;
+ unsigned char oldtos;
+ struct ip_fwkernel *f;
+ int ret = FW_SKIP+2;
+ unsigned int count;
+
+ /* We handle fragments by dealing with the first fragment as
+ * if it was a normal packet. All other fragments are treated
+ * normally, except that they will NEVER match rules that ask
+ * things we don't know (ie. TCP SYN flag or ports). If the
+ * rule is also a fragment-specific rule, non-fragments won't
+ * match it. */
+
+ offset = ntohs(ip->frag_off) & IP_OFFSET;
+
+ /*
+ * Don't allow a TCP fragment starting at offset 8 in. Nobody
+ * normal causes this. It's a cracker trying to break
+ * in by doing a flag overwrite to pass the direction
+ * checks.
+ */
+
+ if (offset == 1 && ip->protocol == IPPROTO_TCP) {
+ if (!testing && net_ratelimit()) {
+ printk("Suspect TCP fragment.\n");
+ dump_packet(ip,rif,NULL,NULL,0,0,0,0);
+ }
+ return FW_BLOCK;
+ }
+
+ /* If we can't investigate ports, treat as fragment. It's
+ * either a truncated whole packet, or a truncated first
+ * fragment, or a TCP first fragment of length 8-15, in which
+ * case the above rule stops reassembly.
+ */
+ if (offset == 0) {
+ unsigned int size_req;
+ switch (ip->protocol) {
+ case IPPROTO_TCP:
+ /* Don't care about things past flags word */
+ size_req = 16;
+ break;
+
+ case IPPROTO_UDP:
+ case IPPROTO_ICMP:
+ size_req = 8;
+ break;
+
+ default:
+ size_req = 0;
+ }
+ offset = (ntohs(ip->tot_len) < (ip->ihl<<2)+size_req);
+
+ /* If it is a truncated first fragment then it can be
+ * used to rewrite port information, and thus should
+ * be blocked.
+ */
+ if (offset && (ntohs(ip->frag_off) & IP_MF)) {
+ if (!testing && net_ratelimit()) {
+ printk("Suspect short first fragment.\n");
+ dump_packet(ip,rif,NULL,NULL,0,0,0,0);
+ }
+ return FW_BLOCK;
+ }
+ }
+
+ src = ip->saddr;
+ dst = ip->daddr;
+ oldtos = ip->tos;
+
+ /*
+ * If we got interface from which packet came
+ * we can use the address directly. Linux 2.1 now uses address
+ * chains per device too, but unlike BSD we first check if the
+ * incoming packet matches a device address and the routing
+ * table before calling the firewall.
+ */
+
+ dprintf("Packet ");
+ switch(ip->protocol)
+ {
+ case IPPROTO_TCP:
+ dprintf("TCP ");
+ if (!offset) {
+ src_port=ntohs(tcp->source);
+ dst_port=ntohs(tcp->dest);
+
+ /* Connection initialisation can only
+ * be made when the SYN bit is set and
+ * neither the ACK nor the RST bit is
+ * set. */
+ if(tcp->syn && !(tcp->ack || tcp->rst))
+ tcpsyn=1;
+ }
+ break;
+ case IPPROTO_UDP:
+ dprintf("UDP ");
+ if (!offset) {
+ src_port=ntohs(udp->source);
+ dst_port=ntohs(udp->dest);
+ }
+ break;
+ case IPPROTO_ICMP:
+ if (!offset) {
+ src_port=(__u16)icmp->type;
+ dst_port=(__u16)icmp->code;
+ }
+ dprintf("ICMP ");
+ break;
+ default:
+ dprintf("p=%d ",ip->protocol);
+ break;
+ }
+#ifdef DEBUG_IP_FIREWALL
+ print_ip(ip->saddr);
+
+ if (offset)
+ dprintf(":fragment (%i) ", ((int)offset)<<2);
+ else if (ip->protocol==IPPROTO_TCP || ip->protocol==IPPROTO_UDP
+ || ip->protocol==IPPROTO_ICMP)
+ dprintf(":%hu:%hu", src_port, dst_port);
+ dprintf("\n");
+#endif
+
+ if (!testing) FWC_READ_LOCK(&ip_fw_lock);
+ else FWC_HAVE_LOCK(fwc_rlocks);
+
+ f = chain->chain;
+ do {
+ count = 0;
+ for (; f; f = f->next) {
+ count++;
+ if (ip_rule_match(f,rif,ip,
+ tcpsyn,src_port,dst_port,offset)) {
+ if (!testing
+ && !ip_fw_domatch(f, ip, rif, chain->label,
+ skb, slot,
+ src_port, dst_port,
+ count, tcpsyn)) {
+ ret = FW_BLOCK;
+ goto out;
+ }
+ break;
+ }
+ }
+ if (f) {
+ if (f->branch) {
+ /* Do sanity check to see if we have
+ * already set prevchain and if so we
+ * must be in a loop */
+ if (f->branch->reent[slot].prevchain) {
+ if (!testing) {
+ printk(KERN_ERR
+ "IP firewall: "
+ "Loop detected "
+ "at `%s'.\n",
+ f->branch->label);
+ cleanup(chain, 1, slot);
+ ret = FW_BLOCK;
+ } else {
+ cleanup(chain, 0, slot);
+ ret = FW_SKIP+1;
+ }
+ }
+ else {
+ f->branch->reent[slot].prevchain
+ = chain;
+ f->branch->reent[slot].prevrule
+ = f->next;
+ chain = f->branch;
+ f = chain->chain;
+ }
+ }
+ else if (f->simplebranch == FW_SKIP)
+ f = f->next;
+ else if (f->simplebranch == FW_SKIP+1) {
+ /* Just like falling off the chain */
+ goto fall_off_chain;
+ }
+ else {
+ cleanup(chain, 0, slot);
+ ret = f->simplebranch;
+ }
+ } /* f == NULL */
+ else {
+ fall_off_chain:
+ if (chain->reent[slot].prevchain) {
+ struct ip_chain *tmp = chain;
+ f = chain->reent[slot].prevrule;
+ chain = chain->reent[slot].prevchain;
+ tmp->reent[slot].prevchain = NULL;
+ }
+ else {
+ ret = chain->policy;
+ if (!testing) {
+ chain->reent[slot].counters.pcnt++;
+ chain->reent[slot].counters.bcnt
+ += ntohs(ip->tot_len);
+ }
+ }
+ }
+ } while (ret == FW_SKIP+2);
+
+ out:
+ if (!testing) FWC_READ_UNLOCK(&ip_fw_lock);
+
+ /* Recalculate checksum if not going to reject, and TOS changed. */
+ if (ip->tos != oldtos
+ && ret != FW_REJECT && ret != FW_BLOCK
+ && !testing)
+ ip_send_check(ip);
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ if (ret == FW_REDIRECT && redirport) {
+ if ((*redirport = htons(f->ipfw.fw_redirpt)) == 0) {
+ /* Wildcard redirection.
+ * Note that redirport will become
+ * 0xFFFF for non-TCP/UDP packets.
+ */
+ *redirport = htons(dst_port);
+ }
+ }
+#endif
+
+#ifdef DEBUG_ALLOW_ALL
+ return (testing ? ret : FW_ACCEPT);
+#else
+ return ret;
+#endif
+}
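+
+/* A note on the walk above: the per-slot prevchain/prevrule fields in
+ * struct ip_reent form the return stack for user-defined chains --
+ * branching into a chain records where to resume, and falling off its
+ * end restores that point.  Because a chain holds only one such
+ * pointer per slot, re-entering a chain already on the current path
+ * is detectable, which is exactly the prevchain != NULL loop test.
+ */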
+
+/* Must have write lock & interrupts off for any of these */
+
+/* This function sets all the byte counters in a chain to zero. The
+ * input is a pointer to the chain required for zeroing */
+static int zero_fw_chain(struct ip_chain *chainptr)
+{
+ struct ip_fwkernel *i;
+
+ FWC_HAVE_LOCK(fwc_wlocks);
+ for (i = chainptr->chain; i; i = i->next)
+ memset(i->counters, 0, sizeof(struct ip_counters)*NUM_SLOTS);
+ return 0;
+}
+
+static int clear_fw_chain(struct ip_chain *chainptr)
+{
+ struct ip_fwkernel *i= chainptr->chain;
+
+ FWC_HAVE_LOCK(fwc_wlocks);
+ chainptr->chain=NULL;
+
+ while (i) {
+ struct ip_fwkernel *tmp = i->next;
+ if (i->branch)
+ i->branch->refcount--;
+ kfree(i);
+ i = tmp;
+ }
+ return 0;
+}
+
+static int replace_in_chain(struct ip_chain *chainptr,
+ struct ip_fwkernel *frwl,
+ __u32 position)
+{
+ struct ip_fwkernel *f = chainptr->chain;
+
+ FWC_HAVE_LOCK(fwc_wlocks);
+
+ while (--position && f != NULL) f = f->next;
+ if (f == NULL)
+ return EINVAL;
+
+ if (f->branch) f->branch->refcount--;
+ if (frwl->branch) frwl->branch->refcount++;
+
+ frwl->next = f->next;
+ memcpy(f,frwl,sizeof(struct ip_fwkernel));
+ kfree(frwl);
+ return 0;
+}
+
+static int append_to_chain(struct ip_chain *chainptr, struct ip_fwkernel *rule)
+{
+ struct ip_fwkernel *i;
+
+ FWC_HAVE_LOCK(fwc_wlocks);
+ /* Special case if no rules already present */
+ if (chainptr->chain == NULL) {
+
+ /* If pointer writes are atomic then turning off
+ * interrupts is not necessary. */
+ chainptr->chain = rule;
+ if (rule->branch) rule->branch->refcount++;
+ return 0;
+ }
+
+ /* Find the rule before the end of the chain */
+ for (i = chainptr->chain; i->next; i = i->next);
+ i->next = rule;
+ if (rule->branch) rule->branch->refcount++;
+ return 0;
+}
+
+/* This function inserts a rule at the given position in the
+ * chain referenced by chainptr. If position is 1 then this rule
+ * becomes the new first rule. */
+static int insert_in_chain(struct ip_chain *chainptr,
+ struct ip_fwkernel *frwl,
+ __u32 position)
+{
+ struct ip_fwkernel *f = chainptr->chain;
+
+ FWC_HAVE_LOCK(fwc_wlocks);
+ /* special case if the position is number 1 */
+ if (position == 1) {
+ frwl->next = chainptr->chain;
+ if (frwl->branch) frwl->branch->refcount++;
+ chainptr->chain = frwl;
+ return 0;
+ }
+ position--;
+ while (--position && f != NULL) f = f->next;
+ if (f == NULL)
+ return EINVAL;
+ if (frwl->branch) frwl->branch->refcount++;
+ frwl->next = f->next;
+
+ f->next = frwl;
+ return 0;
+}
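+
+/* A worked example of the position arithmetic above (illustration
+ * only): inserting at position 3 into a chain A -> B -> C walks to
+ * the second rule, B, and links the new rule after it, giving
+ * A -> B -> new -> C; position 1 is the prepend special case. */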
+
+/* This function deletes the rule at a given rulenum from a chain.
+ * With rulenum = 1 the first rule is deleted. */
+
+static int del_num_from_chain(struct ip_chain *chainptr, __u32 rulenum)
+{
+ struct ip_fwkernel *i=chainptr->chain,*tmp;
+
+ FWC_HAVE_LOCK(fwc_wlocks);
+
+ if (!chainptr->chain)
+ return ENOENT;
+
+ /* Need a special case for the first rule */
+ if (rulenum == 1) {
+ /* store temp to allow for freeing up of memory */
+ tmp = chainptr->chain;
+ if (chainptr->chain->branch) chainptr->chain->branch->refcount--;
+ chainptr->chain = chainptr->chain->next;
+ kfree(tmp); /* free memory that is now unused */
+ } else {
+ rulenum--;
+ while (--rulenum && i->next ) i = i->next;
+ if (!i->next)
+ return ENOENT;
+ tmp = i->next;
+ if (i->next->branch)
+ i->next->branch->refcount--;
+ i->next = i->next->next;
+ kfree(tmp);
+ }
+ return 0;
+}
+
+
+/* This function deletes a given rule from a chain. The rule deleted
+ * is the first occurrence of that rule. */
+static int del_rule_from_chain(struct ip_chain *chainptr,
+ struct ip_fwkernel *frwl)
+{
+ struct ip_fwkernel *ltmp,*ftmp = chainptr->chain ;
+ int was_found;
+
+ FWC_HAVE_LOCK(fwc_wlocks);
+
+ /* Sure, we should compare marks, but since the `ipfwadm'
+ * script uses it for an unholy hack... well, life is easier
+ * this way. We also mask it out of the flags word. --PR */
+ for (ltmp=NULL, was_found=0;
+ !was_found && ftmp != NULL;
+ ltmp = ftmp,ftmp = ftmp->next) {
+ if (ftmp->ipfw.fw_src.s_addr!=frwl->ipfw.fw_src.s_addr
+ || ftmp->ipfw.fw_dst.s_addr!=frwl->ipfw.fw_dst.s_addr
+ || ftmp->ipfw.fw_smsk.s_addr!=frwl->ipfw.fw_smsk.s_addr
+ || ftmp->ipfw.fw_dmsk.s_addr!=frwl->ipfw.fw_dmsk.s_addr
+#if 0
+ || ftmp->ipfw.fw_flg!=frwl->ipfw.fw_flg
+#else
+ || ((ftmp->ipfw.fw_flg & ~IP_FW_F_MARKABS)
+ != (frwl->ipfw.fw_flg & ~IP_FW_F_MARKABS))
+#endif
+ || ftmp->ipfw.fw_invflg!=frwl->ipfw.fw_invflg
+ || ftmp->ipfw.fw_proto!=frwl->ipfw.fw_proto
+#if 0
+ || ftmp->ipfw.fw_mark!=frwl->ipfw.fw_mark
+#endif
+ || ftmp->ipfw.fw_redirpt!=frwl->ipfw.fw_redirpt
+ || ftmp->ipfw.fw_spts[0]!=frwl->ipfw.fw_spts[0]
+ || ftmp->ipfw.fw_spts[1]!=frwl->ipfw.fw_spts[1]
+ || ftmp->ipfw.fw_dpts[0]!=frwl->ipfw.fw_dpts[0]
+ || ftmp->ipfw.fw_dpts[1]!=frwl->ipfw.fw_dpts[1]
+ || ftmp->ipfw.fw_outputsize!=frwl->ipfw.fw_outputsize) {
+ duprintf("del_rule_from_chain: mismatch:"
+ "src:%u/%u dst:%u/%u smsk:%u/%u dmsk:%u/%u "
+ "flg:%hX/%hX invflg:%hX/%hX proto:%u/%u "
+ "mark:%u/%u "
+ "ports:%hu-%hu/%hu-%hu %hu-%hu/%hu-%hu "
+ "outputsize:%hu-%hu\n",
+ ftmp->ipfw.fw_src.s_addr,
+ frwl->ipfw.fw_src.s_addr,
+ ftmp->ipfw.fw_dst.s_addr,
+ frwl->ipfw.fw_dst.s_addr,
+ ftmp->ipfw.fw_smsk.s_addr,
+ frwl->ipfw.fw_smsk.s_addr,
+ ftmp->ipfw.fw_dmsk.s_addr,
+ frwl->ipfw.fw_dmsk.s_addr,
+ ftmp->ipfw.fw_flg,
+ frwl->ipfw.fw_flg,
+ ftmp->ipfw.fw_invflg,
+ frwl->ipfw.fw_invflg,
+ ftmp->ipfw.fw_proto,
+ frwl->ipfw.fw_proto,
+ ftmp->ipfw.fw_mark,
+ frwl->ipfw.fw_mark,
+ ftmp->ipfw.fw_spts[0],
+ frwl->ipfw.fw_spts[0],
+ ftmp->ipfw.fw_spts[1],
+ frwl->ipfw.fw_spts[1],
+ ftmp->ipfw.fw_dpts[0],
+ frwl->ipfw.fw_dpts[0],
+ ftmp->ipfw.fw_dpts[1],
+ frwl->ipfw.fw_dpts[1],
+ ftmp->ipfw.fw_outputsize,
+ frwl->ipfw.fw_outputsize);
+ continue;
+ }
+
+ if (strncmp(ftmp->ipfw.fw_vianame,
+ frwl->ipfw.fw_vianame,
+ IFNAMSIZ)) {
+ duprintf("del_rule_from_chain: if mismatch: %s/%s\n",
+ ftmp->ipfw.fw_vianame,
+ frwl->ipfw.fw_vianame);
+ continue;
+ }
+ if (ftmp->branch != frwl->branch) {
+ duprintf("del_rule_from_chain: branch mismatch: "
+ "%s/%s\n",
+ ftmp->branch?ftmp->branch->label:"(null)",
+ frwl->branch?frwl->branch->label:"(null)");
+ continue;
+ }
+ if (ftmp->branch == NULL
+ && ftmp->simplebranch != frwl->simplebranch) {
+ duprintf("del_rule_from_chain: simplebranch mismatch: "
+ "%i/%i\n",
+ ftmp->simplebranch, frwl->simplebranch);
+ continue;
+ }
+ was_found = 1;
+ if (ftmp->branch)
+ ftmp->branch->refcount--;
+ if (ltmp)
+ ltmp->next = ftmp->next;
+ else
+ chainptr->chain = ftmp->next;
+ kfree(ftmp);
+ break;
+ }
+
+ if (was_found)
+ return 0;
+ else {
+ duprintf("del_rule_from_chain: no matching rule found\n");
+ return EINVAL;
+ }
+}
+
+/* This function takes the label of a chain and deletes the first
+ * chain with that name. No special cases are required for the
+ * built-in chains, as they have their refcount initialised to 1 so
+ * that they are never deleted. */
+static int del_chain(ip_chainlabel label)
+{
+ struct ip_chain *tmp,*tmp2;
+
+ FWC_HAVE_LOCK(fwc_wlocks);
+ /* Corner case: return EBUSY not ENOENT for first elem ("input") */
+ if (strcmp(label, ip_fw_chains->label) == 0)
+ return EBUSY;
+
+ for (tmp = ip_fw_chains; tmp->next; tmp = tmp->next)
+ if(strcmp(tmp->next->label,label) == 0)
+ break;
+
+ tmp2 = tmp->next;
+ if (!tmp2)
+ return ENOENT;
+
+ if (tmp2->refcount)
+ return EBUSY;
+
+ if (tmp2->chain)
+ return ENOTEMPTY;
+
+ tmp->next = tmp2->next;
+ kfree(tmp2);
+ return 0;
+}
+
+/* This is a function to initialise a chain. Built-in chains start
+ * with refcount = 1 so that they cannot be deleted. User-defined
+ * chains start with refcount = 0 so they can be deleted. */
+static struct ip_chain *ip_init_chain(ip_chainlabel name,
+ __u32 ref,
+ int policy)
+{
+ unsigned int i;
+ struct ip_chain *label
+ = kmalloc(SIZEOF_STRUCT_IP_CHAIN, GFP_KERNEL);
+ if (label == NULL)
+ panic("Can't kmalloc for firewall chains.\n");
+ strcpy(label->label,name);
+ label->next = NULL;
+ label->chain = NULL;
+ label->refcount = ref;
+ label->policy = policy;
+ for (i = 0; i < smp_num_cpus*2; i++) {
+ label->reent[i].counters.pcnt = label->reent[i].counters.bcnt
+ = 0;
+ label->reent[i].prevchain = NULL;
+ label->reent[i].prevrule = NULL;
+ }
+
+ return label;
+}
+
+/* This is a function for creating a new chain. The chain is not
+ * created if a chain of the same name already exists. */
+static int create_chain(ip_chainlabel label)
+{
+ struct ip_chain *tmp;
+
+ if (!check_label(label))
+ return EINVAL;
+
+ FWC_HAVE_LOCK(fwc_wlocks);
+ for (tmp = ip_fw_chains; tmp->next; tmp = tmp->next)
+ if (strcmp(tmp->label,label) == 0)
+ return EEXIST;
+
+ if (strcmp(tmp->label,label) == 0)
+ return EEXIST;
+
+	tmp->next = ip_init_chain(label, 0, FW_SKIP); /* refcount is
+						       * zero since this is a
+						       * user-defined chain
+						       * and therefore can be
+						       * deleted */
+ return 0;
+}
+
+/* This function simply changes the policy on one of the built-in
+ * chains. Checking must be done before this is called to ensure that
+ * chainptr is pointing to one of the three possible chains. */
+static int change_policy(struct ip_chain *chainptr, int policy)
+{
+ FWC_HAVE_LOCK(fwc_wlocks);
+ chainptr->policy = policy;
+ return 0;
+}
+
+/* This function takes an ip_fwuser and converts it to an ip_fwkernel.
+ * It also performs some sanity checks on the structure. */
+static struct ip_fwkernel *convert_ipfw(struct ip_fwuser *fwuser, int *errno)
+{
+ struct ip_fwkernel *fwkern;
+
+ if ( (fwuser->ipfw.fw_flg & ~IP_FW_F_MASK) != 0 ) {
+ duprintf("convert_ipfw: undefined flag bits set (flags=%x)\n",
+ fwuser->ipfw.fw_flg);
+ *errno = EINVAL;
+ return NULL;
+ }
+
+#ifdef DEBUG_IP_FIREWALL_USER
+ /* These are sanity checks that don't really matter.
+ * We can get rid of these once testing is complete.
+ */
+ if ((fwuser->ipfw.fw_flg & IP_FW_F_TCPSYN)
+ && ((fwuser->ipfw.fw_invflg & IP_FW_INV_PROTO)
+ || fwuser->ipfw.fw_proto != IPPROTO_TCP)) {
+ duprintf("convert_ipfw: TCP SYN flag set but proto != TCP!\n");
+ *errno = EINVAL;
+ return NULL;
+ }
+
+ if (strcmp(fwuser->label, IP_FW_LABEL_REDIRECT) != 0
+ && fwuser->ipfw.fw_redirpt != 0) {
+ duprintf("convert_ipfw: Target not REDIR but redirpt != 0!\n");
+ *errno = EINVAL;
+ return NULL;
+ }
+
+ if ((!(fwuser->ipfw.fw_flg & IP_FW_F_FRAG)
+ && (fwuser->ipfw.fw_invflg & IP_FW_INV_FRAG))
+ || (!(fwuser->ipfw.fw_flg & IP_FW_F_TCPSYN)
+ && (fwuser->ipfw.fw_invflg & IP_FW_INV_SYN))) {
+ duprintf("convert_ipfw: Can't have INV flag if flag unset!\n");
+ *errno = EINVAL;
+ return NULL;
+ }
+
+ if (((fwuser->ipfw.fw_invflg & IP_FW_INV_SRCPT)
+ && fwuser->ipfw.fw_spts[0] == 0
+ && fwuser->ipfw.fw_spts[1] == 0xFFFF)
+ || ((fwuser->ipfw.fw_invflg & IP_FW_INV_DSTPT)
+ && fwuser->ipfw.fw_dpts[0] == 0
+ && fwuser->ipfw.fw_dpts[1] == 0xFFFF)
+ || ((fwuser->ipfw.fw_invflg & IP_FW_INV_VIA)
+ && (fwuser->ipfw.fw_vianame)[0] == '\0')
+ || ((fwuser->ipfw.fw_invflg & IP_FW_INV_SRCIP)
+ && fwuser->ipfw.fw_smsk.s_addr == 0)
+ || ((fwuser->ipfw.fw_invflg & IP_FW_INV_DSTIP)
+ && fwuser->ipfw.fw_dmsk.s_addr == 0)) {
+ duprintf("convert_ipfw: INV flag makes rule unmatchable!\n");
+ *errno = EINVAL;
+ return NULL;
+ }
+
+ if ((fwuser->ipfw.fw_flg & IP_FW_F_FRAG)
+ && !(fwuser->ipfw.fw_invflg & IP_FW_INV_FRAG)
+ && (fwuser->ipfw.fw_spts[0] != 0
+ || fwuser->ipfw.fw_spts[1] != 0xFFFF
+ || fwuser->ipfw.fw_dpts[0] != 0
+ || fwuser->ipfw.fw_dpts[1] != 0xFFFF
+ || (fwuser->ipfw.fw_flg & IP_FW_F_TCPSYN))) {
+ duprintf("convert_ipfw: Can't test ports or SYN with frag!\n");
+ *errno = EINVAL;
+ return NULL;
+ }
+#endif
+
+ if ((fwuser->ipfw.fw_spts[0] != 0
+ || fwuser->ipfw.fw_spts[1] != 0xFFFF
+ || fwuser->ipfw.fw_dpts[0] != 0
+ || fwuser->ipfw.fw_dpts[1] != 0xFFFF)
+ && ((fwuser->ipfw.fw_invflg & IP_FW_INV_PROTO)
+ || (fwuser->ipfw.fw_proto != IPPROTO_TCP
+ && fwuser->ipfw.fw_proto != IPPROTO_UDP
+ && fwuser->ipfw.fw_proto != IPPROTO_ICMP))) {
+ duprintf("convert_ipfw: Can only test ports for TCP/UDP/ICMP!\n");
+ *errno = EINVAL;
+ return NULL;
+ }
+
+ fwkern = kmalloc(SIZEOF_STRUCT_IP_FW_KERNEL, GFP_KERNEL);
+ if (!fwkern) {
+ duprintf("convert_ipfw: kmalloc failed!\n");
+ *errno = ENOMEM;
+ return NULL;
+ }
+ memcpy(&fwkern->ipfw,&fwuser->ipfw,sizeof(struct ip_fw));
+
+ if (!find_special(fwuser->label, &fwkern->simplebranch)) {
+ fwkern->branch = find_label(fwuser->label);
+ if (!fwkern->branch) {
+ duprintf("convert_ipfw: chain doesn't exist `%s'.\n",
+ fwuser->label);
+ kfree(fwkern);
+ *errno = ENOENT;
+ return NULL;
+ } else if (fwkern->branch == IP_FW_INPUT_CHAIN
+ || fwkern->branch == IP_FW_FORWARD_CHAIN
+ || fwkern->branch == IP_FW_OUTPUT_CHAIN) {
+ duprintf("convert_ipfw: Can't branch to builtin chain `%s'.\n",
+ fwuser->label);
+ kfree(fwkern);
+ *errno = ENOENT;
+ return NULL;
+ }
+ } else
+ fwkern->branch = NULL;
+ memset(fwkern->counters, 0, sizeof(struct ip_counters)*NUM_SLOTS);
+
+ /* Handle empty vianame by making it a wildcard */
+ if ((fwkern->ipfw.fw_vianame)[0] == '\0')
+ fwkern->ipfw.fw_flg |= IP_FW_F_WILDIF;
+
+ fwkern->next = NULL;
+ return fwkern;
+}
+
+int ip_fw_ctl(int cmd, void *m, int len)
+{
+ int ret;
+ struct ip_chain *chain;
+ unsigned long flags;
+
+ FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags);
+
+ switch (cmd) {
+ case IP_FW_FLUSH:
+ if (len != sizeof(ip_chainlabel) || !check_label(m))
+ ret = EINVAL;
+ else if ((chain = find_label(m)) == NULL)
+ ret = ENOENT;
+ else ret = clear_fw_chain(chain);
+ break;
+
+ case IP_FW_ZERO:
+ if (len != sizeof(ip_chainlabel) || !check_label(m))
+ ret = EINVAL;
+ else if ((chain = find_label(m)) == NULL)
+ ret = ENOENT;
+ else ret = zero_fw_chain(chain);
+ break;
+
+ case IP_FW_CHECK: {
+ struct ip_fwtest *new = m;
+ struct iphdr *ip;
+
+ /* Don't need write lock. */
+ FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags);
+
+ if (len != sizeof(struct ip_fwtest) || !check_label(m))
+ return EINVAL;
+
+ /* Need readlock to do find_label */
+ FWC_READ_LOCK(&ip_fw_lock);
+
+ if ((chain = find_label(new->fwt_label)) == NULL)
+ ret = ENOENT;
+ else {
+ ip = &(new->fwt_packet.fwp_iph);
+
+ if (ip->ihl != sizeof(struct iphdr) / sizeof(int)) {
+ duprintf("ip_fw_ctl: ip->ihl=%d, want %d\n",
+ ip->ihl,
+ sizeof(struct iphdr) / sizeof(int));
+ ret = EINVAL;
+ }
+ else {
+ ret = ip_fw_check(ip, new->fwt_packet.fwp_vianame,
+ NULL, chain,
+ NULL, SLOT_NUMBER(), 1);
+ switch (ret) {
+ case FW_ACCEPT:
+ ret = 0; break;
+ case FW_REDIRECT:
+ ret = ECONNABORTED; break;
+ case FW_MASQUERADE:
+ ret = ECONNRESET; break;
+ case FW_REJECT:
+ ret = ECONNREFUSED; break;
+ /* Hack to help diag; these only get
+ returned when testing. */
+ case FW_SKIP+1:
+ ret = ELOOP; break;
+ case FW_SKIP:
+ ret = ENFILE; break;
+ default: /* FW_BLOCK */
+ ret = ETIMEDOUT; break;
+ }
+ }
+ }
+ FWC_READ_UNLOCK(&ip_fw_lock);
+ return ret;
+ }
+
+ case IP_FW_MASQ_TIMEOUTS: {
+#ifdef CONFIG_IP_MASQUERADE
+ ret = ip_fw_masq_timeouts(m, len);
+#else
+ ret = EINVAL;
+#endif
+ }
+ break;
+
+ case IP_FW_REPLACE: {
+ struct ip_fwkernel *ip_fwkern;
+ struct ip_fwnew *new = m;
+
+ if (len != sizeof(struct ip_fwnew)
+ || !check_label(new->fwn_label))
+ ret = EINVAL;
+ else if ((chain = find_label(new->fwn_label)) == NULL)
+ ret = ENOENT;
+ else if ((ip_fwkern = convert_ipfw(&new->fwn_rule, &ret))
+ != NULL)
+ ret = replace_in_chain(chain, ip_fwkern,
+ new->fwn_rulenum);
+ }
+ break;
+
+ case IP_FW_APPEND: {
+ struct ip_fwchange *new = m;
+ struct ip_fwkernel *ip_fwkern;
+
+ if (len != sizeof(struct ip_fwchange)
+ || !check_label(new->fwc_label))
+ ret = EINVAL;
+ else if ((chain = find_label(new->fwc_label)) == NULL)
+ ret = ENOENT;
+ else if ((ip_fwkern = convert_ipfw(&new->fwc_rule, &ret))
+ != NULL)
+ ret = append_to_chain(chain, ip_fwkern);
+ }
+ break;
+
+ case IP_FW_INSERT: {
+ struct ip_fwkernel *ip_fwkern;
+ struct ip_fwnew *new = m;
+
+ if (len != sizeof(struct ip_fwnew)
+ || !check_label(new->fwn_label))
+ ret = EINVAL;
+ else if ((chain = find_label(new->fwn_label)) == NULL)
+ ret = ENOENT;
+ else if ((ip_fwkern = convert_ipfw(&new->fwn_rule, &ret))
+ != NULL)
+ ret = insert_in_chain(chain, ip_fwkern,
+ new->fwn_rulenum);
+ }
+ break;
+
+ case IP_FW_DELETE: {
+ struct ip_fwchange *new = m;
+ struct ip_fwkernel *ip_fwkern;
+
+ if (len != sizeof(struct ip_fwchange)
+ || !check_label(new->fwc_label))
+ ret = EINVAL;
+ else if ((chain = find_label(new->fwc_label)) == NULL)
+ ret = ENOENT;
+ else if ((ip_fwkern = convert_ipfw(&new->fwc_rule, &ret))
+ != NULL) {
+ ret = del_rule_from_chain(chain, ip_fwkern);
+ kfree(ip_fwkern);
+ }
+ }
+ break;
+
+ case IP_FW_DELETE_NUM: {
+ struct ip_fwdelnum *new = m;
+
+ if (len != sizeof(struct ip_fwdelnum)
+ || !check_label(new->fwd_label))
+ ret = EINVAL;
+ else if ((chain = find_label(new->fwd_label)) == NULL)
+ ret = ENOENT;
+ else ret = del_num_from_chain(chain, new->fwd_rulenum);
+ }
+ break;
+
+ case IP_FW_CREATECHAIN: {
+ if (len != sizeof(ip_chainlabel)) {
+ duprintf("create_chain: bad size %i\n", len);
+ ret = EINVAL;
+ }
+ else ret = create_chain(m);
+ }
+ break;
+
+ case IP_FW_DELETECHAIN: {
+ if (len != sizeof(ip_chainlabel)) {
+ duprintf("delete_chain: bad size %i\n", len);
+ ret = EINVAL;
+ }
+ else ret = del_chain(m);
+ }
+ break;
+
+ case IP_FW_POLICY: {
+ struct ip_fwpolicy *new = m;
+
+ if (len != sizeof(struct ip_fwpolicy)
+ || !check_label(new->fwp_label))
+ ret = EINVAL;
+ else if ((chain = find_label(new->fwp_label)) == NULL)
+ ret = ENOENT;
+ else if (chain != IP_FW_INPUT_CHAIN
+ && chain != IP_FW_FORWARD_CHAIN
+ && chain != IP_FW_OUTPUT_CHAIN) {
+ duprintf("change_policy: can't change policy on user"
+ " defined chain.\n");
+ ret = EINVAL;
+ }
+ else {
+ int pol = FW_SKIP;
+ find_special(new->fwp_policy, &pol);
+
+ switch(pol) {
+ case FW_MASQUERADE:
+ if (chain != IP_FW_FORWARD_CHAIN) {
+ ret = EINVAL;
+ break;
+ }
+ /* Fall thru... */
+ case FW_BLOCK:
+ case FW_ACCEPT:
+ case FW_REJECT:
+ ret = change_policy(chain, pol);
+ break;
+ default:
+ duprintf("change_policy: bad policy `%s'\n",
+ new->fwp_policy);
+ ret = EINVAL;
+ }
+ }
+ break;
+
+ }
+ default:
+ duprintf("ip_fw_ctl: unknown request %d\n",cmd);
+ ret = EINVAL;
+ }
+
+ FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags);
+ return ret;
+}
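+
+/* A minimal user-space sketch of driving ip_fw_ctl (an illustration
+ * under assumptions, not part of this file): the `ipchains' tool
+ * issues these commands via setsockopt on a raw socket, with the
+ * command as the option name and one of the structures above as the
+ * option value. The command constants and structures are assumed to
+ * come from <linux/ip_fw.h>; error handling is elided. Appending an
+ * accept-everything rule to the builtin input chain might look like:
+ *
+ *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
+ *	struct ip_fwchange fwc;
+ *
+ *	memset(&fwc, 0, sizeof(fwc));
+ *	strcpy(fwc.fwc_label, "input");          target chain
+ *	fwc.fwc_rule.ipfw.fw_spts[1] = 0xFFFF;   any source port
+ *	fwc.fwc_rule.ipfw.fw_dpts[1] = 0xFFFF;   any destination port
+ *	strcpy(fwc.fwc_rule.label, "ACCEPT");    special target
+ *	setsockopt(s, IPPROTO_IP, IP_FW_APPEND, &fwc, sizeof(fwc));
+ *
+ * The full port ranges are required by convert_ipfw when fw_proto is
+ * zero, and the empty fw_vianame becomes a wildcard interface. */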
+
+/* Returns bytes used - doesn't NUL terminate */
+static int dump_rule(char *buffer,
+ const char *chainlabel,
+ const struct ip_fwkernel *rule)
+{
+ int len;
+ unsigned int i;
+ __u64 packets = 0, bytes = 0;
+
+ FWC_HAVE_LOCK(fwc_wlocks);
+ for (i = 0; i < NUM_SLOTS; i++) {
+ packets += rule->counters[i].pcnt;
+ bytes += rule->counters[i].bcnt;
+ }
+
+ len=sprintf(buffer,
+ "%9s " /* Chain name */
+ "%08lX/%08lX->%08lX/%08lX " /* Source & Destination IPs */
+ "%.16s " /* Interface */
+ "%X %X " /* fw_flg and fw_invflg fields */
+ "%u " /* Protocol */
+ "%-9u %-9u %-9u %-9u " /* Packet & byte counters */
+ "%u-%u %u-%u " /* Source & Dest port ranges */
+ "A%02X X%02X " /* TOS and and xor masks */
+ "%08X " /* Redirection port */
+ "%u " /* fw_mark field */
+ "%u " /* output size */
+ "%9s\n", /* Target */
+ chainlabel,
+ ntohl(rule->ipfw.fw_src.s_addr),
+ ntohl(rule->ipfw.fw_smsk.s_addr),
+ ntohl(rule->ipfw.fw_dst.s_addr),
+ ntohl(rule->ipfw.fw_dmsk.s_addr),
+ (rule->ipfw.fw_vianame)[0] ? rule->ipfw.fw_vianame : "-",
+ rule->ipfw.fw_flg,
+ rule->ipfw.fw_invflg,
+ rule->ipfw.fw_proto,
+ (__u32)(packets >> 32), (__u32)packets,
+ (__u32)(bytes >> 32), (__u32)bytes,
+ rule->ipfw.fw_spts[0], rule->ipfw.fw_spts[1],
+ rule->ipfw.fw_dpts[0], rule->ipfw.fw_dpts[1],
+ rule->ipfw.fw_tosand, rule->ipfw.fw_tosxor,
+ rule->ipfw.fw_redirpt,
+ rule->ipfw.fw_mark,
+ rule->ipfw.fw_outputsize,
+ branchname(rule->branch,rule->simplebranch));
+
+ duprintf("dump_rule: %i bytes done.\n", len);
+ return len;
+}
+
+/* File offset is actually in records, not bytes. */
+static int ip_chain_procinfo(char *buffer, char **start,
+ off_t offset, int length, int reset)
+{
+ struct ip_chain *i;
+ struct ip_fwkernel *j = ip_fw_chains->chain;
+ unsigned long flags;
+ int len = 0;
+ int last_len = 0;
+ off_t upto = 0;
+
+ duprintf("Offset starts at %lu\n", offset);
+ duprintf("ip_fw_chains is 0x%0lX\n", (unsigned long int)ip_fw_chains);
+
+ /* Need a write lock to lock out ``readers'' which update counters. */
+ FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags);
+
+ for (i = ip_fw_chains; i; i = i->next) {
+ for (j = i->chain; j; j = j->next) {
+ if (upto == offset) break;
+ duprintf("Skipping rule in chain `%s'\n",
+ i->label);
+ upto++;
+ }
+ if (upto == offset) break;
+ }
+
+ /* Don't init j first time, or once i = NULL */
+ for (; i; (void)((i = i->next) && (j = i->chain))) {
+ duprintf("Dumping chain `%s'\n", i->label);
+ for (; j; j = j->next, upto++, last_len = len)
+ {
+ len += dump_rule(buffer+len, i->label, j);
+ if (len > length) {
+ duprintf("Dumped to %i (past %i). "
+ "Moving back to %i.\n",
+ len, length, last_len);
+ len = last_len;
+ goto outside;
+ }
+ else if (reset)
+ memset(j->counters, 0,
+ sizeof(struct ip_counters)*NUM_SLOTS);
+ }
+ }
+outside:
+ FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags);
+ buffer[len] = '\0';
+
+ duprintf("ip_chain_procinfo: Length = %i (of %i). Offset = %li.\n",
+ len, length, upto);
+ /* `start' hack - see fs/proc/generic.c line ~165 */
+ *start=(char *)((unsigned int)upto-offset);
+ return len;
+}
+
+static int ip_chain_name_procinfo(char *buffer, char **start,
+ off_t offset, int length, int reset)
+{
+ struct ip_chain *i;
+ int len = 0,last_len = 0;
+ off_t pos = 0,begin = 0;
+ unsigned long flags;
+
+ /* Need a write lock to lock out ``readers'' which update counters. */
+ FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags);
+
+ for (i = ip_fw_chains; i; i = i->next)
+ {
+ unsigned int j;
+ __u32 packetsHi = 0, packetsLo = 0, bytesHi = 0, bytesLo = 0;
+
+ for (j = 0; j < NUM_SLOTS; j++) {
+ packetsLo += i->reent[j].counters.pcnt & 0xFFFFFFFF;
+ packetsHi += ((i->reent[j].counters.pcnt >> 32)
+ & 0xFFFFFFFF);
+ bytesLo += i->reent[j].counters.bcnt & 0xFFFFFFFF;
+ bytesHi += ((i->reent[j].counters.bcnt >> 32)
+ & 0xFFFFFFFF);
+ }
+
+ /* print the label and the policy */
+ len+=sprintf(buffer+len,"%s %s %i %u %u %u %u\n",
+ i->label,branchname(NULL, i->policy),i->refcount,
+ packetsHi, packetsLo, bytesHi, bytesLo);
+ pos=begin+len;
+ if(pos<offset) {
+ len=0;
+ begin=pos;
+ }
+ else if(pos>offset+length) {
+ len = last_len;
+ break;
+ }
+
+ last_len = len;
+ }
+ FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags);
+
+ *start = buffer+(offset-begin);
+ len-=(offset-begin);
+ if(len>length)
+ len=length;
+ return len;
+}
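+
+/* For illustration: one line of the proc file produced by the sprintf
+ * above should look something like the following on a freshly booted
+ * box (the policy name comes from branchname(); the 64-bit counters
+ * are split into high/low 32-bit halves):
+ *
+ *	input ACCEPT 1 0 0 0 0
+ */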
+
+/*
+ * Interface to the generic firewall chains.
+ */
+int ipfw_input_check(struct firewall_ops *this, int pf, struct device *dev,
+ void *phdr, void *arg, struct sk_buff **pskb)
+{
+ return ip_fw_check(phdr, dev->name,
+ arg, IP_FW_INPUT_CHAIN, *pskb, SLOT_NUMBER(), 0);
+}
+
+int ipfw_output_check(struct firewall_ops *this, int pf, struct device *dev,
+ void *phdr, void *arg, struct sk_buff **pskb)
+{
+ /* Locally generated bogus packets by root. <SIGH>. */
+ if (((struct iphdr *)phdr)->ihl * 4 < sizeof(struct iphdr)
+ || (*pskb)->len < sizeof(struct iphdr))
+ return FW_ACCEPT;
+ return ip_fw_check(phdr, dev->name,
+ arg, IP_FW_OUTPUT_CHAIN, *pskb, SLOT_NUMBER(), 0);
+}
+
+int ipfw_forward_check(struct firewall_ops *this, int pf, struct device *dev,
+ void *phdr, void *arg, struct sk_buff **pskb)
+{
+ return ip_fw_check(phdr, dev->name,
+ arg, IP_FW_FORWARD_CHAIN, *pskb, SLOT_NUMBER(), 0);
+}
+
+struct firewall_ops ipfw_ops=
+{
+ NULL,
+ ipfw_forward_check,
+ ipfw_input_check,
+ ipfw_output_check,
+ PF_INET,
+ 0 /* We don't even allow a fall through so we are last */
+};
+
+#ifdef CONFIG_PROC_FS
+static struct proc_dir_entry proc_net_ipfwchains_chain = {
+ PROC_NET_IPFW_CHAINS, sizeof(IP_FW_PROC_CHAINS)-1,
+ IP_FW_PROC_CHAINS, S_IFREG | S_IRUSR | S_IWUSR, 1, 0, 0,
+ 0, &proc_net_inode_operations, ip_chain_procinfo
+};
+
+static struct proc_dir_entry proc_net_ipfwchains_chainnames = {
+ PROC_NET_IPFW_CHAIN_NAMES, sizeof(IP_FW_PROC_CHAIN_NAMES)-1,
+ IP_FW_PROC_CHAIN_NAMES, S_IFREG | S_IRUSR | S_IWUSR, 1, 0, 0,
+ 0, &proc_net_inode_operations, ip_chain_name_procinfo
+};
+
+#endif
+
+__initfunc(void ip_fw_init(void))
+{
+#ifdef DEBUG_IP_FIREWALL_LOCKING
+ fwc_wlocks = fwc_rlocks = 0;
+#endif
+
+ IP_FW_INPUT_CHAIN = ip_init_chain(IP_FW_LABEL_INPUT, 1, FW_ACCEPT);
+ IP_FW_FORWARD_CHAIN = ip_init_chain(IP_FW_LABEL_FORWARD, 1, FW_ACCEPT);
+ IP_FW_OUTPUT_CHAIN = ip_init_chain(IP_FW_LABEL_OUTPUT, 1, FW_ACCEPT);
+
+ if(register_firewall(PF_INET,&ipfw_ops)<0)
+ panic("Unable to register IP firewall.\n");
+
+#ifdef CONFIG_PROC_FS
+ proc_net_register(&proc_net_ipfwchains_chain);
+ proc_net_register(&proc_net_ipfwchains_chainnames);
+#endif
+
+#ifdef CONFIG_IP_FIREWALL_NETLINK
+ ipfwsk = netlink_kernel_create(NETLINK_FIREWALL, NULL);
+ if (ipfwsk == NULL)
+ panic("ip_fw_init: cannot initialize netlink\n");
+#endif
+#if defined(DEBUG_IP_FIREWALL) || defined(DEBUG_IP_FIREWALL_USER)
+ printk("Firewall graphs enabled! Untested kernel coming thru. \n");
+#endif
+}
diff --git a/pfinet/linux-src/net/ipv4/ip_gre.c b/pfinet/linux-src/net/ipv4/ip_gre.c
new file mode 100644
index 00000000..6a7546fd
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_gre.c
@@ -0,0 +1,1223 @@
+/*
+ * Linux NET3: GRE over IP protocol decoder.
+ *
+ * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ipip.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+
+#ifdef CONFIG_IPV6
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+/*
+ Problems & solutions
+ --------------------
+
+ 1. The most important issue is detecting local dead loops.
+ They would cause complete host lockup in transmit, which
+ would be "resolved" by stack overflow or, if queueing is enabled,
+ with infinite looping in net_bh.
+
+   We cannot track such dead loops during route installation;
+   it is an infeasible task. The most general solution would be
+   to keep an skb->encapsulation counter (a sort of local ttl),
+   and silently drop the packet when it expires. It is the best
+   solution, but it supposes maintaining a new variable in ALL
+   skbs, even if no tunneling is used.
+
+   Current solution: the t->recursion lock breaks dead loops. It looks
+   like the dev->tbusy flag, but I preferred a new variable, because
+   the semantics are different. One day, when hard_start_xmit
+   becomes multithreaded, we will have to use skb->encapsulation.
+
+
+
+   2. Networking dead loops would not kill routers, but would really
+   kill the network. The IP hop limit plays the role of "t->recursion"
+   in this case, if we copy it from the packet being encapsulated to
+   the upper header. It is a very good solution, but it introduces
+   two problems:
+
+   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
+     do not work over tunnels.
+   - traceroute does not work. I planned to relay ICMP from the tunnel,
+     so that this problem would be solved and traceroute output
+     would be even more informative. This idea appeared to be wrong:
+     only Linux complies with rfc1812 now (yes, guys, Linux is the
+     only true router now :-)); all routers (at least, in my
+     neighbourhood) return only 8 bytes of payload. That is the end
+     of it.
+
+   Hence, if we want OSPF to work or traceroute to say something
+   reasonable, we should search for another solution.
+
+   One of them is to parse the packet, trying to detect inner
+   encapsulation made by our node. It is difficult or even impossible,
+   especially taking fragmentation into account. To be short, it is
+   not a solution at all.
+
+   Current solution: The solution was UNEXPECTEDLY SIMPLE.
+   We force the DF flag on tunnels with a preconfigured hop limit,
+   that is ALL. :-) Well, it does not remove the problem completely,
+   but the exponential growth of network traffic is changed to linear
+   (branches that exceed the pmtu are pruned) and the tunnel mtu
+   quickly degrades to a value <68, where looping stops.
+   Yes, it is not good if there exists a router in the loop
+   which does not force DF, even when encapsulating packets have
+   DF set. But it is not our problem! Nobody could accuse us, we did
+   all that we could. Even if it was your gated that injected the
+   fatal route into the network, even if it were you who configured
+   the fatal static route: you are innocent. :-)
+
+
+
+   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
+   practically identical code. It would be good to glue them
+   together, but it is not very evident how to make them modular.
+   sit is an integral part of IPv6; ipip and gre are naturally
+   modular. We could extract the common parts (hash table, ioctls
+   etc.) into a separate module (ip_tunnel.c).
+
+ Alexey Kuznetsov.
+ */
+
+static int ipgre_tunnel_init(struct device *dev);
+
+/* Fallback tunnel: no source, no destination, no key, no options */
+
+static int ipgre_fb_tunnel_init(struct device *dev);
+
+static struct device ipgre_fb_tunnel_dev = {
+ NULL, 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipgre_fb_tunnel_init,
+};
+
+static struct ip_tunnel ipgre_fb_tunnel = {
+ NULL, &ipgre_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"gre0", }
+};
+
+/* Tunnel hash table */
+
+/*
+ 4 hash tables:
+
+ 3: (remote,local)
+ 2: (remote,*)
+ 1: (*,local)
+ 0: (*,*)
+
+   We require an exact key match, i.e. if a key is present in the
+   packet it will match only a tunnel with the same key; if it is not
+   present, it will match only a keyless tunnel.
+
+   All keyless packets, if not matched by a configured keyless tunnel,
+   will match the fallback tunnel.
+ */
+
+#define HASH_SIZE 16
+#define HASH(addr) ((addr^(addr>>4))&0xF)
+
+static struct ip_tunnel *tunnels[4][HASH_SIZE];
+
+#define tunnels_r_l (tunnels[3])
+#define tunnels_r (tunnels[2])
+#define tunnels_l (tunnels[1])
+#define tunnels_wc (tunnels[0])
+
+/* Given src, dst and key, find the appropriate input tunnel. */
+
+static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key)
+{
+ unsigned h0 = HASH(remote);
+ unsigned h1 = HASH(key);
+ struct ip_tunnel *t;
+
+ for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
+ if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
+ if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ }
+ for (t = tunnels_r[h0^h1]; t; t = t->next) {
+ if (remote == t->parms.iph.daddr) {
+ if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ }
+ for (t = tunnels_l[h1]; t; t = t->next) {
+ if (local == t->parms.iph.saddr ||
+ (local == t->parms.iph.daddr && MULTICAST(local))) {
+ if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ }
+ for (t = tunnels_wc[h1]; t; t = t->next) {
+ if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ if (ipgre_fb_tunnel_dev.flags&IFF_UP)
+ return &ipgre_fb_tunnel;
+ return NULL;
+}
+
+static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
+{
+ u32 remote = t->parms.iph.daddr;
+ u32 local = t->parms.iph.saddr;
+ u32 key = t->parms.i_key;
+ unsigned h = HASH(key);
+ int prio = 0;
+
+ if (local)
+ prio |= 1;
+ if (remote && !MULTICAST(remote)) {
+ prio |= 2;
+ h ^= HASH(remote);
+ }
+
+ return &tunnels[prio][h];
+}
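+
+/* For example: a tunnel with both a local and a unicast remote
+ * address set gets prio 3 and lands in tunnels_r_l at bucket
+ * HASH(key)^HASH(remote), while a fully wildcarded tunnel gets prio 0
+ * and lands in tunnels_wc at bucket HASH(key). */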
+
+static void ipgre_tunnel_link(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp = ipgre_bucket(t);
+
+ t->next = *tp;
+ wmb();
+ *tp = t;
+}
+
+static void ipgre_tunnel_unlink(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp;
+
+ for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
+ if (t == *tp) {
+ *tp = t->next;
+ synchronize_bh();
+ break;
+ }
+ }
+}
+
+static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
+{
+ u32 remote = parms->iph.daddr;
+ u32 local = parms->iph.saddr;
+ u32 key = parms->i_key;
+ struct ip_tunnel *t, **tp, *nt;
+ struct device *dev;
+ unsigned h = HASH(key);
+ int prio = 0;
+
+ if (local)
+ prio |= 1;
+ if (remote && !MULTICAST(remote)) {
+ prio |= 2;
+ h ^= HASH(remote);
+ }
+ for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
+ if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
+ if (key == t->parms.i_key)
+ return t;
+ }
+ }
+ if (!create)
+ return NULL;
+
+ MOD_INC_USE_COUNT;
+ dev = kmalloc(sizeof(*dev) + sizeof(*t), GFP_KERNEL);
+ if (dev == NULL) {
+ MOD_DEC_USE_COUNT;
+ return NULL;
+ }
+ memset(dev, 0, sizeof(*dev) + sizeof(*t));
+ dev->priv = (void*)(dev+1);
+ nt = (struct ip_tunnel*)dev->priv;
+ nt->dev = dev;
+ dev->name = nt->parms.name;
+ dev->init = ipgre_tunnel_init;
+ memcpy(&nt->parms, parms, sizeof(*parms));
+ if (dev->name[0] == 0) {
+ int i;
+ for (i=1; i<100; i++) {
+ sprintf(dev->name, "gre%d", i);
+ if (dev_get(dev->name) == NULL)
+ break;
+ }
+ if (i==100)
+ goto failed;
+ memcpy(parms->name, dev->name, IFNAMSIZ);
+ }
+ if (register_netdevice(dev) < 0)
+ goto failed;
+
+ ipgre_tunnel_link(nt);
+ /* Do not decrement MOD_USE_COUNT here. */
+ return nt;
+
+failed:
+ kfree(dev);
+ MOD_DEC_USE_COUNT;
+ return NULL;
+}
+
+static void ipgre_tunnel_destroy(struct device *dev)
+{
+ ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv);
+
+ if (dev != &ipgre_fb_tunnel_dev) {
+ kfree(dev);
+ MOD_DEC_USE_COUNT;
+ }
+}
+
+
+void ipgre_err(struct sk_buff *skb, unsigned char *dp, int len)
+{
+#ifndef I_WISH_WORLD_WERE_PERFECT
+
+/* It is not :-( All the routers (except for Linux) return only
+   8 bytes of packet payload. It means that precise relaying of
+   ICMP in the real Internet is absolutely infeasible.
+
+   Moreover, Cisco "wise men" put the GRE key in the third word
+   of the GRE header. It makes it impossible to maintain even soft
+   state for keyed GRE tunnels with checksum enabled. Tell them
+   "thank you".
+
+   Well, I wonder: rfc1812 was written by a Cisco employee, so why
+   the hell do these idiots break standards established by
+   themselves???
+ */
+
+ struct iphdr *iph = (struct iphdr*)dp;
+ u16 *p = (u16*)(dp+(iph->ihl<<2));
+ int grehlen = (iph->ihl<<2) + 4;
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+ struct ip_tunnel *t;
+ u16 flags;
+
+ flags = p[0];
+ if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
+ if (flags&(GRE_VERSION|GRE_ROUTING))
+ return;
+ if (flags&GRE_KEY) {
+ grehlen += 4;
+ if (flags&GRE_CSUM)
+ grehlen += 4;
+ }
+ }
+
+ /* If only 8 bytes returned, keyed message will be dropped here */
+ if (len < grehlen)
+ return;
+
+ switch (type) {
+ default:
+ case ICMP_PARAMETERPROB:
+ return;
+
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ case ICMP_PORT_UNREACH:
+ /* Impossible event. */
+ return;
+ case ICMP_FRAG_NEEDED:
+ /* Soft state for pmtu is maintained by IP core. */
+ return;
+ default:
+ /* All others are translated to HOST_UNREACH.
+ rfc2003 contains "deep thoughts" about NET_UNREACH,
+ I believe they are just ether pollution. --ANK
+ */
+ break;
+ }
+ break;
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return;
+ break;
+ }
+
+ t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0);
+ if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
+ return;
+
+ if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+ return;
+
+ if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+ t->err_count++;
+ else
+ t->err_count = 1;
+ t->err_time = jiffies;
+ return;
+#else
+ struct iphdr *iph = (struct iphdr*)dp;
+ struct iphdr *eiph;
+ u16 *p = (u16*)(dp+(iph->ihl<<2));
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+ int rel_type = 0;
+ int rel_code = 0;
+ int rel_info = 0;
+ u16 flags;
+ int grehlen = (iph->ihl<<2) + 4;
+ struct sk_buff *skb2;
+ struct rtable *rt;
+
+ if (p[1] != __constant_htons(ETH_P_IP))
+ return;
+
+ flags = p[0];
+ if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
+ if (flags&(GRE_VERSION|GRE_ROUTING))
+ return;
+ if (flags&GRE_CSUM)
+ grehlen += 4;
+ if (flags&GRE_KEY)
+ grehlen += 4;
+ if (flags&GRE_SEQ)
+ grehlen += 4;
+ }
+ if (len < grehlen + sizeof(struct iphdr))
+ return;
+ eiph = (struct iphdr*)(dp + grehlen);
+
+ switch (type) {
+ default:
+ return;
+ case ICMP_PARAMETERPROB:
+ if (skb->h.icmph->un.gateway < (iph->ihl<<2))
+ return;
+
+		/* So... This guy found something strange INSIDE the
+		   encapsulated packet. Well, he is a fool, but what
+		   can we do?
+		 */
+ rel_type = ICMP_PARAMETERPROB;
+ rel_info = skb->h.icmph->un.gateway - grehlen;
+ break;
+
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ case ICMP_PORT_UNREACH:
+ /* Impossible event. */
+ return;
+ case ICMP_FRAG_NEEDED:
+			/* And it is the only really necessary thing :-) */
+ rel_info = ntohs(skb->h.icmph->un.frag.mtu);
+ if (rel_info < grehlen+68)
+ return;
+ rel_info -= grehlen;
+ /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
+ if (rel_info > ntohs(eiph->tot_len))
+ return;
+ break;
+ default:
+ /* All others are translated to HOST_UNREACH.
+ rfc2003 contains "deep thoughts" about NET_UNREACH,
+ I believe, it is just ether pollution. --ANK
+ */
+ rel_type = ICMP_DEST_UNREACH;
+ rel_code = ICMP_HOST_UNREACH;
+ break;
+ }
+ break;
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return;
+ break;
+ }
+
+ /* Prepare fake skb to feed it to icmp_send */
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2 == NULL)
+ return;
+ dst_release(skb2->dst);
+ skb2->dst = NULL;
+ skb_pull(skb2, skb->data - (u8*)eiph);
+ skb2->nh.raw = skb2->data;
+
+ /* Try to guess incoming interface */
+ if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) {
+ kfree_skb(skb2);
+ return;
+ }
+ skb2->dev = rt->u.dst.dev;
+
+ /* route "incoming" packet */
+ if (rt->rt_flags&RTCF_LOCAL) {
+ ip_rt_put(rt);
+ rt = NULL;
+ if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) ||
+ rt->u.dst.dev->type != ARPHRD_IPGRE) {
+ ip_rt_put(rt);
+ kfree_skb(skb2);
+ return;
+ }
+ } else {
+ ip_rt_put(rt);
+ if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
+ skb2->dst->dev->type != ARPHRD_IPGRE) {
+ kfree_skb(skb2);
+ return;
+ }
+ }
+
+ /* change mtu on this route */
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ if (rel_info > skb2->dst->pmtu) {
+ kfree_skb(skb2);
+ return;
+ }
+ skb2->dst->pmtu = rel_info;
+ rel_info = htonl(rel_info);
+ } else if (type == ICMP_TIME_EXCEEDED) {
+ struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
+ if (t->parms.iph.ttl) {
+ rel_type = ICMP_DEST_UNREACH;
+ rel_code = ICMP_HOST_UNREACH;
+ }
+ }
+
+ icmp_send(skb2, rel_type, rel_code, rel_info);
+ kfree_skb(skb2);
+#endif
+}
+
+int ipgre_rcv(struct sk_buff *skb, unsigned short len)
+{
+ struct iphdr *iph = skb->nh.iph;
+ u8 *h = skb->h.raw;
+ u16 flags = *(u16*)h;
+ u16 csum = 0;
+ u32 key = 0;
+ u32 seqno = 0;
+ struct ip_tunnel *tunnel;
+ int offset = 4;
+
+ if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
+ /* - Version must be 0.
+ - We do not support routing headers.
+ */
+ if (flags&(GRE_VERSION|GRE_ROUTING))
+ goto drop;
+
+ if (flags&GRE_CSUM) {
+ csum = ip_compute_csum(h, len);
+ offset += 4;
+ }
+ if (flags&GRE_KEY) {
+ key = *(u32*)(h + offset);
+ offset += 4;
+ }
+ if (flags&GRE_SEQ) {
+ seqno = ntohl(*(u32*)(h + offset));
+ offset += 4;
+ }
+ }
+
+ if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
+ skb->mac.raw = skb->nh.raw;
+ skb->nh.raw = skb_pull(skb, h + offset - skb->data);
+ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+ skb->ip_summed = 0;
+ skb->protocol = *(u16*)(h + 2);
+ skb->pkt_type = PACKET_HOST;
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (MULTICAST(iph->daddr)) {
+ /* Looped back packet, drop it! */
+ if (((struct rtable*)skb->dst)->key.iif == 0)
+ goto drop;
+ tunnel->stat.multicast++;
+ skb->pkt_type = PACKET_BROADCAST;
+ }
+#endif
+
+ if (((flags&GRE_CSUM) && csum) ||
+ (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
+ tunnel->stat.rx_crc_errors++;
+ tunnel->stat.rx_errors++;
+ goto drop;
+ }
+ if (tunnel->parms.i_flags&GRE_SEQ) {
+ if (!(flags&GRE_SEQ) ||
+ (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
+ tunnel->stat.rx_fifo_errors++;
+ tunnel->stat.rx_errors++;
+ goto drop;
+ }
+ tunnel->i_seqno = seqno + 1;
+ }
+ tunnel->stat.rx_packets++;
+ tunnel->stat.rx_bytes += skb->len;
+ skb->dev = tunnel->dev;
+ dst_release(skb->dst);
+ skb->dst = NULL;
+ netif_rx(skb);
+ return(0);
+ }
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return(0);
+}
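+
+/* For reference when reading the offset arithmetic above and in
+ * ipgre_tunnel_xmit below: the GRE header assumed here (RFC 1701)
+ * starts with a 16-bit flag word and the payload protocol, followed
+ * by optional 32-bit fields in a fixed order, each present only when
+ * the corresponding flag bit is set:
+ *
+ *	__u16 flags;      GRE_CSUM | GRE_KEY | GRE_SEQ | ...
+ *	__u16 protocol;   e.g. ETH_P_IP
+ *	__u32 csum;       if GRE_CSUM (checksum and reserved halves)
+ *	__u32 key;        if GRE_KEY
+ *	__u32 seqno;      if GRE_SEQ
+ *
+ * which is why `offset' starts at 4 and grows by 4 for each field
+ * present. */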
+
+static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev)
+{
+ struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+ struct net_device_stats *stats = &tunnel->stat;
+ struct iphdr *old_iph = skb->nh.iph;
+ struct iphdr *tiph;
+ u8 tos;
+ u16 df;
+ struct rtable *rt; /* Route to the other host */
+ struct device *tdev; /* Device to other host */
+ struct iphdr *iph; /* Our new IP header */
+ int max_headroom; /* The extra header space needed */
+ int gre_hlen;
+ u32 dst;
+ int mtu;
+
+ if (tunnel->recursion++) {
+ tunnel->stat.collisions++;
+ goto tx_error;
+ }
+
+ if (dev->hard_header) {
+ gre_hlen = 0;
+ tiph = (struct iphdr*)skb->data;
+ } else {
+ gre_hlen = tunnel->hlen;
+ tiph = &tunnel->parms.iph;
+ }
+
+ if ((dst = tiph->daddr) == 0) {
+ /* NBMA tunnel */
+
+ if (skb->dst == NULL) {
+ tunnel->stat.tx_fifo_errors++;
+ goto tx_error;
+ }
+
+ if (skb->protocol == __constant_htons(ETH_P_IP)) {
+ rt = (struct rtable*)skb->dst;
+ if ((dst = rt->rt_gateway) == 0)
+ goto tx_error_icmp;
+ }
+#ifdef CONFIG_IPV6
+ else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
+ struct in6_addr *addr6;
+ int addr_type;
+ struct neighbour *neigh = skb->dst->neighbour;
+
+ if (neigh == NULL)
+ goto tx_error;
+
+ addr6 = (struct in6_addr*)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (addr_type == IPV6_ADDR_ANY) {
+ addr6 = &skb->nh.ipv6h->daddr;
+ addr_type = ipv6_addr_type(addr6);
+ }
+
+ if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
+ goto tx_error_icmp;
+
+ dst = addr6->s6_addr32[3];
+ }
+#endif
+ else
+ goto tx_error;
+ }
+
+ tos = tiph->tos;
+ if (tos&1) {
+ if (skb->protocol == __constant_htons(ETH_P_IP))
+ tos = old_iph->tos;
+ tos &= ~1;
+ }
+
+ if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) {
+ tunnel->stat.tx_carrier_errors++;
+ goto tx_error;
+ }
+ tdev = rt->u.dst.dev;
+
+ if (tdev == dev) {
+ ip_rt_put(rt);
+ tunnel->stat.collisions++;
+ goto tx_error;
+ }
+
+ df = tiph->frag_off;
+ mtu = rt->u.dst.pmtu - tunnel->hlen;
+
+ if (skb->protocol == __constant_htons(ETH_P_IP)) {
+ if (skb->dst && mtu < skb->dst->pmtu && mtu >= 68)
+ skb->dst->pmtu = mtu;
+
+ df |= (old_iph->frag_off&__constant_htons(IP_DF));
+
+ if ((old_iph->frag_off&__constant_htons(IP_DF)) &&
+ mtu < ntohs(old_iph->tot_len)) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+ }
+#ifdef CONFIG_IPV6
+ else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
+ struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
+
+ if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= IPV6_MIN_MTU) {
+ if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
+ rt6->rt6i_dst.plen == 128) {
+ rt6->rt6i_flags |= RTF_MODIFIED;
+ skb->dst->pmtu = mtu;
+ }
+ }
+
+ if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+ }
+#endif
+
+ if (tunnel->err_count > 0) {
+ if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+ tunnel->err_count--;
+
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ skb->h.raw = skb->nh.raw;
+
+ max_headroom = ((tdev->hard_header_len+15)&~15)+ gre_hlen;
+
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
+ struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+ if (!new_skb) {
+ ip_rt_put(rt);
+ stats->tx_dropped++;
+ dev_kfree_skb(skb);
+ tunnel->recursion--;
+ return 0;
+ }
+ if (skb->sk)
+ skb_set_owner_w(new_skb, skb->sk);
+ dev_kfree_skb(skb);
+ skb = new_skb;
+ }
+
+ skb->nh.raw = skb_push(skb, gre_hlen);
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+ /*
+ * Push down and install the IPIP header.
+ */
+
+ iph = skb->nh.iph;
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr) >> 2;
+ iph->frag_off = df;
+ iph->protocol = IPPROTO_GRE;
+ iph->tos = tos;
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+
+ if ((iph->ttl = tiph->ttl) == 0) {
+ if (skb->protocol == __constant_htons(ETH_P_IP))
+ iph->ttl = old_iph->ttl;
+#ifdef CONFIG_IPV6
+ else if (skb->protocol == __constant_htons(ETH_P_IPV6))
+ iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
+#endif
+ else
+ iph->ttl = ip_statistics.IpDefaultTTL;
+ }
+
+ ((u16*)(iph+1))[0] = tunnel->parms.o_flags;
+ ((u16*)(iph+1))[1] = skb->protocol;
+
+ if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
+ u32 *ptr = (u32*)(((u8*)iph) + tunnel->hlen - 4);
+
+ if (tunnel->parms.o_flags&GRE_SEQ) {
+ ++tunnel->o_seqno;
+ *ptr = htonl(tunnel->o_seqno);
+ ptr--;
+ }
+ if (tunnel->parms.o_flags&GRE_KEY) {
+ *ptr = tunnel->parms.o_key;
+ ptr--;
+ }
+ if (tunnel->parms.o_flags&GRE_CSUM) {
+ *ptr = 0;
+ *(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
+ }
+ }
+
+ iph->tot_len = htons(skb->len);
+ iph->id = htons(ip_id_count++);
+ ip_send_check(iph);
+
+ stats->tx_bytes += skb->len;
+ stats->tx_packets++;
+ ip_send(skb);
+ tunnel->recursion--;
+ return 0;
+
+tx_error_icmp:
+ dst_link_failure(skb);
+
+tx_error:
+ stats->tx_errors++;
+ dev_kfree_skb(skb);
+ tunnel->recursion--;
+ return 0;
+}
+
+static int
+ipgre_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel_parm p;
+ struct ip_tunnel *t;
+
+ MOD_INC_USE_COUNT;
+
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ t = NULL;
+ if (dev == &ipgre_fb_tunnel_dev) {
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+ err = -EFAULT;
+ break;
+ }
+ t = ipgre_tunnel_locate(&p, 0);
+ }
+ if (t == NULL)
+ t = (struct ip_tunnel*)dev->priv;
+ memcpy(&p, &t->parms, sizeof(p));
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ err = -EFAULT;
+ break;
+
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ goto done;
+
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ goto done;
+
+ err = -EINVAL;
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
+ p.iph.ihl != 5 || (p.iph.frag_off&__constant_htons(~IP_DF)) ||
+ ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
+ goto done;
+ if (p.iph.ttl)
+ p.iph.frag_off |= __constant_htons(IP_DF);
+
+ if (!(p.i_flags&GRE_KEY))
+ p.i_key = 0;
+ if (!(p.o_flags&GRE_KEY))
+ p.o_key = 0;
+
+ t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
+
+ if (dev != &ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL &&
+ t != &ipgre_fb_tunnel) {
+ if (t != NULL) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ unsigned nflags=0;
+
+ t = (struct ip_tunnel*)dev->priv;
+
+ if (MULTICAST(p.iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p.iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
+ err = -EINVAL;
+ break;
+ }
+ start_bh_atomic();
+ ipgre_tunnel_unlink(t);
+ t->parms.iph.saddr = p.iph.saddr;
+ t->parms.iph.daddr = p.iph.daddr;
+ t->parms.i_key = p.i_key;
+ t->parms.o_key = p.o_key;
+ memcpy(dev->dev_addr, &p.iph.saddr, 4);
+ memcpy(dev->broadcast, &p.iph.daddr, 4);
+ ipgre_tunnel_link(t);
+ end_bh_atomic();
+ netdev_state_change(dev);
+ }
+ }
+
+ if (t) {
+ err = 0;
+ if (cmd == SIOCCHGTUNNEL) {
+ t->parms.iph.ttl = p.iph.ttl;
+ t->parms.iph.tos = p.iph.tos;
+ t->parms.iph.frag_off = p.iph.frag_off;
+ }
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
+ err = -EFAULT;
+ } else
+ err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+ break;
+
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ goto done;
+
+ if (dev == &ipgre_fb_tunnel_dev) {
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ goto done;
+ err = -ENOENT;
+ if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
+ goto done;
+ err = -EPERM;
+ if (t == &ipgre_fb_tunnel)
+ goto done;
+ }
+ err = unregister_netdevice(dev);
+ break;
+
+ default:
+ err = -EINVAL;
+ }
+
+done:
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+static struct net_device_stats *ipgre_tunnel_get_stats(struct device *dev)
+{
+ return &(((struct ip_tunnel*)dev->priv)->stat);
+}
+
+static int ipgre_tunnel_change_mtu(struct device *dev, int new_mtu)
+{
+ struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+ if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+/* Nice toy. Unfortunately, useless in real life :-)
+   It allows one to construct a virtual multiprotocol broadcast "LAN"
+   over the Internet, provided multicast routing is tuned.
+
+   I have no idea whether this bicycle was invented before me,
+   so I had to set ARPHRD_IPGRE to a random value.
+   I have the impression that Cisco could have made something similar,
+   but this feature is apparently missing in IOS<=11.2(8).
+
+ I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
+ with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
+
+ ping -t 255 224.66.66.66
+
+ If nobody answers, mbone does not work.
+
+ ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
+ ip addr add 10.66.66.<somewhat>/24 dev Universe
+ ifconfig Universe up
+ ifconfig Universe add fe80::<Your_real_addr>/10
+ ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
+ ftp 10.66.66.66
+ ...
+ ftp fec0:6666:6666::193.233.7.65
+ ...
+
+ */
+
+static int ipgre_header(struct sk_buff *skb, struct device *dev, unsigned short type,
+ void *daddr, void *saddr, unsigned len)
+{
+ struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
+ struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
+ u16 *p = (u16*)(iph+1);
+
+ memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
+ p[0] = t->parms.o_flags;
+ p[1] = htons(type);
+
+ /*
+ * Set the source hardware address.
+ */
+
+ if (saddr)
+ memcpy(&iph->saddr, saddr, 4);
+
+ if (daddr) {
+ memcpy(&iph->daddr, daddr, 4);
+ return t->hlen;
+ }
+ if (iph->daddr && !MULTICAST(iph->daddr))
+ return t->hlen;
+
+ return -t->hlen;
+}
+
+static int ipgre_open(struct device *dev)
+{
+ struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
+
+ MOD_INC_USE_COUNT;
+ if (MULTICAST(t->parms.iph.daddr)) {
+ struct rtable *rt;
+ if (ip_route_output(&rt, t->parms.iph.daddr,
+ t->parms.iph.saddr, RT_TOS(t->parms.iph.tos),
+ t->parms.link)) {
+ MOD_DEC_USE_COUNT;
+ return -EADDRNOTAVAIL;
+ }
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ if (dev->ip_ptr == NULL) {
+ MOD_DEC_USE_COUNT;
+ return -EADDRNOTAVAIL;
+ }
+ t->mlink = dev->ifindex;
+ ip_mc_inc_group(dev->ip_ptr, t->parms.iph.daddr);
+ }
+ return 0;
+}
+
+static int ipgre_close(struct device *dev)
+{
+ struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
+ if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
+ dev = dev_get_by_index(t->mlink);
+ if (dev && dev->ip_ptr)
+ ip_mc_dec_group(dev->ip_ptr, t->parms.iph.daddr);
+ }
+ MOD_DEC_USE_COUNT;
+ return 0;
+}
+
+#endif
+
+static void ipgre_tunnel_init_gen(struct device *dev)
+{
+ struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
+
+ dev->destructor = ipgre_tunnel_destroy;
+ dev->hard_start_xmit = ipgre_tunnel_xmit;
+ dev->get_stats = ipgre_tunnel_get_stats;
+ dev->do_ioctl = ipgre_tunnel_ioctl;
+ dev->change_mtu = ipgre_tunnel_change_mtu;
+
+ dev_init_buffers(dev);
+
+ dev->type = ARPHRD_IPGRE;
+ dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
+ dev->mtu = 1500 - sizeof(struct iphdr) - 4;
+ dev->flags = IFF_NOARP;
+ dev->iflink = 0;
+ dev->addr_len = 4;
+ memcpy(dev->dev_addr, &t->parms.iph.saddr, 4);
+ memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
+}
+
+static int ipgre_tunnel_init(struct device *dev)
+{
+ struct device *tdev = NULL;
+ struct ip_tunnel *tunnel;
+ struct iphdr *iph;
+ int hlen = LL_MAX_HEADER;
+ int mtu = 1500;
+ int addend = sizeof(struct iphdr) + 4;
+
+ tunnel = (struct ip_tunnel*)dev->priv;
+ iph = &tunnel->parms.iph;
+
+ ipgre_tunnel_init_gen(dev);
+
+ /* Guess output device to choose reasonable mtu and hard_header_len */
+
+ if (iph->daddr) {
+ struct rtable *rt;
+ if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) {
+ tdev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ }
+
+ dev->flags |= IFF_POINTOPOINT;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (MULTICAST(iph->daddr)) {
+ if (!iph->saddr)
+ return -EINVAL;
+ dev->flags = IFF_BROADCAST;
+ dev->hard_header = ipgre_header;
+ dev->open = ipgre_open;
+ dev->stop = ipgre_close;
+ }
+#endif
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = dev_get_by_index(tunnel->parms.link);
+
+ if (tdev) {
+ hlen = tdev->hard_header_len;
+ mtu = tdev->mtu;
+ }
+ dev->iflink = tunnel->parms.link;
+
+ /* Precalculate GRE options length */
+ if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
+ if (tunnel->parms.o_flags&GRE_CSUM)
+ addend += 4;
+ if (tunnel->parms.o_flags&GRE_KEY)
+ addend += 4;
+ if (tunnel->parms.o_flags&GRE_SEQ)
+ addend += 4;
+ }
+ dev->hard_header_len = hlen + addend;
+ dev->mtu = mtu - addend;
+ tunnel->hlen = addend;
+ return 0;
+}
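+
+/* A worked example of the arithmetic above: over an Ethernet device
+ * (hard_header_len 14, mtu 1500), a tunnel with GRE_CSUM and GRE_KEY
+ * set gets addend = 20 (iphdr) + 4 (flags/proto) + 4 (csum) + 4 (key)
+ * = 32, so dev->hard_header_len becomes 46 and dev->mtu 1468. */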
+
+#ifdef MODULE
+static int ipgre_fb_tunnel_open(struct device *dev)
+{
+ MOD_INC_USE_COUNT;
+ return 0;
+}
+
+static int ipgre_fb_tunnel_close(struct device *dev)
+{
+ MOD_DEC_USE_COUNT;
+ return 0;
+}
+#endif
+
+__initfunc(int ipgre_fb_tunnel_init(struct device *dev))
+{
+ struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+ struct iphdr *iph;
+
+ ipgre_tunnel_init_gen(dev);
+#ifdef MODULE
+ dev->open = ipgre_fb_tunnel_open;
+ dev->stop = ipgre_fb_tunnel_close;
+#endif
+
+ iph = &ipgre_fb_tunnel.parms.iph;
+ iph->version = 4;
+ iph->protocol = IPPROTO_GRE;
+ iph->ihl = 5;
+ tunnel->hlen = sizeof(struct iphdr) + 4;
+
+ tunnels_wc[0] = &ipgre_fb_tunnel;
+ return 0;
+}
+
+
+static struct inet_protocol ipgre_protocol = {
+ ipgre_rcv, /* GRE handler */
+ ipgre_err, /* TUNNEL error control */
+ 0, /* next */
+ IPPROTO_GRE, /* protocol ID */
+ 0, /* copy */
+ NULL, /* data */
+ "GRE" /* name */
+};
+
+
+/*
+ * And now the modules code and kernel interface.
+ */
+
+#ifdef MODULE
+int init_module(void)
+#else
+__initfunc(int ipgre_init(void))
+#endif
+{
+ printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
+
+ ipgre_fb_tunnel_dev.priv = (void*)&ipgre_fb_tunnel;
+ ipgre_fb_tunnel_dev.name = ipgre_fb_tunnel.parms.name;
+#ifdef MODULE
+ register_netdev(&ipgre_fb_tunnel_dev);
+#else
+ register_netdevice(&ipgre_fb_tunnel_dev);
+#endif
+
+ inet_add_protocol(&ipgre_protocol);
+ return 0;
+}
+
+#ifdef MODULE
+
+void cleanup_module(void)
+{
+ if ( inet_del_protocol(&ipgre_protocol) < 0 )
+ printk(KERN_INFO "ipgre close: can't remove protocol\n");
+
+ unregister_netdev(&ipgre_fb_tunnel_dev);
+}
+
+#endif
diff --git a/pfinet/linux-src/net/ipv4/ip_input.c b/pfinet/linux-src/net/ipv4/ip_input.c
new file mode 100644
index 00000000..7a3e2618
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_input.c
@@ -0,0 +1,549 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The Internet Protocol (IP) module.
+ *
+ * Version: $Id: ip_input.c,v 1.37 1999/04/22 10:38:36 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Donald Becker, <becker@super.org>
+ * Alan Cox, <Alan.Cox@linux.org>
+ * Richard Underwood
+ * Stefan Becker, <stefanb@yello.ping.de>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *
+ *
+ * Fixes:
+ * Alan Cox : Commented a couple of minor bits of surplus code
+ * Alan Cox : Undefining IP_FORWARD doesn't include the code
+ * (just stops a compiler warning).
+ * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes
+ * are junked rather than corrupting things.
+ * Alan Cox : Frames to bad broadcast subnets are dumped
+ * We used to process them non broadcast and
+ * boy could that cause havoc.
+ * Alan Cox : ip_forward sets the free flag on the
+ * new frame it queues. Still crap because
+ * it copies the frame but at least it
+ * doesn't eat memory too.
+ * Alan Cox : Generic queue code and memory fixes.
+ * Fred Van Kempen : IP fragment support (borrowed from NET2E)
+ * Gerhard Koerting: Forward fragmented frames correctly.
+ * Gerhard Koerting: Fixes to my fix of the above 8-).
+ * Gerhard Koerting: IP interface addressing fix.
+ * Linus Torvalds : More robustness checks
+ * Alan Cox : Even more checks: Still not as robust as it ought to be
+ * Alan Cox : Save IP header pointer for later
+ * Alan Cox : ip option setting
+ * Alan Cox : Use ip_tos/ip_ttl settings
+ * Alan Cox : Fragmentation bogosity removed
+ * (Thanks to Mark.Bush@prg.ox.ac.uk)
+ * Dmitry Gorodchanin : Send of a raw packet crash fix.
+ * Alan Cox : Silly ip bug when an overlength
+ * fragment turns up. Now frees the
+ * queue.
+ * Linus Torvalds/ : Memory leakage on fragmentation
+ * Alan Cox : handling.
+ * Gerhard Koerting: Forwarding uses IP priority hints
+ * Teemu Rantanen : Fragment problems.
+ * Alan Cox : General cleanup, comments and reformat
+ * Alan Cox : SNMP statistics
+ * Alan Cox : BSD address rule semantics. Also see
+ * UDP as there is a nasty checksum issue
+ * if you do things the wrong way.
+ * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file
+ * Alan Cox : IP options adjust sk->priority.
+ * Pedro Roque : Fix mtu/length error in ip_forward.
+ * Alan Cox : Avoid ip_chk_addr when possible.
+ * Richard Underwood : IP multicasting.
+ * Alan Cox : Cleaned up multicast handlers.
+ * Alan Cox : RAW sockets demultiplex in the BSD style.
+ * Gunther Mayer : Fix the SNMP reporting typo
+ * Alan Cox : Always in group 224.0.0.1
+ * Pauline Middelink : Fast ip_checksum update when forwarding
+ * Masquerading support.
+ * Alan Cox : Multicast loopback error for 224.0.0.1
+ * Alan Cox : IP_MULTICAST_LOOP option.
+ * Alan Cox : Use notifiers.
+ * Bjorn Ekwall : Removed ip_csum (from slhc.c too)
+ * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!)
+ * Stefan Becker : Send out ICMP HOST REDIRECT
+ * Arnt Gulbrandsen : ip_build_xmit
+ * Alan Cox : Per socket routing cache
+ * Alan Cox : Fixed routing cache, added header cache.
+ * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it.
+ * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net.
+ * Alan Cox : Incoming IP option handling.
+ * Alan Cox : Set saddr on raw output frames as per BSD.
+ * Alan Cox : Stopped broadcast source route explosions.
+ * Alan Cox : Can disable source routing
+ * Takeshi Sone : Masquerading didn't work.
+ * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible.
+ * Alan Cox : Memory leaks, tramples, misc debugging.
+ * Alan Cox : Fixed multicast (by popular demand 8))
+ * Alan Cox : Fixed forwarding (by even more popular demand 8))
+ * Alan Cox : Fixed SNMP statistics [I think]
+ * Gerhard Koerting : IP fragmentation forwarding fix
+ * Alan Cox : Device lock against page fault.
+ * Alan Cox : IP_HDRINCL facility.
+ * Werner Almesberger : Zero fragment bug
+ * Alan Cox : RAW IP frame length bug
+ * Alan Cox : Outgoing firewall on build_xmit
+ * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel
+ * Alan Cox : Multicast routing hooks
+ * Jos Vos : Do accounting *before* call_in_firewall
+ * Willy Konynenberg : Transparent proxying support
+ *
+ *
+ *
+ * To Fix:
+ * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
+ * and could be made very efficient with the addition of some virtual memory hacks to permit
+ * the allocation of a buffer that can then be 'grown' by twiddling page tables.
+ * Output fragmentation wants updating along with the buffer management to use a single
+ * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
+ * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
+ * fragmentation anyway.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/config.h>
+
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/raw.h>
+#include <net/checksum.h>
+#include <linux/ip_fw.h>
+#ifdef CONFIG_IP_MASQUERADE
+#include <net/ip_masq.h>
+#endif
+#include <linux/firewall.h>
+#include <linux/mroute.h>
+#include <linux/netlink.h>
+
+/*
+ * SNMP management statistics
+ */
+
+struct ip_mib ip_statistics={2,IPDEFTTL,}; /* Forwarding=No, Default TTL=64 */
+
+
+/*
+ * Handle the issuing of an ioctl() request
+ * for the ip device. This is scheduled to
+ * disappear
+ */
+
+int ip_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ switch(cmd)
+ {
+ default:
+ return(-EINVAL);
+ }
+}
+
+
+#if defined(CONFIG_IP_TRANSPARENT_PROXY) && !defined(CONFIG_IP_ALWAYS_DEFRAG)
+#define CONFIG_IP_ALWAYS_DEFRAG 1
+#endif
+
+/*
+ * 0 - deliver
+ * 1 - block
+ */
+static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
+{
+ int type;
+
+ type = skb->h.icmph->type;
+ if (type < 32)
+ return test_bit(type, &sk->tp_pinfo.tp_raw4.filter);
+
+ /* Do not block unknown ICMP types */
+ return 0;
+}
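+
+/*
+ * Illustrative sketch (not part of the original source): the bitmask
+ * consulted above is installed from user space with the ICMP_FILTER
+ * socket option on a raw ICMP socket; a set bit means "block this
+ * type". Assumes the standard <linux/icmp.h> definitions.
+ */
+#if 0
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <linux/icmp.h>
+
+static int block_echo_replies(int rawsock)
+{
+	struct icmp_filter filt;
+
+	/* Bit ICMP_ECHOREPLY set => icmp_filter() above returns 1 (block) */
+	filt.data = 1 << ICMP_ECHOREPLY;
+	return setsockopt(rawsock, SOL_RAW, ICMP_FILTER,
+			  &filt, sizeof(filt));
+}
+#endif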
+
+/*
+ * Process Router Attention IP option
+ */
+int ip_call_ra_chain(struct sk_buff *skb)
+{
+ struct ip_ra_chain *ra;
+ u8 protocol = skb->nh.iph->protocol;
+ struct sock *last = NULL;
+
+ for (ra = ip_ra_chain; ra; ra = ra->next) {
+ struct sock *sk = ra->sk;
+ if (sk && sk->num == protocol) {
+ if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+ skb = ip_defrag(skb);
+ if (skb == NULL)
+ return 1;
+ }
+ if (last) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ raw_rcv(last, skb2);
+ }
+ last = sk;
+ }
+ }
+
+ if (last) {
+ raw_rcv(last, skb);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Deliver IP Packets to the higher protocol layers.
+ */
+int ip_local_deliver(struct sk_buff *skb)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct inet_protocol *ipprot;
+ struct sock *raw_sk=NULL;
+ unsigned char hash;
+ int flag = 0;
+
+#ifndef CONFIG_IP_ALWAYS_DEFRAG
+ /*
+ * Reassemble IP fragments.
+ */
+
+ if (iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+ skb = ip_defrag(skb);
+ if (!skb)
+ return 0;
+ iph = skb->nh.iph;
+ }
+#endif
+
+#ifdef CONFIG_IP_MASQUERADE
+ /*
+ * Do we need to de-masquerade this packet?
+ */
+ {
+ int ret;
+ /*
+		 * Some masq modules can re-inject packets if
+		 * badly configured.
+ */
+
+ if((IPCB(skb)->flags&IPSKB_MASQUERADED)) {
+ printk(KERN_DEBUG "ip_input(): demasq recursion detected. Check masq modules configuration\n");
+ kfree_skb(skb);
+ return 0;
+ }
+
+ ret = ip_fw_demasquerade(&skb);
+ if (ret < 0) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ if (ret) {
+ iph=skb->nh.iph;
+ IPCB(skb)->flags |= IPSKB_MASQUERADED;
+ dst_release(skb->dst);
+ skb->dst = NULL;
+ if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev)) {
+ kfree_skb(skb);
+ return 0;
+ }
+ return skb->dst->input(skb);
+ }
+ }
+#endif
+
+ /*
+ * Point into the IP datagram, just past the header.
+ */
+
+ skb->h.raw = skb->nh.raw + iph->ihl*4;
+
+ /*
+	 * Deliver to raw sockets. This is fun, as we want to avoid making any
+	 * surplus copies.
+	 *
+	 * RFC 1122: SHOULD pass TOS value up to the transport layer.
+	 * -> It does. And not only the TOS, but the whole IP header.
+ */
+
+ /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
+ hash = iph->protocol & (MAX_INET_PROTOS - 1);
+
+ /*
+	 * If there may be a raw socket we must check; if not, we couldn't care less.
+ */
+
+ if((raw_sk = raw_v4_htable[hash]) != NULL) {
+ struct sock *sknext = NULL;
+ struct sk_buff *skb1;
+ raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex);
+ if(raw_sk) { /* Any raw sockets */
+ do {
+ /* Find the next */
+ sknext = raw_v4_lookup(raw_sk->next, iph->protocol,
+ iph->saddr, iph->daddr, skb->dev->ifindex);
+ if (iph->protocol != IPPROTO_ICMP || !icmp_filter(raw_sk, skb)) {
+ if (sknext == NULL)
+ break;
+ skb1 = skb_clone(skb, GFP_ATOMIC);
+ if(skb1)
+ {
+ raw_rcv(raw_sk, skb1);
+ }
+ }
+ raw_sk = sknext;
+ } while(raw_sk!=NULL);
+
+ /* Here either raw_sk is the last raw socket, or NULL if
+ * none. We deliver to the last raw socket AFTER the
+ * protocol checks as it avoids a surplus copy.
+ */
+ }
+ }
+
+ /*
+ * skb->h.raw now points at the protocol beyond the IP header.
+ */
+
+ for (ipprot = (struct inet_protocol *)inet_protos[hash];ipprot != NULL;ipprot=(struct inet_protocol *)ipprot->next)
+ {
+ struct sk_buff *skb2;
+
+ if (ipprot->protocol != iph->protocol)
+ continue;
+ /*
+		 * See if we need to make a copy of it. This will
+		 * only be set if more than one protocol wants it,
+		 * and then not for the last one. If there is a pending
+		 * raw delivery, wait for that.
+ */
+
+ if (ipprot->copy || raw_sk)
+ {
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if(skb2==NULL)
+ continue;
+ }
+ else
+ {
+ skb2 = skb;
+ }
+ flag = 1;
+
+ /*
+ * Pass on the datagram to each protocol that wants it,
+ * based on the datagram protocol. We should really
+ * check the protocol handler's return values here...
+ */
+
+ ipprot->handler(skb2, ntohs(iph->tot_len) - (iph->ihl * 4));
+ }
+
+ /*
+ * All protocols checked.
+ * If this packet was a broadcast, we may *not* reply to it, since that
+ * causes (proven, grin) ARP storms and a leakage of memory (i.e. all
+ * ICMP reply messages get queued up for transmission...)
+ */
+
+ if(raw_sk!=NULL) /* Shift to last raw user */
+ {
+ raw_rcv(raw_sk, skb);
+
+ }
+ else if (!flag) /* Free and report errors */
+ {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
+ kfree_skb(skb);
+ }
+
+ return(0);
+}
+
+/*
+ * Main IP Receive routine.
+ */
+int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
+{
+ struct iphdr *iph = skb->nh.iph;
+#ifdef CONFIG_FIREWALL
+ int fwres;
+ u16 rport;
+#endif /* CONFIG_FIREWALL */
+
+ /*
+ * When the interface is in promisc. mode, drop all the crap
+ * that it receives, do not try to analyse it.
+ */
+ if (skb->pkt_type == PACKET_OTHERHOST)
+ goto drop;
+
+ ip_statistics.IpInReceives++;
+
+ /*
+ * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
+ *
+ * Is the datagram acceptable?
+ *
+ * 1. Length at least the size of an ip header
+ * 2. Version of 4
+ * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
+ * 4. Doesn't have a bogus length
+ */
+
+ if (skb->len < sizeof(struct iphdr))
+ goto inhdr_error;
+ if (iph->ihl < 5 || iph->version != 4 || ip_fast_csum((u8 *)iph, iph->ihl) != 0)
+ goto inhdr_error;
+
+ {
+ __u32 len = ntohs(iph->tot_len);
+ if (skb->len < len)
+ goto inhdr_error;
+
+ /*
+ * Our transport medium may have padded the buffer out. Now we know it
+ * is IP we can trim to the true length of the frame.
+ * Note this now means skb->len holds ntohs(iph->tot_len).
+ */
+
+ __skb_trim(skb, len);
+ }
+
+#ifdef CONFIG_IP_ALWAYS_DEFRAG
+ /* Won't send ICMP reply, since skb->dst == NULL. --RR */
+ if (iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+ skb = ip_defrag(skb);
+ if (!skb)
+ return 0;
+ iph = skb->nh.iph;
+ ip_send_check(iph);
+ }
+#endif
+
+#ifdef CONFIG_FIREWALL
+ /*
+ * See if the firewall wants to dispose of the packet.
+ *
+ * We can't do ICMP reply or local delivery before routing,
+ * so we delay those decisions until after route. --RR
+ */
+ fwres = call_in_firewall(PF_INET, dev, iph, &rport, &skb);
+ if (fwres < FW_ACCEPT && fwres != FW_REJECT)
+ goto drop;
+ iph = skb->nh.iph;
+#endif /* CONFIG_FIREWALL */
+
+ /*
+ * Initialise the virtual path cache for the packet. It describes
+ * how the packet travels inside Linux networking.
+ */
+ if (skb->dst == NULL) {
+ if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
+ goto drop;
+#ifdef CONFIG_CPU_IS_SLOW
+ if (net_cpu_congestion > 10 && !(iph->tos&IPTOS_RELIABILITY) &&
+ IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) {
+ goto drop;
+ }
+#endif
+ }
+
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (skb->dst->tclassid) {
+ u32 idx = skb->dst->tclassid;
+ ip_rt_acct[idx&0xFF].o_packets++;
+ ip_rt_acct[idx&0xFF].o_bytes+=skb->len;
+ ip_rt_acct[(idx>>16)&0xFF].i_packets++;
+ ip_rt_acct[(idx>>16)&0xFF].i_bytes+=skb->len;
+ }
+#endif
+
+ if (iph->ihl > 5) {
+ struct ip_options *opt;
+
+		/* It looks like overkill, because not all
+		   IP options require packet mangling.
+		   But it is the easiest for now, especially taking
+		   into account that the combination of IP options
+		   and a running sniffer is an extremely rare condition.
+ --ANK (980813)
+ */
+
+ skb = skb_cow(skb, skb_headroom(skb));
+ if (skb == NULL)
+ return 0;
+ iph = skb->nh.iph;
+
+ skb->ip_summed = 0;
+ if (ip_options_compile(NULL, skb))
+ goto inhdr_error;
+
+ opt = &(IPCB(skb)->opt);
+ if (opt->srr) {
+ struct in_device *in_dev = dev->ip_ptr;
+ if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev)) {
+ if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
+ printk(KERN_INFO "source route option %d.%d.%d.%d -> %d.%d.%d.%d\n",
+ NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+ goto drop;
+ }
+ if (ip_options_rcv_srr(skb))
+ goto drop;
+ }
+ }
+
+#ifdef CONFIG_FIREWALL
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ if (fwres == FW_REDIRECT && (IPCB(skb)->redirport = rport) != 0)
+ return ip_local_deliver(skb);
+#endif /* CONFIG_IP_TRANSPARENT_PROXY */
+
+ if (fwres == FW_REJECT) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+ goto drop;
+ }
+#endif /* CONFIG_FIREWALL */
+
+ return skb->dst->input(skb);
+
+inhdr_error:
+ ip_statistics.IpInHdrErrors++;
+drop:
+ kfree_skb(skb);
+ return(0);
+}
+
diff --git a/pfinet/linux-src/net/ipv4/ip_masq.c b/pfinet/linux-src/net/ipv4/ip_masq.c
new file mode 100644
index 00000000..0187c58d
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_masq.c
@@ -0,0 +1,2545 @@
+/*
+ *
+ * Masquerading functionality
+ *
+ * Copyright (c) 1994 Pauline Middelink
+ *
+ * $Id: ip_masq.c,v 1.34.2.2 1999/08/07 10:56:28 davem Exp $
+ *
+ *
+ * See ip_fw.c for original log
+ *
+ * Fixes:
+ * Juan Jose Ciarlante : Modularized application masquerading (see ip_masq_app.c)
+ * Juan Jose Ciarlante : New struct ip_masq_seq that holds output/input delta seq.
+ * Juan Jose Ciarlante : Added hashed lookup by proto,maddr,mport and proto,saddr,sport
+ * Juan Jose Ciarlante : Fixed deadlock if free ports get exhausted
+ * Juan Jose Ciarlante : Added NO_ADDR status flag.
+ * Richard Lynch : Added IP Autoforward
+ * Nigel Metheringham : Added ICMP handling for demasquerade
+ * Nigel Metheringham : Checksum checking of masqueraded data
+ * Nigel Metheringham : Better handling of timeouts of TCP conns
+ *	Delian Delchev		:	Added support for ICMP requests and replies
+ * Nigel Metheringham : ICMP in ICMP handling, tidy ups, bug fixes, made ICMP optional
+ * Juan Jose Ciarlante : re-assign maddr if no packet received from outside
+ * Juan Jose Ciarlante : ported to 2.1 tree
+ * Juan Jose Ciarlante : reworked control connections
+ * Steven Clarke : Added Port Forwarding
+ * Juan Jose Ciarlante : Just ONE ip_masq_new (!)
+ * Juan Jose Ciarlante : IP masq modules support
+ * Juan Jose Ciarlante : don't go into search loop if mport specified
+ * Juan Jose Ciarlante : locking
+ * Steven Clarke : IP_MASQ_S_xx state design
+ * Juan Jose Ciarlante : IP_MASQ_S state implementation
+ * Juan Jose Ciarlante : xx_get() clears timer, _put() inserts it
+ * Juan Jose Ciarlante : create /proc/net/ip_masq/
+ * Juan Jose Ciarlante : reworked checksums (save payload csum if possible)
+ * Juan Jose Ciarlante : added missing ip_fw_masquerade checksum
+ * Juan Jose Ciarlante : csum savings
+ * Juan Jose Ciarlante : added user-space tunnel creation/del, etc
+ * Juan Jose Ciarlante : (last) moved to ip_masq_user runtime module
+ * Juan Jose Ciarlante : user timeout handling again
+ * Juan Jose Ciarlante : make new modules support optional
+ * Juan Jose Ciarlante : u-space context => locks reworked
+ * Juan Jose Ciarlante : fixed stupid SMP locking bug
+ * Juan Jose Ciarlante : fixed "tap"ing in demasq path by copy-on-w
+ * Juan Jose Ciarlante : make masq_proto_doff() robust against fake sized/corrupted packets
+ * Kai Bankett : do not toss other IP protos in proto_doff()
+ * Dan Kegel : pointed correct NAT behavior for UDP streams
+ * Julian Anastasov : use daddr and dport as hash keys
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/inet.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/checksum.h>
+#include <net/ip_masq.h>
+
+#ifdef CONFIG_IP_MASQUERADE_MOD
+#include <net/ip_masq_mod.h>
+#endif
+
+#include <linux/sysctl.h>
+#include <linux/ip_fw.h>
+#include <linux/ip_masq.h>
+
+int sysctl_ip_masq_debug = 0;
+
+/*
+ * Exported wrapper
+ */
+int ip_masq_get_debug_level(void)
+{
+ return sysctl_ip_masq_debug;
+}
+
+struct ip_masq_hook *ip_masq_user_hook = NULL;
+
+/*
+ * Timeout table[state]
+ */
+/* static int masq_timeout_table[IP_MASQ_S_LAST+1] = { */
+static struct ip_masq_timeout_table masq_timeout_table = {
+ ATOMIC_INIT(0), /* refcnt */
+ 0, /* scale */
+ {
+ 30*60*HZ, /* IP_MASQ_S_NONE, */
+ 15*60*HZ, /* IP_MASQ_S_ESTABLISHED, */
+ 2*60*HZ, /* IP_MASQ_S_SYN_SENT, */
+ 1*60*HZ, /* IP_MASQ_S_SYN_RECV, */
+ 2*60*HZ, /* IP_MASQ_S_FIN_WAIT, */
+ 2*60*HZ, /* IP_MASQ_S_TIME_WAIT, */
+ 10*HZ, /* IP_MASQ_S_CLOSE, */
+ 60*HZ, /* IP_MASQ_S_CLOSE_WAIT, */
+ 30*HZ, /* IP_MASQ_S_LAST_ACK, */
+ 2*60*HZ, /* IP_MASQ_S_LISTEN, */
+ 5*60*HZ, /* IP_MASQ_S_UDP, */
+ 1*60*HZ, /* IP_MASQ_S_ICMP, */
+ 2*HZ,/* IP_MASQ_S_LAST */
+ }, /* timeout */
+};
+
+#define MASQUERADE_EXPIRE_RETRY masq_timeout_table.timeout[IP_MASQ_S_TIME_WAIT]
+
+static const char * state_name_table[IP_MASQ_S_LAST+1] = {
+ "NONE", /* IP_MASQ_S_NONE, */
+ "ESTABLISHED", /* IP_MASQ_S_ESTABLISHED, */
+ "SYN_SENT", /* IP_MASQ_S_SYN_SENT, */
+ "SYN_RECV", /* IP_MASQ_S_SYN_RECV, */
+ "FIN_WAIT", /* IP_MASQ_S_FIN_WAIT, */
+ "TIME_WAIT", /* IP_MASQ_S_TIME_WAIT, */
+ "CLOSE", /* IP_MASQ_S_CLOSE, */
+ "CLOSE_WAIT", /* IP_MASQ_S_CLOSE_WAIT, */
+ "LAST_ACK", /* IP_MASQ_S_LAST_ACK, */
+ "LISTEN", /* IP_MASQ_S_LISTEN, */
+ "UDP", /* IP_MASQ_S_UDP, */
+ "ICMP", /* IP_MASQ_S_ICMP, */
+ "BUG!", /* IP_MASQ_S_LAST */
+};
+
+#define mNO IP_MASQ_S_NONE
+#define mES IP_MASQ_S_ESTABLISHED
+#define mSS IP_MASQ_S_SYN_SENT
+#define mSR IP_MASQ_S_SYN_RECV
+#define mFW IP_MASQ_S_FIN_WAIT
+#define mTW IP_MASQ_S_TIME_WAIT
+#define mCL IP_MASQ_S_CLOSE
+#define mCW IP_MASQ_S_CLOSE_WAIT
+#define mLA IP_MASQ_S_LAST_ACK
+#define mLI IP_MASQ_S_LISTEN
+
+struct masq_tcp_states_t {
+ int next_state[IP_MASQ_S_LAST]; /* should be _LAST_TCP */
+};
+
+const char * ip_masq_state_name(int state)
+{
+ if (state >= IP_MASQ_S_LAST)
+ return "ERR!";
+ return state_name_table[state];
+}
+
+struct masq_tcp_states_t masq_tcp_states [] = {
+/* INPUT */
+/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */
+/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR }},
+/*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI }},
+/*ack*/ {{mCL, mES, mSS, mSR, mFW, mTW, mCL, mCW, mCL, mLI }},
+/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI }},
+
+/* OUTPUT */
+/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */
+/*syn*/ {{mSS, mES, mSS, mES, mSS, mSS, mSS, mSS, mSS, mLI }},
+/*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI }},
+/*ack*/ {{mES, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mES }},
+/*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL }},
+};
+
+static __inline__ int masq_tcp_state_idx(struct tcphdr *th, int output)
+{
+ /*
+ * [0-3]: input states, [4-7]: output.
+ */
+ if (output)
+ output=4;
+
+ if (th->rst)
+ return output+3;
+ if (th->syn)
+ return output+0;
+ if (th->fin)
+ return output+1;
+ if (th->ack)
+ return output+2;
+ return -1;
+}
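+
+/*
+ * Illustrative example: an incoming RST maps to index 3 (input "rst"
+ * row), an outgoing SYN to index 4 (output "syn" row); a segment with
+ * none of the four flags set yields -1, which masq_tcp_state() below
+ * treats as a transition to CLOSE.
+ */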
+
+
+
+static int masq_set_state_timeout(struct ip_masq *ms, int state)
+{
+ struct ip_masq_timeout_table *mstim = ms->timeout_table;
+ int scale;
+
+ /*
+	 * Use the default timeout table if there is no specific one for this entry
+ */
+ if (!mstim)
+ mstim = &masq_timeout_table;
+
+ ms->timeout = mstim->timeout[ms->state=state];
+ scale = mstim->scale;
+
+ if (scale<0)
+ ms->timeout >>= -scale;
+ else if (scale > 0)
+ ms->timeout <<= scale;
+
+ return state;
+}
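+
+/*
+ * Illustrative example: with the default table, an entry entering
+ * IP_MASQ_S_ESTABLISHED gets a 15*60*HZ timeout; a scale of -1 would
+ * halve that to 7.5 minutes and a scale of 1 would double it to 30.
+ */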
+
+static int masq_tcp_state(struct ip_masq *ms, int output, struct tcphdr *th)
+{
+ int state_idx;
+ int new_state = IP_MASQ_S_CLOSE;
+
+ if ((state_idx = masq_tcp_state_idx(th, output)) < 0) {
+ IP_MASQ_DEBUG(1, "masq_state_idx(%d)=%d!!!\n",
+ output, state_idx);
+ goto tcp_state_out;
+ }
+
+ new_state = masq_tcp_states[state_idx].next_state[ms->state];
+
+tcp_state_out:
+ if (new_state!=ms->state)
+ IP_MASQ_DEBUG(1, "%s %s [%c%c%c%c] %08lX:%04X-%08lX:%04X state: %s->%s\n",
+ masq_proto_name(ms->protocol),
+ output? "output" : "input ",
+ th->syn? 'S' : '.',
+ th->fin? 'F' : '.',
+ th->ack? 'A' : '.',
+ th->rst? 'R' : '.',
+ ntohl(ms->saddr), ntohs(ms->sport),
+ ntohl(ms->daddr), ntohs(ms->dport),
+ ip_masq_state_name(ms->state),
+ ip_masq_state_name(new_state));
+ return masq_set_state_timeout(ms, new_state);
+}
+
+
+/*
+ * Handle state transitions
+ */
+static int masq_set_state(struct ip_masq *ms, int output, struct iphdr *iph, void *tp)
+{
+ switch (iph->protocol) {
+ case IPPROTO_ICMP:
+ return masq_set_state_timeout(ms, IP_MASQ_S_ICMP);
+ case IPPROTO_UDP:
+ return masq_set_state_timeout(ms, IP_MASQ_S_UDP);
+ case IPPROTO_TCP:
+ return masq_tcp_state(ms, output, tp);
+ }
+ return -1;
+}
+
+/*
+ * Set LISTEN timeout. (ip_masq_put will setup timer)
+ */
+int ip_masq_listen(struct ip_masq *ms)
+{
+ masq_set_state_timeout(ms, IP_MASQ_S_LISTEN);
+ return ms->timeout;
+}
+
+/*
+ * Dynamic address rewriting
+ */
+extern int sysctl_ip_dynaddr;
+
+/*
+ * Lookup lock
+ */
+rwlock_t __ip_masq_lock = RW_LOCK_UNLOCKED;
+
+/*
+ * Implement IP packet masquerading
+ */
+
+/*
+ *	Converts an ICMP reply type into the equivalent request type
+ */
+static __inline__ const __u8 icmp_type_request(__u8 type)
+{
+ switch (type)
+ {
+ case ICMP_ECHOREPLY: return ICMP_ECHO; break;
+ case ICMP_TIMESTAMPREPLY: return ICMP_TIMESTAMP; break;
+ case ICMP_INFO_REPLY: return ICMP_INFO_REQUEST; break;
+ case ICMP_ADDRESSREPLY: return ICMP_ADDRESS; break;
+ default: return (255); break;
+ }
+}
+
+/*
+ * Helper macros - attempt to make code clearer!
+ */
+
+/* ID used in ICMP lookups */
+#define icmp_id(icmph) ((icmph->un).echo.id)
+/* (port) hash value used in ICMP lookups for requests */
+#define icmp_hv_req(icmph) ((__u16)(icmph->code+(__u16)(icmph->type<<8)))
+/* (port) hash value used in ICMP lookups for replies */
+#define icmp_hv_rep(icmph) ((__u16)(icmph->code+(__u16)(icmp_type_request(icmph->type)<<8)))
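+
+/*
+ * Illustrative example: an echo request (type ICMP_ECHO == 8, code 0)
+ * hashes to icmp_hv_req == 0x0800; its reply (type ICMP_ECHOREPLY == 0)
+ * hashes to the same 0x0800 via icmp_hv_rep, since icmp_type_request()
+ * maps the reply type back to ICMP_ECHO. Request and reply therefore
+ * share one (port) hash key.
+ */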
+
+/*
+ * Last masq_port number in use.
+ *	Will cycle within the PORT_MASQ_BEGIN..PORT_MASQ_END boundaries.
+ */
+static __u16 masq_port = PORT_MASQ_BEGIN;
+static spinlock_t masq_port_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * free ports counters (UDP & TCP)
+ *
+ *	Their value is _less_ than or _equal_ to the actual number of free ports:
+ *	entries allocated with the same masq port but a different masq addr
+ *	(firewall iface address) are each accounted, although together they
+ *	actually eat no more than 1 port.
+ *
+ *	Greater values could lower the MASQ_EXPIRATION setting as a way to
+ *	manage the 'masq_entries resource'.
+ *
+ *	By default we will reuse masq.port iff the (output) connection
+ *	(5-tuple) is not duplicated.
+ * This may break midentd and others ...
+ */
+
+#ifdef CONFIG_IP_MASQ_NREUSE
+#define PORT_MASQ_MUL 1
+#else
+#define PORT_MASQ_MUL 10
+#endif
+
+/*
+ *	At the moment, hardcoded in sync with masq_proto_num
+ */
+atomic_t ip_masq_free_ports[3] = {
+ ATOMIC_INIT((PORT_MASQ_END-PORT_MASQ_BEGIN) * PORT_MASQ_MUL),/* UDP */
+ ATOMIC_INIT((PORT_MASQ_END-PORT_MASQ_BEGIN) * PORT_MASQ_MUL),/* TCP */
+ ATOMIC_INIT((PORT_MASQ_END-PORT_MASQ_BEGIN) * PORT_MASQ_MUL),/* ICMP */
+};
+
+/*
+ *	Counts entries that have been requested with a specific mport.
+ *	Used for incoming packets to "relax" the input rule (port in MASQ range).
+ */
+atomic_t mport_count = ATOMIC_INIT(0);
+
+EXPORT_SYMBOL(ip_masq_get_debug_level);
+EXPORT_SYMBOL(ip_masq_new);
+EXPORT_SYMBOL(ip_masq_listen);
+EXPORT_SYMBOL(ip_masq_free_ports);
+EXPORT_SYMBOL(ip_masq_out_get);
+EXPORT_SYMBOL(ip_masq_in_get);
+EXPORT_SYMBOL(ip_masq_put);
+EXPORT_SYMBOL(ip_masq_control_add);
+EXPORT_SYMBOL(ip_masq_control_del);
+EXPORT_SYMBOL(ip_masq_control_get);
+EXPORT_SYMBOL(ip_masq_user_hook);
+EXPORT_SYMBOL(ip_masq_state_name);
+EXPORT_SYMBOL(ip_masq_select_addr);
+EXPORT_SYMBOL(__ip_masq_lock);
+EXPORT_SYMBOL(ip_masq_m_table);
+EXPORT_SYMBOL(ip_masq_s_table);
+EXPORT_SYMBOL(ip_masq_d_table);
+
+/*
+ *	3 ip_masq doubly linked hash tables:
+ *	2 for input m{addr,port} and output s{addr,port} pkt lookups.
+ *	1 for extra modules support (daddr)
+ */
+
+#define IP_MASQ_NTABLES 3
+
+struct list_head ip_masq_m_table[IP_MASQ_TAB_SIZE];
+struct list_head ip_masq_s_table[IP_MASQ_TAB_SIZE];
+struct list_head ip_masq_d_table[IP_MASQ_TAB_SIZE];
+
+/*
+ * timeouts
+ */
+
+#if 000 /* FIXED timeout handling */
+static struct ip_fw_masq ip_masq_dummy = {
+ MASQUERADE_EXPIRE_TCP,
+ MASQUERADE_EXPIRE_TCP_FIN,
+ MASQUERADE_EXPIRE_UDP
+};
+
+EXPORT_SYMBOL(ip_masq_expire);
+struct ip_fw_masq *ip_masq_expire = &ip_masq_dummy;
+#endif
+
+/*
+ * These flags enable non-strict d{addr,port} checks
+ * Given that both (in/out) lookup tables are hashed
+ * by m{addr,port} and s{addr,port} this is quite easy
+ */
+
+#define MASQ_DADDR_PASS (IP_MASQ_F_NO_DADDR|IP_MASQ_F_DLOOSE)
+#define MASQ_DPORT_PASS (IP_MASQ_F_NO_DPORT|IP_MASQ_F_DLOOSE)
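+
+/*
+ * Illustrative example: a UDP tunnel created with IP_MASQ_F_DLOOSE set
+ * passes both MASQ_DADDR_PASS and MASQ_DPORT_PASS tests, so incoming
+ * pkts are matched on m{addr,port} alone and may come from any foreign
+ * d{addr,port} (the UDP NAT behavior pointed out by Dan Kegel).
+ */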
+
+/*
+ * By default enable dest loose semantics
+ */
+#define CONFIG_IP_MASQ_LOOSE_DEFAULT 1
+
+
+/*
+ *	Set masq expiration (deletion) and add the timer;
+ *	if timeout==0, cancel expiration.
+ *	Warning: it does not check/delete a previous timer!
+ */
+
+static void __ip_masq_set_expire(struct ip_masq *ms, unsigned long tout)
+{
+ if (tout) {
+ ms->timer.expires = jiffies+tout;
+ add_timer(&ms->timer);
+ } else {
+ del_timer(&ms->timer);
+ }
+}
+
+
+/*
+ * Returns hash value
+ */
+
+static __inline__ unsigned
+ip_masq_hash_key(unsigned proto, __u32 addr, __u16 port)
+{
+ return (proto^ntohl(addr)^ntohs(port)) & (IP_MASQ_TAB_SIZE-1);
+}
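+
+/*
+ * Illustrative example (assuming IP_MASQ_TAB_SIZE == 256): for TCP
+ * (proto 6), addr 192.168.1.1 and a port whose host-order value is
+ * 0x1234, the key is (6 ^ 0xC0A80101 ^ 0x1234) & 255 == 0x33, bucket 51.
+ */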
+
+/*
+ * Hashes ip_masq by its proto,addrs,ports.
+ *	Should be called with locked tables.
+ *	Returns bool success.
+ */
+
+static int ip_masq_hash(struct ip_masq *ms)
+{
+ unsigned hash;
+
+ if (ms->flags & IP_MASQ_F_HASHED) {
+ IP_MASQ_ERR( "ip_masq_hash(): request for already hashed, called from %p\n",
+ __builtin_return_address(0));
+ return 0;
+ }
+ atomic_add(IP_MASQ_NTABLES, &ms->refcnt);
+
+ if ((ms->flags & (MASQ_DADDR_PASS | MASQ_DPORT_PASS |
+ IP_MASQ_F_SIMPLE_HASH)) == 0)
+ /*
+ * Hash by proto,m{addr,port},d{addr,port}
+ */
+ hash = ip_masq_hash_key(ms->protocol,
+ ms->maddr^ms->daddr, ms->mport^ms->dport);
+ else
+ /*
+ * Hash by proto,m{addr,port}
+ */
+ hash = ip_masq_hash_key(ms->protocol, ms->maddr, ms->mport);
+
+ list_add(&ms->m_list, &ip_masq_m_table[hash]);
+
+ if ((ms->flags & (MASQ_DADDR_PASS | MASQ_DPORT_PASS |
+ IP_MASQ_F_NO_SADDR | IP_MASQ_F_NO_SPORT |
+ IP_MASQ_F_SIMPLE_HASH)) == 0)
+ /*
+ * Hash by proto,s{addr,port},d{addr,port}
+ */
+ hash = ip_masq_hash_key(ms->protocol,
+ ms->saddr^ms->daddr, ms->sport^ms->dport);
+ else
+ /*
+ * Hash by proto,s{addr,port}
+ */
+ hash = ip_masq_hash_key(ms->protocol, ms->saddr, ms->sport);
+
+ list_add(&ms->s_list, &ip_masq_s_table[hash]);
+
+ /*
+ * Hash by proto,d{addr,port}
+ */
+ hash = ip_masq_hash_key(ms->protocol, ms->daddr, ms->dport);
+ list_add(&ms->d_list, &ip_masq_d_table[hash]);
+
+
+ ms->flags |= IP_MASQ_F_HASHED;
+ return 1;
+}
+
+/*
+ *	UNhashes ip_masq from ip_masq_[msd]_tables.
+ *	Should be called with locked tables.
+ *	Returns bool success.
+ */
+
+static int ip_masq_unhash(struct ip_masq *ms)
+{
+ if (!(ms->flags & IP_MASQ_F_HASHED)) {
+ IP_MASQ_ERR( "ip_masq_unhash(): request for unhash flagged, called from %p\n",
+ __builtin_return_address(0));
+ return 0;
+ }
+ list_del(&ms->m_list);
+ list_del(&ms->s_list);
+ list_del(&ms->d_list);
+
+ atomic_sub(IP_MASQ_NTABLES, &ms->refcnt);
+
+ ms->flags &= ~IP_MASQ_F_HASHED;
+ return 1;
+}
+
+/*
+ * Returns ip_masq associated with supplied parameters, either
+ * broken out of the ip/tcp headers or directly supplied for those
+ * pathological protocols with address/port in the data stream
+ *	(ftp, irc). Addresses and ports are in network order.
+ *	Called for pkts coming from OUTside-to-INside the firewall.
+ *
+ * s_addr, s_port: pkt source address (foreign host)
+ * d_addr, d_port: pkt dest address (firewall)
+ *
+ * NB. Cannot check destination address, just for the incoming port.
+ * reason: archie.doc.ac.uk has 6 interfaces, you send to
+ * phoenix and get a reply from any other interface(==dst)!
+ *
+ * [Only for UDP] - AC
+ *
+ * Caller must lock tables
+ */
+
+static struct ip_masq * __ip_masq_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+ unsigned hash;
+ struct ip_masq *ms = NULL;
+ struct list_head *l,*e;
+
+ hash = ip_masq_hash_key(protocol, d_addr^s_addr, d_port^s_port);
+
+ l = &ip_masq_m_table[hash];
+ for (e=l->next; e!=l; e=e->next) {
+ ms = list_entry(e, struct ip_masq, m_list);
+ if (s_port==ms->dport && s_addr==ms->daddr &&
+ d_port==ms->mport && protocol==ms->protocol &&
+ d_addr==ms->maddr &&
+ ((ms->flags & (MASQ_DADDR_PASS | MASQ_DPORT_PASS)) == 0)
+ ) {
+ IP_MASQ_DEBUG(2, "look/in %d %08X:%04hX->%08X:%04hX OK\n",
+ protocol,
+ s_addr,
+ s_port,
+ d_addr,
+ d_port);
+ atomic_inc(&ms->refcnt);
+ goto out;
+ }
+ }
+
+ hash = ip_masq_hash_key(protocol, d_addr, d_port);
+
+ l = &ip_masq_m_table[hash];
+ for (e=l->next; e!=l; e=e->next) {
+ ms = list_entry(e, struct ip_masq, m_list);
+ if (protocol==ms->protocol &&
+ (d_addr==ms->maddr && d_port==ms->mport) &&
+ (s_addr==ms->daddr || ms->flags & MASQ_DADDR_PASS) &&
+ (s_port==ms->dport || ms->flags & MASQ_DPORT_PASS)
+ ) {
+ IP_MASQ_DEBUG(2, "look/in %d %08X:%04hX->%08X:%04hX OK\n",
+ protocol,
+ s_addr,
+ s_port,
+ d_addr,
+ d_port);
+ atomic_inc(&ms->refcnt);
+ goto out;
+ }
+ }
+ IP_MASQ_DEBUG(2, "look/in %d %08X:%04hX->%08X:%04hX fail\n",
+ protocol,
+ s_addr,
+ s_port,
+ d_addr,
+ d_port);
+
+ ms = NULL;
+out:
+ return ms;
+}
+
+/*
+ * Returns ip_masq associated with supplied parameters, either
+ * broken out of the ip/tcp headers or directly supplied for those
+ * pathological protocols with address/port in the data stream
+ *	(ftp, irc). Addresses and ports are in network order.
+ *	Called for pkts coming from inside-to-OUTside the firewall.
+ *
+ * Normally we know the source address and port but for some protocols
+ * (e.g. ftp PASV) we do not know the source port initially. Alas the
+ * hash is keyed on source port so if the first lookup fails then try again
+ * with a zero port, this time only looking at entries marked "no source
+ * port".
+ *
+ * Caller must lock tables
+ */
+
+static struct ip_masq * __ip_masq_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+ unsigned hash;
+ struct ip_masq *ms = NULL;
+ struct list_head *l,*e;
+
+ /*
+ * Check for "full" addressed entries
+ */
+ hash = ip_masq_hash_key(protocol, s_addr^d_addr, s_port^d_port);
+
+ l = &ip_masq_s_table[hash];
+ for (e=l->next; e!=l; e=e->next) {
+ ms = list_entry(e, struct ip_masq, s_list);
+ if (d_addr==ms->daddr && d_port==ms->dport &&
+ s_addr==ms->saddr && s_port==ms->sport &&
+ protocol==ms->protocol &&
+ ((ms->flags & (MASQ_DADDR_PASS | MASQ_DPORT_PASS |
+ IP_MASQ_F_NO_SADDR | IP_MASQ_F_NO_SPORT)) == 0)
+ ) {
+ IP_MASQ_DEBUG(2, "lk/out0 %d %08X:%04hX->%08X:%04hX OK\n",
+ protocol,
+ s_addr,
+ s_port,
+ d_addr,
+ d_port);
+
+ atomic_inc(&ms->refcnt);
+ goto out;
+ }
+
+ }
+
+ hash = ip_masq_hash_key(protocol, s_addr, s_port);
+
+ l = &ip_masq_s_table[hash];
+ for (e=l->next; e!=l; e=e->next) {
+ ms = list_entry(e, struct ip_masq, s_list);
+ if (protocol == ms->protocol &&
+ s_addr == ms->saddr && s_port == ms->sport &&
+ (d_addr==ms->daddr || ms->flags & MASQ_DADDR_PASS) &&
+ (d_port==ms->dport || ms->flags & MASQ_DPORT_PASS)
+ ) {
+ IP_MASQ_DEBUG(2, "lk/out1 %d %08X:%04hX->%08X:%04hX OK\n",
+ protocol,
+ s_addr,
+ s_port,
+ d_addr,
+ d_port);
+
+ atomic_inc(&ms->refcnt);
+ goto out;
+ }
+
+ }
+
+ /*
+ * Check for NO_SPORT entries
+ */
+ hash = ip_masq_hash_key(protocol, s_addr, 0);
+ l = &ip_masq_s_table[hash];
+ for (e=l->next; e!=l; e=e->next) {
+ ms = list_entry(e, struct ip_masq, s_list);
+ if (ms->flags & IP_MASQ_F_NO_SPORT &&
+ protocol == ms->protocol &&
+ s_addr == ms->saddr &&
+ (d_addr==ms->daddr || ms->flags & MASQ_DADDR_PASS) &&
+ (d_port==ms->dport || ms->flags & MASQ_DPORT_PASS)
+ ) {
+ IP_MASQ_DEBUG(2, "lk/out2 %d %08X:%04hX->%08X:%04hX OK\n",
+ protocol,
+ s_addr,
+ s_port,
+ d_addr,
+ d_port);
+
+ atomic_inc(&ms->refcnt);
+ goto out;
+ }
+ }
+ IP_MASQ_DEBUG(2, "lk/out1 %d %08X:%04hX->%08X:%04hX fail\n",
+ protocol,
+ s_addr,
+ s_port,
+ d_addr,
+ d_port);
+
+ ms = NULL;
+out:
+ return ms;
+}
+
+#ifdef CONFIG_IP_MASQ_NREUSE
+/*
+ * Returns ip_masq for given proto,m_addr,m_port.
+ * called by allocation routine to find an unused m_port.
+ *
+ * Caller must lock tables
+ */
+
+static struct ip_masq * __ip_masq_getbym(int protocol, __u32 m_addr, __u16 m_port)
+{
+ unsigned hash;
+ struct ip_masq *ms = NULL;
+
+ hash = ip_masq_hash_key(protocol, m_addr, m_port);
+
+ for(ms = ip_masq_m_tab[hash]; ms ; ms = ms->m_link) {
+ if ( protocol==ms->protocol &&
+ (m_addr==ms->maddr && m_port==ms->mport)) {
+ atomic_inc(&ms->refcnt);
+ goto out;
+ }
+ }
+
+out:
+ return ms;
+}
+#endif
+
+struct ip_masq * ip_masq_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+ struct ip_masq *ms;
+
+ read_lock(&__ip_masq_lock);
+ ms = __ip_masq_out_get(protocol, s_addr, s_port, d_addr, d_port);
+ read_unlock(&__ip_masq_lock);
+
+ if (ms)
+ __ip_masq_set_expire(ms, 0);
+ return ms;
+}
+
+struct ip_masq * ip_masq_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+ struct ip_masq *ms;
+
+ read_lock(&__ip_masq_lock);
+ ms = __ip_masq_in_get(protocol, s_addr, s_port, d_addr, d_port);
+ read_unlock(&__ip_masq_lock);
+
+ if (ms)
+ __ip_masq_set_expire(ms, 0);
+ return ms;
+}
+
+static __inline__ void __ip_masq_put(struct ip_masq *ms)
+{
+ atomic_dec(&ms->refcnt);
+}
+
+void ip_masq_put(struct ip_masq *ms)
+{
+ /*
+ * Decrement refcnt
+ */
+ __ip_masq_put(ms);
+
+ /*
+	 * If refcnt==IP_MASQ_NTABLES, only the hash tables hold references: arm the expire timer.
+ */
+ if (atomic_read(&ms->refcnt)==IP_MASQ_NTABLES) {
+ __ip_masq_set_expire(ms, ms->timeout);
+ } else {
+ IP_MASQ_DEBUG(0, "did not set timer with refcnt=%d, called from %p\n",
+ atomic_read(&ms->refcnt),
+ __builtin_return_address(0));
+ }
+}
+
+static void masq_expire(unsigned long data)
+{
+ struct ip_masq *ms = (struct ip_masq *)data;
+ ms->timeout = MASQUERADE_EXPIRE_RETRY;
+
+ /*
+ * hey, I'm using it
+ */
+ atomic_inc(&ms->refcnt);
+
+ IP_MASQ_DEBUG(1, "Masqueraded %s %08lX:%04X expired\n",
+ masq_proto_name(ms->protocol),
+ ntohl(ms->saddr),ntohs(ms->sport));
+
+ write_lock(&__ip_masq_lock);
+
+#if 0000
+ /*
+ * Already locked, do bounce ...
+ */
+ if (ip_masq_nlocks(&__ip_masq_lock) != 1) {
+ goto masq_expire_later;
+ }
+
+#endif
+ /*
+ * do I control anybody?
+ */
+ if (atomic_read(&ms->n_control))
+ goto masq_expire_later;
+
+ /*
+	 * does anybody control me?
+ */
+
+ if (ms->control)
+ ip_masq_control_del(ms);
+
+ if (ip_masq_unhash(ms)) {
+ if (ms->flags&IP_MASQ_F_MPORT) {
+ atomic_dec(&mport_count);
+ } else {
+ atomic_inc(ip_masq_free_ports + masq_proto_num(ms->protocol));
+ }
+ ip_masq_unbind_app(ms);
+ }
+
+ /*
+	 * refcnt==1 implies I'm the only referrer
+ */
+ if (atomic_read(&ms->refcnt) == 1) {
+ kfree_s(ms,sizeof(*ms));
+ MOD_DEC_USE_COUNT;
+ goto masq_expire_out;
+ }
+
+masq_expire_later:
+ IP_MASQ_DEBUG(0, "masq_expire delayed: %s %08lX:%04X->%08lX:%04X masq.refcnt-1=%d masq.n_control=%d\n",
+ masq_proto_name(ms->protocol),
+ ntohl(ms->saddr), ntohs(ms->sport),
+ ntohl(ms->daddr), ntohs(ms->dport),
+ atomic_read(&ms->refcnt)-1,
+ atomic_read(&ms->n_control));
+
+ ip_masq_put(ms);
+
+masq_expire_out:
+ write_unlock(&__ip_masq_lock);
+}
+
+static __u16 get_next_mport(void)
+{
+ __u16 mport;
+
+ spin_lock_irq(&masq_port_lock);
+ /*
+ * Try the next available port number
+ */
+ mport = htons(masq_port++);
+ if (masq_port==PORT_MASQ_END) masq_port = PORT_MASQ_BEGIN;
+
+ spin_unlock_irq(&masq_port_lock);
+ return mport;
+}
+
+/*
+ *	Create a new masquerade list entry; also allocate an
+ *	unused mport, keeping the port number between the
+ *	given boundaries PORT_MASQ_BEGIN and PORT_MASQ_END.
+ *
+ * Be careful, it can be called from u-space
+ */
+
+struct ip_masq * ip_masq_new(int proto, __u32 maddr, __u16 mport, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned mflags)
+{
+ struct ip_masq *ms, *mst;
+ int ports_tried;
+ atomic_t *free_ports_p = NULL;
+ static int n_fails = 0;
+ int prio;
+
+
+ if (masq_proto_num(proto)!=-1 && mport == 0) {
+ free_ports_p = ip_masq_free_ports + masq_proto_num(proto);
+
+ if (atomic_read(free_ports_p) == 0) {
+ if (++n_fails < 5)
+ IP_MASQ_ERR( "ip_masq_new(proto=%s): no free ports.\n",
+ masq_proto_name(proto));
+ return NULL;
+ }
+ }
+
+ prio = (mflags&IP_MASQ_F_USER) ? GFP_KERNEL : GFP_ATOMIC;
+
+ ms = (struct ip_masq *) kmalloc(sizeof(struct ip_masq), prio);
+ if (ms == NULL) {
+ if (++n_fails < 5)
+ IP_MASQ_ERR("ip_masq_new(proto=%s): no memory available.\n",
+ masq_proto_name(proto));
+ return NULL;
+ }
+ MOD_INC_USE_COUNT;
+ memset(ms, 0, sizeof(*ms));
+ INIT_LIST_HEAD(&ms->s_list);
+ INIT_LIST_HEAD(&ms->m_list);
+ INIT_LIST_HEAD(&ms->d_list);
+ init_timer(&ms->timer);
+ ms->timer.data = (unsigned long)ms;
+ ms->timer.function = masq_expire;
+ ms->protocol = proto;
+ ms->saddr = saddr;
+ ms->sport = sport;
+ ms->daddr = daddr;
+ ms->dport = dport;
+ ms->flags = mflags;
+ ms->app_data = NULL;
+ ms->control = NULL;
+
+ atomic_set(&ms->n_control,0);
+ atomic_set(&ms->refcnt,0);
+
+ if (proto == IPPROTO_UDP && !mport)
+#ifdef CONFIG_IP_MASQ_LOOSE_DEFAULT
+ /*
+ * Flag this tunnel as "dest loose"
+ *
+ */
+ ms->flags |= IP_MASQ_F_DLOOSE;
+#else
+ ms->flags |= IP_MASQ_F_NO_DADDR;
+#endif
+
+
+ /* get masq address from rif */
+ ms->maddr = maddr;
+
+ /*
+ * This flag will allow masq. addr (ms->maddr)
+ * to follow forwarding interface address.
+ */
+ ms->flags |= IP_MASQ_F_NO_REPLY;
+
+ /*
+ * We want a specific mport. Be careful.
+ */
+ if (masq_proto_num(proto) == -1 || mport) {
+ ms->mport = mport;
+
+ /*
+		 * Check 5-tuple uniqueness
+ */
+ if (mflags & IP_MASQ_F_USER)
+ write_lock_bh(&__ip_masq_lock);
+ else
+ write_lock(&__ip_masq_lock);
+
+ mst = __ip_masq_in_get(proto, daddr, dport, maddr, mport);
+ if (mst==NULL) {
+ ms->flags |= IP_MASQ_F_MPORT;
+
+ atomic_inc(&mport_count);
+ ip_masq_hash(ms);
+
+ if (mflags & IP_MASQ_F_USER)
+ write_unlock_bh(&__ip_masq_lock);
+ else
+ write_unlock(&__ip_masq_lock);
+
+ ip_masq_bind_app(ms);
+ atomic_inc(&ms->refcnt);
+ masq_set_state_timeout(ms, IP_MASQ_S_NONE);
+ return ms;
+ }
+ if (mflags & IP_MASQ_F_USER)
+ write_unlock_bh(&__ip_masq_lock);
+ else
+ write_unlock(&__ip_masq_lock);
+
+ __ip_masq_put(mst);
+
+ IP_MASQ_ERR( "Already used connection: %s, %d.%d.%d.%d:%d => %d.%d.%d.%d:%d, called from %p\n",
+ masq_proto_name(proto),
+ NIPQUAD(maddr), ntohs(mport),
+ NIPQUAD(daddr), ntohs(dport),
+ __builtin_return_address(0));
+
+
+ goto mport_nono;
+ }
+
+
+ for (ports_tried = 0;
+ (atomic_read(free_ports_p) && (ports_tried <= (PORT_MASQ_END - PORT_MASQ_BEGIN)));
+ ports_tried++){
+
+ mport = ms->mport = get_next_mport();
+ /*
+		 * Look up to find out if this connection is already in use.
+ */
+
+ if (mflags & IP_MASQ_F_USER)
+ write_lock_bh(&__ip_masq_lock);
+ else
+ write_lock(&__ip_masq_lock);
+
+#ifdef CONFIG_IP_MASQ_NREUSE
+ mst = __ip_masq_getbym(proto, maddr, mport);
+#else
+ mst = __ip_masq_in_get(proto, daddr, dport, maddr, mport);
+#endif
+ if (mst == NULL) {
+
+ if (atomic_read(free_ports_p) == 0) {
+ if (mflags & IP_MASQ_F_USER)
+ write_unlock_bh(&__ip_masq_lock);
+ else
+ write_unlock(&__ip_masq_lock);
+
+ break;
+ }
+ atomic_dec(free_ports_p);
+ ip_masq_hash(ms);
+
+ if (mflags & IP_MASQ_F_USER)
+ write_unlock_bh(&__ip_masq_lock);
+ else
+ write_unlock(&__ip_masq_lock);
+
+ ip_masq_bind_app(ms);
+ n_fails = 0;
+ atomic_inc(&ms->refcnt);
+ masq_set_state_timeout(ms, IP_MASQ_S_NONE);
+ return ms;
+ }
+ if (mflags & IP_MASQ_F_USER)
+ write_unlock_bh(&__ip_masq_lock);
+ else
+ write_unlock(&__ip_masq_lock);
+
+ __ip_masq_put(mst);
+ }
+
+ if (++n_fails < 5)
+ IP_MASQ_ERR( "ip_masq_new(proto=%s): could not get free masq entry (free=%d).\n",
+ masq_proto_name(ms->protocol),
+ atomic_read(free_ports_p));
+mport_nono:
+ kfree_s(ms, sizeof(*ms));
+
+ MOD_DEC_USE_COUNT;
+ return NULL;
+}
+
+/*
+ * Get transport protocol data offset, check against size
+ *	return:
+ *		the transport header length for ICMP/UDP/TCP
+ *		0  if other IP proto
+ *		-1 if error
+ */
+static __inline__ int proto_doff(unsigned proto, char *th, unsigned size)
+{
+ int ret = -1;
+ switch (proto) {
+ case IPPROTO_ICMP:
+ if (size >= sizeof(struct icmphdr))
+ ret = sizeof(struct icmphdr);
+ break;
+ case IPPROTO_UDP:
+ if (size >= sizeof(struct udphdr))
+ ret = sizeof(struct udphdr);
+ break;
+ case IPPROTO_TCP:
+ /*
+			 * In this case, this check _also_ avoids
+			 * touching an invalid pointer if
+			 * size is invalid.
+ */
+ if (size >= sizeof(struct tcphdr)) {
+ ret = ((struct tcphdr*)th)->doff << 2;
+ if (ret > size) {
+ ret = -1 ;
+ }
+ }
+
+ break;
+ default:
+			/* Other proto: nothing to say, for now :) */
+ ret = 0;
+ }
+ if (ret < 0)
+ IP_MASQ_DEBUG(0, "mess proto_doff for proto=%d, size =%d\n",
+ proto, size);
+ return ret;
+}
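+
+/*
+ * Illustrative example: a TCP header with doff == 5 (no options) makes
+ * proto_doff() return 20; a corrupted doff pointing past the available
+ * size returns -1, so no field beyond the buffer is ever dereferenced.
+ */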
+
+int ip_fw_masquerade(struct sk_buff **skb_p, __u32 maddr)
+{
+ struct sk_buff *skb = *skb_p;
+ struct iphdr *iph = skb->nh.iph;
+ union ip_masq_tphdr h;
+ struct ip_masq *ms;
+ int size;
+
+ /*
+ * doff holds transport protocol data offset
+ * csum holds its checksum
+ * csum_ok says if csum is valid
+ */
+ int doff = 0;
+ int csum = 0;
+ int csum_ok = 0;
+
+ /*
+ * We can only masquerade protocols with ports... and hack some ICMPs
+ */
+
+ h.raw = (char*) iph + iph->ihl * 4;
+ size = ntohs(iph->tot_len) - (iph->ihl * 4);
+
+
+ doff = proto_doff(iph->protocol, h.raw, size);
+ if (doff <= 0) {
+ /*
+ * Output path: do not pass other IP protos nor
+ * invalid packets.
+ */
+ return -1;
+ }
+
+ switch (iph->protocol) {
+ case IPPROTO_ICMP:
+ return(ip_fw_masq_icmp(skb_p, maddr));
+ case IPPROTO_UDP:
+ if (h.uh->check == 0)
+ /* No UDP checksum */
+ break;
+ case IPPROTO_TCP:
+ /* Make sure packet is in the masq range */
+ IP_MASQ_DEBUG(3, "O-pkt: %s size=%d\n",
+ masq_proto_name(iph->protocol),
+ size);
+
+#ifdef CONFIG_IP_MASQ_DEBUG
+ if (ip_masq_get_debug_level() > 3) {
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+#endif
+ /* Check that the checksum is OK */
+ switch (skb->ip_summed)
+ {
+ case CHECKSUM_NONE:
+ {
+ csum = csum_partial(h.raw + doff, size - doff, 0);
+ IP_MASQ_DEBUG(3, "O-pkt: %s I-datacsum=%d\n",
+ masq_proto_name(iph->protocol),
+ csum);
+
+ skb->csum = csum_partial(h.raw , doff, csum);
+ }
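+			/* fall through: verify the csum just computed, as with CHECKSUM_HW */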
+ case CHECKSUM_HW:
+ if (csum_tcpudp_magic(iph->saddr, iph->daddr,
+ size, iph->protocol, skb->csum))
+ {
+ IP_MASQ_DEBUG(0, "Outgoing failed %s checksum from %d.%d.%d.%d (size=%d)!\n",
+ masq_proto_name(iph->protocol),
+ NIPQUAD(iph->saddr),
+ size);
+ return -1;
+ }
+ default:
+ /* CHECKSUM_UNNECESSARY */
+ }
+ break;
+ default:
+ return -1;
+ }
+ /*
+ * Now hunt the list to see if we have an old entry
+ */
+
+ /* h.raw = (char*) iph + iph->ihl * 4; */
+
+ IP_MASQ_DEBUG(2, "Outgoing %s %08lX:%04X -> %08lX:%04X\n",
+ masq_proto_name(iph->protocol),
+ ntohl(iph->saddr), ntohs(h.portp[0]),
+ ntohl(iph->daddr), ntohs(h.portp[1]));
+
+ ms = ip_masq_out_get_iph(iph);
+ if (ms!=NULL) {
+
+ /*
+ * If sysctl !=0 and no pkt has been received yet
+ * in this tunnel and routing iface address has changed...
+ * "You are welcome, diald".
+ */
+ if ( sysctl_ip_dynaddr && ms->flags & IP_MASQ_F_NO_REPLY && maddr != ms->maddr) {
+
+ if (sysctl_ip_dynaddr > 1) {
+ IP_MASQ_INFO( "ip_fw_masquerade(): change masq.addr from %d.%d.%d.%d to %d.%d.%d.%d\n",
+ NIPQUAD(ms->maddr),NIPQUAD(maddr));
+ }
+
+ write_lock(&__ip_masq_lock);
+
+ ip_masq_unhash(ms);
+ ms->maddr = maddr;
+ ip_masq_hash(ms);
+
+ write_unlock(&__ip_masq_lock);
+ }
+
+ /*
+ * Set sport if not defined yet (e.g. ftp PASV). Because
+ * masq entries are hashed on sport, unhash with old value
+ * and hash with new.
+ */
+
+ if ( ms->flags & IP_MASQ_F_NO_SPORT && ms->protocol == IPPROTO_TCP ) {
+
+ write_lock(&__ip_masq_lock);
+
+ ip_masq_unhash(ms);
+ ms->flags &= ~IP_MASQ_F_NO_SPORT;
+ ms->sport = h.portp[0];
+ ip_masq_hash(ms); /* hash on new sport */
+
+ write_unlock(&__ip_masq_lock);
+
+ IP_MASQ_DEBUG(1, "ip_fw_masquerade(): filled sport=%d\n",
+ ntohs(ms->sport));
+ }
+ if (ms->flags & IP_MASQ_F_DLOOSE) {
+ /*
+ * update dest loose values
+ */
+ ms->dport = h.portp[1];
+ ms->daddr = iph->daddr;
+ }
+ } else {
+ /*
+ * Nope, not found, create a new entry for it
+ */
+
+#ifdef CONFIG_IP_MASQUERADE_MOD
+ if (!(ms = ip_masq_mod_out_create(skb, iph, maddr)))
+#endif
+ ms = ip_masq_new(iph->protocol,
+ maddr, 0,
+ iph->saddr, h.portp[0],
+ iph->daddr, h.portp[1],
+ 0);
+ if (ms == NULL)
+ return -1;
+ }
+
+ /*
+ * Call module's output update hook
+ */
+
+#ifdef CONFIG_IP_MASQUERADE_MOD
+ ip_masq_mod_out_update(skb, iph, ms);
+#endif
+
+ /*
+ * Change the fragments origin
+ */
+
+ size = skb->len - (h.raw - skb->nh.raw);
+
+ /*
+ * Set iph addr and port from ip_masq obj.
+ */
+ iph->saddr = ms->maddr;
+ h.portp[0] = ms->mport;
+
+ /*
+ * Invalidate csum saving if tunnel has masq helper
+ */
+
+ if (ms->app)
+ csum_ok = 0;
+
+ /*
+ * Attempt ip_masq_app call.
+ * will fix ip_masq and iph seq stuff
+ */
+ if (ip_masq_app_pkt_out(ms, skb_p, maddr) != 0)
+ {
+ /*
+ * skb has possibly changed, update pointers.
+ */
+ skb = *skb_p;
+ iph = skb->nh.iph;
+ h.raw = (char*) iph + iph->ihl *4;
+ size = skb->len - (h.raw - skb->nh.raw);
+ /* doff should have not changed */
+ }
+
+ /*
+ * Adjust packet accordingly to protocol
+ */
+
+ /*
+ * Transport's payload partial csum
+ */
+
+ if (!csum_ok) {
+ csum = csum_partial(h.raw + doff, size - doff, 0);
+ }
+ skb->csum = csum;
+
+ IP_MASQ_DEBUG(3, "O-pkt: %s size=%d O-datacsum=%d\n",
+ masq_proto_name(iph->protocol),
+ size,
+ csum);
+
+ /*
+ * Protocol csum
+ */
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ h.th->check = 0;
+ h.th->check=csum_tcpudp_magic(iph->saddr, iph->daddr,
+ size, iph->protocol,
+ csum_partial(h.raw , doff, csum));
+ IP_MASQ_DEBUG(3, "O-pkt: %s O-csum=%d (+%d)\n",
+ masq_proto_name(iph->protocol),
+ h.th->check,
+ (char*) & (h.th->check) - (char*) h.raw);
+
+ break;
+ case IPPROTO_UDP:
+ h.uh->check = 0;
+ h.uh->check=csum_tcpudp_magic(iph->saddr, iph->daddr,
+ size, iph->protocol,
+ csum_partial(h.raw , doff, csum));
+ if (h.uh->check == 0)
+ h.uh->check = 0xFFFF;
+ IP_MASQ_DEBUG(3, "O-pkt: %s O-csum=%d (+%d)\n",
+ masq_proto_name(iph->protocol),
+ h.uh->check,
+ (char*) &(h.uh->check)- (char*) h.raw);
+ break;
+ }
+ ip_send_check(iph);
+
+ IP_MASQ_DEBUG(2, "O-routed from %08lX:%04X with masq.addr %08lX\n",
+ ntohl(ms->maddr),ntohs(ms->mport),ntohl(maddr));
+
+ masq_set_state(ms, 1, iph, h.portp);
+ ip_masq_put(ms);
+
+ return 0;
+ }
+
+/*
+ * Restore original addresses and ports in the original IP
+ * datagram if the failing packet has been [de]masqueraded.
+ * This is ugly in the extreme. We no longer have the original
+ * packet so we have to reconstruct it from the failing packet
+ * plus data in the masq tables. The resulting "original data"
+ * should be good enough to tell the sender which session to
+ * throttle. Relies on far too much knowledge of masq internals,
+ * there ought to be a better way - KAO 990303.
+ *
+ * Moved here from icmp.c - JJC.
+ * Already known: type == ICMP_DEST_UNREACH, IPSKB_MASQUERADED
+ * skb->nh.iph points to original header.
+ *
+ * Must try both OUT and IN tables; we could add a flag
+ *	ala IPSKB_MASQUERADED to avoid a 2nd tables lookup, but this is VERY
+ *	unlikely because routing makes the mtu decision before reaching
+ * ip_fw_masquerade().
+ *
+ */
+int ip_fw_unmasq_icmp(struct sk_buff *skb) {
+ struct ip_masq *ms;
+ struct iphdr *iph = skb->nh.iph;
+ __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
+
+ /*
+ * Always called from _bh context: use read_[un]lock()
+ */
+
+ /*
+ * Peek "out" table, this packet has bounced:
+ * out->in(frag_needed!)->OUT[icmp]
+ *
+ * iph->daddr is IN host
+ * iph->saddr is OUT host
+ */
+ read_lock(&__ip_masq_lock);
+ ms = __ip_masq_out_get(iph->protocol,
+ iph->daddr, portp[1],
+ iph->saddr, portp[0]);
+ read_unlock(&__ip_masq_lock);
+ if (ms) {
+ IP_MASQ_DEBUG(1, "Incoming frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n",
+ NIPQUAD(iph->daddr), NIPQUAD(ms->maddr));
+ iph->daddr = ms->maddr;
+ portp[1] = ms->mport;
+ __ip_masq_put(ms);
+ return 1;
+ }
+ /*
+ * Peek "in" table
+ * in->out(frag_needed!)->IN[icmp]
+ *
+ * iph->daddr is OUT host
+ * iph->saddr is MASQ host
+ *
+ */
+ read_lock(&__ip_masq_lock);
+ ms = __ip_masq_in_get(iph->protocol,
+ iph->daddr, portp[1],
+ iph->saddr, portp[0]);
+ read_unlock(&__ip_masq_lock);
+ if (ms) {
+ IP_MASQ_DEBUG(1, "Outgoing frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n",
+ NIPQUAD(iph->saddr), NIPQUAD(ms->saddr));
+ iph->saddr = ms->saddr;
+ portp[0] = ms->sport;
+ __ip_masq_put(ms);
+ return 1;
+ }
+ return 0;
+
+}
+/*
+ * Handle ICMP messages in forward direction.
+ * Find any that might be relevant, check against existing connections,
+ * forward to masqueraded host if relevant.
+ * Currently handles error types - unreachable, quench, ttl exceeded
+ */
+
+int ip_fw_masq_icmp(struct sk_buff **skb_p, __u32 maddr)
+{
+ struct sk_buff *skb = *skb_p;
+ struct iphdr *iph = skb->nh.iph;
+ struct icmphdr *icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2));
+ struct iphdr *ciph; /* The ip header contained within the ICMP */
+ __u16 *pptr; /* port numbers from TCP/UDP contained header */
+ struct ip_masq *ms;
+ unsigned short len = ntohs(iph->tot_len) - (iph->ihl * 4);
+
+ IP_MASQ_DEBUG(2, "Incoming forward ICMP (%d,%d) %lX -> %lX\n",
+ icmph->type, ntohs(icmp_id(icmph)),
+ ntohl(iph->saddr), ntohl(iph->daddr));
+
+#ifdef CONFIG_IP_MASQUERADE_ICMP
+ if ((icmph->type == ICMP_ECHO ) ||
+ (icmph->type == ICMP_TIMESTAMP ) ||
+ (icmph->type == ICMP_INFO_REQUEST ) ||
+ (icmph->type == ICMP_ADDRESS )) {
+
+ IP_MASQ_DEBUG(2, "icmp request rcv %lX->%lX id %d type %d\n",
+ ntohl(iph->saddr),
+ ntohl(iph->daddr),
+ ntohs(icmp_id(icmph)),
+ icmph->type);
+
+ ms = ip_masq_out_get(iph->protocol,
+ iph->saddr,
+ icmp_id(icmph),
+ iph->daddr,
+ icmp_hv_req(icmph));
+ if (ms == NULL) {
+ ms = ip_masq_new(iph->protocol,
+ maddr, 0,
+ iph->saddr, icmp_id(icmph),
+ iph->daddr, icmp_hv_req(icmph),
+ 0);
+ if (ms == NULL)
+ return (-1);
+ IP_MASQ_DEBUG(1, "Created new icmp entry\n");
+ }
+ /* Rewrite source address */
+
+ /*
+ * If sysctl !=0 and no pkt has been received yet
+ * in this tunnel and routing iface address has changed...
+ * "You are welcome, diald".
+ */
+ if ( sysctl_ip_dynaddr && ms->flags & IP_MASQ_F_NO_REPLY && maddr != ms->maddr) {
+
+ if (sysctl_ip_dynaddr > 1) {
+ IP_MASQ_INFO( "ip_fw_masq_icmp(): change masq.addr %d.%d.%d.%d to %d.%d.%d.%d",
+ NIPQUAD(ms->maddr), NIPQUAD(maddr));
+ }
+
+ write_lock(&__ip_masq_lock);
+
+ ip_masq_unhash(ms);
+ ms->maddr = maddr;
+ ip_masq_hash(ms);
+
+ write_unlock(&__ip_masq_lock);
+ }
+
+ iph->saddr = ms->maddr;
+ ip_send_check(iph);
+ /* Rewrite port (id) */
+ (icmph->un).echo.id = ms->mport;
+ icmph->checksum = 0;
+ icmph->checksum = ip_compute_csum((unsigned char *)icmph, len);
+
+ IP_MASQ_DEBUG(2, "icmp request rwt %lX->%lX id %d type %d\n",
+ ntohl(iph->saddr),
+ ntohl(iph->daddr),
+ ntohs(icmp_id(icmph)),
+ icmph->type);
+
+ masq_set_state(ms, 1, iph, icmph);
+ ip_masq_put(ms);
+
+ return 1;
+ }
+#endif
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that
+ * means easy things are checked first to speed up
+	 * processing.... however this means that some
+	 * packets will manage to get a long way down this
+	 * stack and then be rejected, but that's life.
+ */
+ if ((icmph->type != ICMP_DEST_UNREACH) &&
+ (icmph->type != ICMP_SOURCE_QUENCH) &&
+ (icmph->type != ICMP_TIME_EXCEEDED))
+ return 0;
+
+ /* Now find the contained IP header */
+ ciph = (struct iphdr *) (icmph + 1);
+
+#ifdef CONFIG_IP_MASQUERADE_ICMP
+ if (ciph->protocol == IPPROTO_ICMP) {
+ /*
+ * This section handles ICMP errors for ICMP packets
+ */
+ struct icmphdr *cicmph = (struct icmphdr *)((char *)ciph +
+ (ciph->ihl<<2));
+
+
+ IP_MASQ_DEBUG(2, "fw icmp/icmp rcv %lX->%lX id %d type %d\n",
+ ntohl(ciph->saddr),
+ ntohl(ciph->daddr),
+ ntohs(icmp_id(cicmph)),
+ cicmph->type);
+
+ read_lock(&__ip_masq_lock);
+ ms = __ip_masq_out_get(ciph->protocol,
+ ciph->daddr,
+ icmp_id(cicmph),
+ ciph->saddr,
+ icmp_hv_rep(cicmph));
+ read_unlock(&__ip_masq_lock);
+
+ if (ms == NULL)
+ return 0;
+
+ /* Now we do real damage to this packet...! */
+ /* First change the source IP address, and recalc checksum */
+ iph->saddr = ms->maddr;
+ ip_send_check(iph);
+
+ /* Now change the *dest* address in the contained IP */
+ ciph->daddr = ms->maddr;
+ __ip_masq_put(ms);
+
+ ip_send_check(ciph);
+
+ /* Change the ID to the masqed one! */
+ (cicmph->un).echo.id = ms->mport;
+
+ /* And finally the ICMP checksum */
+ icmph->checksum = 0;
+ icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
+
+
+ IP_MASQ_DEBUG(2, "fw icmp/icmp rwt %lX->%lX id %d type %d\n",
+ ntohl(ciph->saddr),
+ ntohl(ciph->daddr),
+ ntohs(icmp_id(cicmph)),
+ cicmph->type);
+
+ return 1;
+ }
+#endif /* CONFIG_IP_MASQUERADE_ICMP */
+
+	/* We are only interested in ICMPs generated from TCP or UDP packets */
+ if ((ciph->protocol != IPPROTO_UDP) && (ciph->protocol != IPPROTO_TCP))
+ return 0;
+
+ /*
+ * Find the ports involved - this packet was
+	 * incoming so the ports are the right way round
+ * (but reversed relative to outer IP header!)
+ */
+ pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]);
+#if 0
+ if (ntohs(pptr[1]) < PORT_MASQ_BEGIN ||
+ ntohs(pptr[1]) > PORT_MASQ_END)
+ return 0;
+#endif
+
+ /* Ensure the checksum is correct */
+ if (ip_compute_csum((unsigned char *) icmph, len))
+ {
+ /* Failed checksum! */
+ IP_MASQ_DEBUG(0, "forward ICMP: failed checksum from %d.%d.%d.%d!\n",
+ NIPQUAD(iph->saddr));
+ return(-1);
+ }
+
+
+ IP_MASQ_DEBUG(2, "Handling forward ICMP for %08lX:%04X -> %08lX:%04X\n",
+ ntohl(ciph->saddr), ntohs(pptr[0]),
+ ntohl(ciph->daddr), ntohs(pptr[1]));
+
+
+#if 0
+ /* This is pretty much what __ip_masq_in_get_iph() does */
+ ms = __ip_masq_in_get(ciph->protocol, ciph->saddr, pptr[0], ciph->daddr, pptr[1]);
+#endif
+ read_lock(&__ip_masq_lock);
+ ms = __ip_masq_out_get(ciph->protocol,
+ ciph->daddr,
+ pptr[1],
+ ciph->saddr,
+ pptr[0]);
+ read_unlock(&__ip_masq_lock);
+
+ if (ms == NULL)
+ return 0;
+
+ /* Now we do real damage to this packet...! */
+ /* First change the source IP address, and recalc checksum */
+ iph->saddr = ms->maddr;
+ ip_send_check(iph);
+
+ /* Now change the *dest* address in the contained IP */
+ ciph->daddr = ms->maddr;
+ ip_send_check(ciph);
+
+ /* the TCP/UDP dest port - cannot redo check */
+ pptr[1] = ms->mport;
+ __ip_masq_put(ms);
+
+ /* And finally the ICMP checksum */
+ icmph->checksum = 0;
+ icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
+
+
+ IP_MASQ_DEBUG(2, "Rewrote forward ICMP to %08lX:%04X -> %08lX:%04X\n",
+ ntohl(ciph->saddr), ntohs(pptr[0]),
+ ntohl(ciph->daddr), ntohs(pptr[1]));
+
+
+ return 1;
+}
+
+
+/*
+ * Own skb_cow() beast, tweaked for rewriting commonly
+ * used pointers in masq code
+ */
+static struct sk_buff * masq_skb_cow(struct sk_buff **skb_p,
+ struct iphdr **iph_p, unsigned char **t_p) {
+ struct sk_buff *skb=(*skb_p);
+ if (skb_cloned(skb)) {
+ skb = skb_copy(skb, GFP_ATOMIC);
+ if (skb) {
+ /*
+ * skb changed, update other pointers
+ */
+ struct iphdr *iph = skb->nh.iph;
+ kfree_skb(*skb_p);
+ *skb_p = skb;
+ *iph_p = iph;
+ *t_p = (char*) iph + iph->ihl * 4;
+ }
+ }
+ return skb;
+}
+
+/*
+ * Handle ICMP messages in reverse (demasquerade) direction.
+ * Find any that might be relevant, check against existing connections,
+ * forward to masqueraded host if relevant.
+ * Currently handles error types - unreachable, quench, ttl exceeded
+ */
+
+int ip_fw_demasq_icmp(struct sk_buff **skb_p)
+{
+ struct sk_buff *skb = *skb_p;
+ struct iphdr *iph = skb->nh.iph;
+ struct icmphdr *icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2));
+ struct iphdr *ciph; /* The ip header contained within the ICMP */
+ __u16 *pptr; /* port numbers from TCP/UDP contained header */
+ struct ip_masq *ms;
+ unsigned short len = ntohs(iph->tot_len) - (iph->ihl * 4);
+
+
+ IP_MASQ_DEBUG(2, "icmp in/rev (%d,%d) %lX -> %lX\n",
+ icmph->type, ntohs(icmp_id(icmph)),
+ ntohl(iph->saddr), ntohl(iph->daddr));
+
+
+#ifdef CONFIG_IP_MASQUERADE_ICMP
+ if ((icmph->type == ICMP_ECHOREPLY) ||
+ (icmph->type == ICMP_TIMESTAMPREPLY) ||
+ (icmph->type == ICMP_INFO_REPLY) ||
+ (icmph->type == ICMP_ADDRESSREPLY)) {
+
+ IP_MASQ_DEBUG(2, "icmp reply rcv %lX->%lX id %d type %d, req %d\n",
+ ntohl(iph->saddr),
+ ntohl(iph->daddr),
+ ntohs(icmp_id(icmph)),
+ icmph->type,
+ icmp_type_request(icmph->type));
+
+ ms = ip_masq_in_get(iph->protocol,
+ iph->saddr,
+ icmp_hv_rep(icmph),
+ iph->daddr,
+ icmp_id(icmph));
+ if (ms == NULL)
+ return 0;
+
+ /*
+ * got reply, so clear flag
+ */
+ ms->flags &= ~IP_MASQ_F_NO_REPLY;
+
+ if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) {
+ ip_masq_put(ms);
+ return -1;
+ }
+
+ /* Reset source address */
+ iph->daddr = ms->saddr;
+ /* Redo IP header checksum */
+ ip_send_check(iph);
+ /* Set ID to fake port number */
+ (icmph->un).echo.id = ms->sport;
+ /* Reset ICMP checksum and set expiry */
+ icmph->checksum=0;
+ icmph->checksum=ip_compute_csum((unsigned char *)icmph,len);
+
+
+
+ IP_MASQ_DEBUG(2, "icmp reply rwt %lX->%lX id %d type %d\n",
+ ntohl(iph->saddr),
+ ntohl(iph->daddr),
+ ntohs(icmp_id(icmph)),
+ icmph->type);
+
+ masq_set_state(ms, 0, iph, icmph);
+ ip_masq_put(ms);
+
+ return 1;
+ } else {
+#endif
+ if ((icmph->type != ICMP_DEST_UNREACH) &&
+ (icmph->type != ICMP_SOURCE_QUENCH) &&
+ (icmph->type != ICMP_TIME_EXCEEDED))
+ return 0;
+#ifdef CONFIG_IP_MASQUERADE_ICMP
+ }
+#endif
+ /*
+	 * If we get here we have an ICMP error of one of the above 3 types.
+	 * Now find the contained IP header.
+ */
+
+ ciph = (struct iphdr *) (icmph + 1);
+
+#ifdef CONFIG_IP_MASQUERADE_ICMP
+ if (ciph->protocol == IPPROTO_ICMP) {
+ /*
+ * This section handles ICMP errors for ICMP packets
+ *
+ * First get a new ICMP header structure out of the IP packet
+ */
+ struct icmphdr *cicmph = (struct icmphdr *)((char *)ciph +
+ (ciph->ihl<<2));
+
+
+ IP_MASQ_DEBUG(2, "rv icmp/icmp rcv %lX->%lX id %d type %d\n",
+ ntohl(ciph->saddr),
+ ntohl(ciph->daddr),
+ ntohs(icmp_id(cicmph)),
+ cicmph->type);
+
+ read_lock(&__ip_masq_lock);
+ ms = __ip_masq_in_get(ciph->protocol,
+ ciph->daddr,
+ icmp_hv_req(cicmph),
+ ciph->saddr,
+ icmp_id(cicmph));
+ read_unlock(&__ip_masq_lock);
+
+ if (ms == NULL)
+ return 0;
+
+ if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) {
+ __ip_masq_put(ms);
+ return -1;
+ }
+ ciph = (struct iphdr *) (icmph + 1);
+ cicmph = (struct icmphdr *)((char *)ciph +
+ (ciph->ihl<<2));
+ /* Now we do real damage to this packet...! */
+ /* First change the dest IP address, and recalc checksum */
+ iph->daddr = ms->saddr;
+ ip_send_check(iph);
+
+ /* Now change the *source* address in the contained IP */
+ ciph->saddr = ms->saddr;
+ ip_send_check(ciph);
+
+ /* Change the ID to the original one! */
+ (cicmph->un).echo.id = ms->sport;
+ __ip_masq_put(ms);
+
+ /* And finally the ICMP checksum */
+ icmph->checksum = 0;
+ icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
+
+
+ IP_MASQ_DEBUG(2, "rv icmp/icmp rwt %lX->%lX id %d type %d\n",
+ ntohl(ciph->saddr),
+ ntohl(ciph->daddr),
+ ntohs(icmp_id(cicmph)),
+ cicmph->type);
+
+ return 1;
+ }
+#endif /* CONFIG_IP_MASQUERADE_ICMP */
+
+	/* We are only interested in ICMPs generated from TCP or UDP packets */
+ if ((ciph->protocol != IPPROTO_UDP) &&
+ (ciph->protocol != IPPROTO_TCP))
+ return 0;
+
+ /*
+	 * Find the ports involved - remember the embedded packet was
+	 * *outgoing*, so the ports (and addresses) are reversed
+ */
+ pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]);
+ if (ntohs(pptr[0]) < PORT_MASQ_BEGIN ||
+ ntohs(pptr[0]) > PORT_MASQ_END)
+ return 0;
+
+ /* Ensure the checksum is correct */
+ if (ip_compute_csum((unsigned char *) icmph, len))
+ {
+ /* Failed checksum! */
+ IP_MASQ_ERR( "reverse ICMP: failed checksum from %d.%d.%d.%d!\n",
+ NIPQUAD(iph->saddr));
+ return(-1);
+ }
+
+
+ IP_MASQ_DEBUG(2, "Handling reverse ICMP for %08lX:%04X -> %08lX:%04X\n",
+ ntohl(ciph->saddr), ntohs(pptr[0]),
+ ntohl(ciph->daddr), ntohs(pptr[1]));
+
+
+	/* This is pretty much what __ip_masq_in_get_iph() does, except the params are the other way round */
+ read_lock(&__ip_masq_lock);
+ ms = __ip_masq_in_get(ciph->protocol,
+ ciph->daddr,
+ pptr[1],
+ ciph->saddr,
+ pptr[0]);
+ read_unlock(&__ip_masq_lock);
+
+ if (ms == NULL)
+ return 0;
+
+ if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) {
+ __ip_masq_put(ms);
+ return -1;
+ }
+ ciph = (struct iphdr *) (icmph + 1);
+ pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]);
+
+ /* Now we do real damage to this packet...! */
+ /* First change the dest IP address, and recalc checksum */
+ iph->daddr = ms->saddr;
+ ip_send_check(iph);
+
+ /* Now change the *source* address in the contained IP */
+ ciph->saddr = ms->saddr;
+ ip_send_check(ciph);
+
+ /* the TCP/UDP source port - cannot redo check */
+ pptr[0] = ms->sport;
+ __ip_masq_put(ms);
+
+ /* And finally the ICMP checksum */
+ icmph->checksum = 0;
+ icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
+
+
+ IP_MASQ_DEBUG(2, "Rewrote reverse ICMP to %08lX:%04X -> %08lX:%04X\n",
+ ntohl(ciph->saddr), ntohs(pptr[0]),
+ ntohl(ciph->daddr), ntohs(pptr[1]));
+
+
+ return 1;
+}
+
+ /*
+ *	Check if it's a masqueraded port, look it up,
+ *	and send it on its way...
+ *
+ *	Better not have many hosts using the designated port range
+ *	as 'normal' ports, or you'll be spending a lot of time in
+ *	this function.
+ */
+
+int ip_fw_demasquerade(struct sk_buff **skb_p)
+{
+ struct sk_buff *skb = *skb_p;
+ struct iphdr *iph = skb->nh.iph;
+ union ip_masq_tphdr h;
+ struct ip_masq *ms;
+ unsigned short size;
+ int doff = 0;
+ int csum = 0;
+ int csum_ok = 0;
+ __u32 maddr;
+
+ /*
+	 * 	Big tappo: only PACKET_HOST (neither loopback nor mcasts)
+	 *	... it is unclear why the 1st test does not already cover the 2nd (?)
+ */
+
+ if (skb->pkt_type != PACKET_HOST || skb->dev == &loopback_dev) {
+ IP_MASQ_DEBUG(2, "ip_fw_demasquerade(): packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
+ skb->pkt_type,
+ iph->protocol,
+ NIPQUAD(iph->daddr));
+ return 0;
+ }
+
+ h.raw = (char*) iph + iph->ihl * 4;
+
+ /*
+ * IP payload size
+ */
+ size = ntohs(iph->tot_len) - (iph->ihl * 4);
+
+ doff = proto_doff(iph->protocol, h.raw, size);
+
+ switch (doff) {
+ case 0:
+ /*
+		 * Input path: other IP protos are OK; they will
+		 * reach the local sockets path.
+ */
+ return 0;
+ case -1:
+ IP_MASQ_DEBUG(0, "I-pkt invalid packet data size\n");
+ return -1;
+ }
+
+ maddr = iph->daddr;
+ switch (iph->protocol) {
+ case IPPROTO_ICMP:
+ return(ip_fw_demasq_icmp(skb_p));
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ /*
+ * Make sure packet is in the masq range
+		 * ... or some module relaxes the input range
+ * ... or there is still some `special' mport opened
+ */
+ if ((ntohs(h.portp[1]) < PORT_MASQ_BEGIN
+ || ntohs(h.portp[1]) > PORT_MASQ_END)
+#ifdef CONFIG_IP_MASQUERADE_MOD
+ && (ip_masq_mod_in_rule(skb, iph) != 1)
+#endif
+ && atomic_read(&mport_count) == 0 )
+ return 0;
+
+ /* Check that the checksum is OK */
+ if ((iph->protocol == IPPROTO_UDP) && (h.uh->check == 0))
+ /* No UDP checksum */
+ break;
+#ifdef CONFIG_IP_MASQ_DEBUG
+ if (ip_masq_get_debug_level() > 3) {
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+#endif
+
+ switch (skb->ip_summed)
+ {
+ case CHECKSUM_NONE:
+ csum = csum_partial(h.raw + doff, size - doff, 0);
+ csum_ok++;
+ skb->csum = csum_partial(h.raw , doff, csum);
+
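+			/* fall through: verify the sum we just rebuilt */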
+ case CHECKSUM_HW:
+ if (csum_tcpudp_magic(iph->saddr, iph->daddr,
+ size, iph->protocol, skb->csum))
+ {
+ IP_MASQ_DEBUG(0, "Incoming failed %s checksum from %d.%d.%d.%d (size=%d)!\n",
+ masq_proto_name(iph->protocol),
+ NIPQUAD(iph->saddr),
+ size);
+ return -1;
+ }
+		default:
+			/* CHECKSUM_UNNECESSARY */
+			break;
+		}
+ break;
+ default:
+ return 0;
+ }
+
+
+
+ IP_MASQ_DEBUG(2, "Incoming %s %08lX:%04X -> %08lX:%04X\n",
+ masq_proto_name(iph->protocol),
+ ntohl(iph->saddr), ntohs(h.portp[0]),
+ ntohl(iph->daddr), ntohs(h.portp[1]));
+
+ /*
+ * reroute to original host:port if found...
+ */
+
+ ms = ip_masq_in_get_iph(iph);
+
+ /*
+ * Give additional modules a chance to create an entry
+ */
+#ifdef CONFIG_IP_MASQUERADE_MOD
+ if (!ms)
+ ms = ip_masq_mod_in_create(skb, iph, maddr);
+
+ /*
+ * Call module's input update hook
+ */
+ ip_masq_mod_in_update(skb, iph, ms);
+#endif
+
+
+ if (ms != NULL)
+ {
+
+ /*
+ * got reply, so clear flag
+ */
+ ms->flags &= ~IP_MASQ_F_NO_REPLY;
+
+ /*
+ * Set daddr,dport if not defined yet
+	 * and the tunnel is not set up as "dest loose"
+ */
+
+ if (ms->flags & IP_MASQ_F_DLOOSE) {
+ /*
+ * update dest loose values
+ */
+ ms->dport = h.portp[0];
+ ms->daddr = iph->saddr;
+ } else {
+ if ( ms->flags & IP_MASQ_F_NO_DPORT ) { /* && ms->protocol == IPPROTO_TCP ) { */
+
+ write_lock(&__ip_masq_lock);
+
+ ip_masq_unhash(ms);
+ ms->flags &= ~IP_MASQ_F_NO_DPORT;
+ ms->dport = h.portp[0];
+ ip_masq_hash(ms); /* hash on new dport */
+
+ write_unlock(&__ip_masq_lock);
+
+ IP_MASQ_DEBUG(1, "ip_fw_demasquerade(): filled dport=%d\n",
+ ntohs(ms->dport));
+
+ }
+ if (ms->flags & IP_MASQ_F_NO_DADDR ) { /* && ms->protocol == IPPROTO_TCP) { */
+
+ write_lock(&__ip_masq_lock);
+
+ ip_masq_unhash(ms);
+ ms->flags &= ~IP_MASQ_F_NO_DADDR;
+ ms->daddr = iph->saddr;
+ ip_masq_hash(ms); /* hash on new daddr */
+
+ write_unlock(&__ip_masq_lock);
+
+ IP_MASQ_DEBUG(1, "ip_fw_demasquerade(): filled daddr=%lX\n",
+ ntohl(ms->daddr));
+
+ }
+ }
+ if ((skb=masq_skb_cow(skb_p, &iph, &h.raw)) == NULL) {
+ ip_masq_put(ms);
+ return -1;
+ }
+ iph->daddr = ms->saddr;
+ h.portp[1] = ms->sport;
+
+ /*
+ * Invalidate csum saving if tunnel has masq helper
+ */
+
+ if (ms->app)
+ csum_ok = 0;
+
+ /*
+ * Attempt ip_masq_app call.
+ * will fix ip_masq and iph ack_seq stuff
+ */
+
+ if (ip_masq_app_pkt_in(ms, skb_p, maddr) != 0)
+ {
+ /*
+ * skb has changed, update pointers.
+ */
+
+ skb = *skb_p;
+ iph = skb->nh.iph;
+ h.raw = (char*) iph + iph->ihl*4;
+ size = ntohs(iph->tot_len) - (iph->ihl * 4);
+ }
+
+ /*
+ * Yug! adjust UDP/TCP checksums
+ */
+
+ /*
+ * Transport's payload partial csum
+ */
+
+ if (!csum_ok) {
+ csum = csum_partial(h.raw + doff, size - doff, 0);
+ }
+ skb->csum = csum;
+
+ /*
+ * Protocol csum
+ */
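+	/*
+	 * The sum is rebuilt in two steps: csum_partial() folds the
+	 * transport header bytes into the cached payload sum, then
+	 * csum_tcpudp_magic() adds the pseudo-header (saddr, daddr,
+	 * size, protocol) and folds the result to 16 bits.
+	 */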
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ h.th->check = 0;
+ h.th->check=csum_tcpudp_magic(iph->saddr, iph->daddr,
+ size, iph->protocol,
+ csum_partial(h.raw , doff, csum));
+ break;
+ case IPPROTO_UDP:
+ h.uh->check = 0;
+ h.uh->check=csum_tcpudp_magic(iph->saddr, iph->daddr,
+ size, iph->protocol,
+ csum_partial(h.raw , doff, csum));
+ if (h.uh->check == 0)
+ h.uh->check = 0xFFFF;
+ break;
+ }
+ ip_send_check(iph);
+
+ IP_MASQ_DEBUG(2, "I-routed to %08lX:%04X\n",ntohl(iph->daddr),ntohs(h.portp[1]));
+
+ masq_set_state (ms, 0, iph, h.portp);
+ ip_masq_put(ms);
+
+ return 1;
+ }
+
+ /* sorry, all this trouble for a no-hit :) */
+ return 0;
+}
+
+
+void ip_masq_control_add(struct ip_masq *ms, struct ip_masq* ctl_ms)
+{
+ if (ms->control) {
+ IP_MASQ_ERR( "request control ADD for already controlled: %d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n",
+ NIPQUAD(ms->saddr),ntohs(ms->sport),
+ NIPQUAD(ms->daddr),ntohs(ms->dport));
+ ip_masq_control_del(ms);
+ }
+ IP_MASQ_DEBUG(1, "ADDing control for: ms.dst=%d.%d.%d.%d:%d ctl_ms.dst=%d.%d.%d.%d:%d\n",
+ NIPQUAD(ms->daddr),ntohs(ms->dport),
+ NIPQUAD(ctl_ms->daddr),ntohs(ctl_ms->dport));
+ ms->control = ctl_ms;
+ atomic_inc(&ctl_ms->n_control);
+}
+
+void ip_masq_control_del(struct ip_masq *ms)
+{
+ struct ip_masq *ctl_ms = ms->control;
+ if (!ctl_ms) {
+ IP_MASQ_ERR( "request control DEL for uncontrolled: %d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n",
+ NIPQUAD(ms->saddr),ntohs(ms->sport),
+ NIPQUAD(ms->daddr),ntohs(ms->dport));
+ return;
+ }
+ IP_MASQ_DEBUG(1, "DELeting control for: ms.dst=%d.%d.%d.%d:%d ctl_ms.dst=%d.%d.%d.%d:%d\n",
+ NIPQUAD(ms->daddr),ntohs(ms->dport),
+ NIPQUAD(ctl_ms->daddr),ntohs(ctl_ms->dport));
+ ms->control = NULL;
+ if (atomic_read(&ctl_ms->n_control) == 0) {
+ IP_MASQ_ERR( "BUG control DEL with n=0 : %d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n",
+ NIPQUAD(ms->saddr),ntohs(ms->sport),
+ NIPQUAD(ms->daddr),ntohs(ms->dport));
+ return;
+
+ }
+ atomic_dec(&ctl_ms->n_control);
+}
+
+struct ip_masq * ip_masq_control_get(struct ip_masq *ms)
+{
+ return ms->control;
+}
+
+
+#ifdef CONFIG_PROC_FS
+/*
+ * /proc/net entries
+ * From userspace
+ */
+static int ip_msqhst_procinfo(char *buffer, char **start, off_t offset,
+ int length, int unused)
+{
+ off_t pos=0, begin;
+ struct ip_masq *ms;
+ char temp[129];
+ int idx = 0;
+ int len=0;
+ struct list_head *l,*e;
+
+ if (offset < 128)
+ {
+ sprintf(temp,
+ "Prc FromIP FPrt ToIP TPrt Masq Init-seq Delta PDelta Expires (free=%d,%d,%d)",
+ atomic_read(ip_masq_free_ports),
+ atomic_read(ip_masq_free_ports+1),
+ atomic_read(ip_masq_free_ports+2));
+ len = sprintf(buffer, "%-127s\n", temp);
+ }
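+	/*
+	 * Each data row below is then printed like this (made-up values):
+	 *
+	 * TCP C0A80102:0413 D076A2C3:0050 F000 00000000      0      0    5990
+	 */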
+ pos = 128;
+
+ for(idx = 0; idx < IP_MASQ_TAB_SIZE; idx++)
+ {
+ /*
+		 *	The lock is actually only needed in the next loop:
+		 *	we are called from userspace, so we must stop bottom halves.
+ */
+ read_lock_bh(&__ip_masq_lock);
+
+ l = &ip_masq_m_table[idx];
+ for (e=l->next; e!=l; e=e->next) {
+ ms = list_entry(e, struct ip_masq, m_list);
+ pos += 128;
+ if (pos <= offset) {
+ len = 0;
+ continue;
+ }
+
+ /*
+ * We have locked the tables, no need to del/add timers
+ * nor cli() 8)
+ */
+
+ sprintf(temp,"%s %08lX:%04X %08lX:%04X %04X %08X %6d %6d %7lu",
+ masq_proto_name(ms->protocol),
+ ntohl(ms->saddr), ntohs(ms->sport),
+ ntohl(ms->daddr), ntohs(ms->dport),
+ ntohs(ms->mport),
+ ms->out_seq.init_seq,
+ ms->out_seq.delta,
+ ms->out_seq.previous_delta,
+ ms->timer.expires-jiffies);
+ len += sprintf(buffer+len, "%-127s\n", temp);
+
+ if(len >= length) {
+
+ read_unlock_bh(&__ip_masq_lock);
+ goto done;
+ }
+ }
+ read_unlock_bh(&__ip_masq_lock);
+
+ }
+done:
+
+
+ begin = len - (pos - offset);
+ *start = buffer + begin;
+ len -= begin;
+ if(len>length)
+ len = length;
+ return len;
+}
+
+#endif
+
+/*
+ *	Timeout handling for ipfwadm/ipchains
+ * From ip_fw.c
+ */
+
+int ip_fw_masq_timeouts(void *m, int len)
+{
+ struct ip_fw_masq *masq;
+ int ret = EINVAL;
+
+ if (len != sizeof(struct ip_fw_masq)) {
+ IP_MASQ_DEBUG(1, "ip_fw_masq_timeouts: length %d, expected %d\n",
+ len, sizeof(struct ip_fw_masq));
+ } else {
+ masq = (struct ip_fw_masq *)m;
+ if (masq->tcp_timeout)
+ masq_timeout_table.timeout[IP_MASQ_S_ESTABLISHED]
+ = masq->tcp_timeout;
+
+ if (masq->tcp_fin_timeout)
+ masq_timeout_table.timeout[IP_MASQ_S_FIN_WAIT]
+ = masq->tcp_fin_timeout;
+
+ if (masq->udp_timeout)
+ masq_timeout_table.timeout[IP_MASQ_S_UDP]
+ = masq->udp_timeout;
+ ret = 0;
+ }
+ return ret;
+}
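+
+#if 0
+/*
+ * Usage sketch (assumed caller, not part of this file): handing the three
+ * masq timeouts (in jiffies) to ip_fw_masq_timeouts().  The field names
+ * come from struct ip_fw_masq as used above; the values are illustrative.
+ */
+static int example_set_masq_timeouts(void)
+{
+	struct ip_fw_masq m;
+
+	m.tcp_timeout     = 2*60*60*HZ;	/* established TCP: 2 hours */
+	m.tcp_fin_timeout = 2*60*HZ;	/* TCP after FIN: 2 minutes */
+	m.udp_timeout     = 5*60*HZ;	/* UDP: 5 minutes */
+
+	return ip_fw_masq_timeouts(&m, sizeof(m));
+}
+#endif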
+/*
+ * Module autoloading stuff
+ */
+
+static int ip_masq_user_check_hook(void) {
+#ifdef CONFIG_KMOD
+ if (ip_masq_user_hook == NULL) {
+ IP_MASQ_DEBUG(1, "About to request \"ip_masq_user\" module\n");
+ request_module("ip_masq_user");
+ }
+#endif /* CONFIG_KMOD */
+ return (ip_masq_user_hook != NULL);
+}
+
+/*
+ *	user module hook - info
+ */
+static int ip_masq_user_info(char *buffer, char **start, off_t offset,
+ int len, int *eof, void *data)
+{
+ int ret = -ENOPKG;
+ if (ip_masq_user_check_hook()) {
+ ret = ip_masq_user_hook->info(buffer, start, offset, len, (int) data);
+ }
+ return ret;
+}
+
+/*
+ *	user module hook - entry management
+ */
+static int ip_masq_user_ctl(int optname, void *arg, int arglen)
+{
+ int ret = -ENOPKG;
+ if (ip_masq_user_check_hook()) {
+ ret = ip_masq_user_hook->ctl(optname, arg, arglen);
+ }
+ return ret;
+}
+
+/*
+ * Control from ip_sockglue
+ * MAIN ENTRY point from userspace (apart from /proc *info entries)
+ * Returns errno
+ */
+int ip_masq_uctl(int optname, char * optval , int optlen)
+{
+ struct ip_masq_ctl masq_ctl;
+ int ret = -EINVAL;
+
+ if(optlen>sizeof(masq_ctl))
+ return -EINVAL;
+
+ if(copy_from_user(&masq_ctl,optval,optlen))
+ return -EFAULT;
+
+ IP_MASQ_DEBUG(1,"ip_masq_ctl(optname=%d, optlen=%d, target=%d, cmd=%d)\n",
+ optname, optlen, masq_ctl.m_target, masq_ctl.m_cmd);
+
+ switch (masq_ctl.m_target) {
+ case IP_MASQ_TARGET_USER:
+ ret = ip_masq_user_ctl(optname, &masq_ctl, optlen);
+ break;
+#ifdef CONFIG_IP_MASQUERADE_MOD
+ case IP_MASQ_TARGET_MOD:
+ ret = ip_masq_mod_ctl(optname, &masq_ctl, optlen);
+ break;
+#endif
+ }
+
+ /*
+ * If ret>0, copy to user space
+ */
+
+ if (ret > 0 && ret <= sizeof (masq_ctl)) {
+ if (copy_to_user(optval, &masq_ctl, ret) )
+ return -EFAULT;
+ ret = 0;
+ }
+
+ return ret;
+}
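+
+#if 0
+/*
+ * Calling sketch (hypothetical, not part of this file): userspace hands a
+ * struct ip_masq_ctl to setsockopt(); ip_sockglue passes it here and
+ * ip_masq_uctl() dispatches on m_target.  The optname value is left to
+ * the caller; IP_MASQ_CMD_FLUSH is one of the m_cmd values handled by
+ * the target modules (see autofw_ctl() further down in this patch).
+ */
+static int example_flush_user_entries(int optname)
+{
+	struct ip_masq_ctl ctl;
+
+	memset(&ctl, 0, sizeof(ctl));
+	ctl.m_target = IP_MASQ_TARGET_USER;	/* route to the user module */
+	ctl.m_cmd    = IP_MASQ_CMD_FLUSH;
+
+	return ip_masq_uctl(optname, (char *)&ctl, sizeof(ctl));
+}
+#endif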
+
+#ifdef CONFIG_PROC_FS
+static struct proc_dir_entry *proc_net_ip_masq = NULL;
+
+#ifdef MODULE
+static void ip_masq_proc_count(struct inode *inode, int fill)
+{
+ if (fill)
+ MOD_INC_USE_COUNT;
+ else
+ MOD_DEC_USE_COUNT;
+}
+#endif
+
+int ip_masq_proc_register(struct proc_dir_entry *ent)
+{
+ if (!proc_net_ip_masq) return -1;
+ IP_MASQ_DEBUG(1, "registering \"/proc/net/ip_masq/%s\" entry\n",
+ ent->name);
+ return proc_register(proc_net_ip_masq, ent);
+}
+void ip_masq_proc_unregister(struct proc_dir_entry *ent)
+{
+ if (!proc_net_ip_masq) return;
+ IP_MASQ_DEBUG(1, "unregistering \"/proc/net/ip_masq/%s\" entry\n",
+ ent->name);
+ proc_unregister(proc_net_ip_masq, ent->low_ino);
+}
+
+
+__initfunc(static void masq_proc_init(void))
+{
+ IP_MASQ_DEBUG(1,"registering /proc/net/ip_masq\n");
+ if (!proc_net_ip_masq) {
+ struct proc_dir_entry *ent;
+ ent = create_proc_entry("net/ip_masq", S_IFDIR, 0);
+ if (ent) {
+#ifdef MODULE
+ ent->fill_inode = ip_masq_proc_count;
+#endif
+ proc_net_ip_masq = ent;
+ } else {
+ IP_MASQ_ERR("Could not create \"/proc/net/ip_masq\" entry\n");
+ }
+ }
+}
+#endif /* CONFIG_PROC_FS */
+/*
+ * Wrapper over inet_select_addr()
+ */
+u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope)
+{
+ return inet_select_addr(dev, dst, scope);
+}
+
+/*
+ * Initialize ip masquerading
+ */
+__initfunc(int ip_masq_init(void))
+{
+ int idx;
+ for(idx = 0; idx < IP_MASQ_TAB_SIZE; idx++) {
+ INIT_LIST_HEAD(&ip_masq_s_table[idx]);
+ INIT_LIST_HEAD(&ip_masq_m_table[idx]);
+ INIT_LIST_HEAD(&ip_masq_d_table[idx]);
+ }
+#ifdef CONFIG_PROC_FS
+ proc_net_register(&(struct proc_dir_entry) {
+ PROC_NET_IPMSQHST, 13, "ip_masquerade",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ ip_msqhst_procinfo
+ });
+ masq_proc_init();
+
+ ip_masq_proc_register(&(struct proc_dir_entry) {
+ 0, 3, "tcp",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ NULL, /* get_info */
+ NULL, /* fill_inode */
+ NULL, NULL, NULL,
+ (char *) IPPROTO_TCP,
+ ip_masq_user_info
+ });
+ ip_masq_proc_register(&(struct proc_dir_entry) {
+ 0, 3, "udp",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ NULL, /* get_info */
+ NULL, /* fill_inode */
+ NULL, NULL, NULL,
+ (char *) IPPROTO_UDP,
+ ip_masq_user_info
+ });
+ ip_masq_proc_register(&(struct proc_dir_entry) {
+ 0, 4, "icmp",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ NULL, /* get_info */
+ NULL, /* fill_inode */
+ NULL, NULL, NULL,
+ (char *) IPPROTO_ICMP,
+ ip_masq_user_info
+ });
+#endif
+#ifdef CONFIG_IP_MASQUERADE_IPAUTOFW
+ ip_autofw_init();
+#endif
+#ifdef CONFIG_IP_MASQUERADE_IPPORTFW
+ ip_portfw_init();
+#endif
+#ifdef CONFIG_IP_MASQUERADE_MFW
+ ip_mfw_init();
+#endif
+ ip_masq_app_init();
+
+ return 0;
+}
diff --git a/pfinet/linux-src/net/ipv4/ip_masq_app.c b/pfinet/linux-src/net/ipv4/ip_masq_app.c
new file mode 100644
index 00000000..84e059fa
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_masq_app.c
@@ -0,0 +1,603 @@
+/*
+ * IP_MASQ_APP application masquerading module
+ *
+ *
+ * $Id: ip_masq_app.c,v 1.16 1998/08/29 23:51:14 davem Exp $
+ *
+ * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * JJC : Implemented also input pkt hook
+ * Miquel van Smoorenburg : Copy more stuff when resizing skb
+ *
+ *
+ * FIXME:
+ * - ip_masq_skb_replace(): use same skb if space available.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <net/ip_masq.h>
+
+#define IP_MASQ_APP_TAB_SIZE 16 /* must be power of 2 */
+
+#define IP_MASQ_APP_HASH(proto, port) ((port^proto) & (IP_MASQ_APP_TAB_SIZE-1))
+#define IP_MASQ_APP_TYPE(proto, port) ( proto<<16 | port )
+#define IP_MASQ_APP_PORT(type) ( type & 0xffff )
+#define IP_MASQ_APP_PROTO(type) ( (type>>16) & 0x00ff )
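+
+/*
+ * A worked example of the macros above (assuming IPPROTO_TCP == 6 and
+ * port 21, i.e. ftp):
+ *
+ *	IP_MASQ_APP_TYPE(6, 21)       == 6<<16 | 21 == 0x00060015
+ *	IP_MASQ_APP_PORT(0x00060015)  == 21
+ *	IP_MASQ_APP_PROTO(0x00060015) == 6
+ *	IP_MASQ_APP_HASH(6, 21)       == (21^6) & 15 == 3
+ *
+ * Masking with (IP_MASQ_APP_TAB_SIZE-1) is why the table size must be
+ * a power of 2.
+ */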
+
+
+EXPORT_SYMBOL(register_ip_masq_app);
+EXPORT_SYMBOL(unregister_ip_masq_app);
+EXPORT_SYMBOL(ip_masq_skb_replace);
+
+/*
+ * will hold masq app. hashed list heads
+ */
+
+struct ip_masq_app *ip_masq_app_base[IP_MASQ_APP_TAB_SIZE];
+
+/*
+ * ip_masq_app registration routine
+ * port: host byte order.
+ */
+
+int register_ip_masq_app(struct ip_masq_app *mapp, unsigned short proto, __u16 port)
+{
+ unsigned long flags;
+ unsigned hash;
+ if (!mapp) {
+ IP_MASQ_ERR("register_ip_masq_app(): NULL arg\n");
+ return -EINVAL;
+ }
+ mapp->type = IP_MASQ_APP_TYPE(proto, port);
+ mapp->n_attach = 0;
+ hash = IP_MASQ_APP_HASH(proto, port);
+
+ save_flags(flags);
+ cli();
+ mapp->next = ip_masq_app_base[hash];
+ ip_masq_app_base[hash] = mapp;
+ restore_flags(flags);
+
+ return 0;
+}
+
+/*
+ * ip_masq_app unreg. routine.
+ */
+
+int unregister_ip_masq_app(struct ip_masq_app *mapp)
+{
+ struct ip_masq_app **mapp_p;
+ unsigned hash;
+ unsigned long flags;
+ if (!mapp) {
+ IP_MASQ_ERR("unregister_ip_masq_app(): NULL arg\n");
+ return -EINVAL;
+ }
+ /*
+ * only allow unregistration if it has no attachments
+ */
+ if (mapp->n_attach) {
+ IP_MASQ_ERR("unregister_ip_masq_app(): has %d attachments. failed\n",
+ mapp->n_attach);
+ return -EINVAL;
+ }
+ hash = IP_MASQ_APP_HASH(IP_MASQ_APP_PROTO(mapp->type), IP_MASQ_APP_PORT(mapp->type));
+
+ save_flags(flags);
+ cli();
+ for (mapp_p = &ip_masq_app_base[hash]; *mapp_p ; mapp_p = &(*mapp_p)->next)
+ if (mapp == (*mapp_p)) {
+ *mapp_p = mapp->next;
+ restore_flags(flags);
+ return 0;
+ }
+
+ restore_flags(flags);
+ IP_MASQ_ERR("unregister_ip_masq_app(proto=%s,port=%u): not hashed!\n",
+ masq_proto_name(IP_MASQ_APP_PROTO(mapp->type)), IP_MASQ_APP_PORT(mapp->type));
+ return -EINVAL;
+}
+
+/*
+ * get ip_masq_app object by its proto and port (net byte order).
+ */
+
+struct ip_masq_app * ip_masq_app_get(unsigned short proto, __u16 port)
+{
+ struct ip_masq_app *mapp;
+ unsigned hash;
+ unsigned type;
+
+ port = ntohs(port);
+ type = IP_MASQ_APP_TYPE(proto,port);
+ hash = IP_MASQ_APP_HASH(proto,port);
+ for(mapp = ip_masq_app_base[hash]; mapp ; mapp = mapp->next) {
+ if (type == mapp->type) return mapp;
+ }
+ return NULL;
+}
+
+/*
+ * ip_masq_app object binding related funcs.
+ */
+
+/*
+ * change ip_masq_app object's number of bindings
+ */
+
+static __inline__ int ip_masq_app_bind_chg(struct ip_masq_app *mapp, int delta)
+{
+ unsigned long flags;
+ int n_at;
+ if (!mapp) return -1;
+ save_flags(flags);
+ cli();
+ n_at = mapp->n_attach + delta;
+ if (n_at < 0) {
+ restore_flags(flags);
+ IP_MASQ_ERR("ip_masq_app: tried to set n_attach < 0 for (proto=%s,port==%d) ip_masq_app object.\n",
+ masq_proto_name(IP_MASQ_APP_PROTO(mapp->type)),
+ IP_MASQ_APP_PORT(mapp->type));
+ return -1;
+ }
+ mapp->n_attach = n_at;
+ restore_flags(flags);
+ return 0;
+}
+
+/*
+ * Bind ip_masq to its ip_masq_app based on proto and dport ALREADY
+ * set in ip_masq struct. Also calls constructor.
+ */
+
+struct ip_masq_app * ip_masq_bind_app(struct ip_masq *ms)
+{
+ struct ip_masq_app * mapp;
+
+ if (ms->protocol != IPPROTO_TCP && ms->protocol != IPPROTO_UDP)
+ return NULL;
+
+ mapp = ip_masq_app_get(ms->protocol, ms->dport);
+
+#if 0000
+/* #ifdef CONFIG_IP_MASQUERADE_IPAUTOFW */
+ if (mapp == NULL)
+ mapp = ip_masq_app_get(ms->protocol, ms->sport);
+/* #endif */
+#endif
+
+ if (mapp != NULL) {
+ /*
+ * don't allow binding if already bound
+ */
+
+ if (ms->app != NULL) {
+ IP_MASQ_ERR("ip_masq_bind_app() called for already bound object.\n");
+ return ms->app;
+ }
+
+ ms->app = mapp;
+ if (mapp->masq_init_1) mapp->masq_init_1(mapp, ms);
+ ip_masq_app_bind_chg(mapp, +1);
+ }
+ return mapp;
+}
+
+/*
+ * Unbind ms from type object and call ms destructor (does not kfree()).
+ */
+
+int ip_masq_unbind_app(struct ip_masq *ms)
+{
+ struct ip_masq_app * mapp;
+ mapp = ms->app;
+
+ if (ms->protocol != IPPROTO_TCP && ms->protocol != IPPROTO_UDP)
+ return 0;
+
+ if (mapp != NULL) {
+ if (mapp->masq_done_1) mapp->masq_done_1(mapp, ms);
+ ms->app = NULL;
+ ip_masq_app_bind_chg(mapp, -1);
+ }
+ return (mapp != NULL);
+}
+
+/*
+ * Fixes th->seq based on ip_masq_seq info.
+ */
+
+static __inline__ void masq_fix_seq(const struct ip_masq_seq *ms_seq, struct tcphdr *th)
+{
+ __u32 seq;
+
+ seq = ntohl(th->seq);
+
+ /*
+	 * Adjust seq with the delta offset for all packets after
+	 * the most recent resized pkt seq, and with the previous_delta
+	 * offset for all packets before it.
+ */
+
+ if (ms_seq->delta || ms_seq->previous_delta) {
+ if(after(seq,ms_seq->init_seq) ) {
+ th->seq = htonl(seq + ms_seq->delta);
+ IP_MASQ_DEBUG(1, "masq_fix_seq() : added delta (%d) to seq\n",ms_seq->delta);
+ } else {
+ th->seq = htonl(seq + ms_seq->previous_delta);
+ IP_MASQ_DEBUG(1, "masq_fix_seq() : added previous_delta (%d) to seq\n",ms_seq->previous_delta);
+ }
+ }
+
+
+}
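+
+/*
+ * A small worked example (made-up numbers): if the most recent resize
+ * happened at init_seq=1000 with delta=5 and previous_delta=2, a segment
+ * with seq=1500 (after init_seq) is rewritten to 1505, while a
+ * retransmission with seq=900 (before init_seq) becomes 902.
+ */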
+
+/*
+ * Fixes th->ack_seq based on ip_masq_seq info.
+ */
+
+static __inline__ void masq_fix_ack_seq(const struct ip_masq_seq *ms_seq, struct tcphdr *th)
+{
+ __u32 ack_seq;
+
+ ack_seq=ntohl(th->ack_seq);
+
+ /*
+	 * Adjust ack_seq with the delta offset for packets AFTER
+	 * the most recent resized pkt; for packets before it,
+	 * use previous_delta.
+ */
+
+ if (ms_seq->delta || ms_seq->previous_delta) {
+ if(after(ack_seq,ms_seq->init_seq)) {
+ th->ack_seq = htonl(ack_seq-ms_seq->delta);
+ IP_MASQ_DEBUG(1, "masq_fix_ack_seq() : subtracted delta (%d) from ack_seq\n",ms_seq->delta);
+
+ } else {
+ th->ack_seq = htonl(ack_seq-ms_seq->previous_delta);
+ IP_MASQ_DEBUG(1, "masq_fix_ack_seq() : subtracted previous_delta (%d) from ack_seq\n",ms_seq->previous_delta);
+ }
+ }
+
+}
+
+/*
+ * Updates ip_masq_seq if pkt has been resized
+ * Assumes already checked proto==IPPROTO_TCP and diff!=0.
+ */
+
+static __inline__ void masq_seq_update(struct ip_masq *ms, struct ip_masq_seq *ms_seq, unsigned mflag, __u32 seq, int diff)
+{
+ /* if (diff == 0) return; */
+
+ if ( !(ms->flags & mflag) || after(seq, ms_seq->init_seq))
+ {
+ ms_seq->previous_delta=ms_seq->delta;
+ ms_seq->delta+=diff;
+ ms_seq->init_seq=seq;
+ ms->flags |= mflag;
+ }
+}
+
+/*
+ *	Output pkt hook. Calls the bound ip_masq_app specific function.
+ *	Called by ip_fw_masquerade(), which has already checked ms!=NULL.
+ *	Returns the (new - old) skb->len diff.
+ */
+
+int ip_masq_app_pkt_out(struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
+{
+ struct ip_masq_app * mapp;
+ struct iphdr *iph;
+ struct tcphdr *th;
+ int diff;
+ __u32 seq;
+
+ /*
+ * check if application masquerading is bound to
+ * this ip_masq.
+ * assumes that once an ip_masq is bound,
+ * it will not be unbound during its life.
+ */
+
+ if ( (mapp = ms->app) == NULL)
+ return 0;
+
+ iph = (*skb_p)->nh.iph;
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+
+ /*
+ * Remember seq number in case this pkt gets resized
+ */
+
+ seq = ntohl(th->seq);
+
+ /*
+ * Fix seq stuff if flagged as so.
+ */
+
+ if (ms->protocol == IPPROTO_TCP) {
+ if (ms->flags & IP_MASQ_F_OUT_SEQ)
+ masq_fix_seq(&ms->out_seq, th);
+ if (ms->flags & IP_MASQ_F_IN_SEQ)
+ masq_fix_ack_seq(&ms->in_seq, th);
+ }
+
+ /*
+ * Call private output hook function
+ */
+
+ if ( mapp->pkt_out == NULL )
+ return 0;
+
+ diff = mapp->pkt_out(mapp, ms, skb_p, maddr);
+
+ /*
+ * Update ip_masq seq stuff if len has changed.
+ */
+
+ if (diff != 0 && ms->protocol == IPPROTO_TCP)
+ masq_seq_update(ms, &ms->out_seq, IP_MASQ_F_OUT_SEQ, seq, diff);
+
+ return diff;
+}
+
+/*
+ *	Input pkt hook. Calls the bound ip_masq_app specific function.
+ *	Called by ip_fw_demasquerade(), which has already checked ms!=NULL.
+ *	Returns the (new - old) skb->len diff.
+ */
+
+int ip_masq_app_pkt_in(struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
+{
+ struct ip_masq_app * mapp;
+ struct iphdr *iph;
+ struct tcphdr *th;
+ int diff;
+ __u32 seq;
+
+ /*
+ * check if application masquerading is bound to
+ * this ip_masq.
+ * assumes that once an ip_masq is bound,
+ * it will not be unbound during its life.
+ */
+
+ if ( (mapp = ms->app) == NULL)
+ return 0;
+
+ iph = (*skb_p)->nh.iph;
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+
+ /*
+ * Remember seq number in case this pkt gets resized
+ */
+
+ seq = ntohl(th->seq);
+
+ /*
+ * Fix seq stuff if flagged as so.
+ */
+
+ if (ms->protocol == IPPROTO_TCP) {
+ if (ms->flags & IP_MASQ_F_IN_SEQ)
+ masq_fix_seq(&ms->in_seq, th);
+ if (ms->flags & IP_MASQ_F_OUT_SEQ)
+ masq_fix_ack_seq(&ms->out_seq, th);
+ }
+
+ /*
+ * Call private input hook function
+ */
+
+ if ( mapp->pkt_in == NULL )
+ return 0;
+
+ diff = mapp->pkt_in(mapp, ms, skb_p, maddr);
+
+ /*
+ * Update ip_masq seq stuff if len has changed.
+ */
+
+ if (diff != 0 && ms->protocol == IPPROTO_TCP)
+ masq_seq_update(ms, &ms->in_seq, IP_MASQ_F_IN_SEQ, seq, diff);
+
+ return diff;
+}
+
+/*
+ * /proc/ip_masq_app entry function
+ */
+
+int ip_masq_app_getinfo(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+ off_t pos=0, begin=0;
+ int len=0;
+ struct ip_masq_app * mapp;
+ unsigned idx;
+
+ if (offset < 40)
+ len=sprintf(buffer,"%-39s\n", "prot port n_attach name");
+ pos = 40;
+
+ for (idx=0 ; idx < IP_MASQ_APP_TAB_SIZE; idx++)
+ for (mapp = ip_masq_app_base[idx]; mapp ; mapp = mapp->next) {
+ /*
+ * If you change the length of this sprintf, then all
+ * the length calculations need fixing too!
+ * Line length = 40 (3 + 2 + 7 + 1 + 7 + 1 + 2 + 17)
+ */
+ pos += 40;
+ if (pos < offset)
+ continue;
+
+ len += sprintf(buffer+len, "%-3s %-7u %-7d %-17s\n",
+ masq_proto_name(IP_MASQ_APP_PROTO(mapp->type)),
+ IP_MASQ_APP_PORT(mapp->type), mapp->n_attach,
+ mapp->name);
+
+ if(len >= length)
+ goto done;
+ }
+done:
+ begin = len - (pos - offset);
+ *start = buffer + begin;
+ len -= begin;
+ if (len > length)
+ len = length;
+ return len;
+}
+
+
+#ifdef CONFIG_PROC_FS
+static struct proc_dir_entry proc_net_ip_masq_app = {
+ PROC_NET_IP_MASQ_APP, 3, "app",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ ip_masq_app_getinfo
+};
+#endif
+
+/*
+ * Initialization routine
+ */
+
+__initfunc(int ip_masq_app_init(void))
+{
+#ifdef CONFIG_PROC_FS
+ ip_masq_proc_register(&proc_net_ip_masq_app);
+#endif
+ return 0;
+}
+
+/*
+ * Replace a segment (of skb->data) with a new one.
+ * FIXME: Should re-use same skb if space available, this could
+ * be done if n_len < o_len, unless some extra space
+ * were already allocated at driver level :P .
+ */
+
+static struct sk_buff * skb_replace(struct sk_buff *skb, int pri, char *o_buf, int o_len, char *n_buf, int n_len)
+{
+ int maxsize, diff, o_offset;
+ struct sk_buff *n_skb;
+ int offset;
+
+ maxsize = skb->truesize;
+
+ diff = n_len - o_len;
+ o_offset = o_buf - (char*) skb->data;
+
+ if (maxsize <= n_len) {
+ if (diff != 0) {
+ memcpy(skb->data + o_offset + n_len,o_buf + o_len,
+ skb->len - (o_offset + o_len));
+ }
+
+ memcpy(skb->data + o_offset, n_buf, n_len);
+
+ n_skb = skb;
+ skb->len = n_len;
+ skb->end = skb->head+n_len;
+ } else {
+ /*
+ * Sizes differ, make a copy.
+ *
+	 * FIXME: move this to core/skbuff.c:skb_grow()
+ */
+
+ n_skb = alloc_skb(MAX_HEADER + skb->len + diff, pri);
+ if (n_skb == NULL) {
+ IP_MASQ_ERR("skb_replace(): no room left (from %p)\n",
+ __builtin_return_address(0));
+ return skb;
+
+ }
+ skb_reserve(n_skb, MAX_HEADER);
+ skb_put(n_skb, skb->len + diff);
+
+ /*
+ * Copy as much data from the old skb as possible. Even
+ * though we're only forwarding packets, we need stuff
+ * like skb->protocol (PPP driver wants it).
+ */
+ offset = n_skb->data - skb->data;
+ n_skb->nh.raw = skb->nh.raw + offset;
+ n_skb->h.raw = skb->h.raw + offset;
+ n_skb->dev = skb->dev;
+ n_skb->mac.raw = skb->mac.raw + offset;
+ n_skb->pkt_type = skb->pkt_type;
+ n_skb->protocol = skb->protocol;
+ n_skb->ip_summed = skb->ip_summed;
+ n_skb->dst = dst_clone(skb->dst);
+
+ /*
+ * Copy pkt in new buffer
+ */
+
+ memcpy(n_skb->data, skb->data, o_offset);
+ memcpy(n_skb->data + o_offset, n_buf, n_len);
+ memcpy(n_skb->data + o_offset + n_len, o_buf + o_len,
+ skb->len - (o_offset + o_len) );
+
+ /*
+		 *	Problem: how to replace the new skb with the old one,
+		 *	preferably in place
+ */
+
+ kfree_skb(skb);
+ }
+ return n_skb;
+}
+
+/*
+ *	calls skb_replace() and updates the ip header if a new skb was allocated
+ */
+
+struct sk_buff * ip_masq_skb_replace(struct sk_buff *skb, int pri, char *o_buf, int o_len, char *n_buf, int n_len)
+{
+ int diff;
+ struct sk_buff *n_skb;
+ unsigned skb_len;
+
+	diff = n_len - o_len;
+	skb_len = skb->len;	/* read len now: skb_replace() may free skb */
+	n_skb = skb_replace(skb, pri, o_buf, o_len, n_buf, n_len);
+
+ if (diff)
+ {
+ struct iphdr *iph;
+ IP_MASQ_DEBUG(1, "masq_skb_replace(): pkt resized for %d bytes (len=%d)\n", diff, skb->len);
+ /*
+ * update ip header
+ */
+ iph = n_skb->nh.iph;
+		iph->tot_len = htons(skb_len + diff);
+		/* recompute the header checksum *after* updating tot_len */
+		iph->check = 0;
+		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+ }
+ return n_skb;
+}
diff --git a/pfinet/linux-src/net/ipv4/ip_masq_autofw.c b/pfinet/linux-src/net/ipv4/ip_masq_autofw.c
new file mode 100644
index 00000000..d2a1729c
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_masq_autofw.c
@@ -0,0 +1,448 @@
+/*
+ * IP_MASQ_AUTOFW auto forwarding module
+ *
+ *
+ * $Id: ip_masq_autofw.c,v 1.3 1998/08/29 23:51:10 davem Exp $
+ *
+ * Author: Richard Lynch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ *
+ * Fixes:
+ * Juan Jose Ciarlante : created this new file from ip_masq.c and ip_fw.c
+ * Juan Jose Ciarlante : modularized
+ * Juan Jose Ciarlante : use GFP_KERNEL when creating entries
+ * Juan Jose Ciarlante : call del_timer() when freeing entries (!)
+ * FIXME:
+ * - implement refcnt
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/if.h>
+#include <linux/init.h>
+#include <linux/ip_fw.h>
+#include <net/ip_masq.h>
+#include <net/ip_masq_mod.h>
+#include <linux/ip_masq.h>
+
+#define IP_AUTOFW_EXPIRE 15*HZ
+
+/* WARNING: bitwise equal to ip_autofw_user in linux/ip_masq.h */
+struct ip_autofw {
+ struct ip_autofw * next;
+ __u16 type;
+ __u16 low;
+ __u16 hidden;
+ __u16 high;
+ __u16 visible;
+ __u16 protocol;
+ __u32 lastcontact;
+ __u32 where;
+ __u16 ctlproto;
+ __u16 ctlport;
+ __u16 flags;
+ struct timer_list timer;
+};
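+
+/*
+ * Example (hypothetical rule): an IP_FWD_PORT entry with visible=8080,
+ * hidden=80 and where=192.168.1.5 makes an inbound connection to the
+ * firewall's port 8080 get masqueraded through to 192.168.1.5:80 (see
+ * autofw_in_create() below).
+ */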
+
+/*
+ * Debug level
+ */
+#ifdef CONFIG_IP_MASQ_DEBUG
+static int debug=0;
+MODULE_PARM(debug, "i");
+#endif
+
+/*
+ * Auto-forwarding table
+ */
+
+static struct ip_autofw * ip_autofw_hosts = NULL;
+static struct ip_masq_mod * mmod_self = NULL;
+
+/*
+ * Check if a masq entry should be created for a packet
+ */
+
+static __inline__ struct ip_autofw * ip_autofw_check_range (__u32 where, __u16 port, __u16 protocol, int reqact)
+{
+ struct ip_autofw *af;
+ af=ip_autofw_hosts;
+ port=ntohs(port);
+ while (af) {
+ if (af->type==IP_FWD_RANGE &&
+ port>=af->low &&
+ port<=af->high &&
+ protocol==af->protocol &&
+
+ /*
+ * It's ok to create masq entries after
+ * the timeout if we're in insecure mode
+ */
+ (af->flags & IP_AUTOFW_ACTIVE || !reqact || !(af->flags & IP_AUTOFW_SECURE)) &&
+ (!(af->flags & IP_AUTOFW_SECURE) || af->lastcontact==where || !reqact))
+ return(af);
+ af=af->next;
+ }
+ return(NULL);
+}
+
+static __inline__ struct ip_autofw * ip_autofw_check_port (__u16 port, __u16 protocol)
+{
+ struct ip_autofw *af;
+ af=ip_autofw_hosts;
+ port=ntohs(port);
+ while (af)
+ {
+ if (af->type==IP_FWD_PORT && port==af->visible && protocol==af->protocol)
+ return(af);
+ af=af->next;
+ }
+ return(NULL);
+}
+
+static __inline__ struct ip_autofw * ip_autofw_check_direct (__u16 port, __u16 protocol)
+{
+ struct ip_autofw *af;
+ af=ip_autofw_hosts;
+ port=ntohs(port);
+ while (af)
+ {
+ if (af->type==IP_FWD_DIRECT && af->low<=port && af->high>=port)
+ return(af);
+ af=af->next;
+ }
+ return(NULL);
+}
+
+static __inline__ void ip_autofw_update_out (__u32 who, __u32 where, __u16 port, __u16 protocol)
+{
+ struct ip_autofw *af;
+ af=ip_autofw_hosts;
+ port=ntohs(port);
+ while (af)
+ {
+ if (af->type==IP_FWD_RANGE && af->ctlport==port && af->ctlproto==protocol)
+ {
+ if (af->flags & IP_AUTOFW_USETIME)
+ {
+ mod_timer(&af->timer,
+ jiffies+IP_AUTOFW_EXPIRE);
+ }
+ af->flags|=IP_AUTOFW_ACTIVE;
+ af->lastcontact=where;
+ af->where=who;
+ }
+ af=af->next;
+ }
+}
+
+#if 0
+static __inline__ void ip_autofw_update_in (__u32 where, __u16 port, __u16 protocol)
+{
+ struct ip_autofw *af;
+ af=ip_autofw_check_range(where, port,protocol);
+ if (af)
+ {
+ mod_timer(&af->timer, jiffies+IP_AUTOFW_EXPIRE);
+ }
+}
+#endif
+
+
+static __inline__ void ip_autofw_expire(unsigned long data)
+{
+ struct ip_autofw * af;
+ af=(struct ip_autofw *) data;
+ af->flags &= ~IP_AUTOFW_ACTIVE;
+ af->timer.expires=0;
+ af->lastcontact=0;
+ if (af->flags & IP_AUTOFW_SECURE)
+ af->where=0;
+}
+
+
+
+static __inline__ int ip_autofw_add(struct ip_autofw_user * af)
+{
+ struct ip_autofw * newaf;
+	newaf = kmalloc( sizeof(struct ip_autofw), GFP_KERNEL );
+	if ( newaf == NULL )
+	{
+		printk("ip_autofw_add: malloc said no\n");
+		return( ENOMEM );
+	}
+	init_timer(&newaf->timer);	/* only touch the entry once the alloc has succeeded */
+
+ MOD_INC_USE_COUNT;
+
+ memcpy(newaf, af, sizeof(struct ip_autofw_user));
+ newaf->timer.data = (unsigned long) newaf;
+ newaf->timer.function = ip_autofw_expire;
+ newaf->timer.expires = 0;
+ newaf->lastcontact=0;
+ newaf->next=ip_autofw_hosts;
+ ip_autofw_hosts=newaf;
+ ip_masq_mod_inc_nent(mmod_self);
+ return(0);
+}
+
+static __inline__ int ip_autofw_del(struct ip_autofw_user * af)
+{
+ struct ip_autofw ** af_p, *curr;
+
+ for (af_p=&ip_autofw_hosts, curr=*af_p; (curr=*af_p); af_p = &(*af_p)->next) {
+ if (af->type == curr->type &&
+ af->low == curr->low &&
+ af->high == curr->high &&
+ af->hidden == curr->hidden &&
+ af->visible == curr->visible &&
+ af->protocol == curr->protocol &&
+ af->where == curr->where &&
+ af->ctlproto == curr->ctlproto &&
+ af->ctlport == curr->ctlport)
+ {
+ ip_masq_mod_dec_nent(mmod_self);
+ *af_p = curr->next;
+			if (curr->flags&IP_AUTOFW_ACTIVE)	/* check the entry being freed, not the user copy */
+ del_timer(&curr->timer);
+ kfree_s(curr,sizeof(struct ip_autofw));
+ MOD_DEC_USE_COUNT;
+ return 0;
+ }
+ }
+ return EINVAL;
+}
+
+static __inline__ int ip_autofw_flush(void)
+{
+ struct ip_autofw * af;
+
+ while (ip_autofw_hosts)
+ {
+ af=ip_autofw_hosts;
+ ip_masq_mod_dec_nent(mmod_self);
+ ip_autofw_hosts=ip_autofw_hosts->next;
+ if (af->flags&IP_AUTOFW_ACTIVE)
+ del_timer(&af->timer);
+ kfree_s(af,sizeof(struct ip_autofw));
+ MOD_DEC_USE_COUNT;
+ }
+ return(0);
+}
+
+/*
+ * Methods for registered object
+ */
+
+static int autofw_ctl(int optname, struct ip_masq_ctl *mctl, int optlen)
+{
+ struct ip_autofw_user *af = &mctl->u.autofw_user;
+
+ switch (mctl->m_cmd) {
+ case IP_MASQ_CMD_ADD:
+ case IP_MASQ_CMD_INSERT:
+ if (optlen<sizeof(*af))
+ return EINVAL;
+ return ip_autofw_add(af);
+ case IP_MASQ_CMD_DEL:
+ if (optlen<sizeof(*af))
+ return EINVAL;
+ return ip_autofw_del(af);
+ case IP_MASQ_CMD_FLUSH:
+ return ip_autofw_flush();
+
+ }
+ return EINVAL;
+}
+
+
+static int autofw_out_update(const struct sk_buff *skb, const struct iphdr *iph, struct ip_masq *ms)
+{
+ const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
+ /*
+ * Update any ipautofw entries ...
+ */
+
+ ip_autofw_update_out(iph->saddr, iph->daddr, portp[1], iph->protocol);
+ return IP_MASQ_MOD_NOP;
+}
+
+static struct ip_masq * autofw_out_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr)
+{
+ const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
+ /*
+ * If the source port is supposed to match the masq port, then
+ * make it so
+ */
+
+ if (ip_autofw_check_direct(portp[1],iph->protocol)) {
+ return ip_masq_new(iph->protocol,
+ maddr, portp[0],
+ iph->saddr, portp[0],
+ iph->daddr, portp[1],
+ 0);
+ }
+ return NULL;
+}
+
+#if 0
+static int autofw_in_update(const struct sk_buff *skb, const struct iphdr *iph, __u16 *portp, struct ip_masq *ms)
+{
+ const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
+ ip_autofw_update_in(iph->saddr, portp[1], iph->protocol);
+ return IP_MASQ_MOD_NOP;
+}
+#endif
+
+static int autofw_in_rule(const struct sk_buff *skb, const struct iphdr *iph)
+{
+ const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
+ return (ip_autofw_check_range(iph->saddr, portp[1], iph->protocol, 0)
+ || ip_autofw_check_direct(portp[1], iph->protocol)
+ || ip_autofw_check_port(portp[1], iph->protocol));
+}
+
+static struct ip_masq * autofw_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr)
+{
+ const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
+ struct ip_autofw *af;
+
+ if ((af=ip_autofw_check_range(iph->saddr, portp[1], iph->protocol, 0))) {
+ IP_MASQ_DEBUG(1-debug, "autofw_check_range HIT\n");
+ return ip_masq_new(iph->protocol,
+ maddr, portp[1],
+ af->where, portp[1],
+ iph->saddr, portp[0],
+ 0);
+ }
+ if ((af=ip_autofw_check_port(portp[1], iph->protocol)) ) {
+ IP_MASQ_DEBUG(1-debug, "autofw_check_port HIT\n");
+ return ip_masq_new(iph->protocol,
+ maddr, htons(af->visible),
+ af->where, htons(af->hidden),
+ iph->saddr, portp[0],
+ 0);
+ }
+ return NULL;
+}
+
+#ifdef CONFIG_PROC_FS
+static int autofw_procinfo(char *buffer, char **start, off_t offset,
+ int length, int unused)
+{
+ off_t pos=0, begin=0;
+ struct ip_autofw * af;
+ int len=0;
+
+ len=sprintf(buffer,"Type Prot Low High Vis Hid Where Last CPto CPrt Timer Flags\n");
+
+ for(af = ip_autofw_hosts; af ; af = af->next)
+ {
+ len+=sprintf(buffer+len,"%4X %4X %04X-%04X/%04X %04X %08lX %08lX %04X %04X %6lu %4X\n",
+ af->type,
+ af->protocol,
+ af->low,
+ af->high,
+ af->visible,
+ af->hidden,
+ ntohl(af->where),
+ ntohl(af->lastcontact),
+ af->ctlproto,
+ af->ctlport,
+ (af->timer.expires<jiffies ? 0 : af->timer.expires-jiffies),
+ af->flags);
+
+ pos=begin+len;
+ if(pos<offset)
+ {
+ len=0;
+ begin=pos;
+ }
+ if(pos>offset+length)
+ break;
+ }
+ *start=buffer+(offset-begin);
+ len-=(offset-begin);
+ if(len>length)
+ len=length;
+ return len;
+}
+
+static struct proc_dir_entry autofw_proc_entry = {
+ 0, 0, NULL,
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ autofw_procinfo
+};
+
+#define proc_ent &autofw_proc_entry
+#else /* !CONFIG_PROC_FS */
+
+#define proc_ent NULL
+#endif
+
+
+#define autofw_in_update NULL
+#define autofw_out_rule NULL
+#define autofw_mod_init NULL
+#define autofw_mod_done NULL
+
+static struct ip_masq_mod autofw_mod = {
+ NULL, /* next */
+ NULL, /* next_reg */
+ "autofw", /* name */
+ ATOMIC_INIT(0), /* nent */
+ ATOMIC_INIT(0), /* refcnt */
+ proc_ent,
+ autofw_ctl,
+ autofw_mod_init,
+ autofw_mod_done,
+ autofw_in_rule,
+ autofw_in_update,
+ autofw_in_create,
+ autofw_out_rule,
+ autofw_out_update,
+ autofw_out_create,
+};
+
+__initfunc(int ip_autofw_init(void))
+{
+ return register_ip_masq_mod ((mmod_self=&autofw_mod));
+}
+
+int ip_autofw_done(void)
+{
+ return unregister_ip_masq_mod(&autofw_mod);
+}
+
+#ifdef MODULE
+EXPORT_NO_SYMBOLS;
+
+int init_module(void)
+{
+ if (ip_autofw_init() != 0)
+ return -EIO;
+ return 0;
+}
+
+void cleanup_module(void)
+{
+ if (ip_autofw_done() != 0)
+ printk(KERN_INFO "ip_autofw_done(): can't remove module");
+}
+
+#endif /* MODULE */
diff --git a/pfinet/linux-src/net/ipv4/ip_masq_cuseeme.c b/pfinet/linux-src/net/ipv4/ip_masq_cuseeme.c
new file mode 100644
index 00000000..9b412baf
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_masq_cuseeme.c
@@ -0,0 +1,264 @@
+/*
+ *		IP_MASQ_CUSEEME CU-SeeMe masquerading module
+ *
+ *
+ * Version: @(#)$Id: ip_masq_cuseeme.c,v 1.4 1998/10/06 04:48:57 davem Exp $
+ *
+ * Author: Richard Lynch
+ *
+ *
+ * Fixes:
+ * Richard Lynch : Updated patch to conform to new module
+ * specifications
+ * Nigel Metheringham : Multiple port support
+ * Michael Owings : Fixed broken init code
+ * Added code to update inbound
+ * packets with correct local addresses.
+ * Fixes audio and "chat" problems
+ * Thanx to the CU-SeeMe Consortium for
+ * technical docs
+ * Steven Clarke : Small changes for 2.1
+ *
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Multiple Port Support
+ * The helper can be made to handle up to MAX_MASQ_APP_PORTS (normally 12)
+ * with the port numbers being defined at module load time. The module
+ * uses the symbol "ports" to define a list of monitored ports, which can
+ * be specified on the insmod command line as
+ * ports=x1,x2,x3...
+ * where x[n] are integer port numbers. This option can be put into
+ * /etc/conf.modules (or /etc/modules.conf depending on your config)
+ * where modload will pick it up should you use modload to load your
+ * modules.
+ *
+ */
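+
+/*
+ * For example (hypothetical port list):
+ *
+ *	insmod ip_masq_cuseeme.o ports=7648,7649
+ *
+ * would attach the helper to UDP ports 7648 and 7649.
+ */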
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+
+/* #define IP_MASQ_NDEBUG */
+#include <net/ip_masq.h>
+
+#pragma pack(1)
+/* CU-SeeMe Data Header */
+typedef struct {
+ u_short dest_family;
+ u_short dest_port;
+ u_long dest_addr;
+ short family;
+ u_short port;
+ u_long addr;
+ u_long seq;
+ u_short msg;
+ u_short data_type;
+ u_short packet_len;
+} cu_header;
+
+/* Open Continue Header */
+typedef struct {
+ cu_header cu_head;
+ u_short client_count; /* Number of client info structs */
+ u_long seq_no;
+ char user_name[20];
+ char stuff[4]; /* flags, version stuff, etc */
+}oc_header;
+
+/* client info structures */
+typedef struct {
+ u_long address; /* Client address */
+ char stuff[8]; /* Flags, pruning bitfield, packet counts etc */
+} client_info;
+#pragma pack()
+
+/*
+ * List of ports (up to MAX_MASQ_APP_PORTS) to be handled by helper
+ * First port is set to the default port.
+ */
+static int ports[MAX_MASQ_APP_PORTS] = {7648}; /* I rely on the trailing items being set to zero */
+struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS];
+
+/*
+ * Debug level
+ */
+#ifdef CONFIG_IP_MASQ_DEBUG
+static int debug=0;
+MODULE_PARM(debug, "i");
+#endif
+
+MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i");
+
+static int
+masq_cuseeme_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
+{
+ MOD_INC_USE_COUNT;
+ return 0;
+}
+
+static int
+masq_cuseeme_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
+{
+ MOD_DEC_USE_COUNT;
+ return 0;
+}
+
+int
+masq_cuseeme_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
+{
+ struct sk_buff *skb = *skb_p;
+ struct iphdr *iph = skb->nh.iph;
+ struct udphdr *uh = (struct udphdr *)&(((char *)iph)[iph->ihl*4]);
+ cu_header *cu_head;
+ char *data=(char *)&uh[1];
+
+ if (skb->len - ((unsigned char *) data - skb->h.raw) >= sizeof(cu_header))
+ {
+ cu_head = (cu_header *) data;
+ /* cu_head->port = ms->mport; */
+ if( cu_head->addr )
+ cu_head->addr = (u_long) maddr;
+ if(ntohs(cu_head->data_type) == 257)
+ IP_MASQ_DEBUG(1-debug, "Sending talk packet!\n");
+ }
+ return 0;
+}
+
+int
+masq_cuseeme_in (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
+{
+ struct sk_buff *skb = *skb_p;
+ struct iphdr *iph = skb->nh.iph;
+ struct udphdr *uh = (struct udphdr *)&(((char *)iph)[iph->ihl*4]);
+ cu_header *cu_head;
+ oc_header *oc;
+ client_info *ci;
+ char *data=(char *)&uh[1];
+ u_short len = skb->len - ((unsigned char *) data - skb->h.raw);
+ int i, off;
+
+ if (len >= sizeof(cu_header))
+ {
+ cu_head = (cu_header *) data;
+ if(cu_head->dest_addr) /* Correct destination address */
+ cu_head->dest_addr = (u_long) ms->saddr;
+ if(ntohs(cu_head->data_type)==101 && len > sizeof(oc_header))
+ {
+ oc = (oc_header * ) data;
+ /* Spin (grovel) thru client_info structs till we find our own */
+ off=sizeof(oc_header);
+ for(i=0;
+ (i < oc->client_count && off+sizeof(client_info) <= len);
+ i++)
+ {
+ ci=(client_info *)(data+off);
+ if(ci->address==(u_long) maddr)
+ {
+ /* Update w/ our real ip address and exit */
+ ci->address = (u_long) ms->saddr;
+ break;
+ }
+ else
+ off+=sizeof(client_info);
+ }
+ }
+ }
+ return 0;
+}
+
+struct ip_masq_app ip_masq_cuseeme = {
+ NULL, /* next */
+ "cuseeme",
+ 0, /* type */
+ 0, /* n_attach */
+ masq_cuseeme_init_1, /* ip_masq_init_1 */
+ masq_cuseeme_done_1, /* ip_masq_done_1 */
+ masq_cuseeme_out, /* pkt_out */
+ masq_cuseeme_in /* pkt_in */
+};
+
+
+/*
+ * ip_masq_cuseeme initialization
+ */
+
+__initfunc(int ip_masq_cuseeme_init(void))
+{
+ int i, j;
+
+ for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
+ if (ports[i]) {
+ if ((masq_incarnations[i] = kmalloc(sizeof(struct ip_masq_app),
+ GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+ memcpy(masq_incarnations[i], &ip_masq_cuseeme, sizeof(struct ip_masq_app));
+ if ((j = register_ip_masq_app(masq_incarnations[i],
+ IPPROTO_UDP,
+ ports[i]))) {
+ return j;
+ }
+#if DEBUG_CONFIG_IP_MASQ_CUSEEME
+ IP_MASQ_DEBUG(1-debug, "CuSeeMe: loaded support on port[%d] = %d\n",
+ i, ports[i]);
+#endif
+ } else {
+ /* To be safe, force the incarnation table entry to NULL */
+ masq_incarnations[i] = NULL;
+ }
+ }
+ return 0;
+}
+
+/*
+ * ip_masq_cuseeme fin.
+ */
+
+int ip_masq_cuseeme_done(void)
+{
+ int i, j, k;
+
+ k=0;
+ for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
+ if (masq_incarnations[i]) {
+ if ((j = unregister_ip_masq_app(masq_incarnations[i]))) {
+ k = j;
+ } else {
+ kfree(masq_incarnations[i]);
+ masq_incarnations[i] = NULL;
+ IP_MASQ_DEBUG(1-debug, "CuSeeMe: unloaded support on port[%d] = %d\n", i, ports[i]);
+ }
+ }
+ }
+ return k;
+}
+
+#ifdef MODULE
+EXPORT_NO_SYMBOLS;
+
+int init_module(void)
+{
+ if (ip_masq_cuseeme_init() != 0)
+ return -EIO;
+ return 0;
+}
+
+void cleanup_module(void)
+{
+ if (ip_masq_cuseeme_done() != 0)
+ IP_MASQ_DEBUG(1-debug, "ip_masq_cuseeme: can't remove module");
+}
+
+#endif /* MODULE */
diff --git a/pfinet/linux-src/net/ipv4/ip_masq_ftp.c b/pfinet/linux-src/net/ipv4/ip_masq_ftp.c
new file mode 100644
index 00000000..35d1f544
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_masq_ftp.c
@@ -0,0 +1,393 @@
+/*
+ * IP_MASQ_FTP ftp masquerading module
+ *
+ *
+ * Version: @(#)ip_masq_ftp.c 0.04 02/05/96
+ *
+ * Author: Wouter Gadeyne
+ *
+ *
+ * Fixes:
+ * Wouter Gadeyne : Fixed masquerading support of ftp PORT commands
+ * Juan Jose Ciarlante : Code moved and adapted from ip_fw.c
+ * Keith Owens : Add keep alive for ftp control channel
+ * Nigel Metheringham : Added multiple port support
+ * Juan Jose Ciarlante : Use control_add() for ftp control chan
+ * Juan Jose Ciarlante : Litl bits for 2.1
+ * Juan Jose Ciarlante : use ip_masq_listen()
+ * Juan Jose Ciarlante : use private app_data for own flag(s)
+ *
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Multiple Port Support
+ * The helper can be made to handle up to MAX_MASQ_APP_PORTS (normally 12)
+ * with the port numbers being defined at module load time. The module
+ * uses the symbol "ports" to define a list of monitored ports, which can
+ * be specified on the insmod command line as
+ * ports=x1,x2,x3...
+ * where x[n] are integer port numbers. This option can be put into
+ * /etc/conf.modules (or /etc/modules.conf depending on your config)
+ * where modload will pick it up should you use modload to load your
+ * modules.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+
+/* #define IP_MASQ_NDEBUG */
+#include <net/ip_masq.h>
+
+
+/*
+ * List of ports (up to MAX_MASQ_APP_PORTS) to be handled by helper
+ * First port is set to the default port.
+ */
+static int ports[MAX_MASQ_APP_PORTS] = {21}; /* I rely on the trailing items being set to zero */
+struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS];
+
+/*
+ * Debug level
+ */
+#ifdef CONFIG_IP_MASQ_DEBUG
+static int debug=0;
+MODULE_PARM(debug, "i");
+#endif
+
+MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i");
+
+/* Dummy variable */
+static int masq_ftp_pasv;
+
+static int
+masq_ftp_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
+{
+ MOD_INC_USE_COUNT;
+ return 0;
+}
+
+static int
+masq_ftp_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
+{
+ MOD_DEC_USE_COUNT;
+ return 0;
+}
+
+int
+masq_ftp_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *p, *data, *data_limit;
+ unsigned char p1,p2,p3,p4,p5,p6;
+ __u32 from;
+ __u16 port;
+ struct ip_masq *n_ms;
+ char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
+ unsigned buf_len;
+ int diff;
+
+ skb = *skb_p;
+ iph = skb->nh.iph;
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+ data = (char *)&th[1];
+
+ data_limit = skb->h.raw + skb->len - 18;
+ if (skb->len >= 6 && (memcmp(data, "PASV\r\n", 6) == 0 || memcmp(data, "pasv\r\n", 6) == 0))
+ ms->app_data = &masq_ftp_pasv;
+
+ while (data < data_limit)
+ {
+ if (memcmp(data,"PORT ",5) && memcmp(data,"port ",5))
+ {
+ data ++;
+ continue;
+ }
+ p = data+5;
+ p1 = simple_strtoul(data+5,&data,10);
+ if (*data!=',')
+ continue;
+ p2 = simple_strtoul(data+1,&data,10);
+ if (*data!=',')
+ continue;
+ p3 = simple_strtoul(data+1,&data,10);
+ if (*data!=',')
+ continue;
+ p4 = simple_strtoul(data+1,&data,10);
+ if (*data!=',')
+ continue;
+ p5 = simple_strtoul(data+1,&data,10);
+ if (*data!=',')
+ continue;
+ p6 = simple_strtoul(data+1,&data,10);
+ if (*data!='\r' && *data!='\n')
+ continue;
+
+ from = (p1<<24) | (p2<<16) | (p3<<8) | p4;
+ port = (p5<<8) | p6;
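+
+		/*
+		 * Worked example (hypothetical command): "PORT 192,168,1,2,4,1"
+		 * gives from = 0xC0A80102 (192.168.1.2) and port = 4*256+1 = 1025.
+		 */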
+
+ IP_MASQ_DEBUG(1-debug, "PORT %X:%X detected\n",from,port);
+
+ /*
+		 * Now update or create a masquerade entry for it
+ */
+
+ IP_MASQ_DEBUG(1-debug, "protocol %d %lX:%X %X:%X\n", iph->protocol, htonl(from), htons(port), iph->daddr, 0);
+
+ n_ms = ip_masq_out_get(iph->protocol,
+ htonl(from), htons(port),
+ iph->daddr, 0);
+ if (!n_ms) {
+ n_ms = ip_masq_new(IPPROTO_TCP,
+ maddr, 0,
+ htonl(from), htons(port),
+ iph->daddr, 0,
+ IP_MASQ_F_NO_DPORT);
+
+ if (n_ms==NULL)
+ return 0;
+ ip_masq_control_add(n_ms, ms);
+ }
+
+ /*
+ * Replace the old PORT with the new one
+ */
+ from = ntohl(n_ms->maddr);
+ port = ntohs(n_ms->mport);
+ sprintf(buf,"%d,%d,%d,%d,%d,%d",
+ from>>24&255,from>>16&255,from>>8&255,from&255,
+ port>>8&255,port&255);
+ buf_len = strlen(buf);
+
+ IP_MASQ_DEBUG(1-debug, "new PORT %X:%X\n",from,port);
+
+ /*
+ * Calculate required delta-offset to keep TCP happy
+ */
+
+ diff = buf_len - (data-p);
+
+ /*
+ * No shift.
+ */
+
+ if (diff==0) {
+ /*
+ * simple case, just replace the old PORT cmd
+ */
+ memcpy(p,buf,buf_len);
+ } else {
+
+ *skb_p = ip_masq_skb_replace(skb, GFP_ATOMIC, p, data-p, buf, buf_len);
+ }
+ /*
+ * Move tunnel to listen state
+ */
+ ip_masq_listen(n_ms);
+ ip_masq_put(n_ms);
+
+ return diff;
+
+ }
+ return 0;
+
+}
+
+/*
+ * Look at incoming ftp packets to catch the response to a PASV command. When
+ * we see one we build a masquerading entry for the client address, client port
+ * 0 (unknown at the moment), the server address and the server port. Mark the
+ * current masquerade entry as a control channel and point the new entry at the
+ * control entry. All this work just for ftp keepalive across masquerading.
+ *
+ * The incoming packet should be something like
+ * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
+ * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ * ncftp 2.3.0 cheats by skipping the leading number then going 22 bytes into
+ * the data so we do the same. If it's good enough for ncftp then it's good
+ * enough for me.
+ *
+ * In this case, the client is the source machine being masqueraded, the server
+ * is the destination for ftp requests. It all depends on your point of view ...
+ */
+
+int
+masq_ftp_in (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *data, *data_limit;
+ unsigned char p1,p2,p3,p4,p5,p6;
+ __u32 to;
+ __u16 port;
+ struct ip_masq *n_ms;
+
+ if (ms->app_data != &masq_ftp_pasv)
+ return 0; /* quick exit if no outstanding PASV */
+
+ skb = *skb_p;
+ iph = skb->nh.iph;
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+ data = (char *)&th[1];
+ data_limit = skb->h.raw + skb->len;
+
+ while (data < data_limit && *data != ' ')
+ ++data;
+ while (data < data_limit && *data == ' ')
+ ++data;
+ data += 22;
+ if (data >= data_limit || *data != '(')
+ return 0;
+ p1 = simple_strtoul(data+1, &data, 10);
+ if (data >= data_limit || *data != ',')
+ return 0;
+ p2 = simple_strtoul(data+1, &data, 10);
+ if (data >= data_limit || *data != ',')
+ return 0;
+ p3 = simple_strtoul(data+1, &data, 10);
+ if (data >= data_limit || *data != ',')
+ return 0;
+ p4 = simple_strtoul(data+1, &data, 10);
+ if (data >= data_limit || *data != ',')
+ return 0;
+ p5 = simple_strtoul(data+1, &data, 10);
+ if (data >= data_limit || *data != ',')
+ return 0;
+ p6 = simple_strtoul(data+1, &data, 10);
+ if (data >= data_limit || *data != ')')
+ return 0;
+
+ to = (p1<<24) | (p2<<16) | (p3<<8) | p4;
+ port = (p5<<8) | p6;
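+
+	/*
+	 * e.g. (hypothetical reply) "227 Entering Passive Mode (10,0,0,9,8,1)"
+	 * gives to = 0x0A000009 (10.0.0.9) and port = 8*256+1 = 2049.
+	 */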
+
+ /*
+	 * Now update or create a masquerade entry for it
+ */
+ IP_MASQ_DEBUG(1-debug, "PASV response %lX:%X %X:%X detected\n", ntohl(ms->saddr), 0, to, port);
+
+ n_ms = ip_masq_out_get(iph->protocol,
+ ms->saddr, 0,
+ htonl(to), htons(port));
+ if (!n_ms) {
+ n_ms = ip_masq_new(IPPROTO_TCP,
+ maddr, 0,
+ ms->saddr, 0,
+ htonl(to), htons(port),
+ IP_MASQ_F_NO_SPORT);
+
+ if (n_ms==NULL)
+ return 0;
+ ip_masq_control_add(n_ms, ms);
+ }
+
+#if 0 /* v0.12 state processing */
+
+ /*
+ * keep for a bit longer than tcp_fin, client may not issue open
+ * to server port before tcp_fin_timeout.
+ */
+ n_ms->timeout = ip_masq_expire->tcp_fin_timeout*3;
+#endif
+ ms->app_data = NULL;
+ ip_masq_put(n_ms);
+
+ return 0; /* no diff required for incoming packets, thank goodness */
+}
+
+struct ip_masq_app ip_masq_ftp = {
+ NULL, /* next */
+ "ftp", /* name */
+ 0, /* type */
+ 0, /* n_attach */
+ masq_ftp_init_1, /* ip_masq_init_1 */
+ masq_ftp_done_1, /* ip_masq_done_1 */
+ masq_ftp_out, /* pkt_out */
+ masq_ftp_in, /* pkt_in */
+};
+
+/*
+ * ip_masq_ftp initialization
+ */
+
+__initfunc(int ip_masq_ftp_init(void))
+{
+ int i, j;
+
+ for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
+ if (ports[i]) {
+ if ((masq_incarnations[i] = kmalloc(sizeof(struct ip_masq_app),
+ GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+ memcpy(masq_incarnations[i], &ip_masq_ftp, sizeof(struct ip_masq_app));
+ if ((j = register_ip_masq_app(masq_incarnations[i],
+ IPPROTO_TCP,
+ ports[i]))) {
+ return j;
+ }
+ IP_MASQ_DEBUG(1-debug, "Ftp: loaded support on port[%d] = %d\n",
+ i, ports[i]);
+ } else {
+ /* To be safe, force the incarnation table entry to NULL */
+ masq_incarnations[i] = NULL;
+ }
+ }
+ return 0;
+}
+
+/*
+ * ip_masq_ftp fin.
+ */
+
+int ip_masq_ftp_done(void)
+{
+ int i, j, k;
+
+ k=0;
+ for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
+ if (masq_incarnations[i]) {
+ if ((j = unregister_ip_masq_app(masq_incarnations[i]))) {
+ k = j;
+ } else {
+ kfree(masq_incarnations[i]);
+ masq_incarnations[i] = NULL;
+ IP_MASQ_DEBUG(1-debug, "Ftp: unloaded support on port[%d] = %d\n",
+ i, ports[i]);
+ }
+ }
+ }
+ return k;
+}
+
+#ifdef MODULE
+EXPORT_NO_SYMBOLS;
+
+int init_module(void)
+{
+ if (ip_masq_ftp_init() != 0)
+ return -EIO;
+ return 0;
+}
+
+void cleanup_module(void)
+{
+ if (ip_masq_ftp_done() != 0)
+ printk(KERN_INFO "ip_masq_ftp: can't remove module");
+}
+
+#endif /* MODULE */
diff --git a/pfinet/linux-src/net/ipv4/ip_masq_irc.c b/pfinet/linux-src/net/ipv4/ip_masq_irc.c
new file mode 100644
index 00000000..e52a5720
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_masq_irc.c
@@ -0,0 +1,345 @@
+/*
+ * IP_MASQ_IRC irc masquerading module
+ *
+ *
+ * Version: @(#)ip_masq_irc.c 0.04 99/06/19
+ *
+ * Author: Juan Jose Ciarlante
+ *
+ * Additions:
+ * - recognize a few non-irc-II DCC requests (Oliver Wagner)
+ * DCC MOVE (AmIRC/DCC.MOVE; SEND with resuming)
+ * DCC SCHAT (AmIRC IDEA encrypted CHAT)
+ * DCC TSEND (AmIRC/PIRCH SEND without ACKs)
+ * Fixes:
+ * Juan Jose Ciarlante : set NO_DADDR flag in ip_masq_new()
+ * Nigel Metheringham : Added multiple port support
+ *	Juan Jose Ciarlante	: little bits for 2.1
+ * Oliver Wagner : more IRC cmds processing
+ * <winmute@lucifer.gv.kotnet.org>
+ * Juan Jose Ciarlante : put new ms entry to listen()
+ * Scottie Shore : added support for clients that add extra args
+ * <sshore@escape.ca>
+ *
+ * FIXME:
+ *	- also detect a preceding "PRIVMSG" string?
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Multiple Port Support
+ *	The helper can be made to handle up to MAX_MASQ_APP_PORTS (normally 12) ports,
+ * with the port numbers being defined at module load time. The module
+ * uses the symbol "ports" to define a list of monitored ports, which can
+ * be specified on the insmod command line as
+ * ports=x1,x2,x3...
+ * where x[n] are integer port numbers. This option can be put into
+ * /etc/conf.modules (or /etc/modules.conf depending on your config)
+ * where modload will pick it up should you use modload to load your
+ * modules.
+ *
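+ *	Example invocation (hypothetical):
+ *		insmod ip_masq_irc.o ports=6665,6666,6667
+ *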
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <asm/system.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/ip_masq.h>
+
+
+/*
+ * List of ports (up to MAX_MASQ_APP_PORTS) to be handled by helper
+ * First port is set to the default port.
+ */
+int ports[MAX_MASQ_APP_PORTS] = {6667}; /* I rely on the trailing items being set to zero */
+struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS];
+/*
+ * Debug level
+ */
+#ifdef CONFIG_IP_MASQ_DEBUG
+static int debug=0;
+MODULE_PARM(debug, "i");
+#endif
+
+MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i");
+
+
+/*
+ * List of supported DCC protocols
+ */
+
+#define NUM_DCCPROTO 5
+
+struct dccproto
+{
+ char *match;
+ int matchlen;
+};
+
+struct dccproto dccprotos[NUM_DCCPROTO] = {
+ { "SEND ", 5 },
+ { "CHAT ", 5 },
+ { "MOVE ", 5 },
+ { "TSEND ", 6 },
+ { "SCHAT ", 6 }
+};
+#define MAXMATCHLEN 6
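+/* Note: MAXMATCHLEN must stay >= the longest matchlen above ("TSEND "/"SCHAT ") */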
+
+static int
+masq_irc_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
+{
+ MOD_INC_USE_COUNT;
+ return 0;
+}
+
+static int
+masq_irc_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
+{
+ MOD_DEC_USE_COUNT;
+ return 0;
+}
+
+int
+masq_irc_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *data, *data_limit;
+ __u32 s_addr;
+ __u16 s_port;
+ struct ip_masq *n_ms;
+ char buf[20]; /* "m_addr m_port" (dec base)*/
+ unsigned buf_len;
+ int diff;
+ char *dcc_p, *addr_beg_p, *addr_end_p;
+
+ skb = *skb_p;
+ iph = skb->nh.iph;
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+ data = (char *)&th[1];
+
+ /*
+ * Hunt irc DCC string, the _shortest_:
+ *
+ * strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
+ * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
+ * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26
+ * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26
+ * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27
+ * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits)
+ * P: bound port (min 1 d )
+ * F: filename (min 1 d )
+ * S: size (min 1 d )
+ * 0x01, \n: terminators
+ */
+
+ data_limit = skb->h.raw + skb->len;
+
+ while (data < (data_limit - ( 22 + MAXMATCHLEN ) ) )
+ {
+ int i;
+ if (memcmp(data,"\1DCC ",5)) {
+ data ++;
+ continue;
+ }
+
+ dcc_p = data;
+ data += 5; /* point to DCC cmd */
+
+ for(i=0; i<NUM_DCCPROTO; i++)
+ {
+ /*
+ * go through the table and hunt a match string
+ */
+
+ if( memcmp(data, dccprotos[i].match, dccprotos[i].matchlen ) == 0 )
+ {
+ data += dccprotos[i].matchlen;
+
+ /*
+ * skip next string.
+ */
+
+				while (*data++ != ' ') {
+					/*
+					 * must still parse, at least,
+					 * "AAAAAAAA P\1\n": 12 bytes left.
+					 */
+					if (data > (data_limit-12))
+						return 0;
+				}
+
+
+ addr_beg_p = data;
+
+ /*
+ * client bound address in dec base
+ */
+
+ s_addr = simple_strtoul(data,&data,10);
+ if (*data++ !=' ')
+ continue;
+
+ /*
+ * client bound port in dec base
+ */
+
+ s_port = simple_strtoul(data,&data,10);
+ addr_end_p = data;
+
+ /*
+			 * Now create a masquerade entry for it;
+ * must set NO_DPORT and NO_DADDR because
+ * connection is requested by another client.
+ */
+
+ n_ms = ip_masq_new(IPPROTO_TCP,
+ maddr, 0,
+ htonl(s_addr),htons(s_port),
+ 0, 0,
+ IP_MASQ_F_NO_DPORT|IP_MASQ_F_NO_DADDR);
+ if (n_ms==NULL)
+ return 0;
+
+ /*
+ * Replace the old "address port" with the new one
+ */
+
+ buf_len = sprintf(buf,"%lu %u",
+ ntohl(n_ms->maddr),ntohs(n_ms->mport));
+
+ /*
+ * Calculate required delta-offset to keep TCP happy
+ */
+
+ diff = buf_len - (addr_end_p-addr_beg_p);
+
+ *addr_beg_p = '\0';
+ IP_MASQ_DEBUG(1-debug, "masq_irc_out(): '%s' %X:%X detected (diff=%d)\n", dcc_p, s_addr,s_port, diff);
+
+ /*
+ * No shift.
+ */
+
+ if (diff==0) {
+ /*
+ * simple case, just copy.
+ */
+ memcpy(addr_beg_p,buf,buf_len);
+ } else {
+
+ *skb_p = ip_masq_skb_replace(skb, GFP_ATOMIC,
+ addr_beg_p, addr_end_p-addr_beg_p,
+ buf, buf_len);
+ }
+ ip_masq_listen(n_ms);
+ ip_masq_put(n_ms);
+ return diff;
+ }
+ }
+ }
+ return 0;
+
+}
+
+/*
+ * Main irc object
+ * You need 1 object per port in case you need
+ * to offer also other used irc ports (6665,6666,etc),
+ * they will share methods but they need own space for
+ * data.
+ */
+
+struct ip_masq_app ip_masq_irc = {
+ NULL, /* next */
+ "irc", /* name */
+ 0, /* type */
+ 0, /* n_attach */
+ masq_irc_init_1, /* init_1 */
+ masq_irc_done_1, /* done_1 */
+ masq_irc_out, /* pkt_out */
+ NULL /* pkt_in */
+};
+
+/*
+ * ip_masq_irc initialization
+ */
+
+__initfunc(int ip_masq_irc_init(void))
+{
+ int i, j;
+
+ for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
+ if (ports[i]) {
+ if ((masq_incarnations[i] = kmalloc(sizeof(struct ip_masq_app),
+ GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+ memcpy(masq_incarnations[i], &ip_masq_irc, sizeof(struct ip_masq_app));
+ if ((j = register_ip_masq_app(masq_incarnations[i],
+ IPPROTO_TCP,
+ ports[i]))) {
+ return j;
+ }
+ IP_MASQ_DEBUG(1-debug,
+ "Irc: loaded support on port[%d] = %d\n",
+ i, ports[i]);
+ } else {
+ /* To be safe, force the incarnation table entry to NULL */
+ masq_incarnations[i] = NULL;
+ }
+ }
+ return 0;
+}
+
+/*
+ * ip_masq_irc fin.
+ */
+
+int ip_masq_irc_done(void)
+{
+ int i, j, k;
+
+ k=0;
+ for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
+ if (masq_incarnations[i]) {
+ if ((j = unregister_ip_masq_app(masq_incarnations[i]))) {
+ k = j;
+ } else {
+ kfree(masq_incarnations[i]);
+ masq_incarnations[i] = NULL;
+ IP_MASQ_DEBUG(1-debug, "Irc: unloaded support on port[%d] = %d\n",
+ i, ports[i]);
+ }
+ }
+ }
+ return k;
+}
+
+
+#ifdef MODULE
+EXPORT_NO_SYMBOLS;
+
+int init_module(void)
+{
+ if (ip_masq_irc_init() != 0)
+ return -EIO;
+ return 0;
+}
+
+void cleanup_module(void)
+{
+ if (ip_masq_irc_done() != 0)
+ printk(KERN_INFO "ip_masq_irc: can't remove module");
+}
+
+#endif /* MODULE */
diff --git a/pfinet/linux-src/net/ipv4/ip_masq_mfw.c b/pfinet/linux-src/net/ipv4/ip_masq_mfw.c
new file mode 100644
index 00000000..60c77970
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_masq_mfw.c
@@ -0,0 +1,769 @@
+/*
+ * IP_MASQ_MARKFW masquerading module
+ *
+ * Does (reverse-masq) forwarding based on skb->fwmark value
+ *
+ * $Id: ip_masq_mfw.c,v 1.3.2.1 1999/07/02 10:10:03 davem Exp $
+ *
+ * Author: Juan Jose Ciarlante <jjciarla@raiz.uncu.edu.ar>
+ * based on Steven Clarke's portfw
+ *
+ * Fixes:
+ * JuanJo Ciarlante: added u-space sched support
+ * JuanJo Ciarlante: if rport==0, use packet dest port *grin*
+ * JuanJo Ciarlante: fixed tcp syn&&!ack creation
+ *
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <net/ip.h>
+#include <linux/ip_fw.h>
+#include <linux/ip_masq.h>
+#include <net/ip_masq.h>
+#include <net/ip_masq_mod.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <asm/softirq.h>
+#include <asm/spinlock.h>
+#include <asm/atomic.h>
+
+static struct ip_masq_mod *mmod_self = NULL;
+#ifdef CONFIG_IP_MASQ_DEBUG
+static int debug=0;
+MODULE_PARM(debug, "i");
+#endif
+
+/*
+ * Lists structure:
+ * There is a "main" linked list with entries hashed
+ * by fwmark value (struct ip_masq_mfw, the "m-entries").
+ *
+ * Each of these m-entries holds a doubly linked list
+ * of "forward-to" hosts (struct ip_masq_mfw_host, the "m.host"),
+ * the round-robin scheduling takes place by rotating m.host entries
+ * "inside" its m-entry.
+ */
+
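+/*
+ * Illustrative layout (sketch, not in the original comment); fwmarks
+ * 1 and 17 share bucket 1 since mfw_hash_val() keeps the low 4 bits:
+ *
+ *   ip_masq_mfw_table[1] -> m-entry(fwmark=17) -> m-entry(fwmark=1)
+ *                                 |
+ *                                 +-> m.host <-> m.host <-> ...
+ */
+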
+/*
+ * Each forwarded host (addr:port) is stored here
+ */
+struct ip_masq_mfw_host {
+ struct list_head list;
+ __u32 addr;
+ __u16 port;
+ __u16 pad0;
+ __u32 fwmark;
+ int pref;
+ atomic_t pref_cnt;
+};
+
+#define IP_MASQ_MFW_HSIZE 16
+/*
+ * These entries are indexed by fwmark;
+ * each holds a list of forwarded addr:port pairs
+ */
+
+struct ip_masq_mfw {
+ struct ip_masq_mfw *next; /* linked list */
+ __u32 fwmark; /* key: firewall mark */
+ struct list_head hosts; /* list of forward-to hosts */
+ atomic_t nhosts; /* number of "" */
+ rwlock_t lock;
+};
+
+
+static struct semaphore mfw_sema = MUTEX;
+static rwlock_t mfw_lock = RW_LOCK_UNLOCKED;
+
+static struct ip_masq_mfw *ip_masq_mfw_table[IP_MASQ_MFW_HSIZE];
+
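+/*
+ * Hash on the low 4 bits of fwmark; the 0x0f mask assumes
+ * IP_MASQ_MFW_HSIZE == 16 (the two must change together).
+ */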
+static __inline__ int mfw_hash_val(int fwmark)
+{
+ return fwmark & 0x0f;
+}
+
+/*
+ * Get m-entry by "fwmark"
+ * Caller must lock tables.
+ */
+
+static struct ip_masq_mfw *__mfw_get(int fwmark)
+{
+ struct ip_masq_mfw* mfw;
+ int hash = mfw_hash_val(fwmark);
+
+ for (mfw=ip_masq_mfw_table[hash];mfw;mfw=mfw->next) {
+ if (mfw->fwmark==fwmark) {
+ goto out;
+ }
+ }
+out:
+ return mfw;
+}
+
+/*
+ * Links m-entry.
+ * Caller should have checked if already present for same fwmark
+ *
+ * Caller must lock tables.
+ */
+static int __mfw_add(struct ip_masq_mfw *mfw)
+{
+ int fwmark = mfw->fwmark;
+ int hash = mfw_hash_val(fwmark);
+
+ mfw->next = ip_masq_mfw_table[hash];
+ ip_masq_mfw_table[hash] = mfw;
+ ip_masq_mod_inc_nent(mmod_self);
+
+ return 0;
+}
+
+/*
+ * Creates a m-entry (doesn't link it)
+ */
+
+static struct ip_masq_mfw * mfw_new(int fwmark)
+{
+ struct ip_masq_mfw *mfw;
+
+ mfw = kmalloc(sizeof(*mfw), GFP_KERNEL);
+ if (mfw == NULL)
+ goto out;
+
+ MOD_INC_USE_COUNT;
+ memset(mfw, 0, sizeof(*mfw));
+ mfw->fwmark = fwmark;
+ mfw->lock = RW_LOCK_UNLOCKED;
+
+ INIT_LIST_HEAD(&mfw->hosts);
+out:
+ return mfw;
+}
+
+static void mfw_host_to_user(struct ip_masq_mfw_host *h, struct ip_mfw_user *mu)
+{
+ mu->raddr = h->addr;
+ mu->rport = h->port;
+ mu->fwmark = h->fwmark;
+ mu->pref = h->pref;
+}
+
+/*
+ * Creates a m.host (doesn't link it in a m-entry)
+ */
+static struct ip_masq_mfw_host * mfw_host_new(struct ip_mfw_user *mu)
+{
+ struct ip_masq_mfw_host * mfw_host;
+ mfw_host = kmalloc(sizeof (*mfw_host), GFP_KERNEL);
+ if (!mfw_host)
+ return NULL;
+
+ MOD_INC_USE_COUNT;
+ memset(mfw_host, 0, sizeof(*mfw_host));
+ mfw_host->addr = mu->raddr;
+ mfw_host->port = mu->rport;
+ mfw_host->fwmark = mu->fwmark;
+ mfw_host->pref = mu->pref;
+ atomic_set(&mfw_host->pref_cnt, mu->pref);
+
+ return mfw_host;
+}
+
+/*
+ * Create AND link m.host to m-entry.
+ * It locks m.lock.
+ */
+static int mfw_addhost(struct ip_masq_mfw *mfw, struct ip_mfw_user *mu, int attail)
+{
+ struct ip_masq_mfw_host *mfw_host;
+
+ mfw_host = mfw_host_new(mu);
+ if (!mfw_host)
+ return -ENOMEM;
+
+ write_lock_bh(&mfw->lock);
+ list_add(&mfw_host->list, attail? mfw->hosts.prev : &mfw->hosts);
+ atomic_inc(&mfw->nhosts);
+ write_unlock_bh(&mfw->lock);
+
+ return 0;
+}
+
+/*
+ * Unlink AND destroy m.host(s) from m-entry.
+ * Wildcards (null addr or port) are ok.
+ * It uses m.lock.
+ */
+static int mfw_delhost(struct ip_masq_mfw *mfw, struct ip_mfw_user *mu)
+{
+
+ struct list_head *l,*e;
+ struct ip_masq_mfw_host *h;
+ int n_del = 0;
+ l = &mfw->hosts;
+
+ write_lock_bh(&mfw->lock);
+ for (e=l->next; e!=l; e=e->next)
+ {
+ h = list_entry(e, struct ip_masq_mfw_host, list);
+ if ((!mu->raddr || h->addr == mu->raddr) &&
+ (!mu->rport || h->port == mu->rport)) {
+ /* HIT */
+ atomic_dec(&mfw->nhosts);
+ list_del(&h->list);
+ kfree_s(h, sizeof(*h));
+ MOD_DEC_USE_COUNT;
+ n_del++;
+ }
+
+ }
+ write_unlock_bh(&mfw->lock);
+ return n_del? 0 : -ESRCH;
+}
+
+/*
+ * Changes m.host parameters
+ * Wildcards ok
+ *
+ * Caller must lock tables.
+ */
+static int __mfw_edithost(struct ip_masq_mfw *mfw, struct ip_mfw_user *mu)
+{
+
+ struct list_head *l,*e;
+ struct ip_masq_mfw_host *h;
+ int n_edit = 0;
+ l = &mfw->hosts;
+
+ for (e=l->next; e!=l; e=e->next)
+ {
+ h = list_entry(e, struct ip_masq_mfw_host, list);
+ if ((!mu->raddr || h->addr == mu->raddr) &&
+ (!mu->rport || h->port == mu->rport)) {
+ /* HIT */
+ h->pref = mu->pref;
+ atomic_set(&h->pref_cnt, mu->pref);
+ n_edit++;
+ }
+
+ }
+ return n_edit? 0 : -ESRCH;
+}
+
+/*
+ * Destroys m-entry.
+ * Caller must have checked that it doesn't hold any m.host(s)
+ */
+static void mfw_destroy(struct ip_masq_mfw *mfw)
+{
+ kfree_s(mfw, sizeof(*mfw));
+ MOD_DEC_USE_COUNT;
+}
+
+/*
+ * Unlink m-entry.
+ *
+ * Caller must lock tables.
+ */
+static int __mfw_del(struct ip_masq_mfw *mfw)
+{
+ struct ip_masq_mfw **mfw_p;
+ int ret = -EINVAL;
+
+
+ for(mfw_p=&ip_masq_mfw_table[mfw_hash_val(mfw->fwmark)];
+ *mfw_p;
+ mfw_p = &((*mfw_p)->next))
+ {
+ if (mfw==(*mfw_p)) {
+ *mfw_p = mfw->next;
+ ip_masq_mod_dec_nent(mmod_self);
+ ret = 0;
+ goto out;
+ }
+ }
+out:
+ return ret;
+}
+
+/*
+ * Crude m.host scheduler
+ * This interface could be exported to allow playing with
+ * other sched policies.
+ *
+ * Caller must lock m-entry.
+ */
+static struct ip_masq_mfw_host * __mfw_sched(struct ip_masq_mfw *mfw, int force)
+{
+ struct ip_masq_mfw_host *h = NULL;
+
+ if (atomic_read(&mfw->nhosts) == 0)
+ goto out;
+
+ /*
+ * Here resides actual sched policy:
+ * When pref_cnt touches 0, entry gets shifted to tail and
+ * its pref_cnt reloaded from h->pref (actual value
+ * passed from u-space).
+ *
+ * Exception is pref==0: avoid scheduling.
+ */
+
+ h = list_entry(mfw->hosts.next, struct ip_masq_mfw_host, list);
+
+ if (atomic_read(&mfw->nhosts) <= 1)
+ goto out;
+
+ if ((h->pref && atomic_dec_and_test(&h->pref_cnt)) || force) {
+ atomic_set(&h->pref_cnt, h->pref);
+ list_del(&h->list);
+ list_add(&h->list, mfw->hosts.prev);
+ }
+out:
+ return h;
+}
+
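+/*
+ * Illustrative sequence (not from the original source): with hosts
+ * A(pref=2) and B(pref=1) queued A,B, successive lookups return
+ * A, A, B, A, A, B, ...: a host rotates to the tail only when its
+ * pref_cnt is decremented to zero, and pref_cnt then reloads from pref.
+ */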
+/*
+ * Main lookup routine.
+ * HITs fwmark and schedules m.host entries if required
+ */
+static struct ip_masq_mfw_host * mfw_lookup(int fwmark)
+{
+ struct ip_masq_mfw *mfw;
+ struct ip_masq_mfw_host *h = NULL;
+
+ read_lock(&mfw_lock);
+ mfw = __mfw_get(fwmark);
+
+ if (mfw) {
+ write_lock(&mfw->lock);
+ h = __mfw_sched(mfw, 0);
+ write_unlock(&mfw->lock);
+ }
+
+ read_unlock(&mfw_lock);
+ return h;
+}
+
+#ifdef CONFIG_PROC_FS
+static int mfw_procinfo(char *buffer, char **start, off_t offset,
+ int length, int dummy)
+{
+ struct ip_masq_mfw *mfw;
+ struct ip_masq_mfw_host *h;
+ struct list_head *l,*e;
+ off_t pos=0, begin;
+ char temp[129];
+ int idx = 0;
+ int len=0;
+
+ MOD_INC_USE_COUNT;
+
+	IP_MASQ_DEBUG(1-debug, "Entered mfw_procinfo\n");
+
+ if (offset < 64)
+ {
+ sprintf(temp, "FwMark > RAddr RPort PrCnt Pref");
+ len = sprintf(buffer, "%-63s\n", temp);
+ }
+ pos = 64;
+
+ for(idx = 0; idx < IP_MASQ_MFW_HSIZE; idx++)
+ {
+ read_lock(&mfw_lock);
+ for(mfw = ip_masq_mfw_table[idx]; mfw ; mfw = mfw->next)
+ {
+ read_lock_bh(&mfw->lock);
+ l=&mfw->hosts;
+
+ for(e=l->next;l!=e;e=e->next) {
+ h = list_entry(e, struct ip_masq_mfw_host, list);
+ pos += 64;
+ if (pos <= offset) {
+ len = 0;
+ continue;
+ }
+
+ sprintf(temp,"0x%x > %08lX %5u %5d %5d",
+ h->fwmark,
+ ntohl(h->addr), ntohs(h->port),
+ atomic_read(&h->pref_cnt), h->pref);
+ len += sprintf(buffer+len, "%-63s\n", temp);
+
+ if(len >= length) {
+ read_unlock_bh(&mfw->lock);
+ read_unlock(&mfw_lock);
+ goto done;
+ }
+ }
+ read_unlock_bh(&mfw->lock);
+ }
+ read_unlock(&mfw_lock);
+ }
+
+done:
+
+ if (len) {
+ begin = len - (pos - offset);
+ *start = buffer + begin;
+ len -= begin;
+ }
+ if(len>length)
+ len = length;
+ MOD_DEC_USE_COUNT;
+ return len;
+}
+static struct proc_dir_entry mfw_proc_entry = {
+/*	0, 0, NULL, */
+ 0, 3, "mfw",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ mfw_procinfo
+};
+
+#define proc_ent &mfw_proc_entry
+#else /* !CONFIG_PROC_FS */
+
+#define proc_ent NULL
+#endif
+
+
+static void mfw_flush(void)
+{
+ struct ip_masq_mfw *mfw, *local_table[IP_MASQ_MFW_HSIZE];
+ struct ip_masq_mfw_host *h;
+ struct ip_masq_mfw *mfw_next;
+ int idx;
+ struct list_head *l,*e;
+
+ write_lock_bh(&mfw_lock);
+ memcpy(local_table, ip_masq_mfw_table, sizeof ip_masq_mfw_table);
+ memset(ip_masq_mfw_table, 0, sizeof ip_masq_mfw_table);
+ write_unlock_bh(&mfw_lock);
+
+ /*
+ * For every hash table row ...
+ */
+ for(idx=0;idx<IP_MASQ_MFW_HSIZE;idx++) {
+
+ /*
+ * For every m-entry in row ...
+ */
+ for(mfw=local_table[idx];mfw;mfw=mfw_next) {
+ /*
+ * For every m.host in m-entry ...
+ */
+ l=&mfw->hosts;
+ while((e=l->next) != l) {
+ h = list_entry(e, struct ip_masq_mfw_host, list);
+ atomic_dec(&mfw->nhosts);
+ list_del(&h->list);
+ kfree_s(h, sizeof(*h));
+ MOD_DEC_USE_COUNT;
+ }
+
+ if (atomic_read(&mfw->nhosts)) {
+ IP_MASQ_ERR("mfw_flush(): after flushing row nhosts=%d\n",
+ atomic_read(&mfw->nhosts));
+ }
+ mfw_next = mfw->next;
+ kfree_s(mfw, sizeof(*mfw));
+ MOD_DEC_USE_COUNT;
+ ip_masq_mod_dec_nent(mmod_self);
+ }
+ }
+}
+
+/*
+ * User space control entry point
+ */
+static int mfw_ctl(int optname, struct ip_masq_ctl *mctl, int optlen)
+{
+ struct ip_mfw_user *mu = &mctl->u.mfw_user;
+ struct ip_masq_mfw *mfw;
+ int ret = EINVAL;
+ int arglen = optlen - IP_MASQ_CTL_BSIZE;
+ int cmd;
+
+
+ IP_MASQ_DEBUG(1-debug, "ip_masq_user_ctl(len=%d/%d|%d/%d)\n",
+ arglen,
+ sizeof (*mu),
+ optlen,
+ sizeof (*mctl));
+
+ /*
+ * checks ...
+ */
+ if (arglen != sizeof(*mu) && optlen != sizeof(*mctl))
+ return -EINVAL;
+
+ /*
+ * Don't trust the lusers - plenty of error checking!
+ */
+ cmd = mctl->m_cmd;
+ IP_MASQ_DEBUG(1-debug, "ip_masq_mfw_ctl(cmd=%d, fwmark=%d)\n",
+ cmd, mu->fwmark);
+
+
+ switch(cmd) {
+ case IP_MASQ_CMD_NONE:
+ return 0;
+ case IP_MASQ_CMD_FLUSH:
+ break;
+ case IP_MASQ_CMD_ADD:
+ case IP_MASQ_CMD_INSERT:
+ case IP_MASQ_CMD_SET:
+ if (mu->fwmark == 0) {
+ IP_MASQ_DEBUG(1-debug, "invalid fwmark==0\n");
+ return -EINVAL;
+ }
+ if (mu->pref < 0) {
+ IP_MASQ_DEBUG(1-debug, "invalid pref==%d\n",
+ mu->pref);
+ return -EINVAL;
+ }
+ break;
+ }
+
+
+ ret = -EINVAL;
+
+ switch(cmd) {
+ case IP_MASQ_CMD_ADD:
+ case IP_MASQ_CMD_INSERT:
+ if (!mu->raddr) {
+ IP_MASQ_DEBUG(0-debug, "ip_masq_mfw_ctl(ADD): invalid redirect 0x%x:%d\n",
+ mu->raddr, mu->rport);
+ goto out;
+ }
+
+ /*
+ * Cannot just use mfw_lock because below
+ * are allocations that can sleep; so
+ * to assure "new entry" atomic creation
+ * I use a semaphore.
+ *
+ */
+ down(&mfw_sema);
+
+ read_lock(&mfw_lock);
+ mfw = __mfw_get(mu->fwmark);
+ read_unlock(&mfw_lock);
+
+ /*
+ * If first host, create m-entry
+ */
+ if (mfw == NULL) {
+ mfw = mfw_new(mu->fwmark);
+ if (mfw == NULL)
+ ret = -ENOMEM;
+ }
+
+ if (mfw) {
+ /*
+ * Put m.host in m-entry.
+ */
+ ret = mfw_addhost(mfw, mu, cmd == IP_MASQ_CMD_ADD);
+
+ /*
+ * If first host, link m-entry to hash table.
+ * Already protected by global lock.
+ */
+ if (ret == 0 && atomic_read(&mfw->nhosts) == 1) {
+ write_lock_bh(&mfw_lock);
+ __mfw_add(mfw);
+ write_unlock_bh(&mfw_lock);
+ }
+ if (atomic_read(&mfw->nhosts) == 0) {
+ mfw_destroy(mfw);
+ }
+ }
+
+ up(&mfw_sema);
+
+ break;
+
+ case IP_MASQ_CMD_DEL:
+ down(&mfw_sema);
+
+ read_lock(&mfw_lock);
+ mfw = __mfw_get(mu->fwmark);
+ read_unlock(&mfw_lock);
+
+ if (mfw) {
+ ret = mfw_delhost(mfw, mu);
+
+ /*
+ * Last lease will free
+ * XXX check logic XXX
+ */
+ if (atomic_read(&mfw->nhosts) == 0) {
+ write_lock_bh(&mfw_lock);
+ __mfw_del(mfw);
+ write_unlock_bh(&mfw_lock);
+ mfw_destroy(mfw);
+ }
+ } else
+ ret = -ESRCH;
+
+ up(&mfw_sema);
+ break;
+ case IP_MASQ_CMD_FLUSH:
+
+ down(&mfw_sema);
+ mfw_flush();
+ up(&mfw_sema);
+ ret = 0;
+ break;
+ case IP_MASQ_CMD_SET:
+ /*
+ * No need to semaphorize here, main list is not
+ * modified.
+ */
+ read_lock(&mfw_lock);
+
+ mfw = __mfw_get(mu->fwmark);
+ if (mfw) {
+ write_lock_bh(&mfw->lock);
+
+ if (mu->flags & IP_MASQ_MFW_SCHED) {
+ struct ip_masq_mfw_host *h;
+ if ((h=__mfw_sched(mfw, 1))) {
+ mfw_host_to_user(h, mu);
+ ret = 0;
+ }
+ } else {
+ ret = __mfw_edithost(mfw, mu);
+ }
+
+ write_unlock_bh(&mfw->lock);
+ }
+
+ read_unlock(&mfw_lock);
+ break;
+ }
+out:
+
+ return ret;
+}
+
+/*
+ * Module stubs called from ip_masq core module
+ */
+
+/*
+ * Input rule stub, called very early for each incoming packet,
+ * to see if this module has "interest" in packet.
+ */
+static int mfw_in_rule(const struct sk_buff *skb, const struct iphdr *iph)
+{
+ int val;
+ read_lock(&mfw_lock);
+ val = ( __mfw_get(skb->fwmark) != 0);
+ read_unlock(&mfw_lock);
+ return val;
+}
+
+/*
+ * Input-create stub, called to allow "custom" masq creation
+ */
+static struct ip_masq * mfw_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr)
+{
+ union ip_masq_tphdr tph;
+ struct ip_masq *ms = NULL;
+ struct ip_masq_mfw_host *h = NULL;
+
+ tph.raw = (char*) iph + iph->ihl * 4;
+
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ /*
+ * Only open TCP tunnel if SYN+!ACK packet
+ */
+ if (!tph.th->syn && tph.th->ack)
+ return NULL;
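+		/* fall through: a bare SYN proceeds like the UDP case */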
+ case IPPROTO_UDP:
+ break;
+ default:
+ return NULL;
+ }
+
+ /*
+ * If no entry exists in the masquerading table
+ * and the port is involved
+ * in port forwarding, create a new masq entry
+ */
+
+ if ((h=mfw_lookup(skb->fwmark))) {
+ ms = ip_masq_new(iph->protocol,
+ iph->daddr, tph.portp[1],
+ /* if no redir-port, use packet dest port */
+ h->addr, h->port? h->port : tph.portp[1],
+ iph->saddr, tph.portp[0],
+ 0);
+
+ if (ms != NULL)
+ ip_masq_listen(ms);
+ }
+ return ms;
+}
+
+
+#define mfw_in_update NULL
+#define mfw_out_rule NULL
+#define mfw_out_create NULL
+#define mfw_out_update NULL
+
+static struct ip_masq_mod mfw_mod = {
+ NULL, /* next */
+ NULL, /* next_reg */
+ "mfw", /* name */
+ ATOMIC_INIT(0), /* nent */
+ ATOMIC_INIT(0), /* refcnt */
+ proc_ent,
+ mfw_ctl,
+ NULL, /* masq_mod_init */
+ NULL, /* masq_mod_done */
+ mfw_in_rule,
+ mfw_in_update,
+ mfw_in_create,
+ mfw_out_rule,
+ mfw_out_update,
+ mfw_out_create,
+};
+
+
+__initfunc(int ip_mfw_init(void))
+{
+ return register_ip_masq_mod ((mmod_self=&mfw_mod));
+}
+
+int ip_mfw_done(void)
+{
+ return unregister_ip_masq_mod(&mfw_mod);
+}
+
+#ifdef MODULE
+EXPORT_NO_SYMBOLS;
+
+int init_module(void)
+{
+ if (ip_mfw_init() != 0)
+ return -EIO;
+ return 0;
+}
+
+void cleanup_module(void)
+{
+ if (ip_mfw_done() != 0)
+ printk(KERN_INFO "can't remove module");
+}
+
+#endif /* MODULE */
diff --git a/pfinet/linux-src/net/ipv4/ip_masq_mod.c b/pfinet/linux-src/net/ipv4/ip_masq_mod.c
new file mode 100644
index 00000000..b99502f3
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_masq_mod.c
@@ -0,0 +1,322 @@
+/*
+ * IP_MASQ_MOD masq modules support
+ *
+ *
+ * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
+ *
+ * $Id: ip_masq_mod.c,v 1.5.2.1 1999/07/02 10:10:03 davem Exp $
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Cyrus Durgin: fixed kerneld stuff for kmod.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <net/ip_masq.h>
+#include <net/ip_masq_mod.h>
+
+#include <linux/ip_masq.h>
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+EXPORT_SYMBOL(register_ip_masq_mod);
+EXPORT_SYMBOL(unregister_ip_masq_mod);
+EXPORT_SYMBOL(ip_masq_mod_lkp_link);
+EXPORT_SYMBOL(ip_masq_mod_lkp_unlink);
+
+static spinlock_t masq_mod_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Base pointer for registered modules
+ */
+struct ip_masq_mod * ip_masq_mod_reg_base = NULL;
+
+/*
+ * Base pointer for lookup (subset of above, a module could be
+ * registered, but it could have no active rule); will avoid
+ * unnecessary lookups.
+ */
+struct ip_masq_mod * ip_masq_mod_lkp_base = NULL;
+
+int ip_masq_mod_register_proc(struct ip_masq_mod *mmod)
+{
+#ifdef CONFIG_PROC_FS
+ int ret;
+
+ struct proc_dir_entry *ent = mmod->mmod_proc_ent;
+
+ if (!ent)
+ return 0;
+ if (!ent->name) {
+ ent->name = mmod->mmod_name;
+ ent->namelen = strlen (mmod->mmod_name);
+ }
+ ret = ip_masq_proc_register(ent);
+ if (ret) mmod->mmod_proc_ent = NULL;
+
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+void ip_masq_mod_unregister_proc(struct ip_masq_mod *mmod)
+{
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *ent = mmod->mmod_proc_ent;
+ if (!ent)
+ return;
+ ip_masq_proc_unregister(ent);
+#endif
+}
+
+/*
+ * Link/unlink object for lookups
+ */
+
+int ip_masq_mod_lkp_unlink(struct ip_masq_mod *mmod)
+{
+ struct ip_masq_mod **mmod_p;
+
+ write_lock_bh(&masq_mod_lock);
+
+ for (mmod_p = &ip_masq_mod_lkp_base; *mmod_p ; mmod_p = &(*mmod_p)->next)
+ if (mmod == (*mmod_p)) {
+ *mmod_p = mmod->next;
+ mmod->next = NULL;
+ write_unlock_bh(&masq_mod_lock);
+ return 0;
+ }
+
+ write_unlock_bh(&masq_mod_lock);
+ return -EINVAL;
+}
+
+int ip_masq_mod_lkp_link(struct ip_masq_mod *mmod)
+{
+ write_lock_bh(&masq_mod_lock);
+
+ mmod->next = ip_masq_mod_lkp_base;
+ ip_masq_mod_lkp_base=mmod;
+
+ write_unlock_bh(&masq_mod_lock);
+ return 0;
+}
+
+int register_ip_masq_mod(struct ip_masq_mod *mmod)
+{
+ if (!mmod) {
+ IP_MASQ_ERR("register_ip_masq_mod(): NULL arg\n");
+ return -EINVAL;
+ }
+ if (!mmod->mmod_name) {
+ IP_MASQ_ERR("register_ip_masq_mod(): NULL mmod_name\n");
+ return -EINVAL;
+ }
+ ip_masq_mod_register_proc(mmod);
+
+ mmod->next_reg = ip_masq_mod_reg_base;
+ ip_masq_mod_reg_base=mmod;
+
+ return 0;
+}
+
+int unregister_ip_masq_mod(struct ip_masq_mod *mmod)
+{
+ struct ip_masq_mod **mmod_p;
+
+ if (!mmod) {
+ IP_MASQ_ERR( "unregister_ip_masq_mod(): NULL arg\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Only allow unregistration if it is not referenced
+ */
+ if (atomic_read(&mmod->refcnt)) {
+ IP_MASQ_ERR( "unregister_ip_masq_mod(): is in use by %d guys. failed\n",
+ atomic_read(&mmod->refcnt));
+ return -EINVAL;
+ }
+
+ /*
+ * Must be already unlinked from lookup list
+ */
+ if (mmod->next) {
+		IP_MASQ_WARNING("MASQ: unregistering \"%s\" while in lookup list; fixed.",
+ mmod->mmod_name);
+ ip_masq_mod_lkp_unlink(mmod);
+ }
+
+ for (mmod_p = &ip_masq_mod_reg_base; *mmod_p ; mmod_p = &(*mmod_p)->next_reg)
+ if (mmod == (*mmod_p)) {
+ ip_masq_mod_unregister_proc(mmod);
+ *mmod_p = mmod->next_reg;
+ return 0;
+ }
+
+	IP_MASQ_ERR("unregister_ip_masq_mod(%s): not linked\n", mmod->mmod_name);
+ return -EINVAL;
+}
+
+int ip_masq_mod_in_rule(const struct sk_buff *skb, const struct iphdr *iph)
+{
+ struct ip_masq_mod *mmod;
+ int ret = IP_MASQ_MOD_NOP;
+
+ for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) {
+ if (!mmod->mmod_in_rule) continue;
+ switch (ret=mmod->mmod_in_rule(skb, iph)) {
+ case IP_MASQ_MOD_NOP:
+ continue;
+ case IP_MASQ_MOD_ACCEPT:
+ case IP_MASQ_MOD_REJECT:
+ goto out;
+ }
+ }
+out:
+ return ret;
+}
+
+int ip_masq_mod_out_rule(const struct sk_buff *skb, const struct iphdr *iph)
+{
+ struct ip_masq_mod *mmod;
+ int ret = IP_MASQ_MOD_NOP;
+
+ for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) {
+ if (!mmod->mmod_out_rule) continue;
+ switch (ret=mmod->mmod_out_rule(skb, iph)) {
+ case IP_MASQ_MOD_NOP:
+ continue;
+ case IP_MASQ_MOD_ACCEPT:
+ case IP_MASQ_MOD_REJECT:
+ goto out;
+ }
+ }
+out:
+ return ret;
+}
+
+struct ip_masq * ip_masq_mod_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr)
+{
+ struct ip_masq_mod *mmod;
+ struct ip_masq *ms = NULL;
+
+ for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) {
+ if (!mmod->mmod_in_create) continue;
+ if ((ms=mmod->mmod_in_create(skb, iph, maddr))) {
+ goto out;
+ }
+ }
+out:
+ return ms;
+}
+
+struct ip_masq * ip_masq_mod_out_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr)
+{
+ struct ip_masq_mod *mmod;
+ struct ip_masq *ms = NULL;
+
+ for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) {
+ if (!mmod->mmod_out_create) continue;
+ if ((ms=mmod->mmod_out_create(skb, iph, maddr))) {
+ goto out;
+ }
+ }
+out:
+ return ms;
+}
+
+int ip_masq_mod_in_update(const struct sk_buff *skb, const struct iphdr *iph, struct ip_masq *ms)
+{
+ struct ip_masq_mod *mmod;
+ int ret = IP_MASQ_MOD_NOP;
+
+ for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) {
+ if (!mmod->mmod_in_update) continue;
+ switch (ret=mmod->mmod_in_update(skb, iph, ms)) {
+ case IP_MASQ_MOD_NOP:
+ continue;
+ case IP_MASQ_MOD_ACCEPT:
+ case IP_MASQ_MOD_REJECT:
+ goto out;
+ }
+ }
+out:
+ return ret;
+}
+
+int ip_masq_mod_out_update(const struct sk_buff *skb, const struct iphdr *iph, struct ip_masq *ms)
+{
+ struct ip_masq_mod *mmod;
+ int ret = IP_MASQ_MOD_NOP;
+
+ for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) {
+ if (!mmod->mmod_out_update) continue;
+ switch (ret=mmod->mmod_out_update(skb, iph, ms)) {
+ case IP_MASQ_MOD_NOP:
+ continue;
+ case IP_MASQ_MOD_ACCEPT:
+ case IP_MASQ_MOD_REJECT:
+ goto out;
+ }
+ }
+out:
+ return ret;
+}
+
+struct ip_masq_mod * ip_masq_mod_getbyname(const char *mmod_name)
+{
+ struct ip_masq_mod * mmod;
+
+ IP_MASQ_DEBUG(1, "searching mmod_name \"%s\"\n", mmod_name);
+
+ for (mmod=ip_masq_mod_reg_base; mmod ; mmod=mmod->next_reg) {
+ if (mmod->mmod_ctl && *(mmod_name)
+ && (strcmp(mmod_name, mmod->mmod_name)==0)) {
+ /* HIT */
+ return mmod;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * Module control entry
+ */
+int ip_masq_mod_ctl(int optname, struct ip_masq_ctl *mctl, int optlen)
+{
+ struct ip_masq_mod * mmod;
+#ifdef CONFIG_KMOD
+ char kmod_name[IP_MASQ_TNAME_MAX+8];
+#endif
+	/* stopper: force NUL-termination of the table name */
+ mctl->m_tname[IP_MASQ_TNAME_MAX-1] = 0;
+
+ mmod = ip_masq_mod_getbyname(mctl->m_tname);
+ if (mmod)
+ return mmod->mmod_ctl(optname, mctl, optlen);
+#ifdef CONFIG_KMOD
+ sprintf(kmod_name,"ip_masq_%s", mctl->m_tname);
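+	/* e.g. m_tname "mfw" yields request_module("ip_masq_mfw") */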
+
+ IP_MASQ_DEBUG(1, "About to request \"%s\" module\n", kmod_name);
+
+ /*
+	 * Let it sleep for a while ...
+ */
+ request_module(kmod_name);
+ mmod = ip_masq_mod_getbyname(mctl->m_tname);
+ if (mmod)
+ return mmod->mmod_ctl(optname, mctl, optlen);
+#endif
+ return ESRCH;
+}
diff --git a/pfinet/linux-src/net/ipv4/ip_masq_portfw.c b/pfinet/linux-src/net/ipv4/ip_masq_portfw.c
new file mode 100644
index 00000000..6c697a10
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_masq_portfw.c
@@ -0,0 +1,508 @@
+/*
+ * IP_MASQ_PORTFW masquerading module
+ *
+ *
+ * $Id: ip_masq_portfw.c,v 1.3.2.1 1999/07/02 10:10:02 davem Exp $
+ *
+ * Author: Steven Clarke <steven.clarke@monmouth.demon.co.uk>
+ *
+ * Fixes:
+ * Juan Jose Ciarlante : created this new file from ip_masq.c and ip_fw.c
+ * Juan Jose Ciarlante : modularized
+ * Juan Jose Ciarlante : use GFP_KERNEL
+ * Juan Jose Ciarlante : locking
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <net/ip.h>
+#include <linux/ip_fw.h>
+#include <linux/ip_masq.h>
+#include <net/ip_masq.h>
+#include <net/ip_masq_mod.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+
+#define IP_PORTFW_PORT_MIN 1
+#define IP_PORTFW_PORT_MAX 60999
+
+struct ip_portfw {
+ struct list_head list;
+ __u32 laddr, raddr;
+ __u16 lport, rport;
+ atomic_t pref_cnt; /* pref "counter" down to 0 */
+ int pref; /* user set pref */
+};
+
+static struct ip_masq_mod *mmod_self = NULL;
+/*
+ * Debug level
+ */
+#ifdef CONFIG_IP_MASQ_DEBUG
+static int debug=0;
+MODULE_PARM(debug, "i");
+#endif
+
+/*
+ * Lock
+ */
+static rwlock_t portfw_lock = RW_LOCK_UNLOCKED;
+
+static struct list_head portfw_list[2];
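+/* portfw_list[1] holds TCP forwards, portfw_list[0] UDP (see portfw_idx) */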
+static __inline__ int portfw_idx(int protocol)
+{
+ return (protocol==IPPROTO_TCP);
+}
+
+/*
+ *
+ * Delete forwarding entry(s):
+ * called from _DEL, u-space.
+ *	"Relaxed" match: any field except lport may be a wildcard.
+ *
+ */
+
+static __inline__ int ip_portfw_del(__u16 protocol, __u16 lport, __u32 laddr, __u16 rport, __u32 raddr)
+{
+ int prot = portfw_idx(protocol);
+ struct ip_portfw *n;
+ struct list_head *entry;
+ struct list_head *list = &portfw_list[prot];
+ int nent;
+
+ nent = atomic_read(&mmod_self->mmod_nent);
+
+ write_lock_bh(&portfw_lock);
+
+ for (entry=list->next;entry != list;entry = entry->next) {
+ n = list_entry(entry, struct ip_portfw, list);
+ if (n->lport == lport &&
+ (!laddr || n->laddr == laddr) &&
+ (!raddr || n->raddr == raddr) &&
+ (!rport || n->rport == rport)) {
+ list_del(entry);
+ ip_masq_mod_dec_nent(mmod_self);
+ kfree_s(n, sizeof(struct ip_portfw));
+ MOD_DEC_USE_COUNT;
+ }
+ }
+ write_unlock_bh(&portfw_lock);
+
+ return nent==atomic_read(&mmod_self->mmod_nent)? ESRCH : 0;
+}
+
+/*
+ * Flush tables
+ * called from _FLUSH, u-space.
+ */
+static __inline__ void ip_portfw_flush(void)
+{
+ int prot;
+ struct list_head *l;
+ struct list_head *e;
+ struct ip_portfw *n;
+
+ write_lock_bh(&portfw_lock);
+
+ for (prot = 0; prot < 2;prot++) {
+ l = &portfw_list[prot];
+ while((e=l->next) != l) {
+ ip_masq_mod_dec_nent(mmod_self);
+ n = list_entry (e, struct ip_portfw, list);
+ list_del(e);
+ kfree_s(n, sizeof (*n));
+ MOD_DEC_USE_COUNT;
+ }
+ }
+
+ write_unlock_bh(&portfw_lock);
+}
+
+/*
+ * Lookup routine for lport,laddr match
+ * must be called with locked tables
+ */
+static __inline__ struct ip_portfw *ip_portfw_lookup(__u16 protocol, __u16 lport, __u32 laddr, __u32 *daddr_p, __u16 *dport_p)
+{
+ int prot = portfw_idx(protocol);
+
+ struct ip_portfw *n = NULL;
+ struct list_head *l, *e;
+
+ l = &portfw_list[prot];
+
+ for (e=l->next;e!=l;e=e->next) {
+ n = list_entry(e, struct ip_portfw, list);
+ if (lport == n->lport && laddr == n->laddr) {
+ /* Please be nice, don't pass only a NULL dport */
+ if (daddr_p) {
+ *daddr_p = n->raddr;
+ *dport_p = n->rport;
+ }
+
+ goto out;
+ }
+ }
+ n = NULL;
+out:
+ return n;
+}
+
+/*
+ * Edit routine for lport,[laddr], [raddr], [rport] match
+ * By now, only called from u-space
+ */
+static __inline__ int ip_portfw_edit(__u16 protocol, __u16 lport, __u32 laddr, __u16 rport, __u32 raddr, int pref)
+{
+ int prot = portfw_idx(protocol);
+
+ struct ip_portfw *n = NULL;
+ struct list_head *l, *e;
+ int count = 0;
+
+
+ read_lock_bh(&portfw_lock);
+
+ l = &portfw_list[prot];
+
+ for (e=l->next;e!=l;e=e->next) {
+ n = list_entry(e, struct ip_portfw, list);
+ if (lport == n->lport &&
+ (!laddr || laddr == n->laddr) &&
+ (!rport || rport == n->rport) &&
+ (!raddr || raddr == n->raddr)) {
+ n->pref = pref;
+ atomic_set(&n->pref_cnt, pref);
+ count++;
+ }
+ }
+
+ read_unlock_bh(&portfw_lock);
+
+ return count;
+}
+
+/*
+ * Add/edit an entry
+ * called from _ADD, u-space.
+ * must return 0 or +errno
+ */
+static __inline__ int ip_portfw_add(__u16 protocol, __u16 lport, __u32 laddr, __u16 rport, __u32 raddr, int pref)
+{
+ struct ip_portfw *npf;
+ int prot = portfw_idx(protocol);
+
+ if (pref <= 0)
+ return EINVAL;
+
+ if (ip_portfw_edit(protocol, lport, laddr, rport, raddr, pref)) {
+ /*
+ * Edit ok ...
+ */
+ return 0;
+ }
+
+ /* may block ... */
+ npf = (struct ip_portfw*) kmalloc(sizeof(struct ip_portfw), GFP_KERNEL);
+
+ if (!npf)
+ return ENOMEM;
+
+ MOD_INC_USE_COUNT;
+ memset(npf, 0, sizeof(*npf));
+
+ npf->laddr = laddr;
+ npf->lport = lport;
+ npf->rport = rport;
+ npf->raddr = raddr;
+ npf->pref = pref;
+
+ atomic_set(&npf->pref_cnt, npf->pref);
+ INIT_LIST_HEAD(&npf->list);
+
+ write_lock_bh(&portfw_lock);
+
+ /*
+ * Add at head
+ */
+ list_add(&npf->list, &portfw_list[prot]);
+
+ write_unlock_bh(&portfw_lock);
+
+ ip_masq_mod_inc_nent(mmod_self);
+ return 0;
+}
+
+
+
+static __inline__ int portfw_ctl(int optname, struct ip_masq_ctl *mctl, int optlen)
+{
+ struct ip_portfw_user *mm = &mctl->u.portfw_user;
+ int ret = EINVAL;
+ int arglen = optlen - IP_MASQ_CTL_BSIZE;
+ int cmd;
+
+
+ IP_MASQ_DEBUG(1-debug, "ip_masq_user_ctl(len=%d/%d|%d/%d)\n",
+ arglen,
+ sizeof (*mm),
+ optlen,
+ sizeof (*mctl));
+
+ /*
+ * Yes, I'm a bad guy ...
+ */
+ if (arglen != sizeof(*mm) && optlen != sizeof(*mctl))
+ return EINVAL;
+
+ /*
+ * Don't trust the lusers - plenty of error checking!
+ */
+ cmd = mctl->m_cmd;
+ IP_MASQ_DEBUG(1-debug, "ip_masq_portfw_ctl(cmd=%d)\n", cmd);
+
+
+ switch (cmd) {
+ case IP_MASQ_CMD_NONE:
+ return 0;
+ case IP_MASQ_CMD_FLUSH:
+ break;
+ default:
+ if (htons(mm->lport) < IP_PORTFW_PORT_MIN || htons(mm->lport) > IP_PORTFW_PORT_MAX)
+ return EINVAL;
+
+ if (mm->protocol!=IPPROTO_TCP && mm->protocol!=IPPROTO_UDP)
+ return EINVAL;
+ }
+
+ switch(cmd) {
+ case IP_MASQ_CMD_ADD:
+ ret = ip_portfw_add(mm->protocol,
+ mm->lport, mm->laddr,
+ mm->rport, mm->raddr,
+ mm->pref);
+ break;
+
+ case IP_MASQ_CMD_DEL:
+ ret = ip_portfw_del(mm->protocol,
+ mm->lport, mm->laddr,
+ mm->rport, mm->raddr);
+ break;
+ case IP_MASQ_CMD_FLUSH:
+ ip_portfw_flush();
+ ret = 0;
+ break;
+ }
+
+
+ return ret;
+}
+
+
+
+
+#ifdef CONFIG_PROC_FS
+
+static int portfw_procinfo(char *buffer, char **start, off_t offset,
+ int length, int unused)
+{
+ off_t pos=0, begin;
+ struct ip_portfw *pf;
+ struct list_head *l, *e;
+ char temp[65];
+ int ind;
+ int len=0;
+
+
+ if (offset < 64)
+ {
+ sprintf(temp, "Prot LAddr LPort > RAddr RPort PrCnt Pref");
+ len = sprintf(buffer, "%-63s\n", temp);
+ }
+ pos = 64;
+
+ read_lock_bh(&portfw_lock);
+
+ for(ind = 0; ind < 2; ind++)
+ {
+ l = &portfw_list[ind];
+ for (e=l->next; e!=l; e=e->next)
+ {
+ pf = list_entry(e, struct ip_portfw, list);
+ pos += 64;
+ if (pos <= offset) {
+ len = 0;
+ continue;
+ }
+
+ sprintf(temp,"%s %08lX %5u > %08lX %5u %5d %5d",
+ ind ? "TCP" : "UDP",
+ ntohl(pf->laddr), ntohs(pf->lport),
+ ntohl(pf->raddr), ntohs(pf->rport),
+ atomic_read(&pf->pref_cnt), pf->pref);
+ len += sprintf(buffer+len, "%-63s\n", temp);
+
+ if (len >= length)
+ goto done;
+ }
+ }
+done:
+ read_unlock_bh(&portfw_lock);
+
+ begin = len - (pos - offset);
+ *start = buffer + begin;
+ len -= begin;
+ if(len>length)
+ len = length;
+ return len;
+}
+
+static struct proc_dir_entry portfw_proc_entry = {
+/*	0, 0, NULL, */
+ 0, 6, "portfw", /* Just for compatibility, for now ... */
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ portfw_procinfo
+};
+
+#define proc_ent &portfw_proc_entry
+#else /* !CONFIG_PROC_FS */
+
+#define proc_ent NULL
+#endif
+
+static int portfw_in_rule(const struct sk_buff *skb, const struct iphdr *iph)
+{
+ const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
+#ifdef CONFIG_IP_MASQ_DEBUG
+ struct rtable *rt = (struct rtable *)skb->dst;
+#endif
+ struct ip_portfw *pfw;
+
+ IP_MASQ_DEBUG(2, "portfw_in_rule(): skb:= dev=%s (index=%d), rt_iif=%d, rt_flags=0x%x rt_dev___=%s daddr=%d.%d.%d.%d dport=%d\n",
+ skb->dev->name, skb->dev->ifindex, rt->rt_iif, rt->rt_flags,
+ rt->u.dst.dev->name,
+ NIPQUAD(iph->daddr), ntohs(portp[1]));
+
+ read_lock(&portfw_lock);
+ pfw = ip_portfw_lookup(iph->protocol, portp[1], iph->daddr, NULL, NULL);
+ read_unlock(&portfw_lock);
+ return (pfw!=0);
+}
+
+static struct ip_masq * portfw_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr)
+{
+ /*
+ * If no entry exists in the masquerading table
+ * and the port is involved
+ * in port forwarding, create a new masq entry
+ */
+
+ __u32 raddr;
+ __u16 rport;
+ const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
+ struct ip_masq *ms = NULL;
+ struct ip_portfw *pf;
+
+ /*
+ * Lock for writing.
+ */
+ write_lock(&portfw_lock);
+
+ if ((pf=ip_portfw_lookup(iph->protocol,
+ portp[1], iph->daddr,
+ &raddr, &rport))) {
+ ms = ip_masq_new(iph->protocol,
+ iph->daddr, portp[1],
+ raddr, rport,
+ iph->saddr, portp[0],
+ 0);
+		/* don't listen on a failed ip_masq_new() */
+		if (ms != NULL)
+			ip_masq_listen(ms);
+
+ if (!ms || atomic_read(&mmod_self->mmod_nent) <= 1
+ /* || ip_masq_nlocks(&portfw_lock) != 1 */ )
+ /*
+ * Maybe later...
+ */
+ goto out;
+
+ /*
+ * Entry created, lock==1.
+ * if pref_cnt == 0, move
+ * entry at _tail_.
+ * This is a simple load balance scheduling
+ */
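+		/*
+		 * Example (illustrative): two pref==1 forwards for the same
+		 * lport alternate strictly, since every hit drops pref_cnt
+		 * to zero and sends the entry to the tail.
+		 */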
+
+ if (atomic_dec_and_test(&pf->pref_cnt)) {
+
+ atomic_set(&pf->pref_cnt, pf->pref);
+ list_del(&pf->list);
+ list_add(&pf->list,
+ portfw_list[portfw_idx(iph->protocol)].prev);
+
+ }
+ }
+out:
+ write_unlock(&portfw_lock);
+ return ms;
+}
+
+#define portfw_in_update NULL
+#define portfw_out_rule NULL
+#define portfw_out_create NULL
+#define portfw_out_update NULL
+
+static struct ip_masq_mod portfw_mod = {
+ NULL, /* next */
+ NULL, /* next_reg */
+ "portfw", /* name */
+ ATOMIC_INIT(0), /* nent */
+ ATOMIC_INIT(0), /* refcnt */
+ proc_ent,
+ portfw_ctl,
+ NULL, /* masq_mod_init */
+ NULL, /* masq_mod_done */
+ portfw_in_rule,
+ portfw_in_update,
+ portfw_in_create,
+ portfw_out_rule,
+ portfw_out_update,
+ portfw_out_create,
+};
+
+
+
+__initfunc(int ip_portfw_init(void))
+{
+ INIT_LIST_HEAD(&portfw_list[0]);
+ INIT_LIST_HEAD(&portfw_list[1]);
+ return register_ip_masq_mod ((mmod_self=&portfw_mod));
+}
+
+int ip_portfw_done(void)
+{
+ return unregister_ip_masq_mod(&portfw_mod);
+}
+
+#ifdef MODULE
+EXPORT_NO_SYMBOLS;
+
+int init_module(void)
+{
+ if (ip_portfw_init() != 0)
+ return -EIO;
+ return 0;
+}
+
+void cleanup_module(void)
+{
+ if (ip_portfw_done() != 0)
+ printk(KERN_INFO "ip_portfw_done(): can't remove module");
+}
+
+#endif /* MODULE */
diff --git a/pfinet/linux-src/net/ipv4/ip_masq_quake.c b/pfinet/linux-src/net/ipv4/ip_masq_quake.c
new file mode 100644
index 00000000..995c3a0a
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_masq_quake.c
@@ -0,0 +1,322 @@
+/*
+ * IP_MASQ_QUAKE quake masquerading module
+ *
+ *
+ * Version: @(#)ip_masq_quake.c 0.02 22/02/97
+ *
+ * Author: Harald Hoyer mailto:HarryH@Royal.Net
+ *
+ *
+ * Fixes:
+ * Harald Hoyer : Unofficial Quake Specs found at
+ * http://www.gamers.org/dEngine/quake/spec/
+ * Harald Hoyer : Check for QUAKE-STRING
+ *	Juan Jose Ciarlante	: little bits for 2.1
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ *
+ */
+
+#include <linux/module.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+#include <net/ip_masq.h>
+
+#define DEBUG_CONFIG_IP_MASQ_QUAKE 0
+
+typedef struct
+{
+ __u16 type; // (Little Endian) Type of message.
+ __u16 length; // (Little Endian) Length of message, header included.
+ char message[0]; // The contents of the message.
+} QUAKEHEADER;
+
+struct quake_priv_data {
+ /* Have we seen a client connect message */
+ signed char cl_connect;
+};
+
+static int
+masq_quake_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
+{
+ MOD_INC_USE_COUNT;
+ if ((ms->app_data = kmalloc(sizeof(struct quake_priv_data),
+ GFP_ATOMIC)) == NULL)
+ printk(KERN_INFO "Quake: No memory for application data\n");
+ else
+ {
+ struct quake_priv_data *priv =
+ (struct quake_priv_data *)ms->app_data;
+ priv->cl_connect = 0;
+ }
+ return 0;
+}
+
+static int
+masq_quake_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
+{
+ MOD_DEC_USE_COUNT;
+ if (ms->app_data)
+ kfree_s(ms->app_data, sizeof(struct quake_priv_data));
+ return 0;
+}
+
+int
+masq_quake_in (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct udphdr *uh;
+ QUAKEHEADER *qh;
+ __u16 udp_port;
+ char *data;
+ unsigned char code;
+ struct quake_priv_data *priv = (struct quake_priv_data *)ms->app_data;
+
+ if(priv->cl_connect == -1)
+ return 0;
+
+ skb = *skb_p;
+
+ iph = skb->nh.iph;
+ uh = (struct udphdr *)&(((char *)iph)[iph->ihl*4]);
+
+	/* Check for length */
+ if(ntohs(uh->len) < 5)
+ return 0;
+
+ qh = (QUAKEHEADER *)&uh[1];
+
+ if(qh->type != 0x0080)
+ return 0;
+
+
+ code = qh->message[0];
+
+#if DEBUG_CONFIG_IP_MASQ_QUAKE
+ printk("Quake_in: code = %d \n", (int)code);
+#endif
+
+ switch(code) {
+ case 0x01:
+ /* Connection Request */
+
+ if(ntohs(qh->length) < 0x0c) {
+#if DEBUG_CONFIG_IP_MASQ_QUAKE
+ printk("Quake_in: length < 0xc \n");
+#endif
+ return 0;
+ }
+
+ data = &qh->message[1];
+
+		/* Check for the QUAKE protocol string */
+ if(memcmp(data,"QUAKE\0\3",7)) {
+#if DEBUG_CONFIG_IP_MASQ_QUAKE
+			printk("Quake_in: memcmp failed\n");
+#endif
+ return 0;
+ }
+ else {
+ priv->cl_connect = 1;
+#if DEBUG_CONFIG_IP_MASQ_QUAKE
+			printk("Quake_in: memcmp ok\n");
+#endif
+ }
+ break;
+
+ case 0x81:
+ /* Accept Connection */
+ if((ntohs(qh->length) < 0x09) || (priv->cl_connect == 0))
+ return 0;
+ data = &qh->message[1];
+
+ memcpy(&udp_port, data, 2);
+
+ ms->dport = htons(udp_port);
+
+#if DEBUG_CONFIG_IP_MASQ_QUAKE
+ printk("Quake_in: in_rewrote UDP port %d \n", udp_port);
+#endif
+ priv->cl_connect = -1;
+
+ break;
+ }
+
+ return 0;
+}
+
+int
+masq_quake_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct udphdr *uh;
+ QUAKEHEADER *qh;
+ __u16 udp_port;
+ char *data;
+ unsigned char code;
+ struct ip_masq *n_ms;
+ struct quake_priv_data *priv = (struct quake_priv_data *)ms->app_data;
+
+ if(priv->cl_connect == -1)
+ return 0;
+
+ skb = *skb_p;
+
+ iph = skb->nh.iph;
+ uh = (struct udphdr *)&(((char *)iph)[iph->ihl*4]);
+
+	/* Check for length */
+ if(ntohs(uh->len) < 5)
+ return 0;
+
+ qh = (QUAKEHEADER *)&uh[1];
+
+#if DEBUG_CONFIG_IP_MASQ_QUAKE
+ printk("Quake_out: qh->type = %d \n", (int)qh->type);
+#endif
+
+ if(qh->type != 0x0080)
+ return 0;
+
+ code = qh->message[0];
+
+#if DEBUG_CONFIG_IP_MASQ_QUAKE
+ printk("Quake_out: code = %d \n", (int)code);
+#endif
+
+ switch(code) {
+ case 0x01:
+ /* Connection Request */
+
+ if(ntohs(qh->length) < 0x0c) {
+#if DEBUG_CONFIG_IP_MASQ_QUAKE
+ printk("Quake_out: length < 0xc \n");
+#endif
+ return 0;
+ }
+
+ data = &qh->message[1];
+
+		/* Check for the QUAKE protocol string */
+ if(memcmp(data,"QUAKE\0\3",7)) {
+#if DEBUG_CONFIG_IP_MASQ_QUAKE
+ printk("Quake_out: memcmp failed \n");
+#endif
+ return 0;
+ }
+ else {
+ priv->cl_connect = 1;
+#if DEBUG_CONFIG_IP_MASQ_QUAKE
+ printk("Quake_out: memcmp ok \n");
+#endif
+ }
+ break;
+
+ case 0x81:
+ /* Accept Connection */
+ if((ntohs(qh->length) < 0x09) || (priv->cl_connect == 0))
+ return 0;
+
+ data = &qh->message[1];
+
+ memcpy(&udp_port, data, 2);
+
+ n_ms = ip_masq_new(IPPROTO_UDP,
+ maddr, 0,
+ ms->saddr, htons(udp_port),
+ ms->daddr, ms->dport,
+ 0);
+
+ if (n_ms==NULL)
+ return 0;
+
+#if DEBUG_CONFIG_IP_MASQ_QUAKE
+ printk("Quake_out: out_rewrote UDP port %d -> %d\n",
+ udp_port, ntohs(n_ms->mport));
+#endif
+ udp_port = ntohs(n_ms->mport);
+ memcpy(data, &udp_port, 2);
+
+ ip_masq_listen(n_ms);
+ ip_masq_control_add(n_ms, ms);
+ ip_masq_put(n_ms);
+
+ break;
+ }
+
+ return 0;
+}
+
+struct ip_masq_app ip_masq_quake = {
+ NULL, /* next */
+ "Quake_26", /* name */
+ 0, /* type */
+ 0, /* n_attach */
+ masq_quake_init_1, /* ip_masq_init_1 */
+ masq_quake_done_1, /* ip_masq_done_1 */
+ masq_quake_out, /* pkt_out */
+ masq_quake_in /* pkt_in */
+};
+struct ip_masq_app ip_masq_quakenew = {
+ NULL, /* next */
+ "Quake_27", /* name */
+ 0, /* type */
+ 0, /* n_attach */
+ masq_quake_init_1, /* ip_masq_init_1 */
+ masq_quake_done_1, /* ip_masq_done_1 */
+ masq_quake_out, /* pkt_out */
+ masq_quake_in /* pkt_in */
+};
+
+/*
+ * ip_masq_quake initialization
+ */
+
+__initfunc(int ip_masq_quake_init(void))
+{
+ return (register_ip_masq_app(&ip_masq_quake, IPPROTO_UDP, 26000) +
+ register_ip_masq_app(&ip_masq_quakenew, IPPROTO_UDP, 27000));
+}
+
+/*
+ * ip_masq_quake fin.
+ */
+
+int ip_masq_quake_done(void)
+{
+ return (unregister_ip_masq_app(&ip_masq_quake) +
+ unregister_ip_masq_app(&ip_masq_quakenew));
+}
+
+#ifdef MODULE
+EXPORT_NO_SYMBOLS;
+
+int init_module(void)
+{
+ if (ip_masq_quake_init() != 0)
+ return -EIO;
+ return 0;
+}
+
+void cleanup_module(void)
+{
+ if (ip_masq_quake_done() != 0)
+ printk("ip_masq_quake: can't remove module");
+}
+
+#endif /* MODULE */
+
+
diff --git a/pfinet/linux-src/net/ipv4/ip_masq_raudio.c b/pfinet/linux-src/net/ipv4/ip_masq_raudio.c
new file mode 100644
index 00000000..ee3e276b
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_masq_raudio.c
@@ -0,0 +1,578 @@
+/*
+ * IP_MASQ_RAUDIO - Real Audio masquerading module
+ *
+ *
+ * Version: @(#)$Id: ip_masq_raudio.c,v 1.11 1998/10/06 04:49:04 davem Exp $
+ *
+ * Author: Nigel Metheringham
+ * Real Time Streaming code by Progressive Networks
+ * [strongly based on ftp module by Juan Jose Ciarlante & Wouter Gadeyne]
+ * [Real Audio information taken from Progressive Networks firewall docs]
+ * [Kudos to Progressive Networks for making the protocol specs available]
+ *
+ *
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ *
+ * Limitations
+ * The IP Masquerading proxies at present do not have access to a processed
+ * data stream. Hence for a protocol like the Real Audio control protocol,
+ * which depends on knowing where you are in the data stream, you either
+ * to keep a *lot* of state in your proxy, or you cheat and simplify the
+ * problem [needless to say I did the latter].
+ *
+ * This proxy only handles data in the first packet. Everything else is
+ * passed transparently. This means it should work under all normal
+ * circumstances, but it could be fooled by new data formats or a
+ * malicious application!
+ *
+ * At present the "first packet" is defined as a packet starting with
+ * the protocol ID string - "PNA".
+ * When the link is up there appears to be enough control data
+ * crossing the control link to keep it open even if a long audio
+ * piece is playing.
+ *
+ * The Robust UDP support added in RealAudio 3.0 is supported, but due
+ * to servers/clients not making great use of this has not been greatly
+ * tested. RealVideo (as used in the Real client version 4.0beta1) is
+ * supported but again is not greatly tested (bandwidth requirements
+ * appear to exceed that available at the sites supporting the protocol).
+ *
+ * Multiple Port Support
+ * The helper can be made to handle up to MAX_MASQ_APP_PORTS (normally 12) ports,
+ * with the port numbers being defined at module load time. The module
+ * uses the symbol "ports" to define a list of monitored ports, which can
+ * be specified on the insmod command line as
+ * ports=x1,x2,x3...
+ * where x[n] are integer port numbers. This option can be put into
+ * /etc/conf.modules (or /etc/modules.conf depending on your config)
+ * where modload will pick it up should you use modload to load your
+ * modules.
+ *
+ * Fixes:
+ * Juan Jose Ciarlante : Use control_add() for control chan
+ * 10/15/97 - Modifications to allow masquerading of RTSP connections as
+ * well as PNA, which can potentially exist on the same port.
+ * Joe Rumsey <ogre@real.com>
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/ip_masq.h>
+
+/*
+#ifndef DEBUG_CONFIG_IP_MASQ_RAUDIO
+#define DEBUG_CONFIG_IP_MASQ_RAUDIO 0
+#endif
+*/
+
+#define TOLOWER(c) (((c) >= 'A' && (c) <= 'Z') ? ((c) - 'A' + 'a') : (c))
+#define ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
+
+struct raudio_priv_data {
+ /* Associated data connection - setup but not used at present */
+ struct ip_masq *data_conn;
+ /* UDP Error correction connection - setup but not used at present */
+ struct ip_masq *error_conn;
+ /* Have we seen and performed setup */
+ short seen_start;
+ short is_rtsp;
+};
+
+int
+masq_rtsp_out (struct ip_masq_app *mapp,
+ struct ip_masq *ms,
+ struct sk_buff **skb_p,
+ __u32 maddr);
+
+/*
+ * List of ports (up to MAX_MASQ_APP_PORTS) to be handled by helper
+ * First port is set to the default port.
+ */
+int ports[MAX_MASQ_APP_PORTS] = {554, 7070, 0}; /* I rely on the trailing items being set to zero */
+struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS];
+
+/*
+ * Debug level
+ */
+#ifdef CONFIG_IP_MASQ_DEBUG
+static int debug=0;
+MODULE_PARM(debug, "i");
+#endif
+
+MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i");
+
+
+static int
+masq_raudio_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
+{
+ MOD_INC_USE_COUNT;
+ if ((ms->app_data = kmalloc(sizeof(struct raudio_priv_data),
+ GFP_ATOMIC)) == NULL)
+ printk(KERN_INFO "RealAudio: No memory for application data\n");
+ else
+ {
+ struct raudio_priv_data *priv =
+ (struct raudio_priv_data *)ms->app_data;
+ priv->seen_start = 0;
+ priv->data_conn = NULL;
+ priv->error_conn = NULL;
+ priv->is_rtsp = 0;
+ }
+ return 0;
+}
+
+static int
+masq_raudio_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
+{
+ MOD_DEC_USE_COUNT;
+ if (ms->app_data)
+ kfree_s(ms->app_data, sizeof(struct raudio_priv_data));
+ return 0;
+}
+
+int
+masq_raudio_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *p, *data, *data_limit;
+ struct ip_masq *n_ms;
+ unsigned short version, msg_id, msg_len, udp_port;
+ struct raudio_priv_data *priv =
+ (struct raudio_priv_data *)ms->app_data;
+
+ /* Everything running correctly already */
+ if (priv && priv->seen_start)
+ return 0;
+
+ if(priv && priv->is_rtsp)
+ return masq_rtsp_out(mapp, ms, skb_p, maddr);
+
+ skb = *skb_p;
+ iph = skb->nh.iph;
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+ data = (char *)&th[1];
+
+ data_limit = skb->h.raw + skb->len;
+
+ if(memcmp(data, "OPTIONS", 7) == 0 ||
+ memcmp(data, "DESCRIBE", 8) == 0)
+ {
+ IP_MASQ_DEBUG(1-debug, "RealAudio: Detected RTSP connection\n");
+ /* This is an RTSP client */
+ if(priv)
+ priv->is_rtsp = 1;
+ return masq_rtsp_out(mapp, ms, skb_p, maddr);
+ }
+
+ /* Check to see if this is the first packet with protocol ID */
+ if (memcmp(data, "PNA", 3)) {
+ IP_MASQ_DEBUG(1-debug, "RealAudio: not initial protocol packet - ignored\n");
+ return(0);
+ }
+ data += 3;
+ memcpy(&version, data, 2);
+
+ IP_MASQ_DEBUG(1-debug, "RealAudio: initial seen - protocol version %d\n",
+ ntohs(version));
+ if (priv)
+ priv->seen_start = 1;
+
+ if (ntohs(version) >= 256)
+ {
+ printk(KERN_INFO "RealAudio: version (%d) not supported\n",
+ ntohs(version));
+ return 0;
+ }
+
+ data += 2;
+ while (data+4 < data_limit) {
+ memcpy(&msg_id, data, 2);
+ data += 2;
+ memcpy(&msg_len, data, 2);
+ data += 2;
+ if (ntohs(msg_id) == 0) {
+ /* The zero tag indicates the end of options */
+ IP_MASQ_DEBUG(1-debug, "RealAudio: packet end tag seen\n");
+ return 0;
+ }
+ IP_MASQ_DEBUG(1-debug, "RealAudio: msg %d - %d byte\n",
+ ntohs(msg_id), ntohs(msg_len));
+ p = data;
+ data += ntohs(msg_len);
+ if (data > data_limit)
+ {
+ printk(KERN_INFO "RealAudio: Packet too short for data\n");
+ return 0;
+ }
+ if ((ntohs(msg_id) == 1) || (ntohs(msg_id) == 7)) {
+ /*
+ * MsgId == 1
+ * Audio UDP data port on client
+ *
+ * MsgId == 7
+ * Robust UDP error correction port number on client
+ *
+ * Since these messages are treated just the same, they
+ * are bundled together here....
+ */
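+			/*
+			 * Annotation (not in the original): the option
+			 * stream being walked looks like
+			 *
+			 *	"PNA" <version:2>
+			 *	( <msg_id:2> <msg_len:2> <payload:msg_len> )*
+			 *	<msg_id == 0 terminator>
+			 *
+			 * and for msg 1/7 the payload begins with the
+			 * client's UDP port, which is rewritten below to
+			 * the masqueraded port.
+			 */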
+ memcpy(&udp_port, p, 2);
+
+ /*
+ * Sometimes a server sends a message 7 with a zero UDP port
+ * Rather than do anything with this, just ignore it!
+ */
+ if (udp_port == 0)
+ continue;
+
+
+ n_ms = ip_masq_new(IPPROTO_UDP,
+ maddr, 0,
+ ms->saddr, udp_port,
+ ms->daddr, 0,
+ IP_MASQ_F_NO_DPORT);
+
+ if (n_ms==NULL)
+ return 0;
+
+ ip_masq_listen(n_ms);
+ ip_masq_control_add(n_ms, ms);
+
+ memcpy(p, &(n_ms->mport), 2);
+ IP_MASQ_DEBUG(1-debug, "RealAudio: rewrote UDP port %d -> %d in msg %d\n",
+ ntohs(udp_port), ntohs(n_ms->mport), ntohs(msg_id));
+
+ /* Make ref in application data to data connection */
+ if (priv) {
+ if (ntohs(msg_id) == 1)
+ priv->data_conn = n_ms;
+ else
+ priv->error_conn = n_ms;
+ }
+
+ ip_masq_put(n_ms);
+ }
+ }
+ return 0;
+}
+
+/*
+ *	masq_rtsp_out
+ *
+ *	Rewrite the client_port transport parameter of an outgoing RTSP
+ *	request and open masquerading tunnels for the UDP port(s) it
+ *	names.  Returns the resulting change in packet length, or 0 if
+ *	nothing was rewritten.
+ */
+int
+masq_rtsp_out (struct ip_masq_app *mapp,
+ struct ip_masq *ms,
+ struct sk_buff **skb_p,
+ __u32 maddr)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *data, *data_limit;
+ struct ip_masq *n_ms, *n_ms2;
+ unsigned short udp_port;
+ struct raudio_priv_data *priv =
+ (struct raudio_priv_data *)ms->app_data;
+ const char* srch = "transport:";
+ const char* srchpos = srch;
+ const char* srchend = srch + strlen(srch);
+ int state = 0;
+ char firstport[6];
+ int firstportpos = 0;
+ char secondport[6];
+ int secondportpos = 0;
+ char *portstart = NULL, *portend = NULL;
+ int diff;
+
+ /* Everything running correctly already */
+ if (priv && priv->seen_start)
+ return 0;
+
+ skb = *skb_p;
+ iph = skb->nh.iph;
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+ data = (char *)&th[1];
+
+ data_limit = skb->h.raw + skb->len;
+
+ firstport[0] = 0;
+ secondport[0] = 0;
+
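+	/*
+	 * Annotation (not in the original): the scanner below is a
+	 * per-byte state machine over the request text:
+	 *
+	 *	0: match "transport:"      1: match "client_port"
+	 *	2: skip to '='             3: wait for the first digit
+	 *	4: collect first port      5: after '-', await a digit
+	 *	6: collect second port     state < 0: ports captured
+	 */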
+ while(data < data_limit && state >= 0)
+ {
+ switch(state)
+ {
+ case 0:
+ case 1:
+ if(TOLOWER(*data) == *srchpos)
+ {
+ srchpos++;
+ if(srchpos == srchend)
+ {
+ IP_MASQ_DEBUG(1-debug, "Found string %s in message\n",
+ srch);
+ state++;
+ if(state == 1)
+ {
+ srch = "client_port";
+ srchpos = srch;
+ srchend = srch + strlen(srch);
+ }
+ }
+ }
+ else
+ {
+ srchpos = srch;
+ }
+ break;
+ case 2:
+ if(*data == '=')
+ state = 3;
+ break;
+ case 3:
+ if(ISDIGIT(*data))
+ {
+ portstart = data;
+ firstportpos = 0;
+ firstport[firstportpos++] = *data;
+ state = 4;
+ }
+ break;
+ case 4:
+ if(*data == '-')
+ {
+ state = 5;
+ }
+ else if(*data == ';')
+ {
+ portend = data - 1;
+ firstport[firstportpos] = 0;
+ state = -1;
+ }
+			else if(ISDIGIT(*data))
+			{
+				/* Bail out rather than overrun firstport[] */
+				if(firstportpos >= sizeof(firstport) - 1)
+					return 0;
+				firstport[firstportpos++] = *data;
+			}
+ else if(*data != ' ' && *data != '\t')
+ {
+ /* This is a badly formed RTSP message, let's bail out */
+ IP_MASQ_DEBUG(1-debug, "Badly formed RTSP Message\n");
+ return 0;
+ }
+ break;
+ case 5:
+ if(ISDIGIT(*data))
+ {
+ secondportpos = 0;
+ secondport[secondportpos++] = *data;
+ state = 6;
+ }
+ else if(*data == ';')
+ {
+ portend = data - 1;
+ secondport[secondportpos] = 0;
+ state = -1;
+ }
+ break;
+ case 6:
+ if(*data == ';')
+ {
+ portend = data - 1;
+ secondport[secondportpos] = 0;
+ state = -1;
+ }
+			else if(ISDIGIT(*data))
+			{
+				/* Bail out rather than overrun secondport[] */
+				if(secondportpos >= sizeof(secondport) - 1)
+					return 0;
+				secondport[secondportpos++] = *data;
+			}
+ else if(*data != ' ' && *data != '\t')
+ {
+ /* This is a badly formed RTSP message, let's bail out */
+ IP_MASQ_DEBUG(1-debug, "Badly formed RTSP Message\n");
+ return 0;
+ }
+ break;
+ }
+ data++;
+ }
+
+ if(state >= 0)
+ return 0;
+
+ if(firstportpos > 0)
+ {
+ char newbuf[12]; /* xxxxx-xxxxx\0 */
+ char* tmpptr;
+
+ udp_port = htons(simple_strtoul(firstport, &tmpptr, 10));
+ n_ms = ip_masq_new(IPPROTO_UDP,
+ maddr, 0,
+ ms->saddr, udp_port,
+ ms->daddr, 0,
+ IP_MASQ_F_NO_DPORT);
+ if (n_ms==NULL)
+ return 0;
+
+ ip_masq_listen(n_ms);
+ ip_masq_control_add(n_ms, ms);
+
+ if(secondportpos > 0)
+ {
+ udp_port = htons(simple_strtoul(secondport, &tmpptr, 10));
+ n_ms2 = ip_masq_new(IPPROTO_UDP,
+ maddr, 0,
+ ms->saddr, udp_port,
+ ms->daddr, 0,
+ IP_MASQ_F_NO_DPORT);
+ if (n_ms2==NULL) {
+ ip_masq_put(n_ms);
+ return 0;
+ }
+
+ ip_masq_listen(n_ms2);
+ ip_masq_control_add(n_ms2, ms);
+ sprintf(newbuf, "%d-%d", ntohs(n_ms->mport),
+ ntohs(n_ms2->mport));
+ }
+ else
+ {
+ sprintf(newbuf, "%d", ntohs(n_ms->mport));
+ n_ms2 = NULL;
+ }
+ *skb_p = ip_masq_skb_replace(skb, GFP_ATOMIC,
+ portstart, portend - portstart + 1,
+ newbuf, strlen(newbuf));
+ IP_MASQ_DEBUG(1-debug, "RTSP: rewrote client_port to %s\n", newbuf);
+		/* ip_masq_skb_replace() swapped out portend - portstart + 1 bytes */
+		diff = strlen(newbuf) - (portend - portstart + 1);
+ }
+ else
+ {
+ return 0;
+ }
+
+ if(priv)
+ {
+ priv->seen_start = 1;
+ if(n_ms)
+ priv->data_conn = n_ms;
+ if(n_ms2)
+ priv->error_conn = n_ms2;
+ }
+ /*
+ * Release tunnels
+ */
+
+ if (n_ms)
+ ip_masq_put(n_ms);
+
+ if (n_ms2)
+ ip_masq_put(n_ms2);
+
+ return diff;
+}
+
+struct ip_masq_app ip_masq_raudio = {
+ NULL, /* next */
+ "RealAudio", /* name */
+ 0, /* type */
+ 0, /* n_attach */
+ masq_raudio_init_1, /* ip_masq_init_1 */
+ masq_raudio_done_1, /* ip_masq_done_1 */
+ masq_raudio_out, /* pkt_out */
+ NULL /* pkt_in */
+};
+
+/*
+ * ip_masq_raudio initialization
+ */
+
+__initfunc(int ip_masq_raudio_init(void))
+{
+ int i, j;
+
+ for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
+ if (ports[i]) {
+ if ((masq_incarnations[i] = kmalloc(sizeof(struct ip_masq_app),
+ GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+ memcpy(masq_incarnations[i], &ip_masq_raudio, sizeof(struct ip_masq_app));
+ if ((j = register_ip_masq_app(masq_incarnations[i],
+ IPPROTO_TCP,
+ ports[i]))) {
+ return j;
+ }
+ IP_MASQ_DEBUG(1-debug, "RealAudio: loaded support on port[%d] = %d\n",
+ i, ports[i]);
+ } else {
+ /* To be safe, force the incarnation table entry to NULL */
+ masq_incarnations[i] = NULL;
+ }
+ }
+ return 0;
+}
+
+/*
+ * ip_masq_raudio fin.
+ */
+
+int ip_masq_raudio_done(void)
+{
+ int i, j, k;
+
+ k=0;
+ for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
+ if (masq_incarnations[i]) {
+ if ((j = unregister_ip_masq_app(masq_incarnations[i]))) {
+ k = j;
+ } else {
+ kfree(masq_incarnations[i]);
+ masq_incarnations[i] = NULL;
+ IP_MASQ_DEBUG(1-debug, "RealAudio: unloaded support on port[%d] = %d\n",
+ i, ports[i]);
+ }
+ }
+ }
+ return k;
+}
+
+#ifdef MODULE
+EXPORT_NO_SYMBOLS;
+
+int init_module(void)
+{
+ if (ip_masq_raudio_init() != 0)
+ return -EIO;
+ return 0;
+}
+
+void cleanup_module(void)
+{
+ if (ip_masq_raudio_done() != 0)
+ printk(KERN_INFO "ip_masq_raudio: can't remove module");
+}
+
+#endif /* MODULE */
diff --git a/pfinet/linux-src/net/ipv4/ip_masq_user.c b/pfinet/linux-src/net/ipv4/ip_masq_user.c
new file mode 100644
index 00000000..51297441
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_masq_user.c
@@ -0,0 +1,473 @@
+/*
+ * IP_MASQ_USER user space control module
+ *
+ *
+ * $Id: ip_masq_user.c,v 1.1.2.1 1999/08/07 10:56:33 davem Exp $
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/inet.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/checksum.h>
+#include <net/ip_masq.h>
+#include <net/ip_masq_mod.h>
+#include <linux/sysctl.h>
+#include <linux/ip_fw.h>
+
+#include <linux/ip_masq.h>
+
+/*
+ * Debug level
+ */
+static int debug=0;
+
+MODULE_PARM(debug, "i");
+
+/*
+static int check_5uple (struct ip_masq_user *ums) {
+ return 0;
+}
+*/
+static void masq_user_k2u(const struct ip_masq *ms, struct ip_masq_user *ums)
+{
+ ums->protocol = ms->protocol;
+ ums->daddr = ms->daddr;
+ ums->dport = ms->dport;
+ ums->maddr = ms->maddr;
+ ums->mport = ms->mport;
+ ums->saddr = ms->saddr;
+ ums->sport = ms->sport;
+ ums->timeout = ms->timeout;
+}
+
+
+static int ip_masq_user_maddr(struct ip_masq_user *ums)
+{
+ struct device *dev;
+ struct rtable *rt;
+ int ret = -EINVAL;
+ u32 rt_daddr, rt_saddr;
+ u32 tos;
+
+ /*
+ * Did specify masq address.
+ */
+ if (ums->maddr)
+ return 0;
+
+ /*
+ * Select address to use for routing query
+ */
+
+ rt_daddr = ums->rt_daddr? ums->rt_daddr : ums->daddr;
+ rt_saddr = ums->rt_saddr? ums->rt_saddr : ums->saddr;
+
+
+ /*
+ * No address for routing, cannot continue
+ */
+ if (rt_daddr == 0) {
+ IP_MASQ_DEBUG(1-debug, "cannot setup maddr with daddr=%lX, rt_addr=%lX\n",
+ ntohl(ums->daddr), ntohl(ums->rt_daddr));
+ return -EINVAL;
+ }
+
+ /*
+ * Find out rt device
+ */
+
+ rt_saddr = 0;
+ tos = RT_TOS(ums->ip_tos) | RTO_CONN;
+
+ if ((ret=ip_route_output(&rt, rt_daddr, rt_saddr, tos, 0 /* dev */))) {
+ IP_MASQ_DEBUG(0-debug, "could not setup maddr for routing daddr=%lX, saddr=%lX\n",
+ ntohl(rt_daddr), ntohl(rt_saddr));
+ return ret;
+ }
+ dev = rt->u.dst.dev;
+ ums->maddr = ip_masq_select_addr(dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
+
+ IP_MASQ_DEBUG(1-debug, "did setup maddr=%lX\n", ntohl(ums->maddr));
+ ip_rt_put(rt);
+ return 0;
+}
+
+/*
+ * Create new entry (from uspace)
+ */
+static int ip_masq_user_new(struct ip_masq_user *ums)
+{
+ struct ip_masq *ms = NULL;
+ unsigned mflags = 0;
+ int ret;
+
+ if (masq_proto_num (ums->protocol) == -1) {
+ return EPROTONOSUPPORT;
+ }
+
+ if (ums->dport == 0) {
+ ums->flags |= IP_MASQ_USER_F_LISTEN;
+ }
+
+	if (ums->flags & IP_MASQ_USER_F_LISTEN) {
+ if ((ums->saddr == 0) || (ums->sport == 0)) {
+ return EINVAL;
+ }
+ mflags |= (IP_MASQ_F_NO_DPORT|IP_MASQ_F_NO_DADDR);
+
+ }
+
+ if ((ret = ip_masq_user_maddr(ums)) < 0) {
+ return -ret;
+ }
+
+ mflags |= IP_MASQ_F_USER;
+ ms = ip_masq_new(ums->protocol,
+ ums->maddr, ums->mport,
+ ums->saddr, ums->sport,
+ ums->daddr, ums->dport,
+ mflags);
+
+ if (ms == NULL) {
+ /*
+ * FIXME: ip_masq_new() should return errno
+ */
+ return EBUSY;
+ }
+
+ /*
+ * Setup timeouts for this new entry
+ */
+
+ if (ums->timeout) {
+ ms->timeout = ums->timeout;
+	} else if (ums->flags & IP_MASQ_USER_F_LISTEN) {
+ ip_masq_listen(ms);
+ }
+
+ masq_user_k2u(ms, ums);
+ ip_masq_put(ms);
+ return 0;
+}
+
+/*
+ * Delete existing entry
+ */
+static int ip_masq_user_del(struct ip_masq_user *ums)
+{
+ struct ip_masq *ms=NULL;
+
+ if (masq_proto_num (ums->protocol) == -1) {
+ return EPROTONOSUPPORT;
+ }
+ start_bh_atomic();
+ if (ums->mport && ums->maddr) {
+ ms = ip_masq_in_get(ums->protocol,
+ ums->daddr, ums->dport,
+ ums->maddr, ums->mport);
+ end_bh_atomic();
+ } else if (ums->sport && ums->saddr) {
+ ms = ip_masq_out_get(ums->protocol,
+ ums->saddr, ums->sport,
+ ums->daddr, ums->dport);
+ end_bh_atomic();
+	} else {
+		end_bh_atomic();
+		return EINVAL;
+	}
+
+ if (ms == NULL) {
+ return ESRCH;
+ }
+
+ /*
+ * got (locked) entry, setup almost tiny timeout :) and
+ * give away
+ *
+ * FIXME: should use something better than S_CLOSE
+ */
+ ms->timeout = IP_MASQ_S_CLOSE;
+
+ masq_user_k2u(ms, ums);
+ ip_masq_put(ms);
+ return 0;
+}
+
+static struct ip_masq * ip_masq_user_locked_get (struct ip_masq_user *ums, int *err)
+{
+ struct ip_masq *ms=NULL;
+	if (masq_proto_num (ums->protocol) == -1) {
+		*err = EPROTONOSUPPORT;
+		return NULL;
+	}
+
+ start_bh_atomic();
+ if (ums->mport && ums->maddr) {
+ ms = ip_masq_in_get(ums->protocol,
+ ums->daddr, ums->dport,
+ ums->maddr, ums->mport);
+ end_bh_atomic();
+ } else if (ums->sport && ums->saddr) {
+ ms = ip_masq_out_get(ums->protocol,
+ ums->saddr, ums->sport,
+ ums->daddr, ums->dport);
+ end_bh_atomic();
+	} else {
+		end_bh_atomic();
+		*err = EINVAL;
+		return NULL;
+	}
+
+ if (ms == NULL) *err = ESRCH;
+ return ms;
+}
+
+/*
+ * Get existing entry (complete full tunnel info)
+ */
+static int ip_masq_user_get(struct ip_masq_user *ums)
+{
+ struct ip_masq *ms=NULL;
+ int err;
+
+ ms = ip_masq_user_locked_get(ums, &err);
+ if (ms == NULL)
+ return err;
+
+ masq_user_k2u(ms, ums);
+
+ ip_masq_put(ms);
+ return 0;
+}
+
+/*
+ * Set (some, valid) entry parameters
+ */
+static int ip_masq_user_set(struct ip_masq_user *ums)
+{
+ struct ip_masq *ms = NULL;
+ int err;
+
+ ms = ip_masq_user_locked_get(ums, &err);
+ if (ms == NULL)
+ return err;
+
+ /*
+ * FIXME: must allow selecting what you want to set
+ */
+ ms->timeout = ums->timeout;
+
+ masq_user_k2u(ms, ums);
+
+ ip_masq_put(ms);
+ return 0;
+}
+
+
+/*
+ * Entry point
+ * ret value:
+ * <0 err
+ * ==0 ok
+ * >0 ok, copy to user
+ */
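+/*
+ * Annotation (not in the original): requests arrive as a struct
+ * ip_masq_ctl handed in from userspace via setsockopt(); the first
+ * IP_MASQ_CTL_BSIZE bytes form the generic header carrying the m_cmd
+ * command word, and the remainder is the struct ip_masq_user payload
+ * reached through mctl->u.user.  On success the payload, refreshed by
+ * masq_user_k2u(), is copied back to the caller.
+ */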
+static int ip_masq_user_ctl(int optname, struct ip_masq_ctl *mctl, int optlen)
+{
+ struct ip_masq_user *ums = &mctl->u.user;
+ int ret = EINVAL;
+ int arglen = optlen - IP_MASQ_CTL_BSIZE;
+ int cmd;
+
+ IP_MASQ_DEBUG(1-debug, "ip_masq_user_ctl(len=%d/%d|%d/%d)\n",
+ arglen,
+ sizeof (*ums),
+ optlen,
+ sizeof (*mctl));
+
+ /*
+ * Yes, I'm a bad guy ...
+ */
+ if (arglen != sizeof(*ums) && optlen != sizeof(*mctl))
+ return EINVAL;
+
+ MOD_INC_USE_COUNT;
+
+ /*
+ * Don't trust the lusers - plenty of error checking!
+ */
+ cmd = mctl->m_cmd;
+ IP_MASQ_DEBUG(1-debug, "ip_masq_user_ctl(cmd=%d)\n", cmd);
+
+ switch (mctl->m_cmd) {
+ case IP_MASQ_CMD_ADD:
+ case IP_MASQ_CMD_INSERT:
+ ret = ip_masq_user_new(ums);
+ break;
+ case IP_MASQ_CMD_DEL:
+ ret = ip_masq_user_del(ums);
+ break;
+ case IP_MASQ_CMD_SET:
+ ret = ip_masq_user_set(ums);
+ break;
+ case IP_MASQ_CMD_GET:
+ ret = ip_masq_user_get(ums);
+ break;
+ }
+
+ /*
+ * For all of the above, return masq tunnel info
+ */
+
+ ret = -ret;
+
+ if (ret == 0) {
+ ret = sizeof (*ums) + IP_MASQ_CTL_BSIZE;
+ IP_MASQ_DEBUG(1-debug, "will return %d bytes to user\n", ret);
+ }
+
+ MOD_DEC_USE_COUNT;
+ return ret;
+}
+
+
+#ifdef CONFIG_PROC_FS
+static int ip_masq_user_info(char *buffer, char **start, off_t offset,
+ int length, int proto)
+{
+ off_t pos=0, begin;
+ struct ip_masq *ms;
+ char temp[129];
+ int idx = 0;
+ int col;
+ int len=0;
+ int magic_control;
+ struct list_head *l,*e;
+
+ MOD_INC_USE_COUNT;
+
+ IP_MASQ_DEBUG(1-debug, "Entered user_info with proto=%d\n", proto);
+
+ if (offset < 128)
+ {
+ sprintf(temp,
+ "Prot SrcIP SPrt DstIP DPrt MAddr MPrt State Flgs Ref Ctl Expires HRow HCol (free=%d,%d,%d)",
+ atomic_read(ip_masq_free_ports),
+ atomic_read(ip_masq_free_ports+1),
+ atomic_read(ip_masq_free_ports+2));
+ len = sprintf(buffer, "%-127s\n", temp);
+ }
+ pos = 128;
+
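+	/*
+	 * Annotation (not in the original): every line this function
+	 * emits is padded to exactly 128 bytes ("%-127s\n"), so the
+	 * offset/pos arithmetic can treat the /proc output as fixed-size
+	 * records and resume correctly across sequential reads.
+	 */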
+ for(idx = 0; idx < IP_MASQ_TAB_SIZE; idx++)
+ {
+ /*
+		 * Lock is actually only needed in the next loop;
+ * we are called from uspace: must stop bh.
+ */
+ col=0;
+ read_lock_bh(&__ip_masq_lock);
+ l = &ip_masq_m_table[idx];
+ for (e=l->next; e!=l; e=e->next) {
+ col++;
+ ms = list_entry(e, struct ip_masq, m_list);
+ if (ms->protocol != proto) {
+ continue;
+ }
+
+ pos += 128;
+ if (pos <= offset) {
+ len = 0;
+ continue;
+ }
+
+ /*
+ * We have locked the tables, no need to del/add timers
+ * nor cli() 8)
+ */
+
+
+ magic_control = atomic_read(&ms->n_control);
+ if (!magic_control && ms->control) magic_control = -1;
+ sprintf(temp,"%-4s %08lX:%04X %08lX:%04X %08lX:%04X %-12s %3X %4d %3d %7lu %4d %4d",
+ masq_proto_name(ms->protocol),
+ ntohl(ms->saddr), ntohs(ms->sport),
+ ntohl(ms->daddr), ntohs(ms->dport),
+ ntohl(ms->maddr), ntohs(ms->mport),
+ ip_masq_state_name(ms->state),
+ ms->flags,
+ atomic_read(&ms->refcnt),
+ magic_control,
+ (ms->timer.expires-jiffies)/HZ,
+ idx, col);
+ len += sprintf(buffer+len, "%-127s\n", temp);
+
+ if(len >= length) {
+ read_unlock_bh(&__ip_masq_lock);
+ goto done;
+ }
+ }
+ read_unlock_bh(&__ip_masq_lock);
+ }
+
+done:
+
+ if (len) {
+ begin = len - (pos - offset);
+ *start = buffer + begin;
+ len -= begin;
+ }
+ if(len>length)
+ len = length;
+ MOD_DEC_USE_COUNT;
+ return len;
+}
+#else
+#define ip_masq_user_info NULL
+#endif
+
+static struct ip_masq_hook ip_masq_user = {
+ ip_masq_user_ctl,
+ ip_masq_user_info
+};
+
+int ip_masq_user_init(void)
+{
+ if (ip_masq_user_hook != NULL)
+ return -EEXIST;
+ ip_masq_user_hook = &ip_masq_user;
+ return 0;
+}
+
+int ip_masq_user_done(void)
+{
+ if (ip_masq_user_hook == NULL)
+ return ENOENT;
+ ip_masq_user_hook = NULL;
+ return 0;
+}
+
+#ifdef MODULE
+EXPORT_NO_SYMBOLS;
+int init_module(void)
+{
+ if (ip_masq_user_init() != 0)
+ return -EIO;
+ return 0;
+}
+
+void cleanup_module(void)
+{
+ if (ip_masq_user_done() != 0)
+ printk(KERN_INFO "ip_masq_user_done(): can't remove module");
+}
+
+#endif /* MODULE */
diff --git a/pfinet/linux-src/net/ipv4/ip_masq_vdolive.c b/pfinet/linux-src/net/ipv4/ip_masq_vdolive.c
new file mode 100644
index 00000000..4724e3b9
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_masq_vdolive.c
@@ -0,0 +1,294 @@
+/*
+ * IP_MASQ_VDOLIVE - VDO Live masquerading module
+ *
+ *
+ * Version: @(#)$Id: ip_masq_vdolive.c,v 1.4 1998/10/06 04:49:07 davem Exp $
+ *
+ * Author: Nigel Metheringham <Nigel.Metheringham@ThePLAnet.net>
+ * PLAnet Online Ltd
+ *
+ * Fixes: Minor changes for 2.1 by
+ * Steven Clarke <Steven.Clarke@ThePlanet.Net>, Planet Online Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Thanks:
+ * Thank you to VDOnet Corporation for allowing me access to
+ * a protocol description without an NDA. This means that
+ * this module can be distributed as source - a great help!
+ *
+ */
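+
+/*
+ * Annotation (not in the original): the control packets parsed below
+ * carry, as reconstructed from the code,
+ *
+ *	<4 bytes> <tag:4>		tag 6 = initial, 1 = secondary
+ *	"VDO Live" at offset 24 (tag 6) or offset 8 (tag 1)
+ *	2 padding bytes, then the client's 16-bit UDP port
+ *
+ * with all values packed into 32-bit words.
+ */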
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <asm/system.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/ip_masq.h>
+
+struct vdolive_priv_data {
+ /* Ports used */
+ unsigned short origport;
+ unsigned short masqport;
+ /* State of decode */
+ unsigned short state;
+};
+
+/*
+ * List of ports (up to MAX_MASQ_APP_PORTS) to be handled by helper
+ * First port is set to the default port.
+ */
+static int ports[MAX_MASQ_APP_PORTS] = {7000}; /* I rely on the trailing items being set to zero */
+static struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS];
+
+/*
+ * Debug level
+ */
+#ifdef CONFIG_IP_MASQ_DEBUG
+static int debug=0;
+MODULE_PARM(debug, "i");
+#endif
+
+MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i");
+
+static int
+masq_vdolive_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
+{
+ MOD_INC_USE_COUNT;
+ if ((ms->app_data = kmalloc(sizeof(struct vdolive_priv_data),
+ GFP_ATOMIC)) == NULL)
+ IP_MASQ_DEBUG(1-debug, "VDOlive: No memory for application data\n");
+ else
+ {
+ struct vdolive_priv_data *priv =
+ (struct vdolive_priv_data *)ms->app_data;
+ priv->origport = 0;
+ priv->masqport = 0;
+ priv->state = 0;
+ }
+ return 0;
+}
+
+static int
+masq_vdolive_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
+{
+ MOD_DEC_USE_COUNT;
+ if (ms->app_data)
+ kfree_s(ms->app_data, sizeof(struct vdolive_priv_data));
+ return 0;
+}
+
+int
+masq_vdolive_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *data, *data_limit;
+ unsigned int tagval; /* This should be a 32 bit quantity */
+ struct ip_masq *n_ms;
+ struct vdolive_priv_data *priv =
+ (struct vdolive_priv_data *)ms->app_data;
+
+ /* This doesn't work at all if no priv data was allocated on startup */
+ if (!priv)
+ return 0;
+
+ /* Everything running correctly already */
+ if (priv->state == 3)
+ return 0;
+
+ skb = *skb_p;
+ iph = skb->nh.iph;
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+ data = (char *)&th[1];
+
+ data_limit = skb->h.raw + skb->len;
+
+ if (data+8 > data_limit) {
+ IP_MASQ_DEBUG(1-debug, "VDOlive: packet too short for ID %p %p\n", data, data_limit);
+ return 0;
+ }
+ memcpy(&tagval, data+4, 4);
+	IP_MASQ_DEBUG(1-debug, "VDOlive: packet seen, tag %u, in initial state %d\n", ntohl(tagval), priv->state);
+
+ /* Check for leading packet ID */
+ if ((ntohl(tagval) != 6) && (ntohl(tagval) != 1)) {
+		IP_MASQ_DEBUG(1-debug, "VDOlive: unrecognised tag %u, in initial state %d\n", ntohl(tagval), priv->state);
+ return 0;
+ }
+
+
+ /* Check packet is long enough for data - ignore if not */
+ if ((ntohl(tagval) == 6) && (data+36 > data_limit)) {
+ IP_MASQ_DEBUG(1-debug, "VDOlive: initial packet too short %p %p\n", data, data_limit);
+ return 0;
+ } else if ((ntohl(tagval) == 1) && (data+20 > data_limit)) {
+ IP_MASQ_DEBUG(1-debug,"VDOlive: secondary packet too short %p %p\n", data, data_limit);
+ return 0;
+ }
+
+ /* Adjust data pointers */
+ /*
+ * I could check the complete protocol version tag
+ * in here however I am just going to look for the
+ * "VDO Live" tag in the hope that this part will
+ * remain constant even if the version changes
+ */
+ if (ntohl(tagval) == 6) {
+ data += 24;
+ IP_MASQ_DEBUG(1-debug, "VDOlive: initial packet found\n");
+ } else {
+ data += 8;
+ IP_MASQ_DEBUG(1-debug, "VDOlive: secondary packet found\n");
+ }
+
+ if (memcmp(data, "VDO Live", 8) != 0) {
+ IP_MASQ_DEBUG(1-debug,"VDOlive: did not find tag\n");
+ return 0;
+ }
+ /*
+ * The port number is the next word after the tag.
+ * VDOlive encodes all of these values
+ * in 32 bit words, so in this case I am
+ * skipping the first 2 bytes of the next
+ * word to get to the relevant 16 bits
+ */
+ data += 10;
+
+ /*
+ * If we have not seen the port already,
+ * set the masquerading tunnel up
+ */
+ if (!priv->origport) {
+ memcpy(&priv->origport, data, 2);
+ IP_MASQ_DEBUG(1-debug, "VDOlive: found port %d\n", ntohs(priv->origport));
+
+ /* Open up a tunnel */
+ n_ms = ip_masq_new(IPPROTO_UDP,
+ maddr, 0,
+ ms->saddr, priv->origport,
+ ms->daddr, 0,
+ IP_MASQ_F_NO_DPORT);
+
+ if (n_ms==NULL) {
+ IP_MASQ_DEBUG(1-debug, "VDOlive: unable to build UDP tunnel for %x:%x\n", ms->saddr, priv->origport);
+ /* Leave state as unset */
+ priv->origport = 0;
+ return 0;
+ }
+ ip_masq_listen(n_ms);
+
+		/* Drop the creation reference on the new tunnel entry */
+		ip_masq_put(n_ms);
+ priv->masqport = n_ms->mport;
+ } else if (memcmp(data, &(priv->origport), 2)) {
+ IP_MASQ_DEBUG(1-debug, "VDOlive: ports do not match\n");
+ /* Write the port in anyhow!!! */
+ }
+
+ /*
+ * Write masq port into packet
+ */
+ memcpy(data, &(priv->masqport), 2);
+ IP_MASQ_DEBUG(1-debug, "VDOlive: rewrote port %d to %d, server %08X\n", ntohs(priv->origport), ntohs(priv->masqport), ms->saddr);
+
+ /*
+	 * Set state bit to mark which packet type has been handled
+ */
+
+ priv->state |= (ntohl(tagval) == 6) ? 1 : 2;
+
+ return 0;
+}
+
+
+struct ip_masq_app ip_masq_vdolive = {
+ NULL, /* next */
+ "VDOlive", /* name */
+ 0, /* type */
+ 0, /* n_attach */
+ masq_vdolive_init_1, /* ip_masq_init_1 */
+ masq_vdolive_done_1, /* ip_masq_done_1 */
+ masq_vdolive_out, /* pkt_out */
+ NULL /* pkt_in */
+};
+
+/*
+ * ip_masq_vdolive initialization
+ */
+
+__initfunc(int ip_masq_vdolive_init(void))
+{
+ int i, j;
+
+ for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
+ if (ports[i]) {
+ if ((masq_incarnations[i] = kmalloc(sizeof(struct ip_masq_app),
+ GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+ memcpy(masq_incarnations[i], &ip_masq_vdolive, sizeof(struct ip_masq_app));
+ if ((j = register_ip_masq_app(masq_incarnations[i],
+ IPPROTO_TCP,
+ ports[i]))) {
+ return j;
+ }
+			IP_MASQ_DEBUG(1-debug, "VDOlive: loaded support on port[%d] = %d\n", i, ports[i]);
+ } else {
+ /* To be safe, force the incarnation table entry to NULL */
+ masq_incarnations[i] = NULL;
+ }
+ }
+ return 0;
+}
+
+/*
+ * ip_masq_vdolive fin.
+ */
+
+int ip_masq_vdolive_done(void)
+{
+ int i, j, k;
+
+ k=0;
+ for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
+ if (masq_incarnations[i]) {
+ if ((j = unregister_ip_masq_app(masq_incarnations[i]))) {
+ k = j;
+ } else {
+ kfree(masq_incarnations[i]);
+ masq_incarnations[i] = NULL;
+ IP_MASQ_DEBUG(1-debug,"VDOlive: unloaded support on port[%d] = %d\n", i, ports[i]);
+ }
+ }
+ }
+ return k;
+}
+
+
+#ifdef MODULE
+EXPORT_NO_SYMBOLS;
+
+int init_module(void)
+{
+ if (ip_masq_vdolive_init() != 0)
+ return -EIO;
+ return 0;
+}
+
+void cleanup_module(void)
+{
+ if (ip_masq_vdolive_done() != 0)
+ IP_MASQ_DEBUG(1-debug, "ip_masq_vdolive: can't remove module");
+}
+
+#endif /* MODULE */
diff --git a/pfinet/linux-src/net/ipv4/ip_nat_dumb.c b/pfinet/linux-src/net/ipv4/ip_nat_dumb.c
new file mode 100644
index 00000000..5a1c6d75
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_nat_dumb.c
@@ -0,0 +1,158 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Dumb Network Address Translation.
+ *
+ * Version: $Id: ip_nat_dumb.c,v 1.8 1999/03/21 05:22:40 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Rani Assaf : A zero checksum is a special case
+ * only in UDP
+ * Rani Assaf : Added ICMP messages rewriting
+ * Rani Assaf : Repaired wrong changes, made by ANK.
+ *
+ *
+ * NOTE: This is just a working model of real NAT.
+ */
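+
+/*
+ * Annotation (not in the original): the checksum fixups below all use
+ * the incremental-update identity (cf. RFC 1624): for a one's
+ * complement sum C covering a field m that changes to m',
+ *
+ *	C' = ~(~C + ~m + m')
+ *
+ * The first csum_tcpudp_magic() call folds the new addresses into the
+ * complemented old sum; the second, fed the complemented old addresses
+ * (~osaddr, ~odaddr), cancels them out, leaving only the delta.
+ */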
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/firewall.h>
+#include <linux/ip_fw.h>
+#include <net/checksum.h>
+#include <linux/route.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+
+
+int
+ip_do_nat(struct sk_buff *skb)
+{
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct iphdr *iph = skb->nh.iph;
+ u32 odaddr = iph->daddr;
+ u32 osaddr = iph->saddr;
+ u16 check;
+
+ IPCB(skb)->flags |= IPSKB_TRANSLATED;
+
+ /* Rewrite IP header */
+ iph->daddr = rt->rt_dst_map;
+ iph->saddr = rt->rt_src_map;
+ iph->check = 0;
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+ /* If it is the first fragment, rewrite protocol headers */
+
+ if (!(iph->frag_off & htons(IP_OFFSET))) {
+ u16 *cksum;
+
+ switch(iph->protocol) {
+ case IPPROTO_TCP:
+ cksum = (u16*)&((struct tcphdr*)(((char*)iph) + (iph->ihl<<2)))->check;
+ if ((u8*)(cksum+1) > skb->tail)
+ goto truncated;
+ check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~(*cksum));
+ *cksum = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check);
+ break;
+ case IPPROTO_UDP:
+ cksum = (u16*)&((struct udphdr*)(((char*)iph) + (iph->ihl<<2)))->check;
+ if ((u8*)(cksum+1) > skb->tail)
+ goto truncated;
+ if ((check = *cksum) != 0) {
+ check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~check);
+ check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check);
+ *cksum = check ? : 0xFFFF;
+ }
+ break;
+ case IPPROTO_ICMP:
+ {
+ struct icmphdr *icmph = (struct icmphdr*)((char*)iph + (iph->ihl<<2));
+ struct iphdr *ciph;
+ u32 idaddr, isaddr;
+ int updated;
+
+ if ((icmph->type != ICMP_DEST_UNREACH) &&
+ (icmph->type != ICMP_TIME_EXCEEDED) &&
+ (icmph->type != ICMP_PARAMETERPROB))
+ break;
+
+ ciph = (struct iphdr *) (icmph + 1);
+
+ if ((u8*)(ciph+1) > skb->tail)
+ goto truncated;
+
+ isaddr = ciph->saddr;
+ idaddr = ciph->daddr;
+ updated = 0;
+
+ if (rt->rt_flags&RTCF_DNAT && ciph->saddr == odaddr) {
+ ciph->saddr = iph->daddr;
+ updated = 1;
+ }
+ if (rt->rt_flags&RTCF_SNAT) {
+ if (ciph->daddr != osaddr) {
+ struct fib_result res;
+ struct rt_key key;
+ unsigned flags = 0;
+
+ key.src = ciph->daddr;
+ key.dst = ciph->saddr;
+ key.iif = skb->dev->ifindex;
+ key.oif = 0;
+#ifdef CONFIG_IP_ROUTE_TOS
+ key.tos = RT_TOS(ciph->tos);
+#endif
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ key.fwmark = 0;
+#endif
+ /* Use fib_lookup() until we get our own
+ * hash table of NATed hosts -- Rani
+ */
+ if (fib_lookup(&key, &res) == 0 && res.r) {
+ ciph->daddr = fib_rules_policy(ciph->daddr, &res, &flags);
+ if (ciph->daddr != idaddr)
+ updated = 1;
+ }
+ } else {
+ ciph->daddr = iph->saddr;
+ updated = 1;
+ }
+ }
+ if (updated) {
+ cksum = &icmph->checksum;
+ /* Using tcpudp primitive. Why not? */
+ check = csum_tcpudp_magic(ciph->saddr, ciph->daddr, 0, 0, ~(*cksum));
+ *cksum = csum_tcpudp_magic(~isaddr, ~idaddr, 0, 0, ~check);
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ return 0;
+
+truncated:
+ return -EINVAL;
+}
diff --git a/pfinet/linux-src/net/ipv4/ip_options.c b/pfinet/linux-src/net/ipv4/ip_options.c
new file mode 100644
index 00000000..a3d1f0aa
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_options.c
@@ -0,0 +1,617 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The options processing module for ip.c
+ *
+ * Version: $Id: ip_options.c,v 1.16.2.1 1999/06/02 04:06:19 davem Exp $
+ *
+ * Authors: A.N.Kuznetsov
+ *
+ */
+
+#include <linux/types.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+
+/*
+ * Write options to the IP header, record the destination address in
+ * the source route option, fill in the address of the outgoing
+ * interface (we should already know it, so this function may be
+ * called only after the routing decision), and add a timestamp if we
+ * originate this datagram.
+ *
+ * daddr is the real destination address; the next hop is what gets
+ * recorded in the IP header.  saddr is the address of the outgoing
+ * interface.
+ */
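+
+/*
+ * Annotation (not in the original): the offset arithmetic throughout
+ * this file relies on the RFC 791 option layout
+ *
+ *	optptr[0] = type,  optptr[1] = total length,
+ *	optptr[2] = pointer (1-based index of the next free octet),
+ *
+ * so option base + pointer - 5 is the 4-byte slot most recently
+ * reserved in a record-route option, and pointer - 9 / pointer - 5
+ * are the address/timestamp slots of a timestamp option recording
+ * both.
+ */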
+
+void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
+ u32 daddr, struct rtable *rt, int is_frag)
+{
+ unsigned char * iph = skb->nh.raw;
+
+ memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
+ memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
+ opt = &(IPCB(skb)->opt);
+ opt->is_data = 0;
+
+ if (opt->srr)
+ memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
+
+ if (!is_frag) {
+ if (opt->rr_needaddr)
+ ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt);
+ if (opt->ts_needaddr)
+ ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt);
+ if (opt->ts_needtime) {
+ struct timeval tv;
+ __u32 midtime;
+ do_gettimeofday(&tv);
+ midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
+ memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
+ }
+ return;
+ }
+ if (opt->rr) {
+ memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
+ opt->rr = 0;
+ opt->rr_needaddr = 0;
+ }
+ if (opt->ts) {
+ memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
+ opt->ts = 0;
+ opt->ts_needaddr = opt->ts_needtime = 0;
+ }
+}
+
+/*
+ * Provided (sopt, skb) points to received options,
+ * build in dopt a compiled option set appropriate for answering,
+ * i.e. invert the SRR option, copy the others,
+ * and grab room in the RR/TS options.
+ *
+ * NOTE: dopt cannot point to skb.
+ */
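+
+/*
+ * Annotation (not in the original): for SRR, if the arriving datagram
+ * recorded hops A, B, C (C most recent), the reply is routed first to
+ * C (via dopt->faddr) while the echoed option carries the remaining
+ * hops in reverse with its pointer reset to 4, so the reply retraces
+ * the recorded path.
+ */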
+
+int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
+{
+ struct ip_options *sopt;
+ unsigned char *sptr, *dptr;
+ int soffset, doffset;
+ int optlen;
+ u32 daddr;
+
+ memset(dopt, 0, sizeof(struct ip_options));
+
+ dopt->is_data = 1;
+
+ sopt = &(IPCB(skb)->opt);
+
+ if (sopt->optlen == 0) {
+ dopt->optlen = 0;
+ return 0;
+ }
+
+ sptr = skb->nh.raw;
+ dptr = dopt->__data;
+
+ if (skb->dst)
+ daddr = ((struct rtable*)skb->dst)->rt_spec_dst;
+ else
+ daddr = skb->nh.iph->daddr;
+
+ if (sopt->rr) {
+ optlen = sptr[sopt->rr+1];
+ soffset = sptr[sopt->rr+2];
+ dopt->rr = dopt->optlen + sizeof(struct iphdr);
+ memcpy(dptr, sptr+sopt->rr, optlen);
+ if (sopt->rr_needaddr && soffset <= optlen) {
+ if (soffset + 3 > optlen)
+ return -EINVAL;
+ dptr[2] = soffset + 4;
+ dopt->rr_needaddr = 1;
+ }
+ dptr += optlen;
+ dopt->optlen += optlen;
+ }
+ if (sopt->ts) {
+ optlen = sptr[sopt->ts+1];
+ soffset = sptr[sopt->ts+2];
+ dopt->ts = dopt->optlen + sizeof(struct iphdr);
+ memcpy(dptr, sptr+sopt->ts, optlen);
+ if (soffset <= optlen) {
+ if (sopt->ts_needaddr) {
+ if (soffset + 3 > optlen)
+ return -EINVAL;
+ dopt->ts_needaddr = 1;
+ soffset += 4;
+ }
+ if (sopt->ts_needtime) {
+ if (soffset + 3 > optlen)
+ return -EINVAL;
+ if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) {
+ dopt->ts_needtime = 1;
+ soffset += 4;
+ } else {
+ dopt->ts_needtime = 0;
+
+ if (soffset + 8 <= optlen) {
+ __u32 addr;
+
+ memcpy(&addr, sptr+soffset-1, 4);
+ if (inet_addr_type(addr) != RTN_LOCAL) {
+ dopt->ts_needtime = 1;
+ soffset += 8;
+ }
+ }
+ }
+ }
+ dptr[2] = soffset;
+ }
+ dptr += optlen;
+ dopt->optlen += optlen;
+ }
+ if (sopt->srr) {
+ unsigned char * start = sptr+sopt->srr;
+ u32 faddr;
+
+ optlen = start[1];
+ soffset = start[2];
+ doffset = 0;
+ if (soffset > optlen)
+ soffset = optlen + 1;
+ soffset -= 4;
+ if (soffset > 3) {
+ memcpy(&faddr, &start[soffset-1], 4);
+ for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4)
+ memcpy(&dptr[doffset-1], &start[soffset-1], 4);
+ /*
+			 * RFC1812 requires us to fix illegal source routes.
+ */
+ if (memcmp(&skb->nh.iph->saddr, &start[soffset+3], 4) == 0)
+ doffset -= 4;
+ }
+ if (doffset > 3) {
+ memcpy(&start[doffset-1], &daddr, 4);
+ dopt->faddr = faddr;
+ dptr[0] = start[0];
+ dptr[1] = doffset+3;
+ dptr[2] = 4;
+ dptr += doffset+3;
+ dopt->srr = dopt->optlen + sizeof(struct iphdr);
+ dopt->optlen += doffset+3;
+ dopt->is_strictroute = sopt->is_strictroute;
+ }
+ }
+ while (dopt->optlen & 3) {
+ *dptr++ = IPOPT_END;
+ dopt->optlen++;
+ }
+ return 0;
+}
+
+/*
+ * Options "fragmenting", just fill options not
+ * allowed in fragments with NOOPs.
+ * Simple and stupid 8), but the most efficient way.
+ */
+
+void ip_options_fragment(struct sk_buff * skb)
+{
+ unsigned char * optptr = skb->nh.raw;
+ struct ip_options * opt = &(IPCB(skb)->opt);
+ int l = opt->optlen;
+ int optlen;
+
+ while (l > 0) {
+ switch (*optptr) {
+ case IPOPT_END:
+ return;
+ case IPOPT_NOOP:
+ l--;
+ optptr++;
+ continue;
+ }
+ optlen = optptr[1];
+ if (optlen<2 || optlen>l)
+ return;
+ if (!IPOPT_COPIED(*optptr))
+ memset(optptr, IPOPT_NOOP, optlen);
+ l -= optlen;
+ optptr += optlen;
+ }
+ opt->ts = 0;
+ opt->rr = 0;
+ opt->rr_needaddr = 0;
+ opt->ts_needaddr = 0;
+ opt->ts_needtime = 0;
+ return;
+}
+
+/*
+ * Verify options and fill pointers in struct options.
+ * Caller should clear *opt, and set opt->data.
+ * If opt == NULL, then skb->data should point to IP header.
+ */
+
+int ip_options_compile(struct ip_options * opt, struct sk_buff * skb)
+{
+ int l;
+ unsigned char * iph;
+ unsigned char * optptr;
+ int optlen;
+ unsigned char * pp_ptr = NULL;
+ struct rtable *rt = skb ? (struct rtable*)skb->dst : NULL;
+
+ if (!opt) {
+ opt = &(IPCB(skb)->opt);
+ memset(opt, 0, sizeof(struct ip_options));
+ iph = skb->nh.raw;
+ opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr);
+ optptr = iph + sizeof(struct iphdr);
+ opt->is_data = 0;
+ } else {
+ optptr = opt->is_data ? opt->__data : (unsigned char*)&(skb->nh.iph[1]);
+ iph = optptr - sizeof(struct iphdr);
+ }
+
+ for (l = opt->optlen; l > 0; ) {
+ switch (*optptr) {
+ case IPOPT_END:
+ for (optptr++, l--; l>0; l--) {
+ if (*optptr != IPOPT_END) {
+ *optptr = IPOPT_END;
+ opt->is_changed = 1;
+ }
+ }
+ goto eol;
+ case IPOPT_NOOP:
+ l--;
+ optptr++;
+ continue;
+ }
+ optlen = optptr[1];
+ if (optlen<2 || optlen>l) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ switch (*optptr) {
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
+ if (optlen < 3) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] < 4) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ /* NB: cf RFC-1812 5.2.4.1 */
+ if (opt->srr) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ if (!skb) {
+ if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ memcpy(&opt->faddr, &optptr[3], 4);
+ if (optlen > 7)
+ memmove(&optptr[3], &optptr[7], optlen-7);
+ }
+ opt->is_strictroute = (optptr[0] == IPOPT_SSRR);
+ opt->srr = optptr - iph;
+ break;
+ case IPOPT_RR:
+ if (opt->rr) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ if (optlen < 3) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] < 4) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (optptr[2] <= optlen) {
+ if (optptr[2]+3 > optlen) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (skb) {
+ memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+ opt->is_changed = 1;
+ }
+ optptr[2] += 4;
+ opt->rr_needaddr = 1;
+ }
+ opt->rr = optptr - iph;
+ break;
+ case IPOPT_TIMESTAMP:
+ if (opt->ts) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ if (optlen < 4) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] < 5) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (optptr[2] <= optlen) {
+ __u32 * timeptr = NULL;
+ if (optptr[2]+3 > optptr[1]) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ switch (optptr[3]&0xF) {
+ case IPOPT_TS_TSONLY:
+ opt->ts = optptr - iph;
+ if (skb)
+ timeptr = (__u32*)&optptr[optptr[2]-1];
+ opt->ts_needtime = 1;
+ optptr[2] += 4;
+ break;
+ case IPOPT_TS_TSANDADDR:
+ if (optptr[2]+7 > optptr[1]) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ opt->ts = optptr - iph;
+ if (skb) {
+ memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+ timeptr = (__u32*)&optptr[optptr[2]+3];
+ }
+ opt->ts_needaddr = 1;
+ opt->ts_needtime = 1;
+ optptr[2] += 8;
+ break;
+ case IPOPT_TS_PRESPEC:
+ if (optptr[2]+7 > optptr[1]) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ opt->ts = optptr - iph;
+ {
+ u32 addr;
+ memcpy(&addr, &optptr[optptr[2]-1], 4);
+ if (inet_addr_type(addr) == RTN_UNICAST)
+ break;
+ if (skb)
+ timeptr = (__u32*)&optptr[optptr[2]+3];
+ }
+ opt->ts_needtime = 1;
+ optptr[2] += 8;
+ break;
+ default:
+ if (!skb && !capable(CAP_NET_RAW)) {
+ pp_ptr = optptr + 3;
+ goto error;
+ }
+ break;
+ }
+ if (timeptr) {
+ struct timeval tv;
+ __u32 midtime;
+ do_gettimeofday(&tv);
+ midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
+ memcpy(timeptr, &midtime, sizeof(__u32));
+ opt->is_changed = 1;
+ }
+ } else {
+ unsigned overflow = optptr[3]>>4;
+ if (overflow == 15) {
+ pp_ptr = optptr + 3;
+ goto error;
+ }
+ opt->ts = optptr - iph;
+ if (skb) {
+ optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
+ opt->is_changed = 1;
+ }
+ }
+ break;
+ case IPOPT_RA:
+ if (optlen < 4) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] == 0 && optptr[3] == 0)
+ opt->router_alert = optptr - iph;
+ break;
+ case IPOPT_SEC:
+ case IPOPT_SID:
+ default:
+ if (!skb && !capable(CAP_NET_RAW)) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ break;
+ }
+ l -= optlen;
+ optptr += optlen;
+ }
+
+eol:
+ if (!pp_ptr)
+ return 0;
+
+error:
+ if (skb) {
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
+ }
+ return -EINVAL;
+}
+
+
+/*
+ * Undo all the changes done by ip_options_compile().
+ */
+
+void ip_options_undo(struct ip_options * opt)
+{
+ if (opt->srr) {
+ unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr);
+ memmove(optptr+7, optptr+3, optptr[1]-7);
+ memcpy(optptr+3, &opt->faddr, 4);
+ }
+ if (opt->rr_needaddr) {
+ unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr);
+ optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ }
+ if (opt->ts) {
+ unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr);
+ if (opt->ts_needtime) {
+ optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
+ optptr[2] -= 4;
+ }
+ if (opt->ts_needaddr) {
+ optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ }
+ }
+}
+
+int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user)
+{
+ struct ip_options *opt;
+
+ opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL);
+ if (!opt)
+ return -ENOMEM;
+ memset(opt, 0, sizeof(struct ip_options));
+ if (optlen) {
+ if (user) {
+			if (copy_from_user(opt->__data, data, optlen)) {
+				/* Don't leak the freshly allocated options block */
+				kfree_s(opt, sizeof(struct ip_options) + ((optlen+3)&~3));
+				return -EFAULT;
+			}
+ } else
+ memcpy(opt->__data, data, optlen);
+ }
+ while (optlen & 3)
+ opt->__data[optlen++] = IPOPT_END;
+ opt->optlen = optlen;
+ opt->is_data = 1;
+ opt->is_setbyuser = 1;
+ if (optlen && ip_options_compile(opt, NULL)) {
+ kfree_s(opt, sizeof(struct ip_options) + optlen);
+ return -EINVAL;
+ }
+ *optp = opt;
+ return 0;
+}
+
+void ip_forward_options(struct sk_buff *skb)
+{
+ struct ip_options * opt = &(IPCB(skb)->opt);
+ unsigned char * optptr;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ unsigned char *raw = skb->nh.raw;
+
+ if (opt->rr_needaddr) {
+ optptr = (unsigned char *)raw + opt->rr;
+ ip_rt_get_source(&optptr[optptr[2]-5], rt);
+ opt->is_changed = 1;
+ }
+ if (opt->srr_is_hit) {
+ int srrptr, srrspace;
+
+ optptr = raw + opt->srr;
+
+ for ( srrptr=optptr[2], srrspace = optptr[1];
+ srrptr <= srrspace;
+ srrptr += 4
+ ) {
+ if (srrptr + 3 > srrspace)
+ break;
+ if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0)
+ break;
+ }
+ if (srrptr + 3 <= srrspace) {
+ opt->is_changed = 1;
+ ip_rt_get_source(&optptr[srrptr-1], rt);
+ skb->nh.iph->daddr = rt->rt_dst;
+ optptr[2] = srrptr+4;
+ } else
+ printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
+ if (opt->ts_needaddr) {
+ optptr = raw + opt->ts;
+ ip_rt_get_source(&optptr[optptr[2]-9], rt);
+ opt->is_changed = 1;
+ }
+ }
+ if (opt->is_changed) {
+ opt->is_changed = 0;
+ ip_send_check(skb->nh.iph);
+ }
+}
+
+int ip_options_rcv_srr(struct sk_buff *skb)
+{
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ int srrspace, srrptr;
+ u32 nexthop;
+ struct iphdr *iph = skb->nh.iph;
+ unsigned char * optptr = skb->nh.raw + opt->srr;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct rtable *rt2;
+ int err;
+
+ if (!opt->srr)
+ return 0;
+
+ if (skb->pkt_type != PACKET_HOST)
+ return -EINVAL;
+ if (rt->rt_type == RTN_UNICAST) {
+ if (!opt->is_strictroute)
+ return 0;
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24));
+ return -EINVAL;
+ }
+ if (rt->rt_type != RTN_LOCAL)
+ return -EINVAL;
+
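+	/*
+	 * Annotation (not in the original): srrptr is the option's
+	 * 1-based pointer and srrspace its length, so &optptr[srrptr-1]
+	 * is the current 4-byte hop slot; the walk below stops at the
+	 * first hop that does not route back to a local address.
+	 */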
+ for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
+ if (srrptr + 3 > srrspace) {
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
+ return -EINVAL;
+ }
+ memcpy(&nexthop, &optptr[srrptr-1], 4);
+
+ rt = (struct rtable*)skb->dst;
+ skb->dst = NULL;
+ err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
+ rt2 = (struct rtable*)skb->dst;
+ if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
+ ip_rt_put(rt2);
+ skb->dst = &rt->u.dst;
+ return -EINVAL;
+ }
+ ip_rt_put(rt);
+ if (rt2->rt_type != RTN_LOCAL)
+ break;
+ /* Superfast 8) loopback forward */
+ memcpy(&iph->daddr, &optptr[srrptr-1], 4);
+ opt->is_changed = 1;
+ }
+ if (srrptr <= srrspace) {
+ opt->srr_is_hit = 1;
+ opt->is_changed = 1;
+ }
+ return 0;
+}
diff --git a/pfinet/linux-src/net/ipv4/ip_output.c b/pfinet/linux-src/net/ipv4/ip_output.c
new file mode 100644
index 00000000..44d63557
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_output.c
@@ -0,0 +1,992 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The Internet Protocol (IP) output module.
+ *
+ * Version: $Id: ip_output.c,v 1.67 1999/03/25 00:43:00 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Donald Becker, <becker@super.org>
+ * Alan Cox, <Alan.Cox@linux.org>
+ * Richard Underwood
+ * Stefan Becker, <stefanb@yello.ping.de>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *
+ * See ip_input.c for original log
+ *
+ * Fixes:
+ * Alan Cox : Missing nonblock feature in ip_build_xmit.
+ * Mike Kilburn : htons() missing in ip_build_xmit.
+ * Bradford Johnson: Fix faulty handling of some frames when
+ * no route is found.
+ * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
+ * (in case if packet not accepted by
+ * output firewall rules)
+ * Mike McLagan : Routing by source
+ * Alexey Kuznetsov: use new route cache
+ * Andi Kleen: Fix broken PMTU recovery and remove
+ * some redundant tests.
+ * Vitaly E. Lavrov : Transparent proxy revived after year coma.
+ * Andi Kleen : Replace ip_reply with ip_send_reply.
+ * Andi Kleen : Split fast and slow ip_build_xmit path
+ * for decreased register pressure on x86
+ *					and more readability.
+ * Marc Boucher : When call_out_firewall returns FW_QUEUE,
+ * silently drop skb instead of failing with -EPERM.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/config.h>
+
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/raw.h>
+#include <net/checksum.h>
+#include <linux/igmp.h>
+#include <linux/ip_fw.h>
+#include <linux/firewall.h>
+#include <linux/mroute.h>
+#include <linux/netlink.h>
+
+/*
+ * Shall we try to damage output packets if routing dev changes?
+ */
+
+int sysctl_ip_dynaddr = 0;
+
+
+int ip_id_count = 0;
+
+/* Generate a checksum for an outgoing IP datagram. */
+__inline__ void ip_send_check(struct iphdr *iph)
+{
+ iph->check = 0;
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+}
+
+/*
+ * Add an ip header to a skbuff and send it out.
+ */
+void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
+ u32 saddr, u32 daddr, struct ip_options *opt)
+{
+ struct rtable *rt = (struct rtable *)skb->dst;
+ struct iphdr *iph;
+ struct device *dev;
+
+ /* Build the IP header. */
+ if (opt)
+ iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
+ else
+ iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
+
+ iph->version = 4;
+ iph->ihl = 5;
+ iph->tos = sk->ip_tos;
+ iph->frag_off = 0;
+ if (ip_dont_fragment(sk, &rt->u.dst))
+ iph->frag_off |= htons(IP_DF);
+ iph->ttl = sk->ip_ttl;
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+ iph->protocol = sk->protocol;
+ iph->tot_len = htons(skb->len);
+ iph->id = htons(ip_id_count++);
+ skb->nh.iph = iph;
+
+ if (opt && opt->optlen) {
+ iph->ihl += opt->optlen>>2;
+ ip_options_build(skb, opt, daddr, rt, 0);
+ }
+
+ dev = rt->u.dst.dev;
+
+#ifdef CONFIG_FIREWALL
+ /* Now we have no better mechanism to notify about error. */
+ switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) {
+ case FW_REJECT:
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+ /* Fall thru... */
+ case FW_BLOCK:
+ case FW_QUEUE:
+ kfree_skb(skb);
+ return;
+ }
+#endif
+
+ ip_send_check(iph);
+
+ /* Send it out. */
+ skb->dst->output(skb);
+ return;
+}
+
+int __ip_finish_output(struct sk_buff *skb)
+{
+ return ip_finish_output(skb);
+}
+
+int ip_mc_output(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct device *dev = rt->u.dst.dev;
+
+ /*
+ * If the indicated interface is up and running, send the packet.
+ */
+
+ ip_statistics.IpOutRequests++;
+#ifdef CONFIG_IP_ROUTE_NAT
+ if (rt->rt_flags & RTCF_NAT)
+ ip_do_nat(skb);
+#endif
+
+ skb->dev = dev;
+ skb->protocol = __constant_htons(ETH_P_IP);
+
+ /*
+ * Multicasts are looped back for other local users
+ */
+
+ if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->ip_mc_loop)) {
+#ifdef CONFIG_IP_MROUTE
+ /* Small optimization: do not loopback not local frames,
+ which returned after forwarding; they will be dropped
+ by ip_mr_input in any case.
+ Note, that local frames are looped back to be delivered
+ to local recipients.
+
+ This check is duplicated in ip_mr_input at the moment.
+ */
+ if ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
+#endif
+ dev_loopback_xmit(skb);
+
+ /* Multicasts with ttl 0 must not go beyond the host */
+
+ if (skb->nh.iph->ttl == 0) {
+ kfree_skb(skb);
+ return 0;
+ }
+ }
+
+ if (rt->rt_flags&RTCF_BROADCAST)
+ dev_loopback_xmit(skb);
+
+ return ip_finish_output(skb);
+}
+
+int ip_output(struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_ROUTE_NAT
+ struct rtable *rt = (struct rtable*)skb->dst;
+#endif
+
+ ip_statistics.IpOutRequests++;
+
+#ifdef CONFIG_IP_ROUTE_NAT
+ if (rt->rt_flags&RTCF_NAT)
+ ip_do_nat(skb);
+#endif
+
+ return ip_finish_output(skb);
+}
+
+/* Queues a packet to be sent, and starts the transmitter if necessary.
+ * This routine also needs to put in the total length and compute the
+ * checksum. We used to do this in two stages, ip_build_header() then
+ * this, but that scheme created a mess when routes disappeared etc.
+ * So we do it all here, and the TCP send engine has been changed to
+ * match. (No more unroutable FIN disasters, etc. wheee...) This will
+ * most likely make other reliable transport layers above IP easier
+ * to implement under Linux.
+ */
+void ip_queue_xmit(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct ip_options *opt = sk->opt;
+ struct rtable *rt;
+ struct device *dev;
+ struct iphdr *iph;
+ unsigned int tot_len;
+
+ /* Make sure we can route this packet. */
+ rt = (struct rtable *) sk->dst_cache;
+ if(rt == NULL || rt->u.dst.obsolete) {
+ u32 daddr;
+
+ sk->dst_cache = NULL;
+ ip_rt_put(rt);
+
+ /* Use correct destination address if we have options. */
+ daddr = sk->daddr;
+ if(opt && opt->srr)
+ daddr = opt->faddr;
+
+ /* If this fails, retransmit mechanism of transport layer will
+ * keep trying until route appears or the connection times itself
+ * out.
+ */
+ if(ip_route_output(&rt, daddr, sk->saddr,
+ RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute,
+ sk->bound_dev_if))
+ goto drop;
+ sk->dst_cache = &rt->u.dst;
+ }
+ if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+ goto no_route;
+
+ /* We have a route, so grab a reference. */
+ skb->dst = dst_clone(sk->dst_cache);
+
+ /* OK, we know where to send it, allocate and build IP header. */
+ iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
+ iph->version = 4;
+ iph->ihl = 5;
+ iph->tos = sk->ip_tos;
+ iph->frag_off = 0;
+ iph->ttl = sk->ip_ttl;
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+ iph->protocol = sk->protocol;
+ skb->nh.iph = iph;
+ /* Transport layer set skb->h.foo itself. */
+
+ if(opt && opt->optlen) {
+ iph->ihl += opt->optlen >> 2;
+ ip_options_build(skb, opt, sk->daddr, rt, 0);
+ }
+
+ tot_len = skb->len;
+ iph->tot_len = htons(tot_len);
+ iph->id = htons(ip_id_count++);
+
+ dev = rt->u.dst.dev;
+
+#ifdef CONFIG_FIREWALL
+ /* Now we have no better mechanism to notify about error. */
+ switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) {
+ case FW_REJECT:
+ start_bh_atomic();
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+ end_bh_atomic();
+ /* Fall thru... */
+ case FW_BLOCK:
+ case FW_QUEUE:
+ goto drop;
+ }
+#endif
+
+ /* This can happen when the transport layer has segments queued
+ * with a cached route, and by the time we get here things are
+ * re-routed to a device with a different MTU than the original
+ * device. Sick, but we must cover it.
+ */
+ if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
+ struct sk_buff *skb2;
+
+ skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
+ kfree_skb(skb);
+ if (skb2 == NULL)
+ return;
+ if (sk)
+			skb_set_owner_w(skb2, sk);	/* skb itself was just freed */
+ skb = skb2;
+ iph = skb->nh.iph;
+ }
+
+	/* Do we need to fragment? Again, this is inefficient. We
+ * need to somehow lock the original buffer and use bits of it.
+ */
+ if (tot_len > rt->u.dst.pmtu)
+ goto fragment;
+
+ if (ip_dont_fragment(sk, &rt->u.dst))
+ iph->frag_off |= __constant_htons(IP_DF);
+
+ /* Add an IP checksum. */
+ ip_send_check(iph);
+
+ skb->priority = sk->priority;
+ skb->dst->output(skb);
+ return;
+
+fragment:
+ if (ip_dont_fragment(sk, &rt->u.dst) &&
+ tot_len > (iph->ihl<<2) + sizeof(struct tcphdr)+16) {
+ /* Reject packet ONLY if TCP might fragment
+		   it itself, if we're careful enough.
+ Test is not precise (f.e. it does not take sacks
+ into account). Actually, tcp should make it. --ANK (980801)
+ */
+ iph->frag_off |= __constant_htons(IP_DF);
+ NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big to self\n"));
+
+ /* icmp_send is not reenterable, so that bh_atomic... --ANK */
+ start_bh_atomic();
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(rt->u.dst.pmtu));
+ end_bh_atomic();
+ goto drop;
+ }
+ ip_fragment(skb, skb->dst->output);
+ return;
+
+no_route:
+ sk->dst_cache = NULL;
+ ip_rt_put(rt);
+ ip_statistics.IpOutNoRoutes++;
+ /* Fall through... */
+drop:
+ kfree_skb(skb);
+}
+
+/*
+ * Build and send a packet, with as little as one copy
+ *
+ * Doesn't care much about ip options... option length can be
+ * different for fragment at 0 and other fragments.
+ *
+ * Note that the fragment at the highest offset is sent first,
+ * so the getfrag routine can fill in the TCP/UDP checksum header
+ * field in the last fragment it sends... actually it also helps
+ * the reassemblers, they can put most packets in at the head of
+ * the fragment queue, and they know the total size in advance. This
+ * last feature will measurably improve the Linux fragment handler one
+ * day.
+ *
+ * The callback has four args: an arbitrary pointer (a copy of frag),
+ * the buffer to copy into (char *), the offset to copy from, and the
+ * length to be copied.
+ */
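+
+/*
+ * Illustrative sketch (not part of the original file): a minimal
+ * getfrag callback of the shape expected below, copying from a
+ * user-space buffer passed as the opaque pointer; any nonzero return
+ * aborts the send with -EFAULT.
+ *
+ *	static int example_getfrag(const void *p, char *to,
+ *				   unsigned int offset, unsigned int fraglen)
+ *	{
+ *		return copy_from_user(to, (const char *)p + offset,
+ *				      fraglen) ? -EFAULT : 0;
+ *	}
+ */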
+
+int ip_build_xmit_slow(struct sock *sk,
+ int getfrag (const void *,
+ char *,
+ unsigned int,
+ unsigned int),
+ const void *frag,
+ unsigned length,
+ struct ipcm_cookie *ipc,
+ struct rtable *rt,
+ int flags)
+{
+ unsigned int fraglen, maxfraglen, fragheaderlen;
+ int err;
+ int offset, mf;
+ int mtu;
+ unsigned short id;
+
+ int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
+ int nfrags=0;
+ struct ip_options *opt = ipc->opt;
+ int df = 0;
+
+ mtu = rt->u.dst.pmtu;
+ if (ip_dont_fragment(sk, &rt->u.dst))
+ df = htons(IP_DF);
+
+ length -= sizeof(struct iphdr);
+
+ if (opt) {
+ fragheaderlen = sizeof(struct iphdr) + opt->optlen;
+ maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
+ } else {
+ fragheaderlen = sizeof(struct iphdr);
+
+ /*
+ * Fragheaderlen is the size of 'overhead' on each buffer. Now work
+ * out the size of the frames to send.
+ */
+
+ maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
+ }
+
+ if (length + fragheaderlen > 0xFFFF) {
+ ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
+ return -EMSGSIZE;
+ }
+
+ /*
+ * Start at the end of the frame by handling the remainder.
+ */
+
+ offset = length - (length % (maxfraglen - fragheaderlen));
+
+ /*
+ * Amount of memory to allocate for final fragment.
+ */
+
+ fraglen = length - offset + fragheaderlen;
+
+ if (length-offset==0) {
+ fraglen = maxfraglen;
+ offset -= maxfraglen-fragheaderlen;
+ }
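+
+	/*
+	 * Worked example (annotation, not in the original): with a 1500
+	 * byte PMTU and no options, fragheaderlen = 20 and maxfraglen =
+	 * ((1480 & ~7) + 20) = 1500.  For length = 4000 payload bytes,
+	 * offset = 4000 - (4000 % 1480) = 2960 and fraglen = 4000 - 2960
+	 * + 20 = 1060; the loop below then emits fragments at offsets
+	 * 2960, 1480 and 0 (1040 + 1480 + 1480 data bytes), tail first
+	 * so the head fragment goes out last.
+	 */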
+
+
+ /*
+ * The last fragment will not have MF (more fragments) set.
+ */
+
+ mf = 0;
+
+ /*
+ * Don't fragment packets for path mtu discovery.
+ */
+
+ if (offset > 0 && df) {
+ ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
+ return(-EMSGSIZE);
+ }
+
+ /*
+ * Lock the device lists.
+ */
+
+ dev_lock_list();
+
+ /*
+ * Get an identifier
+ */
+
+ id = htons(ip_id_count++);
+
+ /*
+ * Begin outputting the bytes.
+ */
+
+ do {
+ char *data;
+ struct sk_buff * skb;
+
+ /*
+ * Get the memory we require with some space left for alignment.
+ */
+
+ skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &err);
+ if (skb == NULL)
+ goto error;
+
+ /*
+ * Fill in the control structures
+ */
+
+ skb->priority = sk->priority;
+ skb->dst = dst_clone(&rt->u.dst);
+ skb_reserve(skb, hh_len);
+
+ /*
+ * Find where to start putting bytes.
+ */
+
+ data = skb_put(skb, fraglen);
+ skb->nh.iph = (struct iphdr *)data;
+
+ /*
+ * Only write IP header onto non-raw packets
+ */
+
+ {
+ struct iphdr *iph = (struct iphdr *)data;
+
+ iph->version = 4;
+ iph->ihl = 5;
+ if (opt) {
+ iph->ihl += opt->optlen>>2;
+ ip_options_build(skb, opt,
+ ipc->addr, rt, offset);
+ }
+ iph->tos = sk->ip_tos;
+ iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
+ iph->id = id;
+ iph->frag_off = htons(offset>>3);
+ iph->frag_off |= mf|df;
+ if (rt->rt_type == RTN_MULTICAST)
+ iph->ttl = sk->ip_mc_ttl;
+ else
+ iph->ttl = sk->ip_ttl;
+ iph->protocol = sk->protocol;
+ iph->check = 0;
+ iph->saddr = rt->rt_src;
+ iph->daddr = rt->rt_dst;
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+ data += iph->ihl*4;
+
+ /*
+ * Any further fragments will have MF set.
+ */
+
+ mf = htons(IP_MF);
+ }
+
+ /*
+ * User data callback
+ */
+
+ if (getfrag(frag, data, offset, fraglen-fragheaderlen)) {
+ err = -EFAULT;
+ kfree_skb(skb);
+ goto error;
+ }
+
+ offset -= (maxfraglen-fragheaderlen);
+ fraglen = maxfraglen;
+
+ nfrags++;
+
+#ifdef CONFIG_FIREWALL
+ switch (call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb)) {
+ case FW_QUEUE:
+ kfree_skb(skb);
+ continue;
+ case FW_BLOCK:
+ case FW_REJECT:
+ kfree_skb(skb);
+ err = -EPERM;
+ goto error;
+ }
+#endif
+
+ err = -ENETDOWN;
+ if (rt->u.dst.output(skb))
+ goto error;
+ } while (offset >= 0);
+
+ if (nfrags>1)
+ ip_statistics.IpFragCreates += nfrags;
+ dev_unlock_list();
+ return 0;
+
+error:
+ ip_statistics.IpOutDiscards++;
+ if (nfrags>1)
+ ip_statistics.IpFragCreates += nfrags;
+ dev_unlock_list();
+ return err;
+}
+
+
+/*
+ * Fast path for unfragmented packets.
+ */
+int ip_build_xmit(struct sock *sk,
+ int getfrag (const void *,
+ char *,
+ unsigned int,
+ unsigned int),
+ const void *frag,
+ unsigned length,
+ struct ipcm_cookie *ipc,
+ struct rtable *rt,
+ int flags)
+{
+ int err;
+ struct sk_buff *skb;
+ int df;
+ struct iphdr *iph;
+
+ /*
+	 * Try the simple case first. This leaves fragmented frames, and by
+	 * choice RAW frames within 20 bytes of maximum size (rare) to the long path.
+ */
+
+ if (!sk->ip_hdrincl) {
+ length += sizeof(struct iphdr);
+
+ /*
+ * Check for slow path.
+ */
+ if (length > rt->u.dst.pmtu || ipc->opt != NULL)
+ return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);
+ } else {
+ if (length > rt->u.dst.dev->mtu) {
+ ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
+ return -EMSGSIZE;
+ }
+ }
+
+ /*
+ * Do path mtu discovery if needed.
+ */
+ df = 0;
+ if (ip_dont_fragment(sk, &rt->u.dst))
+ df = htons(IP_DF);
+
+ /*
+ * Fast path for unfragmented frames without options.
+ */
+ {
+ int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
+
+ skb = sock_alloc_send_skb(sk, length+hh_len+15,
+ 0, flags&MSG_DONTWAIT, &err);
+ if(skb==NULL)
+ goto error;
+ skb_reserve(skb, hh_len);
+ }
+
+ skb->priority = sk->priority;
+ skb->dst = dst_clone(&rt->u.dst);
+
+ skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
+
+ dev_lock_list();
+
+ if(!sk->ip_hdrincl) {
+ iph->version=4;
+ iph->ihl=5;
+ iph->tos=sk->ip_tos;
+ iph->tot_len = htons(length);
+ iph->id=htons(ip_id_count++);
+ iph->frag_off = df;
+ iph->ttl=sk->ip_mc_ttl;
+ if (rt->rt_type != RTN_MULTICAST)
+ iph->ttl=sk->ip_ttl;
+ iph->protocol=sk->protocol;
+ iph->saddr=rt->rt_src;
+ iph->daddr=rt->rt_dst;
+ iph->check=0;
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+ err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4);
+ }
+ else
+ err = getfrag(frag, (void *)iph, 0, length);
+
+ dev_unlock_list();
+
+ if (err)
+ goto error_fault;
+
+#ifdef CONFIG_FIREWALL
+ switch (call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb)) {
+ case FW_QUEUE:
+ kfree_skb(skb);
+ return 0;
+ case FW_BLOCK:
+ case FW_REJECT:
+ kfree_skb(skb);
+ err = -EPERM;
+ goto error;
+ }
+#endif
+
+ return rt->u.dst.output(skb);
+
+error_fault:
+ err = -EFAULT;
+ kfree_skb(skb);
+error:
+ ip_statistics.IpOutDiscards++;
+ return err;
+}
+
+
+
+/*
+ * This IP datagram is too large to be sent in one piece. Break it up into
+ *	smaller pieces (each of a size equal to the IP header plus a block of
+ *	the data of the original IP datagram) that will still fit in a
+ *	single device frame, and queue such a frame for sending.
+ *
+ * Yes this is inefficient, feel free to submit a quicker one.
+ */
+
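+/*
+ * Worked example (illustrative only): a 2020-byte datagram (hlen = 20,
+ * so left = 2000) leaving a link with pmtu 1500 yields mtu = 1480 data
+ * bytes per fragment.  The loop below emits a 1480-byte piece at offset 0
+ * with MF set, then a 520-byte piece at offset 1480 (frag_off = 185)
+ * with MF clear.
+ */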
+void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
+{
+ struct iphdr *iph;
+ unsigned char *raw;
+ unsigned char *ptr;
+ struct device *dev;
+ struct sk_buff *skb2;
+ unsigned int mtu, hlen, left, len;
+ int offset;
+ int not_last_frag;
+ struct rtable *rt = (struct rtable*)skb->dst;
+
+ dev = rt->u.dst.dev;
+
+ /*
+ * Point into the IP datagram header.
+ */
+
+ raw = skb->nh.raw;
+ iph = (struct iphdr*)raw;
+
+ /*
+ * Setup starting values.
+ */
+
+ hlen = iph->ihl * 4;
+	left = ntohs(iph->tot_len) - hlen;	/* Data remaining to send */
+ mtu = rt->u.dst.pmtu - hlen; /* Size of data space */
+ ptr = raw + hlen; /* Where to start from */
+
+ /*
+ * The protocol doesn't seem to say what to do in the case that the
+ * frame + options doesn't fit the mtu. As it used to fall down dead
+	 * in this case, we were fortunate it didn't happen.
+ *
+ * It is impossible, because mtu>=68. --ANK (980801)
+ */
+
+#ifdef CONFIG_NET_PARANOIA
+ if (mtu<8)
+ goto fail;
+#endif
+
+ /*
+ * Fragment the datagram.
+ */
+
+ offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
+ not_last_frag = iph->frag_off & htons(IP_MF);
+
+ /*
+ * Keep copying data until we run out.
+ */
+
+ while(left > 0) {
+ len = left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > mtu)
+ len = mtu;
+		/* IF: we are not sending up to and including the packet end
+		   then align the next start on an eight byte boundary */
+ if (len < left) {
+ len &= ~7;
+ }
+ /*
+ * Allocate buffer.
+ */
+
+ if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) {
+ NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
+ goto fail;
+ }
+
+ /*
+ * Set up data on packet
+ */
+
+ skb2->pkt_type = skb->pkt_type;
+ skb2->priority = skb->priority;
+ skb_reserve(skb2, (dev->hard_header_len+15)&~15);
+ skb_put(skb2, len + hlen);
+ skb2->nh.raw = skb2->data;
+ skb2->h.raw = skb2->data + hlen;
+
+ /*
+ * Charge the memory for the fragment to any owner
+ * it might possess
+ */
+
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+ skb2->dst = dst_clone(skb->dst);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+
+ memcpy(skb2->nh.raw, raw, hlen);
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ memcpy(skb2->h.raw, ptr, len);
+ left -= len;
+
+ /*
+ * Fill in the new header fields.
+ */
+ iph = skb2->nh.iph;
+ iph->frag_off = htons((offset >> 3));
+
+ /* ANK: dirty, but effective trick. Upgrade options only if
+ * the segment to be fragmented was THE FIRST (otherwise,
+ * options are already fixed) and make it ONCE
+ * on the initial skb, so that all the following fragments
+ * will inherit fixed options.
+ */
+ if (offset == 0)
+ ip_options_fragment(skb);
+
+ /*
+ * Added AC : If we are fragmenting a fragment that's not the
+		 * last fragment then keep the MF bit set on each fragment
+ */
+ if (left > 0 || not_last_frag)
+ iph->frag_off |= htons(IP_MF);
+ ptr += len;
+ offset += len;
+
+ /*
+ * Put this fragment into the sending queue.
+ */
+
+ ip_statistics.IpFragCreates++;
+
+ iph->tot_len = htons(len + hlen);
+
+ ip_send_check(iph);
+
+ output(skb2);
+ }
+ kfree_skb(skb);
+ ip_statistics.IpFragOKs++;
+ return;
+
+fail:
+ kfree_skb(skb);
+ ip_statistics.IpFragFails++;
+}
+
+/*
+ * Fetch data from kernel space and fill in checksum if needed.
+ */
+static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
+ unsigned int fraglen)
+{
+ struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
+ u16 *pktp = (u16 *)to;
+ struct iovec *iov;
+ int len;
+ int hdrflag = 1;
+
+ iov = &dp->iov[0];
+ if (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ iov++;
+ hdrflag = 0;
+ }
+ len = iov->iov_len - offset;
+ if (fraglen > len) { /* overlapping. */
+ dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
+ dp->csum);
+ offset = 0;
+ fraglen -= len;
+ to += len;
+ iov++;
+ }
+
+ dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen,
+ dp->csum);
+
+ if (hdrflag && dp->csumoffset)
+ *(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
+ return 0;
+}
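+
+/*
+ * Usage note (an assumption drawn from the TCP reset path, not stated in
+ * this file): the caller pre-seeds dp->csum with the pseudo-header sum
+ * and gives dp->csumoffset in 16-bit words, e.g. for TCP something like
+ *
+ *	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+ *
+ * so that the csum_fold() above lands on the header's checksum field.
+ */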
+
+/*
+ * Generic function to send a packet as reply to another packet.
+ *	So far it is used only to send TCP resets; ICMP should use this function too.
+ *
+ * Should run single threaded per socket because it uses the sock
+ * structure to pass arguments.
+ */
+void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
+ unsigned int len)
+{
+ struct {
+ struct ip_options opt;
+ char data[40];
+ } replyopts;
+ struct ipcm_cookie ipc;
+ u32 daddr;
+ struct rtable *rt = (struct rtable*)skb->dst;
+
+ if (ip_options_echo(&replyopts.opt, skb))
+ return;
+
+ sk->ip_tos = skb->nh.iph->tos;
+ sk->priority = skb->priority;
+ sk->protocol = skb->nh.iph->protocol;
+
+ daddr = ipc.addr = rt->rt_src;
+ ipc.opt = &replyopts.opt;
+
+ if (ipc.opt->srr)
+ daddr = replyopts.opt.faddr;
+ if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
+ return;
+
+ /* And let IP do all the hard work. */
+ ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
+ ip_rt_put(rt);
+}
+
+/*
+ * IP protocol layer initialiser
+ */
+
+static struct packet_type ip_packet_type =
+{
+ __constant_htons(ETH_P_IP),
+ NULL, /* All devices */
+ ip_rcv,
+ NULL,
+ NULL,
+};
+
+
+
+#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_IP_MULTICAST
+static struct proc_dir_entry proc_net_igmp = {
+ PROC_NET_IGMP, 4, "igmp",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ ip_mc_procinfo
+};
+#endif
+#endif
+
+/*
+ * IP registers the packet type and then calls the subprotocol initialisers
+ */
+
+__initfunc(void ip_init(void))
+{
+ dev_add_pack(&ip_packet_type);
+
+ ip_rt_init();
+
+#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_IP_MULTICAST
+ proc_net_register(&proc_net_igmp);
+#endif
+#endif
+}
+
diff --git a/pfinet/linux-src/net/ipv4/ip_sockglue.c b/pfinet/linux-src/net/ipv4/ip_sockglue.c
new file mode 100644
index 00000000..369a6770
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ip_sockglue.c
@@ -0,0 +1,739 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The IP to API glue.
+ *
+ * Version: $Id: ip_sockglue.c,v 1.42 1999/04/22 10:07:34 davem Exp $
+ *
+ * Authors: see ip.c
+ *
+ * Fixes:
+ * Many : Split from ip.c , see ip.c for history.
+ * Martin Mares : TOS setting fixed.
+ * Alan Cox : Fixed a couple of oopses in Martin's
+ * TOS tweaks.
+ * Mike McLagan : Routing by source
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/firewall.h>
+#include <linux/ip_fw.h>
+#include <linux/route.h>
+#include <linux/mroute.h>
+#include <net/route.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/transp_v6.h>
+#endif
+
+#ifdef CONFIG_IP_MASQUERADE
+#include <linux/ip_masq.h>
+#endif
+
+#include <linux/errqueue.h>
+#include <asm/uaccess.h>
+
+#define MAX(a,b) ((a)>(b)?(a):(b))
+
+#define IP_CMSG_PKTINFO 1
+#define IP_CMSG_TTL 2
+#define IP_CMSG_TOS 4
+#define IP_CMSG_RECVOPTS 8
+#define IP_CMSG_RETOPTS 16
+
+/*
+ * SOL_IP control messages.
+ */
+
+static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct in_pktinfo info;
+ struct rtable *rt = (struct rtable *)skb->dst;
+
+ info.ipi_addr.s_addr = skb->nh.iph->daddr;
+ if (rt) {
+ info.ipi_ifindex = rt->rt_iif;
+ info.ipi_spec_dst.s_addr = rt->rt_spec_dst;
+ } else {
+ info.ipi_ifindex = 0;
+ info.ipi_spec_dst.s_addr = 0;
+ }
+
+ put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+}
+
+static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb)
+{
+ int ttl = skb->nh.iph->ttl;
+ put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl);
+}
+
+static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb)
+{
+ put_cmsg(msg, SOL_IP, IP_TOS, 1, &skb->nh.iph->tos);
+}
+
+static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
+{
+ if (IPCB(skb)->opt.optlen == 0)
+ return;
+
+ put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, skb->nh.iph+1);
+}
+
+
+void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
+{
+ unsigned char optbuf[sizeof(struct ip_options) + 40];
+ struct ip_options * opt = (struct ip_options*)optbuf;
+
+ if (IPCB(skb)->opt.optlen == 0)
+ return;
+
+ if (ip_options_echo(opt, skb)) {
+ msg->msg_flags |= MSG_CTRUNC;
+ return;
+ }
+ ip_options_undo(opt);
+
+ put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
+}
+
+
+void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
+{
+ unsigned flags = skb->sk->ip_cmsg_flags;
+
+ /* Ordered by supposed usage frequency */
+ if (flags & 1)
+ ip_cmsg_recv_pktinfo(msg, skb);
+ if ((flags>>=1) == 0)
+ return;
+
+ if (flags & 1)
+ ip_cmsg_recv_ttl(msg, skb);
+ if ((flags>>=1) == 0)
+ return;
+
+ if (flags & 1)
+ ip_cmsg_recv_tos(msg, skb);
+ if ((flags>>=1) == 0)
+ return;
+
+ if (flags & 1)
+ ip_cmsg_recv_opts(msg, skb);
+ if ((flags>>=1) == 0)
+ return;
+
+ if (flags & 1)
+ ip_cmsg_recv_retopts(msg, skb);
+}
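+
+/*
+ * Userspace counterpart (a hedged sketch using the standard sockets API,
+ * not part of this file): with IP_PKTINFO enabled, each recvmsg() carries
+ * the in_pktinfo that ip_cmsg_recv_pktinfo() filled in above.
+ *
+ *	int on = 1;
+ *	struct cmsghdr *c;
+ *	setsockopt(fd, SOL_IP, IP_PKTINFO, &on, sizeof(on));
+ *	recvmsg(fd, &msg, 0);
+ *	for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c))
+ *		if (c->cmsg_level == SOL_IP && c->cmsg_type == IP_PKTINFO)
+ *			... CMSG_DATA(c) points at a struct in_pktinfo ...
+ */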
+
+int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
+{
+ int err;
+ struct cmsghdr *cmsg;
+
+ for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+ if (cmsg->cmsg_len < sizeof(struct cmsghdr) ||
+ (unsigned long)(((char*)cmsg - (char*)msg->msg_control)
+ + cmsg->cmsg_len) > msg->msg_controllen) {
+ return -EINVAL;
+ }
+ if (cmsg->cmsg_level != SOL_IP)
+ continue;
+ switch (cmsg->cmsg_type) {
+ case IP_RETOPTS:
+ err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+ err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0);
+ if (err)
+ return err;
+ break;
+ case IP_PKTINFO:
+ {
+ struct in_pktinfo *info;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
+ return -EINVAL;
+ info = (struct in_pktinfo *)CMSG_DATA(cmsg);
+ ipc->oif = info->ipi_ifindex;
+ ipc->addr = info->ipi_spec_dst.s_addr;
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
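+
+/*
+ * Sending direction (sketch under the same assumptions): the outgoing
+ * interface and preferred source address can be chosen per datagram by
+ * attaching the control message that the loop above parses into
+ * ipc->oif and ipc->addr.
+ *
+ *	char cbuf[CMSG_SPACE(sizeof(struct in_pktinfo))];
+ *	struct cmsghdr *c = (struct cmsghdr *)cbuf;
+ *	struct in_pktinfo *pi = (struct in_pktinfo *)CMSG_DATA(c);
+ *	c->cmsg_level = SOL_IP;
+ *	c->cmsg_type = IP_PKTINFO;
+ *	c->cmsg_len = CMSG_LEN(sizeof(*pi));
+ *	pi->ipi_ifindex = 2;	(hypothetical interface index)
+ *	msg.msg_control = cbuf;
+ *	msg.msg_controllen = c->cmsg_len;
+ *	sendmsg(fd, &msg, 0);
+ */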
+
+
+/* Special input handler for packets caught by the router alert option.
+   They are selected only by the protocol field, and are then processed
+   like local ones; but only if someone wants them! Otherwise, a router
+   not running rsvpd will kill RSVP.
+
+   What to do with them is a user-level problem. I have no idea how it
+   will masquerade or NAT them (it is a joke, joke :-)), but the receiver
+   should be clever enough e.g. to forward mtrace requests, sent to a
+   multicast group, so that they reach the destination's designated router.
+   */
+struct ip_ra_chain *ip_ra_chain;
+
+int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *))
+{
+ struct ip_ra_chain *ra, *new_ra, **rap;
+
+ if (sk->type != SOCK_RAW || sk->num == IPPROTO_RAW)
+ return -EINVAL;
+
+ new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+
+ for (rap = &ip_ra_chain; (ra=*rap) != NULL; rap = &ra->next) {
+ if (ra->sk == sk) {
+ if (on) {
+ if (new_ra)
+ kfree(new_ra);
+ return -EADDRINUSE;
+ }
+ *rap = ra->next;
+ synchronize_bh();
+
+ if (ra->destructor)
+ ra->destructor(sk);
+ kfree(ra);
+ return 0;
+ }
+ }
+ if (new_ra == NULL)
+ return -ENOBUFS;
+ new_ra->sk = sk;
+ new_ra->destructor = destructor;
+
+ new_ra->next = ra;
+ wmb();
+ *rap = new_ra;
+
+ return 0;
+}
+
+void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
+ u16 port, u32 info, u8 *payload)
+{
+ struct sock_exterr_skb *serr;
+
+ if (!sk->ip_recverr)
+ return;
+
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ serr = SKB_EXT_ERR(skb);
+ serr->ee.ee_errno = err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_ICMP;
+ serr->ee.ee_type = skb->h.icmph->type;
+ serr->ee.ee_code = skb->h.icmph->code;
+ serr->ee.ee_pad = 0;
+ serr->ee.ee_info = info;
+ serr->ee.ee_data = 0;
+ serr->addr_offset = (u8*)&(((struct iphdr*)(skb->h.icmph+1))->daddr) - skb->nh.raw;
+ serr->port = port;
+
+ skb->h.raw = payload;
+ skb_pull(skb, payload - skb->data);
+
+ if (sock_queue_err_skb(sk, skb))
+ kfree_skb(skb);
+}
+
+void ip_local_error(struct sock *sk, int err, u32 daddr, u16 port, u32 info)
+{
+ struct sock_exterr_skb *serr;
+ struct iphdr *iph;
+ struct sk_buff *skb;
+
+ if (!sk->ip_recverr)
+ return;
+
+ skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ iph = (struct iphdr*)skb_put(skb, sizeof(struct iphdr));
+ skb->nh.iph = iph;
+ iph->daddr = daddr;
+
+ serr = SKB_EXT_ERR(skb);
+ serr->ee.ee_errno = err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+ serr->ee.ee_type = 0;
+ serr->ee.ee_code = 0;
+ serr->ee.ee_pad = 0;
+ serr->ee.ee_info = info;
+ serr->ee.ee_data = 0;
+ serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw;
+ serr->port = port;
+
+ skb->h.raw = skb->tail;
+ skb_pull(skb, skb->tail - skb->data);
+
+ if (sock_queue_err_skb(sk, skb))
+ kfree_skb(skb);
+}
+
+/*
+ * Handle MSG_ERRQUEUE
+ */
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
+{
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb, *skb2;
+ struct sockaddr_in *sin;
+ struct {
+ struct sock_extended_err ee;
+ struct sockaddr_in offender;
+ } errhdr;
+ int err;
+ int copied;
+
+ err = -EAGAIN;
+ skb = skb_dequeue(&sk->error_queue);
+ if (skb == NULL)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+ err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
+ if (err)
+ goto out_free_skb;
+
+ serr = SKB_EXT_ERR(skb);
+
+ sin = (struct sockaddr_in *)msg->msg_name;
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = *(u32*)(skb->nh.raw + serr->addr_offset);
+ sin->sin_port = serr->port;
+ }
+
+ memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
+ sin = &errhdr.offender;
+ sin->sin_family = AF_UNSPEC;
+ if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) {
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = skb->nh.iph->saddr;
+ if (sk->ip_cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+ }
+
+ put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr);
+
+ /* Now we could try to dump offended packet options */
+
+ msg->msg_flags |= MSG_ERRQUEUE;
+ err = copied;
+
+ /* Reset and regenerate socket error */
+ sk->err = 0;
+ if ((skb2 = skb_peek(&sk->error_queue)) != NULL) {
+ sk->err = SKB_EXT_ERR(skb2)->ee.ee_errno;
+ sk->error_report(sk);
+ }
+
+out_free_skb:
+ kfree_skb(skb);
+out:
+ return err;
+}
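+
+/*
+ * Consumer-side sketch (standard sockets API, illustrative only): after
+ * enabling IP_RECVERR, a process drains the queue built above with
+ * MSG_ERRQUEUE and reads back the extended error record.
+ *
+ *	struct sock_extended_err *ee = NULL;
+ *	struct cmsghdr *c;
+ *	recvmsg(fd, &msg, MSG_ERRQUEUE);
+ *	for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c))
+ *		if (c->cmsg_level == SOL_IP && c->cmsg_type == IP_RECVERR)
+ *			ee = (struct sock_extended_err *)CMSG_DATA(c);
+ */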
+
+
+/*
+ *	Socket option code for IP. This is the end of the line after any
+ *	TCP, UDP, etc. options on an IP socket.
+ *
+ * We implement IP_TOS (type of service), IP_TTL (time to live).
+ */
+
+int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
+{
+ int val=0,err;
+#if defined(CONFIG_IP_FIREWALL)
+ char tmp_fw[MAX(sizeof(struct ip_fwtest),sizeof(struct ip_fwnew))];
+#endif
+ if(optlen>=sizeof(int)) {
+ if(get_user(val, (int *) optval))
+ return -EFAULT;
+ } else if(optlen>=sizeof(char)) {
+ unsigned char ucval;
+ if(get_user(ucval, (unsigned char *) optval))
+ return -EFAULT;
+ val = (int)ucval;
+ }
+ /* If optlen==0, it is equivalent to val == 0 */
+
+ if(level!=SOL_IP)
+ return -ENOPROTOOPT;
+#ifdef CONFIG_IP_MROUTE
+ if(optname>=MRT_BASE && optname <=MRT_BASE+10)
+ {
+ return ip_mroute_setsockopt(sk,optname,optval,optlen);
+ }
+#endif
+
+ switch(optname)
+ {
+ case IP_OPTIONS:
+ {
+ struct ip_options * opt = NULL;
+ if (optlen > 40 || optlen < 0)
+ return -EINVAL;
+ err = ip_options_get(&opt, optval, optlen, 1);
+ if (err)
+ return err;
+ lock_sock(sk);
+ if (sk->type == SOCK_STREAM) {
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ if (sk->family == PF_INET ||
+ ((tcp_connected(sk->state) || sk->state == TCP_SYN_SENT)
+ && sk->daddr != LOOPBACK4_IPV6)) {
+#endif
+ if (opt)
+ tp->ext_header_len = opt->optlen;
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ }
+#endif
+ }
+ opt = xchg(&sk->opt, opt);
+ release_sock(sk);
+ if (opt)
+ kfree_s(opt, sizeof(struct ip_options) + opt->optlen);
+ return 0;
+ }
+ case IP_PKTINFO:
+ if (val)
+ sk->ip_cmsg_flags |= IP_CMSG_PKTINFO;
+ else
+ sk->ip_cmsg_flags &= ~IP_CMSG_PKTINFO;
+ return 0;
+ case IP_RECVTTL:
+ if (val)
+ sk->ip_cmsg_flags |= IP_CMSG_TTL;
+ else
+ sk->ip_cmsg_flags &= ~IP_CMSG_TTL;
+ return 0;
+ case IP_RECVTOS:
+ if (val)
+ sk->ip_cmsg_flags |= IP_CMSG_TOS;
+ else
+ sk->ip_cmsg_flags &= ~IP_CMSG_TOS;
+ return 0;
+ case IP_RECVOPTS:
+ if (val)
+ sk->ip_cmsg_flags |= IP_CMSG_RECVOPTS;
+ else
+ sk->ip_cmsg_flags &= ~IP_CMSG_RECVOPTS;
+ return 0;
+ case IP_RETOPTS:
+ if (val)
+ sk->ip_cmsg_flags |= IP_CMSG_RETOPTS;
+ else
+ sk->ip_cmsg_flags &= ~IP_CMSG_RETOPTS;
+ return 0;
+ case IP_TOS: /* This sets both TOS and Precedence */
+ /* Reject setting of unused bits */
+ if (val & ~(IPTOS_TOS_MASK|IPTOS_PREC_MASK))
+ return -EINVAL;
+ if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP &&
+ !capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (sk->ip_tos != val) {
+ lock_sock(sk);
+ sk->ip_tos=val;
+ sk->priority = rt_tos2priority(val);
+ dst_release(xchg(&sk->dst_cache, NULL));
+ release_sock(sk);
+ }
+ return 0;
+ case IP_TTL:
+ if (optlen<1)
+ return -EINVAL;
+ if(val==-1)
+ val = ip_statistics.IpDefaultTTL;
+ if(val<1||val>255)
+ return -EINVAL;
+ sk->ip_ttl=val;
+ return 0;
+ case IP_HDRINCL:
+ if(sk->type!=SOCK_RAW)
+ return -ENOPROTOOPT;
+ sk->ip_hdrincl=val?1:0;
+ return 0;
+ case IP_MTU_DISCOVER:
+ if (val<0 || val>2)
+ return -EINVAL;
+ sk->ip_pmtudisc = val;
+ return 0;
+ case IP_RECVERR:
+ sk->ip_recverr = !!val;
+ if (!val)
+ skb_queue_purge(&sk->error_queue);
+ return 0;
+ case IP_MULTICAST_TTL:
+ if (optlen<1)
+ return -EINVAL;
+ if (val==-1)
+ val = 1;
+ if (val < 0 || val > 255)
+ return -EINVAL;
+ sk->ip_mc_ttl=val;
+ return 0;
+ case IP_MULTICAST_LOOP:
+ if (optlen<1)
+ return -EINVAL;
+ sk->ip_mc_loop = val ? 1 : 0;
+ return 0;
+ case IP_MULTICAST_IF:
+ {
+ struct ip_mreqn mreq;
+ struct device *dev = NULL;
+
+ /*
+ * Check the arguments are allowable
+ */
+
+ if (optlen >= sizeof(struct ip_mreqn)) {
+ if (copy_from_user(&mreq,optval,sizeof(mreq)))
+ return -EFAULT;
+ } else {
+ memset(&mreq, 0, sizeof(mreq));
+ if (optlen >= sizeof(struct in_addr) &&
+ copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr)))
+ return -EFAULT;
+ }
+
+ if (!mreq.imr_ifindex) {
+ if (mreq.imr_address.s_addr == INADDR_ANY) {
+ sk->ip_mc_index = 0;
+ sk->ip_mc_addr = 0;
+ return 0;
+ }
+ dev = ip_dev_find(mreq.imr_address.s_addr);
+ } else
+ dev = dev_get_by_index(mreq.imr_ifindex);
+
+ if (!dev)
+ return -EADDRNOTAVAIL;
+
+ if (sk->bound_dev_if && dev->ifindex != sk->bound_dev_if)
+ return -EINVAL;
+
+ sk->ip_mc_index = mreq.imr_ifindex;
+ sk->ip_mc_addr = mreq.imr_address.s_addr;
+ return 0;
+ }
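+		/*
+		 * Caller's view (sketch with a hypothetical interface
+		 * index): either the full ip_mreqn or, for compatibility,
+		 * a bare in_addr is accepted by the code above.
+		 *
+		 *	struct ip_mreqn m;
+		 *	memset(&m, 0, sizeof(m));
+		 *	m.imr_ifindex = 2;
+		 *	setsockopt(fd, SOL_IP, IP_MULTICAST_IF, &m, sizeof(m));
+		 */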
+
+ case IP_ADD_MEMBERSHIP:
+ case IP_DROP_MEMBERSHIP:
+ {
+ struct ip_mreqn mreq;
+
+ if (optlen < sizeof(struct ip_mreq))
+ return -EINVAL;
+ if (optlen >= sizeof(struct ip_mreqn)) {
+ if(copy_from_user(&mreq,optval,sizeof(mreq)))
+ return -EFAULT;
+ } else {
+ memset(&mreq, 0, sizeof(mreq));
+ if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq)))
+ return -EFAULT;
+ }
+
+ if (optname == IP_ADD_MEMBERSHIP)
+ return ip_mc_join_group(sk,&mreq);
+ else
+ return ip_mc_leave_group(sk,&mreq);
+ }
+ case IP_ROUTER_ALERT:
+ return ip_ra_control(sk, val ? 1 : 0, NULL);
+
+#ifdef CONFIG_IP_FIREWALL
+ case IP_FW_MASQ_TIMEOUTS:
+ case IP_FW_APPEND:
+ case IP_FW_REPLACE:
+ case IP_FW_DELETE:
+ case IP_FW_DELETE_NUM:
+ case IP_FW_INSERT:
+ case IP_FW_FLUSH:
+ case IP_FW_ZERO:
+ case IP_FW_CHECK:
+ case IP_FW_CREATECHAIN:
+ case IP_FW_DELETECHAIN:
+ case IP_FW_POLICY:
+ if(!capable(CAP_NET_ADMIN))
+ return -EACCES;
+ if(optlen>sizeof(tmp_fw) || optlen<1)
+ return -EINVAL;
+ if(copy_from_user(&tmp_fw,optval,optlen))
+ return -EFAULT;
+ err=ip_fw_ctl(optname, &tmp_fw,optlen);
+ return -err; /* -0 is 0 after all */
+#endif /* CONFIG_IP_FIREWALL */
+#ifdef CONFIG_IP_MASQUERADE
+ case IP_FW_MASQ_CTL:
+ if(!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if(optlen<1)
+ return -EINVAL;
+ err=ip_masq_uctl(optname, optval ,optlen);
+ return err;
+
+#endif
+ default:
+ return(-ENOPROTOOPT);
+ }
+}
+
+/*
+ * Get the options. Note for future reference. The GET of IP options gets the
+ * _received_ ones. The set sets the _sent_ ones.
+ */
+
+int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
+{
+ int val;
+ int len;
+
+ if(level!=SOL_IP)
+ return -EOPNOTSUPP;
+
+#ifdef CONFIG_IP_MROUTE
+ if(optname>=MRT_BASE && optname <=MRT_BASE+10)
+ {
+ return ip_mroute_getsockopt(sk,optname,optval,optlen);
+ }
+#endif
+
+ if(get_user(len,optlen))
+ return -EFAULT;
+
+ switch(optname)
+ {
+ case IP_OPTIONS:
+ {
+ unsigned char optbuf[sizeof(struct ip_options)+40];
+ struct ip_options * opt = (struct ip_options*)optbuf;
+ lock_sock(sk);
+ opt->optlen = 0;
+ if (sk->opt)
+ memcpy(optbuf, sk->opt, sizeof(struct ip_options)+sk->opt->optlen);
+ release_sock(sk);
+ if (opt->optlen == 0)
+ return put_user(0, optlen);
+
+ ip_options_undo(opt);
+
+ len=min(len, opt->optlen);
+ if(put_user(len, optlen))
+ return -EFAULT;
+ if(copy_to_user(optval, opt->__data, len))
+ return -EFAULT;
+ return 0;
+ }
+ case IP_PKTINFO:
+ val = (sk->ip_cmsg_flags & IP_CMSG_PKTINFO) != 0;
+ break;
+ case IP_RECVTTL:
+ val = (sk->ip_cmsg_flags & IP_CMSG_TTL) != 0;
+ break;
+ case IP_RECVTOS:
+ val = (sk->ip_cmsg_flags & IP_CMSG_TOS) != 0;
+ break;
+ case IP_RECVOPTS:
+ val = (sk->ip_cmsg_flags & IP_CMSG_RECVOPTS) != 0;
+ break;
+ case IP_RETOPTS:
+ val = (sk->ip_cmsg_flags & IP_CMSG_RETOPTS) != 0;
+ break;
+ case IP_TOS:
+ val=sk->ip_tos;
+ break;
+ case IP_TTL:
+ val=sk->ip_ttl;
+ break;
+ case IP_HDRINCL:
+ val=sk->ip_hdrincl;
+ break;
+ case IP_MTU_DISCOVER:
+ val=sk->ip_pmtudisc;
+ break;
+ case IP_MTU:
+ val = 0;
+ lock_sock(sk);
+ if (sk->dst_cache)
+ val = sk->dst_cache->pmtu;
+ release_sock(sk);
+ if (!val)
+ return -ENOTCONN;
+ break;
+ case IP_RECVERR:
+ val=sk->ip_recverr;
+ break;
+ case IP_MULTICAST_TTL:
+ val=sk->ip_mc_ttl;
+ break;
+ case IP_MULTICAST_LOOP:
+ val=sk->ip_mc_loop;
+ break;
+ case IP_MULTICAST_IF:
+ {
+ struct ip_mreqn mreq;
+ len = min(len,sizeof(struct ip_mreqn));
+ if(put_user(len, optlen))
+ return -EFAULT;
+ mreq.imr_ifindex = sk->ip_mc_index;
+ mreq.imr_address.s_addr = sk->ip_mc_addr;
+ mreq.imr_multiaddr.s_addr = 0;
+ if(copy_to_user((void *)optval, &mreq, len))
+ return -EFAULT;
+ return 0;
+ }
+ default:
+ return(-ENOPROTOOPT);
+ }
+
+ if (len < sizeof(int) && len > 0 && val>=0 && val<255) {
+ unsigned char ucval = (unsigned char)val;
+ len = 1;
+ if(put_user(len, optlen))
+ return -EFAULT;
+ if(copy_to_user(optval,&ucval,1))
+ return -EFAULT;
+ } else {
+ len=min(sizeof(int),len);
+ if(put_user(len, optlen))
+ return -EFAULT;
+ if(copy_to_user(optval,&val,len))
+ return -EFAULT;
+ }
+ return 0;
+}
diff --git a/pfinet/linux-src/net/ipv4/ipconfig.c b/pfinet/linux-src/net/ipv4/ipconfig.c
new file mode 100644
index 00000000..0770bad1
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ipconfig.c
@@ -0,0 +1,970 @@
+/*
+ * $Id: ipconfig.c,v 1.20.2.1 1999/06/28 11:33:27 davem Exp $
+ *
+ * Automatic Configuration of IP -- use BOOTP or RARP or user-supplied
+ * information to configure own IP address and routes.
+ *
+ * Copyright (C) 1996--1998 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * Derived from network configuration code in fs/nfs/nfsroot.c,
+ * originally Copyright (C) 1995, 1996 Gero Kuhlmann and me.
+ *
+ * BOOTP rewritten to construct and analyse packets itself instead
+ * of misusing the IP layer. num_bugs_causing_wrong_arp_replies--;
+ * -- MJ, December 1998
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/socket.h>
+#include <linux/route.h>
+#include <linux/udp.h>
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/ipconfig.h>
+
+#include <asm/segment.h>
+#include <asm/uaccess.h>
+#include <asm/checksum.h>
+
+/* Define this to allow debugging output */
+#undef IPCONFIG_DEBUG
+
+#ifdef IPCONFIG_DEBUG
+#define DBG(x) printk x
+#else
+#define DBG(x) do { } while(0)
+#endif
+
+/* Define the timeout for waiting for a RARP/BOOTP reply */
+#define CONF_BASE_TIMEOUT (HZ*5) /* Initial timeout: 5 seconds */
+#define CONF_RETRIES 10 /* 10 retries */
+#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */
+#define CONF_TIMEOUT_MULT *5/4 /* Rate of timeout growth */
+#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */
+
+/* IP configuration */
+static char user_dev_name[IFNAMSIZ] __initdata = { 0, };/* Name of user-selected boot device */
+u32 ic_myaddr __initdata = INADDR_NONE; /* My IP address */
+u32 ic_servaddr __initdata = INADDR_NONE; /* Server IP address */
+u32 ic_gateway __initdata = INADDR_NONE; /* Gateway IP address */
+u32 ic_netmask __initdata = INADDR_NONE; /* Netmask for local subnet */
+int ic_enable __initdata = 1; /* Automatic IP configuration enabled */
+int ic_host_name_set __initdata = 0; /* Host name configured manually */
+int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */
+
+u32 root_server_addr __initdata = INADDR_NONE; /* Address of boot server */
+u8 root_server_path[256] __initdata = { 0, }; /* Path to mount as root */
+
+#if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_RARP)
+
+#define CONFIG_IP_PNP_DYNAMIC
+
+static int ic_proto_enabled __initdata = 0 /* Protocols enabled */
+#ifdef CONFIG_IP_PNP_BOOTP
+ | IC_BOOTP
+#endif
+#ifdef CONFIG_IP_PNP_RARP
+ | IC_RARP
+#endif
+ ;
+static int ic_got_reply __initdata = 0; /* Protocol(s) we got reply from */
+
+#else
+
+static int ic_proto_enabled __initdata = 0;
+
+#endif
+
+static int ic_proto_have_if __initdata = 0;
+
+/*
+ * Network devices
+ */
+
+struct ic_device {
+ struct ic_device *next;
+ struct device *dev;
+ unsigned short flags;
+ int able;
+};
+
+static struct ic_device *ic_first_dev __initdata = NULL;/* List of open devices */
+static struct device *ic_dev __initdata = NULL; /* Selected device */
+
+static int __init ic_open_devs(void)
+{
+ struct ic_device *d, **last;
+ struct device *dev;
+ unsigned short oflags;
+
+ last = &ic_first_dev;
+ for (dev = dev_base; dev; dev = dev->next)
+ if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
+ (!(dev->flags & IFF_LOOPBACK) &&
+ (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
+ strncmp(dev->name, "dummy", 5))) {
+ int able = 0;
+ if (dev->mtu >= 364)
+ able |= IC_BOOTP;
+ else
+				printk(KERN_WARNING "BOOTP: Ignoring device %s, MTU %d too small\n", dev->name, dev->mtu);
+ if (!(dev->flags & IFF_NOARP))
+ able |= IC_RARP;
+ able &= ic_proto_enabled;
+ if (ic_proto_enabled && !able)
+ continue;
+ oflags = dev->flags;
+ if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
+ printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
+ continue;
+ }
+ if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL)))
+ return -1;
+ d->dev = dev;
+ *last = d;
+ last = &d->next;
+ d->flags = oflags;
+ d->able = able;
+ ic_proto_have_if |= able;
+ DBG(("IP-Config: Opened %s (able=%d)\n", dev->name, able));
+ }
+ *last = NULL;
+
+ if (!ic_first_dev) {
+ if (user_dev_name[0])
+ printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
+ else
+ printk(KERN_ERR "IP-Config: No network devices available.\n");
+ return -1;
+ }
+ return 0;
+}
+
+static void __init ic_close_devs(void)
+{
+ struct ic_device *d, *next;
+ struct device *dev;
+
+ next = ic_first_dev;
+ while ((d = next)) {
+ next = d->next;
+ dev = d->dev;
+ if (dev != ic_dev) {
+ DBG(("IP-Config: Downing %s\n", dev->name));
+ dev_change_flags(dev, d->flags);
+ }
+ kfree_s(d, sizeof(struct ic_device));
+ }
+}
+
+/*
+ * Interface to various network functions.
+ */
+
+static inline void
+set_sockaddr(struct sockaddr_in *sin, u32 addr, u16 port)
+{
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = addr;
+ sin->sin_port = port;
+}
+
+static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
+{
+ int res;
+
+ mm_segment_t oldfs = get_fs();
+ set_fs(get_ds());
+ res = devinet_ioctl(cmd, arg);
+ set_fs(oldfs);
+ return res;
+}
+
+static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg)
+{
+ int res;
+
+ mm_segment_t oldfs = get_fs();
+ set_fs(get_ds());
+ res = ip_rt_ioctl(cmd, arg);
+ set_fs(oldfs);
+ return res;
+}
+
+/*
+ * Set up interface addresses and routes.
+ */
+
+static int __init ic_setup_if(void)
+{
+ struct ifreq ir;
+ struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr;
+ int err;
+
+ memset(&ir, 0, sizeof(ir));
+ strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
+ set_sockaddr(sin, ic_myaddr, 0);
+ if ((err = ic_dev_ioctl(SIOCSIFADDR, &ir)) < 0) {
+ printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err);
+ return -1;
+ }
+ set_sockaddr(sin, ic_netmask, 0);
+ if ((err = ic_dev_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
+ printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err);
+ return -1;
+ }
+ set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
+ if ((err = ic_dev_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
+ printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err);
+ return -1;
+ }
+ return 0;
+}
+
+static int __init ic_setup_routes(void)
+{
+ /* No need to setup device routes, only the default route... */
+
+ if (ic_gateway != INADDR_NONE) {
+ struct rtentry rm;
+ int err;
+
+ memset(&rm, 0, sizeof(rm));
+ if ((ic_gateway ^ ic_myaddr) & ic_netmask) {
+ printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n");
+ return -1;
+ }
+ set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0);
+ set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0);
+ set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);
+ rm.rt_flags = RTF_UP | RTF_GATEWAY;
+ if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) {
+ printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Fill in default values for all missing parameters.
+ */
+
+static int __init ic_defaults(void)
+{
+ /*
+ * At this point we have no userspace running so need not
+ * claim locks on system_utsname
+ */
+
+ if (!ic_host_name_set)
+ strcpy(system_utsname.nodename, in_ntoa(ic_myaddr));
+
+ if (root_server_addr == INADDR_NONE)
+ root_server_addr = ic_servaddr;
+
+ if (ic_netmask == INADDR_NONE) {
+ if (IN_CLASSA(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSA_NET);
+ else if (IN_CLASSB(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSB_NET);
+ else if (IN_CLASSC(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSC_NET);
+ else {
+ printk(KERN_ERR "IP-Config: Unable to guess netmask for address %08x\n", ic_myaddr);
+ return -1;
+ }
+ printk("IP-Config: Guessing netmask %s\n", in_ntoa(ic_netmask));
+ }
+
+ return 0;
+}
+
+/*
+ * RARP support.
+ */
+
+#ifdef CONFIG_IP_PNP_RARP
+
+static int ic_rarp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt);
+
+static struct packet_type rarp_packet_type __initdata = {
+ __constant_htons(ETH_P_RARP),
+ NULL, /* Listen to all devices */
+ ic_rarp_recv,
+ NULL,
+ NULL
+};
+
+static inline void ic_rarp_init(void)
+{
+ dev_add_pack(&rarp_packet_type);
+}
+
+static inline void ic_rarp_cleanup(void)
+{
+ dev_remove_pack(&rarp_packet_type);
+}
+
+/*
+ * Process received RARP packet.
+ */
+static int __init
+ic_rarp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
+{
+ struct arphdr *rarp = (struct arphdr *)skb->h.raw;
+ unsigned char *rarp_ptr = (unsigned char *) (rarp + 1);
+ unsigned long sip, tip;
+ unsigned char *sha, *tha; /* s for "source", t for "target" */
+
+ /* If we already have a reply, just drop the packet */
+ if (ic_got_reply)
+ goto drop;
+
+ /* If this test doesn't pass, it's not IP, or we should ignore it anyway */
+ if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd))
+ goto drop;
+
+ /* If it's not a RARP reply, delete it. */
+ if (rarp->ar_op != htons(ARPOP_RREPLY))
+ goto drop;
+
+	/* If the protocol address type isn't IP, delete it. */
+ if (rarp->ar_pro != htons(ETH_P_IP))
+ goto drop;
+
+ /* Extract variable-width fields */
+ sha = rarp_ptr;
+ rarp_ptr += dev->addr_len;
+ memcpy(&sip, rarp_ptr, 4);
+ rarp_ptr += 4;
+ tha = rarp_ptr;
+ rarp_ptr += dev->addr_len;
+ memcpy(&tip, rarp_ptr, 4);
+
+ /* Discard packets which are not meant for us. */
+ if (memcmp(tha, dev->dev_addr, dev->addr_len))
+ goto drop;
+
+ /* Discard packets which are not from specified server. */
+ if (ic_servaddr != INADDR_NONE && ic_servaddr != sip)
+ goto drop;
+
+ /* Victory! The packet is what we were looking for! */
+ if (!ic_got_reply) {
+ ic_got_reply = IC_RARP;
+ ic_dev = dev;
+ if (ic_myaddr == INADDR_NONE)
+ ic_myaddr = tip;
+ ic_servaddr = sip;
+ }
+
+ /* And throw the packet out... */
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+
+/*
+ * Send RARP request packet over all devices which allow RARP.
+ */
+static void __init ic_rarp_send(void)
+{
+ struct ic_device *d;
+
+ for (d=ic_first_dev; d; d=d->next)
+ if (d->able & IC_RARP) {
+ struct device *dev = d->dev;
+ arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL,
+ dev->dev_addr, dev->dev_addr);
+ }
+}
+
+#endif
+
+/*
+ * BOOTP support.
+ */
+
+#ifdef CONFIG_IP_PNP_BOOTP
+
+struct bootp_pkt { /* BOOTP packet format */
+ struct iphdr iph; /* IP header */
+ struct udphdr udph; /* UDP header */
+ u8 op; /* 1=request, 2=reply */
+ u8 htype; /* HW address type */
+ u8 hlen; /* HW address length */
+ u8 hops; /* Used only by gateways */
+ u32 xid; /* Transaction ID */
+ u16 secs; /* Seconds since we started */
+ u16 flags; /* Just what it says */
+ u32 client_ip; /* Client's IP address if known */
+ u32 your_ip; /* Assigned IP address */
+ u32 server_ip; /* Server's IP address */
+ u32 relay_ip; /* IP address of BOOTP relay */
+ u8 hw_addr[16]; /* Client's HW address */
+ u8 serv_name[64]; /* Server host name */
+ u8 boot_file[128]; /* Name of boot file */
+ u8 vendor_area[128]; /* Area for extensions */
+};
+
+#define BOOTP_REQUEST 1
+#define BOOTP_REPLY 2
+
+static u32 ic_bootp_xid;
+
+static int ic_bootp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt);
+
+static struct packet_type bootp_packet_type __initdata = {
+ __constant_htons(ETH_P_IP),
+ NULL, /* Listen to all devices */
+ ic_bootp_recv,
+ NULL,
+ NULL
+};
+
+
+/*
+ * Initialize BOOTP extension fields in the request.
+ */
+static void __init ic_bootp_init_ext(u8 *e)
+{
+ *e++ = 99; /* RFC1048 Magic Cookie */
+ *e++ = 130;
+ *e++ = 83;
+ *e++ = 99;
+ *e++ = 1; /* Subnet mask request */
+ *e++ = 4;
+ e += 4;
+ *e++ = 3; /* Default gateway request */
+ *e++ = 4;
+ e += 4;
+ *e++ = 12; /* Host name request */
+ *e++ = 32;
+ e += 32;
+ *e++ = 40; /* NIS Domain name request */
+ *e++ = 32;
+ e += 32;
+ *e++ = 17; /* Boot path */
+ *e++ = 32;
+ e += 32;
+ *e = 255; /* End of the list */
+}
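+
+/*
+ * Resulting layout (first bytes, shown for illustration): the RFC 1048
+ * cookie is followed by tag/length pairs whose value bytes are left
+ * zeroed for the server to fill in its reply:
+ *
+ *	99 130 83 99   01 04 00 00 00 00   03 04 00 00 00 00   ...
+ *	(cookie)       (subnet mask)       (default gateway)
+ */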
+
+
+/*
+ * Initialize the BOOTP mechanism.
+ */
+static inline void ic_bootp_init(void)
+{
+ get_random_bytes(&ic_bootp_xid, sizeof(u32));
+ DBG(("BOOTP: XID=%08x\n", ic_bootp_xid));
+ dev_add_pack(&bootp_packet_type);
+}
+
+
+/*
+ * BOOTP cleanup.
+ */
+static inline void ic_bootp_cleanup(void)
+{
+ dev_remove_pack(&bootp_packet_type);
+}
+
+
+/*
+ * Send BOOTP request to single interface.
+ */
+static void __init ic_bootp_send_if(struct ic_device *d, u32 jiffies)
+{
+ struct device *dev = d->dev;
+ struct sk_buff *skb;
+ struct bootp_pkt *b;
+ int hh_len = (dev->hard_header_len + 15) & ~15;
+ struct iphdr *h;
+
+ /* Allocate packet */
+ skb = alloc_skb(sizeof(struct bootp_pkt) + hh_len + 15, GFP_KERNEL);
+ if (!skb)
+ return;
+ skb_reserve(skb, hh_len);
+ b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt));
+ memset(b, 0, sizeof(struct bootp_pkt));
+
+ /* Construct IP header */
+ skb->nh.iph = h = &b->iph;
+ h->version = 4;
+ h->ihl = 5;
+ h->tot_len = htons(sizeof(struct bootp_pkt));
+ h->frag_off = htons(IP_DF);
+ h->ttl = 64;
+ h->protocol = IPPROTO_UDP;
+ h->daddr = INADDR_BROADCAST;
+ h->check = ip_fast_csum((unsigned char *) h, h->ihl);
+
+ /* Construct UDP header */
+ b->udph.source = htons(68);
+ b->udph.dest = htons(67);
+ b->udph.len = htons(sizeof(struct bootp_pkt) - sizeof(struct iphdr));
+ /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
+
+ /* Construct BOOTP header */
+ b->op = BOOTP_REQUEST;
+ b->htype = dev->type;
+ b->hlen = dev->addr_len;
+ memcpy(b->hw_addr, dev->dev_addr, dev->addr_len);
+ b->secs = htons(jiffies / HZ);
+ b->xid = ic_bootp_xid;
+ ic_bootp_init_ext(b->vendor_area);
+
+ /* Chain packet down the line... */
+ skb->dev = dev;
+ skb->protocol = __constant_htons(ETH_P_IP);
+ if ((dev->hard_header &&
+ dev->hard_header(skb, dev, ntohs(skb->protocol), dev->broadcast, dev->dev_addr, skb->len) < 0) ||
+ dev_queue_xmit(skb) < 0)
+ printk("E");
+}
+
+
+/*
+ * Send BOOTP requests to all interfaces.
+ */
+static void __init ic_bootp_send(u32 jiffies)
+{
+ struct ic_device *d;
+
+ for(d=ic_first_dev; d; d=d->next)
+ if (d->able & IC_BOOTP)
+ ic_bootp_send_if(d, jiffies);
+}
+
+
+/*
+ * Copy BOOTP-supplied string if not already set.
+ */
+static int __init ic_bootp_string(char *dest, char *src, int len, int max)
+{
+ if (!len)
+ return 0;
+ if (len > max-1)
+ len = max-1;
+ strncpy(dest, src, len);
+ dest[len] = '\0';
+ return 1;
+}
+
+
+/*
+ * Process BOOTP extension.
+ */
+static void __init ic_do_bootp_ext(u8 *ext)
+{
+#ifdef IPCONFIG_DEBUG
+ u8 *c;
+
+ printk("BOOTP: Got extension %02x",*ext);
+ for(c=ext+2; c<ext+2+ext[1]; c++)
+ printk(" %02x", *c);
+ printk("\n");
+#endif
+
+ switch (*ext++) {
+ case 1: /* Subnet mask */
+ if (ic_netmask == INADDR_NONE)
+ memcpy(&ic_netmask, ext+1, 4);
+ break;
+ case 3: /* Default gateway */
+ if (ic_gateway == INADDR_NONE)
+ memcpy(&ic_gateway, ext+1, 4);
+ break;
+ case 12: /* Host name */
+ ic_bootp_string(system_utsname.nodename, ext+1, *ext, __NEW_UTS_LEN);
+ ic_host_name_set = 1;
+ break;
+ case 40: /* NIS Domain name */
+ ic_bootp_string(system_utsname.domainname, ext+1, *ext, __NEW_UTS_LEN);
+ break;
+ case 17: /* Root path */
+ if (!root_server_path[0])
+ ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path));
+ break;
+ }
+}
+
+
+/*
+ * Receive BOOTP reply.
+ */
+static int __init ic_bootp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
+{
+ struct bootp_pkt *b = (struct bootp_pkt *) skb->nh.iph;
+ struct iphdr *h = &b->iph;
+ int len;
+
+ /* If we already have a reply, just drop the packet */
+ if (ic_got_reply)
+ goto drop;
+
+ /* Check whether it's a BOOTP packet */
+ if (skb->pkt_type == PACKET_OTHERHOST ||
+ skb->len < sizeof(struct udphdr) + sizeof(struct iphdr) ||
+ h->ihl != 5 ||
+ h->version != 4 ||
+ ip_fast_csum((char *) h, h->ihl) != 0 ||
+ skb->len < ntohs(h->tot_len) ||
+ h->protocol != IPPROTO_UDP ||
+ b->udph.source != htons(67) ||
+ b->udph.dest != htons(68) ||
+ ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
+ goto drop;
+
+ /* Fragments are not supported */
+ if (h->frag_off & htons(IP_OFFSET|IP_MF)) {
+ printk(KERN_ERR "BOOTP: Ignoring fragmented reply.\n");
+ goto drop;
+ }
+
+ /* Is it a reply to our BOOTP request? */
+ len = ntohs(b->udph.len) - sizeof(struct udphdr);
+ if (len < 300 || /* See RFC 951:2.1 */
+ b->op != BOOTP_REPLY ||
+ b->xid != ic_bootp_xid) {
+ printk("?");
+ goto drop;
+ }
+
+ /* Extract basic fields */
+ ic_myaddr = b->your_ip;
+ ic_servaddr = b->server_ip;
+ ic_got_reply = IC_BOOTP;
+ ic_dev = dev;
+
+ /* Parse extensions */
+ if (b->vendor_area[0] == 99 && /* Check magic cookie */
+ b->vendor_area[1] == 130 &&
+ b->vendor_area[2] == 83 &&
+ b->vendor_area[3] == 99) {
+ u8 *ext = &b->vendor_area[4];
+ u8 *end = (u8 *) b + ntohs(b->iph.tot_len);
+ while (ext < end && *ext != 0xff) {
+ if (*ext == 0) /* Padding */
+ ext++;
+ else {
+ u8 *opt = ext;
+ ext += ext[1] + 2;
+ if (ext <= end)
+ ic_do_bootp_ext(opt);
+ }
+ }
+ }
+
+ if (ic_gateway == INADDR_NONE && b->relay_ip)
+ ic_gateway = b->relay_ip;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+
+#endif
+
+
+/*
+ * Dynamic IP configuration -- BOOTP and RARP.
+ */
+
+#ifdef CONFIG_IP_PNP_DYNAMIC
+
+static int __init ic_dynamic(void)
+{
+ int retries;
+ unsigned long timeout, jiff;
+ unsigned long start_jiffies;
+ int do_rarp = ic_proto_have_if & IC_RARP;
+ int do_bootp = ic_proto_have_if & IC_BOOTP;
+
+ /*
+ * If neither BOOTP nor RARP was selected, return with an error. This
+	 * routine only gets called when some pieces of information are
+	 * missing, and without BOOTP and RARP we are not able to get that
+	 * information.
+ */
+ if (!ic_proto_enabled) {
+ printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
+ return -1;
+ }
+
+#ifdef CONFIG_IP_PNP_BOOTP
+ if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP)
+ printk(KERN_ERR "BOOTP: No suitable device found.\n");
+#endif
+
+#ifdef CONFIG_IP_PNP_RARP
+ if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP)
+ printk(KERN_ERR "RARP: No suitable device found.\n");
+#endif
+
+ if (!ic_proto_have_if)
+ /* Error message already printed */
+ return -1;
+
+ /*
+ * Setup RARP and BOOTP protocols
+ */
+#ifdef CONFIG_IP_PNP_RARP
+ if (do_rarp)
+ ic_rarp_init();
+#endif
+#ifdef CONFIG_IP_PNP_BOOTP
+ if (do_bootp)
+ ic_bootp_init();
+#endif
+
+ /*
+ * Send requests and wait, until we get an answer. This loop
+ * seems to be a terrible waste of CPU time, but actually there is
+ * only one process running at all, so we don't need to use any
+ * scheduler functions.
+ * [Actually we could now, but the nothing else running note still
+ * applies.. - AC]
+ */
+ printk(KERN_NOTICE "Sending %s%s%s requests...",
+ do_bootp ? "BOOTP" : "",
+ do_bootp && do_rarp ? " and " : "",
+ do_rarp ? "RARP" : "");
+ start_jiffies = jiffies;
+ retries = CONF_RETRIES;
+ get_random_bytes(&timeout, sizeof(timeout));
+ timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
+ for(;;) {
+#ifdef CONFIG_IP_PNP_BOOTP
+ if (do_bootp)
+ ic_bootp_send(jiffies - start_jiffies);
+#endif
+#ifdef CONFIG_IP_PNP_RARP
+ if (do_rarp)
+ ic_rarp_send();
+#endif
+ printk(".");
+ jiff = jiffies + timeout;
+ while (jiffies < jiff && !ic_got_reply)
+ ;
+ if (ic_got_reply) {
+ printk(" OK\n");
+ break;
+ }
+ if (! --retries) {
+ printk(" timed out!\n");
+ break;
+ }
+ timeout = timeout CONF_TIMEOUT_MULT;
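+		/* CONF_TIMEOUT_MULT is literally the token sequence "*5/4",
+		   so the line above expands to: timeout = timeout * 5/4,
+		   i.e. 25% growth per retry, capped just below. */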
+ if (timeout > CONF_TIMEOUT_MAX)
+ timeout = CONF_TIMEOUT_MAX;
+ }
+
+#ifdef CONFIG_IP_PNP_RARP
+ if (do_rarp)
+ ic_rarp_cleanup();
+#endif
+#ifdef CONFIG_IP_PNP_BOOTP
+ if (do_bootp)
+ ic_bootp_cleanup();
+#endif
+
+ if (!ic_got_reply)
+ return -1;
+
+ printk("IP-Config: Got %s answer from %s, ",
+ (ic_got_reply & IC_BOOTP) ? "BOOTP" : "RARP",
+ in_ntoa(ic_servaddr));
+ printk("my address is %s\n", in_ntoa(ic_myaddr));
+
+ return 0;
+}
+
+#endif
+
+/*
+ * IP Autoconfig dispatcher.
+ */
+
+int __init ip_auto_config(void)
+{
+ if (!ic_enable)
+ return 0;
+
+ DBG(("IP-Config: Entered.\n"));
+
+ /* Setup all network devices */
+ if (ic_open_devs() < 0)
+ return -1;
+
+ /*
+ * If the config information is insufficient (e.g., our IP address or
+ * IP address of the boot server is missing or we have multiple network
+ * interfaces and no default was set), use BOOTP or RARP to get the
+ * missing values.
+ */
+ if (ic_myaddr == INADDR_NONE ||
+#ifdef CONFIG_ROOT_NFS
+ (root_server_addr == INADDR_NONE && ic_servaddr == INADDR_NONE) ||
+#endif
+ ic_first_dev->next) {
+#ifdef CONFIG_IP_PNP_DYNAMIC
+ if (ic_dynamic() < 0) {
+ printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n");
+ ic_close_devs();
+ return -1;
+ }
+#else
+ printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
+ ic_close_devs();
+ return -1;
+#endif
+ } else {
+ ic_dev = ic_first_dev->dev; /* Device selected manually or only one device -> use it */
+ }
+
+ /*
+	 * Use defaults wherever applicable.
+ */
+ if (ic_defaults() < 0)
+ return -1;
+
+ /*
+ * Close all network devices except the device we've
+ * autoconfigured and set up routes.
+ */
+ ic_close_devs();
+ if (ic_setup_if() < 0 || ic_setup_routes() < 0)
+ return -1;
+
+ DBG(("IP-Config: device=%s, local=%08x, server=%08x, boot=%08x, gw=%08x, mask=%08x\n",
+ ic_dev->name, ic_myaddr, ic_servaddr, root_server_addr, ic_gateway, ic_netmask));
+ DBG(("IP-Config: host=%s, domain=%s, path=`%s'\n", system_utsname.nodename,
+ system_utsname.domainname, root_server_path));
+ return 0;
+}
+
+/*
+ * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
+ * command line parameter. It consists of option fields separated by colons in
+ * the following order:
+ *
+ * <client-ip>:<server-ip>:<gw-ip>:<netmask>:<host name>:<device>:<bootp|rarp|both|off>
+ *
+ * Any of the fields can be empty which means to use a default value:
+ * <client-ip> - address given by BOOTP or RARP
+ * <server-ip> - address of host returning BOOTP or RARP packet
+ * <gw-ip> - none, or the address returned by BOOTP
+ * <netmask> - automatically determined from <client-ip>, or the
+ * one returned by BOOTP
+ * <host name> - <client-ip> in ASCII notation, or the name returned
+ * by BOOTP
+ * <device> - use all available devices
+ * <bootp|rarp|both|off> - protocol(s) to use for the missing values
+ *			('both' tries BOOTP and RARP, 'off' disables them)
+ */
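+
+/*
+ * Example (hypothetical addresses): booting with
+ *
+ *	ip=192.168.7.5:192.168.7.1::255.255.255.0:client7:eth0:off
+ *
+ * sets the local address, boot server, netmask, host name and device
+ * statically, leaves the gateway unset and disables BOOTP/RARP; the
+ * parser below assigns the colon-separated fields in exactly this order.
+ */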
+static int __init ic_proto_name(char *name)
+{
+ if (!strcmp(name, "off")) {
+ ic_proto_enabled = 0;
+ return 1;
+ }
+#ifdef CONFIG_IP_PNP_BOOTP
+ else if (!strcmp(name, "bootp")) {
+ ic_proto_enabled &= ~IC_RARP;
+ return 1;
+ }
+#endif
+#ifdef CONFIG_IP_PNP_RARP
+ else if (!strcmp(name, "rarp")) {
+ ic_proto_enabled &= ~IC_BOOTP;
+ return 1;
+ }
+#endif
+#ifdef CONFIG_IP_PNP_DYNAMIC
+ else if (!strcmp(name, "both")) {
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+void __init ip_auto_config_setup(char *addrs, int *ints)
+{
+ char *cp, *ip, *dp;
+ int num = 0;
+
+ ic_set_manually = 1;
+ if (!strcmp(addrs, "off")) {
+ ic_enable = 0;
+ return;
+ }
+ if (ic_proto_name(addrs))
+ return;
+
+ /* Parse the whole string */
+ ip = addrs;
+ while (ip && *ip) {
+ if ((cp = strchr(ip, ':')))
+ *cp++ = '\0';
+ if (strlen(ip) > 0) {
+ DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip));
+ switch (num) {
+ case 0:
+ if ((ic_myaddr = in_aton(ip)) == INADDR_ANY)
+ ic_myaddr = INADDR_NONE;
+ break;
+ case 1:
+ if ((ic_servaddr = in_aton(ip)) == INADDR_ANY)
+ ic_servaddr = INADDR_NONE;
+ break;
+ case 2:
+ if ((ic_gateway = in_aton(ip)) == INADDR_ANY)
+ ic_gateway = INADDR_NONE;
+ break;
+ case 3:
+ if ((ic_netmask = in_aton(ip)) == INADDR_ANY)
+ ic_netmask = INADDR_NONE;
+ break;
+ case 4:
+ if ((dp = strchr(ip, '.'))) {
+ *dp++ = '\0';
+ strncpy(system_utsname.domainname, dp, __NEW_UTS_LEN);
+ system_utsname.domainname[__NEW_UTS_LEN] = '\0';
+ }
+ strncpy(system_utsname.nodename, ip, __NEW_UTS_LEN);
+ system_utsname.nodename[__NEW_UTS_LEN] = '\0';
+ ic_host_name_set = 1;
+ break;
+ case 5:
+ strncpy(user_dev_name, ip, IFNAMSIZ);
+ user_dev_name[IFNAMSIZ-1] = '\0';
+ break;
+ case 6:
+ ic_proto_name(ip);
+ break;
+ }
+ }
+ ip = cp;
+ num++;
+ }
+}
diff --git a/pfinet/linux-src/net/ipv4/ipip.c b/pfinet/linux-src/net/ipv4/ipip.c
new file mode 100644
index 00000000..0aeef4a3
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ipip.c
@@ -0,0 +1,870 @@
+/*
+ * Linux NET3: IP/IP protocol decoder.
+ *
+ * Version: $Id: ipip.c,v 1.26 1999/03/25 10:04:32 davem Exp $
+ *
+ * Authors:
+ * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
+ *
+ * Fixes:
+ *		Alan Cox	:	Merged and made usable non modular (it's so tiny it's silly as
+ * a module taking up 2 pages).
+ * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
+ * to keep ip_forward happy.
+ * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
+ * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL
+ * David Woodhouse : Perform some basic ICMP handling.
+ * IPIP Routing without decapsulation.
+ * Carlos Picoto : GRE over IP support
+ * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
+ * I do not want to merge them together.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+/* tunnel.c: an IP tunnel driver
+
+ The purpose of this driver is to provide an IP tunnel through
+ which you can tunnel network traffic transparently across subnets.
+
+ This was written by looking at Nick Holloway's dummy driver
+ Thanks for the great code!
+
+ -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
+
+ Minor tweaks:
+ Cleaned up the code a little and added some pre-1.3.0 tweaks.
+ dev->hard_header/hard_header_len changed to use no headers.
+ Comments/bracketing tweaked.
+ Made the tunnels use dev->name not tunnel: when error reporting.
+ Added tx_dropped stat
+
+ -Alan Cox (Alan.Cox@linux.org) 21 March 95
+
+ Reworked:
+ Changed to tunnel to destination gateway in addition to the
+ tunnel's pointopoint address
+ Almost completely rewritten
+ Note: There is currently no firewall or ICMP handling done.
+
+ -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96
+
+*/
+
+/* Things I wish I had known when writing the tunnel driver:
+
+ When the tunnel_xmit() function is called, the skb contains the
+ packet to be sent (plus a great deal of extra info), and dev
+ contains the tunnel device that _we_ are.
+
+ When we are passed a packet, we are expected to fill in the
+ source address with our source IP address.
+
+ What is the proper way to allocate, copy and free a buffer?
+ After you allocate it, it is a "0 length" chunk of memory
+ starting at zero. If you want to add headers to the buffer
+ later, you'll have to call "skb_reserve(skb, amount)" with
+ the amount of memory you want reserved. Then, you call
+ "skb_put(skb, amount)" with the amount of space you want in
+ the buffer. skb_put() returns a pointer to the top (#0) of
+ that buffer. skb->len is set to the amount of space you have
+ "allocated" with skb_put(). You can then write up to skb->len
+ bytes to that buffer. If you need more, you can call skb_put()
+ again with the additional amount of space you need. You can
+ find out how much more space you can allocate by calling
+ "skb_tailroom(skb)".
+ Now, to add header space, call "skb_push(skb, header_len)".
+ This creates space at the beginning of the buffer and returns
+ a pointer to this new space. If later you need to strip a
+ header from a buffer, call "skb_pull(skb, header_len)".
+ skb_headroom() will return how much space is left at the top
+ of the buffer (before the main data). Remember, this headroom
+ space must be reserved before the skb_put() function is called.
+ */
+
+/*
+ This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c
+
+ For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ipip.h>
+
+#define HASH_SIZE 16
+#define HASH(addr) ((addr^(addr>>4))&0xF)
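+/*
+ * The hash XOR-folds the two low nibbles of the address; e.g. for an
+ * address whose low byte is 0x01, (0x1 ^ 0x0) & 0xF = 1, so the tunnel
+ * lands in bucket 1 of the 16 (illustrative check, not in the original).
+ */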
+
+static int ipip_fb_tunnel_init(struct device *dev);
+static int ipip_tunnel_init(struct device *dev);
+
+static struct device ipip_fb_tunnel_dev = {
+ NULL, 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipip_fb_tunnel_init,
+};
+
+static struct ip_tunnel ipip_fb_tunnel = {
+ NULL, &ipip_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"tunl0", }
+};
+
+static struct ip_tunnel *tunnels_r_l[HASH_SIZE];
+static struct ip_tunnel *tunnels_r[HASH_SIZE];
+static struct ip_tunnel *tunnels_l[HASH_SIZE];
+static struct ip_tunnel *tunnels_wc[1];
+static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l };
+
+static struct ip_tunnel * ipip_tunnel_lookup(u32 remote, u32 local)
+{
+ unsigned h0 = HASH(remote);
+ unsigned h1 = HASH(local);
+ struct ip_tunnel *t;
+
+ for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
+ if (local == t->parms.iph.saddr &&
+ remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ for (t = tunnels_r[h0]; t; t = t->next) {
+ if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ for (t = tunnels_l[h1]; t; t = t->next) {
+ if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
+ return t;
+ }
+ if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP))
+ return t;
+ return NULL;
+}
+
+static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t)
+{
+ u32 remote = t->parms.iph.daddr;
+ u32 local = t->parms.iph.saddr;
+ unsigned h = 0;
+ int prio = 0;
+
+ if (remote) {
+ prio |= 2;
+ h ^= HASH(remote);
+ }
+ if (local) {
+ prio |= 1;
+ h ^= HASH(local);
+ }
+ return &tunnels[prio][h];
+}
+
+
+static void ipip_tunnel_unlink(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp;
+
+ for (tp = ipip_bucket(t); *tp; tp = &(*tp)->next) {
+ if (t == *tp) {
+ *tp = t->next;
+ synchronize_bh();
+ break;
+ }
+ }
+}
+
+static void ipip_tunnel_link(struct ip_tunnel *t)
+{
+ struct ip_tunnel **tp = ipip_bucket(t);
+
+ t->next = *tp;
+ wmb();
+ *tp = t;
+}
+
+struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
+{
+ u32 remote = parms->iph.daddr;
+ u32 local = parms->iph.saddr;
+ struct ip_tunnel *t, **tp, *nt;
+ struct device *dev;
+ unsigned h = 0;
+ int prio = 0;
+
+ if (remote) {
+ prio |= 2;
+ h ^= HASH(remote);
+ }
+ if (local) {
+ prio |= 1;
+ h ^= HASH(local);
+ }
+ for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
+ if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
+ return t;
+ }
+ if (!create)
+ return NULL;
+
+ MOD_INC_USE_COUNT;
+ dev = kmalloc(sizeof(*dev) + sizeof(*t), GFP_KERNEL);
+ if (dev == NULL) {
+ MOD_DEC_USE_COUNT;
+ return NULL;
+ }
+ memset(dev, 0, sizeof(*dev) + sizeof(*t));
+ dev->priv = (void*)(dev+1);
+ nt = (struct ip_tunnel*)dev->priv;
+ nt->dev = dev;
+ dev->name = nt->parms.name;
+ dev->init = ipip_tunnel_init;
+ memcpy(&nt->parms, parms, sizeof(*parms));
+ if (dev->name[0] == 0) {
+ int i;
+ for (i=1; i<100; i++) {
+ sprintf(dev->name, "tunl%d", i);
+ if (dev_get(dev->name) == NULL)
+ break;
+ }
+ if (i==100)
+ goto failed;
+ memcpy(parms->name, dev->name, IFNAMSIZ);
+ }
+ if (register_netdevice(dev) < 0)
+ goto failed;
+
+ ipip_tunnel_link(nt);
+ /* Do not decrement MOD_USE_COUNT here. */
+ return nt;
+
+failed:
+ kfree(dev);
+ MOD_DEC_USE_COUNT;
+ return NULL;
+}
+
+
+static void ipip_tunnel_destroy(struct device *dev)
+{
+ if (dev == &ipip_fb_tunnel_dev) {
+ tunnels_wc[0] = NULL;
+ synchronize_bh();
+ } else {
+ ipip_tunnel_unlink((struct ip_tunnel*)dev->priv);
+ kfree(dev);
+ MOD_DEC_USE_COUNT;
+ }
+}
+
+void ipip_err(struct sk_buff *skb, unsigned char *dp, int len)
+{
+#ifndef I_WISH_WORLD_WERE_PERFECT
+
+/* It is not :-( All routers (except for Linux) return only
+   8 bytes of packet payload. This means that precise relaying of
+   ICMP in the real Internet is absolutely infeasible.
+ */
+ struct iphdr *iph = (struct iphdr*)dp;
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+ struct ip_tunnel *t;
+
+ if (len < sizeof(struct iphdr))
+ return;
+
+ switch (type) {
+ default:
+ case ICMP_PARAMETERPROB:
+ return;
+
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ case ICMP_PORT_UNREACH:
+ /* Impossible event. */
+ return;
+ case ICMP_FRAG_NEEDED:
+ /* Soft state for pmtu is maintained by IP core. */
+ return;
+ default:
+ /* All others are translated to HOST_UNREACH.
+ rfc2003 contains "deep thoughts" about NET_UNREACH,
+ I believe they are just ether pollution. --ANK
+ */
+ break;
+ }
+ break;
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return;
+ break;
+ }
+
+ t = ipip_tunnel_lookup(iph->daddr, iph->saddr);
+ if (t == NULL || t->parms.iph.daddr == 0)
+ return;
+ if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+ return;
+
+ if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+ t->err_count++;
+ else
+ t->err_count = 1;
+ t->err_time = jiffies;
+ return;
+#else
+ struct iphdr *iph = (struct iphdr*)dp;
+ int hlen = iph->ihl<<2;
+ struct iphdr *eiph;
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+ int rel_type = 0;
+ int rel_code = 0;
+ int rel_info = 0;
+ struct sk_buff *skb2;
+ struct rtable *rt;
+
+ if (len < hlen + sizeof(struct iphdr))
+ return;
+ eiph = (struct iphdr*)(dp + hlen);
+
+ switch (type) {
+ default:
+ return;
+ case ICMP_PARAMETERPROB:
+ if (skb->h.icmph->un.gateway < hlen)
+ return;
+
+		/* So... this guy found something strange INSIDE the
+		   encapsulated packet. Well, he is a fool, but what can we do?
+ */
+ rel_type = ICMP_PARAMETERPROB;
+ rel_info = skb->h.icmph->un.gateway - hlen;
+ break;
+
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ case ICMP_PORT_UNREACH:
+ /* Impossible event. */
+ return;
+ case ICMP_FRAG_NEEDED:
+			/* And it is the only really necessary thing :-) */
+ rel_info = ntohs(skb->h.icmph->un.frag.mtu);
+ if (rel_info < hlen+68)
+ return;
+ rel_info -= hlen;
+ /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
+ if (rel_info > ntohs(eiph->tot_len))
+ return;
+ break;
+ default:
+			/* All others are translated to HOST_UNREACH.
+			   rfc2003 contains "deep thoughts" about NET_UNREACH,
+			   I believe it is just ether pollution. --ANK
+ */
+ rel_type = ICMP_DEST_UNREACH;
+ rel_code = ICMP_HOST_UNREACH;
+ break;
+ }
+ break;
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return;
+ break;
+ }
+
+ /* Prepare fake skb to feed it to icmp_send */
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2 == NULL)
+ return;
+ dst_release(skb2->dst);
+ skb2->dst = NULL;
+ skb_pull(skb2, skb->data - (u8*)eiph);
+ skb2->nh.raw = skb2->data;
+
+ /* Try to guess incoming interface */
+ if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) {
+ kfree_skb(skb2);
+ return;
+ }
+ skb2->dev = rt->u.dst.dev;
+
+ /* route "incoming" packet */
+ if (rt->rt_flags&RTCF_LOCAL) {
+ ip_rt_put(rt);
+ rt = NULL;
+ if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) ||
+ rt->u.dst.dev->type != ARPHRD_IPGRE) {
+ ip_rt_put(rt);
+ kfree_skb(skb2);
+ return;
+ }
+ } else {
+ ip_rt_put(rt);
+ if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
+ skb2->dst->dev->type != ARPHRD_IPGRE) {
+ kfree_skb(skb2);
+ return;
+ }
+ }
+
+ /* change mtu on this route */
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ if (rel_info > skb2->dst->pmtu) {
+ kfree_skb(skb2);
+ return;
+ }
+ skb2->dst->pmtu = rel_info;
+ rel_info = htonl(rel_info);
+ } else if (type == ICMP_TIME_EXCEEDED) {
+ struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
+ if (t->parms.iph.ttl) {
+ rel_type = ICMP_DEST_UNREACH;
+ rel_code = ICMP_HOST_UNREACH;
+ }
+ }
+
+ icmp_send(skb2, rel_type, rel_code, rel_info);
+ kfree_skb(skb2);
+ return;
+#endif
+}
+
+int ipip_rcv(struct sk_buff *skb, unsigned short len)
+{
+ struct iphdr *iph;
+ struct ip_tunnel *tunnel;
+
+ iph = skb->nh.iph;
+ skb->mac.raw = skb->nh.raw;
+ skb->nh.raw = skb_pull(skb, skb->h.raw - skb->data);
+ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+ skb->protocol = __constant_htons(ETH_P_IP);
+ skb->ip_summed = 0;
+ skb->pkt_type = PACKET_HOST;
+
+ if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) {
+ tunnel->stat.rx_packets++;
+ tunnel->stat.rx_bytes += skb->len;
+ skb->dev = tunnel->dev;
+ dst_release(skb->dst);
+ skb->dst = NULL;
+ netif_rx(skb);
+ return 0;
+ }
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
+ kfree_skb(skb);
+ return 0;
+}
+
+/*
+ * This function assumes it is being called from dev_queue_xmit()
+ * and that skb is filled properly by that function.
+ */
+
+static int ipip_tunnel_xmit(struct sk_buff *skb, struct device *dev)
+{
+ struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+ struct net_device_stats *stats = &tunnel->stat;
+ struct iphdr *tiph = &tunnel->parms.iph;
+ u8 tos = tunnel->parms.iph.tos;
+ u16 df = tiph->frag_off;
+ struct rtable *rt; /* Route to the other host */
+ struct device *tdev; /* Device to other host */
+ struct iphdr *old_iph = skb->nh.iph;
+ struct iphdr *iph; /* Our new IP header */
+ int max_headroom; /* The extra header space needed */
+ u32 dst = tiph->daddr;
+ int mtu;
+
+ if (tunnel->recursion++) {
+ tunnel->stat.collisions++;
+ goto tx_error;
+ }
+
+ if (skb->protocol != __constant_htons(ETH_P_IP))
+ goto tx_error;
+
+ if (tos&1)
+ tos = old_iph->tos;
+
+ if (!dst) {
+ /* NBMA tunnel */
+ if ((rt = (struct rtable*)skb->dst) == NULL) {
+ tunnel->stat.tx_fifo_errors++;
+ goto tx_error;
+ }
+ if ((dst = rt->rt_gateway) == 0)
+ goto tx_error_icmp;
+ }
+
+ if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) {
+ tunnel->stat.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+ tdev = rt->u.dst.dev;
+
+ if (tdev == dev) {
+ ip_rt_put(rt);
+ tunnel->stat.collisions++;
+ goto tx_error;
+ }
+
+ mtu = rt->u.dst.pmtu - sizeof(struct iphdr);
+ if (mtu < 68) {
+ tunnel->stat.collisions++;
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+ if (skb->dst && mtu < skb->dst->pmtu)
+ skb->dst->pmtu = mtu;
+
+ df |= (old_iph->frag_off&__constant_htons(IP_DF));
+
+ if ((old_iph->frag_off&__constant_htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+
+ if (tunnel->err_count > 0) {
+ if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+ tunnel->err_count--;
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ skb->h.raw = skb->nh.raw;
+
+ /*
+ * Okay, now see if we can stuff it in the buffer as-is.
+ */
+ max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));
+
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
+ struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+ if (!new_skb) {
+ ip_rt_put(rt);
+ stats->tx_dropped++;
+ dev_kfree_skb(skb);
+ tunnel->recursion--;
+ return 0;
+ }
+ if (skb->sk)
+ skb_set_owner_w(new_skb, skb->sk);
+ dev_kfree_skb(skb);
+ skb = new_skb;
+ }
+
+ skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+ /*
+ * Push down and install the IPIP header.
+ */
+
+ iph = skb->nh.iph;
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr)>>2;
+ iph->frag_off = df;
+ iph->protocol = IPPROTO_IPIP;
+ iph->tos = tos;
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+
+ if ((iph->ttl = tiph->ttl) == 0)
+ iph->ttl = old_iph->ttl;
+
+ iph->tot_len = htons(skb->len);
+ iph->id = htons(ip_id_count++);
+ ip_send_check(iph);
+
+ stats->tx_bytes += skb->len;
+ stats->tx_packets++;
+ ip_send(skb);
+ tunnel->recursion--;
+ return 0;
+
+tx_error_icmp:
+ dst_link_failure(skb);
+tx_error:
+ stats->tx_errors++;
+ dev_kfree_skb(skb);
+ tunnel->recursion--;
+ return 0;
+}
+
+static int
+ipip_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel_parm p;
+ struct ip_tunnel *t;
+
+ MOD_INC_USE_COUNT;
+
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ t = NULL;
+ if (dev == &ipip_fb_tunnel_dev) {
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+ err = -EFAULT;
+ break;
+ }
+ t = ipip_tunnel_locate(&p, 0);
+ }
+ if (t == NULL)
+ t = (struct ip_tunnel*)dev->priv;
+ memcpy(&p, &t->parms, sizeof(p));
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ err = -EFAULT;
+ break;
+
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ goto done;
+
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ goto done;
+
+ err = -EINVAL;
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
+ p.iph.ihl != 5 || (p.iph.frag_off&__constant_htons(~IP_DF)))
+ goto done;
+ if (p.iph.ttl)
+ p.iph.frag_off |= __constant_htons(IP_DF);
+
+ t = ipip_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
+
+ if (dev != &ipip_fb_tunnel_dev && cmd == SIOCCHGTUNNEL &&
+ t != &ipip_fb_tunnel) {
+ if (t != NULL) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
+ (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
+ err = -EINVAL;
+ break;
+ }
+ t = (struct ip_tunnel*)dev->priv;
+ start_bh_atomic();
+ ipip_tunnel_unlink(t);
+ t->parms.iph.saddr = p.iph.saddr;
+ t->parms.iph.daddr = p.iph.daddr;
+ memcpy(dev->dev_addr, &p.iph.saddr, 4);
+ memcpy(dev->broadcast, &p.iph.daddr, 4);
+ ipip_tunnel_link(t);
+ end_bh_atomic();
+ netdev_state_change(dev);
+ }
+ }
+
+ if (t) {
+ err = 0;
+ if (cmd == SIOCCHGTUNNEL) {
+ t->parms.iph.ttl = p.iph.ttl;
+ t->parms.iph.tos = p.iph.tos;
+ t->parms.iph.frag_off = p.iph.frag_off;
+ }
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
+ err = -EFAULT;
+ } else
+ err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+ break;
+
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ goto done;
+
+ if (dev == &ipip_fb_tunnel_dev) {
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ goto done;
+ err = -ENOENT;
+ if ((t = ipip_tunnel_locate(&p, 0)) == NULL)
+ goto done;
+ err = -EPERM;
+ if (t == &ipip_fb_tunnel)
+ goto done;
+ }
+ err = unregister_netdevice(dev);
+ break;
+
+ default:
+ err = -EINVAL;
+ }
+
+done:
+ MOD_DEC_USE_COUNT;
+ return err;
+}
+
+static struct net_device_stats *ipip_tunnel_get_stats(struct device *dev)
+{
+ return &(((struct ip_tunnel*)dev->priv)->stat);
+}
+
+static int ipip_tunnel_change_mtu(struct device *dev, int new_mtu)
+{
+ if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+static void ipip_tunnel_init_gen(struct device *dev)
+{
+ struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
+
+ dev->destructor = ipip_tunnel_destroy;
+ dev->hard_start_xmit = ipip_tunnel_xmit;
+ dev->get_stats = ipip_tunnel_get_stats;
+ dev->do_ioctl = ipip_tunnel_ioctl;
+ dev->change_mtu = ipip_tunnel_change_mtu;
+
+ dev_init_buffers(dev);
+
+ dev->type = ARPHRD_TUNNEL;
+ dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
+ dev->mtu = 1500 - sizeof(struct iphdr);
+ dev->flags = IFF_NOARP;
+ dev->iflink = 0;
+ dev->addr_len = 4;
+ memcpy(dev->dev_addr, &t->parms.iph.saddr, 4);
+ memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
+}
+
+static int ipip_tunnel_init(struct device *dev)
+{
+ struct device *tdev = NULL;
+ struct ip_tunnel *tunnel;
+ struct iphdr *iph;
+
+ tunnel = (struct ip_tunnel*)dev->priv;
+ iph = &tunnel->parms.iph;
+
+ ipip_tunnel_init_gen(dev);
+
+ if (iph->daddr) {
+ struct rtable *rt;
+ if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) {
+ tdev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ }
+ dev->flags |= IFF_POINTOPOINT;
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = dev_get_by_index(tunnel->parms.link);
+
+ if (tdev) {
+ dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
+ dev->mtu = tdev->mtu - sizeof(struct iphdr);
+ }
+ dev->iflink = tunnel->parms.link;
+
+ return 0;
+}
+
+#ifdef MODULE
+static int ipip_fb_tunnel_open(struct device *dev)
+{
+ MOD_INC_USE_COUNT;
+ return 0;
+}
+
+static int ipip_fb_tunnel_close(struct device *dev)
+{
+ MOD_DEC_USE_COUNT;
+ return 0;
+}
+#endif
+
+__initfunc(int ipip_fb_tunnel_init(struct device *dev))
+{
+ struct iphdr *iph;
+
+ ipip_tunnel_init_gen(dev);
+#ifdef MODULE
+ dev->open = ipip_fb_tunnel_open;
+ dev->stop = ipip_fb_tunnel_close;
+#endif
+
+ iph = &ipip_fb_tunnel.parms.iph;
+ iph->version = 4;
+ iph->protocol = IPPROTO_IPIP;
+ iph->ihl = 5;
+
+ tunnels_wc[0] = &ipip_fb_tunnel;
+ return 0;
+}
+
+static struct inet_protocol ipip_protocol = {
+ ipip_rcv, /* IPIP handler */
+ ipip_err, /* TUNNEL error control */
+ 0, /* next */
+ IPPROTO_IPIP, /* protocol ID */
+ 0, /* copy */
+ NULL, /* data */
+ "IPIP" /* name */
+};
+
+#ifdef MODULE
+int init_module(void)
+#else
+__initfunc(int ipip_init(void))
+#endif
+{
+ printk(KERN_INFO "IPv4 over IPv4 tunneling driver\n");
+
+ ipip_fb_tunnel_dev.priv = (void*)&ipip_fb_tunnel;
+ ipip_fb_tunnel_dev.name = ipip_fb_tunnel.parms.name;
+#ifdef MODULE
+ register_netdev(&ipip_fb_tunnel_dev);
+#else
+ register_netdevice(&ipip_fb_tunnel_dev);
+#endif
+
+ inet_add_protocol(&ipip_protocol);
+ return 0;
+}
+
+#ifdef MODULE
+
+void cleanup_module(void)
+{
+ if ( inet_del_protocol(&ipip_protocol) < 0 )
+ printk(KERN_INFO "ipip close: can't remove protocol\n");
+
+ unregister_netdevice(&ipip_fb_tunnel_dev);
+}
+
+#endif
diff --git a/pfinet/linux-src/net/ipv4/ipmr.c b/pfinet/linux-src/net/ipv4/ipmr.c
new file mode 100644
index 00000000..cd51cd9a
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/ipmr.c
@@ -0,0 +1,1609 @@
+/*
+ * IP multicast routing support for mrouted 3.6/3.8
+ *
+ * (c) 1995 Alan Cox, <alan@redhat.com>
+ * Linux Consultancy and Custom Driver Development
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Version: $Id: ipmr.c,v 1.40.2.2 1999/06/20 21:27:44 davem Exp $
+ *
+ * Fixes:
+ * Michael Chastain : Incorrect size of copying.
+ * Alan Cox : Added the cache manager code
+ * Alan Cox : Fixed the clone/copy bug and device race.
+ * Mike McLagan : Routing by source
+ * Malcolm Beattie : Buffer handling fixes.
+ * Alexey Kuznetsov : Double buffer free and other fixes.
+ * SVR Anand : Fixed several multicast bugs and problems.
+ * Alexey Kuznetsov : Status, optimisations and more.
+ * Brad Parker : Better behaviour on mrouted upcall
+ * overflow.
+ * Carlos Picoto : PIMv1 Support
+ * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header
+ *	Relax this requirement to work with older peers.
+ *
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/ip_fw.h>
+#include <linux/firewall.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+
+#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
+#define CONFIG_IP_PIMSM 1
+#endif
+
+/*
+ * Multicast router control variables
+ */
+
+static struct vif_device vif_table[MAXVIFS]; /* Devices */
+static unsigned long vifc_map; /* Active device map */
+static int maxvif;
+int mroute_do_assert = 0; /* Set in PIM assert */
+int mroute_do_pim = 0;
+static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */
+int cache_resolve_queue_len = 0; /* Size of unresolved */
+
+static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
+static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
+static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
+
+extern struct inet_protocol pim_protocol;
+
+static
+struct device *ipmr_new_tunnel(struct vifctl *v)
+{
+ struct device *dev = NULL;
+
+ rtnl_lock();
+ dev = dev_get("tunl0");
+
+ if (dev) {
+ int err;
+ struct ifreq ifr;
+ mm_segment_t oldfs;
+ struct ip_tunnel_parm p;
+ struct in_device *in_dev;
+
+ memset(&p, 0, sizeof(p));
+ p.iph.daddr = v->vifc_rmt_addr.s_addr;
+ p.iph.saddr = v->vifc_lcl_addr.s_addr;
+ p.iph.version = 4;
+ p.iph.ihl = 5;
+ p.iph.protocol = IPPROTO_IPIP;
+ sprintf(p.name, "dvmrp%d", v->vifc_vifi);
+ ifr.ifr_ifru.ifru_data = (void*)&p;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
+ set_fs(oldfs);
+
+ if (err == 0 && (dev = dev_get(p.name)) != NULL) {
+ dev->flags |= IFF_MULTICAST;
+
+ in_dev = dev->ip_ptr;
+ if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
+ goto failure;
+ in_dev->cnf.rp_filter = 0;
+
+ if (dev_open(dev))
+ goto failure;
+ }
+ }
+ rtnl_unlock();
+ return dev;
+
+failure:
+ unregister_netdevice(dev);
+ rtnl_unlock();
+ return NULL;
+}
+
+#ifdef CONFIG_IP_PIMSM
+
+static int reg_vif_num = -1;
+static struct device * reg_dev;
+
+static int reg_vif_xmit(struct sk_buff *skb, struct device *dev)
+{
+ ((struct net_device_stats*)dev->priv)->tx_bytes += skb->len;
+ ((struct net_device_stats*)dev->priv)->tx_packets++;
+ ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
+ kfree_skb(skb);
+ return 0;
+}
+
+static struct net_device_stats *reg_vif_get_stats(struct device *dev)
+{
+ return (struct net_device_stats*)dev->priv;
+}
+
+static
+struct device *ipmr_reg_vif(struct vifctl *v)
+{
+ struct device *dev;
+ struct in_device *in_dev;
+ int size;
+
+ size = sizeof(*dev) + IFNAMSIZ + sizeof(struct net_device_stats);
+ dev = kmalloc(size, GFP_KERNEL);
+ if (!dev)
+ return NULL;
+
+ memset(dev, 0, size);
+
+ dev->priv = dev + 1;
+ dev->name = dev->priv + sizeof(struct net_device_stats);
+
+ strcpy(dev->name, "pimreg");
+
+ dev->type = ARPHRD_PIMREG;
+ dev->mtu = 1500 - sizeof(struct iphdr) - 8;
+ dev->flags = IFF_NOARP;
+ dev->hard_start_xmit = reg_vif_xmit;
+ dev->get_stats = reg_vif_get_stats;
+
+ rtnl_lock();
+
+ if (register_netdevice(dev)) {
+ rtnl_unlock();
+ kfree(dev);
+ return NULL;
+ }
+ dev->iflink = 0;
+
+ if ((in_dev = inetdev_init(dev)) == NULL)
+ goto failure;
+
+ in_dev->cnf.rp_filter = 0;
+
+ if (dev_open(dev))
+ goto failure;
+
+ rtnl_unlock();
+ reg_dev = dev;
+ return dev;
+
+failure:
+ unregister_netdevice(dev);
+ rtnl_unlock();
+ kfree(dev);
+ return NULL;
+}
+#endif
+
+/*
+ * Delete a VIF entry
+ */
+
+static int vif_delete(int vifi)
+{
+ struct vif_device *v;
+ struct device *dev;
+ struct in_device *in_dev;
+
+ if (vifi < 0 || vifi >= maxvif || !(vifc_map&(1<<vifi)))
+ return -EADDRNOTAVAIL;
+
+ v = &vif_table[vifi];
+
+ dev = v->dev;
+ v->dev = NULL;
+ vifc_map &= ~(1<<vifi);
+
+ if ((in_dev = dev->ip_ptr) != NULL)
+ in_dev->cnf.mc_forwarding = 0;
+
+ dev_set_allmulti(dev, -1);
+ ip_rt_multicast_event(in_dev);
+
+ if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) {
+#ifdef CONFIG_IP_PIMSM
+ if (vifi == reg_vif_num) {
+ reg_vif_num = -1;
+ reg_dev = NULL;
+ }
+#endif
+ unregister_netdevice(dev);
+ if (v->flags&VIFF_REGISTER)
+ kfree(dev);
+ }
+
+ if (vifi+1 == maxvif) {
+ int tmp;
+ for (tmp=vifi-1; tmp>=0; tmp--) {
+ if (vifc_map&(1<<tmp))
+ break;
+ }
+ maxvif = tmp+1;
+ }
+ return 0;
+}
+
+static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls)
+{
+ int vifi;
+
+ start_bh_atomic();
+
+ cache->mfc_minvif = MAXVIFS;
+ cache->mfc_maxvif = 0;
+ memset(cache->mfc_ttls, 255, MAXVIFS);
+
+ for (vifi=0; vifi<maxvif; vifi++) {
+ if (vifc_map&(1<<vifi) && ttls[vifi] && ttls[vifi] < 255) {
+ cache->mfc_ttls[vifi] = ttls[vifi];
+ if (cache->mfc_minvif > vifi)
+ cache->mfc_minvif = vifi;
+ if (cache->mfc_maxvif <= vifi)
+ cache->mfc_maxvif = vifi + 1;
+ }
+ }
+ end_bh_atomic();
+}
+
+/*
+ * Delete a multicast route cache entry
+ */
+
+static void ipmr_cache_delete(struct mfc_cache *cache)
+{
+ struct sk_buff *skb;
+ int line;
+ struct mfc_cache **cp;
+
+ /*
+ * Find the right cache line
+ */
+
+ line=MFC_HASH(cache->mfc_mcastgrp,cache->mfc_origin);
+ cp=&(mfc_cache_array[line]);
+
+ if(cache->mfc_flags&MFC_QUEUED)
+ del_timer(&cache->mfc_timer);
+
+ /*
+ * Unlink the buffer
+ */
+
+ while(*cp!=NULL)
+ {
+ if(*cp==cache)
+ {
+ *cp=cache->next;
+ break;
+ }
+ cp=&((*cp)->next);
+ }
+
+ /*
+ * Free the buffer. If it is a pending resolution
+ * clean up the other resources.
+ */
+
+ if(cache->mfc_flags&MFC_QUEUED)
+ {
+ cache_resolve_queue_len--;
+ while((skb=skb_dequeue(&cache->mfc_unresolved))) {
+#ifdef CONFIG_RTNETLINK
+ if (skb->nh.iph->version == 0) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+ nlh->nlmsg_type = NLMSG_ERROR;
+ nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+ skb_trim(skb, nlh->nlmsg_len);
+ ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
+ netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
+ } else
+#endif
+ kfree_skb(skb);
+ }
+ }
+	kfree_s(cache, sizeof(*cache));	/* free the whole struct, not a pointer's size */
+}
+
+/*
+ * Cache expiry timer
+ */
+
+static void ipmr_cache_timer(unsigned long data)
+{
+ struct mfc_cache *cache=(struct mfc_cache *)data;
+ ipmr_cache_delete(cache);
+}
+
+/*
+ * Insert a multicast cache entry
+ */
+
+static void ipmr_cache_insert(struct mfc_cache *c)
+{
+ int line=MFC_HASH(c->mfc_mcastgrp,c->mfc_origin);
+ c->next=mfc_cache_array[line];
+ mfc_cache_array[line]=c;
+}
+
+/*
+ * Find a multicast cache entry
+ */
+
+struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
+{
+ int line=MFC_HASH(mcastgrp,origin);
+ struct mfc_cache *cache;
+
+ cache=mfc_cache_array[line];
+ while(cache!=NULL)
+ {
+ if(cache->mfc_origin==origin && cache->mfc_mcastgrp==mcastgrp)
+ return cache;
+ cache=cache->next;
+ }
+ return NULL;
+}
+
+/*
+ * Allocate a multicast cache entry
+ */
+
+static struct mfc_cache *ipmr_cache_alloc(int priority)
+{
+ struct mfc_cache *c=(struct mfc_cache *)kmalloc(sizeof(struct mfc_cache), priority);
+ if(c==NULL)
+ return NULL;
+ memset(c, 0, sizeof(*c));
+ skb_queue_head_init(&c->mfc_unresolved);
+ init_timer(&c->mfc_timer);
+ c->mfc_timer.data=(long)c;
+ c->mfc_timer.function=ipmr_cache_timer;
+ c->mfc_minvif = MAXVIFS;
+ return c;
+}
+
+/*
+ * A cache entry has gone into a resolved state from queued
+ */
+
+static void ipmr_cache_resolve(struct mfc_cache *cache)
+{
+ struct sk_buff *skb;
+
+ start_bh_atomic();
+
+ /*
+ * Kill the queue entry timer.
+ */
+
+ del_timer(&cache->mfc_timer);
+
+ if (cache->mfc_flags&MFC_QUEUED) {
+ cache->mfc_flags&=~MFC_QUEUED;
+ cache_resolve_queue_len--;
+ }
+
+ end_bh_atomic();
+
+ /*
+ * Play the pending entries through our router
+ */
+ while((skb=skb_dequeue(&cache->mfc_unresolved))) {
+#ifdef CONFIG_RTNETLINK
+ if (skb->nh.iph->version == 0) {
+ int err;
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+
+ if (ipmr_fill_mroute(skb, cache, NLMSG_DATA(nlh)) > 0) {
+ nlh->nlmsg_len = skb->tail - (u8*)nlh;
+ } else {
+ nlh->nlmsg_type = NLMSG_ERROR;
+ nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+ skb_trim(skb, nlh->nlmsg_len);
+ ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE;
+ }
+ err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
+ } else
+#endif
+ ip_mr_forward(skb, cache, 0);
+ }
+}
+
+/*
+ *	Bounce a cache query up to mrouted. We could use netlink for this, but
+ *	mrouted expects the following bizarre scheme.
+ */
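+
+/* A hedged userspace sketch of that scheme (illustrative only, kept
+   out of the build with #if 0; the function name and buffer size are
+   assumptions, and it would need <unistd.h> and <linux/mroute.h>).
+   An upcall arrives on the raw IGMP socket as a pseudo IP header
+   overlaid by struct igmpmsg; a zero im_mbz (the protocol field)
+   marks it as a kernel message rather than real IGMP. */
+#if 0
+static void mrouted_read_upcall(int igmp_sock)
+{
+	char buf[2048];
+	int n = read(igmp_sock, buf, sizeof(buf));
+	struct igmpmsg *msg = (struct igmpmsg *)buf;
+
+	if (n < (int)sizeof(*msg) || msg->im_mbz != 0)
+		return;		/* ordinary IGMP, not an upcall */
+	switch (msg->im_msgtype) {
+	case IGMPMSG_NOCACHE:	/* resolve (S,G), then MRT_ADD_MFC */
+	case IGMPMSG_WRONGVIF:	/* wrong-interface report (assert) */
+		break;
+	}
+}
+#endif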
+
+static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
+{
+ struct sk_buff *skb;
+ int ihl = pkt->nh.iph->ihl<<2;
+ struct igmphdr *igmp;
+ struct igmpmsg *msg;
+ int ret;
+
+ if (mroute_socket==NULL)
+ return -EINVAL;
+
+#ifdef CONFIG_IP_PIMSM
+ if (assert == IGMPMSG_WHOLEPKT)
+ skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
+ else
+#endif
+ skb = alloc_skb(128, GFP_ATOMIC);
+
+ if(!skb)
+ return -ENOBUFS;
+
+#ifdef CONFIG_IP_PIMSM
+ if (assert == IGMPMSG_WHOLEPKT) {
+ /* Ugly, but we have no choice with this interface.
+ Duplicate old header, fix ihl, length etc.
+ And all this only to mangle msg->im_msgtype and
+ to set msg->im_mbz to "mbz" :-)
+ */
+ msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
+ skb->nh.raw = skb->h.raw = (u8*)msg;
+ memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
+ msg->im_msgtype = IGMPMSG_WHOLEPKT;
+ msg->im_mbz = 0;
+ msg->im_vif = reg_vif_num;
+ skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
+ skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
+ } else
+#endif
+ {
+
+ /*
+ * Copy the IP header
+ */
+
+ skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
+ memcpy(skb->data,pkt->data,ihl);
+ skb->nh.iph->protocol = 0; /* Flag to the kernel this is a route add */
+ msg = (struct igmpmsg*)skb->nh.iph;
+ msg->im_vif = vifi;
+ skb->dst = dst_clone(pkt->dst);
+
+ /*
+ * Add our header
+ */
+
+ igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
+ igmp->type =
+ msg->im_msgtype = assert;
+ igmp->code = 0;
+ skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */
+ skb->h.raw = skb->nh.raw;
+ }
+
+ /*
+ * Deliver to mrouted
+ */
+ if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
+ kfree_skb(skb);
+ }
+
+ return ret;
+}
+
+/*
+ * Queue a packet for resolution
+ */
+
+static int ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct sk_buff *skb)
+{
+ if(cache==NULL)
+ {
+ /*
+ * Create a new entry if allowable
+ */
+ if(cache_resolve_queue_len>=10 || (cache=ipmr_cache_alloc(GFP_ATOMIC))==NULL)
+ {
+ kfree_skb(skb);
+ return -ENOBUFS;
+ }
+ /*
+ * Fill in the new cache entry
+ */
+ cache->mfc_parent=ALL_VIFS;
+ cache->mfc_origin=skb->nh.iph->saddr;
+ cache->mfc_mcastgrp=skb->nh.iph->daddr;
+ cache->mfc_flags=MFC_QUEUED;
+ /*
+ * Link to the unresolved list
+ */
+ ipmr_cache_insert(cache);
+ cache_resolve_queue_len++;
+ /*
+ * Fire off the expiry timer
+ */
+ cache->mfc_timer.expires=jiffies+10*HZ;
+ add_timer(&cache->mfc_timer);
+ /*
+ * Reflect first query at mrouted.
+ */
+ if(mroute_socket)
+ {
+ /* If the report failed throw the cache entry
+ out - Brad Parker
+
+ OK, OK, Brad. Only do not forget to free skb
+ and return :-) --ANK
+ */
+ if (ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE)<0) {
+ ipmr_cache_delete(cache);
+ kfree_skb(skb);
+ return -ENOBUFS;
+ }
+ }
+ }
+ /*
+ * See if we can append the packet
+ */
+ if(cache->mfc_queuelen>3)
+ {
+ kfree_skb(skb);
+ return -ENOBUFS;
+ }
+ cache->mfc_queuelen++;
+ skb_queue_tail(&cache->mfc_unresolved,skb);
+ return 0;
+}
+
+/*
+ * MFC cache manipulation by user space mroute daemon
+ */
+
+int ipmr_mfc_modify(int action, struct mfcctl *mfc)
+{
+ struct mfc_cache *cache;
+
+ if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
+ return -EINVAL;
+ /*
+ * Find the cache line
+ */
+
+ start_bh_atomic();
+
+ cache=ipmr_cache_find(mfc->mfcc_origin.s_addr,mfc->mfcc_mcastgrp.s_addr);
+
+ /*
+ * Delete an entry
+ */
+ if(action==MRT_DEL_MFC)
+ {
+ if(cache)
+ {
+ ipmr_cache_delete(cache);
+ end_bh_atomic();
+ return 0;
+ }
+ end_bh_atomic();
+ return -ENOENT;
+ }
+ if(cache)
+ {
+
+ /*
+ * Update the cache, see if it frees a pending queue
+ */
+
+ cache->mfc_flags|=MFC_RESOLVED;
+ cache->mfc_parent=mfc->mfcc_parent;
+ ipmr_update_threshoulds(cache, mfc->mfcc_ttls);
+
+ /*
+ * Check to see if we resolved a queued list. If so we
+ * need to send on the frames and tidy up.
+ */
+
+ if(cache->mfc_flags&MFC_QUEUED)
+ ipmr_cache_resolve(cache); /* Unhook & send the frames */
+ end_bh_atomic();
+ return 0;
+ }
+
+ /*
+ * Unsolicited update - that's ok, add anyway.
+ */
+
+
+ cache=ipmr_cache_alloc(GFP_ATOMIC);
+ if(cache==NULL)
+ {
+ end_bh_atomic();
+ return -ENOMEM;
+ }
+ cache->mfc_flags=MFC_RESOLVED;
+ cache->mfc_origin=mfc->mfcc_origin.s_addr;
+ cache->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
+ cache->mfc_parent=mfc->mfcc_parent;
+ ipmr_update_threshoulds(cache, mfc->mfcc_ttls);
+ ipmr_cache_insert(cache);
+ end_bh_atomic();
+ return 0;
+}
+
+static void mrtsock_destruct(struct sock *sk)
+{
+ if (sk == mroute_socket) {
+ ipv4_devconf.mc_forwarding = 0;
+
+ mroute_socket=NULL;
+ synchronize_bh();
+
+ mroute_close(sk);
+ }
+}
+
+/*
+ * Socket options and virtual interface manipulation. The whole
+ * virtual interface system is a complete heap, but unfortunately
+ * that's how BSD mrouted happens to think. Maybe one day with a proper
+ * MOSPF/PIM router set up we can clean this up.
+ */
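+
+/* A hedged userspace sketch of the expected usage (illustrative only,
+   kept out of the build with #if 0; the function name is an
+   assumption, and it would need <sys/socket.h>, <netinet/in.h> and
+   <linux/mroute.h>). The daemon must claim the single
+   multicast-routing socket with MRT_INIT, on a raw IGMP socket,
+   before adding VIFs or MFC entries. */
+#if 0
+static int mrouted_open(void)
+{
+	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
+	int one = 1;
+
+	if (s < 0)
+		return -1;
+	/* Only one process may hold the mroute socket at a time. */
+	if (setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one)) < 0)
+		return -1;
+	return s;
+}
+#endif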
+
+int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen)
+{
+ struct vifctl vif;
+ struct mfcctl mfc;
+
+ if(optname!=MRT_INIT)
+ {
+ if(sk!=mroute_socket)
+ return -EACCES;
+ }
+
+ switch(optname)
+ {
+ case MRT_INIT:
+ if(sk->type!=SOCK_RAW || sk->num!=IPPROTO_IGMP)
+ return -EOPNOTSUPP;
+ if(optlen!=sizeof(int))
+ return -ENOPROTOOPT;
+ {
+ int opt;
+ if (get_user(opt,(int *)optval))
+ return -EFAULT;
+ if (opt != 1)
+ return -ENOPROTOOPT;
+ }
+ if(mroute_socket)
+ return -EADDRINUSE;
+ mroute_socket=sk;
+ ipv4_devconf.mc_forwarding = 1;
+ if (ip_ra_control(sk, 1, mrtsock_destruct) == 0)
+ return 0;
+ mrtsock_destruct(sk);
+ return -EADDRINUSE;
+ case MRT_DONE:
+ return ip_ra_control(sk, 0, NULL);
+ case MRT_ADD_VIF:
+ case MRT_DEL_VIF:
+ if(optlen!=sizeof(vif))
+ return -EINVAL;
+ if (copy_from_user(&vif,optval,sizeof(vif)))
+ return -EFAULT;
+ if(vif.vifc_vifi >= MAXVIFS)
+ return -ENFILE;
+ if(optname==MRT_ADD_VIF)
+ {
+ struct vif_device *v=&vif_table[vif.vifc_vifi];
+ struct device *dev;
+ struct in_device *in_dev;
+
+ /* Is vif busy ? */
+ if (vifc_map&(1<<vif.vifc_vifi))
+ return -EADDRINUSE;
+
+ switch (vif.vifc_flags) {
+#ifdef CONFIG_IP_PIMSM
+ case VIFF_REGISTER:
+
+ /*
+ * Special Purpose VIF in PIM
+ * All the packets will be sent to the daemon
+ */
+ if (reg_vif_num >= 0)
+ return -EADDRINUSE;
+ reg_vif_num = vif.vifc_vifi;
+ dev = ipmr_reg_vif(&vif);
+ if (!dev) {
+ reg_vif_num = -1;
+ return -ENOBUFS;
+ }
+ break;
+#endif
+ case VIFF_TUNNEL:
+ dev = ipmr_new_tunnel(&vif);
+ if (!dev)
+ return -ENOBUFS;
+ break;
+ case 0:
+ dev=ip_dev_find(vif.vifc_lcl_addr.s_addr);
+ if (!dev)
+ return -EADDRNOTAVAIL;
+ break;
+ default:
+#if 0
+ printk(KERN_DEBUG "ipmr_add_vif: flags %02x\n", vif.vifc_flags);
+#endif
+ return -EINVAL;
+ }
+
+ if ((in_dev = dev->ip_ptr) == NULL)
+ return -EADDRNOTAVAIL;
+ if (in_dev->cnf.mc_forwarding)
+ return -EADDRINUSE;
+ in_dev->cnf.mc_forwarding = 1;
+ dev_set_allmulti(dev, +1);
+ ip_rt_multicast_event(in_dev);
+
+ /*
+ * Fill in the VIF structures
+ */
+ start_bh_atomic();
+ v->rate_limit=vif.vifc_rate_limit;
+ v->local=vif.vifc_lcl_addr.s_addr;
+ v->remote=vif.vifc_rmt_addr.s_addr;
+ v->flags=vif.vifc_flags;
+ v->threshold=vif.vifc_threshold;
+ v->dev=dev;
+ v->bytes_in = 0;
+ v->bytes_out = 0;
+ v->pkt_in = 0;
+ v->pkt_out = 0;
+ v->link = dev->ifindex;
+ if (vif.vifc_flags&(VIFF_TUNNEL|VIFF_REGISTER))
+ v->link = dev->iflink;
+ vifc_map|=(1<<vif.vifc_vifi);
+ if (vif.vifc_vifi+1 > maxvif)
+ maxvif = vif.vifc_vifi+1;
+ end_bh_atomic();
+ return 0;
+ } else {
+ int ret;
+ rtnl_lock();
+ ret = vif_delete(vif.vifc_vifi);
+ rtnl_unlock();
+ return ret;
+ }
+
+ /*
+ * Manipulate the forwarding caches. These live
+ * in a sort of kernel/user symbiosis.
+ */
+ case MRT_ADD_MFC:
+ case MRT_DEL_MFC:
+ if(optlen!=sizeof(mfc))
+ return -EINVAL;
+ if (copy_from_user(&mfc,optval, sizeof(mfc)))
+ return -EFAULT;
+ return ipmr_mfc_modify(optname, &mfc);
+ /*
+ * Control PIM assert.
+ */
+ case MRT_ASSERT:
+ {
+ int v;
+ if(get_user(v,(int *)optval))
+ return -EFAULT;
+ mroute_do_assert=(v)?1:0;
+ return 0;
+ }
+#ifdef CONFIG_IP_PIMSM
+ case MRT_PIM:
+ {
+ int v;
+ if(get_user(v,(int *)optval))
+ return -EFAULT;
+ v = (v)?1:0;
+ if (v != mroute_do_pim) {
+ mroute_do_pim = v;
+ mroute_do_assert = v;
+#ifdef CONFIG_IP_PIMSM_V2
+ if (mroute_do_pim)
+ inet_add_protocol(&pim_protocol);
+ else
+ inet_del_protocol(&pim_protocol);
+#endif
+ }
+ return 0;
+ }
+#endif
+ /*
+ * Spurious command, or MRT_VERSION which you cannot
+ * set.
+ */
+ default:
+ return -ENOPROTOOPT;
+ }
+}
+
+/*
+ * Getsock opt support for the multicast routing system.
+ */
+
+int ip_mroute_getsockopt(struct sock *sk,int optname,char *optval,int *optlen)
+{
+ int olr;
+ int val;
+
+ if(sk!=mroute_socket)
+ return -EACCES;
+ if(optname!=MRT_VERSION &&
+#ifdef CONFIG_IP_PIMSM
+ optname!=MRT_PIM &&
+#endif
+ optname!=MRT_ASSERT)
+ return -ENOPROTOOPT;
+
+ if(get_user(olr, optlen))
+ return -EFAULT;
+
+ olr=min(olr,sizeof(int));
+ if(put_user(olr,optlen))
+ return -EFAULT;
+ if(optname==MRT_VERSION)
+ val=0x0305;
+#ifdef CONFIG_IP_PIMSM
+ else if(optname==MRT_PIM)
+ val=mroute_do_pim;
+#endif
+ else
+ val=mroute_do_assert;
+ if(copy_to_user(optval,&val,olr))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * The IP multicast ioctl support routines.
+ */
+
+int ipmr_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ struct sioc_sg_req sr;
+ struct sioc_vif_req vr;
+ struct vif_device *vif;
+ struct mfc_cache *c;
+
+ switch(cmd)
+ {
+ case SIOCGETVIFCNT:
+ if (copy_from_user(&vr,(void *)arg,sizeof(vr)))
+ return -EFAULT;
+ if(vr.vifi>=maxvif)
+ return -EINVAL;
+ vif=&vif_table[vr.vifi];
+ if(vifc_map&(1<<vr.vifi))
+ {
+ vr.icount=vif->pkt_in;
+ vr.ocount=vif->pkt_out;
+ vr.ibytes=vif->bytes_in;
+ vr.obytes=vif->bytes_out;
+ if (copy_to_user((void *)arg,&vr,sizeof(vr)))
+ return -EFAULT;
+ return 0;
+ }
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT:
+ if (copy_from_user(&sr,(void *)arg,sizeof(sr)))
+ return -EFAULT;
+ for (c = mfc_cache_array[MFC_HASH(sr.grp.s_addr, sr.src.s_addr)];
+ c; c = c->next) {
+ if (sr.grp.s_addr == c->mfc_mcastgrp &&
+ sr.src.s_addr == c->mfc_origin) {
+ sr.pktcnt = c->mfc_pkt;
+ sr.bytecnt = c->mfc_bytes;
+ sr.wrong_if = c->mfc_wrong_if;
+ if (copy_to_user((void *)arg,&sr,sizeof(sr)))
+ return -EFAULT;
+ return 0;
+ }
+ }
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+/*
+ * Close the multicast socket, and clear the vif tables etc
+ */
+
+void mroute_close(struct sock *sk)
+{
+ int i;
+
+ /*
+ * Shut down all active vif entries
+ */
+ rtnl_lock();
+ for(i=0; i<maxvif; i++)
+ vif_delete(i);
+ rtnl_unlock();
+
+ /*
+ * Wipe the cache
+ */
+ for(i=0;i<MFC_LINES;i++)
+ {
+ start_bh_atomic();
+ while(mfc_cache_array[i]!=NULL)
+ ipmr_cache_delete(mfc_cache_array[i]);
+ end_bh_atomic();
+ }
+}
+
+static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct vif_device *v;
+ int ct;
+ if (event != NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+ v=&vif_table[0];
+ for(ct=0;ct<maxvif;ct++) {
+ if (vifc_map&(1<<ct) && v->dev==ptr)
+ vif_delete(ct);
+ v++;
+ }
+ return NOTIFY_DONE;
+}
+
+
+static struct notifier_block ip_mr_notifier={
+ ipmr_device_event,
+ NULL,
+ 0
+};
+
+/*
+ * Encapsulate a packet by attaching a valid IPIP header to it.
+ * This avoids tunnel drivers and other mess and gives us the speed so
+ * important for multicast video.
+ */
+
+static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
+{
+ struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
+
+ iph->version = 4;
+ iph->tos = skb->nh.iph->tos;
+ iph->ttl = skb->nh.iph->ttl;
+ iph->frag_off = 0;
+ iph->daddr = daddr;
+ iph->saddr = saddr;
+ iph->protocol = IPPROTO_IPIP;
+ iph->ihl = 5;
+ iph->tot_len = htons(skb->len);
+ iph->id = htons(ip_id_count++);
+ ip_send_check(iph);
+
+ skb->h.ipiph = skb->nh.iph;
+ skb->nh.iph = iph;
+}
+
+/*
+ * Processing handlers for ipmr_forward
+ */
+
+static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c,
+ int vifi, int last)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct vif_device *vif = &vif_table[vifi];
+ struct device *dev;
+ struct rtable *rt;
+ int encap = 0;
+ struct sk_buff *skb2;
+
+#ifdef CONFIG_IP_PIMSM
+ if (vif->flags & VIFF_REGISTER) {
+ vif->pkt_out++;
+ vif->bytes_out+=skb->len;
+ ((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len;
+ ((struct net_device_stats*)vif->dev->priv)->tx_packets++;
+ ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
+ return;
+ }
+#endif
+
+ if (vif->flags&VIFF_TUNNEL) {
+ if (ip_route_output(&rt, vif->remote, vif->local, RT_TOS(iph->tos), vif->link))
+ return;
+ encap = sizeof(struct iphdr);
+ } else {
+ if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos), vif->link))
+ return;
+ }
+
+ dev = rt->u.dst.dev;
+
+ if (skb->len+encap > rt->u.dst.pmtu && (ntohs(iph->frag_off) & IP_DF)) {
+		/* Do not fragment multicasts. Alas, IPv4 does not
+		   allow us to send ICMP here, so such packets will
+		   disappear into a black hole.
+ */
+
+ ip_statistics.IpFragFails++;
+ ip_rt_put(rt);
+ return;
+ }
+
+ encap += dev->hard_header_len;
+
+ if (skb_headroom(skb) < encap || skb_cloned(skb) || !last)
+ skb2 = skb_realloc_headroom(skb, (encap + 15)&~15);
+ else if (atomic_read(&skb->users) != 1)
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ else {
+ atomic_inc(&skb->users);
+ skb2 = skb;
+ }
+
+ if (skb2 == NULL) {
+ ip_rt_put(rt);
+ return;
+ }
+
+ vif->pkt_out++;
+ vif->bytes_out+=skb->len;
+
+ dst_release(skb2->dst);
+ skb2->dst = &rt->u.dst;
+ iph = skb2->nh.iph;
+ ip_decrease_ttl(iph);
+
+#ifdef CONFIG_FIREWALL
+ if (call_fw_firewall(PF_INET, vif->dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) {
+ kfree_skb(skb2);
+ return;
+ }
+ if (call_out_firewall(PF_INET, vif->dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) {
+ kfree_skb(skb2);
+ return;
+ }
+#endif
+ if (vif->flags & VIFF_TUNNEL) {
+ ip_encap(skb2, vif->local, vif->remote);
+#ifdef CONFIG_FIREWALL
+		/* Double output firewalling on tunnels: one pass is on the
+		   tunnel device, the other on the real device.
+ */
+ if (call_out_firewall(PF_INET, dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) {
+ kfree_skb(skb2);
+ return;
+ }
+#endif
+ ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++;
+ ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb2->len;
+ }
+
+ IPCB(skb2)->flags |= IPSKB_FORWARDED;
+
+
+ /*
+	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
+	 * not only before forwarding, but also after forwarding on all output
+	 * interfaces. Clearly, if the mrouter runs a multicast
+	 * program, it should receive packets regardless of which interface
+	 * the program joined on.
+	 * If we did not do this, the program would have to join on all
+	 * interfaces. On the other hand, a multihomed host (or router, but
+	 * not an mrouter) cannot join on more than one interface, as that
+	 * would result in receiving duplicate packets.
+ */
+ if (skb2->len <= rt->u.dst.pmtu)
+ skb2->dst->output(skb2);
+ else
+ ip_fragment(skb2, skb2->dst->output);
+}
+
+int ipmr_find_vif(struct device *dev)
+{
+ int ct;
+ for (ct=0; ct<maxvif; ct++) {
+ if (vifc_map&(1<<ct) && vif_table[ct].dev == dev)
+ return ct;
+ }
+ return ALL_VIFS;
+}
+
+/* "local" means that we should preserve one skb (for local delivery) */
+
+int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
+{
+ int psend = -1;
+ int vif, ct;
+
+ vif = cache->mfc_parent;
+ cache->mfc_pkt++;
+ cache->mfc_bytes += skb->len;
+
+ /*
+ * Wrong interface: drop packet and (maybe) send PIM assert.
+ */
+ if (vif_table[vif].dev != skb->dev) {
+ int true_vifi;
+
+ if (((struct rtable*)skb->dst)->key.iif == 0) {
+ /* It is our own packet, looped back.
+ Very complicated situation...
+
+			   The best workaround, until routing daemons are
+			   fixed, is not to redistribute a packet if it was
+			   sent through the wrong interface. This means that
+			   multicast applications WILL NOT work for
+			   (S,G) entries whose default multicast route points
+			   to the wrong oif. In any case, it is not a good
+			   idea to run multicast applications on a router.
+ */
+ goto dont_forward;
+ }
+
+ cache->mfc_wrong_if++;
+ true_vifi = ipmr_find_vif(skb->dev);
+
+ if (true_vifi < MAXVIFS && mroute_do_assert &&
+		    /* pimsm uses asserts when switching from RPT to SPT,
+		       so we cannot check that the packet arrived on an oif.
+		       It is bad, but otherwise we would need to move a pretty
+		       large chunk of pimd into the kernel. Ough... --ANK
+ */
+ (mroute_do_pim || cache->mfc_ttls[true_vifi] < 255) &&
+ jiffies - cache->mfc_last_assert > MFC_ASSERT_THRESH) {
+ cache->mfc_last_assert = jiffies;
+ ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
+ }
+ goto dont_forward;
+ }
+
+ vif_table[vif].pkt_in++;
+ vif_table[vif].bytes_in+=skb->len;
+
+ /*
+ * Forward the frame
+ */
+ for (ct = cache->mfc_maxvif-1; ct >= cache->mfc_minvif; ct--) {
+ if (skb->nh.iph->ttl > cache->mfc_ttls[ct]) {
+ if (psend != -1)
+ ipmr_queue_xmit(skb, cache, psend, 0);
+ psend=ct;
+ }
+ }
+ if (psend != -1)
+ ipmr_queue_xmit(skb, cache, psend, !local);
+
+dont_forward:
+ if (!local)
+ kfree_skb(skb);
+ return 0;
+}
+
+
+/*
+ * Multicast packets for forwarding arrive here
+ */
+
+int ip_mr_input(struct sk_buff *skb)
+{
+ struct mfc_cache *cache;
+ int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
+
+	/* A packet looped back after forwarding should not be
+	   forwarded a second time, but it can still be delivered locally.
+ */
+ if (IPCB(skb)->flags&IPSKB_FORWARDED)
+ goto dont_forward;
+
+ if (!local) {
+ if (IPCB(skb)->opt.router_alert) {
+ if (ip_call_ra_chain(skb))
+ return 0;
+ } else if (skb->nh.iph->protocol == IPPROTO_IGMP && mroute_socket) {
+			/* IGMPv1 (and broken IGMPv2 implementations such as
+			   Cisco IOS <= 11.2(8)) do not put the router alert
+			   option in IGMP packets destined for routable
+			   groups. This is very bad, because it means
+			   that we can forward NO IGMP messages.
+ */
+ raw_rcv(mroute_socket, skb);
+ return 0;
+ }
+ }
+
+ cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
+
+ /*
+ * No usable cache entry
+ */
+
+ if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) {
+ int vif;
+
+ if (local) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ ip_local_deliver(skb);
+ if (skb2 == NULL)
+ return -ENOBUFS;
+ skb = skb2;
+ }
+
+ vif = ipmr_find_vif(skb->dev);
+ if (vif != ALL_VIFS) {
+ ipmr_cache_unresolved(cache, vif, skb);
+ return -EAGAIN;
+ }
+ kfree_skb(skb);
+ return 0;
+ }
+
+ ip_mr_forward(skb, cache, local);
+
+ if (local)
+ return ip_local_deliver(skb);
+ return 0;
+
+dont_forward:
+ if (local)
+ return ip_local_deliver(skb);
+ kfree_skb(skb);
+ return 0;
+}
+
+#ifdef CONFIG_IP_PIMSM_V1
+/*
+ * Handle IGMP messages of PIMv1
+ */
+
+int pim_rcv_v1(struct sk_buff * skb, unsigned short len)
+{
+ struct igmphdr *pim = (struct igmphdr*)skb->h.raw;
+ struct iphdr *encap;
+
+ if (!mroute_do_pim ||
+ len < sizeof(*pim) + sizeof(*encap) ||
+ pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER ||
+ reg_dev == NULL) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
+ /*
+ Check that:
+	   a. packet is really destined to a multicast group
+ b. packet is not a NULL-REGISTER
+ c. packet is not truncated
+ */
+ if (!MULTICAST(encap->daddr) ||
+ ntohs(encap->tot_len) == 0 ||
+ ntohs(encap->tot_len) + sizeof(*pim) > len) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ skb->mac.raw = skb->nh.raw;
+ skb_pull(skb, (u8*)encap - skb->data);
+ skb->nh.iph = (struct iphdr *)skb->data;
+ skb->dev = reg_dev;
+ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+ skb->protocol = __constant_htons(ETH_P_IP);
+ skb->ip_summed = 0;
+ skb->pkt_type = PACKET_HOST;
+ dst_release(skb->dst);
+ skb->dst = NULL;
+ ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
+ ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
+ netif_rx(skb);
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V2
+int pim_rcv(struct sk_buff * skb, unsigned short len)
+{
+ struct pimreghdr *pim = (struct pimreghdr*)skb->h.raw;
+ struct iphdr *encap;
+
+ if (len < sizeof(*pim) + sizeof(*encap) ||
+ pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
+ (pim->flags&PIM_NULL_REGISTER) ||
+ reg_dev == NULL ||
+ (ip_compute_csum((void *)pim, sizeof(*pim)) &&
+ ip_compute_csum((void *)pim, len))) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ /* check if the inner packet is destined to mcast group */
+ encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
+ if (!MULTICAST(encap->daddr) ||
+ ntohs(encap->tot_len) == 0 ||
+ ntohs(encap->tot_len) + sizeof(*pim) > len) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ skb->mac.raw = skb->nh.raw;
+ skb_pull(skb, (u8*)encap - skb->data);
+ skb->nh.iph = (struct iphdr *)skb->data;
+ skb->dev = reg_dev;
+ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+ skb->protocol = __constant_htons(ETH_P_IP);
+ skb->ip_summed = 0;
+ skb->pkt_type = PACKET_HOST;
+ dst_release(skb->dst);
+ ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
+ ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
+ skb->dst = NULL;
+ netif_rx(skb);
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_RTNETLINK
+
+static int
+ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
+{
+ int ct;
+ struct rtnexthop *nhp;
+ struct device *dev = vif_table[c->mfc_parent].dev;
+ u8 *b = skb->tail;
+ struct rtattr *mp_head;
+
+ if (dev)
+ RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
+
+ mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
+
+ for (ct = c->mfc_minvif; ct < c->mfc_maxvif; ct++) {
+ if (c->mfc_ttls[ct] < 255) {
+ if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
+ goto rtattr_failure;
+ nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
+ nhp->rtnh_flags = 0;
+ nhp->rtnh_hops = c->mfc_ttls[ct];
+ nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
+ nhp->rtnh_len = sizeof(*nhp);
+ }
+ }
+ mp_head->rta_type = RTA_MULTIPATH;
+ mp_head->rta_len = skb->tail - (u8*)mp_head;
+ rtm->rtm_type = RTN_MULTICAST;
+ return 1;
+
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -EMSGSIZE;
+}
+
+int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
+{
+ struct mfc_cache *cache;
+ struct rtable *rt = (struct rtable*)skb->dst;
+
+ start_bh_atomic();
+ cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
+ if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) {
+ struct device *dev;
+ int vif;
+ int err;
+
+ if (nowait) {
+ end_bh_atomic();
+ return -EAGAIN;
+ }
+
+ dev = skb->dev;
+ if (dev == NULL || (vif = ipmr_find_vif(dev)) == ALL_VIFS) {
+ end_bh_atomic();
+ return -ENODEV;
+ }
+ skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
+ skb->nh.iph->ihl = sizeof(struct iphdr)>>2;
+ skb->nh.iph->saddr = rt->rt_src;
+ skb->nh.iph->daddr = rt->rt_dst;
+ skb->nh.iph->version = 0;
+ err = ipmr_cache_unresolved(cache, vif, skb);
+ end_bh_atomic();
+ return err;
+ }
+	/* A resolved cache entry is not changed by the net bottom half,
+	   so we are allowed to access it with bottom halves re-enabled.
+ */
+ end_bh_atomic();
+
+ if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
+ cache->mfc_flags |= MFC_NOTIFY;
+ return ipmr_fill_mroute(skb, cache, rtm);
+}
+#endif
+
+/*
+ *	The /proc interfaces to multicast routing: /proc/ip_mr_cache and /proc/ip_mr_vif
+ */
+
+int ipmr_vif_info(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+ struct vif_device *vif;
+ int len=0;
+ off_t pos=0;
+ off_t begin=0;
+ int size;
+ int ct;
+
+ len += sprintf(buffer,
+ "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
+ pos=len;
+
+ for (ct=0;ct<maxvif;ct++)
+ {
+ char *name = "none";
+ vif=&vif_table[ct];
+ if(!(vifc_map&(1<<ct)))
+ continue;
+ if (vif->dev)
+ name = vif->dev->name;
+ size = sprintf(buffer+len, "%2d %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
+ ct, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out,
+ vif->flags, vif->local, vif->remote);
+ len+=size;
+ pos+=size;
+ if(pos<offset)
+ {
+ len=0;
+ begin=pos;
+ }
+ if(pos>offset+length)
+ break;
+ }
+
+ *start=buffer+(offset-begin);
+ len-=(offset-begin);
+ if(len>length)
+ len=length;
+ return len;
+}
+
+int ipmr_mfc_info(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+ struct mfc_cache *mfc;
+ int len=0;
+ off_t pos=0;
+ off_t begin=0;
+ int size;
+ int ct;
+
+ len += sprintf(buffer,
+ "Group Origin Iif Pkts Bytes Wrong Oifs\n");
+ pos=len;
+
+ for (ct=0;ct<MFC_LINES;ct++)
+ {
+ start_bh_atomic();
+ mfc=mfc_cache_array[ct];
+ while(mfc!=NULL)
+ {
+ int n;
+
+ /*
+ * Interface forwarding map
+ */
+ size = sprintf(buffer+len, "%08lX %08lX %-3d %8ld %8ld %8ld",
+ (unsigned long)mfc->mfc_mcastgrp,
+ (unsigned long)mfc->mfc_origin,
+ mfc->mfc_parent == ALL_VIFS ? -1 : mfc->mfc_parent,
+ (mfc->mfc_flags & MFC_QUEUED) ? mfc->mfc_unresolved.qlen : mfc->mfc_pkt,
+ mfc->mfc_bytes,
+ mfc->mfc_wrong_if);
+ for(n=mfc->mfc_minvif;n<mfc->mfc_maxvif;n++)
+ {
+ if(vifc_map&(1<<n) && mfc->mfc_ttls[n] < 255)
+ size += sprintf(buffer+len+size, " %2d:%-3d", n, mfc->mfc_ttls[n]);
+ }
+ size += sprintf(buffer+len+size, "\n");
+ len+=size;
+ pos+=size;
+ if(pos<offset)
+ {
+ len=0;
+ begin=pos;
+ }
+ if(pos>offset+length)
+ {
+ end_bh_atomic();
+ goto done;
+ }
+ mfc=mfc->next;
+ }
+ end_bh_atomic();
+ }
+done:
+ *start=buffer+(offset-begin);
+ len-=(offset-begin);
+ if(len>length)
+ len=length;
+ if (len < 0) {
+ len = 0;
+ }
+ return len;
+}
+
+#ifdef CONFIG_PROC_FS
+static struct proc_dir_entry proc_net_ipmr_vif = {
+ PROC_NET_IPMR_VIF, 9 ,"ip_mr_vif",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ ipmr_vif_info
+};
+static struct proc_dir_entry proc_net_ipmr_mfc = {
+ PROC_NET_IPMR_MFC, 11 ,"ip_mr_cache",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ ipmr_mfc_info
+};
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V2
+struct inet_protocol pim_protocol =
+{
+ pim_rcv, /* PIM handler */
+ NULL, /* PIM error control */
+ NULL, /* next */
+ IPPROTO_PIM, /* protocol ID */
+ 0, /* copy */
+ NULL, /* data */
+ "PIM" /* name */
+};
+#endif
+
+
+/*
+ * Setup for IP multicast routing
+ */
+
+__initfunc(void ip_mr_init(void))
+{
+ printk(KERN_INFO "Linux IP multicast router 0.06 plus PIM-SM\n");
+ register_netdevice_notifier(&ip_mr_notifier);
+#ifdef CONFIG_PROC_FS
+ proc_net_register(&proc_net_ipmr_vif);
+ proc_net_register(&proc_net_ipmr_mfc);
+#endif
+}
diff --git a/pfinet/linux-src/net/ipv4/proc.c b/pfinet/linux-src/net/ipv4/proc.c
new file mode 100644
index 00000000..1640a056
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/proc.c
@@ -0,0 +1,387 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * This file implements the various access functions for the
+ * PROC file system. It is mainly used for debugging and
+ * statistics.
+ *
+ * Version: $Id: proc.c,v 1.34 1999/02/08 11:20:34 davem Exp $
+ *
+ * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
+ * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de>
+ * Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de>
+ *
+ * Fixes:
+ * Alan Cox : UDP sockets show the rxqueue/txqueue
+ * using hint flag for the netinfo.
+ * Pauline Middelink : identd support
+ * Alan Cox : Make /proc safer.
+ * Erik Schoenfelder : /proc/net/snmp
+ * Alan Cox : Handle dead sockets properly.
+ * Gerhard Koerting : Show both timers
+ * Alan Cox : Allow inode to be NULL (kernel socket)
+ * Andi Kleen : Add support for open_requests and
+ *		split functions for more readability.
+ * Andi Kleen : Add support for /proc/net/netstat
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/system.h>
+#include <linux/sched.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/un.h>
+#include <linux/in.h>
+#include <linux/param.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/raw.h>
+
+/* Format a single open_request into tmpbuf. */
+static inline void get__openreq(struct sock *sk, struct open_request *req,
+ char *tmpbuf,
+ int i)
+{
+ sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u",
+ i,
+ (long unsigned int)req->af.v4_req.loc_addr,
+ ntohs(sk->sport),
+ (long unsigned int)req->af.v4_req.rmt_addr,
+ ntohs(req->rmt_port),
+ TCP_SYN_RECV,
+ 0,0, /* could print option size, but that is af dependent. */
+ 1, /* timers active (only the expire timer) */
+ (unsigned long)(req->expires - jiffies),
+ req->retrans,
+ sk->socket ? sk->socket->inode->i_uid : 0,
+ 0, /* non standard timer */
+ 0 /* open_requests have no inode */
+ );
+}
+
+/* Format a single socket into tmpbuf. */
+static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format)
+{
+ unsigned long dest, src;
+ unsigned short destp, srcp;
+ int timer_active, timer_active1, timer_active2;
+ int tw_bucket = 0;
+ unsigned long timer_expires;
+ struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
+
+ dest = sp->daddr;
+ src = sp->rcv_saddr;
+ destp = sp->dport;
+ srcp = sp->sport;
+
+ /* FIXME: The fact that retransmit_timer occurs as a field
+ * in two different parts of the socket structure is,
+ * to say the least, confusing. This code now uses the
+ * right retransmit_timer variable, but I'm not sure
+ * the rest of the timer stuff is still correct.
+ * In particular I'm not sure what the timeout value
+	 * is supposed to reflect (as opposed to tm->when). -- erics
+ */
+
+ destp = ntohs(destp);
+ srcp = ntohs(srcp);
+ if((format == 0) && (sp->state == TCP_TIME_WAIT)) {
+ extern int tcp_tw_death_row_slot;
+ struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sp;
+ int slot_dist;
+
+ tw_bucket = 1;
+ timer_active1 = timer_active2 = 0;
+ timer_active = 3;
+ slot_dist = tw->death_slot;
+ if(slot_dist > tcp_tw_death_row_slot)
+ slot_dist = (TCP_TWKILL_SLOTS - slot_dist) + tcp_tw_death_row_slot;
+ else
+ slot_dist = tcp_tw_death_row_slot - slot_dist;
+ timer_expires = jiffies + (slot_dist * TCP_TWKILL_PERIOD);
+ } else {
+ timer_active1 = del_timer(&tp->retransmit_timer);
+ timer_active2 = del_timer(&sp->timer);
+ if (!timer_active1) tp->retransmit_timer.expires=0;
+ if (!timer_active2) sp->timer.expires=0;
+ timer_active = 0;
+ timer_expires = (unsigned) -1;
+ }
+ if (timer_active1 && tp->retransmit_timer.expires < timer_expires) {
+ timer_active = 1;
+ timer_expires = tp->retransmit_timer.expires;
+ }
+ if (timer_active2 && sp->timer.expires < timer_expires) {
+ timer_active = 2;
+ timer_expires = sp->timer.expires;
+ }
+ if(timer_active == 0)
+ timer_expires = jiffies;
+ sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld",
+ i, src, srcp, dest, destp, sp->state,
+ (tw_bucket ?
+ 0 :
+ (format == 0) ?
+ tp->write_seq-tp->snd_una : atomic_read(&sp->wmem_alloc)),
+ (tw_bucket ?
+ 0 :
+ (format == 0) ?
+ tp->rcv_nxt-tp->copied_seq: atomic_read(&sp->rmem_alloc)),
+ timer_active, timer_expires-jiffies,
+ (tw_bucket ? 0 : tp->retransmits),
+ (!tw_bucket && sp->socket) ? sp->socket->inode->i_uid : 0,
+ (!tw_bucket && timer_active) ? sp->timeout : 0,
+ (!tw_bucket && sp->socket) ? sp->socket->inode->i_ino : 0);
+
+ if (timer_active1) add_timer(&tp->retransmit_timer);
+ if (timer_active2) add_timer(&sp->timer);
+}
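+
+/*
+ * For orientation (an added note, not from the original source): with the
+ * header emitted by get__netinfo below, the format string above renders a
+ * listening socket on 127.0.0.1:22 (little-endian host) roughly as
+ *
+ *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 1234
+ *
+ * i.e. slot, local and remote address:port in raw hex, state, tx:rx queue,
+ * timer:expiry, retransmits, uid, timeout and inode.
+ */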
+
+/*
+ * get__netinfo formats the socket list into the buffer and returns the
+ * length of the resulting string.
+ *
+ * KNOWN BUGS
+ *  As in get_unix_netinfo, the buffer might be too small. If this
+ *  happens, get__netinfo returns only part of the available information.
+ *
+ *  Assumes that the buffer length is a multiple of 128 - if not it will
+ *  write past the end.
+ */
+static int
+get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t offset, int length)
+{
+ struct sock *sp, *next;
+ int len=0, i = 0;
+ off_t pos=0;
+ off_t begin;
+ char tmpbuf[129];
+
+ if (offset < 128)
+ len += sprintf(buffer, "%-127s\n",
+ " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout inode");
+ pos = 128;
+ SOCKHASH_LOCK();
+ sp = pro->sklist_next;
+ while(sp != (struct sock *)pro) {
+ if (format == 0 && sp->state == TCP_LISTEN) {
+ struct open_request *req;
+
+ for (req = sp->tp_pinfo.af_tcp.syn_wait_queue; req;
+ i++, req = req->dl_next) {
+ if (req->sk)
+ continue;
+ pos += 128;
+ if (pos < offset)
+ continue;
+ get__openreq(sp, req, tmpbuf, i);
+ len += sprintf(buffer+len, "%-127s\n", tmpbuf);
+ if(len >= length)
+ goto out;
+ }
+ }
+
+ pos += 128;
+ if (pos < offset)
+ goto next;
+
+ get__sock(sp, tmpbuf, i, format);
+
+ len += sprintf(buffer+len, "%-127s\n", tmpbuf);
+ if(len >= length)
+ break;
+ next:
+ next = sp->sklist_next;
+ sp = next;
+ i++;
+ }
+out:
+ SOCKHASH_UNLOCK();
+
+ begin = len - (pos - offset);
+ *start = buffer + begin;
+ len -= begin;
+ if(len>length)
+ len = length;
+ if (len<0)
+ len = 0;
+ return len;
+}
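+
+/*
+ * Illustrative sketch (an assumption, not part of the original file): how a
+ * procfs-style caller pages through the interface above.  Each call fills at
+ * most "length" bytes of the window starting at "offset" and points *start
+ * at the data inside "buffer"; consume() is a hypothetical sink for the bytes.
+ */
+#if 0
+static void dump_tcp_entries(void)
+{
+	char buffer[4096];
+	char *start;
+	off_t offset = 0;
+	int n;
+
+	while ((n = tcp_get_info(buffer, &start, offset, sizeof(buffer), 0)) > 0) {
+		consume(start, n);	/* use the n bytes returned in this window */
+		offset += n;		/* advance by what was actually returned */
+	}
+}
+#endif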
+
+int tcp_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+ return get__netinfo(&tcp_prot, buffer,0, start, offset, length);
+}
+
+int udp_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+ return get__netinfo(&udp_prot, buffer,1, start, offset, length);
+}
+
+int raw_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+ return get__netinfo(&raw_prot, buffer,1, start, offset, length);
+}
+
+/*
+ * Report socket allocation statistics [mea@utu.fi]
+ */
+int afinet_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+ /* From net/socket.c */
+ extern int socket_get_info(char *, char **, off_t, int);
+
+ int len = socket_get_info(buffer,start,offset,length);
+
+ len += sprintf(buffer+len,"TCP: inuse %d highest %d\n",
+ tcp_prot.inuse, tcp_prot.highestinuse);
+ len += sprintf(buffer+len,"UDP: inuse %d highest %d\n",
+ udp_prot.inuse, udp_prot.highestinuse);
+ len += sprintf(buffer+len,"RAW: inuse %d highest %d\n",
+ raw_prot.inuse, raw_prot.highestinuse);
+ if (offset >= len)
+ {
+ *start = buffer;
+ return 0;
+ }
+ *start = buffer + offset;
+ len -= offset;
+ if (len > length)
+ len = length;
+ if (len < 0)
+ len = 0;
+ return len;
+}
+
+
+/*
+ * Called from the PROCfs module. This outputs /proc/net/snmp.
+ */
+
+int snmp_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+ extern struct tcp_mib tcp_statistics;
+ extern struct udp_mib udp_statistics;
+ int len;
+/*
+ extern unsigned long tcp_rx_miss, tcp_rx_hit1,tcp_rx_hit2;
+*/
+
+ len = sprintf (buffer,
+ "Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates\n"
+ "Ip: %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+ ip_statistics.IpForwarding, ip_statistics.IpDefaultTTL,
+ ip_statistics.IpInReceives, ip_statistics.IpInHdrErrors,
+ ip_statistics.IpInAddrErrors, ip_statistics.IpForwDatagrams,
+ ip_statistics.IpInUnknownProtos, ip_statistics.IpInDiscards,
+ ip_statistics.IpInDelivers, ip_statistics.IpOutRequests,
+ ip_statistics.IpOutDiscards, ip_statistics.IpOutNoRoutes,
+ ip_statistics.IpReasmTimeout, ip_statistics.IpReasmReqds,
+ ip_statistics.IpReasmOKs, ip_statistics.IpReasmFails,
+ ip_statistics.IpFragOKs, ip_statistics.IpFragFails,
+ ip_statistics.IpFragCreates);
+
+ len += sprintf (buffer + len,
+ "Icmp: InMsgs InErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps\n"
+ "Icmp: %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+ icmp_statistics.IcmpInMsgs, icmp_statistics.IcmpInErrors,
+ icmp_statistics.IcmpInDestUnreachs, icmp_statistics.IcmpInTimeExcds,
+ icmp_statistics.IcmpInParmProbs, icmp_statistics.IcmpInSrcQuenchs,
+ icmp_statistics.IcmpInRedirects, icmp_statistics.IcmpInEchos,
+ icmp_statistics.IcmpInEchoReps, icmp_statistics.IcmpInTimestamps,
+ icmp_statistics.IcmpInTimestampReps, icmp_statistics.IcmpInAddrMasks,
+ icmp_statistics.IcmpInAddrMaskReps, icmp_statistics.IcmpOutMsgs,
+ icmp_statistics.IcmpOutErrors, icmp_statistics.IcmpOutDestUnreachs,
+ icmp_statistics.IcmpOutTimeExcds, icmp_statistics.IcmpOutParmProbs,
+ icmp_statistics.IcmpOutSrcQuenchs, icmp_statistics.IcmpOutRedirects,
+ icmp_statistics.IcmpOutEchos, icmp_statistics.IcmpOutEchoReps,
+ icmp_statistics.IcmpOutTimestamps, icmp_statistics.IcmpOutTimestampReps,
+ icmp_statistics.IcmpOutAddrMasks, icmp_statistics.IcmpOutAddrMaskReps);
+
+ len += sprintf (buffer + len,
+ "Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts\n"
+ "Tcp: %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+ tcp_statistics.TcpRtoAlgorithm, tcp_statistics.TcpRtoMin,
+ tcp_statistics.TcpRtoMax, tcp_statistics.TcpMaxConn,
+ tcp_statistics.TcpActiveOpens, tcp_statistics.TcpPassiveOpens,
+ tcp_statistics.TcpAttemptFails, tcp_statistics.TcpEstabResets,
+ tcp_statistics.TcpCurrEstab, tcp_statistics.TcpInSegs,
+ tcp_statistics.TcpOutSegs, tcp_statistics.TcpRetransSegs,
+ tcp_statistics.TcpInErrs, tcp_statistics.TcpOutRsts);
+
+ len += sprintf (buffer + len,
+ "Udp: InDatagrams NoPorts InErrors OutDatagrams\nUdp: %lu %lu %lu %lu\n",
+ udp_statistics.UdpInDatagrams, udp_statistics.UdpNoPorts,
+ udp_statistics.UdpInErrors, udp_statistics.UdpOutDatagrams);
+/*
+ len += sprintf( buffer + len,
+ "TCP fast path RX: H2: %ul H1: %ul L: %ul\n",
+ tcp_rx_hit2,tcp_rx_hit1,tcp_rx_miss);
+*/
+
+ if (offset >= len)
+ {
+ *start = buffer;
+ return 0;
+ }
+ *start = buffer + offset;
+ len -= offset;
+ if (len > length)
+ len = length;
+ if (len < 0)
+ len = 0;
+ return len;
+}
+
+/*
+ * Output /proc/net/netstat
+ */
+
+int netstat_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+ extern struct linux_mib net_statistics;
+ int len;
+
+ len = sprintf(buffer,
+ "TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed"
+ " EmbryonicRsts PruneCalled RcvPruned OfoPruned"
+ " OutOfWindowIcmps LockDroppedIcmps\n"
+ "TcpExt: %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+ net_statistics.SyncookiesSent,
+ net_statistics.SyncookiesRecv,
+ net_statistics.SyncookiesFailed,
+ net_statistics.EmbryonicRsts,
+ net_statistics.PruneCalled,
+ net_statistics.RcvPruned,
+ net_statistics.OfoPruned,
+ net_statistics.OutOfWindowIcmps,
+ net_statistics.LockDroppedIcmps);
+
+ if (offset >= len)
+ {
+ *start = buffer;
+ return 0;
+ }
+ *start = buffer + offset;
+ len -= offset;
+ if (len > length)
+ len = length;
+ if (len < 0)
+ len = 0;
+ return len;
+}
diff --git a/pfinet/linux-src/net/ipv4/protocol.c b/pfinet/linux-src/net/ipv4/protocol.c
new file mode 100644
index 00000000..b47480be
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/protocol.c
@@ -0,0 +1,211 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * INET protocol dispatch tables.
+ *
+ * Version: $Id: protocol.c,v 1.9 1997/10/29 20:27:34 kuznet Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ * Alan Cox : Ahah! udp icmp errors don't work because
+ * udp_err is never called!
+ * Alan Cox : Added new fields for init and ready for
+ * proper fragmentation (_NO_ 4K limits!)
+ * Richard Colella : Hang on hash collision
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/config.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/timer.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/ipip.h>
+#include <linux/igmp.h>
+
+#define IPPROTO_PREVIOUS NULL
+
+#ifdef CONFIG_IP_MULTICAST
+
+static struct inet_protocol igmp_protocol =
+{
+ igmp_rcv, /* IGMP handler */
+ NULL, /* IGMP error control */
+ IPPROTO_PREVIOUS, /* next */
+ IPPROTO_IGMP, /* protocol ID */
+ 0, /* copy */
+ NULL, /* data */
+ "IGMP" /* name */
+};
+
+#undef IPPROTO_PREVIOUS
+#define IPPROTO_PREVIOUS &igmp_protocol
+
+#endif
+
+static struct inet_protocol tcp_protocol =
+{
+ tcp_v4_rcv, /* TCP handler */
+ tcp_v4_err, /* TCP error control */
+ IPPROTO_PREVIOUS,
+ IPPROTO_TCP, /* protocol ID */
+ 0, /* copy */
+ NULL, /* data */
+ "TCP" /* name */
+};
+
+#undef IPPROTO_PREVIOUS
+#define IPPROTO_PREVIOUS &tcp_protocol
+
+static struct inet_protocol udp_protocol =
+{
+ udp_rcv, /* UDP handler */
+ udp_err, /* UDP error control */
+ IPPROTO_PREVIOUS, /* next */
+ IPPROTO_UDP, /* protocol ID */
+ 0, /* copy */
+ NULL, /* data */
+ "UDP" /* name */
+};
+
+#undef IPPROTO_PREVIOUS
+#define IPPROTO_PREVIOUS &udp_protocol
+
+
+static struct inet_protocol icmp_protocol =
+{
+ icmp_rcv, /* ICMP handler */
+ NULL, /* ICMP error control */
+ IPPROTO_PREVIOUS, /* next */
+ IPPROTO_ICMP, /* protocol ID */
+ 0, /* copy */
+ NULL, /* data */
+ "ICMP" /* name */
+};
+
+#undef IPPROTO_PREVIOUS
+#define IPPROTO_PREVIOUS &icmp_protocol
+
+
+struct inet_protocol *inet_protocol_base = IPPROTO_PREVIOUS;
+
+struct inet_protocol *inet_protos[MAX_INET_PROTOS] =
+{
+ NULL
+};
+
+
+/*
+ * Find a protocol in the protocol tables given its
+ * IP type.
+ */
+
+struct inet_protocol *inet_get_protocol(unsigned char prot)
+{
+ unsigned char hash;
+ struct inet_protocol *p;
+
+ hash = prot & (MAX_INET_PROTOS - 1);
+ for (p = inet_protos[hash] ; p != NULL; p=p->next)
+ {
+ if (p->protocol == prot)
+ return((struct inet_protocol *) p);
+ }
+ return(NULL);
+}
+
+/*
+ * Add a protocol handler to the hash tables
+ */
+
+void inet_add_protocol(struct inet_protocol *prot)
+{
+ unsigned char hash;
+ struct inet_protocol *p2;
+
+ hash = prot->protocol & (MAX_INET_PROTOS - 1);
+ prot ->next = inet_protos[hash];
+ inet_protos[hash] = prot;
+ prot->copy = 0;
+
+ /*
+ * Set the copy bit if we need to.
+ */
+
+ p2 = (struct inet_protocol *) prot->next;
+ while(p2 != NULL)
+ {
+ if (p2->protocol == prot->protocol)
+ {
+ prot->copy = 1;
+ break;
+ }
+ p2 = (struct inet_protocol *) p2->next;
+ }
+}
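+
+/*
+ * Illustrative sketch (an assumption, not part of the original file): how a
+ * receive path can walk the chains built above.  When several handlers share
+ * one protocol number, every handler except the last has "copy" set and must
+ * be handed its own clone of the skb; deliver() is a hypothetical stand-in
+ * for the af-specific call through p->handler.
+ */
+#if 0
+static void dispatch_inet_protocol(struct sk_buff *skb, unsigned char proto)
+{
+	struct inet_protocol *p;
+
+	for (p = inet_protos[proto & (MAX_INET_PROTOS - 1)]; p; p = p->next) {
+		if (p->protocol != proto)
+			continue;
+		if (p->copy) {
+			/* Another matching handler follows: clone for this one. */
+			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+			if (skb2)
+				deliver(p, skb2);
+		} else {
+			/* Last (or only) matching handler consumes the original. */
+			deliver(p, skb);
+			break;
+		}
+	}
+}
+#endif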
+
+/*
+ * Remove a protocol from the hash tables.
+ */
+
+int inet_del_protocol(struct inet_protocol *prot)
+{
+ struct inet_protocol *p;
+ struct inet_protocol *lp = NULL;
+ unsigned char hash;
+
+ hash = prot->protocol & (MAX_INET_PROTOS - 1);
+ if (prot == inet_protos[hash])
+ {
+ inet_protos[hash] = (struct inet_protocol *) inet_protos[hash]->next;
+ return(0);
+ }
+
+ p = (struct inet_protocol *) inet_protos[hash];
+ while(p != NULL)
+ {
+ /*
+ * We have to worry if the protocol being deleted is
+ * the last one on the list, then we may need to reset
+ * someone's copied bit.
+ */
+ if (p->next != NULL && p->next == prot)
+ {
+ /*
+ * if we are the last one with this protocol and
+ * there is a previous one, reset its copy bit.
+ */
+ if (p->copy == 0 && lp != NULL)
+ lp->copy = 0;
+ p->next = prot->next;
+ return(0);
+ }
+ if (p->next != NULL && p->next->protocol == prot->protocol)
+ lp = p;
+
+ p = (struct inet_protocol *) p->next;
+ }
+ return(-1);
+}
diff --git a/pfinet/linux-src/net/ipv4/rarp.c b/pfinet/linux-src/net/ipv4/rarp.c
new file mode 100644
index 00000000..7f7c7e3f
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/rarp.c
@@ -0,0 +1,606 @@
+/* linux/net/inet/rarp.c
+ *
+ * Copyright (C) 1994 by Ross Martin
+ * Based on linux/net/inet/arp.c, Copyright (C) 1994 by Florian La Roche
+ *
+ * $Id: rarp.c,v 1.25 1998/06/19 13:22:34 davem Exp $
+ *
+ * This module implements the Reverse Address Resolution Protocol
+ * (RARP, RFC 903), which is used to convert low level addresses such
+ * as Ethernet addresses into high level addresses such as IP addresses.
+ * The most common use of RARP is as a means for a diskless workstation
+ * to discover its IP address during a network boot.
+ *
+ **
+ *** WARNING:::::::::::::::::::::::::::::::::WARNING
+ ****
+ ***** SUN machines seem determined to boot solely from the person who
+ **** answered their RARP query. NEVER add a SUN to your RARP table
+ ***   unless you can also supply everything else it needs to boot.
+ **
+ *
+ * Currently, only Ethernet address -> IP address is likely to work.
+ * (Is RARP ever used for anything else?)
+ *
+ * This code is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes
+ * Alan Cox : Rarp delete on device down needed as
+ * reported by Walter Wolfgang.
+ * Mike McLagan : Routing by source
+ *
+ */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/in.h>
+#include <linux/config.h>
+#include <linux/init.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <stdarg.h>
+#include <linux/inet.h>
+#include <linux/etherdevice.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/rarp.h>
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#include <net/ax25.h>
+#endif
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+
+extern int (*rarp_ioctl_hook)(unsigned int,void*);
+
+/*
+ *	This structure defines the RARP mapping cache. While we are making
+ *	changes to this structure, we keep interrupts off.
+ */
+
+struct rarp_table
+{
+ struct rarp_table *next; /* Linked entry list */
+ unsigned long ip; /* ip address of entry */
+ unsigned char ha[MAX_ADDR_LEN]; /* Hardware address */
+ unsigned char hlen; /* Length of hardware address */
+ unsigned char htype; /* Type of hardware in use */
+ struct device *dev; /* Device the entry is tied to */
+};
+
+struct rarp_table *rarp_tables = NULL;
+
+static int rarp_rcv(struct sk_buff *, struct device *, struct packet_type *);
+
+static struct packet_type rarp_packet_type =
+{
+ 0, /* Should be: __constant_htons(ETH_P_RARP) - but this _doesn't_ come out constant! */
+ 0, /* copy */
+ rarp_rcv,
+ NULL,
+ NULL
+};
+
+static int initflag = 1;
+
+
+/*
+ * Release the memory for this entry.
+ */
+
+static inline void rarp_release_entry(struct rarp_table *entry)
+{
+ kfree_s(entry, sizeof(struct rarp_table));
+ MOD_DEC_USE_COUNT;
+ return;
+}
+
+/*
+ * Delete a RARP mapping entry in the cache.
+ */
+
+static void rarp_destroy(unsigned long ip_addr)
+{
+ struct rarp_table *entry;
+ struct rarp_table **pentry;
+
+ start_bh_atomic();
+ pentry = &rarp_tables;
+ while ((entry = *pentry) != NULL)
+ {
+ if (entry->ip == ip_addr)
+ {
+ *pentry = entry->next;
+ end_bh_atomic();
+ rarp_release_entry(entry);
+ return;
+ }
+ pentry = &entry->next;
+ }
+ end_bh_atomic();
+}
+
+/*
+ * Flush a device.
+ */
+
+static void rarp_destroy_dev(struct device *dev)
+{
+ struct rarp_table *entry;
+ struct rarp_table **pentry;
+
+ start_bh_atomic();
+ pentry = &rarp_tables;
+ while ((entry = *pentry) != NULL)
+ {
+ if (entry->dev == dev)
+ {
+ *pentry = entry->next;
+ rarp_release_entry(entry);
+ }
+ else
+ pentry = &entry->next;
+ }
+ end_bh_atomic();
+}
+
+static int rarp_device_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ if(event!=NETDEV_DOWN)
+ return NOTIFY_DONE;
+ rarp_destroy_dev((struct device *)ptr);
+ return NOTIFY_DONE;
+}
+
+/*
+ *	Called once when data is first added to the RARP cache with an ioctl.
+ */
+
+static struct notifier_block rarp_dev_notifier={
+ rarp_device_event,
+ NULL,
+ 0
+};
+
+static int rarp_pkt_inited=0;
+
+static void rarp_init_pkt (void)
+{
+ /* Register the packet type */
+ rarp_packet_type.type=htons(ETH_P_RARP);
+ dev_add_pack(&rarp_packet_type);
+ register_netdevice_notifier(&rarp_dev_notifier);
+ rarp_pkt_inited=1;
+}
+
+#ifdef MODULE
+
+static void rarp_end_pkt(void)
+{
+ if(!rarp_pkt_inited)
+ return;
+ dev_remove_pack(&rarp_packet_type);
+ unregister_netdevice_notifier(&rarp_dev_notifier);
+ rarp_pkt_inited=0;
+}
+
+#endif
+
+/*
+ *	Receive a RARP request from the device layer. Maybe it should be
+ * rewritten to use the incoming packet for the reply. The current
+ * "overhead" time isn't that high...
+ */
+
+static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
+{
+/*
+ * We shouldn't use this type conversion. Check later.
+ */
+ struct arphdr *rarp = (struct arphdr *) skb->data;
+ unsigned char *rarp_ptr = skb_pull(skb,sizeof(struct arphdr));
+ struct rarp_table *entry;
+ struct in_device *in_dev = dev->ip_ptr;
+ long sip,tip;
+ unsigned char *sha,*tha; /* s for "source", t for "target" */
+
+/*
+ * If this test doesn't pass, it's not IP, or we should ignore it anyway
+ */
+
+ if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd)
+ || dev->flags&IFF_NOARP || !in_dev || !in_dev->ifa_list)
+ {
+ kfree_skb(skb);
+ return 0;
+ }
+
+/*
+ * If it's not a RARP request, delete it.
+ */
+ if (rarp->ar_op != htons(ARPOP_RREQUEST))
+ {
+ kfree_skb(skb);
+ return 0;
+ }
+
+/*
+ * For now we will only deal with IP addresses.
+ */
+
+ if (
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ (rarp->ar_pro != htons(AX25_P_IP) && dev->type == ARPHRD_AX25) ||
+#endif
+ (rarp->ar_pro != htons(ETH_P_IP) && dev->type != ARPHRD_AX25)
+ || rarp->ar_pln != 4)
+ {
+ /*
+ * This packet is not for us. Remove it.
+ */
+ kfree_skb(skb);
+ return 0;
+ }
+
+/*
+ * Extract variable width fields
+ */
+
+ sha=rarp_ptr;
+ rarp_ptr+=dev->addr_len;
+ memcpy(&sip,rarp_ptr,4);
+ rarp_ptr+=4;
+ tha=rarp_ptr;
+ rarp_ptr+=dev->addr_len;
+ memcpy(&tip,rarp_ptr,4);
+
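+/*
+ * For reference, the wire layout past the fixed arphdr (RFC 903) is: sender
+ * hardware address (ar_hln bytes), sender IP (4 bytes), target hardware
+ * address (ar_hln bytes), target IP (4 bytes) - exactly the order the
+ * pointer walk above picks apart.
+ */
+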
+/*
+ * Process entry. Use tha for table lookup according to RFC 903.
+ */
+
+ for (entry = rarp_tables; entry != NULL; entry = entry->next)
+ if (!memcmp(entry->ha, tha, rarp->ar_hln))
+ break;
+
+ if (entry != NULL)
+ {
+ sip=entry->ip;
+
+ arp_send(ARPOP_RREPLY, ETH_P_RARP, sip, dev, in_dev->ifa_list->ifa_address, sha,
+ dev->dev_addr, sha);
+ }
+
+ kfree_skb(skb);
+ return 0;
+}
+
+
+/*
+ * Set (create) a RARP cache entry.
+ */
+
+static int rarp_req_set(struct arpreq *req)
+{
+ struct arpreq r;
+ struct rarp_table *entry;
+ struct sockaddr_in *si;
+ int htype, hlen;
+ unsigned long ip;
+ struct rtable *rt;
+ struct device * dev;
+ int err;
+
+ err = copy_from_user(&r, req, sizeof(r));
+ if (err)
+ return -EFAULT;
+
+ /*
+ * We only understand about IP addresses...
+ */
+
+ if (r.arp_pa.sa_family != AF_INET)
+ return -EPFNOSUPPORT;
+
+ switch (r.arp_ha.sa_family)
+ {
+ case ARPHRD_ETHER:
+ htype = ARPHRD_ETHER;
+ hlen = ETH_ALEN;
+ break;
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ case ARPHRD_AX25:
+ htype = ARPHRD_AX25;
+ hlen = 7;
+ break;
+#endif
+ default:
+ return -EPFNOSUPPORT;
+ }
+
+ si = (struct sockaddr_in *) &r.arp_pa;
+ ip = si->sin_addr.s_addr;
+ if (ip == 0)
+ {
+ printk(KERN_DEBUG "RARP: SETRARP: requested PA is 0.0.0.0 !\n");
+ return -EINVAL;
+ }
+
+/*
+ * Is it reachable directly ?
+ */
+
+ err = ip_route_output(&rt, ip, 0, 1, 0);
+ if (err)
+ return err;
+ if (rt->rt_flags&(RTCF_LOCAL|RTCF_BROADCAST|RTCF_MULTICAST|RTCF_DNAT)) {
+ ip_rt_put(rt);
+ return -EINVAL;
+ }
+ dev = rt->u.dst.dev;
+
+/*
+ * Is there an existing entry for this address? Find out...
+ */
+
+ for (entry = rarp_tables; entry != NULL; entry = entry->next)
+ if (entry->ip == ip)
+ break;
+
+/*
+ * If no entry was found, create a new one.
+ */
+
+ if (entry == NULL)
+ {
+ entry = (struct rarp_table *) kmalloc(sizeof(struct rarp_table),
+ GFP_ATOMIC);
+ if (entry == NULL)
+ {
+ return -ENOMEM;
+ }
+ if (initflag)
+ {
+ rarp_init_pkt();
+ initflag=0;
+ }
+
+ /* Block interrupts until table modification is finished */
+
+ cli();
+ entry->next = rarp_tables;
+ rarp_tables = entry;
+ }
+ cli();
+ entry->ip = ip;
+ entry->hlen = hlen;
+ entry->htype = htype;
+ memcpy(&entry->ha, &r.arp_ha.sa_data, hlen);
+ entry->dev = dev;
+ sti();
+
+ /* Don't unlink if we have entries to serve. */
+ MOD_INC_USE_COUNT;
+
+ return 0;
+}
+
+
+/*
+ * Get a RARP cache entry.
+ */
+
+static int rarp_req_get(struct arpreq *req)
+{
+ struct arpreq r;
+ struct rarp_table *entry;
+ struct sockaddr_in *si;
+ unsigned long ip;
+ int err;
+
+/*
+ * We only understand about IP addresses...
+ */
+
+ err = copy_from_user(&r, req, sizeof(r));
+ if (err)
+ return -EFAULT;
+
+ if (r.arp_pa.sa_family != AF_INET)
+ return -EPFNOSUPPORT;
+
+/*
+ * Is there an existing entry for this address?
+ */
+
+ si = (struct sockaddr_in *) &r.arp_pa;
+ ip = si->sin_addr.s_addr;
+
+ for (entry = rarp_tables; entry != NULL; entry = entry->next)
+ if (entry->ip == ip)
+ break;
+
+ if (entry == NULL)
+ {
+ return -ENXIO;
+ }
+
+/*
+ * We found it; copy into structure.
+ */
+
+ memcpy(r.arp_ha.sa_data, &entry->ha, entry->hlen);
+ r.arp_ha.sa_family = entry->htype;
+
+/*
+ * Copy the information back
+ */
+
+ return copy_to_user(req, &r, sizeof(r)) ? -EFAULT : 0;
+}
+
+
+/*
+ * Handle a RARP layer I/O control request.
+ */
+
+int rarp_ioctl(unsigned int cmd, void *arg)
+{
+ struct arpreq r;
+ struct sockaddr_in *si;
+ int err;
+
+ switch(cmd)
+ {
+ case SIOCDRARP:
+ if (!suser())
+ return -EPERM;
+ err = copy_from_user(&r, arg, sizeof(r));
+ if (err)
+ return -EFAULT;
+ if (r.arp_pa.sa_family != AF_INET)
+ return -EPFNOSUPPORT;
+ si = (struct sockaddr_in *) &r.arp_pa;
+ rarp_destroy(si->sin_addr.s_addr);
+ return 0;
+
+ case SIOCGRARP:
+
+ return rarp_req_get((struct arpreq *)arg);
+ case SIOCSRARP:
+ if (!suser())
+ return -EPERM;
+ return rarp_req_set((struct arpreq *)arg);
+ default:
+ return -EINVAL;
+ }
+
+ /*NOTREACHED*/
+ return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+int rarp_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+ int len=0;
+ off_t begin=0;
+ off_t pos=0;
+ int size;
+ struct rarp_table *entry;
+ char ipbuffer[20];
+ unsigned long netip;
+ if (initflag)
+ {
+ size = sprintf(buffer,"RARP disabled until entries added to cache.\n");
+ pos+=size;
+ len+=size;
+ }
+ else
+ {
+ size = sprintf(buffer,
+ "IP address HW type HW address\n");
+ pos+=size;
+ len+=size;
+
+ for(entry=rarp_tables; entry!=NULL; entry=entry->next)
+ {
+ netip=htonl(entry->ip); /* switch to network order */
+ sprintf(ipbuffer,"%d.%d.%d.%d",
+ (unsigned int)(netip>>24)&255,
+ (unsigned int)(netip>>16)&255,
+ (unsigned int)(netip>>8)&255,
+ (unsigned int)(netip)&255);
+
+ size = sprintf(buffer+len,
+ "%-17s%-20s%02x:%02x:%02x:%02x:%02x:%02x\n",
+ ipbuffer,
+ "10Mbps Ethernet",
+ (unsigned int)entry->ha[0],
+ (unsigned int)entry->ha[1],
+ (unsigned int)entry->ha[2],
+ (unsigned int)entry->ha[3],
+ (unsigned int)entry->ha[4],
+ (unsigned int)entry->ha[5]);
+
+ len+=size;
+ pos=begin+len;
+
+ if(pos<offset)
+ {
+ len=0;
+ begin=pos;
+ }
+ if(pos>offset+length)
+ break;
+ }
+ }
+
+ *start = buffer+(offset-begin); /* Start of wanted data */
+ len -= (offset-begin); /* Start slop */
+ if (len>length)
+ len = length; /* Ending slop */
+ return len;
+}
+
+struct proc_dir_entry proc_net_rarp = {
+ PROC_NET_RARP, 4, "rarp",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ rarp_get_info
+};
+#endif
+
+__initfunc(void
+rarp_init(void))
+{
+#ifdef CONFIG_PROC_FS
+ proc_net_register(&proc_net_rarp);
+#endif
+ rarp_ioctl_hook = rarp_ioctl;
+}
+
+#ifdef MODULE
+
+int init_module(void)
+{
+ rarp_init();
+ return 0;
+}
+
+void cleanup_module(void)
+{
+ struct rarp_table *rt, *rt_next;
+#ifdef CONFIG_PROC_FS
+ proc_net_unregister(PROC_NET_RARP);
+#endif
+ rarp_ioctl_hook = NULL;
+ cli();
+ /* Destroy the RARP-table */
+ rt = rarp_tables;
+ rarp_tables = NULL;
+ sti();
+ /* ... and free it. */
+ for ( ; rt != NULL; rt = rt_next) {
+ rt_next = rt->next;
+ rarp_release_entry(rt);
+ }
+ rarp_end_pkt();
+}
+#endif
diff --git a/pfinet/linux-src/net/ipv4/raw.c b/pfinet/linux-src/net/ipv4/raw.c
new file mode 100644
index 00000000..5e7910dd
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/raw.c
@@ -0,0 +1,573 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * RAW - implementation of IP "raw" sockets.
+ *
+ * Version: $Id: raw.c,v 1.39.2.1 1999/06/20 20:14:50 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ * Alan Cox : verify_area() fixed up
+ * Alan Cox : ICMP error handling
+ * Alan Cox : EMSGSIZE if you send too big a packet
+ * Alan Cox : Now uses generic datagrams and shared skbuff
+ * library. No more peek crashes, no more backlogs
+ * Alan Cox : Checks sk->broadcast.
+ * Alan Cox : Uses skb_free_datagram/skb_copy_datagram
+ * Alan Cox : Raw passes ip options too
+ * Alan Cox : Setsocketopt added
+ * Alan Cox : Fixed error return for broadcasts
+ * Alan Cox : Removed wake_up calls
+ * Alan Cox : Use ttl/tos
+ * Alan Cox : Cleaned up old debugging
+ * Alan Cox : Use new kernel side addresses
+ * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets.
+ * Alan Cox : BSD style RAW socket demultiplexing.
+ * Alan Cox : Beginnings of mrouted support.
+ * Alan Cox : Added IP_HDRINCL option.
+ * Alan Cox : Skip broadcast check if BSDism set.
+ * David S. Miller : New socket lookup architecture.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/mroute.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <net/checksum.h>
+
+#ifdef CONFIG_IP_MROUTE
+struct sock *mroute_socket=NULL;
+#endif
+
+struct sock *raw_v4_htable[RAWV4_HTABLE_SIZE];
+
+static void raw_v4_hash(struct sock *sk)
+{
+ struct sock **skp = &raw_v4_htable[sk->num & (RAWV4_HTABLE_SIZE - 1)];
+
+ SOCKHASH_LOCK();
+ if ((sk->next = *skp) != NULL)
+ (*skp)->pprev = &sk->next;
+ *skp = sk;
+ sk->pprev = skp;
+ SOCKHASH_UNLOCK();
+}
+
+static void raw_v4_unhash(struct sock *sk)
+{
+ SOCKHASH_LOCK();
+ if (sk->pprev) {
+ if (sk->next)
+ sk->next->pprev = sk->pprev;
+ *sk->pprev = sk->next;
+ sk->pprev = NULL;
+ }
+ SOCKHASH_UNLOCK();
+}
+
+/* Grumble... icmp and ip_input want to get at this... */
+struct sock *raw_v4_lookup(struct sock *sk, unsigned short num,
+ unsigned long raddr, unsigned long laddr, int dif)
+{
+ struct sock *s = sk;
+
+ SOCKHASH_LOCK();
+ for(s = sk; s; s = s->next) {
+ if((s->num == num) &&
+ !(s->dead && (s->state == TCP_CLOSE)) &&
+ !(s->daddr && s->daddr != raddr) &&
+ !(s->rcv_saddr && s->rcv_saddr != laddr) &&
+ !(s->bound_dev_if && s->bound_dev_if != dif))
+ break; /* gotcha */
+ }
+ SOCKHASH_UNLOCK();
+ return s;
+}
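+
+/*
+ * Note the wildcard convention above: a zero daddr, rcv_saddr or
+ * bound_dev_if on a socket matches anything, so an unconnected and unbound
+ * raw socket sees every datagram carrying its protocol number.
+ */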
+
+void raw_err (struct sock *sk, struct sk_buff *skb)
+{
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+ u32 info = 0;
+ int err = 0;
+ int harderr = 0;
+
+ /* Report error on raw socket, if:
+ 1. User requested ip_recverr.
+	   2. Socket is connected (otherwise the error indication
+	      is useless without ip_recverr and the error is hard).
+	 */
+ if (!sk->ip_recverr && sk->state != TCP_ESTABLISHED)
+ return;
+
+ switch (type) {
+ default:
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ case ICMP_SOURCE_QUENCH:
+ return;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ info = ntohl(skb->h.icmph->un.gateway)>>24;
+ harderr = 1;
+ break;
+ case ICMP_DEST_UNREACH:
+ err = EHOSTUNREACH;
+ if (code > NR_ICMP_UNREACH)
+ break;
+ err = icmp_err_convert[code].errno;
+ harderr = icmp_err_convert[code].fatal;
+ if (code == ICMP_FRAG_NEEDED) {
+ harderr = (sk->ip_pmtudisc != IP_PMTUDISC_DONT);
+ err = EMSGSIZE;
+ info = ntohs(skb->h.icmph->un.frag.mtu);
+ }
+ }
+
+ if (sk->ip_recverr)
+ ip_icmp_error(sk, skb, err, 0, info, (u8 *)(skb->h.icmph + 1));
+
+ if (sk->ip_recverr || harderr) {
+ sk->err = err;
+ sk->error_report(sk);
+ }
+}
+
+static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
+{
+ /* Charge it to the socket. */
+
+ if (sock_queue_rcv_skb(sk,skb)<0)
+ {
+ ip_statistics.IpInDiscards++;
+ kfree_skb(skb);
+ return -1;
+ }
+
+ ip_statistics.IpInDelivers++;
+ return 0;
+}
+
+/*
+ *	This should be the easiest of all: all we do is
+ * copy it into a buffer. All demultiplexing is done
+ * in ip.c
+ */
+
+int raw_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ /* Now we need to copy this into memory. */
+ skb_trim(skb, ntohs(skb->nh.iph->tot_len));
+
+ skb->h.raw = skb->nh.raw;
+
+ raw_rcv_skb(sk, skb);
+ return 0;
+}
+
+struct rawfakehdr
+{
+ struct iovec *iov;
+ u32 saddr;
+};
+
+/*
+ * Send a RAW IP packet.
+ */
+
+/*
+ * Callback support is trivial for SOCK_RAW
+ */
+
+static int raw_getfrag(const void *p, char *to, unsigned int offset, unsigned int fraglen)
+{
+ struct rawfakehdr *rfh = (struct rawfakehdr *) p;
+ return memcpy_fromiovecend(to, rfh->iov, offset, fraglen);
+}
+
+/*
+ * IPPROTO_RAW needs extra work.
+ */
+
+static int raw_getrawfrag(const void *p, char *to, unsigned int offset, unsigned int fraglen)
+{
+ struct rawfakehdr *rfh = (struct rawfakehdr *) p;
+
+ if (memcpy_fromiovecend(to, rfh->iov, offset, fraglen))
+ return -EFAULT;
+
+ if (offset==0) {
+ struct iphdr *iph = (struct iphdr *)to;
+ if (!iph->saddr)
+ iph->saddr = rfh->saddr;
+ iph->check=0;
+ iph->tot_len=htons(fraglen); /* This is right as you can't frag
+ RAW packets */
+ /*
+ * Deliberate breach of modularity to keep
+ * ip_build_xmit clean (well less messy).
+ */
+ if (!iph->id)
+ iph->id = htons(ip_id_count++);
+ iph->check=ip_fast_csum((unsigned char *)iph, iph->ihl);
+ }
+ return 0;
+}
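+
+/*
+ * To summarise the IP_HDRINCL contract implemented above: the sender
+ * supplies the complete IP header, but the kernel still substitutes the
+ * route's source address for a zero saddr, assigns an id when it is zero,
+ * forces tot_len to the real length and recomputes the header checksum.
+ */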
+
+static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len)
+{
+ struct ipcm_cookie ipc;
+ struct rawfakehdr rfh;
+ struct rtable *rt = NULL;
+ int free = 0;
+ u32 daddr;
+ u8 tos;
+ int err;
+
+	/* This check is ONLY to check for arithmetic overflow
+	   on integer(!) len. Not more! The real check will be made
+	   in ip_build_xmit --ANK
+
+	   BTW socket.c -> af_*.c -> ... make multiple
+	   invalid size_t -> int conversions. We MUST repair it, e.g.
+	   by replacing all of them with size_t and revising all
+	   the places of the sort len += sizeof(struct iphdr).
+	   If len were ULONG_MAX-10 it would be a catastrophe  --ANK
+	 */
+
+ if (len < 0 || len > 0xFFFF)
+ return -EMSGSIZE;
+
+ /*
+ * Check the flags.
+ */
+
+ if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
+ return -EOPNOTSUPP;
+
+ if (msg->msg_flags & ~(MSG_DONTROUTE|MSG_DONTWAIT))
+ return(-EINVAL);
+
+ /*
+ * Get and verify the address.
+ */
+
+ if (msg->msg_namelen) {
+ struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name;
+ if (msg->msg_namelen < sizeof(*usin))
+ return(-EINVAL);
+ if (usin->sin_family != AF_INET) {
+ static int complained;
+ if (!complained++)
+ printk(KERN_INFO "%s forgot to set AF_INET in raw sendmsg. Fix it!\n", current->comm);
+ if (usin->sin_family)
+ return -EINVAL;
+ }
+ daddr = usin->sin_addr.s_addr;
+ /* ANK: I did not forget to get protocol from port field.
+		 * I just do not know who uses this weirdness.
+ * IP_HDRINCL is much more convenient.
+ */
+ } else {
+ if (sk->state != TCP_ESTABLISHED)
+ return(-EINVAL);
+ daddr = sk->daddr;
+ }
+
+ ipc.addr = sk->saddr;
+ ipc.opt = NULL;
+ ipc.oif = sk->bound_dev_if;
+
+ if (msg->msg_controllen) {
+ int tmp = ip_cmsg_send(msg, &ipc);
+ if (tmp)
+ return tmp;
+ if (ipc.opt)
+ free=1;
+ }
+
+ rfh.saddr = ipc.addr;
+ ipc.addr = daddr;
+
+ if (!ipc.opt)
+ ipc.opt = sk->opt;
+
+ if (ipc.opt) {
+ err = -EINVAL;
+ /* Linux does not mangle headers on raw sockets,
+		 * so IP options + IP_HDRINCL is nonsense.
+ */
+ if (sk->ip_hdrincl)
+ goto done;
+ if (ipc.opt->srr) {
+ if (!daddr)
+ goto done;
+ daddr = ipc.opt->faddr;
+ }
+ }
+ tos = RT_TOS(sk->ip_tos) | sk->localroute;
+ if (msg->msg_flags&MSG_DONTROUTE)
+ tos |= RTO_ONLINK;
+
+ if (MULTICAST(daddr)) {
+ if (!ipc.oif)
+ ipc.oif = sk->ip_mc_index;
+ if (!rfh.saddr)
+ rfh.saddr = sk->ip_mc_addr;
+ }
+
+ err = ip_route_output(&rt, daddr, rfh.saddr, tos, ipc.oif);
+
+ if (err)
+ goto done;
+
+ err = -EACCES;
+ if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast)
+ goto done;
+
+ rfh.iov = msg->msg_iov;
+ rfh.saddr = rt->rt_src;
+ if (!ipc.addr)
+ ipc.addr = rt->rt_dst;
+ err=ip_build_xmit(sk, sk->ip_hdrincl ? raw_getrawfrag : raw_getfrag,
+ &rfh, len, &ipc, rt, msg->msg_flags);
+
+done:
+ if (free)
+ kfree(ipc.opt);
+ ip_rt_put(rt);
+
+ return err<0 ? err : len;
+}
+
+static void raw_close(struct sock *sk, long timeout)
+{
+	/* Observation: when raw_close is called, processes have
+	   no access to the socket anymore, but the net still has.
+	   Step one: detach it from networking.
+
+	   A. Remove from hash tables.
+	 */
+ sk->state = TCP_CLOSE;
+ raw_v4_unhash(sk);
+ /*
+	   B. Raw sockets may have direct kernel references. Kill them.
+ */
+ ip_ra_control(sk, 0, NULL);
+
+	/* At this point the socket cannot receive new packets anymore */
+
+
+	/* But we still have packets pending on the receive
+	   queue and, probably, our own packets waiting in device queues.
+	   sock_destroy will drain the receive queue, but transmitted
+	   packets will delay socket destruction.
+	   Set sk->dead=1 in order to prevent wakeups when these
+	   packets are freed.
+	 */
+ sk->dead=1;
+ destroy_sock(sk);
+
+ /* That's all. No races here. */
+}
+
+/* This gets rid of all the nasties in af_inet. -DaveM */
+static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ int chk_addr_ret;
+
+ if((sk->state != TCP_CLOSE) || (addr_len < sizeof(struct sockaddr_in)))
+ return -EINVAL;
+ chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
+ if(addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL &&
+ chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) {
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ /* Superuser may bind to any address to allow transparent proxying. */
+ if(chk_addr_ret != RTN_UNICAST || !capable(CAP_NET_ADMIN))
+#endif
+ return -EADDRNOTAVAIL;
+ }
+ sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;
+ if(chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+ sk->saddr = 0; /* Use device */
+ dst_release(xchg(&sk->dst_cache, NULL));
+ return 0;
+}
+
+/*
+ * This should be easy, if there is something there
+ *	This should be easy: if there is something there
+ */
+
+int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len,
+ int noblock, int flags,int *addr_len)
+{
+ int copied=0;
+ struct sk_buff *skb;
+ int err;
+ struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
+
+ if (flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ if (addr_len)
+ *addr_len=sizeof(*sin);
+
+ if (flags & MSG_ERRQUEUE)
+ return ip_recv_error(sk, msg, len);
+
+ skb=skb_recv_datagram(sk,flags,noblock,&err);
+ if(skb==NULL)
+ return err;
+
+ copied = skb->len;
+ if (len < copied)
+ {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ if (err)
+ goto done;
+
+ sk->stamp=skb->stamp;
+
+ /* Copy the address. */
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = skb->nh.iph->saddr;
+ }
+ if (sk->ip_cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+done:
+ skb_free_datagram(sk, skb);
+ return (err ? : copied);
+}
+
+static int raw_init(struct sock *sk)
+{
+ struct raw_opt *tp = &(sk->tp_pinfo.tp_raw4);
+ if (sk->num == IPPROTO_ICMP)
+ memset(&tp->filter, 0, sizeof(tp->filter));
+ return 0;
+}
+
+static int raw_seticmpfilter(struct sock *sk, char *optval, int optlen)
+{
+ if (optlen > sizeof(struct icmp_filter))
+ optlen = sizeof(struct icmp_filter);
+ if (copy_from_user(&sk->tp_pinfo.tp_raw4.filter, optval, optlen))
+ return -EFAULT;
+ return 0;
+}
+
+static int raw_geticmpfilter(struct sock *sk, char *optval, int *optlen)
+{
+ int len;
+
+ if (get_user(len,optlen))
+ return -EFAULT;
+ if (len > sizeof(struct icmp_filter))
+ len = sizeof(struct icmp_filter);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &sk->tp_pinfo.tp_raw4.filter, len))
+ return -EFAULT;
+ return 0;
+}
+
+static int raw_setsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ if (level != SOL_RAW)
+ return ip_setsockopt(sk, level, optname, optval, optlen);
+
+ switch (optname) {
+ case ICMP_FILTER:
+ if (sk->num != IPPROTO_ICMP)
+ return -EOPNOTSUPP;
+ return raw_seticmpfilter(sk, optval, optlen);
+ };
+
+ return -ENOPROTOOPT;
+}
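+
+/*
+ * Illustrative userspace sketch (an assumption - the filter semantics live
+ * in the receive path, not in this file): the filter is a 32-bit mask
+ * indexed by ICMP type, where a set bit filters that type out; "fd" is a
+ * hypothetical SOCK_RAW, IPPROTO_ICMP socket.
+ */
+#if 0
+static void block_echo_replies(int fd)
+{
+	struct icmp_filter filt;
+
+	filt.data = 1 << ICMP_ECHOREPLY;	/* drop echo replies, pass the rest */
+	setsockopt(fd, SOL_RAW, ICMP_FILTER, &filt, sizeof(filt));
+}
+#endif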
+
+static int raw_getsockopt(struct sock *sk, int level, int optname,
+ char *optval, int *optlen)
+{
+ if (level != SOL_RAW)
+ return ip_getsockopt(sk, level, optname, optval, optlen);
+
+ switch (optname) {
+ case ICMP_FILTER:
+ if (sk->num != IPPROTO_ICMP)
+ return -EOPNOTSUPP;
+ return raw_geticmpfilter(sk, optval, optlen);
+ };
+
+ return -ENOPROTOOPT;
+}
+
+struct proto raw_prot = {
+ (struct sock *)&raw_prot, /* sklist_next */
+ (struct sock *)&raw_prot, /* sklist_prev */
+ raw_close, /* close */
+ udp_connect, /* connect */
+ NULL, /* accept */
+ NULL, /* retransmit */
+ NULL, /* write_wakeup */
+ NULL, /* read_wakeup */
+ datagram_poll, /* poll */
+#ifdef CONFIG_IP_MROUTE
+ ipmr_ioctl, /* ioctl */
+#else
+ NULL, /* ioctl */
+#endif
+ raw_init, /* init */
+ NULL, /* destroy */
+ NULL, /* shutdown */
+ raw_setsockopt, /* setsockopt */
+ raw_getsockopt, /* getsockopt */
+ raw_sendmsg, /* sendmsg */
+ raw_recvmsg, /* recvmsg */
+ raw_bind, /* bind */
+ raw_rcv_skb, /* backlog_rcv */
+ raw_v4_hash, /* hash */
+ raw_v4_unhash, /* unhash */
+ NULL, /* get_port */
+ 128, /* max_header */
+ 0, /* retransmits */
+ "RAW", /* name */
+ 0, /* inuse */
+ 0 /* highestinuse */
+};
diff --git a/pfinet/linux-src/net/ipv4/route.c b/pfinet/linux-src/net/ipv4/route.c
new file mode 100644
index 00000000..06eb5fe5
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/route.c
@@ -0,0 +1,2048 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * ROUTE - implementation of the IP router.
+ *
+ * Version: $Id: route.c,v 1.67.2.3 1999/08/08 08:43:12 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Fixes:
+ * Alan Cox : Verify area fixes.
+ * Alan Cox : cli() protects routing changes
+ * Rui Oliveira : ICMP routing table updates
+ * (rco@di.uminho.pt) Routing table insertion and update
+ * Linus Torvalds : Rewrote bits to be sensible
+ * Alan Cox : Added BSD route gw semantics
+ * Alan Cox : Super /proc >4K
+ * Alan Cox : MTU in route table
+ * Alan Cox : MSS actually. Also added the window
+ * clamper.
+ * Sam Lantinga : Fixed route matching in rt_del()
+ * Alan Cox : Routing cache support.
+ * Alan Cox : Removed compatibility cruft.
+ * Alan Cox : RTF_REJECT support.
+ * Alan Cox : TCP irtt support.
+ * Jonathan Naylor : Added Metric support.
+ * Miquel van Smoorenburg : BSD API fixes.
+ * Miquel van Smoorenburg : Metrics.
+ * Alan Cox : Use __u32 properly
+ *		Alan Cox	:	Aligned routing errors more closely with BSD;
+ *					our system is still very different.
+ * Alan Cox : Faster /proc handling
+ * Alexey Kuznetsov : Massive rework to support tree based routing,
+ * routing caches and better behaviour.
+ *
+ * Olaf Erb : irtt wasn't being copied right.
+ * Bjorn Ekwall : Kerneld route support.
+ * Alan Cox : Multicast fixed (I hope)
+ * Pavel Krauz : Limited broadcast fixed
+ * Mike McLagan : Routing by source
+ *	Alexey Kuznetsov	:	End of old history. Split into fib.c and
+ * route.c and rewritten from scratch.
+ * Andi Kleen : Load-limit warning messages.
+ * Vitaly E. Lavrov : Transparent proxy revived after year coma.
+ * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
+ * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
+ * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
+ * Marc Boucher : routing by fwmark
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/pkt_sched.h>
+#include <linux/mroute.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/arp.h>
+#include <net/tcp.h>
+#include <net/icmp.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#define IP_MAX_MTU 0xFFF0
+
+#define RT_GC_TIMEOUT (300*HZ)
+
+int ip_rt_min_delay = 2*HZ;
+int ip_rt_max_delay = 10*HZ;
+int ip_rt_gc_thresh = RT_HASH_DIVISOR;
+int ip_rt_max_size = RT_HASH_DIVISOR*16;
+int ip_rt_gc_timeout = RT_GC_TIMEOUT;
+int ip_rt_gc_interval = 60*HZ;
+int ip_rt_gc_min_interval = 5*HZ;
+int ip_rt_redirect_number = 9;
+int ip_rt_redirect_load = HZ/50;
+int ip_rt_redirect_silence = ((HZ/50) << (9+1));
+int ip_rt_error_cost = HZ;
+int ip_rt_error_burst = 5*HZ;
+int ip_rt_gc_elasticity = 8;
+int ip_rt_mtu_expires = 10*60*HZ;
+
+static unsigned long rt_deadline = 0;
+
+#define RTprint(a...) printk(KERN_DEBUG a)
+
+static void rt_run_flush(unsigned long dummy);
+
+static struct timer_list rt_flush_timer =
+ { NULL, NULL, 0, 0L, rt_run_flush };
+static struct timer_list rt_periodic_timer =
+ { NULL, NULL, 0, 0L, NULL };
+
+/*
+ * Interface to generic destination cache.
+ */
+
+static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32);
+static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
+ struct sk_buff *);
+static struct dst_entry * ipv4_negative_advice(struct dst_entry *);
+static void ipv4_link_failure(struct sk_buff *skb);
+static int rt_garbage_collect(void);
+
+
+struct dst_ops ipv4_dst_ops =
+{
+ AF_INET,
+ __constant_htons(ETH_P_IP),
+ RT_HASH_DIVISOR,
+
+ rt_garbage_collect,
+ ipv4_dst_check,
+ ipv4_dst_reroute,
+ NULL,
+ ipv4_negative_advice,
+ ipv4_link_failure,
+};
+
+__u8 ip_tos2prio[16] = {
+ TC_PRIO_BESTEFFORT,
+ TC_PRIO_FILLER,
+ TC_PRIO_BESTEFFORT,
+ TC_PRIO_FILLER,
+ TC_PRIO_BULK,
+ TC_PRIO_FILLER,
+ TC_PRIO_BULK,
+ TC_PRIO_FILLER,
+ TC_PRIO_INTERACTIVE,
+ TC_PRIO_FILLER,
+ TC_PRIO_INTERACTIVE,
+ TC_PRIO_FILLER,
+ TC_PRIO_INTERACTIVE_BULK,
+ TC_PRIO_FILLER,
+ TC_PRIO_INTERACTIVE_BULK,
+ TC_PRIO_FILLER
+};
+
+
+/*
+ * Route cache.
+ */
+
+struct rtable *rt_hash_table[RT_HASH_DIVISOR];
+
+static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res);
+
+static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
+{
+ unsigned hash = ((daddr&0xF0F0F0F0)>>4)|((daddr&0x0F0F0F0F)<<4);
+ hash = hash^saddr^tos;
+ hash = hash^(hash>>16);
+ return (hash^(hash>>8)) & 0xFF;
+}
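+
+/*
+ * Worked example with illustrative values: daddr = 0x0A000001 nibble-swaps
+ * to 0xA0000010; xoring in saddr = 0x0C000002 and tos = 0 gives 0xAC000012;
+ * folding in the upper halfword yields 0xAC00AC12, and the final byte fold
+ * masked with 0xFF returns 0xBE - an 8-bit slot index, which assumes
+ * RT_HASH_DIVISOR is 256.
+ */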
+
+#ifdef CONFIG_PROC_FS
+
+static int rt_cache_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+ int len=0;
+ off_t pos=0;
+ char temp[129];
+ struct rtable *r;
+ int i;
+
+ pos = 128;
+
+ if (offset<128) {
+ sprintf(buffer,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst");
+ len = 128;
+ }
+
+
+ start_bh_atomic();
+
+ for (i = 0; i<RT_HASH_DIVISOR; i++) {
+ for (r = rt_hash_table[i]; r; r = r->u.rt_next) {
+ /*
+ * Spin through entries until we are ready
+ */
+ pos += 128;
+
+ if (pos <= offset) {
+ len = 0;
+ continue;
+ }
+ sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
+ r->u.dst.dev ? r->u.dst.dev->name : "*",
+ (unsigned long)r->rt_dst,
+ (unsigned long)r->rt_gateway,
+ r->rt_flags,
+ atomic_read(&r->u.dst.use),
+ atomic_read(&r->u.dst.refcnt),
+ 0,
+ (unsigned long)r->rt_src, (int)r->u.dst.pmtu,
+ r->u.dst.window,
+ (int)r->u.dst.rtt, r->key.tos,
+ r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
+ r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0,
+ r->rt_spec_dst);
+ sprintf(buffer+len,"%-127s\n",temp);
+ len += 128;
+ if (pos >= offset+length)
+ goto done;
+ }
+ }
+
+done:
+ end_bh_atomic();
+
+ *start = buffer+len-(pos-offset);
+ len = pos-offset;
+ if (len>length)
+ len = length;
+ return len;
+}
+#endif
+
+static __inline__ void rt_free(struct rtable *rt)
+{
+ dst_free(&rt->u.dst);
+}
+
+static __inline__ void rt_drop(struct rtable *rt)
+{
+ ip_rt_put(rt);
+ dst_free(&rt->u.dst);
+}
+
+static __inline__ int rt_fast_clean(struct rtable *rth)
+{
+	/* Kill broadcast/multicast entries very aggressively, if they
+	   collide in the hash table with more useful entries */
+ return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
+ && rth->key.iif && rth->u.rt_next);
+}
+
+static __inline__ int rt_valuable(struct rtable *rth)
+{
+ return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY))
+ || rth->u.dst.expires);
+}
+
+static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
+{
+ int age;
+
+ if (atomic_read(&rth->u.dst.use))
+ return 0;
+
+ if (rth->u.dst.expires && (long)(rth->u.dst.expires - jiffies) <= 0)
+ return 1;
+
+ age = jiffies - rth->u.dst.lastuse;
+ if (age <= tmo1 && !rt_fast_clean(rth))
+ return 0;
+ if (age <= tmo2 && rt_valuable(rth))
+ return 0;
+ return 1;
+}
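+
+/*
+ * In short: entries still referenced never expire here; a hard expiry stamp
+ * is always honoured; otherwise tmo1 shields ordinary entries while they are
+ * young (broadcast/multicast chain colliders, per rt_fast_clean, get no such
+ * grace) and tmo2 extends the grace for valuable entries.
+ */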
+
+static void rt_check_expire(unsigned long dummy)
+{
+ int i;
+ static int rover;
+ struct rtable *rth, **rthp;
+ unsigned long now = jiffies;
+
+ for (i=0; i<RT_HASH_DIVISOR/5; i++) {
+ unsigned tmo = ip_rt_gc_timeout;
+
+ rover = (rover + 1) & (RT_HASH_DIVISOR-1);
+ rthp = &rt_hash_table[rover];
+
+ while ((rth = *rthp) != NULL) {
+ if (rth->u.dst.expires) {
+			/* Entry is expired even if it is in use */
+ if ((long)(now - rth->u.dst.expires) <= 0) {
+ tmo >>= 1;
+ rthp = &rth->u.rt_next;
+ continue;
+ }
+ } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
+ tmo >>= 1;
+ rthp = &rth->u.rt_next;
+ continue;
+ }
+
+ /*
+ * Cleanup aged off entries.
+ */
+ *rthp = rth->u.rt_next;
+ rt_free(rth);
+ }
+
+ /* Fallback loop breaker. */
+ if ((jiffies - now) > 0)
+ break;
+ }
+ rt_periodic_timer.expires = now + ip_rt_gc_interval;
+ add_timer(&rt_periodic_timer);
+}
+
+static void rt_run_flush(unsigned long dummy)
+{
+ int i;
+ struct rtable * rth, * next;
+
+ rt_deadline = 0;
+
+ start_bh_atomic();
+ for (i=0; i<RT_HASH_DIVISOR; i++) {
+ if ((rth = xchg(&rt_hash_table[i], NULL)) == NULL)
+ continue;
+ end_bh_atomic();
+
+ for (; rth; rth=next) {
+ next = rth->u.rt_next;
+ rth->u.rt_next = NULL;
+ rt_free(rth);
+ }
+
+ start_bh_atomic();
+ }
+ end_bh_atomic();
+}
+
+void rt_cache_flush(int delay)
+{
+ unsigned long now = jiffies;
+ int user_mode = !in_interrupt();
+
+ if (delay < 0)
+ delay = ip_rt_min_delay;
+
+ start_bh_atomic();
+
+ if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
+ long tmo = (long)(rt_deadline - now);
+
+		/* If the flush timer is already running
+		   and the flush request is not immediate (delay > 0):
+
+		   if the deadline has not been reached, prolong the timer to "delay",
+		   otherwise fire it at the deadline time.
+		 */
+
+ if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
+ tmo = 0;
+
+ if (delay > tmo)
+ delay = tmo;
+ }
+
+ if (delay <= 0) {
+ end_bh_atomic();
+ rt_run_flush(0);
+ return;
+ }
+
+ if (rt_deadline == 0)
+ rt_deadline = now + ip_rt_max_delay;
+
+ rt_flush_timer.expires = now + delay;
+ add_timer(&rt_flush_timer);
+ end_bh_atomic();
+}
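+
+/*
+ * With the defaults above: rt_cache_flush(0) flushes immediately,
+ * rt_cache_flush(-1) asks for a flush ip_rt_min_delay (2*HZ) ahead, and
+ * repeated requests may reschedule the timer, but the rt_deadline
+ * bookkeeping caps the total postponement at ip_rt_max_delay (10*HZ)
+ * after the first pending request.
+ */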
+
+/*
+   Short description of GC goals.
+
+   We want to build an algorithm which will keep the routing cache
+   at some equilibrium point, where the number of aged-off entries
+   is kept approximately equal to the number of newly generated ones.
+
+   The current expiration strength is the variable "expire".
+   We try to adjust it dynamically, so that when the network
+   is idle "expire" is large enough to keep enough warm entries,
+   and when the load increases it shrinks to limit the cache size.
+ */
+
+static int rt_garbage_collect(void)
+{
+ static unsigned expire = RT_GC_TIMEOUT;
+ static unsigned long last_gc;
+ static int rover;
+ static int equilibrium;
+ struct rtable *rth, **rthp;
+ unsigned long now = jiffies;
+ int goal;
+
+ /*
+ * Garbage collection is pretty expensive,
+ * do not make it too frequently.
+ */
+ if (now - last_gc < ip_rt_gc_min_interval &&
+ atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ return 0;
+
+ /* Calculate number of entries, which we want to expire now. */
+ goal = atomic_read(&ipv4_dst_ops.entries) - RT_HASH_DIVISOR*ip_rt_gc_elasticity;
+ if (goal <= 0) {
+ if (equilibrium < ipv4_dst_ops.gc_thresh)
+ equilibrium = ipv4_dst_ops.gc_thresh;
+ goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+ if (goal > 0) {
+ equilibrium += min(goal/2, RT_HASH_DIVISOR);
+ goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+ }
+ } else {
+		/* We are in a dangerous area. Try to reduce the cache
+		 * really aggressively.
+		 */
+ goal = max(goal/2, RT_HASH_DIVISOR);
+ equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
+ }
+
+ if (now - last_gc >= ip_rt_gc_min_interval)
+ last_gc = now;
+
+ if (goal <= 0) {
+ equilibrium += goal;
+ goto work_done;
+ }
+
+ do {
+ int i, k;
+
+ start_bh_atomic();
+ for (i=0, k=rover; i<RT_HASH_DIVISOR; i++) {
+ unsigned tmo = expire;
+
+ k = (k + 1) & (RT_HASH_DIVISOR-1);
+ rthp = &rt_hash_table[k];
+ while ((rth = *rthp) != NULL) {
+ if (!rt_may_expire(rth, tmo, expire)) {
+ tmo >>= 1;
+ rthp = &rth->u.rt_next;
+ continue;
+ }
+ *rthp = rth->u.rt_next;
+ rth->u.rt_next = NULL;
+ rt_free(rth);
+ goal--;
+ }
+ if (goal <= 0)
+ break;
+ }
+ rover = k;
+ end_bh_atomic();
+
+ if (goal <= 0)
+ goto work_done;
+
+		/* The goal was not achieved. We stop the process if:
+
+		   - expire was reduced to zero (otherwise expire is halved);
+		   - the table is not full;
+		   - we are called from an interrupt;
+		   - the jiffies check is just a fallback/debug loop breaker.
+		     We will not spin here for a long time in any case.
+		 */
+
+ if (expire == 0)
+ break;
+
+ expire >>= 1;
+#if RT_CACHE_DEBUG >= 2
+ printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i);
+#endif
+
+ if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ return 0;
+ } while (!in_interrupt() && jiffies - now < 1);
+
+ if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ return 0;
+ if (net_ratelimit())
+ printk("dst cache overflow\n");
+ return 1;
+
+work_done:
+ expire += ip_rt_gc_min_interval;
+ if (expire > ip_rt_gc_timeout ||
+ atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
+ expire = ip_rt_gc_timeout;
+#if RT_CACHE_DEBUG >= 2
+ printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover);
+#endif
+ return 0;
+}
+
+static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp)
+{
+ struct rtable *rth, **rthp;
+ unsigned long now = jiffies;
+ int attempts = !in_interrupt();
+
+restart:
+ start_bh_atomic();
+
+ rthp = &rt_hash_table[hash];
+
+ while ((rth = *rthp) != NULL) {
+ if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
+ /* Put it first */
+ *rthp = rth->u.rt_next;
+ rth->u.rt_next = rt_hash_table[hash];
+ rt_hash_table[hash] = rth;
+
+ atomic_inc(&rth->u.dst.refcnt);
+ atomic_inc(&rth->u.dst.use);
+ rth->u.dst.lastuse = now;
+ end_bh_atomic();
+
+ rt_drop(rt);
+ *rp = rth;
+ return 0;
+ }
+
+ rthp = &rth->u.rt_next;
+ }
+
+	/* Try to bind the route to ARP only if it is an output
+	   route or on the unicast forwarding path.
+	 */
+ if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
+ if (!arp_bind_neighbour(&rt->u.dst)) {
+ end_bh_atomic();
+
+			/* The neighbour tables are full and nothing
+			   can be released. Try to shrink the route cache;
+			   it most likely holds some neighbour records.
+			 */
+ if (attempts-- > 0) {
+ int saved_elasticity = ip_rt_gc_elasticity;
+ int saved_int = ip_rt_gc_min_interval;
+ ip_rt_gc_elasticity = 1;
+ ip_rt_gc_min_interval = 0;
+ rt_garbage_collect();
+ ip_rt_gc_min_interval = saved_int;
+ ip_rt_gc_elasticity = saved_elasticity;
+ goto restart;
+ }
+
+ rt_drop(rt);
+ if (net_ratelimit())
+ printk("neighbour table overflow\n");
+ return -ENOBUFS;
+ }
+ }
+
+ rt->u.rt_next = rt_hash_table[hash];
+#if RT_CACHE_DEBUG >= 2
+ if (rt->u.rt_next) {
+ struct rtable * trt;
+ printk("rt_cache @%02x: %08x", hash, rt->rt_dst);
+ for (trt=rt->u.rt_next; trt; trt=trt->u.rt_next)
+ printk(" . %08x", trt->rt_dst);
+ printk("\n");
+ }
+#endif
+ rt_hash_table[hash] = rt;
+ end_bh_atomic();
+ *rp = rt;
+ return 0;
+}
+
+static void rt_del(unsigned hash, struct rtable *rt)
+{
+ struct rtable **rthp;
+
+ start_bh_atomic();
+ ip_rt_put(rt);
+ for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
+ if (*rthp == rt) {
+ *rthp = rt->u.rt_next;
+ rt_free(rt);
+ break;
+ }
+ }
+ end_bh_atomic();
+}
+
+void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
+ u32 saddr, u8 tos, struct device *dev)
+{
+ int i, k;
+ struct in_device *in_dev = dev->ip_ptr;
+ struct rtable *rth, **rthp;
+ u32 skeys[2] = { saddr, 0 };
+ int ikeys[2] = { dev->ifindex, 0 };
+
+ tos &= IPTOS_TOS_MASK;
+
+ if (!in_dev)
+ return;
+
+ if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
+ || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
+ goto reject_redirect;
+
+ if (!IN_DEV_SHARED_MEDIA(in_dev)) {
+ if (!inet_addr_onlink(in_dev, new_gw, old_gw))
+ goto reject_redirect;
+ if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
+ goto reject_redirect;
+ } else {
+ if (inet_addr_type(new_gw) != RTN_UNICAST)
+ goto reject_redirect;
+ }
+
+ for (i=0; i<2; i++) {
+ for (k=0; k<2; k++) {
+ unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos);
+
+ rthp=&rt_hash_table[hash];
+
+ while ( (rth = *rthp) != NULL) {
+ struct rtable *rt;
+
+ if (rth->key.dst != daddr ||
+ rth->key.src != skeys[i] ||
+ rth->key.tos != tos ||
+ rth->key.oif != ikeys[k] ||
+ rth->key.iif != 0) {
+ rthp = &rth->u.rt_next;
+ continue;
+ }
+
+ if (rth->rt_dst != daddr ||
+ rth->rt_src != saddr ||
+ rth->u.dst.error ||
+ rth->rt_gateway != old_gw ||
+ rth->u.dst.dev != dev)
+ break;
+
+ dst_clone(&rth->u.dst);
+
+ rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+ if (rt == NULL) {
+ ip_rt_put(rth);
+ return;
+ }
+
+ /*
+ * Copy all the information.
+ */
+ *rt = *rth;
+ atomic_set(&rt->u.dst.refcnt, 1);
+ atomic_set(&rt->u.dst.use, 1);
+ rt->u.dst.lastuse = jiffies;
+ rt->u.dst.neighbour = NULL;
+ rt->u.dst.hh = NULL;
+ rt->u.dst.obsolete = 0;
+
+ rt->rt_flags |= RTCF_REDIRECTED;
+
+ /* Gateway is different ... */
+ rt->rt_gateway = new_gw;
+
+ /* Redirect received -> path was valid */
+ dst_confirm(&rth->u.dst);
+
+ if (!arp_bind_neighbour(&rt->u.dst) ||
+ !(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
+ if (rt->u.dst.neighbour)
+ neigh_event_send(rt->u.dst.neighbour, NULL);
+ ip_rt_put(rth);
+ rt_drop(rt);
+ break;
+ }
+
+ rt_del(hash, rth);
+
+ if (!rt_intern_hash(hash, rt, &rt))
+ ip_rt_put(rt);
+ break;
+ }
+ }
+ }
+ return;
+
+reject_redirect:
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
+		printk(KERN_INFO "Redirect from %lX/%s to %lX ignored. "
+		       "Path = %lX -> %lX, tos %02x\n",
+ ntohl(old_gw), dev->name, ntohl(new_gw),
+ ntohl(saddr), ntohl(daddr), tos);
+#endif
+}
+
+static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
+{
+ struct rtable *rt = (struct rtable*)dst;
+
+ if (rt != NULL) {
+ if (dst->obsolete) {
+ ip_rt_put(rt);
+ return NULL;
+ }
+ if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) {
+ unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos);
+#if RT_CACHE_DEBUG >= 1
+ printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos);
+#endif
+ rt_del(hash, rt);
+ return NULL;
+ }
+ }
+ return dst;
+}
+
+/*
+ * Algorithm:
+ * 1. The first ip_rt_redirect_number redirects are sent
+ *    with exponential backoff, then we stop sending them altogether,
+ *    assuming that the host ignores our redirects.
+ * 2. If we did not see packets requiring redirects
+ *    during ip_rt_redirect_silence, we assume that the host
+ *    forgot the redirected route and start sending redirects again.
+ *
+ * This algorithm is much cheaper and more intelligent than dumb load limiting
+ * in icmp.c.
+ *
+ * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
+ * and "frag. need" (breaks PMTU discovery) in icmp.c.
+ */
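+
+/*
+ * Illustrative sketch (editor's addition, not in the original source):
+ * the function below spaces successive redirects by doubling intervals.
+ * With hypothetical values ip_rt_redirect_load == L and
+ * ip_rt_redirect_number == 4, redirects go out no closer than
+ * L, 2L, 4L and 8L jiffies apart:
+ *
+ *	if (jiffies - rate_last > (ip_rt_redirect_load << rate_tokens)) {
+ *		... send the redirect ...
+ *		rate_last = jiffies;
+ *		++rate_tokens;
+ *	}
+ *
+ * after which rate_tokens == ip_rt_redirect_number and sending stops
+ * until ip_rt_redirect_silence jiffies pass without redirected traffic.
+ */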
+
+void ip_rt_send_redirect(struct sk_buff *skb)
+{
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct in_device *in_dev = (struct in_device*)rt->u.dst.dev->ip_ptr;
+
+ if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev))
+ return;
+
+ /* No redirected packets during ip_rt_redirect_silence;
+ * reset the algorithm.
+ */
+ if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence)
+ rt->u.dst.rate_tokens = 0;
+
+	/* Too many ignored redirects; do not send anything.
+	 * Set u.dst.rate_last to the time of the last seen redirected packet.
+	 */
+ if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
+ rt->u.dst.rate_last = jiffies;
+ return;
+ }
+
+ /* Check for load limit; set rate_last to the latest sent
+ * redirect.
+ */
+ if (jiffies - rt->u.dst.rate_last > (ip_rt_redirect_load<<rt->u.dst.rate_tokens)) {
+ icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+ rt->u.dst.rate_last = jiffies;
+ ++rt->u.dst.rate_tokens;
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev) &&
+ rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit())
+ printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n",
+ rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway);
+#endif
+ }
+}
+
+static int ip_error(struct sk_buff *skb)
+{
+ struct rtable *rt = (struct rtable*)skb->dst;
+ unsigned long now;
+ int code;
+
+ switch (rt->u.dst.error) {
+ case EINVAL:
+ default:
+ kfree_skb(skb);
+ return 0;
+ case EHOSTUNREACH:
+ code = ICMP_HOST_UNREACH;
+ break;
+ case ENETUNREACH:
+ code = ICMP_NET_UNREACH;
+ break;
+ case EACCES:
+ code = ICMP_PKT_FILTERED;
+ break;
+ }
+
+ now = jiffies;
+ if ((rt->u.dst.rate_tokens += (now - rt->u.dst.rate_last)) > ip_rt_error_burst)
+ rt->u.dst.rate_tokens = ip_rt_error_burst;
+ rt->u.dst.rate_last = now;
+ if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
+ rt->u.dst.rate_tokens -= ip_rt_error_cost;
+ icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
+ }
+
+ kfree_skb(skb);
+ return 0;
+}
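+
+/*
+ * Editor's note (illustration, not in the original source): the code
+ * above is a token bucket. rate_tokens grows by one per jiffy elapsed
+ * since rate_last, is clamped to ip_rt_error_burst, and each ICMP error
+ * sent costs ip_rt_error_cost tokens. With hypothetical settings
+ * ip_rt_error_cost == 2*HZ and ip_rt_error_burst == 10*HZ, errors
+ * average one per two seconds, with bursts of up to five back to back.
+ */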
+
+/*
+ * The last two values are not from the RFC but
+ * are needed for AMPRnet AX.25 paths.
+ */
+
+static unsigned short mtu_plateau[] =
+{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
+
+static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
+{
+ int i;
+
+ for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++)
+ if (old_mtu > mtu_plateau[i])
+ return mtu_plateau[i];
+ return 68;
+}
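+
+/*
+ * Worked example (editor's addition): guess_mtu() returns the largest
+ * plateau strictly below old_mtu, falling back to the 68-octet IPv4
+ * minimum. From the table above:
+ *
+ *	guess_mtu(1500) == 1492
+ *	guess_mtu(1492) == 576
+ *	guess_mtu(200)  == 128
+ *	guess_mtu(68)   == 68
+ */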
+
+unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
+{
+ int i;
+ unsigned short old_mtu = ntohs(iph->tot_len);
+ struct rtable *rth;
+ u32 skeys[2] = { iph->saddr, 0, };
+ u32 daddr = iph->daddr;
+ u8 tos = iph->tos & IPTOS_TOS_MASK;
+ unsigned short est_mtu = 0;
+
+ if (ipv4_config.no_pmtu_disc)
+ return 0;
+
+ for (i=0; i<2; i++) {
+ unsigned hash = rt_hash_code(daddr, skeys[i], tos);
+
+ for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
+ if (rth->key.dst == daddr &&
+ rth->key.src == skeys[i] &&
+ rth->rt_dst == daddr &&
+ rth->rt_src == iph->saddr &&
+ rth->key.tos == tos &&
+ rth->key.iif == 0 &&
+ !(rth->u.dst.mxlock&(1<<RTAX_MTU))) {
+ unsigned short mtu = new_mtu;
+
+ if (new_mtu < 68 || new_mtu >= old_mtu) {
+
+ /* BSD 4.2 compatibility hack :-( */
+ if (mtu == 0 && old_mtu >= rth->u.dst.pmtu &&
+ old_mtu >= 68 + (iph->ihl<<2))
+ old_mtu -= iph->ihl<<2;
+
+ mtu = guess_mtu(old_mtu);
+ }
+ if (mtu <= rth->u.dst.pmtu) {
+ if (mtu < rth->u.dst.pmtu) {
+ dst_confirm(&rth->u.dst);
+ rth->u.dst.pmtu = mtu;
+ dst_set_expires(&rth->u.dst, ip_rt_mtu_expires);
+ }
+ est_mtu = mtu;
+ }
+ }
+ }
+ }
+ return est_mtu ? : new_mtu;
+}
+
+void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
+{
+ if (dst->pmtu > mtu && mtu >= 68 &&
+ !(dst->mxlock&(1<<RTAX_MTU))) {
+ dst->pmtu = mtu;
+ dst_set_expires(dst, ip_rt_mtu_expires);
+ }
+}
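+
+/*
+ * Usage sketch (editor's addition; the caller and values are
+ * hypothetical): a transport that learns a smaller path MTU calls
+ *
+ *	ip_rt_update_pmtu(skb->dst, 1400);
+ *
+ * which lowers dst->pmtu from e.g. 1500 to 1400 and arms the
+ * ip_rt_mtu_expires timer. A locked MTU metric (RTAX_MTU bit set in
+ * dst->mxlock) or an mtu below 68 leaves the entry untouched.
+ */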
+
+static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32 cookie)
+{
+ dst_release(dst);
+ return NULL;
+}
+
+static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
+ struct sk_buff *skb)
+{
+ return NULL;
+}
+
+static void ipv4_link_failure(struct sk_buff *skb)
+{
+ struct rtable *rt;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+
+ rt = (struct rtable *) skb->dst;
+ if (rt)
+ dst_set_expires(&rt->u.dst, 0);
+}
+
+static int ip_rt_bug(struct sk_buff *skb)
+{
+ printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr,
+ skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?");
+ kfree_skb(skb);
+ return 0;
+}
+
+/*
+   We do not cache the source address of the outgoing interface,
+   because it is used only by the IP RR, TS and SRR options,
+   so it is out of the fast path.
+
+   BTW remember: "addr" is allowed to be unaligned
+   in IP options!
+ */
+
+void ip_rt_get_source(u8 *addr, struct rtable *rt)
+{
+ u32 src;
+ struct fib_result res;
+
+ if (rt->key.iif == 0)
+ src = rt->rt_src;
+ else if (fib_lookup(&rt->key, &res) == 0 && res.type != RTN_NAT)
+ src = FIB_RES_PREFSRC(res);
+ else
+ src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
+ memcpy(addr, &src, 4);
+}
+
+#ifdef CONFIG_NET_CLS_ROUTE
+static void set_class_tag(struct rtable *rt, u32 tag)
+{
+ if (!(rt->u.dst.tclassid&0xFFFF))
+ rt->u.dst.tclassid |= tag&0xFFFF;
+ if (!(rt->u.dst.tclassid&0xFFFF0000))
+ rt->u.dst.tclassid |= tag&0xFFFF0000;
+}
+#endif
+
+static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
+{
+ struct fib_info *fi = res->fi;
+
+ if (fi) {
+ if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
+ rt->rt_gateway = FIB_RES_GW(*res);
+ rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1];
+ rt->u.dst.pmtu = fi->fib_mtu;
+ if (fi->fib_mtu == 0) {
+ rt->u.dst.pmtu = rt->u.dst.dev->mtu;
+ if (rt->u.dst.pmtu > IP_MAX_MTU)
+ rt->u.dst.pmtu = IP_MAX_MTU;
+ if (rt->u.dst.pmtu < 68)
+ rt->u.dst.pmtu = 68;
+ if (rt->u.dst.mxlock&(1<<RTAX_MTU) &&
+ rt->rt_gateway != rt->rt_dst &&
+ rt->u.dst.pmtu > 576)
+ rt->u.dst.pmtu = 576;
+ }
+ rt->u.dst.window= fi->fib_window ? : 0;
+ rt->u.dst.rtt = fi->fib_rtt ? : TCP_TIMEOUT_INIT;
+#ifdef CONFIG_NET_CLS_ROUTE
+ rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+#endif
+ } else {
+ rt->u.dst.pmtu = rt->u.dst.dev->mtu;
+ if (rt->u.dst.pmtu > IP_MAX_MTU)
+ rt->u.dst.pmtu = IP_MAX_MTU;
+ if (rt->u.dst.pmtu < 68)
+ rt->u.dst.pmtu = 68;
+ rt->u.dst.window= 0;
+ rt->u.dst.rtt = TCP_TIMEOUT_INIT;
+ }
+#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ set_class_tag(rt, fib_rules_tclass(res));
+#endif
+ set_class_tag(rt, itag);
+#endif
+ rt->rt_type = res->type;
+}
+
+static int
+ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
+ u8 tos, struct device *dev, int our)
+{
+ unsigned hash;
+ struct rtable *rth;
+ u32 spec_dst;
+ struct in_device *in_dev = dev->ip_ptr;
+ u32 itag = 0;
+
+ /* Primary sanity checks. */
+
+ if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
+ in_dev == NULL || skb->protocol != __constant_htons(ETH_P_IP))
+ return -EINVAL;
+
+ if (ZERONET(saddr)) {
+ if (!LOCAL_MCAST(daddr))
+ return -EINVAL;
+ spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
+ } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0)
+ return -EINVAL;
+
+ rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+ if (!rth)
+ return -ENOBUFS;
+
+ rth->u.dst.output= ip_rt_bug;
+
+ atomic_set(&rth->u.dst.use, 1);
+ rth->key.dst = daddr;
+ rth->rt_dst = daddr;
+ rth->key.tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->key.fwmark = skb->fwmark;
+#endif
+ rth->key.src = saddr;
+ rth->rt_src = saddr;
+#ifdef CONFIG_IP_ROUTE_NAT
+ rth->rt_dst_map = daddr;
+ rth->rt_src_map = saddr;
+#endif
+#ifdef CONFIG_NET_CLS_ROUTE
+ rth->u.dst.tclassid = itag;
+#endif
+ rth->rt_iif =
+ rth->key.iif = dev->ifindex;
+ rth->u.dst.dev = &loopback_dev;
+ rth->key.oif = 0;
+ rth->rt_gateway = daddr;
+ rth->rt_spec_dst= spec_dst;
+ rth->rt_type = RTN_MULTICAST;
+ rth->rt_flags = RTCF_MULTICAST;
+ if (our) {
+ rth->u.dst.input= ip_local_deliver;
+ rth->rt_flags |= RTCF_LOCAL;
+ }
+
+#ifdef CONFIG_IP_MROUTE
+ if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
+ rth->u.dst.input = ip_mr_input;
+#endif
+
+ hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
+ return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+}
+
+/*
+ * NOTE. We drop all packets that have a local source
+ * address, because every properly looped-back packet
+ * must already have the correct destination attached by the output routine.
+ *
+ * This approach solves two big problems:
+ * 1. Non-simplex devices are handled properly.
+ * 2. IP spoofing attempts are filtered with a 100% guarantee.
+ */
+
+int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
+ u8 tos, struct device *dev)
+{
+ struct rt_key key;
+ struct fib_result res;
+ struct in_device *in_dev = dev->ip_ptr;
+ struct in_device *out_dev;
+ unsigned flags = 0;
+ u32 itag = 0;
+ struct rtable * rth;
+ unsigned hash;
+ u32 spec_dst;
+ int err = -EINVAL;
+
+ /*
+ * IP on this device is disabled.
+ */
+
+ if (!in_dev)
+ return -EINVAL;
+
+ key.dst = daddr;
+ key.src = saddr;
+ key.tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ key.fwmark = skb->fwmark;
+#endif
+ key.iif = dev->ifindex;
+ key.oif = 0;
+ key.scope = RT_SCOPE_UNIVERSE;
+
+ hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos);
+
+	/* Check for the most weird martians, which cannot be detected
+	   by fib_lookup.
+	 */
+
+ if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
+ goto martian_source;
+
+ if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
+ goto brd_input;
+
+	/* Accept zero addresses only for limited broadcast;
+	 * I am not even sure whether to fix this or not. Waiting for complaints :-)
+	 */
+ if (ZERONET(saddr))
+ goto martian_source;
+
+ if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
+ goto martian_destination;
+
+ /*
+ * Now we are ready to route packet.
+ */
+ if ((err = fib_lookup(&key, &res))) {
+ if (!IN_DEV_FORWARD(in_dev))
+ return -EINVAL;
+ goto no_route;
+ }
+
+#ifdef CONFIG_IP_ROUTE_NAT
+	/* Policy is applied before mapping the destination,
+	   but rerouting after the mapping should be done with the old source.
+	 */
+
+ if (1) {
+ u32 src_map = saddr;
+ if (res.r)
+ src_map = fib_rules_policy(saddr, &res, &flags);
+
+ if (res.type == RTN_NAT) {
+ key.dst = fib_rules_map_destination(daddr, &res);
+ if (fib_lookup(&key, &res) || res.type != RTN_UNICAST)
+ return -EINVAL;
+ flags |= RTCF_DNAT;
+ }
+ key.src = src_map;
+ }
+#endif
+
+ if (res.type == RTN_BROADCAST)
+ goto brd_input;
+
+ if (res.type == RTN_LOCAL) {
+ int result;
+ result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex,
+ dev, &spec_dst, &itag);
+ if (result < 0)
+ goto martian_source;
+ if (result)
+ flags |= RTCF_DIRECTSRC;
+ spec_dst = daddr;
+ goto local_input;
+ }
+
+ if (!IN_DEV_FORWARD(in_dev))
+ return -EINVAL;
+ if (res.type != RTN_UNICAST)
+ goto martian_destination;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (res.fi->fib_nhs > 1 && key.oif == 0)
+ fib_select_multipath(&key, &res);
+#endif
+ out_dev = FIB_RES_DEV(res)->ip_ptr;
+ if (out_dev == NULL) {
+ if (net_ratelimit())
+ printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n");
+ return -EINVAL;
+ }
+
+ err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag);
+ if (err < 0)
+ goto martian_source;
+
+ if (err)
+ flags |= RTCF_DIRECTSRC;
+
+ if (out_dev == in_dev && err && !(flags&(RTCF_NAT|RTCF_MASQ)) &&
+ (IN_DEV_SHARED_MEDIA(out_dev)
+ || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
+ flags |= RTCF_DOREDIRECT;
+
+ if (skb->protocol != __constant_htons(ETH_P_IP)) {
+		/* Not IP (i.e. ARP). Do not create a route if it is
+		 * invalid for proxy ARP. DNAT routes are always valid.
+		 */
+ if (out_dev == in_dev && !(flags&RTCF_DNAT))
+ return -EINVAL;
+ }
+
+ rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+ if (!rth)
+ return -ENOBUFS;
+
+ atomic_set(&rth->u.dst.use, 1);
+ rth->key.dst = daddr;
+ rth->rt_dst = daddr;
+ rth->key.tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->key.fwmark = skb->fwmark;
+#endif
+ rth->key.src = saddr;
+ rth->rt_src = saddr;
+ rth->rt_gateway = daddr;
+#ifdef CONFIG_IP_ROUTE_NAT
+ rth->rt_src_map = key.src;
+ rth->rt_dst_map = key.dst;
+ if (flags&RTCF_DNAT)
+ rth->rt_gateway = key.dst;
+#endif
+ rth->rt_iif =
+ rth->key.iif = dev->ifindex;
+ rth->u.dst.dev = out_dev->dev;
+ rth->key.oif = 0;
+ rth->rt_spec_dst= spec_dst;
+
+ rth->u.dst.input = ip_forward;
+ rth->u.dst.output = ip_output;
+
+ rt_set_nexthop(rth, &res, itag);
+
+ rth->rt_flags = flags;
+
+#ifdef CONFIG_NET_FASTROUTE
+ if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
+ struct device *odev = rth->u.dst.dev;
+ if (odev != dev &&
+ dev->accept_fastpath &&
+ odev->mtu >= dev->mtu &&
+ dev->accept_fastpath(dev, &rth->u.dst) == 0)
+ rth->rt_flags |= RTCF_FAST;
+ }
+#endif
+
+ return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+
+brd_input:
+ if (skb->protocol != __constant_htons(ETH_P_IP))
+ return -EINVAL;
+
+ if (ZERONET(saddr)) {
+ spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
+ } else {
+ err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag);
+ if (err < 0)
+ goto martian_source;
+ if (err)
+ flags |= RTCF_DIRECTSRC;
+ }
+ flags |= RTCF_BROADCAST;
+ res.type = RTN_BROADCAST;
+
+local_input:
+ rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+ if (!rth)
+ return -ENOBUFS;
+
+ rth->u.dst.output= ip_rt_bug;
+
+ atomic_set(&rth->u.dst.use, 1);
+ rth->key.dst = daddr;
+ rth->rt_dst = daddr;
+ rth->key.tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->key.fwmark = skb->fwmark;
+#endif
+ rth->key.src = saddr;
+ rth->rt_src = saddr;
+#ifdef CONFIG_IP_ROUTE_NAT
+ rth->rt_dst_map = key.dst;
+ rth->rt_src_map = key.src;
+#endif
+#ifdef CONFIG_NET_CLS_ROUTE
+ rth->u.dst.tclassid = itag;
+#endif
+ rth->rt_iif =
+ rth->key.iif = dev->ifindex;
+ rth->u.dst.dev = &loopback_dev;
+ rth->key.oif = 0;
+ rth->rt_gateway = daddr;
+ rth->rt_spec_dst= spec_dst;
+ rth->u.dst.input= ip_local_deliver;
+ rth->rt_flags = flags|RTCF_LOCAL;
+ if (res.type == RTN_UNREACHABLE) {
+ rth->u.dst.input= ip_error;
+ rth->u.dst.error= -err;
+ rth->rt_flags &= ~RTCF_LOCAL;
+ }
+ rth->rt_type = res.type;
+ return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+
+no_route:
+ spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+ res.type = RTN_UNREACHABLE;
+ goto local_input;
+
+ /*
+ * Do not cache martian addresses: they should be logged (RFC1812)
+ */
+martian_destination:
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
+ printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name);
+#endif
+ return -EINVAL;
+
+martian_source:
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
+		/*
+		 * RFC1812 recommendation: if the source is martian,
+		 * the only hint is the MAC header.
+		 */
+ printk(KERN_WARNING "martian source %08x for %08x, dev %s\n", saddr, daddr, dev->name);
+ if (dev->hard_header_len) {
+ int i;
+ unsigned char *p = skb->mac.raw;
+ printk(KERN_WARNING "ll header:");
+ for (i=0; i<dev->hard_header_len; i++, p++)
+ printk(" %02x", *p);
+ printk("\n");
+ }
+ }
+#endif
+ return -EINVAL;
+}
+
+int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
+ u8 tos, struct device *dev)
+{
+ struct rtable * rth;
+ unsigned hash;
+ int iif = dev->ifindex;
+
+ tos &= IPTOS_TOS_MASK;
+ hash = rt_hash_code(daddr, saddr^(iif<<5), tos);
+
+ for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
+ if (rth->key.dst == daddr &&
+ rth->key.src == saddr &&
+ rth->key.iif == iif &&
+ rth->key.oif == 0 &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->key.fwmark == skb->fwmark &&
+#endif
+ rth->key.tos == tos) {
+ rth->u.dst.lastuse = jiffies;
+ atomic_inc(&rth->u.dst.use);
+ atomic_inc(&rth->u.dst.refcnt);
+ skb->dst = (struct dst_entry*)rth;
+ return 0;
+ }
+ }
+
+	/* Multicast recognition logic was moved from the route cache
+	   to here. The problem was that too many Ethernet cards have
+	   broken/missing hardware multicast filters :-( As a result, a
+	   host on a multicast network acquires a lot of useless route
+	   cache entries, e.g. from SDR messages from all over the world.
+	   Now we try to get rid of them. Really, provided the software
+	   IP multicast filter is organized reasonably (at least, hashed),
+	   this does not result in a slowdown compared with route cache
+	   reject entries.
+	   Note that multicast routers are not affected, because a
+	   route cache entry is created eventually.
+	 */
+ if (MULTICAST(daddr)) {
+ int our = ip_check_mc(dev, daddr);
+ if (!our
+#ifdef CONFIG_IP_MROUTE
+ && (LOCAL_MCAST(daddr) || !dev->ip_ptr ||
+ !IN_DEV_MFORWARD((struct in_device*)dev->ip_ptr))
+#endif
+ ) return -EINVAL;
+ return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
+ }
+ return ip_route_input_slow(skb, daddr, saddr, tos, dev);
+}
+
+/*
+ * Major route resolver routine.
+ */
+
+int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
+{
+ struct rt_key key;
+ struct fib_result res;
+ unsigned flags = 0;
+ struct rtable *rth;
+ struct device *dev_out = NULL;
+ unsigned hash;
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ u32 nochecksrc = (tos & RTO_TPROXY);
+#endif
+
+ tos &= IPTOS_TOS_MASK|RTO_ONLINK;
+ key.dst = daddr;
+ key.src = saddr;
+ key.tos = tos&IPTOS_TOS_MASK;
+ key.iif = loopback_dev.ifindex;
+ key.oif = oif;
+ key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
+ res.fi = NULL;
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ res.r = NULL;
+#endif
+
+ if (saddr) {
+ if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr))
+ return -EINVAL;
+
+ /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+ dev_out = ip_dev_find(saddr);
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ /* If address is not local, test for transparent proxy flag;
+ if address is local --- clear the flag.
+ */
+ if (dev_out == NULL) {
+ if (nochecksrc == 0 || inet_addr_type(saddr) != RTN_UNICAST)
+ return -EINVAL;
+ flags |= RTCF_TPROXY;
+ }
+#else
+ if (dev_out == NULL)
+ return -EINVAL;
+#endif
+
+		/* I removed the check for oif == dev_out->oif here.
+		   It was wrong for two reasons:
+		   1. ip_dev_find(saddr) can return the wrong iface, if saddr
+		      is assigned to multiple interfaces.
+		   2. Moreover, we are allowed to send packets with saddr
+		      of another iface. --ANK
+		 */
+
+ if (oif == 0 &&
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ dev_out &&
+#endif
+ (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) {
+			/* Special hack: the user can direct multicasts
+			   and limited broadcast via the necessary interface
+			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
+			   This hack is not just for fun, it allows
+			   vic, vat and friends to work.
+			   They bind a socket to loopback, set the ttl to zero
+			   and expect that it will work.
+			   From the viewpoint of the routing cache they are
+			   broken, because we are not allowed to build a
+			   multicast path with a loopback source addr (look,
+			   the routing cache cannot know that the ttl is zero,
+			   hence that the packet will not leave this host and
+			   the route is valid).
+			   Luckily, this hack is a good workaround.
+			 */
+
+ key.oif = dev_out->ifindex;
+ goto make_route;
+ }
+ dev_out = NULL;
+ }
+ if (oif) {
+ dev_out = dev_get_by_index(oif);
+ if (dev_out == NULL)
+ return -ENODEV;
+ if (dev_out->ip_ptr == NULL)
+ return -ENODEV; /* Wrong error code */
+
+ if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) {
+ if (!key.src)
+ key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
+ goto make_route;
+ }
+ if (!key.src) {
+ if (MULTICAST(daddr))
+ key.src = inet_select_addr(dev_out, 0, key.scope);
+ else if (!daddr)
+ key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST);
+ }
+ }
+
+ if (!key.dst) {
+ key.dst = key.src;
+ if (!key.dst)
+ key.dst = key.src = htonl(INADDR_LOOPBACK);
+ dev_out = &loopback_dev;
+ key.oif = loopback_dev.ifindex;
+ res.type = RTN_LOCAL;
+ flags |= RTCF_LOCAL;
+ goto make_route;
+ }
+
+ if (fib_lookup(&key, &res)) {
+ res.fi = NULL;
+ if (oif) {
+			/* Apparently, routing tables are wrong. Assume
+			   that the destination is on-link.
+
+			   WHY? DW.
+			   Because we are allowed to send to an iface
+			   even if it has NO routes and NO assigned
+			   addresses. When oif is specified, routing
+			   tables are looked up with only one purpose:
+			   to catch whether the destination is gatewayed,
+			   rather than direct. Moreover, if MSG_DONTROUTE
+			   is set, we send the packet, ignoring both routing
+			   tables and ifaddr state. --ANK
+
+			   We could do this even if oif is unknown,
+			   likely as IPv6 does, but we do not.
+			 */
+
+ if (key.src == 0)
+ key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
+ res.type = RTN_UNICAST;
+ goto make_route;
+ }
+ return -ENETUNREACH;
+ }
+
+ if (res.type == RTN_NAT)
+ return -EINVAL;
+
+ if (res.type == RTN_LOCAL) {
+ if (!key.src)
+ key.src = key.dst;
+ dev_out = &loopback_dev;
+ key.oif = dev_out->ifindex;
+ res.fi = NULL;
+ flags |= RTCF_LOCAL;
+ goto make_route;
+ }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (res.fi->fib_nhs > 1 && key.oif == 0)
+ fib_select_multipath(&key, &res);
+ else
+#endif
+ if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0)
+ fib_select_default(&key, &res);
+
+ if (!key.src)
+ key.src = FIB_RES_PREFSRC(res);
+
+ dev_out = FIB_RES_DEV(res);
+ key.oif = dev_out->ifindex;
+
+make_route:
+ if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
+ return -EINVAL;
+
+ if (key.dst == 0xFFFFFFFF)
+ res.type = RTN_BROADCAST;
+ else if (MULTICAST(key.dst))
+ res.type = RTN_MULTICAST;
+ else if (BADCLASS(key.dst) || ZERONET(key.dst))
+ return -EINVAL;
+
+ if (dev_out->flags&IFF_LOOPBACK)
+ flags |= RTCF_LOCAL;
+
+ if (res.type == RTN_BROADCAST) {
+ flags |= RTCF_BROADCAST|RTCF_LOCAL;
+ res.fi = NULL;
+ } else if (res.type == RTN_MULTICAST) {
+ flags |= RTCF_MULTICAST|RTCF_LOCAL;
+ if (!ip_check_mc(dev_out, daddr))
+ flags &= ~RTCF_LOCAL;
+		/* If a multicast route does not exist, use the
+		   default one, but do not gateway in this case.
+		   Yes, it is a hack.
+		 */
+ if (res.fi && res.prefixlen < 4)
+ res.fi = NULL;
+ }
+
+ rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+ if (!rth)
+ return -ENOBUFS;
+
+ atomic_set(&rth->u.dst.use, 1);
+ rth->key.dst = daddr;
+ rth->key.tos = tos;
+ rth->key.src = saddr;
+ rth->key.iif = 0;
+ rth->key.oif = oif;
+ rth->rt_dst = key.dst;
+ rth->rt_src = key.src;
+#ifdef CONFIG_IP_ROUTE_NAT
+ rth->rt_dst_map = key.dst;
+ rth->rt_src_map = key.src;
+#endif
+ rth->rt_iif = oif ? : dev_out->ifindex;
+ rth->u.dst.dev = dev_out;
+ rth->rt_gateway = key.dst;
+ rth->rt_spec_dst= key.src;
+
+ rth->u.dst.output=ip_output;
+
+ if (flags&RTCF_LOCAL) {
+ rth->u.dst.input = ip_local_deliver;
+ rth->rt_spec_dst = key.dst;
+ }
+ if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
+ rth->rt_spec_dst = key.src;
+ if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
+ rth->u.dst.output = ip_mc_output;
+#ifdef CONFIG_IP_MROUTE
+ if (res.type == RTN_MULTICAST && dev_out->ip_ptr) {
+ struct in_device *in_dev = dev_out->ip_ptr;
+ if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) {
+ rth->u.dst.input = ip_mr_input;
+ rth->u.dst.output = ip_mc_output;
+ }
+ }
+#endif
+ }
+
+ rt_set_nexthop(rth, &res, 0);
+
+ rth->rt_flags = flags;
+
+ hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
+ return rt_intern_hash(hash, rth, rp);
+}
+
+int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
+{
+ unsigned hash;
+ struct rtable *rth;
+
+ hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
+
+ start_bh_atomic();
+ for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
+ if (rth->key.dst == daddr &&
+ rth->key.src == saddr &&
+ rth->key.iif == 0 &&
+ rth->key.oif == oif &&
+#ifndef CONFIG_IP_TRANSPARENT_PROXY
+ rth->key.tos == tos
+#else
+ !((rth->key.tos^tos)&(IPTOS_TOS_MASK|RTO_ONLINK)) &&
+ ((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
+#endif
+ ) {
+ rth->u.dst.lastuse = jiffies;
+ atomic_inc(&rth->u.dst.use);
+ atomic_inc(&rth->u.dst.refcnt);
+ end_bh_atomic();
+ *rp = rth;
+ return 0;
+ }
+ }
+ end_bh_atomic();
+
+ return ip_route_output_slow(rp, daddr, saddr, tos, oif);
+}
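+
+/*
+ * Usage sketch (editor's addition; the error handling is illustrative):
+ * resolve an output route, letting the resolver pick the source
+ * address and device, then release it when done:
+ *
+ *	struct rtable *rt;
+ *
+ *	if (ip_route_output(&rt, daddr, 0, tos, 0))
+ *		return -ENETUNREACH;
+ *	... transmit using rt->u.dst ...
+ *	ip_rt_put(rt);
+ */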
+
+#ifdef CONFIG_RTNETLINK
+
+static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
+{
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct rtmsg *r;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb->tail;
+ struct rta_cacheinfo ci;
+#ifdef CONFIG_IP_MROUTE
+ struct rtattr *eptr;
+#endif
+ struct rtattr *mx;
+
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
+ r = NLMSG_DATA(nlh);
+ nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
+ r->rtm_family = AF_INET;
+ r->rtm_dst_len = 32;
+ r->rtm_src_len = 0;
+ r->rtm_tos = rt->key.tos;
+ r->rtm_table = RT_TABLE_MAIN;
+ r->rtm_type = rt->rt_type;
+ r->rtm_scope = RT_SCOPE_UNIVERSE;
+ r->rtm_protocol = RTPROT_UNSPEC;
+ r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
+ if (rt->rt_flags & RTCF_NOTIFY)
+ r->rtm_flags |= RTM_F_NOTIFY;
+ RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
+ if (rt->key.src) {
+ r->rtm_src_len = 32;
+ RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
+ }
+ if (rt->u.dst.dev)
+ RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (rt->u.dst.tclassid)
+ RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
+#endif
+ if (rt->key.iif)
+ RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
+ else if (rt->rt_src != rt->key.src)
+ RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
+ if (rt->rt_dst != rt->rt_gateway)
+ RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
+ mx = (struct rtattr*)skb->tail;
+ RTA_PUT(skb, RTA_METRICS, 0, NULL);
+ if (rt->u.dst.mxlock)
+ RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock);
+ if (rt->u.dst.pmtu)
+ RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
+ if (rt->u.dst.window)
+ RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window);
+ if (rt->u.dst.rtt)
+ RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
+ mx->rta_len = skb->tail - (u8*)mx;
+ if (mx->rta_len == RTA_LENGTH(0))
+ skb_trim(skb, (u8*)mx - skb->data);
+ ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
+ ci.rta_used = atomic_read(&rt->u.dst.refcnt);
+ ci.rta_clntref = atomic_read(&rt->u.dst.use);
+ if (rt->u.dst.expires)
+ ci.rta_expires = rt->u.dst.expires - jiffies;
+ else
+ ci.rta_expires = 0;
+ ci.rta_error = rt->u.dst.error;
+#ifdef CONFIG_IP_MROUTE
+ eptr = (struct rtattr*)skb->tail;
+#endif
+ RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
+ if (rt->key.iif) {
+#ifdef CONFIG_IP_MROUTE
+ u32 dst = rt->rt_dst;
+
+ if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
+ int err = ipmr_get_route(skb, r, nowait);
+ if (err <= 0) {
+ if (!nowait) {
+ if (err == 0)
+ return 0;
+ goto nlmsg_failure;
+ } else {
+ if (err == -EMSGSIZE)
+ goto nlmsg_failure;
+ ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
+ }
+ }
+ } else
+#endif
+ {
+ RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
+ }
+ }
+
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
+{
+ struct rtattr **rta = arg;
+ struct rtmsg *rtm = NLMSG_DATA(nlh);
+ struct rtable *rt = NULL;
+ u32 dst = 0;
+ u32 src = 0;
+ int iif = 0;
+ int err;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (skb == NULL)
+ return -ENOBUFS;
+
+	/* Reserve room for dummy headers; this skb can pass
+	   through a good chunk of the routing engine.
+	 */
+ skb->mac.raw = skb->data;
+ skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+
+ if (rta[RTA_SRC-1])
+ memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4);
+ if (rta[RTA_DST-1])
+ memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4);
+ if (rta[RTA_IIF-1])
+ memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
+
+ if (iif) {
+ struct device *dev;
+ dev = dev_get_by_index(iif);
+ if (!dev)
+ return -ENODEV;
+ skb->protocol = __constant_htons(ETH_P_IP);
+ skb->dev = dev;
+ start_bh_atomic();
+ err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
+ end_bh_atomic();
+ rt = (struct rtable*)skb->dst;
+ if (!err && rt->u.dst.error)
+ err = -rt->u.dst.error;
+ } else {
+ int oif = 0;
+ if (rta[RTA_OIF-1])
+ memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
+ err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
+ }
+ if (err) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ skb->dst = &rt->u.dst;
+ if (rtm->rtm_flags & RTM_F_NOTIFY)
+ rt->rt_flags |= RTCF_NOTIFY;
+
+ NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
+
+ err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
+ if (err == 0)
+ return 0;
+ if (err < 0)
+ return -EMSGSIZE;
+
+ err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+
+int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rtable *rt;
+ int h, s_h;
+ int idx, s_idx;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+ for (h=0; h < RT_HASH_DIVISOR; h++) {
+ if (h < s_h) continue;
+ if (h > s_h)
+ s_idx = 0;
+ start_bh_atomic();
+ for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
+ if (idx < s_idx)
+ continue;
+ skb->dst = dst_clone(&rt->u.dst);
+ if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
+ dst_release(xchg(&skb->dst, NULL));
+ end_bh_atomic();
+ goto done;
+ }
+ dst_release(xchg(&skb->dst, NULL));
+ }
+ end_bh_atomic();
+ }
+
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+ return skb->len;
+}
+
+#endif /* CONFIG_RTNETLINK */
+
+void ip_rt_multicast_event(struct in_device *in_dev)
+{
+ rt_cache_flush(0);
+}
+
+
+
+#ifdef CONFIG_SYSCTL
+
+static int flush_delay;
+
+static
+int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
+ void *buffer, size_t *lenp)
+{
+ if (write) {
+ proc_dointvec(ctl, write, filp, buffer, lenp);
+ rt_cache_flush(flush_delay);
+ return 0;
+ } else
+ return -EINVAL;
+}
+
+static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name, int nlen,
+ void *oldval, size_t *oldlenp,
+ void *newval, size_t newlen,
+ void **context)
+{
+ int delay;
+ if (newlen != sizeof(int))
+ return -EINVAL;
+ if (get_user(delay,(int *)newval))
+ return -EFAULT;
+ rt_cache_flush(delay);
+ return 0;
+}
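+
+/*
+ * Editor's note: the table below is registered under the "route"
+ * directory of the IPv4 sysctl tree, so e.g. writing an integer to
+ * /proc/sys/net/ipv4/route/flush invokes ipv4_sysctl_rtcache_flush(),
+ * which calls rt_cache_flush(flush_delay).
+ */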
+
+ctl_table ipv4_route_table[] = {
+ {NET_IPV4_ROUTE_FLUSH, "flush",
+ &flush_delay, sizeof(int), 0644, NULL,
+ &ipv4_sysctl_rtcache_flush, &ipv4_sysctl_rtcache_flush_strategy },
+ {NET_IPV4_ROUTE_MIN_DELAY, "min_delay",
+ &ip_rt_min_delay, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies, &sysctl_jiffies},
+ {NET_IPV4_ROUTE_MAX_DELAY, "max_delay",
+ &ip_rt_max_delay, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies, &sysctl_jiffies},
+ {NET_IPV4_ROUTE_GC_THRESH, "gc_thresh",
+ &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_ROUTE_MAX_SIZE, "max_size",
+ &ip_rt_max_size, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
+ &ip_rt_gc_min_interval, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies, &sysctl_jiffies},
+ {NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout",
+ &ip_rt_gc_timeout, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies, &sysctl_jiffies},
+ {NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval",
+ &ip_rt_gc_interval, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies, &sysctl_jiffies},
+ {NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load",
+ &ip_rt_redirect_load, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number",
+ &ip_rt_redirect_number, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence",
+ &ip_rt_redirect_silence, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_ROUTE_ERROR_COST, "error_cost",
+ &ip_rt_error_cost, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_ROUTE_ERROR_BURST, "error_burst",
+ &ip_rt_error_burst, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
+ &ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
+ &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies, &sysctl_jiffies},
+ {0}
+};
+#endif
+
+#ifdef CONFIG_NET_CLS_ROUTE
+struct ip_rt_acct ip_rt_acct[256];
+
+#ifdef CONFIG_PROC_FS
+static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
+ int length, int *eof, void *data)
+{
+ *start=buffer;
+
+ if (offset + length > sizeof(ip_rt_acct)) {
+ length = sizeof(ip_rt_acct) - offset;
+ *eof = 1;
+ }
+ if (length > 0) {
+ start_bh_atomic();
+ memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length);
+ end_bh_atomic();
+ return length;
+ }
+ return 0;
+}
+#endif
+#endif
+
+
+__initfunc(void ip_rt_init(void))
+{
+#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_NET_CLS_ROUTE
+ struct proc_dir_entry *ent;
+#endif
+#endif
+ devinet_init();
+ ip_fib_init();
+ rt_periodic_timer.function = rt_check_expire;
+	/* All the timers started at system startup tend
+	   to synchronize. Perturb them a bit.
+	 */
+ rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval
+ + ip_rt_gc_interval;
+ add_timer(&rt_periodic_timer);
+
+#ifdef CONFIG_PROC_FS
+ proc_net_register(&(struct proc_dir_entry) {
+ PROC_NET_RTCACHE, 8, "rt_cache",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ rt_cache_get_info
+ });
+#ifdef CONFIG_NET_CLS_ROUTE
+ ent = create_proc_entry("net/rt_acct", 0, 0);
+ ent->read_proc = ip_rt_acct_read;
+#endif
+#endif
+}
diff --git a/pfinet/linux-src/net/ipv4/syncookies.c b/pfinet/linux-src/net/ipv4/syncookies.c
new file mode 100644
index 00000000..fb4e8f80
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/syncookies.c
@@ -0,0 +1,201 @@
+/*
+ * Syncookies implementation for the Linux kernel
+ *
+ * Copyright (C) 1997 Andi Kleen
+ * Based on ideas by D.J.Bernstein and Eric Schenk.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * $Id: syncookies.c,v 1.7.2.1 1999/08/08 08:43:13 davem Exp $
+ *
+ * Missing: IPv6 support.
+ */
+
+#include <linux/config.h>
+#if defined(CONFIG_SYN_COOKIES)
+#include <linux/tcp.h>
+#include <linux/malloc.h>
+#include <linux/random.h>
+#include <net/tcp.h>
+
+extern int sysctl_tcp_syncookies;
+
+static unsigned long tcp_lastsynq_overflow;
+
+/*
+ * This table has to be sorted and terminated with (__u16)-1.
+ * XXX generate a better table.
+ * Unresolved Issues: HIPPI with a 64k MSS is not well supported.
+ */
+static __u16 const msstab[] = {
+ 64-1,
+ 256-1,
+ 512-1,
+ 536-1,
+ 1024-1,
+ 1440-1,
+ 1460-1,
+ 4312-1,
+ (__u16)-1
+};
+/* The number doesn't include the -1 terminator */
+#define NUM_MSS (sizeof(msstab)/sizeof(msstab[0]) - 1)
+
+/*
+ * Generate a syncookie. mssp points to the mss, which is returned
+ * rounded down to the value encoded in the cookie.
+ */
+__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb,
+ __u16 *mssp)
+{
+ int mssind;
+ const __u16 mss = *mssp;
+
+ tcp_lastsynq_overflow = jiffies;
+ /* XXX sort msstab[] by probability? Binary search? */
+ for (mssind = 0; mss > msstab[mssind+1]; mssind++)
+ ;
+ *mssp = msstab[mssind]+1;
+
+ net_statistics.SyncookiesSent++;
+
+ return secure_tcp_syn_cookie(skb->nh.iph->saddr, skb->nh.iph->daddr,
+ skb->h.th->source, skb->h.th->dest,
+ ntohl(skb->h.th->seq),
+ jiffies / (HZ*60), mssind);
+}
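+
+/*
+ * Worked example (editor's addition): for a SYN advertising mss 1400,
+ * the loop above advances while mss > msstab[mssind+1]; it stops at
+ * mssind == 4, since 1400 > 1023 but not > 1439, so the mss echoed
+ * back and encoded in the cookie is msstab[4]+1 == 1024.
+ */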
+
+/*
+ * This (misnamed) value is the permitted age of a syncookie.
+ * Its ideal value should depend on TCP_TIMEOUT_INIT and
+ * sysctl_tcp_retries1. It's a rather complicated formula (exponential
+ * backoff) to compute at runtime, so it's currently hardcoded here.
+ */
+#define COUNTER_TRIES 4
+/*
+ * Check if an ACK sequence number is a valid syncookie.
+ * Return the decoded mss if it is, or 0 if not.
+ */
+static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
+{
+ __u32 seq;
+ __u32 mssind;
+
+ if ((jiffies - tcp_lastsynq_overflow) > TCP_TIMEOUT_INIT)
+ return 0;
+
+ seq = ntohl(skb->h.th->seq)-1;
+ mssind = check_tcp_syn_cookie(cookie,
+ skb->nh.iph->saddr, skb->nh.iph->daddr,
+ skb->h.th->source, skb->h.th->dest,
+ seq, jiffies/(HZ*60), COUNTER_TRIES);
+
+ return mssind < NUM_MSS ? msstab[mssind]+1 : 0;
+}
+
+extern struct or_calltable or_ipv4;
+
+static inline struct sock *
+get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct open_request *req,
+ struct dst_entry *dst)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ sk = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
+ req->sk = sk;
+
+ /* Queue up for accept() */
+ tcp_synq_queue(tp, req);
+
+ return sk;
+}
+
+struct sock *
+cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt)
+{
+ __u32 cookie = ntohl(skb->h.th->ack_seq)-1;
+ struct open_request *req;
+ int mss;
+ struct rtable *rt;
+ __u8 rcv_wscale;
+
+ if (!sysctl_tcp_syncookies)
+ return sk;
+ if (!skb->h.th->ack)
+ return sk;
+
+ mss = cookie_check(skb, cookie);
+ if (mss == 0) {
+ net_statistics.SyncookiesFailed++;
+ return sk;
+ }
+
+ net_statistics.SyncookiesRecv++;
+
+ req = tcp_openreq_alloc();
+ if (req == NULL)
+ return NULL;
+
+ req->rcv_isn = htonl(skb->h.th->seq)-1;
+ req->snt_isn = cookie;
+ req->mss = mss;
+ req->rmt_port = skb->h.th->source;
+ req->af.v4_req.loc_addr = skb->nh.iph->daddr;
+ req->af.v4_req.rmt_addr = skb->nh.iph->saddr;
+ req->class = &or_ipv4; /* for safety */
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ req->lcl_port = skb->h.th->dest;
+#endif
+
+ req->af.v4_req.opt = NULL;
+
+	/* We threw away the options of the initial SYN, so we hope
+	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
+	 */
+ if (opt && opt->optlen) {
+ int opt_size = sizeof(struct ip_options) + opt->optlen;
+
+ req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC);
+ if (req->af.v4_req.opt) {
+ if (ip_options_echo(req->af.v4_req.opt, skb)) {
+ kfree_s(req->af.v4_req.opt, opt_size);
+ req->af.v4_req.opt = NULL;
+ }
+ }
+ }
+
+ req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0;
+ req->wscale_ok = 0;
+ req->expires = 0UL;
+ req->retrans = 0;
+
+	/*
+	 * We need to look up the route here to get the correct
+	 * window size. We really should make sure that the window size
+	 * hasn't changed since we received the original SYN, but I see
+	 * no easy way to do this.
+	 */
+ if (ip_route_output(&rt,
+ opt &&
+ opt->srr ? opt->faddr : req->af.v4_req.rmt_addr,
+ req->af.v4_req.loc_addr,
+ sk->ip_tos | RTO_CONN,
+ 0)) {
+ tcp_openreq_free(req);
+ return NULL;
+ }
+
+ /* Try to redo what tcp_v4_send_synack did. */
+ req->window_clamp = rt->u.dst.window;
+ tcp_select_initial_window(sock_rspace(sk)/2,req->mss,
+ &req->rcv_wnd, &req->window_clamp,
+ 0, &rcv_wscale);
+ req->rcv_wscale = rcv_wscale;
+
+ return get_cookie_sock(sk, skb, req, &rt->u.dst);
+}
+
+#endif
diff --git a/pfinet/linux-src/net/ipv4/sysctl_net_ipv4.c b/pfinet/linux-src/net/ipv4/sysctl_net_ipv4.c
new file mode 100644
index 00000000..e578e4e7
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/sysctl_net_ipv4.c
@@ -0,0 +1,205 @@
+/*
+ * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
+ *
+ * $Id: sysctl_net_ipv4.c,v 1.38.2.1 1999/08/08 08:43:14 davem Exp $
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/config.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp.h>
+
+/*
+ * TCP configuration parameters
+ */
+
+#define TCP_PMTU_DISC 0x00000001 /* perform PMTU discovery */
+#define TCP_CONG_AVOID 0x00000002 /* congestion avoidance algorithm */
+#define TCP_DELAY_ACKS	0x00000003	/* delayed ACK strategy */
+
+#if 0
+static int boolean_min = 0;
+static int boolean_max = 1;
+#endif
+
+/* From icmp.c */
+extern int sysctl_icmp_echo_ignore_all;
+extern int sysctl_icmp_echo_ignore_broadcasts;
+extern int sysctl_icmp_ignore_bogus_error_responses;
+
+/* From ip_fragment.c */
+extern int sysctl_ipfrag_low_thresh;
+extern int sysctl_ipfrag_high_thresh;
+extern int sysctl_ipfrag_time;
+
+/* From ip_output.c */
+extern int sysctl_ip_dynaddr;
+
+/* From ip_masq.c */
+extern int sysctl_ip_masq_debug;
+
+extern int sysctl_tcp_timestamps;
+extern int sysctl_tcp_window_scaling;
+extern int sysctl_tcp_sack;
+extern int sysctl_tcp_retrans_collapse;
+extern int sysctl_tcp_keepalive_time;
+extern int sysctl_tcp_keepalive_probes;
+extern int sysctl_tcp_max_ka_probes;
+extern int sysctl_tcp_retries1;
+extern int sysctl_tcp_retries2;
+extern int sysctl_tcp_fin_timeout;
+extern int sysctl_tcp_syncookies;
+extern int sysctl_tcp_syn_retries;
+extern int sysctl_tcp_stdurg;
+extern int sysctl_tcp_rfc1337;
+extern int sysctl_tcp_syn_taildrop;
+extern int sysctl_max_syn_backlog;
+
+/* From icmp.c */
+extern int sysctl_icmp_destunreach_time;
+extern int sysctl_icmp_timeexceed_time;
+extern int sysctl_icmp_paramprob_time;
+extern int sysctl_icmp_echoreply_time;
+
+/* From igmp.c */
+extern int sysctl_igmp_max_memberships;
+
+int tcp_retr1_max = 255;
+
+struct ipv4_config ipv4_config;
+
+extern ctl_table ipv4_route_table[];
+
+#ifdef CONFIG_SYSCTL
+
+static
+int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
+ void *buffer, size_t *lenp)
+{
+ int val = ipv4_devconf.forwarding;
+ int ret;
+
+ ret = proc_dointvec(ctl, write, filp, buffer, lenp);
+
+ if (write && ipv4_devconf.forwarding != val)
+ inet_forward_change();
+
+ return ret;
+}
+
+static int ipv4_sysctl_forward_strategy(ctl_table *table, int *name, int nlen,
+ void *oldval, size_t *oldlenp,
+ void *newval, size_t newlen,
+ void **context)
+{
+ int new;
+ if (newlen != sizeof(int))
+ return -EINVAL;
+ if (get_user(new,(int *)newval))
+ return -EFAULT;
+ if (new != ipv4_devconf.forwarding)
+ inet_forward_change();
+	return 0; /* caller does the change again and handles oldval */
+}
+
+ctl_table ipv4_table[] = {
+ {NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps",
+ &sysctl_tcp_timestamps, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling",
+ &sysctl_tcp_window_scaling, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_TCP_SACK, "tcp_sack",
+ &sysctl_tcp_sack, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse",
+ &sysctl_tcp_retrans_collapse, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_FORWARD, "ip_forward",
+ &ipv4_devconf.forwarding, sizeof(int), 0644, NULL,
+ &ipv4_sysctl_forward,&ipv4_sysctl_forward_strategy},
+ {NET_IPV4_DEFAULT_TTL, "ip_default_ttl",
+ &ip_statistics.IpDefaultTTL, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_AUTOCONFIG, "ip_autoconfig",
+ &ipv4_config.autoconfig, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc",
+ &ipv4_config.no_pmtu_disc, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries",
+ &sysctl_tcp_syn_retries, sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh",
+ &sysctl_ipfrag_high_thresh, sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh",
+ &sysctl_ipfrag_low_thresh, sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_IPV4_DYNADDR, "ip_dynaddr",
+ &sysctl_ip_dynaddr, sizeof(int), 0644, NULL, &proc_dointvec},
+#ifdef CONFIG_IP_MASQUERADE
+ {NET_IPV4_IP_MASQ_DEBUG, "ip_masq_debug",
+ &sysctl_ip_masq_debug, sizeof(int), 0644, NULL, &proc_dointvec},
+#endif
+ {NET_IPV4_IPFRAG_TIME, "ipfrag_time",
+ &sysctl_ipfrag_time, sizeof(int), 0644, NULL, &proc_dointvec_jiffies,
+ &sysctl_jiffies},
+ {NET_IPV4_TCP_MAX_KA_PROBES, "tcp_max_ka_probes",
+ &sysctl_tcp_max_ka_probes, sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time",
+ &sysctl_tcp_keepalive_time, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies, &sysctl_jiffies},
+ {NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes",
+ &sysctl_tcp_keepalive_probes, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_TCP_RETRIES1, "tcp_retries1",
+ &sysctl_tcp_retries1, sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+ &sysctl_intvec, NULL, NULL, &tcp_retr1_max},
+ {NET_IPV4_TCP_RETRIES2, "tcp_retries2",
+ &sysctl_tcp_retries2, sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout",
+ &sysctl_tcp_fin_timeout, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies, &sysctl_jiffies},
+#ifdef CONFIG_SYN_COOKIES
+ {NET_TCP_SYNCOOKIES, "tcp_syncookies",
+ &sysctl_tcp_syncookies, sizeof(int), 0644, NULL, &proc_dointvec},
+#endif
+ {NET_TCP_STDURG, "tcp_stdurg", &sysctl_tcp_stdurg,
+ sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_TCP_RFC1337, "tcp_rfc1337", &sysctl_tcp_rfc1337,
+ sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog", &sysctl_max_syn_backlog,
+ sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range",
+ &sysctl_local_port_range, sizeof(sysctl_local_port_range), 0644,
+ NULL, &proc_dointvec},
+ {NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all",
+ &sysctl_icmp_echo_ignore_all, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts",
+ &sysctl_icmp_echo_ignore_broadcasts, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses",
+ &sysctl_icmp_ignore_bogus_error_responses, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_ICMP_DESTUNREACH_RATE, "icmp_destunreach_rate",
+ &sysctl_icmp_destunreach_time, sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_IPV4_ICMP_TIMEEXCEED_RATE, "icmp_timeexceed_rate",
+ &sysctl_icmp_timeexceed_time, sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_IPV4_ICMP_PARAMPROB_RATE, "icmp_paramprob_rate",
+ &sysctl_icmp_paramprob_time, sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_IPV4_ICMP_ECHOREPLY_RATE, "icmp_echoreply_rate",
+ &sysctl_icmp_echoreply_time, sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_IPV4_ROUTE, "route", NULL, 0, 0555, ipv4_route_table},
+#ifdef CONFIG_IP_MULTICAST
+ {NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships",
+ &sysctl_igmp_max_memberships, sizeof(int), 0644, NULL, &proc_dointvec},
+#endif
+ {0}
+};
+
+#endif /* CONFIG_SYSCTL */
diff --git a/pfinet/linux-src/net/ipv4/tcp.c b/pfinet/linux-src/net/ipv4/tcp.c
new file mode 100644
index 00000000..65763215
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/tcp.c
@@ -0,0 +1,1826 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version: $Id: tcp.c,v 1.140.2.4 1999/08/09 03:13:12 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ *
+ * Fixes:
+ * Alan Cox : Numerous verify_area() calls
+ * Alan Cox : Set the ACK bit on a reset
+ * Alan Cox : Stopped it crashing if it closed while
+ * sk->inuse=1 and was trying to connect
+ * (tcp_err()).
+ * Alan Cox : All icmp error handling was broken
+ * pointers passed where wrong and the
+ * socket was looked up backwards. Nobody
+ * tested any icmp error code obviously.
+ * Alan Cox : tcp_err() now handled properly. It
+ * wakes people on errors. poll
+ * behaves and the icmp error race
+ * has gone by moving it into sock.c
+ * Alan Cox : tcp_send_reset() fixed to work for
+ * everything not just packets for
+ * unknown sockets.
+ * Alan Cox : tcp option processing.
+ * Alan Cox : Reset tweaked (still not 100%) [Had
+ * syn rule wrong]
+ * Herp Rosmanith : More reset fixes
+ * Alan Cox : No longer acks invalid rst frames.
+ * Acking any kind of RST is right out.
+ * Alan Cox : Sets an ignore me flag on an rst
+ * receive otherwise odd bits of prattle
+ * escape still
+ * Alan Cox : Fixed another acking RST frame bug.
+ * Should stop LAN workplace lockups.
+ * Alan Cox : Some tidyups using the new skb list
+ * facilities
+ * Alan Cox : sk->keepopen now seems to work
+ * Alan Cox : Pulls options out correctly on accepts
+ * Alan Cox : Fixed assorted sk->rqueue->next errors
+ * Alan Cox : PSH doesn't end a TCP read. Switched a
+ * bit to skb ops.
+ * Alan Cox : Tidied tcp_data to avoid a potential
+ * nasty.
+ * Alan Cox : Added some better commenting, as the
+ * tcp is hard to follow
+ * Alan Cox : Removed incorrect check for 20 * psh
+ * Michael O'Reilly : ack < copied bug fix.
+ * Johannes Stille : Misc tcp fixes (not all in yet).
+ * Alan Cox : FIN with no memory -> CRASH
+ * Alan Cox : Added socket option proto entries.
+ * Also added awareness of them to accept.
+ * Alan Cox : Added TCP options (SOL_TCP)
+ * Alan Cox : Switched wakeup calls to callbacks,
+ * so the kernel can layer network
+ * sockets.
+ * Alan Cox : Use ip_tos/ip_ttl settings.
+ * Alan Cox : Handle FIN (more) properly (we hope).
+ * Alan Cox : RST frames sent on unsynchronised
+ * state ack error.
+ * Alan Cox : Put in missing check for SYN bit.
+ * Alan Cox : Added tcp_select_window() aka NET2E
+ * window non shrink trick.
+ * Alan Cox : Added a couple of small NET2E timer
+ * fixes
+ * Charles Hedrick : TCP fixes
+ * Toomas Tamm : TCP window fixes
+ * Alan Cox : Small URG fix to rlogin ^C ack fight
+ * Charles Hedrick : Rewrote most of it to actually work
+ * Linus : Rewrote tcp_read() and URG handling
+ * completely
+ * Gerhard Koerting: Fixed some missing timer handling
+ * Matthew Dillon : Reworked TCP machine states as per RFC
+ * Gerhard Koerting: PC/TCP workarounds
+ * Adam Caldwell : Assorted timer/timing errors
+ * Matthew Dillon : Fixed another RST bug
+ * Alan Cox : Move to kernel side addressing changes.
+ * Alan Cox : Beginning work on TCP fastpathing
+ * (not yet usable)
+ * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
+ * Alan Cox : TCP fast path debugging
+ * Alan Cox : Window clamping
+ * Michael Riepe : Bug in tcp_check()
+ * Matt Dillon : More TCP improvements and RST bug fixes
+ * Matt Dillon : Yet more small nasties remove from the
+ * TCP code (Be very nice to this man if
+ * tcp finally works 100%) 8)
+ * Alan Cox : BSD accept semantics.
+ * Alan Cox : Reset on closedown bug.
+ * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
+ * Michael Pall : Handle poll() after URG properly in
+ * all cases.
+ * Michael Pall : Undo the last fix in tcp_read_urg()
+ * (multi URG PUSH broke rlogin).
+ * Michael Pall : Fix the multi URG PUSH problem in
+ * tcp_readable(), poll() after URG
+ * works now.
+ * Michael Pall : recv(...,MSG_OOB) never blocks in the
+ * BSD api.
+ * Alan Cox : Changed the semantics of sk->socket to
+ * fix a race and a signal problem with
+ * accept() and async I/O.
+ * Alan Cox : Relaxed the rules on tcp_sendto().
+ * Yury Shevchuk : Really fixed accept() blocking problem.
+ * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
+ * clients/servers which listen in on
+ * fixed ports.
+ * Alan Cox : Cleaned the above up and shrank it to
+ * a sensible code size.
+ * Alan Cox : Self connect lockup fix.
+ * Alan Cox : No connect to multicast.
+ * Ross Biro : Close unaccepted children on master
+ * socket close.
+ * Alan Cox : Reset tracing code.
+ * Alan Cox : Spurious resets on shutdown.
+ * Alan Cox : Giant 15 minute/60 second timer error
+ * Alan Cox : Small whoops in polling before an
+ * accept.
+ * Alan Cox : Kept the state trace facility since
+ * it's handy for debugging.
+ * Alan Cox : More reset handler fixes.
+ * Alan Cox : Started rewriting the code based on
+ * the RFC's for other useful protocol
+ * references see: Comer, KA9Q NOS, and
+ * for a reference on the difference
+ * between specifications and how BSD
+ * works see the 4.4lite source.
+ * A.N.Kuznetsov : Don't time wait on completion of tidy
+ * close.
+ * Linus Torvalds : Fin/Shutdown & copied_seq changes.
+ * Linus Torvalds : Fixed BSD port reuse to work first syn
+ * Alan Cox : Reimplemented timers as per the RFC
+ * and using multiple timers for sanity.
+ * Alan Cox : Small bug fixes, and a lot of new
+ * comments.
+ * Alan Cox : Fixed dual reader crash by locking
+ * the buffers (much like datagram.c)
+ * Alan Cox : Fixed stuck sockets in probe. A probe
+ * now gets fed up of retrying without
+ * (even a no space) answer.
+ * Alan Cox : Extracted closing code better
+ * Alan Cox : Fixed the closing state machine to
+ * resemble the RFC.
+ * Alan Cox : More 'per spec' fixes.
+ * Jorge Cwik : Even faster checksumming.
+ * Alan Cox : tcp_data() doesn't ack illegal PSH
+ * only frames. At least one pc tcp stack
+ * generates them.
+ * Alan Cox : Cache last socket.
+ * Alan Cox : Per route irtt.
+ * Matt Day : poll()->select() match BSD precisely on error
+ * Alan Cox : New buffers
+ * Marc Tamsky : Various sk->prot->retransmits and
+ * sk->retransmits misupdating fixed.
+ * Fixed tcp_write_timeout: stuck close,
+ * and TCP syn retries gets used now.
+ * Mark Yarvis : In tcp_read_wakeup(), don't send an
+ * ack if state is TCP_CLOSED.
+ * Alan Cox : Look up device on a retransmit - routes may
+ * change. Doesn't yet cope with MSS shrink right
+ * but it's a start!
+ * Marc Tamsky : Closing in closing fixes.
+ * Mike Shaver : RFC1122 verifications.
+ * Alan Cox : rcv_saddr errors.
+ * Alan Cox : Block double connect().
+ * Alan Cox : Small hooks for enSKIP.
+ * Alexey Kuznetsov: Path MTU discovery.
+ * Alan Cox : Support soft errors.
+ * Alan Cox : Fix MTU discovery pathological case
+ * when the remote claims no mtu!
+ * Marc Tamsky : TCP_CLOSE fix.
+ * Colin (G3TNE) : Send a reset on syn ack replies in
+ * window but wrong (fixes NT lpd problems)
+ * Pedro Roque : Better TCP window handling, delayed ack.
+ * Joerg Reuter : No modification of locked buffers in
+ * tcp_do_retransmit()
+ * Eric Schenk : Changed receiver side silly window
+ * avoidance algorithm to BSD style
+ * algorithm. This doubles throughput
+ * against machines running Solaris,
+ * and seems to result in general
+ * improvement.
+ * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
+ * Willy Konynenberg : Transparent proxying support.
+ * Mike McLagan : Routing by source
+ * Keith Owens : Do proper merging with partial SKB's in
+ * tcp_do_sendmsg to avoid burstiness.
+ * Eric Schenk : Fix fast close down bug with
+ * shutdown() followed by close().
+ * Andi Kleen : Make poll agree with SIGIO
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Description of States:
+ *
+ * TCP_SYN_SENT sent a connection request, waiting for ack
+ *
+ * TCP_SYN_RECV received a connection request, sent ack,
+ * waiting for final ack in three-way handshake.
+ *
+ * TCP_ESTABLISHED connection established
+ *
+ * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
+ * transmission of remaining buffered data
+ *
+ * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
+ * to shutdown
+ *
+ * TCP_CLOSING both sides have shutdown but we still have
+ * data we have to finish sending
+ *
+ * TCP_TIME_WAIT timeout to catch resent junk before entering
+ * closed, can only be entered from FIN_WAIT2
+ * or CLOSING. Required because the other end
+ * may not have gotten our last ACK causing it
+ * to retransmit the data packet (which we ignore)
+ *
+ * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
+ * us to finish writing our data and to shutdown
+ * (we have to close() to move on to LAST_ACK)
+ *
+ * TCP_LAST_ACK our side has shutdown after remote has
+ * shutdown. There may still be data in our
+ * buffer that we have to finish sending
+ *
+ * TCP_CLOSE socket is finished
+ */
+
+/*
+ * RFC1122 status:
+ * NOTE: I'm not going to be doing comments in the code for this one except
+ * for violations and the like. tcp.c is just too big... If I say something
+ * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
+ * with Alan. -- MS 950903
+ * [Note: Most of the TCP code has been rewritten/redesigned since this
+ * RFC1122 check. It is probably not correct anymore. It should be redone
+ * before 2.2. -AK]
+ *
+ * Use of PSH (4.2.2.2)
+ * MAY aggregate data sent without the PSH flag. (does)
+ * MAY queue data received without the PSH flag. (does)
+ * SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
+ * MAY implement PSH on send calls. (doesn't, thus:)
+ * MUST NOT buffer data indefinitely (doesn't [1 second])
+ * MUST set PSH on last segment (does)
+ * MAY pass received PSH to application layer (doesn't)
+ * SHOULD send maximum-sized segment whenever possible. (almost always does)
+ *
+ * Window Size (4.2.2.3, 4.2.2.16)
+ * MUST treat window size as an unsigned number (does)
+ * SHOULD treat window size as a 32-bit number (does not)
+ * MUST NOT shrink window once it is offered (does not normally)
+ *
+ * Urgent Pointer (4.2.2.4)
+ * **MUST point urgent pointer to last byte of urgent data (not right
+ * after). (doesn't, to be like BSD. That's configurable, but defaults
+ * to off)
+ * MUST inform application layer asynchronously of incoming urgent
+ * data. (does)
+ * MUST provide application with means of determining the amount of
+ * urgent data pending. (does)
+ * **MUST support urgent data sequence of arbitrary length. (doesn't, but
+ * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
+ * [Follows BSD 1 byte of urgent data]
+ *
+ * TCP Options (4.2.2.5)
+ * MUST be able to receive TCP options in any segment. (does)
+ * MUST ignore unsupported options (does)
+ *
+ * Maximum Segment Size Option (4.2.2.6)
+ * MUST implement both sending and receiving MSS. (does, but currently
+ * only uses the smaller of both of them)
+ * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
+ * it always). (does, even when MSS == 536, which is legal)
+ * MUST assume MSS == 536 if no MSS received at connection setup (does)
+ * MUST calculate "effective send MSS" correctly:
+ * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
+ * (does - but allows operator override)
+ *
+ * TCP Checksum (4.2.2.7)
+ * MUST generate and check TCP checksum. (does)
+ *
+ * Initial Sequence Number Selection (4.2.2.8)
+ * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
+ * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
+ * necessary for 10Mbps networks - and harder than BSD to spoof!
+ * With syncookies we don't)
+ *
+ * Simultaneous Open Attempts (4.2.2.10)
+ * MUST support simultaneous open attempts (does)
+ *
+ * Recovery from Old Duplicate SYN (4.2.2.11)
+ * MUST keep track of active vs. passive open (does)
+ *
+ * RST segment (4.2.2.12)
+ * SHOULD allow an RST segment to contain data (does, but doesn't do
+ * anything with it, which is standard)
+ *
+ * Closing a Connection (4.2.2.13)
+ * MUST inform application of whether connection was closed by RST or
+ * normal close. (does)
+ * MAY allow "half-duplex" close (treat connection as closed for the
+ * local app, even before handshake is done). (does)
+ * MUST linger in TIME_WAIT for 2 * MSL (does)
+ *
+ * Retransmission Timeout (4.2.2.15)
+ * MUST implement Jacobson's slow start and congestion avoidance
+ * stuff. (does)
+ *
+ * Probing Zero Windows (4.2.2.17)
+ * MUST support probing of zero windows. (does)
+ * MAY keep offered window closed indefinitely. (does)
+ * MUST allow remote window to stay closed indefinitely. (does)
+ *
+ * Passive Open Calls (4.2.2.18)
+ * MUST NOT let new passive open affect other connections. (doesn't)
+ * MUST support passive opens (LISTENs) concurrently. (does)
+ *
+ * Time to Live (4.2.2.19)
+ * MUST make TCP TTL configurable. (does - IP_TTL option)
+ *
+ * Event Processing (4.2.2.20)
+ * SHOULD queue out-of-order segments. (does)
+ * MUST aggregate ACK segments whenever possible. (does but badly)
+ *
+ * Retransmission Timeout Calculation (4.2.3.1)
+ * MUST implement Karn's algorithm and Jacobson's algorithm for RTO
+ * calculation. (does, or at least explains them in the comments 8*b)
+ * SHOULD initialize RTO to 0 and RTT to 3. (does)
+ *
+ * When to Send an ACK Segment (4.2.3.2)
+ * SHOULD implement delayed ACK. (does)
+ * MUST keep ACK delay < 0.5 sec. (does)
+ *
+ * When to Send a Window Update (4.2.3.3)
+ * MUST implement receiver-side SWS. (does)
+ *
+ * When to Send Data (4.2.3.4)
+ * MUST implement sender-side SWS. (does)
+ * SHOULD implement Nagle algorithm. (does)
+ *
+ * TCP Connection Failures (4.2.3.5)
+ * MUST handle excessive retransmissions "properly" (see the RFC). (does)
+ * SHOULD inform application layer of soft errors. (does)
+ *
+ * TCP Keep-Alives (4.2.3.6)
+ * MAY provide keep-alives. (does)
+ * MUST make keep-alives configurable on a per-connection basis. (does)
+ * MUST default to no keep-alives. (does)
+ * MUST make keep-alive interval configurable. (does)
+ * MUST make default keep-alive interval > 2 hours. (does)
+ * MUST NOT interpret failure to ACK keep-alive packet as dead
+ * connection. (doesn't)
+ * SHOULD send keep-alive with no data. (does)
+ *
+ * TCP Multihoming (4.2.3.7)
+ * MUST get source address from IP layer before sending first
+ * SYN. (does)
+ * MUST use same local address for all segments of a connection. (does)
+ *
+ * IP Options (4.2.3.8)
+ * MUST ignore unsupported IP options. (does)
+ * MAY support Time Stamp and Record Route. (does)
+ * MUST allow application to specify a source route. (does)
+ * MUST allow received Source Route option to set route for all future
+ * segments on this connection. (does not (security issues))
+ *
+ * ICMP messages (4.2.3.9)
+ * MUST act on ICMP errors. (does)
+ * MUST slow transmission upon receipt of a Source Quench. (doesn't anymore
+ * because that is deprecated now by the IETF, can be turned on)
+ * MUST NOT abort connection upon receipt of soft Destination
+ * Unreachables (0, 1, 5), Time Exceededs and Parameter
+ * Problems. (doesn't)
+ * SHOULD report soft Destination Unreachables etc. to the
+ * application. (does, except during SYN_RECV and may drop messages
+ * in some rare cases before accept() - ICMP is unreliable)
+ * SHOULD abort connection upon receipt of hard Destination Unreachable
+ * messages (2, 3, 4). (does, but see above)
+ *
+ * Remote Address Validation (4.2.3.10)
+ * MUST reject as an error OPEN for invalid remote IP address. (does)
+ * MUST ignore SYN with invalid source address. (does)
+ * MUST silently discard incoming SYN for broadcast/multicast
+ * address. (does)
+ *
+ * Asynchronous Reports (4.2.4.1)
+ * MUST provide mechanism for reporting soft errors to application
+ * layer. (does)
+ *
+ * Type of Service (4.2.4.2)
+ * MUST allow application layer to set Type of Service. (does IP_TOS)
+ *
+ * (Whew. -- MS 950903)
+ * (Updated by AK, but not complete yet.)
+ **/
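+
+/* A minimal sketch of the "effective send MSS" rule from the 4.2.2.6
+ * note above, kept out of the build. The min() shape and the 20 byte
+ * IP header allowance follow RFC 1122; the parameter names
+ * phys_mtu/remote_mss/ipopt_len are hypothetical, not fields used in
+ * this file.
+ */
+#if 0
+static int effective_send_mss(int phys_mtu, int remote_mss, int ipopt_len)
+{
+	/* min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) */
+	int bound = remote_mss + 20;
+	if (phys_mtu < bound)
+		bound = phys_mtu;
+	return bound - sizeof(struct tcphdr) - ipopt_len;
+}
+#endif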
+
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+
+#include <asm/uaccess.h>
+
+int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
+
+struct tcp_mib tcp_statistics;
+
+kmem_cache_t *tcp_openreq_cachep;
+kmem_cache_t *tcp_bucket_cachep;
+kmem_cache_t *tcp_timewait_cachep;
+
+/*
+ * Find someone to 'accept'. Must be called with
+ * the socket locked or with interrupts disabled
+ */
+
+static struct open_request *tcp_find_established(struct tcp_opt *tp,
+ struct open_request **prevp)
+{
+ struct open_request *req = tp->syn_wait_queue;
+ struct open_request *prev = (struct open_request *)&tp->syn_wait_queue;
+ while(req) {
+ if (req->sk &&
+ ((1 << req->sk->state) &
+ ~(TCPF_SYN_SENT|TCPF_SYN_RECV)))
+ break;
+ prev = req;
+ req = req->dl_next;
+ }
+ *prevp = prev;
+ return req;
+}
+
+/*
+ * Walk down the receive queue counting readable data.
+ *
+ * Must be called with the socket lock held.
+ */
+
+static int tcp_readable(struct sock *sk)
+{
+ unsigned long counted;
+ unsigned long amount;
+ struct sk_buff *skb;
+ int sum;
+
+ SOCK_DEBUG(sk, "tcp_readable: %p - ",sk);
+
+ skb = skb_peek(&sk->receive_queue);
+ if (skb == NULL) {
+ SOCK_DEBUG(sk, "empty\n");
+ return(0);
+ }
+
+ counted = sk->tp_pinfo.af_tcp.copied_seq; /* Where we are at the moment */
+ amount = 0;
+
+ /* Do until a push or until we are out of data. */
+ do {
+ /* Found a hole, so stop here. */
+ if (before(counted, TCP_SKB_CB(skb)->seq)) /* should not happen */
+ break;
+
+ /* Length - header but start from where we are up to
+ * avoid overlaps.
+ */
+ sum = skb->len - (counted - TCP_SKB_CB(skb)->seq);
+ if (sum >= 0) {
+ /* Add it up, move on. */
+ amount += sum;
+ counted += sum;
+ if (skb->h.th->syn)
+ counted++;
+ }
+
+ /* Don't count urg data ... but do it in the right place!
+ * Consider: "old_data (ptr is here) URG PUSH data"
+ * The old code would stop at the first push because
+ * it counted the urg (amount==1) and then does amount--
+ * *after* the loop. This means tcp_readable() always
+ * returned zero if any URG PUSH was in the queue, even
+ * though there was normal data available. If we subtract
+ * the urg data right here, we even get it to work for more
+ * than one URG PUSH skb without normal data.
+ * This means that poll() finally works now with urg data
+ * in the queue. Note that rlogin was never affected
+ * because it doesn't use poll(); it uses two processes
+ * and a blocking read(). And the queue scan in tcp_read()
+ * was correct. Mike <pall@rz.uni-karlsruhe.de>
+ */
+
+ /* Don't count urg data. */
+ if (skb->h.th->urg)
+ amount--;
+#if 0
+ if (amount && skb->h.th->psh) break;
+#endif
+ skb = skb->next;
+ } while(skb != (struct sk_buff *)&sk->receive_queue);
+
+ SOCK_DEBUG(sk, "got %lu bytes.\n",amount);
+ return(amount);
+}
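+
+/* Worked example for the loop above, numbers invented: with
+ * copied_seq = 5000 and queued skbs covering [4900,5100) and
+ * [5100,5200), the first contributes 200 - (5000 - 4900) = 100 bytes,
+ * the second a further 100, so 200 readable bytes are reported (less
+ * one per skb carrying URG). The per-skb step, as an unbuilt sketch:
+ */
+#if 0
+static int skb_readable_bytes(unsigned long counted, __u32 seq, int len)
+{
+	/* e.g. counted = 5000, seq = 4900, len = 200 -> 100 */
+	return len - (int)(counted - seq);
+}
+#endif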
+
+/*
+ * LISTEN is a special case for poll..
+ */
+static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
+{
+ struct open_request *req, *dummy;
+
+ lock_sock(sk);
+ req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy);
+ release_sock(sk);
+ if (req)
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
+/*
+ * Compute minimal free write space needed to queue new packets.
+ */
+#define tcp_min_write_space(__sk) \
+ (atomic_read(&(__sk)->wmem_alloc) / 2)
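+
+/* How the macro above is meant to be read: the socket counts as
+ * writable once free space reaches half of what is already committed
+ * to the write queue, i.e. at most about two thirds of sndbuf may be
+ * in use. An unbuilt restatement of the poll()-side test:
+ */
+#if 0
+static int tcp_writable_sketch(struct sock *sk)
+{
+	return sock_wspace(sk) >= (atomic_read(&sk->wmem_alloc) / 2);
+}
+#endif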
+
+/*
+ * Wait for a TCP event.
+ *
+ * Note that we don't need to lock the socket, as the upper poll layers
+ * take care of normal races (between the test and the event) and we don't
+ * go look at any of the socket buffers directly.
+ */
+unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
+{
+ unsigned int mask;
+ struct sock *sk = sock->sk;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ poll_wait(file, sk->sleep, wait);
+ if (sk->state == TCP_LISTEN)
+ return tcp_listen_poll(sk, wait);
+
+ mask = 0;
+ if (sk->err)
+ mask = POLLERR;
+
+ /*
+ * POLLHUP is certainly not done right. But poll() doesn't
+ * have a notion of HUP in just one direction, and for a
+ * socket the read side is more interesting.
+ *
+ * Some poll() documentation says that POLLHUP is incompatible
+ * with the POLLOUT/POLLWR flags, so somebody should check all
+ * of this. But be careful: it tends to be safer to return too many
+ * bits than too few, and you can easily break real applications
+ * if you don't tell them that something has hung up!
+ *
+ * Check-me.
+ */
+ if (sk->shutdown & RCV_SHUTDOWN)
+ mask |= POLLHUP;
+
+ /* Connected? */
+ if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
+ if ((tp->rcv_nxt != tp->copied_seq) &&
+ (tp->urg_seq != tp->copied_seq ||
+ tp->rcv_nxt != tp->copied_seq+1 ||
+ sk->urginline || !tp->urg_data))
+ mask |= POLLIN | POLLRDNORM;
+
+ if (!(sk->shutdown & SEND_SHUTDOWN)) {
+ if (sock_wspace(sk) >= tcp_min_write_space(sk)) {
+ mask |= POLLOUT | POLLWRNORM;
+ } else { /* send SIGIO later */
+ sk->socket->flags |= SO_NOSPACE;
+ }
+ }
+
+ if (tp->urg_data & URG_VALID)
+ mask |= POLLPRI;
+ }
+ return mask;
+}
+
+/*
+ * Socket write_space callback.
+ * This (or rather the sock_wake_async) should agree with poll.
+ */
+void tcp_write_space(struct sock *sk)
+{
+ if (sk->dead)
+ return;
+
+ wake_up_interruptible(sk->sleep);
+ if (sock_wspace(sk) >=
+ tcp_min_write_space(sk))
+ sock_wake_async(sk->socket, 2);
+}
+
+
+int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ int answ;
+
+ switch(cmd) {
+ case TIOCINQ:
+#ifdef FIXME /* FIXME: */
+ case FIONREAD:
+#endif
+ if (sk->state == TCP_LISTEN)
+ return(-EINVAL);
+ lock_sock(sk);
+ answ = tcp_readable(sk);
+ release_sock(sk);
+ break;
+ case SIOCATMARK:
+ {
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
+ break;
+ }
+ case TIOCOUTQ:
+ if (sk->state == TCP_LISTEN)
+ return(-EINVAL);
+ answ = sock_wspace(sk);
+ break;
+ default:
+ return(-ENOIOCTLCMD);
+ };
+
+ return put_user(answ, (int *)arg);
+}
+
+/*
+ * Wait for a socket to get into the connected state
+ *
+ * Note: must be called with the socket locked.
+ */
+static int wait_for_tcp_connect(struct sock * sk, int flags)
+{
+ struct task_struct *tsk = current;
+ struct wait_queue wait = { tsk, NULL };
+
+ while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
+ if(sk->err)
+ return sock_error(sk);
+ if((1 << sk->state) &
+ ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ if(sk->keepopen && !(flags&MSG_NOSIGNAL))
+ send_sig(SIGPIPE, tsk, 0);
+ return -EPIPE;
+ }
+ if(flags & MSG_DONTWAIT)
+ return -EAGAIN;
+ if(signal_pending(tsk))
+ return -ERESTARTSYS;
+
+ tsk->state = TASK_INTERRUPTIBLE;
+ add_wait_queue(sk->sleep, &wait);
+ release_sock(sk);
+
+ if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) &&
+ sk->err == 0)
+ schedule();
+
+ tsk->state = TASK_RUNNING;
+ remove_wait_queue(sk->sleep, &wait);
+ lock_sock(sk);
+ }
+ return 0;
+}
+
+static inline int tcp_memory_free(struct sock *sk)
+{
+ return atomic_read(&sk->wmem_alloc) < sk->sndbuf;
+}
+
+/*
+ * Wait for more memory for a socket
+ */
+static void wait_for_tcp_memory(struct sock * sk)
+{
+ release_sock(sk);
+ if (!tcp_memory_free(sk)) {
+ struct wait_queue wait = { current, NULL };
+
+ sk->socket->flags &= ~SO_NOSPACE;
+ add_wait_queue(sk->sleep, &wait);
+ for (;;) {
+ if (signal_pending(current))
+ break;
+ current->state = TASK_INTERRUPTIBLE;
+ if (tcp_memory_free(sk))
+ break;
+ if (sk->shutdown & SEND_SHUTDOWN)
+ break;
+ if (sk->err)
+ break;
+ schedule();
+ }
+ current->state = TASK_RUNNING;
+ remove_wait_queue(sk->sleep, &wait);
+ }
+ lock_sock(sk);
+}
+
+/*
+ * Wait for a buffer.
+ */
+static int wait_for_buffer(struct sock *sk)
+{
+ struct wait_queue wait = { current, NULL };
+
+ release_sock(sk);
+ add_wait_queue(sk->sleep, &wait);
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ current->state = TASK_RUNNING;
+ remove_wait_queue(sk->sleep, &wait);
+ lock_sock(sk);
+ return 0;
+}
+
+/* When all user supplied data has been queued, set the PSH bit */
+#define PSH_NEEDED (seglen == 0 && iovlen == 0)
+
+/*
+ * This routine copies from a user buffer into a socket,
+ * and starts the transmit system.
+ *
+ * Note: must be called with the socket locked.
+ */
+
+int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
+{
+ struct iovec *iov;
+ struct tcp_opt *tp;
+ struct sk_buff *skb;
+ int iovlen, flags;
+ int mss_now;
+ int err, copied;
+
+ lock_sock(sk);
+
+ err = 0;
+ tp = &(sk->tp_pinfo.af_tcp);
+
+ /* Wait for a connection to finish. */
+ flags = msg->msg_flags;
+ if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+ if((err = wait_for_tcp_connect(sk, flags)) != 0)
+ goto out;
+
+ /* This should be in poll */
+ sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */
+
+ mss_now = tcp_current_mss(sk);
+
+ /* Ok commence sending. */
+ iovlen = msg->msg_iovlen;
+ iov = msg->msg_iov;
+ copied = 0;
+
+ while(--iovlen >= 0) {
+ int seglen=iov->iov_len;
+ unsigned char * from=iov->iov_base;
+
+ iov++;
+
+ while(seglen > 0) {
+ int copy, tmp, queue_it, psh;
+
+ if (err)
+ goto do_fault2;
+
+ /* Stop on errors. */
+ if (sk->err)
+ goto do_sock_err;
+
+ /* Make sure that we are established. */
+ if (sk->shutdown & SEND_SHUTDOWN)
+ goto do_shutdown;
+
+ /* Now we need to check if we have a half
+ * built packet we can tack some data onto.
+ */
+ if (tp->send_head && !(flags & MSG_OOB)) {
+ skb = sk->write_queue.prev;
+ copy = skb->len;
+ /* If the remote does SWS avoidance we should
+ * queue the best we can; if not, we should in
+ * fact send multiple packets...
+ * A method for detecting this would be most
+ * welcome.
+ */
+ if (skb_tailroom(skb) > 0 &&
+ (mss_now - copy) > 0 &&
+ tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) {
+ int last_byte_was_odd = (copy % 4);
+
+ /*
+ * Check for parallel writers sleeping in user access.
+ */
+ if (tp->partial_writers++ > 0) {
+ wait_for_buffer(sk);
+ tp->partial_writers--;
+ continue;
+ }
+
+ copy = mss_now - copy;
+ if(copy > skb_tailroom(skb))
+ copy = skb_tailroom(skb);
+ if(copy > seglen)
+ copy = seglen;
+
+ if(last_byte_was_odd) {
+ if(copy_from_user(skb_put(skb, copy),
+ from, copy))
+ err = -EFAULT;
+ skb->csum = csum_partial(skb->data,
+ skb->len, 0);
+ } else {
+ skb->csum =
+ csum_and_copy_from_user(
+ from, skb_put(skb, copy),
+ copy, skb->csum, &err);
+ }
+
+ /*
+ * FIXME: the *_user functions should
+ * return how much data was
+ * copied before the fault
+ * occurred and then a partial
+ * packet with this data should
+ * be sent. Unfortunately
+ * csum_and_copy_from_user doesn't
+ * return this information.
+ * ATM it might send partly zeroed
+ * data in this case.
+ */
+ tp->write_seq += copy;
+ TCP_SKB_CB(skb)->end_seq += copy;
+ from += copy;
+ copied += copy;
+ seglen -= copy;
+ if (PSH_NEEDED)
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+
+ if (--tp->partial_writers > 0)
+ wake_up_interruptible(sk->sleep);
+
+ continue;
+ }
+ }
+
+ /* We also need to worry about the window. If
+ * window < 1/2 the maximum window we've seen
+ * from this host, don't use it. This is
+ * sender side silly window prevention, as
+ * specified in RFC1122. (Note that this is
+ * different from earlier versions of SWS
+ * prevention, e.g. RFC813.) What we
+ * actually do is use the whole MSS. Since
+ * this results in the right edge of the packet
+ * being outside the window, it will be queued
+ * for later rather than sent.
+ */
+ psh = 0;
+ copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
+ if(copy > (tp->max_window >> 1)) {
+ copy = min(copy, mss_now);
+ psh = 1;
+ } else {
+ copy = mss_now;
+ }
+ if(copy > seglen)
+ copy = seglen;
+
+ /* Determine how large a buffer to allocate. */
+ tmp = MAX_HEADER + sk->prot->max_header;
+ if (copy < min(mss_now, tp->max_window >> 1) &&
+ !(flags & MSG_OOB)) {
+ tmp += min(mss_now, tp->max_window);
+
+ /* What is happening here is that we want to
+ * tack on later members of the users iovec
+ * if possible into a single frame. When we
+ * leave this loop our caller checks to see if
+ * we can send queued frames onto the wire.
+ * See tcp_v[46]_sendmsg() for this.
+ */
+ queue_it = 1;
+ } else {
+ tmp += copy;
+ queue_it = 0;
+ }
+ skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
+
+ /* If we didn't get any memory, we need to sleep. */
+ if (skb == NULL) {
+ sk->socket->flags |= SO_NOSPACE;
+ if (flags&MSG_DONTWAIT) {
+ err = -EAGAIN;
+ goto do_interrupted;
+ }
+ if (signal_pending(current)) {
+ err = -ERESTARTSYS;
+ goto do_interrupted;
+ }
+ tcp_push_pending_frames(sk, tp);
+ wait_for_tcp_memory(sk);
+
+ /* If SACK's were formed or PMTU events happened,
+ * we must find out about it.
+ */
+ mss_now = tcp_current_mss(sk);
+ continue;
+ }
+
+ seglen -= copy;
+
+ /* Prepare control bits for TCP header creation engine. */
+ TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK |
+ ((PSH_NEEDED || psh) ?
+ TCPCB_FLAG_PSH : 0));
+ TCP_SKB_CB(skb)->sacked = 0;
+ if (flags & MSG_OOB) {
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG;
+ TCP_SKB_CB(skb)->urg_ptr = copy;
+ } else
+ TCP_SKB_CB(skb)->urg_ptr = 0;
+
+ /* TCP data bytes are SKB_PUT() on top, later
+ * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
+ * Reserve header space and checksum the data.
+ */
+ skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb->csum = csum_and_copy_from_user(from,
+ skb_put(skb, copy), copy, 0, &err);
+
+ if (err)
+ goto do_fault;
+
+ from += copy;
+ copied += copy;
+
+ TCP_SKB_CB(skb)->seq = tp->write_seq;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;
+
+ /* This advances tp->write_seq for us. */
+ tcp_send_skb(sk, skb, queue_it);
+ }
+ }
+ sk->err = 0;
+ err = copied;
+ goto out;
+
+do_sock_err:
+ if(copied)
+ err = copied;
+ else
+ err = sock_error(sk);
+ goto out;
+do_shutdown:
+ if(copied)
+ err = copied;
+ else {
+ if (!(flags&MSG_NOSIGNAL))
+ send_sig(SIGPIPE, current, 0);
+ err = -EPIPE;
+ }
+ goto out;
+do_interrupted:
+ if(copied)
+ err = copied;
+ goto out;
+do_fault:
+ kfree_skb(skb);
+do_fault2:
+ err = -EFAULT;
+out:
+ tcp_push_pending_frames(sk, tp);
+ release_sock(sk);
+ return err;
+}
+
+#undef PSH_NEEDED
+
+/*
+ * Send an ack if one is backlogged at this point. Ought to merge
+ * this with tcp_send_ack().
+ * This is called for delayed acks also.
+ */
+
+void tcp_read_wakeup(struct sock *sk)
+{
+ /* If we're closed, don't send an ack, or we'll get a RST
+ * from the closed destination.
+ */
+ if (sk->state != TCP_CLOSE)
+ tcp_send_ack(sk);
+}
+
+/*
+ * Handle reading urgent data. BSD has very simple semantics for
+ * this, no blocking and very strange errors 8)
+ */
+
+static int tcp_recv_urg(struct sock * sk, int nonblock,
+ struct msghdr *msg, int len, int flags,
+ int *addr_len)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* No URG data to read. */
+ if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ)
+ return -EINVAL; /* Yes this is right ! */
+
+ if (sk->err)
+ return sock_error(sk);
+
+ if (sk->done)
+ return -ENOTCONN;
+
+ if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) {
+ sk->done = 1;
+ return 0;
+ }
+
+ lock_sock(sk);
+ if (tp->urg_data & URG_VALID) {
+ int err = 0;
+ char c = tp->urg_data;
+
+ if (!(flags & MSG_PEEK))
+ tp->urg_data = URG_READ;
+
+ if(msg->msg_name)
+ tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
+ msg->msg_name);
+
+ if(addr_len)
+ *addr_len = tp->af_specific->sockaddr_len;
+
+ /* Read urgent data. */
+ msg->msg_flags|=MSG_OOB;
+ release_sock(sk);
+
+ if(len>0)
+ {
+ err = memcpy_toiovec(msg->msg_iov, &c, 1);
+ /* N.B. already set above ... */
+ msg->msg_flags|=MSG_OOB;
+ }
+ else
+ msg->msg_flags|=MSG_TRUNC;
+
+ /* N.B. Is this right?? If len == 0 we didn't read any data */
+ return err ? -EFAULT : 1;
+ }
+ release_sock(sk);
+
+ /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
+ * the available implementations agree in this case:
+ * this call should never block, independent of the
+ * blocking state of the socket.
+ * Mike <pall@rz.uni-karlsruhe.de>
+ */
+ return -EAGAIN;
+}
+
+/*
+ * Release a skb if it is no longer needed. This routine
+ * must be called with interrupts disabled or with the
+ * socket locked so that the sk_buff queue operation is ok.
+ */
+
+static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
+{
+ __skb_unlink(skb, &sk->receive_queue);
+ kfree_skb(skb);
+}
+
+/* Clean up the receive buffer for full frames taken by the user,
+ * then send an ACK if necessary. COPIED is the number of bytes
+ * tcp_recvmsg has given to the user so far; it speeds up the
+ * calculation of whether or not we must ACK for the sake of
+ * a window update.
+ */
+static void cleanup_rbuf(struct sock *sk, int copied)
+{
+ struct sk_buff *skb;
+
+ /* NOTE! The socket must be locked, so that we don't get
+ * a messed-up receive queue.
+ */
+ while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
+ if (!skb->used || atomic_read(&skb->users) > 1)
+ break;
+ tcp_eat_skb(sk, skb);
+ }
+
+ /* We send an ACK if we can now advertise a non-zero window
+ * which has been raised "significantly".
+ */
+ if(copied > 0) {
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ __u32 rcv_window_now = tcp_receive_window(tp);
+ __u32 new_window = __tcp_select_window(sk);
+
+ /* We won't be raising the window any further than
+ * the window-clamp allows. Our window selection
+ * also keeps things a nice multiple of MSS. These
+ * checks are necessary to prevent spurious ACKs
+ * which don't advertise a larger window.
+ */
+ if((new_window && (new_window >= rcv_window_now * 2)) &&
+ ((rcv_window_now + tp->mss_cache) <= tp->window_clamp))
+ tcp_read_wakeup(sk);
+ }
+}
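+
+/* A worked example of the ACK condition above, numbers invented: if
+ * tcp_receive_window() currently reports 4000 bytes and
+ * __tcp_select_window() would now offer 8192, the doubling test holds
+ * and, provided rcv_window_now + mss_cache still fits under
+ * window_clamp, tcp_read_wakeup() sends the window update. The bare
+ * predicate, kept out of the build:
+ */
+#if 0
+static int rbuf_ack_needed(__u32 rcv_window_now, __u32 new_window,
+			   __u32 mss_cache, __u32 window_clamp)
+{
+	return (new_window && (new_window >= rcv_window_now * 2)) &&
+	       ((rcv_window_now + mss_cache) <= window_clamp);
+}
+#endif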
+
+
+/*
+ * This routine copies from a sock struct into the user buffer.
+ */
+
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
+ int len, int nonblock, int flags, int *addr_len)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct wait_queue wait = { current, NULL };
+ int copied = 0;
+ u32 peek_seq;
+ volatile u32 *seq; /* So gcc doesn't overoptimise */
+ unsigned long used;
+ int err = 0;
+ int target = 1; /* Read at least this many bytes */
+
+ if (sk->err)
+ return sock_error(sk);
+
+ if (sk->state == TCP_LISTEN)
+ return -ENOTCONN;
+
+ /* Urgent data needs to be handled specially. */
+ if (flags & MSG_OOB)
+ return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
+
+ /* Copying sequence to update. This is volatile to handle
+ * the multi-reader case neatly (memcpy_to/fromfs might be
+ * inline and thus not flush cached variables otherwise).
+ */
+ peek_seq = tp->copied_seq;
+ seq = &tp->copied_seq;
+ if (flags & MSG_PEEK)
+ seq = &peek_seq;
+
+ /* Handle the POSIX bogosity MSG_WAITALL. */
+ if (flags & MSG_WAITALL)
+ target=len;
+
+ add_wait_queue(sk->sleep, &wait);
+ lock_sock(sk);
+
+ /*
+ * BUG BUG BUG
+ * This violates 1003.1g compliance. We must wait for
+ * data to exist even if we read none!
+ */
+
+ while (len > 0) {
+ struct sk_buff * skb;
+ u32 offset;
+
+ /* Are we at urgent data? Stop if we have read anything. */
+ if (copied && tp->urg_data && tp->urg_seq == *seq)
+ break;
+
+ /* We need to check signals first, to get correct SIGURG
+ * handling. FIXME: Need to check this doesn't impact 1003.1g
+ * and move it down to the bottom of the loop
+ */
+ if (signal_pending(current)) {
+ if (copied)
+ break;
+ copied = -ERESTARTSYS;
+ if (nonblock)
+ copied = -EAGAIN;
+ break;
+ }
+
+ /* Next get a buffer. */
+ current->state = TASK_INTERRUPTIBLE;
+
+ skb = skb_peek(&sk->receive_queue);
+ do {
+ if (!skb)
+ break;
+
+ /* Now that we have two receive queues this
+ * shouldn't happen.
+ */
+ if (before(*seq, TCP_SKB_CB(skb)->seq)) {
+ printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
+ *seq, TCP_SKB_CB(skb)->seq);
+ break;
+ }
+ offset = *seq - TCP_SKB_CB(skb)->seq;
+ if (skb->h.th->syn)
+ offset--;
+ if (offset < skb->len)
+ goto found_ok_skb;
+ if (skb->h.th->fin)
+ goto found_fin_ok;
+ if (!(flags & MSG_PEEK))
+ skb->used = 1;
+ skb = skb->next;
+ } while (skb != (struct sk_buff *)&sk->receive_queue);
+
+ if (copied >= target)
+ break;
+
+ /*
+ These three lines and the clause if (sk->state == TCP_CLOSE)
+ are unlikely to be correct if target > 1.
+ I DO NOT FIX IT, because I have no idea what
+ POSIX prescribes here. Probably it really
+ wants to lose data 8) if not all of target is received.
+ --ANK
+ */
+ if (sk->err && !(flags&MSG_PEEK)) {
+ copied = sock_error(sk);
+ break;
+ }
+
+ if (sk->shutdown & RCV_SHUTDOWN) {
+ sk->done = 1;
+ break;
+ }
+
+ if (sk->state == TCP_CLOSE) {
+ if (!sk->done) {
+ sk->done = 1;
+ break;
+ }
+ copied = -ENOTCONN;
+ break;
+ }
+
+ if (nonblock) {
+ copied = -EAGAIN;
+ break;
+ }
+
+ cleanup_rbuf(sk, copied);
+ release_sock(sk);
+ sk->socket->flags |= SO_WAITDATA;
+ schedule();
+ sk->socket->flags &= ~SO_WAITDATA;
+ lock_sock(sk);
+ continue;
+
+ found_ok_skb:
+ /* Lock the buffer. We can be fairly relaxed as
+ * an interrupt will never steal a buffer we are
+ * using unless I've missed something serious in
+ * tcp_data.
+ */
+ atomic_inc(&skb->users);
+
+ /* Ok so how much can we use? */
+ used = skb->len - offset;
+ if (len < used)
+ used = len;
+
+ /* Do we have urgent data here? */
+ if (tp->urg_data) {
+ u32 urg_offset = tp->urg_seq - *seq;
+ if (urg_offset < used) {
+ if (!urg_offset) {
+ if (!sk->urginline) {
+ ++*seq;
+ offset++;
+ used--;
+ }
+ } else
+ used = urg_offset;
+ }
+ }
+
+ /* Copy it - We _MUST_ update *seq first so that we
+ * don't ever double read when we have dual readers
+ */
+ *seq += used;
+
+ /* This memcpy_toiovec can sleep. If it sleeps and we
+ * do a second read it relies on the skb->users to avoid
+ * a crash when cleanup_rbuf() gets called.
+ */
+ err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
+ if (err) {
+ /* Exception. Bailout! */
+ atomic_dec(&skb->users);
+ copied = -EFAULT;
+ break;
+ }
+
+ copied += used;
+ len -= used;
+
+ /* We now will not sleep again until we are finished
+ * with skb. Sorry if you are doing the SMP port
+ * but you'll just have to fix it neatly ;)
+ */
+ atomic_dec(&skb->users);
+
+ if (after(tp->copied_seq,tp->urg_seq))
+ tp->urg_data = 0;
+ if (used + offset < skb->len)
+ continue;
+
+ /* Process the FIN. We may also need to handle PSH
+ * here and make it break out of MSG_WAITALL.
+ */
+ if (skb->h.th->fin)
+ goto found_fin_ok;
+ if (flags & MSG_PEEK)
+ continue;
+ skb->used = 1;
+ if (atomic_read(&skb->users) == 1)
+ tcp_eat_skb(sk, skb);
+ continue;
+
+ found_fin_ok:
+ ++*seq;
+ if (flags & MSG_PEEK)
+ break;
+
+ /* All is done. */
+ skb->used = 1;
+ sk->shutdown |= RCV_SHUTDOWN;
+ break;
+ }
+
+ if(copied >= 0 && msg->msg_name) {
+ tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
+ msg->msg_name);
+ if(addr_len)
+ *addr_len = tp->af_specific->sockaddr_len;
+ }
+
+ remove_wait_queue(sk->sleep, &wait);
+ current->state = TASK_RUNNING;
+
+ /* Clean up data we have read: This will do ACK frames. */
+ cleanup_rbuf(sk, copied);
+ release_sock(sk);
+ return copied;
+}
+
+/*
+ * Check whether to renew the timer.
+ */
+static inline void tcp_check_fin_timer(struct sock *sk)
+{
+ if (sk->state == TCP_FIN_WAIT2 && !sk->timer.prev)
+ tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
+}
+
+/*
+ * State processing on a close. This implements the state shift for
+ * sending our FIN frame. Note that we only send a FIN for some
+ * states. A shutdown() may have already sent the FIN, or we may be
+ * closed.
+ */
+
+static unsigned char new_state[16] = {
+ /* current state: new state: action: */
+ /* (Invalid) */ TCP_CLOSE,
+ /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ /* TCP_SYN_SENT */ TCP_CLOSE,
+ /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
+ /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
+ /* TCP_TIME_WAIT */ TCP_CLOSE,
+ /* TCP_CLOSE */ TCP_CLOSE,
+ /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
+ /* TCP_LAST_ACK */ TCP_LAST_ACK,
+ /* TCP_LISTEN */ TCP_CLOSE,
+ /* TCP_CLOSING */ TCP_CLOSING,
+};
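+
+/* Each entry above packs two things into one value: the low bits
+ * (TCP_STATE_MASK) give the next state and TCP_ACTION_FIN marks
+ * whether a FIN must be sent on the way there, e.g. TCP_ESTABLISHED
+ * maps to FIN_WAIT1 with a FIN, TCP_SYN_SENT simply to CLOSE. An
+ * unbuilt sketch of the decode that tcp_close_state() below performs:
+ */
+#if 0
+static void decode_close_transition(int entry, int *next, int *send_fin)
+{
+	*next = entry & TCP_STATE_MASK;
+	*send_fin = (entry & TCP_ACTION_FIN) != 0;
+}
+#endif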
+
+static int tcp_close_state(struct sock *sk, int dead)
+{
+ int next = (int) new_state[sk->state];
+ int ns = (next & TCP_STATE_MASK);
+
+ tcp_set_state(sk, ns);
+
+ /* This is a (useful) BSD violation of the RFC. There is a
+ * problem with TCP as specified in that the other end could
+ * keep a socket open forever with no application left at this end.
+ * We use a 3 minute timeout (about the same as BSD) and then kill
+ * our end. If they send after that then tough - BUT: long enough
+ * that we won't make the old 4*rto = almost no time - whoops
+ * reset mistake.
+ */
+ if (dead)
+ tcp_check_fin_timer(sk);
+
+ return (next & TCP_ACTION_FIN);
+}
+
+/*
+ * Shutdown the sending side of a connection. Much like close except
+ * that we don't receive shut down or set sk->dead.
+ */
+
+void tcp_shutdown(struct sock *sk, int how)
+{
+ /* We need to grab some memory, and put together a FIN,
+ * and then put it into the queue to be sent.
+ * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
+ */
+ if (!(how & SEND_SHUTDOWN))
+ return;
+
+ /* If we've already sent a FIN, or it's a closed state, skip this. */
+ if ((1 << sk->state) &
+ (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
+ lock_sock(sk);
+
+ /* Clear out any half completed packets. FIN if needed. */
+ if (tcp_close_state(sk,0))
+ tcp_send_fin(sk);
+
+ release_sock(sk);
+ }
+}
+
+
+/*
+ * Return 1 if we still have things to send in our buffers.
+ */
+
+static inline int closing(struct sock * sk)
+{
+ return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
+}
+
+/*
+ * This routine closes sockets which have been at least partially
+ * opened, but not yet accepted. Currently it is only called by
+ * tcp_close, and timeout mirrors the value there.
+ */
+
+static void tcp_close_pending (struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct open_request *req = tp->syn_wait_queue;
+
+ while(req) {
+ struct open_request *iter;
+
+ if (req->sk)
+ tcp_close(req->sk, 0);
+
+ iter = req;
+ req = req->dl_next;
+
+ (*iter->class->destructor)(iter);
+ tcp_dec_slow_timer(TCP_SLT_SYNACK);
+ sk->ack_backlog--;
+ tcp_openreq_free(iter);
+ }
+
+ tcp_synq_init(tp);
+}
+
+void tcp_close(struct sock *sk, long timeout)
+{
+ struct sk_buff *skb;
+ int data_was_unread = 0;
+
+ /*
+ * Check whether the socket is locked ... supposedly
+ * it's impossible to tcp_close() a locked socket.
+ */
+ if (atomic_read(&sk->sock_readers))
+ printk("tcp_close: socket already locked!\n");
+
+ /* We need to grab some memory, and put together a FIN,
+ * and then put it into the queue to be sent.
+ */
+ lock_sock(sk);
+ if(sk->state == TCP_LISTEN) {
+ /* Special case. */
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_close_pending(sk);
+ release_sock(sk);
+ sk->dead = 1;
+ return;
+ }
+
+ /* It is questionable what the role of this is now.
+ * In any event either it should be removed, or
+ * increment of SLT_KEEPALIVE be done, this is causing
+ * big problems. For now I comment it out. -DaveM
+ */
+ /* sk->keepopen = 1; */
+ sk->shutdown = SHUTDOWN_MASK;
+
+ if (!sk->dead)
+ sk->state_change(sk);
+
+ /* We need to flush the recv. buffs. We do this only on the
+ * descriptor close, not protocol-sourced closes, because the
+ * reader process may not have drained the data yet!
+ */
+ while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
+ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
+ data_was_unread += len;
+ kfree_skb(skb);
+ }
+
+ /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
+ * 3.10, we send a RST here because data was lost. To
+ * witness the awful effects of the old behavior of always
+ * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
+ * a bulk GET in an FTP client, suspend the process, wait
+ * for the client to advertise a zero window, then kill -9
+ * the FTP client, wheee... Note: timeout is always zero
+ * in such a case.
+ */
+ if(data_was_unread != 0) {
+ /* Unread data was tossed, zap the connection. */
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk);
+ } else if (tcp_close_state(sk,1)) {
+ /* We FIN if the application ate all the data before
+ * zapping the connection.
+ */
+ tcp_send_fin(sk);
+ }
+
+ if (timeout) {
+ struct task_struct *tsk = current;
+ struct wait_queue wait = { tsk, NULL };
+
+ add_wait_queue(sk->sleep, &wait);
+ release_sock(sk);
+
+ while (1) {
+ tsk->state = TASK_INTERRUPTIBLE;
+ if (!closing(sk))
+ break;
+ timeout = schedule_timeout(timeout);
+ if (signal_pending(tsk) || !timeout)
+ break;
+ }
+
+ tsk->state = TASK_RUNNING;
+ remove_wait_queue(sk->sleep, &wait);
+
+ lock_sock(sk);
+ }
+
+ /* Now that the socket is dead, if we are in the FIN_WAIT2 state
+ * we may need to set up a timer.
+ */
+ tcp_check_fin_timer(sk);
+
+ release_sock(sk);
+ sk->dead = 1;
+}
+
+/*
+ * Wait for an incoming connection, avoid race
+ * conditions. This must be called with the socket locked.
+ */
+static struct open_request * wait_for_connect(struct sock * sk,
+ struct open_request **pprev)
+{
+ struct wait_queue wait = { current, NULL };
+ struct open_request *req;
+
+ add_wait_queue(sk->sleep, &wait);
+ for (;;) {
+ current->state = TASK_INTERRUPTIBLE;
+ release_sock(sk);
+ schedule();
+ lock_sock(sk);
+ req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev);
+ if (req)
+ break;
+ if (signal_pending(current))
+ break;
+ }
+ current->state = TASK_RUNNING;
+ remove_wait_queue(sk->sleep, &wait);
+ return req;
+}
+
+/*
+ * This will accept the next outstanding connection.
+ *
+ * Be careful about race conditions here - this is subtle.
+ */
+
+struct sock *tcp_accept(struct sock *sk, int flags)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ struct open_request *req, *prev;
+ struct sock *newsk = NULL;
+ int error;
+
+ lock_sock(sk);
+
+ /* We need to make sure that this socket is listening,
+ * and that it has something pending.
+ */
+ error = EINVAL;
+ if (sk->state != TCP_LISTEN)
+ goto out;
+
+ /* Find already established connection */
+ req = tcp_find_established(tp, &prev);
+ if (!req) {
+ /* If this is a non blocking socket don't sleep */
+ error = EAGAIN;
+ if (flags & O_NONBLOCK)
+ goto out;
+
+ error = ERESTARTSYS;
+ req = wait_for_connect(sk, &prev);
+ if (!req)
+ goto out;
+ }
+
+ tcp_synq_unlink(tp, req, prev);
+ newsk = req->sk;
+ req->class->destructor(req);
+ tcp_openreq_free(req);
+ sk->ack_backlog--;
+ if(sk->keepopen)
+ tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
+
+ release_sock(sk);
+ return newsk;
+
+out:
+ /* sk should be in LISTEN state, thus accept can use sk->err for
+ * internal purposes without stomping on anyone's feet.
+ */
+ sk->err = error;
+ release_sock(sk);
+ return newsk;
+}
+
+/*
+ * Socket option code for TCP.
+ */
+
+int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
+ int optlen)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int val;
+
+ if (level != SOL_TCP)
+ return tp->af_specific->setsockopt(sk, level, optname,
+ optval, optlen);
+
+ if(optlen<sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int *)optval))
+ return -EFAULT;
+
+ switch(optname) {
+ case TCP_MAXSEG:
+ /* Values greater than the interface MTU won't take effect. However,
+ * at the point when this call is done we typically don't yet know
+ * which interface is going to be used.
+ */
+ if(val < 1 || val > MAX_WINDOW)
+ return -EINVAL;
+ tp->user_mss = val;
+ return 0;
+
+ case TCP_NODELAY:
+ /* You cannot try to use this and TCP_CORK in
+ * tandem, so let the user know.
+ */
+ if (sk->nonagle == 2)
+ return -EINVAL;
+ sk->nonagle = (val == 0) ? 0 : 1;
+ return 0;
+
+ case TCP_CORK:
+ /* When set indicates to always queue non-full frames.
+ * Later the user clears this option and we transmit
+ * any pending partial frames in the queue. This is
+ * meant to be used alongside sendfile() to get properly
+ * filled frames when the user (for example) must write
+ * out headers with a write() call first and then use
+ * sendfile to send out the data parts.
+ *
+ * You cannot try to use TCP_NODELAY and this mechanism
+ * at the same time, so let the user know.
+ */
+ if (sk->nonagle == 1)
+ return -EINVAL;
+ if (val != 0) {
+ sk->nonagle = 2;
+ } else {
+ sk->nonagle = 0;
+
+ lock_sock(sk);
+ tcp_push_pending_frames(sk, tp);
+ release_sock(sk);
+ }
+ return 0;
+
+ default:
+ return -ENOPROTOOPT;
+ };
+}
+
+int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
+ int *optlen)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int val, len;
+
+ if(level != SOL_TCP)
+ return tp->af_specific->getsockopt(sk, level, optname,
+ optval, optlen);
+
+ if(get_user(len,optlen))
+ return -EFAULT;
+
+ len = min(len, sizeof(int));
+
+ switch(optname) {
+ case TCP_MAXSEG:
+ val = tp->user_mss;
+ break;
+ case TCP_NODELAY:
+ val = (sk->nonagle == 1);
+ break;
+ case TCP_CORK:
+ val = (sk->nonagle == 2);
+ break;
+ default:
+ return -ENOPROTOOPT;
+ };
+
+ if(put_user(len, optlen))
+ return -EFAULT;
+ if(copy_to_user(optval, &val,len))
+ return -EFAULT;
+ return 0;
+}
+
+void tcp_set_keepalive(struct sock *sk, int val)
+{
+ if (!sk->keepopen && val)
+ tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
+ else if (sk->keepopen && !val)
+ tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
+}
+
+extern void __skb_cb_too_small_for_tcp(int, int);
+
+void __init tcp_init(void)
+{
+ struct sk_buff *skb = NULL;
+
+ if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
+ __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
+ sizeof(skb->cb));
+
+ tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
+ sizeof(struct open_request),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if(!tcp_openreq_cachep)
+ panic("tcp_init: Cannot alloc open_request cache.");
+
+ tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
+ sizeof(struct tcp_bind_bucket),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if(!tcp_bucket_cachep)
+ panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
+
+ tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
+ sizeof(struct tcp_tw_bucket),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if(!tcp_timewait_cachep)
+ panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
+}
diff --git a/pfinet/linux-src/net/ipv4/tcp_input.c b/pfinet/linux-src/net/ipv4/tcp_input.c
new file mode 100644
index 00000000..a753b128
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/tcp_input.c
@@ -0,0 +1,2432 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version: $Id: tcp_input.c,v 1.164.2.7 1999/08/13 16:14:27 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ * Changes:
+ * Pedro Roque : Fast Retransmit/Recovery.
+ * Two receive queues.
+ * Retransmit queue handled by TCP.
+ * Better retransmit timer handling.
+ * New congestion avoidance.
+ * Header prediction.
+ * Variable renaming.
+ *
+ * Eric : Fast Retransmit.
+ * Randy Scott : MSS option defines.
+ * Eric Schenk : Fixes to slow start algorithm.
+ * Eric Schenk : Yet another double ACK bug.
+ * Eric Schenk : Delayed ACK bug fixes.
+ * Eric Schenk : Floyd style fast retrans war avoidance.
+ * David S. Miller : Don't allow zero congestion window.
+ * Eric Schenk : Fix retransmitter so that it sends
+ * next packet on ack of previous packet.
+ * Andi Kleen : Moved open_request checking here
+ * and process RSTs for open_requests.
+ * Andi Kleen : Better prune_queue, and other fixes.
+ * Andrey Savochkin: Fix RTT measurements in the presence of
+ * timestamps.
+ * Andrey Savochkin: Check sequence numbers correctly when
+ * removing SACKs due to in sequence incoming
+ * data segments.
+ * Andi Kleen: Make sure we never ack data there is not
+ * enough room for. Also make this condition
+ * a fatal error if it might still happen.
+ * Andi Kleen: Add tcp_measure_rcv_mss to make
+ * connections with MSS<min(MTU,ann. MSS)
+ * work without delayed acks.
+ * Andi Kleen: Process packets with PSH set in the
+ * fast path.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <net/tcp.h>
+#include <linux/ipsec.h>
+
+#ifdef CONFIG_SYSCTL
+#define SYNC_INIT 0 /* let the user enable it */
+#else
+#define SYNC_INIT 1
+#endif
+
+extern int sysctl_tcp_fin_timeout;
+
+/* These are on by default so the code paths get tested.
+ * For the final 2.2 this may be undone at our discretion. -DaveM
+ */
+int sysctl_tcp_timestamps = 1;
+int sysctl_tcp_window_scaling = 1;
+int sysctl_tcp_sack = 1;
+
+int sysctl_tcp_syncookies = SYNC_INIT;
+int sysctl_tcp_stdurg;
+int sysctl_tcp_rfc1337;
+
+static int prune_queue(struct sock *sk);
+
+/* There is something which you must keep in mind when you analyze the
+ * behavior of the tp->ato delayed ack timeout interval. When a
+ * connection starts up, we want to ack as quickly as possible. The
+ * problem is that "good" TCP's do slow start at the beginning of data
+ * transmission. This means that until we send the first few ACK's the
+ * sender will sit on his end and only queue most of his data, because
+ * he can only send snd_cwnd unacked packets at any given time. For
+ * each ACK we send, he increments snd_cwnd and transmits more of his
+ * queue. -DaveM
+ */
+static void tcp_delack_estimator(struct tcp_opt *tp)
+{
+ if(tp->ato == 0) {
+ tp->lrcvtime = tcp_time_stamp;
+
+ /* Help sender leave slow start quickly,
+ * and also makes sure we do not take this
+ * branch ever again for this connection.
+ */
+ tp->ato = 1;
+ tcp_enter_quickack_mode(tp);
+ } else {
+ int m = tcp_time_stamp - tp->lrcvtime;
+
+ tp->lrcvtime = tcp_time_stamp;
+ if(m <= 0)
+ m = 1;
+ if(m > tp->rto)
+ tp->ato = tp->rto;
+ else {
+ /* This funny shift makes sure we
+ * clear the "quick ack mode" bit.
+ */
+ tp->ato = ((tp->ato << 1) >> 2) + m;
+ }
+ }
+}
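+
+/* The "funny shift" above works because tp->ato keeps the quick ack
+ * mode flag in its top bit (see the 0x80000000 masks in
+ * tcp_remember_ack below): (ato << 1) >> 2 discards that bit and
+ * halves the rest, so the new estimate is ato/2 + m with quickack
+ * cleared. An equivalent, unbuilt restatement:
+ */
+#if 0
+static __u32 ato_update_sketch(__u32 ato, __u32 m)
+{
+	ato &= 0x7fffffff;		/* drop the quickack flag bit */
+	return (ato >> 1) + m;		/* same as ((ato << 1) >> 2) + m */
+}
+#endif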
+
+/*
+ * Remember to send an ACK later.
+ */
+static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
+ struct sk_buff *skb)
+{
+ tp->delayed_acks++;
+
+ /* Tiny-grams with PSH set artificially deflate our
+ * ato measurement, but with a lower bound.
+ */
+ if(th->psh && (skb->len < (tp->mss_cache >> 1))) {
+ /* Preserve the quickack state. */
+ if((tp->ato & 0x7fffffff) > HZ/50)
+ tp->ato = ((tp->ato & 0x80000000) |
+ (HZ/50));
+ }
+}
+
+/* Called to compute a smoothed rtt estimate. The data fed to this
+ * routine either comes from timestamps, or from segments that were
+ * known _not_ to have been retransmitted [see Karn/Partridge
+ * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
+ * piece by Van Jacobson.
+ * NOTE: the next three routines used to be one big routine.
+ * To save cycles in the RFC 1323 implementation it was better to break
+ * it up into three procedures. -- erics
+ */
+
+static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
+{
+ long m = mrtt; /* RTT */
+
+ /* The following amusing code comes from Jacobson's
+ * article in SIGCOMM '88. Note that rtt and mdev
+ * are scaled versions of rtt and mean deviation.
+ * This is designed to be as fast as possible.
+ * m stands for "measurement".
+ *
+ * In a 1990 paper the rto value is changed to:
+ * RTO = rtt + 4 * mdev
+ */
+ if(m == 0)
+ m = 1;
+ if (tp->srtt != 0) {
+ m -= (tp->srtt >> 3); /* m is now error in rtt est */
+ tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
+ if (m < 0)
+ m = -m; /* m is now abs(error) */
+ m -= (tp->mdev >> 2); /* similar update on mdev */
+ tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
+ } else {
+ /* no previous measure. */
+ tp->srtt = m<<3; /* take the measured time to be rtt */
+ tp->mdev = m<<2; /* make sure rto = 3*rtt */
+ }
+}
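+
+/* Worked example of the scaling above, numbers invented: srtt holds
+ * 8*rtt and mdev holds 4*mdev. With srtt = 800 (a 100 tick smoothed
+ * rtt) and a new measurement m = 120, the error is 120 - 800/8 = 20,
+ * so srtt becomes 820 (102.5 ticks): rtt = 7/8 rtt + 1/8 new in
+ * integer form. The same update, as an unbuilt sketch:
+ */
+#if 0
+static void rtt_update_sketch(long *srtt8, long *mdev4, long m)
+{
+	long err = m - (*srtt8 >> 3);	/* error against smoothed rtt */
+	*srtt8 += err;			/* srtt = 7/8 srtt + 1/8 m */
+	if (err < 0)
+		err = -err;
+	*mdev4 += err - (*mdev4 >> 2);	/* mdev = 3/4 mdev + 1/4 |err| */
+}
+#endif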
+
+/* Calculate rto without backoff. This is the second half of Van Jacobson's
+ * routine referred to above.
+ */
+
+static __inline__ void tcp_set_rto(struct tcp_opt *tp)
+{
+ tp->rto = (tp->srtt >> 3) + tp->mdev;
+ tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
+}
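+
+/* Plugging the scalings in: (srtt >> 3) + mdev is rtt + 4*mdev, the
+ * 1990 formula noted earlier; then a quarter of the result, plus a
+ * term that shrinks as snd_cwnd grows, is added on top. With invented
+ * numbers srtt = 800, mdev = 80, snd_cwnd = 3: rto = 100 + 80 = 180,
+ * then 180 + 45 + 45 = 270 ticks, before tcp_bound_rto() clamps it.
+ * Unbuilt sketch:
+ */
+#if 0
+static __u32 set_rto_sketch(__u32 srtt8, __u32 mdev4, unsigned int snd_cwnd)
+{
+	__u32 rto = (srtt8 >> 3) + mdev4;
+	rto += (rto >> 2) + (rto >> (snd_cwnd - 1));
+	return rto;	/* 800, 80, 3 -> 180 + 45 + 45 = 270 */
+}
+#endif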
+
+
+/* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
+ * on packet lifetime in the internet. We need the HZ/5 lower
+ * bound to behave correctly against BSD stacks with a fixed
+ * delayed ack.
+ * FIXME: It's not entirely clear this lower bound is the best
+ * way to avoid the problem. Is it possible to drop the lower
+ * bound and still avoid trouble with BSD stacks? Perhaps
+ * some modification to the RTO calculation that takes delayed
+ * ack bias into account? This needs serious thought. -- erics
+ */
+static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
+{
+ if (tp->rto > 120*HZ)
+ tp->rto = 120*HZ;
+ if (tp->rto < HZ/5)
+ tp->rto = HZ/5;
+}
+
+/* WARNING: this must not be called if tp->saw_timestamp was false. */
+extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp,
+ __u32 start_seq, __u32 end_seq)
+{
+ /* This is the start_seq <= last_ack_seq check combined
+ with an in-window check: if start_seq<=last_ack_seq<=rcv_nxt,
+ then the segment is in the window if end_seq>=rcv_nxt.
+ */
+ if (!after(start_seq, tp->last_ack_sent) &&
+ !before(end_seq, tp->rcv_nxt)) {
+ /* PAWS bug workaround wrt. ACK frames, the PAWS discard
+ * extra check below makes sure this can only happen
+ * for pure ACK frames. -DaveM
+ *
+ * Plus: expired timestamps.
+ *
+ * Plus: resets failing PAWS.
+ */
+ if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) {
+ tp->ts_recent = tp->rcv_tsval;
+ tp->ts_recent_stamp = tcp_time_stamp;
+ }
+ }
+}
+
+#define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24)
+
+extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len)
+{
+ return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
+ (s32)(tcp_time_stamp - tp->ts_recent_stamp) < PAWS_24DAYS &&
+ /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */
+ len != (th->doff * 4));
+}
+
+
+static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
+{
+ u32 end_window = tp->rcv_wup + tp->rcv_wnd;
+
+ if (tp->rcv_wnd &&
+ after(end_seq, tp->rcv_nxt) &&
+ before(seq, end_window))
+ return 1;
+ if (seq != end_window)
+ return 0;
+ return (seq == end_seq);
+}
+
+/* This function checks to see if the tcp header is actually acceptable. */
+extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
+{
+ if (seq == tp->rcv_nxt)
+ return (tp->rcv_wnd || (end_seq == seq));
+
+ return __tcp_sequence(tp, seq, end_seq);
+}
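+
+/* A worked example of the acceptability test, numbers invented: with
+ * rcv_nxt = rcv_wup = 1000 and rcv_wnd = 500 the window edge is 1500,
+ * so a segment [1100,1200) passes (ends after rcv_nxt, starts before
+ * the edge) while a stale [900,950) is rejected. Unbuilt:
+ */
+#if 0
+static void tcp_sequence_example(struct tcp_opt *tp)
+{
+	tp->rcv_nxt = tp->rcv_wup = 1000;
+	tp->rcv_wnd = 500;
+	/* tcp_sequence(tp, 1100, 1200) == 1 */
+	/* tcp_sequence(tp, 900, 950) == 0 */
+}
+#endif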
+
+/* When we get a reset we do this. */
+static void tcp_reset(struct sock *sk)
+{
+ sk->zapped = 1;
+
+ /* We want the right error as BSD sees it (and indeed as we do). */
+ switch (sk->state) {
+ case TCP_SYN_SENT:
+ sk->err = ECONNREFUSED;
+ break;
+ case TCP_CLOSE_WAIT:
+ sk->err = EPIPE;
+ break;
+ default:
+ sk->err = ECONNRESET;
+ };
+ tcp_set_state(sk, TCP_CLOSE);
+ sk->shutdown = SHUTDOWN_MASK;
+ if (!sk->dead)
+ sk->state_change(sk);
+}
+
+/* This tags the retransmission queue when SACKs arrive. */
+static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int i = nsacks;
+
+ while(i--) {
+ struct sk_buff *skb = skb_peek(&sk->write_queue);
+ __u32 start_seq = ntohl(sp->start_seq);
+ __u32 end_seq = ntohl(sp->end_seq);
+ int fack_count = 0;
+
+ while((skb != NULL) &&
+ (skb != tp->send_head) &&
+ (skb != (struct sk_buff *)&sk->write_queue)) {
+ /* The retransmission queue is always in order, so
+ * we can short-circuit the walk early.
+ */
+ if(after(TCP_SKB_CB(skb)->seq, end_seq))
+ break;
+
+ /* We play it conservative: we don't allow SACKs to partially
+ * tag a sequence space.
+ */
+ fack_count++;
+ if(!after(start_seq, TCP_SKB_CB(skb)->seq) &&
+ !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
+ /* If this was a retransmitted frame, account for it. */
+ if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) &&
+ tp->retrans_out)
+ tp->retrans_out--;
+ TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
+
+ /* RULE: All new SACKs will either decrease retrans_out
+ * or advance fackets_out.
+ */
+ if(fack_count > tp->fackets_out)
+ tp->fackets_out = fack_count;
+ }
+ skb = skb->next;
+ }
+ sp++; /* Move on to the next SACK block. */
+ }
+}
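+
+/* On the wire a SACK option is kind TCPOPT_SACK, a length byte, then
+ * start/end sequence pairs; tcp_parse_options() below validates the
+ * length and hands the pairs here as tcp_sack_block structs. The
+ * block-count arithmetic it relies on, as an unbuilt sketch:
+ */
+#if 0
+static int sack_block_count(int opsize)
+{
+	int sack_bytes = opsize - TCPOLEN_SACK_BASE;	/* strip kind+len */
+	if (sack_bytes % TCPOLEN_SACK_PERBLOCK)
+		return 0;	/* not whole 8-byte start/end pairs */
+	return sack_bytes >> 3;
+}
+#endif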
+
+/* Look for tcp options. Normally only called on SYN and SYNACK packets.
+ * But, this can also be called on packets in the established flow when
+ * the fast version below fails.
+ */
+void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
+{
+ unsigned char *ptr;
+ int length=(th->doff*4)-sizeof(struct tcphdr);
+ int saw_mss = 0;
+
+ ptr = (unsigned char *)(th + 1);
+ tp->saw_tstamp = 0;
+
+ while(length>0) {
+ int opcode=*ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ opsize=*ptr++;
+ if (opsize < 2) /* "silly options" */
+ return;
+ if (opsize > length)
+ break; /* don't parse partial options */
+ switch(opcode) {
+ case TCPOPT_MSS:
+ if(opsize==TCPOLEN_MSS && th->syn) {
+ u16 in_mss = ntohs(*(__u16 *)ptr);
+ if (in_mss == 0)
+ in_mss = 536;
+ if (tp->mss_clamp > in_mss)
+ tp->mss_clamp = in_mss;
+ saw_mss = 1;
+ }
+ break;
+ case TCPOPT_WINDOW:
+ if(opsize==TCPOLEN_WINDOW && th->syn)
+ if (!no_fancy && sysctl_tcp_window_scaling) {
+ tp->wscale_ok = 1;
+ tp->snd_wscale = *(__u8 *)ptr;
+ if(tp->snd_wscale > 14) {
+ if(net_ratelimit())
+ printk("tcp_parse_options: Illegal window "
+ "scaling value %d >14 received.",
+ tp->snd_wscale);
+ tp->snd_wscale = 14;
+ }
+ }
+ break;
+ case TCPOPT_TIMESTAMP:
+ if(opsize==TCPOLEN_TIMESTAMP) {
+ if (sysctl_tcp_timestamps && !no_fancy) {
+ tp->tstamp_ok = 1;
+ tp->saw_tstamp = 1;
+ tp->rcv_tsval = ntohl(*(__u32 *)ptr);
+ tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4));
+ }
+ }
+ break;
+ case TCPOPT_SACK_PERM:
+ if(opsize==TCPOLEN_SACK_PERM && th->syn) {
+ if (sysctl_tcp_sack && !no_fancy) {
+ tp->sack_ok = 1;
+ tp->num_sacks = 0;
+ }
+ }
+ break;
+
+ case TCPOPT_SACK:
+ if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
+ sysctl_tcp_sack && (sk != NULL) && !th->syn) {
+ int sack_bytes = opsize - TCPOLEN_SACK_BASE;
+
+ if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) {
+ int num_sacks = sack_bytes >> 3;
+ struct tcp_sack_block *sackp;
+
+ sackp = (struct tcp_sack_block *)ptr;
+ tcp_sacktag_write_queue(sk, sackp, num_sacks);
+ }
+ }
+ };
+ ptr+=opsize-2;
+ length-=opsize;
+ };
+ }
+ if(th->syn && saw_mss == 0)
+ tp->mss_clamp = 536;
+}
+
+/* Fast parse options. This hopes to only see timestamps.
+ * If it is wrong it falls back on tcp_parse_options().
+ */
+static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp)
+{
+ /* If we didn't send out any options ignore them all. */
+ if (tp->tcp_header_len == sizeof(struct tcphdr))
+ return 0;
+ if (th->doff == sizeof(struct tcphdr)>>2) {
+ tp->saw_tstamp = 0;
+ return 0;
+ } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
+ __u32 *ptr = (__u32 *)(th + 1);
+ if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
+ tp->saw_tstamp = 1;
+ tp->rcv_tsval = ntohl(*++ptr);
+ tp->rcv_tsecr = ntohl(*++ptr);
+ return 1;
+ }
+ }
+ tcp_parse_options(sk, th, tp, 0);
+ return 1;
+}
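+
+/* The predicted word tested above is, concretely, the aligned timestamp
+ * layout: NOP (0x01), NOP (0x01), TCPOPT_TIMESTAMP (0x08),
+ * TCPOLEN_TIMESTAMP (0x0a), i.e. the wire bytes 01 01 08 0a, followed
+ * by the two 32-bit stamps. That 12-byte block makes th->doff equal to
+ * 5 + 3 = 8 words, which is the value checked in the else-if.
+ */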
+
+#define FLAG_DATA 0x01 /* Incoming frame contained data. */
+#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
+#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
+#define FLAG_RETRANS_DATA_ACKED 0x08 /* Some of the acked data was retransmitted. */
+
+static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
+{
+ if (tp->dup_acks > 3)
+ tp->snd_cwnd = (tp->snd_ssthresh);
+
+ tp->dup_acks = 0;
+}
+
+/* NOTE: This code assumes that tp->dup_acks gets cleared when a
+ * retransmit timer fires.
+ */
+static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* Note: If not_dup is set this implies we got a
+ * data carrying packet or a window update.
+ * This carries no new information about possible
+ * lost packets, so we have to ignore it for the purposes
+ * of counting duplicate acks. Ideally this does not imply we
+	 * should stop our fast retransmit phase; more acks may come
+ * later without data to help us. Unfortunately this would make
+ * the code below much more complex. For now if I see such
+ * a packet I clear the fast retransmit phase.
+ */
+ if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
+ /* This is the standard reno style fast retransmit branch. */
+
+ /* 1. When the third duplicate ack is received, set ssthresh
+ * to one half the current congestion window, but no less
+ * than two segments. Retransmit the missing segment.
+ */
+ if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
+ tp->dup_acks++;
+ if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
+ tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+ tp->snd_cwnd = (tp->snd_ssthresh + 3);
+ tp->high_seq = tp->snd_nxt;
+ if(!tp->fackets_out)
+ tcp_retransmit_skb(sk,
+ skb_peek(&sk->write_queue));
+ else
+ tcp_fack_retransmit(sk);
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ }
+ } else if (++tp->dup_acks > 3) {
+ /* 2. Each time another duplicate ACK arrives, increment
+ * cwnd by the segment size. [...] Transmit a packet...
+ *
+ * Packet transmission will be done on normal flow processing
+ * since we're not in "retransmit mode". We do not use
+ * duplicate ACKs to artificially inflate the congestion
+ * window when doing FACK.
+ */
+ if(!tp->fackets_out) {
+ tp->snd_cwnd++;
+ } else {
+ /* Fill any further holes which may have
+ * appeared.
+ *
+ * We may want to change this to run every
+ * further multiple-of-3 dup ack increments,
+ * to be more robust against out-of-order
+ * packet delivery. -DaveM
+ */
+ tcp_fack_retransmit(sk);
+ }
+ }
+ } else if (tp->high_seq != 0) {
+ /* In this branch we deal with clearing the Floyd style
+ * block on duplicate fast retransmits, and if requested
+ * we do Hoe style secondary fast retransmits.
+ */
+ if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) {
+ /* Once we have acked all the packets up to high_seq
+ * we are done this fast retransmit phase.
+			 * Alternatively data arrived. In this case we
+			 * have to abort the fast retransmit attempt.
+ * Note that we do want to accept a window
+ * update since this is expected with Hoe's algorithm.
+ */
+ clear_fast_retransmit(tp);
+
+ /* After we have cleared up to high_seq we can
+ * clear the Floyd style block.
+ */
+ if (!before(ack, tp->high_seq)) {
+ tp->high_seq = 0;
+ tp->fackets_out = 0;
+ }
+ } else if (tp->dup_acks >= 3) {
+ if (!tp->fackets_out) {
+ /* Hoe Style. We didn't ack the whole
+ * window. Take this as a cue that
+ * another packet was lost and retransmit it.
+ * Don't muck with the congestion window here.
+ * Note that we have to be careful not to
+ * act if this was a window update and it
+ * didn't ack new data, since this does
+ * not indicate a packet left the system.
+ * We can test this by just checking
+ * if ack changed from snd_una, since
+ * the only way to get here without advancing
+ * from snd_una is if this was a window update.
+ */
+ if (ack != tp->snd_una && before(ack, tp->high_seq)) {
+ tcp_retransmit_skb(sk,
+ skb_peek(&sk->write_queue));
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ }
+ } else {
+ /* FACK style, fill any remaining holes in
+ * receiver's queue.
+ */
+ tcp_fack_retransmit(sk);
+ }
+ }
+ }
+}
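+
+/* A rough walk-through of the reno branch above, with assumed numbers:
+ * say snd_cwnd is 10 when the third duplicate ack arrives. Then
+ * ssthresh is recalculated to about half the window (but no less than
+ * two segments), here 5, snd_cwnd becomes ssthresh + 3 = 8, and the
+ * segment at the head of the write queue is retransmitted. Each
+ * further dup ack inflates snd_cwnd by one, until the ack covering
+ * high_seq arrives and clear_fast_retransmit() deflates snd_cwnd back
+ * to ssthresh, 5 here, resuming normal congestion avoidance.
+ */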
+
+/* This is Jacobson's slow start and congestion avoidance.
+ * SIGCOMM '88, p. 328.
+ */
+static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
+{
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ /* In "safe" area, increase. */
+ tp->snd_cwnd++;
+ } else {
+ /* In dangerous area, increase slowly.
+ * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
+ */
+ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt=0;
+ } else
+ tp->snd_cwnd_cnt++;
+ }
+}
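+
+/* In concrete terms (illustrative numbers): at or below ssthresh every
+ * ack does snd_cwnd++, so the window doubles each round trip. Above
+ * ssthresh, with snd_cwnd = 10, ten acks must tick snd_cwnd_cnt up to
+ * 10 before snd_cwnd becomes 11, i.e. growth of roughly one segment
+ * per RTT, which is the 1/snd_cwnd increment described above.
+ */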
+
+/* Remove acknowledged frames from the retransmission queue. */
+static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
+ __u32 *seq, __u32 *seq_rtt)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sk_buff *skb;
+ __u32 now = tcp_time_stamp;
+ int acked = 0;
+
+ /* If we are retransmitting, and this ACK clears up to
+ * the retransmit head, or further, then clear our state.
+ */
+ if (tp->retrans_head != NULL &&
+ !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq))
+ tp->retrans_head = NULL;
+
+ while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ __u8 sacked = scb->sacked;
+
+ /* If our packet is before the ack sequence we can
+ * discard it as it's confirmed to have arrived at
+ * the other end.
+ */
+ if (after(scb->end_seq, ack))
+ break;
+
+ /* Initial outgoing SYN's get put onto the write_queue
+ * just like anything else we transmit. It is not
+ * true data, and if we misinform our callers that
+ * this ACK acks real data, we will erroneously exit
+ * connection startup slow start one packet too
+ * quickly. This is severely frowned upon behavior.
+ */
+ if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out)
+ tp->retrans_out--;
+ if(!(scb->flags & TCPCB_FLAG_SYN)) {
+ acked |= FLAG_DATA_ACKED;
+ if(sacked & TCPCB_SACKED_RETRANS)
+ acked |= FLAG_RETRANS_DATA_ACKED;
+ if(tp->fackets_out)
+ tp->fackets_out--;
+ } else {
+ /* This is pure paranoia. */
+ tp->retrans_head = NULL;
+ }
+ tp->packets_out--;
+ *seq = scb->seq;
+ *seq_rtt = now - scb->when;
+ __skb_unlink(skb, skb->list);
+ kfree_skb(skb);
+ }
+ return acked;
+}
+
+static void tcp_ack_probe(struct sock *sk, __u32 ack)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* Our probe was answered. */
+ tp->probes_out = 0;
+
+ /* Was it a usable window open? */
+
+ /* should always be non-null */
+ if (tp->send_head != NULL &&
+ !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) {
+ tp->backoff = 0;
+ tp->pending = 0;
+ tcp_clear_xmit_timer(sk, TIME_PROBE0);
+ } else {
+ tcp_reset_xmit_timer(sk, TIME_PROBE0,
+ min(tp->rto << tp->backoff, 120*HZ));
+ }
+}
+
+/* Should we open up the congestion window? */
+static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag)
+{
+ /* Data must have been acked. */
+ if ((flag & FLAG_DATA_ACKED) == 0)
+ return 0;
+
+ /* Some of the data acked was retransmitted somehow? */
+ if ((flag & FLAG_RETRANS_DATA_ACKED) != 0) {
+ /* We advance in all cases except during
+ * non-FACK fast retransmit/recovery.
+ */
+ if (tp->fackets_out != 0 ||
+ tp->retransmits != 0)
+ return 1;
+
+	/* Non-FACK fast retransmit does its own
+ * congestion window management, don't get
+ * in the way.
+ */
+ return 0;
+ }
+
+ /* New non-retransmitted data acked, always advance. */
+ return 1;
+}
+
+/* Read draft-ietf-tcplw-high-performance before mucking
+ * with this code. (Supersedes RFC 1323.)
+ */
+static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
+ u32 seq, u32 ack, int flag)
+{
+ __u32 seq_rtt;
+
+ /* RTTM Rule: A TSecr value received in a segment is used to
+ * update the averaged RTT measurement only if the segment
+ * acknowledges some new data, i.e., only if it advances the
+ * left edge of the send window.
+ *
+ * See draft-ietf-tcplw-high-performance-00, section 3.3.
+ * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
+ */
+ if (!(flag & FLAG_DATA_ACKED))
+ return;
+
+ seq_rtt = tcp_time_stamp - tp->rcv_tsecr;
+ tcp_rtt_estimator(tp, seq_rtt);
+ if (tp->retransmits) {
+ if (tp->packets_out == 0) {
+ tp->retransmits = 0;
+ tp->fackets_out = 0;
+ tp->retrans_out = 0;
+ tp->backoff = 0;
+ tcp_set_rto(tp);
+ } else {
+ /* Still retransmitting, use backoff */
+ tcp_set_rto(tp);
+ tp->rto = tp->rto << tp->backoff;
+ }
+ } else {
+ tcp_set_rto(tp);
+ }
+
+ tcp_bound_rto(tp);
+}
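+
+/* An example of the measurement above, assuming HZ=100 (10ms ticks):
+ * if the peer echoes tsecr=5000 and tcp_time_stamp is now 5012, then
+ * seq_rtt is 12 ticks, i.e. 120ms, which is fed to the usual srtt/mdev
+ * estimator. The FLAG_DATA_ACKED guard matters: a segment that acks
+ * nothing new must not contribute a sample, per the rule cited above.
+ */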
+
+static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
+{
+ struct sk_buff *skb = skb_peek(&sk->write_queue);
+
+ /* Some data was ACK'd, if still retransmitting (due to a
+ * timeout), resend more of the retransmit queue. The
+ * congestion window is handled properly by that code.
+ */
+ if (tp->retransmits) {
+ tcp_xmit_retransmit_queue(sk);
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ } else {
+ __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
+ if ((__s32)when < 0)
+ when = 1;
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
+ }
+}
+
+/* This routine deals with incoming acks, but not outgoing ones. */
+static int tcp_ack(struct sock *sk, struct tcphdr *th,
+ u32 ack_seq, u32 ack, int len)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int flag = 0;
+ u32 seq = 0;
+ u32 seq_rtt = 0;
+
+ if(sk->zapped)
+ return(1); /* Dead, can't ack any more so why bother */
+
+ if (tp->pending == TIME_KEEPOPEN)
+ tp->probes_out = 0;
+
+ tp->rcv_tstamp = tcp_time_stamp;
+
+ /* If the ack is newer than sent or older than previous acks
+ * then we can probably ignore it.
+ */
+ if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
+ goto uninteresting_ack;
+
+	/* If the segment carries data, set FLAG_DATA. */
+ if (len != th->doff*4) {
+ flag |= FLAG_DATA;
+ tcp_delack_estimator(tp);
+ }
+
+ /* Update our send window. */
+
+ /* This is the window update code as per RFC 793
+ * snd_wl{1,2} are used to prevent unordered
+ * segments from shrinking the window
+ */
+ if (before(tp->snd_wl1, ack_seq) ||
+ (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
+ u32 nwin = ntohs(th->window) << tp->snd_wscale;
+
+ if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
+ flag |= FLAG_WIN_UPDATE;
+ tp->snd_wnd = nwin;
+
+ tp->snd_wl1 = ack_seq;
+ tp->snd_wl2 = ack;
+
+ if (nwin > tp->max_window)
+ tp->max_window = nwin;
+ }
+ }
+
+ /* We passed data and got it acked, remove any soft error
+ * log. Something worked...
+ */
+ sk->err_soft = 0;
+
+ /* If this ack opens up a zero window, clear backoff. It was
+ * being used to time the probes, and is probably far higher than
+ * it needs to be for normal retransmission.
+ */
+ if (tp->pending == TIME_PROBE0)
+ tcp_ack_probe(sk, ack);
+
+ /* See if we can take anything off of the retransmit queue. */
+ flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt);
+
+ /* We must do this here, before code below clears out important
+ * state contained in tp->fackets_out and tp->retransmits. -DaveM
+ */
+ if (should_advance_cwnd(tp, flag))
+ tcp_cong_avoid(tp);
+
+ /* If we have a timestamp, we always do rtt estimates. */
+ if (tp->saw_tstamp) {
+ tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
+ } else {
+ /* If we were retransmiting don't count rtt estimate. */
+ if (tp->retransmits) {
+ if (tp->packets_out == 0) {
+ tp->retransmits = 0;
+ tp->fackets_out = 0;
+ tp->retrans_out = 0;
+ }
+ } else {
+ /* We don't have a timestamp. Can only use
+ * packets that are not retransmitted to determine
+ * rtt estimates. Also, we must not reset the
+ * backoff for rto until we get a non-retransmitted
+ * packet. This allows us to deal with a situation
+ * where the network delay has increased suddenly.
+ * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
+ */
+ if (flag & FLAG_DATA_ACKED) {
+ if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
+ tp->backoff = 0;
+ tcp_rtt_estimator(tp, seq_rtt);
+ tcp_set_rto(tp);
+ tcp_bound_rto(tp);
+ }
+ }
+ }
+ }
+
+ if (tp->packets_out) {
+ if (flag & FLAG_DATA_ACKED)
+ tcp_ack_packets_out(sk, tp);
+ } else {
+ tcp_clear_xmit_timer(sk, TIME_RETRANS);
+ }
+
+ flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
+ if ((ack == tp->snd_una && tp->packets_out && flag == 0) ||
+ (tp->high_seq != 0)) {
+ tcp_fast_retrans(sk, ack, flag);
+ } else {
+ /* Clear any aborted fast retransmit starts. */
+ tp->dup_acks = 0;
+ }
+ /* It is not a brain fart, I thought a bit now. 8)
+ *
+ * Forward progress is indicated, if:
+ * 1. the ack acknowledges new data.
+ * 2. or the ack is duplicate, but it is caused by new segment
+ * arrival. This case is filtered by:
+ * - it contains no data, syn or fin.
+ * - it does not update window.
+ * 3. or new SACK. It is difficult to check, so that we ignore it.
+ *
+	 * Forward progress is also indicated by the arrival of new data,
+	 * which was caused by a window opening from our side. This case is
+	 * more difficult, and it is handled (alas, incorrectly) in
+	 * tcp_data_queue().
+ * --ANK (990513)
+ */
+ if (ack != tp->snd_una || (flag == 0 && !th->fin))
+ dst_confirm(sk->dst_cache);
+
+ /* Remember the highest ack received. */
+ tp->snd_una = ack;
+ return 1;
+
+uninteresting_ack:
+ SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
+ return 0;
+}
+
+/* New-style handling of TIME_WAIT sockets. */
+extern void tcp_tw_schedule(struct tcp_tw_bucket *tw);
+extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw);
+extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
+
+void tcp_timewait_kill(struct tcp_tw_bucket *tw)
+{
+ struct tcp_bind_bucket *tb = tw->tb;
+
+ /* Disassociate with bind bucket. */
+ if(tw->bind_next)
+ tw->bind_next->bind_pprev = tw->bind_pprev;
+ *(tw->bind_pprev) = tw->bind_next;
+ if (tb->owners == NULL) {
+ if (tb->next)
+ tb->next->pprev = tb->pprev;
+ *(tb->pprev) = tb->next;
+ kmem_cache_free(tcp_bucket_cachep, tb);
+ }
+
+ /* Unlink from established hashes. */
+ if(tw->next)
+ tw->next->pprev = tw->pprev;
+ *tw->pprev = tw->next;
+
+ /* We decremented the prot->inuse count when we entered TIME_WAIT
+ * and the sock from which this came was destroyed.
+ */
+ tw->sklist_next->sklist_prev = tw->sklist_prev;
+ tw->sklist_prev->sklist_next = tw->sklist_next;
+
+ /* Ok, now free it up. */
+ kmem_cache_free(tcp_timewait_cachep, tw);
+}
+
+/* We come here as a special case from the AF specific TCP input processing,
+ * and the SKB has no owner. Essentially handling this is very simple,
+ * we just keep silently eating rx'd packets, acking them if necessary,
+ * until none show up for the entire timeout period.
+ *
+ * Returns 0, TCP_TW_ACK, or TCP_TW_RST.
+ */
+enum tcp_tw_status
+tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
+ struct tcphdr *th, unsigned len)
+{
+ /* RFC 1122:
+ * "When a connection is [...] on TIME-WAIT state [...]
+ * [a TCP] MAY accept a new SYN from the remote TCP to
+ * reopen the connection directly, if it:
+ *
+ * (1) assigns its initial sequence number for the new
+ * connection to be larger than the largest sequence
+ * number it used on the previous connection incarnation,
+ * and
+ *
+ * (2) returns to TIME-WAIT state if the SYN turns out
+ * to be an old duplicate".
+ */
+ if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) {
+ struct sock *sk;
+ struct tcp_func *af_specific = tw->af_specific;
+ __u32 isn;
+
+ isn = tw->snd_nxt + 128000;
+ if(isn == 0)
+ isn++;
+ tcp_tw_deschedule(tw);
+ tcp_timewait_kill(tw);
+ sk = af_specific->get_sock(skb, th);
+ if(sk == NULL ||
+ !ipsec_sk_policy(sk,skb) ||
+ atomic_read(&sk->sock_readers) != 0)
+ return 0;
+ skb_set_owner_r(skb, sk);
+ af_specific = sk->tp_pinfo.af_tcp.af_specific;
+ if(af_specific->conn_request(sk, skb, isn) < 0)
+ return TCP_TW_RST; /* Toss a reset back. */
+ return 0; /* Discard the frame. */
+ }
+
+ /* Check RST or SYN */
+ if(th->rst || th->syn) {
+		/* This is TIME_WAIT assassination, in two flavors.
+ * Oh well... nobody has a sufficient solution to this
+ * protocol bug yet.
+ */
+ if(sysctl_tcp_rfc1337 == 0) {
+ tcp_tw_deschedule(tw);
+ tcp_timewait_kill(tw);
+ }
+ if(!th->rst)
+ return TCP_TW_RST; /* toss a reset back */
+ return 0;
+ } else {
+ /* In this case we must reset the TIMEWAIT timer. */
+ if(th->ack)
+ tcp_tw_reschedule(tw);
+ }
+ /* Ack old packets if necessary */
+ if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt) &&
+ (th->doff * 4) > len)
+ return TCP_TW_ACK;
+ return 0;
+}
+
+/* Enter the time wait state. This is always called from BH
+ * context. Essentially we whip up a timewait bucket, copy the
+ * relevant info into it from the SK, and mess with hash chains
+ * and list linkage.
+ */
+static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
+{
+ struct sock **head, *sktw;
+
+ /* Step 1: Remove SK from established hash. */
+ if(sk->next)
+ sk->next->pprev = sk->pprev;
+ *sk->pprev = sk->next;
+ sk->pprev = NULL;
+ tcp_reg_zap(sk);
+
+ /* Step 2: Put TW into bind hash where SK was. */
+ tw->tb = (struct tcp_bind_bucket *)sk->prev;
+ if((tw->bind_next = sk->bind_next) != NULL)
+ sk->bind_next->bind_pprev = &tw->bind_next;
+ tw->bind_pprev = sk->bind_pprev;
+ *sk->bind_pprev = (struct sock *)tw;
+ sk->prev = NULL;
+
+ /* Step 3: Same for the protocol sklist. */
+ (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw;
+ (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw;
+ sk->sklist_next = NULL;
+ sk->prot->inuse--;
+
+ /* Step 4: Hash TW into TIMEWAIT half of established hash table. */
+ head = &tcp_established_hash[sk->hashent + (TCP_HTABLE_SIZE/2)];
+ sktw = (struct sock *)tw;
+ if((sktw->next = *head) != NULL)
+ (*head)->pprev = &sktw->next;
+ *head = sktw;
+ sktw->pprev = head;
+}
+
+void tcp_time_wait(struct sock *sk)
+{
+ struct tcp_tw_bucket *tw;
+
+ tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
+ if(tw != NULL) {
+ /* Give us an identity. */
+ tw->daddr = sk->daddr;
+ tw->rcv_saddr = sk->rcv_saddr;
+ tw->bound_dev_if= sk->bound_dev_if;
+ tw->num = sk->num;
+ tw->state = TCP_TIME_WAIT;
+ tw->sport = sk->sport;
+ tw->dport = sk->dport;
+ tw->family = sk->family;
+ tw->reuse = sk->reuse;
+ tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt;
+ tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt;
+ tw->window = tcp_select_window(sk);
+ tw->af_specific = sk->tp_pinfo.af_tcp.af_specific;
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ if(tw->family == PF_INET6) {
+ memcpy(&tw->v6_daddr,
+ &sk->net_pinfo.af_inet6.daddr,
+ sizeof(struct in6_addr));
+ memcpy(&tw->v6_rcv_saddr,
+ &sk->net_pinfo.af_inet6.rcv_saddr,
+ sizeof(struct in6_addr));
+ }
+#endif
+ /* Linkage updates. */
+ tcp_tw_hashdance(sk, tw);
+
+ /* Get the TIME_WAIT timeout firing. */
+ tcp_tw_schedule(tw);
+
+ /* CLOSE the SK. */
+ if(sk->state == TCP_ESTABLISHED)
+ tcp_statistics.TcpCurrEstab--;
+ sk->state = TCP_CLOSE;
+ net_reset_timer(sk, TIME_DONE,
+ min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME));
+ } else {
+ /* Sorry, we're out of memory, just CLOSE this
+ * socket up. We've got bigger problems than
+ * non-graceful socket closings.
+ */
+ tcp_set_state(sk, TCP_CLOSE);
+ }
+
+ /* Prevent rcvmsg/sndmsg calls, and wake people up. */
+ sk->shutdown = SHUTDOWN_MASK;
+ if(!sk->dead)
+ sk->state_change(sk);
+}
+
+/*
+ * Process the FIN bit. This now behaves as it is supposed to work
+ * and the FIN takes effect when it is validly part of sequence
+ * space. Not before when we get holes.
+ *
+ * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
+ * (and thence onto LAST-ACK and finally, CLOSE, we never enter
+ * TIME-WAIT)
+ *
+ * If we are in FINWAIT-1, a received FIN indicates simultaneous
+ * close and we go into CLOSING (and later onto TIME-WAIT)
+ *
+ * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
+ */
+
+static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
+{
+ sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq;
+
+ tcp_send_ack(sk);
+
+ if (!sk->dead) {
+ sk->state_change(sk);
+ sock_wake_async(sk->socket, 1);
+ }
+
+ switch(sk->state) {
+ case TCP_SYN_RECV:
+ case TCP_ESTABLISHED:
+ /* Move to CLOSE_WAIT */
+ tcp_set_state(sk, TCP_CLOSE_WAIT);
+ if (th->rst)
+ sk->shutdown = SHUTDOWN_MASK;
+ break;
+
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSING:
+ /* Received a retransmission of the FIN, do
+ * nothing.
+ */
+ break;
+ case TCP_LAST_ACK:
+ /* RFC793: Remain in the LAST-ACK state. */
+ break;
+
+ case TCP_FIN_WAIT1:
+ /* This case occurs when a simultaneous close
+ * happens, we must ack the received FIN and
+ * enter the CLOSING state.
+ *
+ * This causes a WRITE timeout, which will either
+ * move on to TIME_WAIT when we timeout, or resend
+ * the FIN properly (maybe we get rid of that annoying
+ * FIN lost hang). The TIME_WRITE code is already
+ * correct for handling this timeout.
+ */
+ tcp_set_state(sk, TCP_CLOSING);
+ break;
+ case TCP_FIN_WAIT2:
+ /* Received a FIN -- send ACK and enter TIME_WAIT. */
+ tcp_time_wait(sk);
+ break;
+ default:
+ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
+ * cases we should never reach this piece of code.
+ */
+ printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
+ break;
+ };
+}
+
+/* These routines update the SACK block as out-of-order packets arrive or
+ * in-order packets close up the sequence space.
+ */
+static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp)
+{
+ int this_sack, num_sacks = tp->num_sacks;
+ struct tcp_sack_block *swalk = &tp->selective_acks[0];
+
+ /* If more than one SACK block, see if the recent change to SP eats into
+ * or hits the sequence space of other SACK blocks, if so coalesce.
+ */
+ if(num_sacks != 1) {
+ for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) {
+ if(swalk == sp)
+ continue;
+
+ /* First case, bottom of SP moves into top of the
+ * sequence space of SWALK.
+ */
+ if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) {
+ sp->start_seq = swalk->start_seq;
+ goto coalesce;
+ }
+ /* Second case, top of SP moves into bottom of the
+ * sequence space of SWALK.
+ */
+ if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) {
+ sp->end_seq = swalk->end_seq;
+ goto coalesce;
+ }
+ }
+ }
+ /* SP is the only SACK, or no coalescing cases found. */
+ return;
+
+coalesce:
+ /* Zap SWALK, by moving every further SACK up by one slot.
+ * Decrease num_sacks.
+ */
+ for(; this_sack < num_sacks-1; this_sack++, swalk++) {
+ struct tcp_sack_block *next = (swalk + 1);
+ swalk->start_seq = next->start_seq;
+ swalk->end_seq = next->end_seq;
+ }
+ tp->num_sacks--;
+}
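+
+/* A coalescing example with made-up sequence numbers: given an existing
+ * block SWALK = [100,200) and SP just grown to [150,300), the bottom of
+ * SP (150) falls inside SWALK, so SP widens to [100,300) and SWALK is
+ * zapped by sliding the remaining blocks up one slot.
+ */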
+
+static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
+{
+ __u32 tmp;
+
+ tmp = sack1->start_seq;
+ sack1->start_seq = sack2->start_seq;
+ sack2->start_seq = tmp;
+
+ tmp = sack1->end_seq;
+ sack1->end_seq = sack2->end_seq;
+ sack2->end_seq = tmp;
+}
+
+static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ int cur_sacks = tp->num_sacks;
+
+ if (!cur_sacks)
+ goto new_sack;
+
+ /* Optimize for the common case, new ofo frames arrive
+ * "in order". ;-) This also satisfies the requirements
+ * of RFC2018 about ordering of SACKs.
+ */
+ if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
+ sp->end_seq = TCP_SKB_CB(skb)->end_seq;
+ tcp_sack_maybe_coalesce(tp, sp);
+ } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
+ /* Re-ordered arrival, in this case, can be optimized
+ * as well.
+ */
+ sp->start_seq = TCP_SKB_CB(skb)->seq;
+ tcp_sack_maybe_coalesce(tp, sp);
+ } else {
+ struct tcp_sack_block *swap = sp + 1;
+ int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4);
+
+ /* Oh well, we have to move things around.
+ * Try to find a SACK we can tack this onto.
+ */
+
+ for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
+ if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
+ (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
+ if(swap->end_seq == TCP_SKB_CB(skb)->seq)
+ swap->end_seq = TCP_SKB_CB(skb)->end_seq;
+ else
+ swap->start_seq = TCP_SKB_CB(skb)->seq;
+ tcp_sack_swap(sp, swap);
+ tcp_sack_maybe_coalesce(tp, sp);
+ return;
+ }
+ }
+
+ /* Could not find an adjacent existing SACK, build a new one,
+ * put it at the front, and shift everyone else down. We
+ * always know there is at least one SACK present already here.
+ *
+ * If the sack array is full, forget about the last one.
+ */
+ if (cur_sacks >= max_sacks) {
+ cur_sacks--;
+ tp->num_sacks--;
+ }
+ while(cur_sacks >= 1) {
+ struct tcp_sack_block *this = &tp->selective_acks[cur_sacks];
+ struct tcp_sack_block *prev = (this - 1);
+ this->start_seq = prev->start_seq;
+ this->end_seq = prev->end_seq;
+ cur_sacks--;
+ }
+
+ new_sack:
+ /* Build the new head SACK, and we're done. */
+ sp->start_seq = TCP_SKB_CB(skb)->seq;
+ sp->end_seq = TCP_SKB_CB(skb)->end_seq;
+ tp->num_sacks++;
+ }
+}
+
+static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb)
+{
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ int num_sacks = tp->num_sacks;
+ int this_sack;
+
+ /* This is an in order data segment _or_ an out-of-order SKB being
+ * moved to the receive queue, so we know this removed SKB will eat
+ * from the front of a SACK.
+ */
+ for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
+ /* Check if the start of the sack is covered by skb. */
+ if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
+ before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))
+ break;
+ }
+
+ /* This should only happen if so many SACKs get built that some get
+ * pushed out before we get here, or we eat some in sequence packets
+ * which are before the first SACK block.
+ */
+ if(this_sack >= num_sacks)
+ return;
+
+ sp->start_seq = TCP_SKB_CB(skb)->end_seq;
+ if(!before(sp->start_seq, sp->end_seq)) {
+ /* Zap this SACK, by moving forward any other SACKS. */
+ for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
+ struct tcp_sack_block *next = (sp + 1);
+ sp->start_seq = next->start_seq;
+ sp->end_seq = next->end_seq;
+ }
+ tp->num_sacks--;
+ }
+}
+
+static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb)
+{
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ int num_sacks = tp->num_sacks;
+ int this_sack;
+
+ for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
+ if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)
+ break;
+ }
+ if(this_sack >= num_sacks)
+ return;
+ sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
+}
+
+/* This one checks to see if we can put data from the
+ * out_of_order queue into the receive_queue.
+ */
+static void tcp_ofo_queue(struct sock *sk)
+{
+ struct sk_buff *skb;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ while ((skb = skb_peek(&tp->out_of_order_queue))) {
+ if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+ break;
+
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+			SOCK_DEBUG(sk, "ofo packet was already received\n");
+ __skb_unlink(skb, skb->list);
+ kfree_skb(skb);
+ continue;
+ }
+ SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq);
+
+ if(tp->sack_ok)
+ tcp_sack_remove_skb(tp, skb);
+ __skb_unlink(skb, skb->list);
+ __skb_queue_tail(&sk->receive_queue, skb);
+ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ if(skb->h.th->fin)
+ tcp_fin(skb, sk, skb->h.th);
+ }
+}
+
+static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff *skb1;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* Queue data for delivery to the user.
+ * Packets in sequence go to the receive queue.
+ * Out of sequence packets to the out_of_order_queue.
+ */
+ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ /* Ok. In sequence. */
+ queue_and_out:
+ dst_confirm(sk->dst_cache);
+ __skb_queue_tail(&sk->receive_queue, skb);
+ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ if(skb->h.th->fin) {
+ tcp_fin(skb, sk, skb->h.th);
+ } else {
+ tcp_remember_ack(tp, skb->h.th, skb);
+ }
+ /* This may have eaten into a SACK block. */
+ if(tp->sack_ok && tp->num_sacks)
+ tcp_sack_remove_skb(tp, skb);
+ tcp_ofo_queue(sk);
+
+ /* Turn on fast path. */
+ if (skb_queue_len(&tp->out_of_order_queue) == 0)
+ tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
+ (0x10 << 16) |
+ tp->snd_wnd);
+ return;
+ }
+
+ /* An old packet, either a retransmit or some packet got lost. */
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+		/* A retransmit, 2nd most common case. Force an immediate ack. */
+ SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
+ tcp_enter_quickack_mode(tp);
+ kfree_skb(skb);
+ return;
+ }
+
+ if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ /* Partial packet, seq < rcv_next < end_seq */
+ SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq);
+
+ goto queue_and_out;
+ }
+
+ /* Ok. This is an out_of_order segment, force an ack. */
+ tp->delayed_acks++;
+ tcp_enter_quickack_mode(tp);
+
+ /* Disable header prediction. */
+ tp->pred_flags = 0;
+
+ SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+
+ if (skb_peek(&tp->out_of_order_queue) == NULL) {
+ /* Initial out of order segment, build 1 SACK. */
+ if(tp->sack_ok) {
+ tp->num_sacks = 1;
+ tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
+ tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
+ }
+ __skb_queue_head(&tp->out_of_order_queue,skb);
+ } else {
+ for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
+ /* Already there. */
+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
+ if (skb->len >= skb1->len) {
+ if(tp->sack_ok)
+ tcp_sack_extend(tp, skb1, skb);
+ __skb_append(skb1, skb);
+ __skb_unlink(skb1, skb1->list);
+ kfree_skb(skb1);
+ } else {
+ /* A duplicate, smaller than what is in the
+ * out-of-order queue right now, toss it.
+ */
+ kfree_skb(skb);
+ }
+ break;
+ }
+
+ if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
+ __skb_append(skb1, skb);
+ if(tp->sack_ok)
+ tcp_sack_new_ofo_skb(sk, skb);
+ break;
+ }
+
+ /* See if we've hit the start. If so insert. */
+ if (skb1 == skb_peek(&tp->out_of_order_queue)) {
+ __skb_queue_head(&tp->out_of_order_queue,skb);
+ if(tp->sack_ok)
+ tcp_sack_new_ofo_skb(sk, skb);
+ break;
+ }
+ }
+ }
+}
+
+
+/*
+ * This routine handles the data. If there is room in the buffer,
+ * it will have already been moved into it. If there is no
+ * room, then we will just have to discard the packet.
+ */
+
+static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
+{
+ struct tcphdr *th;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ th = skb->h.th;
+ skb_pull(skb, th->doff*4);
+ skb_trim(skb, len - (th->doff*4));
+
+ if (skb->len == 0 && !th->fin)
+ return(0);
+
+ /*
+	 * If our receive queue has grown past its limits, shrink it.
+	 * Make sure to do this before moving rcv_nxt, otherwise
+	 * data might be acked that we don't have enough room for.
+ */
+ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
+ if (prune_queue(sk) < 0) {
+ /* Still not enough room. That can happen when
+			 * skb->truesize differs significantly from skb->len.
+ */
+ return 0;
+ }
+ }
+
+ tcp_data_queue(sk, skb);
+
+ if (before(tp->rcv_nxt, tp->copied_seq)) {
+ printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
+ tp->rcv_nxt = tp->copied_seq;
+ }
+
+ /* Above, tcp_data_queue() increments delayed_acks appropriately.
+ * Now tell the user we may have some data.
+ */
+ if (!sk->dead) {
+ sk->data_ready(sk,0);
+ }
+ return(1);
+}
+
+static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
+ tcp_packets_in_flight(tp) < tp->snd_cwnd) {
+ /* Put more data onto the wire. */
+ tcp_write_xmit(sk);
+ } else if (tp->packets_out == 0 && !tp->pending) {
+ /* Start probing the receivers window. */
+ tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
+ }
+}
+
+static __inline__ void tcp_data_snd_check(struct sock *sk)
+{
+ struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head;
+
+ if (skb != NULL)
+ __tcp_data_snd_check(sk, skb);
+}
+
+/*
+ * Adapt the MSS value used to make delayed ack decision to the
+ * real world.
+ */
+static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ unsigned int len = skb->len, lss;
+
+ if (len > tp->rcv_mss)
+ tp->rcv_mss = len;
+ lss = tp->last_seg_size;
+ tp->last_seg_size = 0;
+ if (len >= 536) {
+ if (len == lss)
+ tp->rcv_mss = len;
+ tp->last_seg_size = len;
+ }
+}
+
+/*
+ * Check if sending an ack is needed.
+ */
+static __inline__ void __tcp_ack_snd_check(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* This also takes care of updating the window.
+ * This if statement needs to be simplified.
+ *
+ * Rules for delaying an ack:
+ * - delay time <= 0.5 HZ
+ * - we don't have a window update to send
+ * - must send at least every 2 full sized packets
+ * - must send an ACK if we have any out of order data
+ *
+ * With an extra heuristic to handle loss of packet
+ * situations and also helping the sender leave slow
+	 * start in an expedient manner.
+ */
+
+ /* Two full frames received or... */
+ if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) ||
+ /* We will update the window "significantly" or... */
+ tcp_raise_window(sk) ||
+ /* We entered "quick ACK" mode or... */
+ tcp_in_quickack_mode(tp) ||
+ /* We have out of order data */
+ (skb_peek(&tp->out_of_order_queue) != NULL)) {
+ /* Then ack it now */
+ tcp_send_ack(sk);
+ } else {
+ /* Else, send delayed ack. */
+ tcp_send_delayed_ack(tp, HZ/2);
+ }
+}
+
+static __inline__ void tcp_ack_snd_check(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ if (tp->delayed_acks == 0) {
+ /* We sent a data segment already. */
+ return;
+ }
+ __tcp_ack_snd_check(sk);
+}
+
+
+/*
+ * This routine is only called when we have urgent data
+ * signalled. It's the 'slow' part of tcp_urg. It could be
+ * moved inline now as tcp_urg is only called from one
+ * place. We handle URGent data wrong. We have to - as
+ * BSD still doesn't use the correction from RFC961.
+ * For 1003.1g we should support a new option TCP_STDURG to permit
+ * either form (or just set the sysctl tcp_stdurg).
+ */
+
+static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ u32 ptr = ntohs(th->urg_ptr);
+
+ if (ptr && !sysctl_tcp_stdurg)
+ ptr--;
+ ptr += ntohl(th->seq);
+
+ /* Ignore urgent data that we've already seen and read. */
+ if (after(tp->copied_seq, ptr))
+ return;
+
+ /* Do we already have a newer (or duplicate) urgent pointer? */
+ if (tp->urg_data && !after(ptr, tp->urg_seq))
+ return;
+
+ /* Tell the world about our new urgent pointer. */
+ if (sk->proc != 0) {
+ if (sk->proc > 0)
+ kill_proc(sk->proc, SIGURG, 1);
+ else
+ kill_pg(-sk->proc, SIGURG, 1);
+ }
+
+ /* We may be adding urgent data when the last byte read was
+ * urgent. To do this requires some care. We cannot just ignore
+ * tp->copied_seq since we would read the last urgent byte again
+ * as data, nor can we alter copied_seq until this data arrives
+	 * or we break the semantics of SIOCATMARK (and thus sockatmark())
+ */
+ if (tp->urg_seq == tp->copied_seq)
+ tp->copied_seq++; /* Move the copied sequence on correctly */
+ tp->urg_data = URG_NOTYET;
+ tp->urg_seq = ptr;
+
+ /* Disable header prediction. */
+ tp->pred_flags = 0;
+}
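+
+/* An example of the pointer arithmetic above (assumed numbers): with
+ * th->seq = 1000 and th->urg_ptr = 4, the default BSD-compatible
+ * interpretation (sysctl_tcp_stdurg == 0) decrements first, giving
+ * ptr = 1003, the urgent byte itself; with tcp_stdurg set the value is
+ * used as-is and ptr = 1004 points one byte later. Which convention the
+ * peer meant is exactly what the sysctl selects.
+ */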
+
+/* This is the 'fast' part of urgent handling. */
+static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* Check if we get a new urgent pointer - normally not. */
+ if (th->urg)
+ tcp_check_urg(sk,th);
+
+ /* Do we wait for any urgent data? - normally not... */
+ if (tp->urg_data == URG_NOTYET) {
+ u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
+
+ /* Is the urgent pointer pointing into this packet? */
+ if (ptr < len) {
+ tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
+ if (!sk->dead)
+ sk->data_ready(sk,0);
+ }
+ }
+}
+
+/* Clean the out_of_order queue if we can, trying to get
+ * the socket within its memory limits again.
+ *
+ * Return less than zero if we should start dropping frames
+ * until the socket owning process reads some of the data
+ * to stabilize the situation.
+ */
+static int prune_queue(struct sock *sk)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ struct sk_buff * skb;
+
+ SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
+
+ net_statistics.PruneCalled++;
+
+ /* First, purge the out_of_order queue. */
+ skb = __skb_dequeue_tail(&tp->out_of_order_queue);
+ if(skb != NULL) {
+ /* Free it all. */
+		do {
+			net_statistics.OfoPruned += skb->len;
+			kfree_skb(skb);
+			skb = __skb_dequeue_tail(&tp->out_of_order_queue);
+		} while(skb != NULL);
+
+ /* Reset SACK state. A conforming SACK implementation will
+ * do the same at a timeout based retransmit. When a connection
+ * is in a sad state like this, we care only about integrity
+ * of the connection not performance.
+ */
+ if(tp->sack_ok)
+ tp->num_sacks = 0;
+ }
+
+ /* If we are really being abused, tell the caller to silently
+ * drop receive data on the floor. It will get retransmitted
+ * and hopefully then we'll have sufficient space.
+ *
+ * We used to try to purge the in-order packets too, but that
+ * turns out to be deadly and fraught with races. Consider:
+ *
+ * 1) If we acked the data, we absolutely cannot drop the
+ * packet. This data would then never be retransmitted.
+ * 2) It is possible, with a proper sequence of events involving
+ * delayed acks and backlog queue handling, to have the user
+ * read the data before it gets acked. The previous code
+	 *    here got this wrong, and it led to data corruption.
+	 * 3) Too many state changes happen when the FIN arrives, so once
+	 *    we've seen that we can't remove any in-order data safely.
+	 *
+	 * The net result is that removing in-order receive data is too
+	 * complex for anyone's sanity. So we don't do it anymore. But
+ * if we are really having our buffer space abused we stop accepting
+ * new receive data.
+ */
+ if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1))
+ return 0;
+
+ /* Massive buffer overcommit. */
+ return -1;
+}
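+
+/* The threshold above in numbers (illustrative): with sk->rcvbuf at
+ * 64KB we keep returning 0, dropping nothing beyond the out_of_order
+ * queue, until rmem_alloc reaches twice that, 128KB. Only past that
+ * 2x overcommit does the caller start dropping new receive data on
+ * the floor, counting on retransmission to deliver it again later.
+ */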
+
+/*
+ * TCP receive function for the ESTABLISHED state.
+ *
+ * It is split into a fast path and a slow path. The fast path is
+ * disabled when:
+ * - A zero window was announced from us - zero window probing
+ * is only handled properly in the slow path.
+ * - Out of order segments arrived.
+ * - Urgent data is expected.
+ * - There is no buffer space left
+ * - Unexpected TCP flags/window values/header lengths are received
+ * (detected by checking the TCP header against pred_flags)
+ * - Data is sent in both directions. Fast path only supports pure senders
+ * or pure receivers (this means either the sequence number or the ack
+ * value must stay constant)
+ *
+ * When these conditions are not satisfied it drops into a standard
+ * receive procedure patterned after RFC793 to handle all cases.
+ * The first three cases are guaranteed by proper pred_flags setting,
+ * the rest is checked inline. Fast processing is turned on in
+ * tcp_data_queue when everything is OK.
+ */
+int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ struct tcphdr *th, unsigned len)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int queued;
+ u32 flg;
+
+ /*
+ * Header prediction.
+ * The code follows the one in the famous
+ * "30 instruction TCP receive" Van Jacobson mail.
+ *
+ * Van's trick is to deposit buffers into socket queue
+ * on a device interrupt, to call tcp_recv function
+ * on the receive process context and checksum and copy
+ * the buffer to user space. smart...
+ *
+ * Our current scheme is not silly either but we take the
+ * extra cost of the net_bh soft interrupt processing...
+ * We do checksum and copy also but from device to kernel.
+ */
+
+ /*
+ * RFC1323: H1. Apply PAWS check first.
+ */
+ if (tcp_fast_parse_options(sk, th, tp)) {
+ if (tp->saw_tstamp) {
+ if (tcp_paws_discard(tp, th, len)) {
+ tcp_statistics.TcpInErrs++;
+ if (!th->rst) {
+ tcp_send_ack(sk);
+ goto discard;
+ }
+ }
+ tcp_replace_ts_recent(sk, tp,
+ TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq);
+ }
+ }
+
+ flg = *(((u32 *)th) + 3) & ~htonl(0xFC8 << 16);
+
+ /* pred_flags is 0xS?10 << 16 + snd_wnd
+	 * if header prediction is to be made
+ * 'S' will always be tp->tcp_header_len >> 2
+ * '?' will be 0 else it will be !0
+ * (when there are holes in the receive
+ * space for instance)
+ * PSH flag is ignored.
+ */
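+	/* For instance, with timestamps negotiated tcp_header_len is
+	 * 20 + 12 = 32 bytes, so 'S' is 8 and a fully predicted segment
+	 * has flg == htonl(0x8010WWWW): doff of 8, only ACK (0x10) among
+	 * the flags, and WWWW equal to our cached snd_wnd. The
+	 * ~htonl(0xFC8 << 16) mask above strips the reserved bits and PSH
+	 * so they cannot spoil the comparison.
+	 */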
+
+ if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ if (len <= th->doff*4) {
+ /* Bulk data transfer: sender */
+ if (len == th->doff*4) {
+ tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->ack_seq, len);
+ kfree_skb(skb);
+ tcp_data_snd_check(sk);
+ return 0;
+ } else { /* Header too small */
+ tcp_statistics.TcpInErrs++;
+ goto discard;
+ }
+ } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una &&
+ atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
+ /* Bulk data transfer: receiver */
+ __skb_pull(skb,th->doff*4);
+
+ tcp_measure_rcv_mss(sk, skb);
+
+ /* DO NOT notify forward progress here.
+ * It saves dozen of CPU instructions in fast path. --ANK
+ */
+ __skb_queue_tail(&sk->receive_queue, skb);
+ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+
+ /* FIN bit check is not done since if FIN is set in
+ * this frame, the pred_flags won't match up. -DaveM
+ */
+ sk->data_ready(sk, 0);
+ tcp_delack_estimator(tp);
+
+ tcp_remember_ack(tp, th, skb);
+
+ __tcp_ack_snd_check(sk);
+ return 0;
+ }
+ }
+
+ /*
+ * Standard slow path.
+ */
+
+ if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
+ /* RFC793, page 37: "In all states except SYN-SENT, all reset
+ * (RST) segments are validated by checking their SEQ-fields."
+ * And page 69: "If an incoming segment is not acceptable,
+ * an acknowledgment should be sent in reply (unless the RST bit
+ * is set, if so drop the segment and return)".
+ */
+ if (th->rst)
+ goto discard;
+ if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
+ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+ tp->rcv_wup, tp->rcv_wnd);
+ }
+ tcp_send_ack(sk);
+ goto discard;
+ }
+
+ if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
+ SOCK_DEBUG(sk, "syn in established state\n");
+ tcp_statistics.TcpInErrs++;
+ tcp_reset(sk);
+ return 1;
+ }
+
+ if(th->rst) {
+ tcp_reset(sk);
+ goto discard;
+ }
+
+ if(th->ack)
+ tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
+
+ /* Process urgent data. */
+ tcp_urg(sk, th, len);
+
+ /* step 7: process the segment text */
+ queued = tcp_data(skb, sk, len);
+
+ /* This must be after tcp_data() does the skb_pull() to
+ * remove the header size from skb->len.
+ *
+ * Dave!!! Phrase above (and all about rcv_mss) has
+ * nothing to do with reality. rcv_mss must measure TOTAL
+ * size, including sacks, IP options etc. Hence, measure_rcv_mss
+	 * must occur before pulling etc, otherwise it will flap
+ * like hell. Even putting it before tcp_data is wrong,
+ * it should use skb->tail - skb->nh.raw instead.
+ * --ANK (980805)
+ *
+ * BTW I broke it. Now all TCP options are handled equally
+ * in mss_clamp calculations (i.e. ignored, rfc1122),
+ * and mss_cache does include all of them (i.e. tstamps)
+	 * except for sacks, to calculate effective mss faster.
+ * --ANK (980805)
+ */
+ tcp_measure_rcv_mss(sk, skb);
+
+ /* Be careful, tcp_data() may have put this into TIME_WAIT. */
+ if(sk->state != TCP_CLOSE) {
+ tcp_data_snd_check(sk);
+ tcp_ack_snd_check(sk);
+ }
+
+ if (!queued) {
+ discard:
+ kfree_skb(skb);
+ }
+
+ return 0;
+}
+
+/*
+ * Process an incoming SYN or SYN-ACK for SYN_RECV sockets represented
+ * as an open_request.
+ */
+
+struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
+ struct open_request *req)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ u32 flg;
+
+ /* assumption: the socket is not in use.
+ * as we checked the user count on tcp_rcv and we're
+ * running from a soft interrupt.
+ */
+
+ /* Check for syn retransmission */
+ flg = *(((u32 *)skb->h.th) + 3);
+
+ flg &= __constant_htonl(0x00170000);
+ /* Only SYN set? */
+ if (flg == __constant_htonl(0x00020000)) {
+ if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
+			/* Retransmitted SYN. */
+ req->class->rtx_syn_ack(sk, req);
+ return NULL;
+ } else {
+ return sk; /* Pass new SYN to the listen socket. */
+ }
+ }
+
+ /* We know it's an ACK here */
+ if (req->sk) {
+ /* socket already created but not
+ * yet accepted()...
+ */
+ sk = req->sk;
+ } else {
+ /* In theory the packet could be for a cookie, but
+ * TIME_WAIT should guard us against this.
+ * XXX: Nevertheless check for cookies?
+ * This sequence number check is done again later,
+ * but we do it here to prevent syn flood attackers
+ * from creating big SYN_RECV sockets.
+ */
+ if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) ||
+ !between(TCP_SKB_CB(skb)->seq, req->rcv_isn,
+ req->rcv_isn+1+req->rcv_wnd)) {
+ req->class->send_reset(skb);
+ return NULL;
+ }
+
+ sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
+ tcp_dec_slow_timer(TCP_SLT_SYNACK);
+ if (sk == NULL)
+ return NULL;
+
+ req->expires = 0UL;
+ req->sk = sk;
+ }
+ skb_orphan(skb);
+ skb_set_owner_r(skb, sk);
+ return sk;
+}
+
+/*
+ * This function implements the receiving procedure of RFC 793 for
+ * all states except ESTABLISHED and TIME_WAIT.
+ * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
+ * address independent.
+ */
+
+int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ struct tcphdr *th, unsigned len)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int queued = 0;
+
+ switch (sk->state) {
+ case TCP_CLOSE:
+ /* When state == CLOSED, hash lookup always fails.
+ *
+ * But, there is a back door, the backlog queue.
+ * If we have a sequence of packets in the backlog
+ * during __release_sock() which have a sequence such
+ * that:
+ * packet X causes entry to TCP_CLOSE state
+ * ...
+ * packet X + N has FIN bit set
+ *
+ * We report a (luckily) harmless error in this case.
+ * The issue is that backlog queue processing bypasses
+ * any hash lookups (we know which socket packets are for).
+ * The correct behavior here is what 2.0.x did, since
+ * a TCP_CLOSE socket does not exist. Drop the frame
+ * and send a RST back to the other end.
+ */
+ return 1;
+
+ case TCP_LISTEN:
+ /* These use the socket TOS..
+ * might want to be the received TOS
+ */
+ if(th->ack)
+ return 1;
+
+ if(th->syn) {
+ if(tp->af_specific->conn_request(sk, skb, 0) < 0)
+ return 1;
+
+ /* Now we have several options: In theory there is
+ * nothing else in the frame. KA9Q has an option to
+ * send data with the syn, BSD accepts data with the
+ * syn up to the [to be] advertised window and
+ * Solaris 2.1 gives you a protocol error. For now
+ * we just ignore it, that fits the spec precisely
+ * and avoids incompatibilities. It would be nice in
+ * future to drop through and process the data.
+ *
+ * Now that TTCP is starting to be used we ought to
+ * queue this data.
+ * But, this leaves one open to an easy denial of
+ * service attack, and SYN cookies can't defend
+ * against this problem. So, we drop the data
+ * in the interest of security over speed.
+ */
+ goto discard;
+ }
+
+ goto discard;
+ break;
+
+ case TCP_SYN_SENT:
+ /* SYN sent means we have to look for a suitable ack and
+ * either reset for bad matches or go to connected.
+ * The SYN_SENT case is unusual and should
+ * not be in line code. [AC]
+ */
+ if(th->ack) {
+ /* rfc793:
+ * "If the state is SYN-SENT then
+ * first check the ACK bit
+ * If the ACK bit is set
+ * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
+ * a reset (unless the RST bit is set, if so drop
+ * the segment and return)"
+ *
+ * I cite this place to emphasize one essential
+			 * detail: this check is different from the one
+ * in established state: SND.UNA <= SEG.ACK <= SND.NXT.
+ * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT,
+ * because we have no previous data sent before SYN.
+ * --ANK(990513)
+ *
+ * We do not send data with SYN, so that RFC-correct
+ * test reduces to:
+ */
+ if (sk->zapped ||
+ TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
+ return 1;
+
+ /* Now ACK is acceptable.
+ *
+ * "If the RST bit is set
+ * If the ACK was acceptable then signal the user "error:
+ * connection reset", drop the segment, enter CLOSED state,
+ * delete TCB, and return."
+ */
+
+ if (th->rst) {
+ tcp_reset(sk);
+ goto discard;
+ }
+
+ /* rfc793:
+ * "fifth, if neither of the SYN or RST bits is set then
+ * drop the segment and return."
+ *
+ * See note below!
+ * --ANK(990513)
+ */
+
+ if (!th->syn)
+ goto discard;
+
+ /* rfc793:
+ * "If the SYN bit is on ...
+ * are acceptable then ...
+ * (our SYN has been ACKed), change the connection
+ * state to ESTABLISHED..."
+ *
+ * Do you see? SYN-less ACKs in SYN-SENT state are
+ * completely ignored.
+ *
+ * The bug causing stalled SYN-SENT sockets
+ * was here: tcp_ack advanced snd_una and canceled
+ * retransmit timer, so that bare ACK received
+ * in SYN-SENT state (even with invalid ack==ISS,
+ * because tcp_ack check is too weak for SYN-SENT)
+ * causes moving socket to invalid semi-SYN-SENT,
+ * semi-ESTABLISHED state and connection hangs.
+ *
+ * There exist buggy stacks, which really send
+ * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp)
+ * Actually, if this host did not try to get something
+ * from ftp.inr.ac.ru I'd never find this bug 8)
+ *
+ * --ANK (990514)
+ */
+
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->ack_seq, len);
+
+ /* Ok.. it's good. Set up sequence numbers and
+ * move to established.
+ */
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is
+ * never scaled.
+ */
+ tp->snd_wnd = htons(th->window);
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
+ tp->fin_seq = TCP_SKB_CB(skb)->seq;
+
+ tcp_set_state(sk, TCP_ESTABLISHED);
+ tcp_parse_options(sk, th, tp, 0);
+
+ if (tp->wscale_ok == 0) {
+ tp->snd_wscale = tp->rcv_wscale = 0;
+ tp->window_clamp = min(tp->window_clamp,65535);
+ }
+
+ if (tp->tstamp_ok) {
+ tp->tcp_header_len =
+ sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ if (tp->saw_tstamp) {
+ tp->ts_recent = tp->rcv_tsval;
+ tp->ts_recent_stamp = tcp_time_stamp;
+ }
+
+ /* Can't be earlier, doff would be wrong. */
+ tcp_send_ack(sk);
+
+ sk->dport = th->source;
+ tp->copied_seq = tp->rcv_nxt;
+
+ if(!sk->dead) {
+ sk->state_change(sk);
+ sock_wake_async(sk->socket, 0);
+ }
+ } else {
+ if(th->syn && !th->rst) {
+ /* The previous version of the code
+ * checked for "connecting to self"
+ * here. that check is done now in
+ * tcp_connect.
+ */
+ tcp_set_state(sk, TCP_SYN_RECV);
+ tcp_parse_options(sk, th, tp, 0);
+ if (tp->saw_tstamp) {
+ tp->ts_recent = tp->rcv_tsval;
+ tp->ts_recent_stamp = tcp_time_stamp;
+ }
+
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is
+ * never scaled.
+ */
+ tp->snd_wnd = htons(th->window);
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+
+ tcp_send_synack(sk);
+ } else
+ break;
+ }
+
+ /* tp->tcp_header_len and tp->mss_clamp
+ probably changed, synchronize mss.
+ */
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+ tp->rcv_mss = tp->mss_cache;
+
+ if (sk->state == TCP_SYN_RECV)
+ goto discard;
+
+ goto step6;
+ }
+
+ /* Parse the tcp_options present on this header.
+ * By this point we really only expect timestamps.
+ * Note that this really has to be here and not later for PAWS
+ * (RFC1323) to work.
+ */
+ if (tcp_fast_parse_options(sk, th, tp)) {
+ /* NOTE: assumes saw_tstamp is never set if we didn't
+ * negotiate the option. tcp_fast_parse_options() must
+ * guarantee this.
+ */
+ if (tp->saw_tstamp) {
+ if (tcp_paws_discard(tp, th, len)) {
+ tcp_statistics.TcpInErrs++;
+ if (!th->rst) {
+ tcp_send_ack(sk);
+ goto discard;
+ }
+ }
+ tcp_replace_ts_recent(sk, tp,
+ TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq);
+ }
+ }
+
+ /* The silly FIN test here is necessary to see an advancing ACK in
+ * retransmitted FIN frames properly. Consider the following sequence:
+ *
+ * host1 --> host2 FIN XSEQ:XSEQ(0) ack YSEQ
+ * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ
+ * host1 --> host2 XSEQ:XSEQ(0) ack YSEQ+1
+ * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ+1 (fails tcp_sequence test)
+ *
+ * At this point the connection will deadlock with host1 believing
+	 * that its FIN is never ACK'd, and thus it will retransmit its FIN
+ * forever. The following fix is from Taral (taral@taral.net).
+ */
+
+ /* step 1: check sequence number */
+ if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq) &&
+ !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)) {
+ if (!th->rst) {
+ tcp_send_ack(sk);
+ }
+ goto discard;
+ }
+
+ /* step 2: check RST bit */
+ if(th->rst) {
+ tcp_reset(sk);
+ goto discard;
+ }
+
+ /* step 3: check security and precedence [ignored] */
+
+ /* step 4:
+ *
+ * Check for a SYN, and ensure it matches the SYN we were
+ * first sent. We have to handle the rather unusual (but valid)
+ * sequence that KA9Q derived products may generate of
+ *
+ * SYN
+ * SYN|ACK Data
+ * ACK (lost)
+ * SYN|ACK Data + More Data
+ * .. we must ACK not RST...
+ *
+ * We keep syn_seq as the sequence space occupied by the
+ * original syn.
+ */
+
+ if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
+ tcp_reset(sk);
+ return 1;
+ }
+
+ /* step 5: check the ACK field */
+ if (th->ack) {
+ int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->ack_seq, len);
+
+ switch(sk->state) {
+ case TCP_SYN_RECV:
+ if (acceptable) {
+ tcp_set_state(sk, TCP_ESTABLISHED);
+ sk->dport = th->source;
+ tp->copied_seq = tp->rcv_nxt;
+
+ if(!sk->dead)
+ sk->state_change(sk);
+
+ tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
+ tp->snd_wnd = htons(th->window) << tp->snd_wscale;
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
+
+ } else {
+ SOCK_DEBUG(sk, "bad ack\n");
+ return 1;
+ }
+ break;
+
+ case TCP_FIN_WAIT1:
+ if (tp->snd_una == tp->write_seq) {
+ sk->shutdown |= SEND_SHUTDOWN;
+ tcp_set_state(sk, TCP_FIN_WAIT2);
+ if (!sk->dead)
+ sk->state_change(sk);
+ else
+ tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
+ }
+ break;
+
+ case TCP_CLOSING:
+ if (tp->snd_una == tp->write_seq) {
+ tcp_time_wait(sk);
+ goto discard;
+ }
+ break;
+
+ case TCP_LAST_ACK:
+ if (tp->snd_una == tp->write_seq) {
+ sk->shutdown = SHUTDOWN_MASK;
+ tcp_set_state(sk,TCP_CLOSE);
+ if (!sk->dead)
+ sk->state_change(sk);
+ goto discard;
+ }
+ break;
+ }
+ } else
+ goto discard;
+
+step6:
+ /* step 6: check the URG bit */
+ tcp_urg(sk, th, len);
+
+ /* step 7: process the segment text */
+ switch (sk->state) {
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSING:
+ if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
+ break;
+
+ case TCP_FIN_WAIT1:
+ case TCP_FIN_WAIT2:
+ /* RFC 793 says to queue data in these states,
+ * RFC 1122 says we MUST send a reset.
+ * BSD 4.4 also does reset.
+ */
+ if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
+ if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+ tcp_reset(sk);
+ return 1;
+ }
+ }
+
+ case TCP_ESTABLISHED:
+ queued = tcp_data(skb, sk, len);
+
+ /* This must be after tcp_data() does the skb_pull() to
+ * remove the header size from skb->len.
+ */
+ tcp_measure_rcv_mss(sk, skb);
+ break;
+ }
+
+ tcp_data_snd_check(sk);
+ tcp_ack_snd_check(sk);
+
+ if (!queued) {
+discard:
+ kfree_skb(skb);
+ }
+ return 0;
+}
diff --git a/pfinet/linux-src/net/ipv4/tcp_ipv4.c b/pfinet/linux-src/net/ipv4/tcp_ipv4.c
new file mode 100644
index 00000000..c2c78365
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/tcp_ipv4.c
@@ -0,0 +1,2044 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version: $Id: tcp_ipv4.c,v 1.175.2.10 1999/08/13 16:14:35 davem Exp $
+ *
+ * IPv4 specific functions
+ *
+ *
+ * code split from:
+ * linux/ipv4/tcp.c
+ * linux/ipv4/tcp_input.c
+ * linux/ipv4/tcp_output.c
+ *
+ * See tcp.c for author information
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Changes:
+ * David S. Miller : New socket lookup architecture.
+ * This code is dedicated to John Dyson.
+ * David S. Miller : Change semantics of established hash,
+ * half is devoted to TIME_WAIT sockets
+ * and the rest go in the other half.
+ * Andi Kleen : Add support for syncookies and fixed
+ * some bugs: ip options weren't passed to
+ * the TCP layer, missed a check for an ACK bit.
+ * Andi Kleen : Implemented fast path mtu discovery.
+ * Fixed many serious bugs in the
+ * open_request handling and moved
+ * most of it into the af independent code.
+ * Added tail drop and some other bugfixes.
+ *					Added new listen semantics.
+ * Mike McLagan : Routing by source
+ * Juan Jose Ciarlante: ip_dynaddr bits
+ * Andi Kleen: various fixes.
+ *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
+ * Andi Kleen : Fix new listen.
+ * Andi Kleen : Fix accept error reporting.
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/ipsec.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+
+#include <asm/segment.h>
+
+#include <linux/inet.h>
+#include <linux/stddef.h>
+
+extern int sysctl_tcp_timestamps;
+extern int sysctl_tcp_window_scaling;
+extern int sysctl_tcp_sack;
+extern int sysctl_tcp_syncookies;
+extern int sysctl_ip_dynaddr;
+extern __u32 sysctl_wmem_max;
+extern __u32 sysctl_rmem_max;
+
+/* Check TCP sequence numbers in ICMP packets. */
+#define ICMP_MIN_LENGTH 8
+
+/* Socket used for sending RSTs */
+struct inode tcp_inode;
+struct socket *tcp_socket=&tcp_inode.u.socket_i;
+
+static void tcp_v4_send_reset(struct sk_buff *skb);
+
+void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
+ struct sk_buff *skb);
+
+/* This is for sockets with full identity only. Sockets here will always
+ * be without wildcards and will have the following invariant:
+ * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
+ *
+ * First half of the table is for sockets not in TIME_WAIT, second half
+ * is for TIME_WAIT sockets only.
+ */
+struct sock *tcp_established_hash[TCP_HTABLE_SIZE];
+
+/* Ok, let's try this, I give up, we do need a local binding
+ * TCP hash as well as the others for fast bind/connect.
+ */
+struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE];
+
+/* All sockets in TCP_LISTEN state will be in here. This is the only table
+ * where wildcard'd TCP sockets can exist. Hash function here is just local
+ * port number.
+ */
+struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
+
+/* Register cache. */
+struct sock *tcp_regs[TCP_NUM_REGS];
+
+/*
+ * This array holds the first and last local port number.
+ * For high-usage systems, use sysctl to change this to
+ * 32768-61000
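+ * (e.g. echo 32768 61000 > /proc/sys/net/ipv4/ip_local_port_range)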
+ */
+int sysctl_local_port_range[2] = { 1024, 4999 };
+int tcp_port_rover = (1024 - 1);
+
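+/* XOR-fold the connection 4-tuple into an index in the first half of
+ * the established hash table; the second half is reserved for
+ * TIME_WAIT sockets (see the comment on tcp_established_hash above).
+ */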
+static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
+ __u32 faddr, __u16 fport)
+{
+ return ((laddr ^ lport) ^ (faddr ^ fport)) & ((TCP_HTABLE_SIZE/2) - 1);
+}
+
+static __inline__ int tcp_sk_hashfn(struct sock *sk)
+{
+ __u32 laddr = sk->rcv_saddr;
+ __u16 lport = sk->num;
+ __u32 faddr = sk->daddr;
+ __u16 fport = sk->dport;
+
+ return tcp_hashfn(laddr, lport, faddr, fport);
+}
+
+/* Allocate and initialize a new TCP local port bind bucket.
+ * Always runs inside the socket hashing lock.
+ */
+struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
+{
+ struct tcp_bind_bucket *tb;
+
+ tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
+ if(tb != NULL) {
+ struct tcp_bind_bucket **head =
+ &tcp_bound_hash[tcp_bhashfn(snum)];
+ tb->port = snum;
+ tb->fastreuse = 0;
+ tb->owners = NULL;
+ if((tb->next = *head) != NULL)
+ tb->next->pprev = &tb->next;
+ *head = tb;
+ tb->pprev = head;
+ }
+ return tb;
+}
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+/* Ensure that the bound bucket for the port exists.
+ * Returns 0 on success (creating the bucket if needed), 1 on failure.
+ *
+ * Must run in a BH atomic section.
+ */
+static __inline__ int __tcp_bucket_check(unsigned short snum)
+{
+ struct tcp_bind_bucket *tb;
+
+ tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ for( ; (tb && (tb->port != snum)); tb = tb->next)
+ ;
+ if (tb == NULL) {
+ if ((tb = tcp_bucket_create(snum)) == NULL)
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
+static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
+{
+ struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ if (child->num != sk->num) {
+ unsigned short snum = ntohs(child->num);
+ for(tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ tb && tb->port != snum;
+ tb = tb->next)
+ ;
+ if (tb == NULL)
+ tb = (struct tcp_bind_bucket *)sk->prev;
+ }
+#endif
+ if ((child->bind_next = tb->owners) != NULL)
+ tb->owners->bind_pprev = &child->bind_next;
+ tb->owners = child;
+ child->bind_pprev = &tb->owners;
+ child->prev = (struct sock *) tb;
+}
+
+__inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
+{
+ SOCKHASH_LOCK();
+ __tcp_inherit_port(sk, child);
+ SOCKHASH_UNLOCK();
+}
+
+/* Obtain a reference to a local port for the given sock;
+ * if snum is zero, it means select any available local port.
+ */
+static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
+{
+ struct tcp_bind_bucket *tb;
+
+ SOCKHASH_LOCK();
+ if (snum == 0) {
+ int rover = tcp_port_rover;
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ int remaining = (high - low) + 1;
+
+ do { rover++;
+ if ((rover < low) || (rover > high))
+ rover = low;
+ tb = tcp_bound_hash[tcp_bhashfn(rover)];
+ for ( ; tb; tb = tb->next)
+ if (tb->port == rover)
+ goto next;
+ break;
+		next:;
+ } while (--remaining > 0);
+ tcp_port_rover = rover;
+
+ /* Exhausted local port range during search? */
+ if (remaining <= 0)
+ goto fail;
+
+ /* OK, here is the one we will use. */
+ snum = rover;
+ tb = NULL;
+ } else {
+ for (tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ tb != NULL;
+ tb = tb->next)
+ if (tb->port == snum)
+ break;
+ }
+ if (tb != NULL && tb->owners != NULL) {
+ if (tb->fastreuse != 0 && sk->reuse != 0) {
+ goto success;
+ } else {
+ struct sock *sk2 = tb->owners;
+ int sk_reuse = sk->reuse;
+
+ for( ; sk2 != NULL; sk2 = sk2->bind_next) {
+ if (sk->bound_dev_if == sk2->bound_dev_if) {
+ if (!sk_reuse ||
+ !sk2->reuse ||
+ sk2->state == TCP_LISTEN) {
+ if (!sk2->rcv_saddr ||
+ !sk->rcv_saddr ||
+ (sk2->rcv_saddr == sk->rcv_saddr))
+ break;
+ }
+ }
+ }
+ /* If we found a conflict, fail. */
+ if (sk2 != NULL)
+ goto fail;
+ }
+ }
+ if (tb == NULL &&
+ (tb = tcp_bucket_create(snum)) == NULL)
+ goto fail;
+ if (tb->owners == NULL) {
+ if (sk->reuse && sk->state != TCP_LISTEN)
+ tb->fastreuse = 1;
+ else
+ tb->fastreuse = 0;
+ } else if (tb->fastreuse &&
+ ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
+ tb->fastreuse = 0;
+success:
+ sk->num = snum;
+ if ((sk->bind_next = tb->owners) != NULL)
+ tb->owners->bind_pprev = &sk->bind_next;
+ tb->owners = sk;
+ sk->bind_pprev = &tb->owners;
+ sk->prev = (struct sock *) tb;
+
+ SOCKHASH_UNLOCK();
+ return 0;
+
+fail:
+ SOCKHASH_UNLOCK();
+ return 1;
+}
+
+/* Get rid of any references to a local port held by the
+ * given sock.
+ */
+__inline__ void __tcp_put_port(struct sock *sk)
+{
+ struct tcp_bind_bucket *tb;
+
+ tb = (struct tcp_bind_bucket *) sk->prev;
+ if (sk->bind_next)
+ sk->bind_next->bind_pprev = sk->bind_pprev;
+ *(sk->bind_pprev) = sk->bind_next;
+ sk->prev = NULL;
+ if (tb->owners == NULL) {
+ if (tb->next)
+ tb->next->pprev = tb->pprev;
+ *(tb->pprev) = tb->next;
+ kmem_cache_free(tcp_bucket_cachep, tb);
+ }
+}
+
+void tcp_put_port(struct sock *sk)
+{
+ SOCKHASH_LOCK();
+ __tcp_put_port(sk);
+ SOCKHASH_UNLOCK();
+}
+
+static __inline__ void __tcp_v4_hash(struct sock *sk)
+{
+ struct sock **skp;
+
+ if(sk->state == TCP_LISTEN)
+ skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
+ else
+ skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
+
+ if((sk->next = *skp) != NULL)
+ (*skp)->pprev = &sk->next;
+ *skp = sk;
+ sk->pprev = skp;
+}
+
+static void tcp_v4_hash(struct sock *sk)
+{
+ if (sk->state != TCP_CLOSE) {
+ SOCKHASH_LOCK();
+ __tcp_v4_hash(sk);
+ SOCKHASH_UNLOCK();
+ }
+}
+
+static void tcp_v4_unhash(struct sock *sk)
+{
+ SOCKHASH_LOCK();
+ if(sk->pprev) {
+ if(sk->next)
+ sk->next->pprev = sk->pprev;
+ *sk->pprev = sk->next;
+ sk->pprev = NULL;
+ tcp_reg_zap(sk);
+ __tcp_put_port(sk);
+ }
+ SOCKHASH_UNLOCK();
+}
+
+/* Don't inline this cruft. There are some nice properties to
+ * exploit here. The BSD API does not allow a listening TCP
+ * socket to specify the remote port or the remote address for
+ * the connection. So always assume those are both wildcarded
+ * during the search, since they can never be otherwise.
+ */
+static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
+{
+ struct sock *sk;
+ struct sock *result = NULL;
+ int score, hiscore;
+
+ hiscore=0;
+ for(sk = tcp_listening_hash[tcp_lhashfn(hnum)]; sk; sk = sk->next) {
+ if(sk->num == hnum) {
+ __u32 rcv_saddr = sk->rcv_saddr;
+
+ score = 1;
+ if(rcv_saddr) {
+ if (rcv_saddr != daddr)
+ continue;
+ score++;
+ }
+ if (sk->bound_dev_if) {
+ if (sk->bound_dev_if != dif)
+ continue;
+ score++;
+ }
+ if (score == 3)
+ return sk;
+ if (score > hiscore) {
+ hiscore = score;
+ result = sk;
+ }
+ }
+ }
+ return result;
+}
+
+/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
+ * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
+ * It is assumed that this code only gets called from within NET_BH.
+ */
+static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
+ u32 saddr, u16 sport,
+ u32 daddr, u16 dport, int dif)
+{
+ TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
+ __u16 hnum = ntohs(dport);
+ __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
+ struct sock *sk;
+ int hash;
+
+ /* Check TCP register quick cache first. */
+ sk = TCP_RHASH(sport);
+ if(sk && TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
+ goto hit;
+
+ /* Optimize here for direct hit, only listening connections can
+ * have wildcards anyways.
+ */
+ hash = tcp_hashfn(daddr, hnum, saddr, sport);
+ for(sk = tcp_established_hash[hash]; sk; sk = sk->next) {
+ if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) {
+ if (sk->state == TCP_ESTABLISHED)
+ TCP_RHASH(sport) = sk;
+ goto hit; /* You sunk my battleship! */
+ }
+ }
+ /* Must check for a TIME_WAIT'er before going to listener hash. */
+ for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next)
+ if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
+ goto hit;
+ sk = tcp_v4_lookup_listener(daddr, hnum, dif);
+hit:
+ return sk;
+}
+
+__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
+{
+	return __tcp_v4_lookup(NULL, saddr, sport, daddr, dport, dif);
+}
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+/* Cleaned up a little and adapted to new bind bucket scheme.
+ * Oddly, this should increase performance here for
+ * transparent proxy, as tests within the inner loop have
+ * been eliminated. -DaveM
+ */
+static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
+ unsigned short rnum, unsigned long laddr,
+ struct device *dev, unsigned short pnum,
+ int dif)
+{
+ struct sock *s, *result = NULL;
+ int badness = -1;
+ u32 paddr = 0;
+ unsigned short hnum = ntohs(num);
+ unsigned short hpnum = ntohs(pnum);
+ int firstpass = 1;
+
+ if(dev && dev->ip_ptr) {
+ struct in_device *idev = dev->ip_ptr;
+
+ if(idev->ifa_list)
+ paddr = idev->ifa_list->ifa_local;
+ }
+
+ /* This code must run only from NET_BH. */
+ {
+ struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hnum)];
+ for( ; (tb && tb->port != hnum); tb = tb->next)
+ ;
+ if(tb == NULL)
+ goto next;
+ s = tb->owners;
+ }
+pass2:
+ for(; s; s = s->bind_next) {
+ int score = 0;
+ if(s->rcv_saddr) {
+ if((s->num != hpnum || s->rcv_saddr != paddr) &&
+ (s->num != hnum || s->rcv_saddr != laddr))
+ continue;
+ score++;
+ }
+ if(s->daddr) {
+ if(s->daddr != raddr)
+ continue;
+ score++;
+ }
+ if(s->dport) {
+ if(s->dport != rnum)
+ continue;
+ score++;
+ }
+ if(s->bound_dev_if) {
+ if(s->bound_dev_if != dif)
+ continue;
+ score++;
+ }
+ if(score == 4 && s->num == hnum) {
+ result = s;
+ goto gotit;
+ } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
+ result = s;
+ badness = score;
+ }
+ }
+next:
+ if(firstpass--) {
+ struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hpnum)];
+ for( ; (tb && tb->port != hpnum); tb = tb->next)
+ ;
+ if(tb) {
+ s = tb->owners;
+ goto pass2;
+ }
+ }
+gotit:
+ return result;
+}
+#endif /* CONFIG_IP_TRANSPARENT_PROXY */
+
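+/* Pick an attack-resistant initial sequence number for this connection.
+ * secure_tcp_sequence_number() mixes the connection 4-tuple with secret
+ * state (see drivers/char/random.c).
+ */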
+static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
+{
+ return secure_tcp_sequence_number(sk->saddr, sk->daddr,
+ skb->h.th->dest,
+ skb->h.th->source);
+}
+
+/* Check that a TCP address is unique, don't allow multiple
+ * connects to/from the same address. Actually we can optimize
+ * quite a bit, since the socket about to connect is still
+ * in TCP_CLOSE, a tcp_bind_bucket for the local port it will
+ * use will exist, with a NULL owners list. So check for that.
+ * The good_socknum and verify_bind scheme we use makes this
+ * work.
+ */
+static int tcp_v4_unique_address(struct sock *sk)
+{
+ struct tcp_bind_bucket *tb;
+ unsigned short snum = sk->num;
+ int retval = 1;
+
+ /* Freeze the hash while we snoop around. */
+ SOCKHASH_LOCK();
+ tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ for(; tb; tb = tb->next) {
+ if(tb->port == snum && tb->owners != NULL) {
+ /* Almost certainly the re-use port case, search the real hashes
+ * so it actually scales.
+ */
+ sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dport,
+ sk->rcv_saddr, htons(snum),
+ sk->bound_dev_if);
+ if((sk != NULL) && (sk->state != TCP_LISTEN))
+ retval = 0;
+ break;
+ }
+ }
+ SOCKHASH_UNLOCK();
+ return retval;
+}
+
+/* This will initiate an outgoing connection. */
+int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
+ struct sk_buff *buff;
+ struct rtable *rt;
+ u32 daddr, nexthop;
+ int tmp;
+
+ if (sk->state != TCP_CLOSE)
+ return(-EISCONN);
+
+ /* Don't allow a double connect. */
+ if (sk->daddr)
+ return -EINVAL;
+
+ if (addr_len < sizeof(struct sockaddr_in))
+ return(-EINVAL);
+
+ if (usin->sin_family != AF_INET) {
+ static int complained;
+ if (usin->sin_family)
+ return(-EAFNOSUPPORT);
+ if (!complained++)
+ printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm);
+ }
+
+ nexthop = daddr = usin->sin_addr.s_addr;
+ if (sk->opt && sk->opt->srr) {
+ if (daddr == 0)
+ return -EINVAL;
+ nexthop = sk->opt->faddr;
+ }
+
+ tmp = ip_route_connect(&rt, nexthop, sk->saddr,
+ RT_TOS(sk->ip_tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
+ if (tmp < 0)
+ return tmp;
+
+ if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
+ ip_rt_put(rt);
+ return -ENETUNREACH;
+ }
+
+ dst_release(xchg(&sk->dst_cache, rt));
+
+ buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
+ 0, GFP_KERNEL);
+
+ if (buff == NULL)
+ return -ENOBUFS;
+
+ /* Socket has no identity, so lock_sock() is useless. Also
+ * since state==TCP_CLOSE (checked above) the socket cannot
+ * possibly be in the hashes. TCP hash locking is only
+ * needed while checking quickly for a unique address.
+ * However, the socket does need to be (and is) locked
+ * in tcp_connect().
+ * Perhaps this addresses all of ANK's concerns. 8-) -DaveM
+ */
+ sk->dport = usin->sin_port;
+ sk->daddr = rt->rt_dst;
+ if (sk->opt && sk->opt->srr)
+ sk->daddr = daddr;
+ if (!sk->saddr)
+ sk->saddr = rt->rt_src;
+ sk->rcv_saddr = sk->saddr;
+
+ if (!tcp_v4_unique_address(sk)) {
+ kfree_skb(buff);
+ sk->daddr = 0;
+ return -EADDRNOTAVAIL;
+ }
+
+ tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
+ sk->sport, usin->sin_port);
+
+ tp->ext_header_len = 0;
+ if (sk->opt)
+ tp->ext_header_len = sk->opt->optlen;
+
+ /* Reset mss clamp */
+ tp->mss_clamp = ~0;
+
+ if (!ip_dont_fragment(sk, &rt->u.dst) &&
+ rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) {
+		/* Clamp mss at a maximum of 536 and user_mss.
+		   The user has probably asked to override the tiny segment
+		   size used in the gatewayed case.
+ */
+ tp->mss_clamp = max(tp->user_mss, 536);
+ }
+
+ tcp_connect(sk, buff, rt->u.dst.pmtu);
+ return 0;
+}
+
+static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
+{
+ int retval = -EINVAL;
+
+ /* Do sanity checking for sendmsg/sendto/send. */
+ if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL))
+ goto out;
+ if (msg->msg_name) {
+ struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
+
+ if (msg->msg_namelen < sizeof(*addr))
+ goto out;
+ if (addr->sin_family && addr->sin_family != AF_INET)
+ goto out;
+ retval = -ENOTCONN;
+ if(sk->state == TCP_CLOSE)
+ goto out;
+ retval = -EISCONN;
+ if (addr->sin_port != sk->dport)
+ goto out;
+ if (addr->sin_addr.s_addr != sk->daddr)
+ goto out;
+ }
+ retval = tcp_do_sendmsg(sk, msg);
+
+out:
+ return retval;
+}
+
+
+/*
+ * Do a linear search in the socket open_request list.
+ * This should be replaced with a global hash table.
+ */
+static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
+ struct iphdr *iph,
+ struct tcphdr *th,
+ struct open_request **prevp)
+{
+ struct open_request *req, *prev;
+ __u16 rport = th->source;
+
+	/* Assumption: the socket is not in use, as we checked
+	 * the user count in tcp_rcv and we're running from a
+	 * soft interrupt.
+ */
+ prev = (struct open_request *) (&tp->syn_wait_queue);
+ for (req = prev->dl_next; req; req = req->dl_next) {
+ if (req->af.v4_req.rmt_addr == iph->saddr &&
+ req->af.v4_req.loc_addr == iph->daddr &&
+ req->rmt_port == rport
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ && req->lcl_port == th->dest
+#endif
+ ) {
+ *prevp = prev;
+ return req;
+ }
+ prev = req;
+ }
+ return NULL;
+}
+
+
+/*
+ * This routine does path MTU discovery as defined in RFC 1191.
+ */
+static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ if (atomic_read(&sk->sock_readers))
+ return;
+
+	/* We're not interested in TCP_LISTEN and open_requests (SYN-ACKs
+	 * sent out by Linux are always <576 bytes, so they should go
+	 * through unfragmented).
+ */
+ if (sk->state == TCP_LISTEN)
+ return;
+
+	/* We don't check in the dst entry whether PMTU discovery is
+	 * forbidden on this route. We just assume that no packet-too-big
+	 * packets are sent back when PMTU discovery is not active.
+ * There is a small race when the user changes this flag in the
+ * route, but I think that's acceptable.
+ */
+ if (sk->dst_cache == NULL)
+ return;
+ ip_rt_update_pmtu(sk->dst_cache, mtu);
+ if (sk->ip_pmtudisc != IP_PMTUDISC_DONT &&
+ tp->pmtu_cookie > sk->dst_cache->pmtu) {
+ tcp_sync_mss(sk, sk->dst_cache->pmtu);
+
+ /* Resend the TCP packet because it's
+ * clear that the old packet has been
+ * dropped. This is the new "fast" path mtu
+ * discovery.
+ */
+ tcp_simple_retransmit(sk);
+ } /* else let the usual retransmit timer handle it */
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition. If err < 0 then the socket should
+ * be closed and the error returned to the user. If err > 0
+ * it's just the ICMP type << 8 | ICMP code.  After adjustment,
+ * header points to the first 8 bytes of the TCP header.  We need
+ * to find the appropriate port.
+ *
+ * The locking strategy used here is very "optimistic". When
+ * someone else accesses the socket the ICMP is just dropped
+ * and for some paths there is no check at all.
+ * A more general error queue to queue errors for later handling
+ * is probably better.
+ *
+ * sk->err and sk->err_soft should be atomic_t.
+ */
+
+void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
+{
+ struct iphdr *iph = (struct iphdr*)dp;
+ struct tcphdr *th;
+ struct tcp_opt *tp;
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+#if ICMP_MIN_LENGTH < 14
+ int no_flags = 0;
+#else
+#define no_flags 0
+#endif
+ struct sock *sk;
+ __u32 seq;
+ int err;
+
+ if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) {
+ icmp_statistics.IcmpInErrors++;
+ return;
+ }
+#if ICMP_MIN_LENGTH < 14
+ if (len < (iph->ihl << 2) + 14)
+ no_flags = 1;
+#endif
+
+ th = (struct tcphdr*)(dp+(iph->ihl<<2));
+
+ sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
+ if (sk == NULL || sk->state == TCP_TIME_WAIT) {
+ icmp_statistics.IcmpInErrors++;
+ return;
+ }
+
+ tp = &sk->tp_pinfo.af_tcp;
+ seq = ntohl(th->seq);
+ if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
+ net_statistics.OutOfWindowIcmps++;
+ return;
+ }
+
+ switch (type) {
+ case ICMP_SOURCE_QUENCH:
+#ifndef OLD_SOURCE_QUENCH /* This is deprecated */
+ tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->snd_cwnd_cnt = 0;
+ tp->high_seq = tp->snd_nxt;
+#endif
+ return;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code > NR_ICMP_UNREACH)
+ return;
+
+ if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+ do_pmtu_discovery(sk, iph, ntohs(skb->h.icmph->un.frag.mtu));
+ return;
+ }
+
+ err = icmp_err_convert[code].errno;
+ break;
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ default:
+ return;
+ }
+
+ switch (sk->state) {
+ struct open_request *req, *prev;
+ case TCP_LISTEN:
+ /* Prevent race conditions with accept() -
+ * ICMP is unreliable.
+ */
+ if (atomic_read(&sk->sock_readers)) {
+ net_statistics.LockDroppedIcmps++;
+ /* If too many ICMPs get dropped on busy
+ * servers this needs to be solved differently.
+ */
+ return;
+ }
+
+ /* The final ACK of the handshake should be already
+ * handled in the new socket context, not here.
+ * Strictly speaking - an ICMP error for the final
+ * ACK should set the opening flag, but that is too
+ * complicated right now.
+ */
+ if (!no_flags && !th->syn && !th->ack)
+ return;
+
+ req = tcp_v4_search_req(tp, iph, th, &prev);
+ if (!req)
+ return;
+ if (seq != req->snt_isn) {
+ net_statistics.OutOfWindowIcmps++;
+ return;
+ }
+ if (req->sk) {
+ /*
+			 * The connection is already in ESTABLISHED and a full
+			 * socket has been created; set the error code there.
+			 * The error will _not_ be reported in accept(),
+			 * but only with the next operation on the socket
+			 * after accept.
+ */
+ sk = req->sk;
+ } else {
+ /*
+ * Still in SYN_RECV, just remove it silently.
+ * There is no good way to pass the error to the newly
+ * created socket, and POSIX does not want network
+ * errors returned from accept().
+ */
+ tp->syn_backlog--;
+ tcp_synq_unlink(tp, req, prev);
+ req->class->destructor(req);
+ tcp_openreq_free(req);
+ return;
+ }
+ break;
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV: /* Cannot happen */
+ if (!no_flags && !th->syn)
+ return;
+ tcp_statistics.TcpAttemptFails++;
+ sk->err = err;
+ sk->zapped = 1;
+ mb();
+ sk->error_report(sk);
+ return;
+ }
+
+ /* If we've already connected we will keep trying
+ * until we time out, or the user gives up.
+ *
+	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
+	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
+	 * but it is obsoleted by PMTU discovery).
+	 *
+	 * Note that in the modern internet, where routing is unreliable
+	 * and broken firewalls sit in every dark corner, sending random
+	 * errors as ordered by their masters, even these two messages
+	 * have finally lost their original sense (even Linux sends
+	 * invalid PORT_UNREACHs).
+ *
+ * Now we are in compliance with RFCs.
+ * --ANK (980905)
+ */
+
+ if (sk->ip_recverr) {
+ /* This code isn't serialized with the socket code */
+ /* ANK (980927) ... which is harmless now,
+ sk->err's may be safely lost.
+ */
+ sk->err = err;
+ mb();
+ sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
+ } else { /* Only an error on timeout */
+ sk->err_soft = err;
+ mb();
+ }
+}
+
+/* This routine computes an IPv4 TCP checksum. */
+void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
+ struct sk_buff *skb)
+{
+ th->check = 0;
+ th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
+ csum_partial((char *)th, th->doff<<2, skb->csum));
+}
+
+/*
+ * This routine will send an RST to the other tcp.
+ *
+ * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
+ * for resets?
+ * Answer: if a packet caused an RST, it is not for a socket
+ * existing in our system; if it is matched to a socket,
+ * it is just a duplicate segment or a bug in the other side's TCP.
+ * So we build the reply based only on the parameters that
+ * arrived with the segment.
+ * Exception: precedence violation. We do not implement it in any case.
+ */
+
+static void tcp_v4_send_reset(struct sk_buff *skb)
+{
+ struct tcphdr *th = skb->h.th;
+ struct tcphdr rth;
+ struct ip_reply_arg arg;
+
+ /* Never send a reset in response to a reset. */
+ if (th->rst)
+ return;
+
+ if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL) {
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ if (((struct rtable*)skb->dst)->rt_type == RTN_UNICAST)
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_PORT_UNREACH, 0);
+#endif
+ return;
+ }
+
+ /* Swap the send and the receive. */
+ memset(&rth, 0, sizeof(struct tcphdr));
+ rth.dest = th->source;
+ rth.source = th->dest;
+ rth.doff = sizeof(struct tcphdr)/4;
+ rth.rst = 1;
+
+ if (th->ack) {
+ rth.seq = th->ack_seq;
+ } else {
+ rth.ack = 1;
+ rth.ack_seq = th->syn ? htonl(ntohl(th->seq)+1) : th->seq;
+ }
+
+ memset(&arg, 0, sizeof arg);
+ arg.iov[0].iov_base = (unsigned char *)&rth;
+ arg.iov[0].iov_len = sizeof rth;
+ arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
+ skb->nh.iph->saddr, /*XXX*/
+ sizeof(struct tcphdr),
+ IPPROTO_TCP,
+ 0);
+ arg.n_iov = 1;
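+	/* csumoffset counts 16-bit words from the start of the TCP header. */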
+ arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+
+ ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
+
+ tcp_statistics.TcpOutSegs++;
+ tcp_statistics.TcpOutRsts++;
+}
+
+/*
+ * Send an ACK for a socketless packet (needed for TIME_WAIT).
+ *
+ * FIXME: Does not echo timestamps yet.
+ *
+ * Assumes that the caller did basic address and flag checks.
+ */
+static void tcp_v4_send_ack(struct sk_buff *skb, __u32 seq, __u32 ack, __u16 window)
+{
+ struct tcphdr *th = skb->h.th;
+ struct tcphdr rth;
+ struct ip_reply_arg arg;
+
+ /* Swap the send and the receive. */
+ memset(&rth, 0, sizeof(struct tcphdr));
+ rth.dest = th->source;
+ rth.source = th->dest;
+ rth.doff = sizeof(struct tcphdr)/4;
+
+ rth.seq = seq;
+ rth.ack_seq = ack;
+ rth.ack = 1;
+
+ rth.window = htons(window);
+
+ memset(&arg, 0, sizeof arg);
+ arg.iov[0].iov_base = (unsigned char *)&rth;
+ arg.iov[0].iov_len = sizeof rth;
+ arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
+ skb->nh.iph->saddr, /*XXX*/
+ sizeof(struct tcphdr),
+ IPPROTO_TCP,
+ 0);
+ arg.n_iov = 1;
+ arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+
+ ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
+
+ tcp_statistics.TcpOutSegs++;
+}
+
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+
+/*
+   It seems I never wrote anything more stupid.
+   I hope the Gods will forgive me, but I cannot forgive myself 8)
+ --ANK (981001)
+ */
+
+static struct sock *tcp_v4_search_proxy_openreq(struct sk_buff *skb)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
+ struct sock *sk;
+ int i;
+
+ for (i=0; i<TCP_LHTABLE_SIZE; i++) {
+ for(sk = tcp_listening_hash[i]; sk; sk = sk->next) {
+ struct open_request *dummy;
+ if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, iph,
+ th, &dummy) &&
+ (!sk->bound_dev_if ||
+ sk->bound_dev_if == skb->dev->ifindex))
+ return sk;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * Check whether a received TCP packet might be for one of our
+ * connections.
+ */
+
+int tcp_chkaddr(struct sk_buff *skb)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
+ struct sock *sk;
+
+ sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr,
+ th->dest, skb->dev->ifindex);
+
+ if (!sk)
+ return tcp_v4_search_proxy_openreq(skb) != NULL;
+
+ if (sk->state == TCP_LISTEN) {
+ struct open_request *dummy;
+ if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, skb->nh.iph,
+ th, &dummy) &&
+ (!sk->bound_dev_if ||
+ sk->bound_dev_if == skb->dev->ifindex))
+ return 1;
+ }
+
+ /* 0 means accept all LOCAL addresses here, not all the world... */
+
+ if (sk->rcv_saddr == 0)
+ return 0;
+
+ return 1;
+}
+#endif
+
+/*
+ * Send a SYN-ACK after having received a SYN.
+ * This still operates on an open_request only, not on a big
+ * socket.
+ */
+static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
+{
+ struct rtable *rt;
+ struct ip_options *opt;
+ struct sk_buff * skb;
+ int mss;
+
+ /* First, grab a route. */
+ opt = req->af.v4_req.opt;
+ if(ip_route_output(&rt, ((opt && opt->srr) ?
+ opt->faddr :
+ req->af.v4_req.rmt_addr),
+ req->af.v4_req.loc_addr,
+ RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute,
+ sk->bound_dev_if)) {
+ ip_statistics.IpOutNoRoutes++;
+ return;
+ }
+ if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
+ ip_rt_put(rt);
+ ip_statistics.IpOutNoRoutes++;
+ return;
+ }
+
+ mss = rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
+
+ skb = tcp_make_synack(sk, &rt->u.dst, req, mss);
+ if (skb) {
+ struct tcphdr *th = skb->h.th;
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ th->source = req->lcl_port; /* LVE */
+#endif
+
+ th->check = tcp_v4_check(th, skb->len,
+ req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
+ csum_partial((char *)th, skb->len, skb->csum));
+
+ ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
+ req->af.v4_req.rmt_addr, req->af.v4_req.opt);
+ }
+ ip_rt_put(rt);
+}
+
+/*
+ * IPv4 open_request destructor.
+ */
+static void tcp_v4_or_free(struct open_request *req)
+{
+ if(!req->sk && req->af.v4_req.opt)
+ kfree_s(req->af.v4_req.opt, optlength(req->af.v4_req.opt));
+}
+
+static inline void syn_flood_warning(struct sk_buff *skb)
+{
+ static unsigned long warntime;
+
+ if (jiffies - warntime > HZ*60) {
+ warntime = jiffies;
+ printk(KERN_INFO
+ "possible SYN flooding on port %d. Sending cookies.\n",
+ ntohs(skb->h.th->dest));
+ }
+}
+
+/*
+ * Save and compile IPv4 options into the open_request if needed.
+ */
+static inline struct ip_options *
+tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
+{
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ struct ip_options *dopt = NULL;
+
+ if (opt && opt->optlen) {
+ int opt_size = optlength(opt);
+ dopt = kmalloc(opt_size, GFP_ATOMIC);
+ if (dopt) {
+ if (ip_options_echo(dopt, skb)) {
+ kfree_s(dopt, opt_size);
+ dopt = NULL;
+ }
+ }
+ }
+ return dopt;
+}
+
+/*
+ * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
+ * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
+ * It would be better to replace it with a global counter for all sockets
+ * but then some measure against one socket starving all other sockets
+ * would be needed.
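+ * The limit is exported as /proc/sys/net/ipv4/tcp_max_syn_backlog.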
+ */
+int sysctl_max_syn_backlog = 128;
+
+struct or_calltable or_ipv4 = {
+ tcp_v4_send_synack,
+ tcp_v4_or_free,
+ tcp_v4_send_reset
+};
+
+#define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
+#define BACKLOGMAX(sk) sysctl_max_syn_backlog
+
+int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn)
+{
+ struct tcp_opt tp;
+ struct open_request *req;
+ struct tcphdr *th = skb->h.th;
+ __u32 saddr = skb->nh.iph->saddr;
+ __u32 daddr = skb->nh.iph->daddr;
+#ifdef CONFIG_SYN_COOKIES
+ int want_cookie = 0;
+#else
+#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
+#endif
+
+ /* If the socket is dead, don't accept the connection. */
+ if (sk->dead)
+ goto dead;
+
+	/* Never answer SYNs sent to broadcast or multicast addresses. */
+ if (((struct rtable *)skb->dst)->rt_flags &
+ (RTCF_BROADCAST|RTCF_MULTICAST))
+ goto drop;
+
+ /* XXX: Check against a global syn pool counter. */
+ if (BACKLOG(sk) > BACKLOGMAX(sk)) {
+#ifdef CONFIG_SYN_COOKIES
+ if (sysctl_tcp_syncookies) {
+ syn_flood_warning(skb);
+ want_cookie = 1;
+ } else
+#endif
+ goto drop;
+ } else {
+ if (isn == 0)
+ isn = tcp_v4_init_sequence(sk, skb);
+ BACKLOG(sk)++;
+ }
+
+ req = tcp_openreq_alloc();
+ if (req == NULL) {
+ goto dropbacklog;
+ }
+
+ req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
+
+ req->rcv_isn = TCP_SKB_CB(skb)->seq;
+ tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
+
+ tp.mss_clamp = 65535;
+ tcp_parse_options(NULL, th, &tp, want_cookie);
+ if (tp.mss_clamp == 65535)
+		tp.mss_clamp = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
+
+ if (sk->tp_pinfo.af_tcp.user_mss && sk->tp_pinfo.af_tcp.user_mss < tp.mss_clamp)
+ tp.mss_clamp = sk->tp_pinfo.af_tcp.user_mss;
+ req->mss = tp.mss_clamp;
+
+ if (tp.saw_tstamp)
+ req->ts_recent = tp.rcv_tsval;
+ req->tstamp_ok = tp.tstamp_ok;
+ req->sack_ok = tp.sack_ok;
+ req->snd_wscale = tp.snd_wscale;
+ req->wscale_ok = tp.wscale_ok;
+ req->rmt_port = th->source;
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ req->lcl_port = th->dest ; /* LVE */
+#endif
+ req->af.v4_req.loc_addr = daddr;
+ req->af.v4_req.rmt_addr = saddr;
+
+ /* Note that we ignore the isn passed from the TIME_WAIT
+ * state here. That's the price we pay for cookies.
+ */
+ if (want_cookie)
+ isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+
+ req->snt_isn = isn;
+
+ req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
+
+ req->class = &or_ipv4;
+ req->retrans = 0;
+ req->sk = NULL;
+
+ tcp_v4_send_synack(sk, req);
+
+ if (want_cookie) {
+ if (req->af.v4_req.opt)
+ kfree(req->af.v4_req.opt);
+ tcp_v4_or_free(req);
+ tcp_openreq_free(req);
+ } else {
+ req->expires = jiffies + TCP_TIMEOUT_INIT;
+ tcp_inc_slow_timer(TCP_SLT_SYNACK);
+ tcp_synq_queue(&sk->tp_pinfo.af_tcp, req);
+ }
+
+ return 0;
+
+dead:
+ SOCK_DEBUG(sk, "Reset on %p: Connect on dead socket.\n",sk);
+ tcp_statistics.TcpAttemptFails++;
+ return -ENOTCONN; /* send reset */
+
+dropbacklog:
+ if (!want_cookie)
+ BACKLOG(sk)--;
+drop:
+ tcp_statistics.TcpAttemptFails++;
+ return 0;
+}
+
+/* This is not only more efficient than what we used to do, it eliminates
+ * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
+ *
+ * This function wants to be moved to a file common to IPv4 and IPv6. --ANK
+ */
+struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
+{
+ struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
+
+ if(newsk != NULL) {
+ struct tcp_opt *newtp;
+#ifdef CONFIG_FILTER
+ struct sk_filter *filter;
+#endif
+
+ memcpy(newsk, sk, sizeof(*newsk));
+ newsk->sklist_next = NULL;
+ newsk->state = TCP_SYN_RECV;
+
+ /* Clone the TCP header template */
+ newsk->dport = req->rmt_port;
+
+ atomic_set(&newsk->sock_readers, 0);
+ atomic_set(&newsk->rmem_alloc, 0);
+ skb_queue_head_init(&newsk->receive_queue);
+ atomic_set(&newsk->wmem_alloc, 0);
+ skb_queue_head_init(&newsk->write_queue);
+ atomic_set(&newsk->omem_alloc, 0);
+
+ newsk->done = 0;
+ newsk->proc = 0;
+ skb_queue_head_init(&newsk->back_log);
+ skb_queue_head_init(&newsk->error_queue);
+#ifdef CONFIG_FILTER
+ if ((filter = newsk->filter) != NULL)
+ sk_filter_charge(newsk, filter);
+#endif
+
+ /* Now setup tcp_opt */
+ newtp = &(newsk->tp_pinfo.af_tcp);
+ newtp->pred_flags = 0;
+ newtp->rcv_nxt = req->rcv_isn + 1;
+ newtp->snd_nxt = req->snt_isn + 1;
+ newtp->snd_una = req->snt_isn + 1;
+ newtp->srtt = 0;
+ newtp->ato = 0;
+ newtp->snd_wl1 = req->rcv_isn;
+ newtp->snd_wl2 = req->snt_isn;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments
+ * is never scaled.
+ */
+ newtp->snd_wnd = ntohs(skb->h.th->window);
+
+ newtp->max_window = newtp->snd_wnd;
+ newtp->pending = 0;
+ newtp->retransmits = 0;
+ newtp->last_ack_sent = req->rcv_isn + 1;
+ newtp->backoff = 0;
+ newtp->mdev = TCP_TIMEOUT_INIT;
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ newtp->snd_cwnd = 2;
+
+ newtp->rto = TCP_TIMEOUT_INIT;
+ newtp->packets_out = 0;
+ newtp->fackets_out = 0;
+ newtp->retrans_out = 0;
+ newtp->high_seq = 0;
+ newtp->snd_ssthresh = 0x7fffffff;
+ newtp->snd_cwnd_cnt = 0;
+ newtp->dup_acks = 0;
+ newtp->delayed_acks = 0;
+ init_timer(&newtp->retransmit_timer);
+ newtp->retransmit_timer.function = &tcp_retransmit_timer;
+ newtp->retransmit_timer.data = (unsigned long) newsk;
+ init_timer(&newtp->delack_timer);
+ newtp->delack_timer.function = &tcp_delack_timer;
+ newtp->delack_timer.data = (unsigned long) newsk;
+ skb_queue_head_init(&newtp->out_of_order_queue);
+ newtp->send_head = newtp->retrans_head = NULL;
+ newtp->rcv_wup = req->rcv_isn + 1;
+ newtp->write_seq = req->snt_isn + 1;
+ newtp->copied_seq = req->rcv_isn + 1;
+
+ newtp->saw_tstamp = 0;
+ newtp->mss_clamp = req->mss;
+
+ init_timer(&newtp->probe_timer);
+ newtp->probe_timer.function = &tcp_probe_timer;
+ newtp->probe_timer.data = (unsigned long) newsk;
+ newtp->probes_out = 0;
+ newtp->syn_seq = req->rcv_isn;
+ newtp->fin_seq = req->rcv_isn;
+ newtp->urg_data = 0;
+ tcp_synq_init(newtp);
+ newtp->syn_backlog = 0;
+ if (skb->len >= 536)
+ newtp->last_seg_size = skb->len;
+
+ /* Back to base struct sock members. */
+ newsk->err = 0;
+ newsk->ack_backlog = 0;
+ newsk->max_ack_backlog = SOMAXCONN;
+ newsk->priority = 0;
+
+ /* IP layer stuff */
+ newsk->timeout = 0;
+ init_timer(&newsk->timer);
+ newsk->timer.function = &net_timer;
+ newsk->timer.data = (unsigned long) newsk;
+ newsk->socket = NULL;
+
+ newtp->tstamp_ok = req->tstamp_ok;
+ if((newtp->sack_ok = req->sack_ok) != 0)
+ newtp->num_sacks = 0;
+ newtp->window_clamp = req->window_clamp;
+ newtp->rcv_wnd = req->rcv_wnd;
+ newtp->wscale_ok = req->wscale_ok;
+ if (newtp->wscale_ok) {
+ newtp->snd_wscale = req->snd_wscale;
+ newtp->rcv_wscale = req->rcv_wscale;
+ } else {
+ newtp->snd_wscale = newtp->rcv_wscale = 0;
+ newtp->window_clamp = min(newtp->window_clamp,65535);
+ }
+ if (newtp->tstamp_ok) {
+ newtp->ts_recent = req->ts_recent;
+ newtp->ts_recent_stamp = tcp_time_stamp;
+ newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else {
+ newtp->tcp_header_len = sizeof(struct tcphdr);
+ }
+ }
+ return newsk;
+}
+
+/*
+ * The three way handshake has completed - we got the final valid ACK -
+ * now create the new socket.
+ */
+struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+ struct open_request *req,
+ struct dst_entry *dst)
+{
+ struct ip_options *opt = req->af.v4_req.opt;
+ struct tcp_opt *newtp;
+ struct sock *newsk;
+
+ if (sk->ack_backlog > sk->max_ack_backlog)
+ goto exit; /* head drop */
+ if (dst == NULL) {
+ struct rtable *rt;
+
+ if (ip_route_output(&rt,
+ opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr,
+ req->af.v4_req.loc_addr, sk->ip_tos|RTO_CONN, 0))
+ return NULL;
+ dst = &rt->u.dst;
+ }
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ /* The new socket created for transparent proxy may fall
+	 * into a non-existent bind bucket because sk->num != newsk->num.
+	 * Ensure the existence of the bucket now.  Placing the check
+	 * later would require destroying the just-created newsk on failure.
+ * 1998/04/22 Andrey V. Savochkin <saw@msu.ru>
+ */
+ if (__tcp_bucket_check(ntohs(skb->h.th->dest)))
+ goto exit;
+#endif
+
+ newsk = tcp_create_openreq_child(sk, req, skb);
+ if (!newsk)
+ goto exit;
+
+ sk->tp_pinfo.af_tcp.syn_backlog--;
+ sk->ack_backlog++;
+
+ newsk->dst_cache = dst;
+
+ newtp = &(newsk->tp_pinfo.af_tcp);
+ newsk->daddr = req->af.v4_req.rmt_addr;
+ newsk->saddr = req->af.v4_req.loc_addr;
+ newsk->rcv_saddr = req->af.v4_req.loc_addr;
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ newsk->num = ntohs(skb->h.th->dest);
+ newsk->sport = req->lcl_port;
+#endif
+ newsk->opt = req->af.v4_req.opt;
+ newtp->ext_header_len = 0;
+ if (newsk->opt)
+ newtp->ext_header_len = newsk->opt->optlen;
+
+ tcp_sync_mss(newsk, dst->pmtu);
+ newtp->rcv_mss = newtp->mss_clamp;
+
+ /* It would be better to use newtp->mss_clamp here */
+ if (newsk->rcvbuf < (3 * newtp->pmtu_cookie))
+ newsk->rcvbuf = min ((3 * newtp->pmtu_cookie), sysctl_rmem_max);
+ if (newsk->sndbuf < (3 * newtp->pmtu_cookie))
+ newsk->sndbuf = min ((3 * newtp->pmtu_cookie), sysctl_wmem_max);
+
+ /* We run in BH processing itself or within a BH atomic
+ * sequence (backlog) so no locking is needed.
+ */
+ __tcp_v4_hash(newsk);
+ __tcp_inherit_port(sk, newsk);
+ __add_to_prot_sklist(newsk);
+
+ sk->data_ready(sk, 0); /* Deliver SIGIO */
+
+ return newsk;
+
+exit:
+ dst_release(dst);
+ return NULL;
+}
+
+static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ struct open_request *req, *prev;
+
+ req = tcp_v4_search_req(tp,skb->nh.iph, skb->h.th, &prev);
+ if (!req)
+ return;
+ /* Sequence number check required by RFC793 */
+ if (before(TCP_SKB_CB(skb)->seq, req->rcv_isn) ||
+ after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
+ return;
+ tcp_synq_unlink(tp, req, prev);
+ (req->sk ? sk->ack_backlog : tp->syn_backlog)--;
+ req->class->destructor(req);
+ tcp_openreq_free(req);
+
+ net_statistics.EmbryonicRsts++;
+}
+
+/* Check for embryonic sockets (open_requests).  We check packets with
+ * only the SYN bit set against the open_request queue too: this
+ * increases connection latency a bit, but is required to detect
+ * retransmitted SYNs.
+ */
+static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
+{
+ struct tcphdr *th = skb->h.th;
+ u32 flg = ((u32 *)th)[3];
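+	/* Word 3 of the TCP header holds doff/reserved/flags/window; in
+	 * network byte order 0x00040000 selects the RST bit and
+	 * 0x00120000 the SYN and ACK bits.
+	 */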
+
+ /* Check for RST */
+ if (flg & __constant_htonl(0x00040000)) {
+ tcp_v4_rst_req(sk, skb);
+ return NULL;
+ }
+
+ /* Check for SYN|ACK */
+ if (flg & __constant_htonl(0x00120000)) {
+ struct open_request *req, *dummy;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* Find possible connection requests. */
+ req = tcp_v4_search_req(tp, skb->nh.iph, th, &dummy);
+ if (req) {
+ sk = tcp_check_req(sk, skb, req);
+ }
+#ifdef CONFIG_SYN_COOKIES
+ else if (flg == __constant_htonl(0x00120000)) {
+ sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
+ }
+#endif
+ }
+ return sk;
+}
+
+int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+#ifdef CONFIG_FILTER
+ struct sk_filter *filter = sk->filter;
+ if (filter && sk_filter(skb, filter))
+ goto discard;
+#endif /* CONFIG_FILTER */
+
+ /*
+ * This doesn't check if the socket has enough room for the packet.
+ * Either process the packet _without_ queueing it and then free it,
+ * or do the check later.
+ */
+ skb_set_owner_r(skb, sk);
+
+ if (sk->state == TCP_ESTABLISHED) { /* Fast path */
+ if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
+ goto reset;
+ return 0;
+ }
+
+
+ if (sk->state == TCP_LISTEN) {
+ struct sock *nsk;
+
+ nsk = tcp_v4_hnd_req(sk, skb);
+ if (!nsk)
+ goto discard;
+
+ /*
+ * Queue it on the new socket if the new socket is active,
+		 * otherwise we just short-circuit this and continue with
+		 * the new socket.
+ */
+ if (atomic_read(&nsk->sock_readers)) {
+ skb_orphan(skb);
+ __skb_queue_tail(&nsk->back_log, skb);
+ return 0;
+ }
+ sk = nsk;
+ }
+
+ if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
+ goto reset;
+ return 0;
+
+reset:
+ tcp_v4_send_reset(skb);
+discard:
+ kfree_skb(skb);
+ /* Be careful here. If this function gets more complicated and
+ * gcc suffers from register pressure on the x86, sk (in %ebx)
+ * might be destroyed here. This current version compiles correctly,
+ * but you have been warned.
+ */
+ return 0;
+}
+
+/*
+ * From tcp_input.c
+ */
+
+int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
+{
+ struct tcphdr *th;
+ struct sock *sk;
+
+ if (skb->pkt_type!=PACKET_HOST)
+ goto discard_it;
+
+ th = skb->h.th;
+
+	/* Strip off the IP header so skb->data points at the TCP header. */
+ __skb_pull(skb, skb->h.raw - skb->data);
+
+ /* Count it even if it's bad */
+ tcp_statistics.TcpInSegs++;
+
+ len = skb->len;
+ if (len < sizeof(struct tcphdr))
+ goto bad_packet;
+
+ /* Try to use the device checksum if provided. */
+ switch (skb->ip_summed) {
+ case CHECKSUM_NONE:
+ skb->csum = csum_partial((char *)th, len, 0);
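+		/* fall through and verify the checksum we just computed */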
+ case CHECKSUM_HW:
+ if (tcp_v4_check(th,len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) {
+ NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum "
+ "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, "
+ "len=%d/%d/%d\n",
+ NIPQUAD(skb->nh.iph->saddr),
+ ntohs(th->source),
+ NIPQUAD(skb->nh.iph->daddr),
+ ntohs(th->dest),
+ len, skb->len,
+ ntohs(skb->nh.iph->tot_len)));
+ bad_packet:
+ tcp_statistics.TcpInErrs++;
+ goto discard_it;
+ }
+ default:
+ /* CHECKSUM_UNNECESSARY */
+ }
+
+ if((th->doff * 4) < sizeof(struct tcphdr) ||
+ len < (th->doff * 4))
+ goto bad_packet;
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ if (IPCB(skb)->redirport)
+ sk = tcp_v4_proxy_lookup(th->dest, skb->nh.iph->saddr, th->source,
+ skb->nh.iph->daddr, skb->dev,
+ IPCB(skb)->redirport, skb->dev->ifindex);
+ else {
+#endif
+ sk = __tcp_v4_lookup(th, skb->nh.iph->saddr, th->source,
+ skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ if (!sk)
+ sk = tcp_v4_search_proxy_openreq(skb);
+ }
+#endif
+ if (!sk)
+ goto no_tcp_socket;
+ if(!ipsec_sk_policy(sk,skb))
+ goto discard_it;
+
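+	/* SYN and FIN each occupy one unit of sequence space; hence the
+	 * th->syn + th->fin terms in the end_seq computation below.
+	 */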
+ TCP_SKB_CB(skb)->seq = ntohl(th->seq);
+ TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+ len - th->doff*4);
+ TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+
+ skb->used = 0;
+
+ if (sk->state == TCP_TIME_WAIT)
+ goto do_time_wait;
+ if (!atomic_read(&sk->sock_readers))
+ return tcp_v4_do_rcv(sk, skb);
+
+ __skb_queue_tail(&sk->back_log, skb);
+ return 0;
+
+no_tcp_socket:
+ tcp_v4_send_reset(skb);
+
+discard_it:
+ /* Discard frame. */
+ kfree_skb(skb);
+ return 0;
+
+do_time_wait:
+ /* Sorry for the ugly switch. 2.3 will have a better solution. */
+ switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
+ skb, th, skb->len)) {
+ case TCP_TW_ACK:
+ tcp_v4_send_ack(skb,
+ ((struct tcp_tw_bucket *)sk)->snd_nxt,
+ ((struct tcp_tw_bucket *)sk)->rcv_nxt,
+ ((struct tcp_tw_bucket *)sk)->window);
+ goto discard_it;
+ case TCP_TW_RST:
+ goto no_tcp_socket;
+ default:
+ goto discard_it;
+ }
+}
+
+static void __tcp_v4_rehash(struct sock *sk)
+{
+ struct sock **skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
+
+ SOCKHASH_LOCK();
+ if(sk->pprev) {
+ if(sk->next)
+ sk->next->pprev = sk->pprev;
+ *sk->pprev = sk->next;
+ sk->pprev = NULL;
+ tcp_reg_zap(sk);
+ }
+ if((sk->next = *skp) != NULL)
+ (*skp)->pprev = &sk->next;
+ *skp = sk;
+ sk->pprev = skp;
+ SOCKHASH_UNLOCK();
+}
+
+int tcp_v4_rebuild_header(struct sock *sk)
+{
+ struct rtable *rt = (struct rtable *)sk->dst_cache;
+ __u32 new_saddr;
+ int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT;
+
+ if(rt == NULL)
+ return 0;
+
+ /* Force route checking if want_rewrite.
+	 * The idea is good, the implementation is disgusting.
+	 * Well, if I bound this socket, you cannot randomly overwrite
+ * its source address. --ANK
+ */
+ if (want_rewrite) {
+ int tmp;
+ struct rtable *new_rt;
+ __u32 old_saddr = rt->rt_src;
+
+ /* Query new route using another rt buffer */
+ tmp = ip_route_connect(&new_rt, rt->rt_dst, 0,
+ RT_TOS(sk->ip_tos)|sk->localroute,
+ sk->bound_dev_if);
+
+		if (tmp == 0) {
+			/* Only useful if the source addrs differ. */
+			if (new_rt->rt_src != old_saddr) {
+ dst_release(sk->dst_cache);
+ sk->dst_cache = &new_rt->u.dst;
+ rt = new_rt;
+ goto do_rewrite;
+ }
+ dst_release(&new_rt->u.dst);
+ }
+ }
+ if (rt->u.dst.obsolete) {
+ int err;
+ err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif);
+ if (err) {
+ sk->err_soft=-err;
+ sk->error_report(sk);
+ return -1;
+ }
+ dst_release(xchg(&sk->dst_cache, &rt->u.dst));
+ }
+
+ return 0;
+
+do_rewrite:
+ new_saddr = rt->rt_src;
+
+	/* Ouch! This should not happen. */
+ if (!sk->saddr || !sk->rcv_saddr) {
+ printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: "
+ "saddr=%08lX rcv_saddr=%08lX\n",
+ ntohl(sk->saddr),
+ ntohl(sk->rcv_saddr));
+ return 0;
+ }
+
+ if (new_saddr != sk->saddr) {
+ if (sysctl_ip_dynaddr > 1) {
+ printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
+ "from %d.%d.%d.%d to %d.%d.%d.%d\n",
+ NIPQUAD(sk->saddr),
+ NIPQUAD(new_saddr));
+ }
+
+ sk->saddr = new_saddr;
+ sk->rcv_saddr = new_saddr;
+
+		/* XXX The only ugly spot where we need to
+		 * XXX really change the socket's identity after
+		 * XXX it has entered the hashes. -DaveM
+ */
+ __tcp_v4_rehash(sk);
+ }
+
+ return 0;
+}
+
+static struct sock * tcp_v4_get_sock(struct sk_buff *skb, struct tcphdr *th)
+{
+ return tcp_v4_lookup(skb->nh.iph->saddr, th->source,
+ skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
+}
+
+static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
+
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = sk->daddr;
+ sin->sin_port = sk->dport;
+}
+
+struct tcp_func ipv4_specific = {
+ ip_queue_xmit,
+ tcp_v4_send_check,
+ tcp_v4_rebuild_header,
+ tcp_v4_conn_request,
+ tcp_v4_syn_recv_sock,
+ tcp_v4_get_sock,
+ sizeof(struct iphdr),
+
+ ip_setsockopt,
+ ip_getsockopt,
+ v4_addr2sockaddr,
+ sizeof(struct sockaddr_in)
+};
+
+/* NOTE: A lot of things are set to zero explicitly by the call to
+ * sk_alloc(), so they need not be done here.
+ */
+static int tcp_v4_init_sock(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ skb_queue_head_init(&tp->out_of_order_queue);
+ tcp_init_xmit_timers(sk);
+
+ tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/
+ tp->mdev = TCP_TIMEOUT_INIT;
+ tp->mss_clamp = ~0;
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ tp->snd_cwnd = 2;
+
+ /* See draft-stevens-tcpca-spec-01 for discussion of the
+ * initialization of these values.
+ */
+ tp->snd_cwnd_cnt = 0;
+ tp->snd_ssthresh = 0x7fffffff; /* Infinity */
+
+ sk->state = TCP_CLOSE;
+ sk->max_ack_backlog = SOMAXCONN;
+ tp->rcv_mss = 536;
+
+ sk->write_space = tcp_write_space;
+
+ /* Init SYN queue. */
+ tcp_synq_init(tp);
+
+ sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
+
+ return 0;
+}
+
+static int tcp_v4_destroy_sock(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sk_buff *skb;
+
+ tcp_clear_xmit_timers(sk);
+
+ if (sk->keepopen)
+ tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
+
+	/* Clean up the write buffer. */
+ while((skb = __skb_dequeue(&sk->write_queue)) != NULL)
+ kfree_skb(skb);
+
+	/* Clean up our, hopefully empty, out_of_order_queue. */
+ while((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL)
+ kfree_skb(skb);
+
+	/* Clean up a referenced TCP bind bucket; this only happens if a
+	 * port was allocated for a socket but it never fully connected.
+ */
+ if(sk->prev != NULL)
+ tcp_put_port(sk);
+
+ return 0;
+}
+
+struct proto tcp_prot = {
+ (struct sock *)&tcp_prot, /* sklist_next */
+ (struct sock *)&tcp_prot, /* sklist_prev */
+ tcp_close, /* close */
+ tcp_v4_connect, /* connect */
+ tcp_accept, /* accept */
+ NULL, /* retransmit */
+ tcp_write_wakeup, /* write_wakeup */
+ tcp_read_wakeup, /* read_wakeup */
+ tcp_poll, /* poll */
+ tcp_ioctl, /* ioctl */
+ tcp_v4_init_sock, /* init */
+ tcp_v4_destroy_sock, /* destroy */
+ tcp_shutdown, /* shutdown */
+ tcp_setsockopt, /* setsockopt */
+ tcp_getsockopt, /* getsockopt */
+ tcp_v4_sendmsg, /* sendmsg */
+ tcp_recvmsg, /* recvmsg */
+ NULL, /* bind */
+ tcp_v4_do_rcv, /* backlog_rcv */
+ tcp_v4_hash, /* hash */
+ tcp_v4_unhash, /* unhash */
+ tcp_v4_get_port, /* get_port */
+ 128, /* max_header */
+ 0, /* retransmits */
+ "TCP", /* name */
+ 0, /* inuse */
+ 0 /* highestinuse */
+};
+
+
+
+__initfunc(void tcp_v4_init(struct net_proto_family *ops))
+{
+ int err;
+
+ tcp_inode.i_mode = S_IFSOCK;
+ tcp_inode.i_sock = 1;
+ tcp_inode.i_uid = 0;
+ tcp_inode.i_gid = 0;
+
+ tcp_socket->inode = &tcp_inode;
+ tcp_socket->state = SS_UNCONNECTED;
+ tcp_socket->type=SOCK_RAW;
+
+ if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
+ panic("Failed to create the TCP control socket.\n");
+ tcp_socket->sk->allocation=GFP_ATOMIC;
+ tcp_socket->sk->num = 256; /* Don't receive any data */
+ tcp_socket->sk->ip_ttl = MAXTTL;
+}
diff --git a/pfinet/linux-src/net/ipv4/tcp_output.c b/pfinet/linux-src/net/ipv4/tcp_output.c
new file mode 100644
index 00000000..2ac5e8a2
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/tcp_output.c
@@ -0,0 +1,1143 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version: $Id: tcp_output.c,v 1.108.2.1 1999/05/14 23:07:36 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ * Changes: Pedro Roque : Retransmit queue handled by TCP.
+ * : Fragmentation on mtu decrease
+ * : Segment collapse on retransmit
+ * : AF independence
+ *
+ * Linus Torvalds : send_delayed_ack
+ * David S. Miller : Charge memory using the right skb
+ * during syn/ack processing.
+ * David S. Miller : Output engine completely rewritten.
+ * Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
+ *
+ */
+
+#include <net/tcp.h>
+
+extern int sysctl_tcp_timestamps;
+extern int sysctl_tcp_window_scaling;
+extern int sysctl_tcp_sack;
+
+/* People can turn this off for buggy TCPs found in printers etc. */
+int sysctl_tcp_retrans_collapse = 1;
+
+/* Get rid of any delayed ACKs; we've already sent one. */
+static __inline__ void clear_delayed_acks(struct sock * sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ tp->delayed_acks = 0;
+ if(tcp_in_quickack_mode(tp))
+ tcp_exit_quickack_mode(tp);
+ tcp_clear_xmit_timer(sk, TIME_DACK);
+}
+
+static __inline__ void update_send_head(struct sock *sk)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ tp->send_head = tp->send_head->next;
+ if (tp->send_head == (struct sk_buff *) &sk->write_queue)
+ tp->send_head = NULL;
+}
+
+/* This routine actually transmits TCP packets queued in by
+ * tcp_do_sendmsg(). This is used by both the initial
+ * transmission and possible later retransmissions.
+ * All SKB's seen here are completely headerless. It is our
+ * job to build the TCP header, and pass the packet down to
+ * IP so it can do the same plus pass the packet off to the
+ * device.
+ *
+ * We are working here with either a clone of the original
+ * SKB, or a fresh unique copy made by the retransmit engine.
+ */
+void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if(skb != NULL) {
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+ int tcp_header_size = tp->tcp_header_len;
+ struct tcphdr *th;
+ int sysctl_flags;
+
+#define SYSCTL_FLAG_TSTAMPS 0x1
+#define SYSCTL_FLAG_WSCALE 0x2
+#define SYSCTL_FLAG_SACK 0x4
+
+ sysctl_flags = 0;
+ if(tcb->flags & TCPCB_FLAG_SYN) {
+ tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
+ if(sysctl_tcp_timestamps) {
+ tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
+ sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
+ }
+ if(sysctl_tcp_window_scaling) {
+ tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
+ sysctl_flags |= SYSCTL_FLAG_WSCALE;
+ }
+ if(sysctl_tcp_sack) {
+ sysctl_flags |= SYSCTL_FLAG_SACK;
+ if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
+ tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
+ }
+ } else if(tp->sack_ok && tp->num_sacks) {
+ /* A SACK is 2 pad bytes, a 2 byte header, plus
+ * 2 32-bit sequence numbers for each SACK block.
+ */
+ tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
+ (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
+ }
+ th = (struct tcphdr *) skb_push(skb, tcp_header_size);
+ skb->h.th = th;
+ skb_set_owner_w(skb, sk);
+
+ /* Build TCP header and checksum it. */
+ th->source = sk->sport;
+ th->dest = sk->dport;
+ th->seq = htonl(TCP_SKB_CB(skb)->seq);
+ th->ack_seq = htonl(tp->rcv_nxt);
+ th->doff = (tcp_header_size >> 2);
+ th->res1 = 0;
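+		/* Byte 13 of the TCP header is the flags octet. */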
+ *(((__u8 *)th) + 13) = tcb->flags;
+ if(!(tcb->flags & TCPCB_FLAG_SYN))
+ th->window = htons(tcp_select_window(sk));
+ th->check = 0;
+ th->urg_ptr = htons(tcb->urg_ptr);
+ if(tcb->flags & TCPCB_FLAG_SYN) {
+ /* RFC1323: The window in SYN & SYN/ACK segments
+ * is never scaled.
+ */
+ th->window = htons(tp->rcv_wnd);
+ tcp_syn_build_options((__u32 *)(th + 1), tp->mss_clamp,
+ (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
+ (sysctl_flags & SYSCTL_FLAG_SACK),
+ (sysctl_flags & SYSCTL_FLAG_WSCALE),
+ tp->rcv_wscale,
+ TCP_SKB_CB(skb)->when,
+ tp->ts_recent);
+ } else {
+ tcp_build_and_update_options((__u32 *)(th + 1),
+ tp, TCP_SKB_CB(skb)->when);
+ }
+ tp->af_specific->send_check(sk, th, skb->len, skb);
+
+ clear_delayed_acks(sk);
+ tp->last_ack_sent = tp->rcv_nxt;
+ tcp_statistics.TcpOutSegs++;
+ tp->af_specific->queue_xmit(skb);
+ }
+#undef SYSCTL_FLAG_TSTAMPS
+#undef SYSCTL_FLAG_WSCALE
+#undef SYSCTL_FLAG_SACK
+}
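+
+/* Illustrative sizing of the SYN case above: with timestamps and
+ * window scaling both enabled (the SACK-permitted option riding in
+ * the timestamp option's alignment bytes), the header comes to
+ * 20 + TCPOLEN_MSS (4) + TCPOLEN_TSTAMP_ALIGNED (12) +
+ * TCPOLEN_WSCALE_ALIGNED (4) = 40 bytes, i.e. th->doff = 40 >> 2 = 10.
+ */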
+
+/* This is the main buffer sending routine. We queue the buffer
+ * on the write queue and decide whether to transmit it now
+ * or leave that for later.
+ */
+void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* Advance write_seq and place onto the write_queue. */
+ tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
+ __skb_queue_tail(&sk->write_queue, skb);
+
+ if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
+ /* Send it out now. */
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ tp->packets_out++;
+ tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
+ if(!tcp_timer_is_set(sk, TIME_RETRANS))
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ } else {
+ /* Queue it, remembering where we must start sending. */
+ if (tp->send_head == NULL)
+ tp->send_head = skb;
+ if (!force_queue && tp->packets_out == 0 && !tp->pending) {
+ tp->pending = TIME_PROBE0;
+ tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
+ }
+ }
+}
+
+/* Function to create two new TCP segments. Shrinks the given segment
+ * to the specified size and appends a new segment with the rest of the
+ * packet to the list. This won't be called frequently, I hope.
+ * Remember, these are still headerless SKBs at this point.
+ */
+static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
+{
+ struct sk_buff *buff;
+ int nsize = skb->len - len;
+ u16 flags;
+
+ /* Get a new skb... force flag on. */
+ buff = sock_wmalloc(sk,
+ (nsize + MAX_HEADER + sk->prot->max_header),
+ 1, GFP_ATOMIC);
+ if (buff == NULL)
+ return -1; /* We'll just try again later. */
+
+ /* Reserve space for headers. */
+ skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
+
+ /* Correct the sequence numbers. */
+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ /* PSH and FIN should only be set in the second packet. */
+ flags = TCP_SKB_CB(skb)->flags;
+ TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
+ if(flags & TCPCB_FLAG_URG) {
+ u16 old_urg_ptr = TCP_SKB_CB(skb)->urg_ptr;
+
+ /* Urgent data is always a pain in the ass. */
+ if(old_urg_ptr > len) {
+ TCP_SKB_CB(skb)->flags &= ~(TCPCB_FLAG_URG);
+ TCP_SKB_CB(skb)->urg_ptr = 0;
+ TCP_SKB_CB(buff)->urg_ptr = old_urg_ptr - len;
+ } else {
+ flags &= ~(TCPCB_FLAG_URG);
+ }
+ }
+ if(!(flags & TCPCB_FLAG_URG))
+ TCP_SKB_CB(buff)->urg_ptr = 0;
+ TCP_SKB_CB(buff)->flags = flags;
+ TCP_SKB_CB(buff)->sacked = 0;
+
+ /* Copy and checksum data tail into the new buffer. */
+ buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize),
+ nsize, 0);
+
+ /* This takes care of the FIN sequence number too. */
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+ skb_trim(skb, len);
+
+ /* Rechecksum original buffer. */
+ skb->csum = csum_partial(skb->data, skb->len, 0);
+
+ /* Looks stupid, but our code really uses the "when" field
+ * of skbs which it has never sent before. --ANK
+ */
+ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
+
+ /* Link BUFF into the send queue. */
+ __skb_append(skb, buff);
+
+ return 0;
+}
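+
+/* Example (illustrative): fragmenting an skb that covers sequence
+ * space [1000, 3000) at len = 1448 leaves the original skb holding
+ * [1000, 2448) and queues a new buff holding [2448, 3000) right
+ * behind it; FIN/PSH, and any urgent pointer beyond the split point,
+ * move to the second half.
+ */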
+
+/* This function synchronizes the sending mss with the current
+ pmtu/exthdr set.
+
+ tp->user_mss is the mss set by the user via TCP_MAXSEG. It does NOT
+ account for TCP options; it covers only the bare TCP header.
+
+ tp->mss_clamp is the mss negotiated at connection setup.
+ It is the minimum of user_mss and the mss received with the SYN.
+ It also does not include TCP options.
+
+ tp->pmtu_cookie is the last pmtu seen by this function.
+
+ tp->mss_cache is the current effective sending mss, including
+ all tcp options except for SACKs. It is evaluated,
+ taking into account the current pmtu, but never exceeds
+ tp->mss_clamp.
+
+ NOTE1. rfc1122 clearly states that the advertised MSS
+ DOES NOT include either tcp or ip options.
+
+ NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
+ this function. --ANK (980731)
+ */
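+
+/* Worked example (illustrative): on an Ethernet path with pmtu 1500,
+ * no IP options and RFC1323 timestamps on (tcp_header_len = 32):
+ *
+ *   mss_now = 1500 - 20 (net header) - 20 (tcphdr)  = 1460
+ *   clamped to mss_clamp (say 1460)                 = 1460
+ *   minus option overhead (32 - 20)                 = 1448
+ *   minus ext_header_len (0 here)                   = 1448
+ *
+ * so mss_cache comes out as 1448 data bytes per segment.
+ */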
+
+int tcp_sync_mss(struct sock *sk, u32 pmtu)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ int mss_now;
+
+ /* Calculate the base mss without TCP options:
+ it is MMS_S - sizeof(tcphdr) in rfc1122 terms.
+ */
+ mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
+
+ /* Clamp it (mss_clamp does not include tcp options) */
+ if (mss_now > tp->mss_clamp)
+ mss_now = tp->mss_clamp;
+
+ /* Now subtract TCP options size, not including SACKs */
+ mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+ /* Now subtract optional transport overhead */
+ mss_now -= tp->ext_header_len;
+
+ /* If we got too small (or even a negative) value,
+ clamp it to 8 from below. Why 8?
+ Well, it could just as well be 1,
+ but if IP accepted a segment of length 1,
+ it would love 8 even more 8) --ANK (980731)
+ */
+ if (mss_now < 8)
+ mss_now = 8;
+
+ /* And store cached results */
+ tp->pmtu_cookie = pmtu;
+ tp->mss_cache = mss_now;
+ return mss_now;
+}
+
+
+/* This routine writes packets to the network. It advances the
+ * send_head. This happens as incoming acks open up the remote
+ * window for us.
+ */
+void tcp_write_xmit(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ unsigned int mss_now;
+
+ /* Account for SACKS, we may need to fragment due to this.
+ * It is just like the real MSS changing on us midstream.
+ * We also handle things correctly when the user adds some
+ * IP options mid-stream. Silly to do, but cover it.
+ */
+ mss_now = tcp_current_mss(sk);
+
+ /* If we are zapped, the bytes will have to remain here.
+ * In time closedown will empty the write queue and all
+ * will be happy.
+ */
+ if(!sk->zapped) {
+ struct sk_buff *skb;
+ int sent_pkts = 0;
+
+ /* Anything on the transmit queue that fits the window can
+ * be added providing we are:
+ *
+ * a) following SWS avoidance [and Nagle algorithm]
+ * b) not exceeding our congestion window.
+ * c) not retransmitting [Nagle]
+ */
+ while((skb = tp->send_head) && tcp_snd_test(sk, skb)) {
+ if (skb->len > mss_now) {
+ if (tcp_fragment(sk, skb, mss_now))
+ break;
+ }
+
+ /* Advance the send_head. This one is going out. */
+ update_send_head(sk);
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ tp->packets_out++;
+ tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+ sent_pkts = 1;
+ }
+
+ /* If we sent anything, make sure the retransmit
+ * timer is active.
+ */
+ if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS))
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ }
+}
+
+/* This function returns the amount that we can raise the
+ * usable window based on the following constraints
+ *
+ * 1. The window can never be shrunk once it is offered (RFC 793)
+ * 2. We limit memory per socket
+ *
+ * RFC 1122:
+ * "the suggested [SWS] avoidance algorithm for the receiver is to keep
+ * RCV.NXT + RCV.WND fixed until:
+ * RCV.BUFF - RCV.USER - RCV.WND >= min(1/2 RCV.BUFF, MSS)"
+ *
+ * i.e. don't raise the right edge of the window until you can raise
+ * it at least MSS bytes.
+ *
+ * Unfortunately, the recommended algorithm breaks header prediction,
+ * since header prediction assumes th->window stays fixed.
+ *
+ * Strictly speaking, keeping th->window fixed violates the receiver
+ * side SWS prevention criteria. The problem is that under this rule
+ * a stream of single byte packets will cause the right side of the
+ * window to always advance by a single byte.
+ *
+ * Of course, if the sender implements sender side SWS prevention
+ * then this will not be a problem.
+ *
+ * BSD seems to make the following compromise:
+ *
+ * If the free space is less than the 1/4 of the maximum
+ * space available and the free space is less than 1/2 mss,
+ * then set the window to 0.
+ * Otherwise, just prevent the window from shrinking
+ * and from being larger than the largest representable value.
+ *
+ * This prevents incremental opening of the window in the regime
+ * where TCP is limited by the speed of the reader side taking
+ * data out of the TCP receive queue. It does nothing about
+ * those cases where the window is constrained on the sender side
+ * because the pipeline is full.
+ *
+ * BSD also seems to "accidentally" limit itself to windows that are a
+ * multiple of MSS, at least until the free space gets quite small.
+ * This would appear to be a side effect of the mbuf implementation.
+ * Combining these two algorithms results in the observed behavior
+ * of having a fixed window size at almost all times.
+ *
+ * Below we obtain similar behavior by forcing the offered window to
+ * a multiple of the mss when it is feasible to do so.
+ *
+ * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
+ */
+u32 __tcp_select_window(struct sock *sk)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ unsigned int mss = tp->mss_cache;
+ int free_space;
+ u32 window;
+
+ /* Sometimes free_space can be < 0. */
+ free_space = (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / 2;
+ if (tp->window_clamp) {
+ if (free_space > ((int) tp->window_clamp))
+ free_space = tp->window_clamp;
+ mss = min(tp->window_clamp, mss);
+ } else {
+ printk("tcp_select_window: tp->window_clamp == 0.\n");
+ }
+
+ if (mss < 1) {
+ mss = 1;
+ printk("tcp_select_window: sk->mss fell to 0.\n");
+ }
+
+ if ((free_space < (sk->rcvbuf/4)) && (free_space < ((int) (mss/2)))) {
+ window = 0;
+ tp->pred_flags = 0;
+ } else {
+ /* Get the largest window that is a nice multiple of mss.
+ * Window clamp already applied above.
+ * If our current window offering is within 1 mss of the
+ * free space we just keep it. This prevents the divide
+ * and multiply from happening most of the time.
+ * We also don't do any window rounding when the free space
+ * is too small.
+ */
+ window = tp->rcv_wnd;
+ if ((((int) window) <= (free_space - ((int) mss))) ||
+ (((int) window) > free_space))
+ window = (((unsigned int) free_space)/mss)*mss;
+ }
+ return window;
+}
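+
+/* Worked example (illustrative): with mss = 1448 and free_space =
+ * 5000, a current offer of rcv_wnd = 4000 lies within one mss of the
+ * free space (3552 < 4000 <= 5000) and is kept as is; an offer of
+ * 2000 would instead be rounded up to the mss multiple
+ * (5000/1448)*1448 = 4344.
+ */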
+
+/* Attempt to collapse two adjacent SKB's during retransmission. */
+static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
+{
+ struct sk_buff *next_skb = skb->next;
+
+ /* The first test we must make is that neither of these two
+ * SKB's are still referenced by someone else.
+ */
+ if(!skb_cloned(skb) && !skb_cloned(next_skb)) {
+ int skb_size = skb->len, next_skb_size = next_skb->len;
+ u16 flags = TCP_SKB_CB(skb)->flags;
+
+ /* Punt if the first SKB has URG set. */
+ if(flags & TCPCB_FLAG_URG)
+ return;
+
+ /* Also punt if next skb has been SACK'd. */
+ if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
+ return;
+
+ /* Punt if not enough space exists in the first SKB for
+ * the data in the second, or the total combined payload
+ * would exceed the MSS.
+ */
+ if ((next_skb_size > skb_tailroom(skb)) ||
+ ((skb_size + next_skb_size) > mss_now))
+ return;
+
+ /* Ok. We will be able to collapse the packet. */
+ __skb_unlink(next_skb, next_skb->list);
+
+ if(skb->len % 4) {
+ /* Must copy and rechecksum all data. */
+ memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
+ skb->csum = csum_partial(skb->data, skb->len, 0);
+ } else {
+ /* Optimize, actually we could also combine next_skb->csum
+ * to skb->csum using a single add w/carry operation too.
+ */
+ skb->csum = csum_partial_copy(next_skb->data,
+ skb_put(skb, next_skb_size),
+ next_skb_size, skb->csum);
+ }
+
+ /* Update sequence range on original skb. */
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
+
+ /* Merge over control information. */
+ flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
+ if(flags & TCPCB_FLAG_URG) {
+ u16 urgptr = TCP_SKB_CB(next_skb)->urg_ptr;
+ TCP_SKB_CB(skb)->urg_ptr = urgptr + skb_size;
+ }
+ TCP_SKB_CB(skb)->flags = flags;
+
+ /* All done, get rid of second SKB and account for it so
+ * packet counting does not break.
+ */
+ kfree_skb(next_skb);
+ sk->tp_pinfo.af_tcp.packets_out--;
+ }
+}
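+
+/* Example (illustrative): with mss_now = 1448, two adjacent queued
+ * segments of 500 and 400 bytes (neither cloned, SACKed nor URG,
+ * and tailroom permitting) are merged into one 900 byte segment
+ * spanning both sequence ranges, and packets_out drops by one.
+ */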
+
+/* Do a simple retransmit without using the backoff mechanisms in
+ * tcp_timer. This is used for path mtu discovery.
+ * The socket is already locked here.
+ */
+void tcp_simple_retransmit(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sk_buff *skb, *old_next_skb;
+ unsigned int mss = tcp_current_mss(sk);
+
+ /* Don't muck with the congestion window here. */
+ tp->dup_acks = 0;
+ tp->high_seq = tp->snd_nxt;
+ tp->retrans_head = NULL;
+
+ /* Input control flow will see that this was retransmitted
+ * and not use it for RTT calculation in the absence of
+ * the timestamp option.
+ */
+ for (old_next_skb = skb = skb_peek(&sk->write_queue);
+ ((skb != tp->send_head) &&
+ (skb != (struct sk_buff *)&sk->write_queue));
+ skb = skb->next) {
+ int resend_skb = 0;
+
+ /* Our goal is to push out the packets which we
+ * sent already, but are being chopped up now to
+ * account for the PMTU information we have.
+ *
+ * As we resend the queue, packets are fragmented
+ * into two pieces, and when we try to send the
+ * second piece it may be collapsed together with
+ * a subsequent packet, and so on. -DaveM
+ */
+ if (old_next_skb != skb || skb->len > mss)
+ resend_skb = 1;
+ old_next_skb = skb->next;
+ if (resend_skb != 0)
+ tcp_retransmit_skb(sk, skb);
+ }
+}
+
+static __inline__ void update_retrans_head(struct sock *sk)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ tp->retrans_head = tp->retrans_head->next;
+ if((tp->retrans_head == tp->send_head) ||
+ (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) {
+ tp->retrans_head = NULL;
+ tp->rexmt_done = 1;
+ }
+}
+
+/* This retransmits one SKB. Policy decisions and retransmit queue
+ * state updates are done by the caller. Returns non-zero if an
+ * error occurred which prevented the send.
+ */
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ unsigned int cur_mss = tcp_current_mss(sk);
+
+ if(skb->len > cur_mss) {
+ if(tcp_fragment(sk, skb, cur_mss))
+ return 1; /* We'll try again later. */
+
+ /* New SKB created, account for it. */
+ tp->packets_out++;
+ }
+
+ /* Collapse two adjacent packets if worthwhile and we can. */
+ if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
+ (skb->len < (cur_mss >> 1)) &&
+ (skb->next != tp->send_head) &&
+ (skb->next != (struct sk_buff *)&sk->write_queue) &&
+ (sysctl_tcp_retrans_collapse != 0))
+ tcp_retrans_try_collapse(sk, skb, cur_mss);
+
+ if(tp->af_specific->rebuild_header(sk))
+ return 1; /* Routing failure or similar. */
+
+ /* Some Solaris stacks overoptimize and ignore the FIN on a
+ * retransmit when old data is attached. So strip it off
+ * since it is cheap to do so and saves bytes on the network.
+ */
+ if(skb->len > 0 &&
+ (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
+ tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
+ TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
+ skb_trim(skb, 0);
+ skb->csum = 0;
+ }
+
+ /* Ok, we're gonna send it out, update state. */
+ TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS;
+ tp->retrans_out++;
+
+ /* Make a copy, if the first transmission SKB clone we made
+ * is still in somebody's hands, else make a clone.
+ */
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ if(skb_cloned(skb))
+ skb = skb_copy(skb, GFP_ATOMIC);
+ else
+ skb = skb_clone(skb, GFP_ATOMIC);
+
+ tcp_transmit_skb(sk, skb);
+
+ /* Update global TCP statistics and return success. */
+ sk->prot->retransmits++;
+ tcp_statistics.TcpRetransSegs++;
+
+ return 0;
+}
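+
+/* Example of the FIN-stripping case above (illustrative): a queued
+ * segment covering [1000, 1101) -- 100 data bytes plus FIN -- whose
+ * data is already acked up to 1100 is retransmitted as a bare FIN at
+ * seq 1100, so stacks that ignore a FIN with old data attached still
+ * see it.
+ */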
+
+/* This gets called after a retransmit timeout, and the initially
+ * retransmitted data is acknowledged. It tries to continue
+ * resending the rest of the retransmit queue, until either
+ * we've sent it all or the congestion window limit is reached.
+ * If doing SACK, the first ACK which comes back for a timeout
+ * based retransmit packet might feed us FACK information again.
+ * If so, we use it to avoid unnecessary retransmissions.
+ */
+void tcp_xmit_retransmit_queue(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sk_buff *skb;
+
+ if (tp->retrans_head == NULL &&
+ tp->rexmt_done == 0)
+ tp->retrans_head = skb_peek(&sk->write_queue);
+ if (tp->retrans_head == tp->send_head)
+ tp->retrans_head = NULL;
+
+ /* Each time, advance the retrans_head if we got
+ * a packet out or we skipped one because it was
+ * SACK'd. -DaveM
+ */
+ while ((skb = tp->retrans_head) != NULL) {
+ /* If it has been ack'd by a SACK block, we don't
+ * retransmit it.
+ */
+ if(!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+ /* Send it out, punt if error occurred. */
+ if(tcp_retransmit_skb(sk, skb))
+ break;
+
+ update_retrans_head(sk);
+
+ /* Stop retransmitting if we've hit the congestion
+ * window limit.
+ */
+ if (tp->retrans_out >= tp->snd_cwnd)
+ break;
+ } else {
+ update_retrans_head(sk);
+ }
+ }
+}
+
+/* Using FACK information, retransmit all frames missing at the receiver
+ * up to the forward-most SACK'd packet (tp->fackets_out), if the packet
+ * has not been retransmitted already.
+ */
+void tcp_fack_retransmit(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sk_buff *skb = skb_peek(&sk->write_queue);
+ int packet_cnt = 0;
+
+ while((skb != NULL) &&
+ (skb != tp->send_head) &&
+ (skb != (struct sk_buff *)&sk->write_queue)) {
+ __u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+ if(sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS))
+ goto next_packet;
+
+ /* Ok, retransmit it. */
+ if(tcp_retransmit_skb(sk, skb))
+ break;
+
+ if(tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+ break;
+next_packet:
+ packet_cnt++;
+ if(packet_cnt >= tp->fackets_out)
+ break;
+ skb = skb->next;
+ }
+}
+
+/* Send a fin. The caller locks the socket for us. This cannot be
+ * allowed to fail queueing a FIN frame under any circumstances.
+ */
+void tcp_send_fin(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
+ unsigned int mss_now;
+
+ /* Optimization, tack on the FIN if we have a queue of
+ * unsent frames. But be careful about outgoing SACKS
+ * and IP options.
+ */
+ mss_now = tcp_current_mss(sk);
+
+ if((tp->send_head != NULL) && (skb->len < mss_now)) {
+ /* tcp_write_xmit() takes care of the rest. */
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
+ TCP_SKB_CB(skb)->end_seq++;
+ tp->write_seq++;
+
+ /* Special case to avoid Nagle bogosity. If this
+ * segment is the last segment, and it was queued
+ * due to Nagle/SWS-avoidance, send it out now.
+ */
+ if(tp->send_head == skb &&
+ !sk->nonagle &&
+ skb->len < (tp->mss_cache >> 1) &&
+ tp->packets_out &&
+ !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
+ update_send_head(sk);
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ tp->packets_out++;
+ tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+ if(!tcp_timer_is_set(sk, TIME_RETRANS))
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ }
+ } else {
+ /* Socket is locked, keep trying until memory is available. */
+ do {
+ skb = sock_wmalloc(sk,
+ (MAX_HEADER +
+ sk->prot->max_header),
+ 1, GFP_KERNEL);
+ } while (skb == NULL);
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb->csum = 0;
+ TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
+ TCP_SKB_CB(skb)->sacked = 0;
+ TCP_SKB_CB(skb)->urg_ptr = 0;
+
+ /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
+ TCP_SKB_CB(skb)->seq = tp->write_seq;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
+ tcp_send_skb(sk, skb, 0);
+ }
+}
+
+/* We get here when a process closes a file descriptor (either due to
+ * an explicit close() or as a byproduct of exit()'ing) and there
+ * was unread data in the receive queue. This behavior is recommended
+ * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
+ */
+void tcp_send_active_reset(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sk_buff *skb;
+
+ /* NOTE: No TCP options attached and we never retransmit this. */
+ skb = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_KERNEL);
+ if (!skb)
+ return;
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb->csum = 0;
+ TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
+ TCP_SKB_CB(skb)->sacked = 0;
+ TCP_SKB_CB(skb)->urg_ptr = 0;
+
+ /* Send it off. */
+ TCP_SKB_CB(skb)->seq = tp->write_seq;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ tcp_transmit_skb(sk, skb);
+}
+
+/* WARNING: This routine must only be called when we have already sent
+ * a SYN packet that crossed the incoming SYN that caused this routine
+ * to get called. If this assumption fails then the initial rcv_wnd
+ * and rcv_wscale values will not be correct.
+ */
+int tcp_send_synack(struct sock *sk)
+{
+ struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp);
+ struct sk_buff* skb;
+
+ skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
+ 1, GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb->csum = 0;
+ TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN);
+ TCP_SKB_CB(skb)->sacked = 0;
+ TCP_SKB_CB(skb)->urg_ptr = 0;
+
+ /* SYN eats a sequence byte. */
+ TCP_SKB_CB(skb)->seq = tp->snd_una;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
+ __skb_queue_tail(&sk->write_queue, skb);
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ tp->packets_out++;
+ tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+ return 0;
+}
+
+/*
+ * Prepare a SYN-ACK.
+ */
+struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
+ struct open_request *req, int mss)
+{
+ struct tcphdr *th;
+ int tcp_header_size;
+ struct sk_buff *skb;
+
+ skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC);
+ if (skb == NULL)
+ return NULL;
+
+ /* Reserve space for headers. */
+ skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+
+ skb->dst = dst_clone(dst);
+
+ /* Don't offer more than they did.
+ * This way we don't have to memorize who said what.
+ * FIXME: maybe this should be changed for better performance
+ * with syncookies.
+ */
+ req->mss = min(mss, req->mss);
+ if (req->mss < 8) {
+ printk(KERN_DEBUG "initial req->mss below 8\n");
+ req->mss = 8;
+ }
+
+ tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
+ (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
+ (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
+ /* SACK_PERM is in the place of NOP NOP of TS */
+ ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
+ skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
+
+ memset(th, 0, sizeof(struct tcphdr));
+ th->syn = 1;
+ th->ack = 1;
+ th->source = sk->sport;
+ th->dest = req->rmt_port;
+ TCP_SKB_CB(skb)->seq = req->snt_isn;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
+ th->seq = htonl(TCP_SKB_CB(skb)->seq);
+ th->ack_seq = htonl(req->rcv_isn + 1);
+ if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
+ __u8 rcv_wscale;
+ /* Set this up on the first call only */
+ req->window_clamp = skb->dst->window;
+ tcp_select_initial_window(sock_rspace(sk)/2,req->mss,
+ &req->rcv_wnd,
+ &req->window_clamp,
+ req->wscale_ok,
+ &rcv_wscale);
+ req->rcv_wscale = rcv_wscale;
+ }
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
+ th->window = htons(req->rcv_wnd);
+
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ tcp_syn_build_options((__u32 *)(th + 1), req->mss, req->tstamp_ok,
+ req->sack_ok, req->wscale_ok, req->rcv_wscale,
+ TCP_SKB_CB(skb)->when,
+ req->ts_recent);
+
+ skb->csum = 0;
+ th->doff = (tcp_header_size >> 2);
+ tcp_statistics.TcpOutSegs++;
+ return skb;
+}
+
+void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu)
+{
+ struct dst_entry *dst = sk->dst_cache;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* Reserve space for headers. */
+ skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
+
+ tp->snd_wnd = 0;
+ tp->snd_wl1 = 0;
+ tp->snd_wl2 = tp->write_seq;
+ tp->snd_una = tp->write_seq;
+ tp->rcv_nxt = 0;
+
+ sk->err = 0;
+
+ /* We'll fix this up when we get a response from the other end.
+ * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
+ */
+ tp->tcp_header_len = sizeof(struct tcphdr) +
+ (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+
+ /* If user gave his TCP_MAXSEG, record it to clamp */
+ if (tp->user_mss)
+ tp->mss_clamp = tp->user_mss;
+ tcp_sync_mss(sk, mtu);
+
+ /* Now an unpleasant action: if the initial pmtu is too low,
+ set a lower clamp. I am not sure that this is good.
+ To be more exact, I do not think that clamping to a value which
+ is apparently transient, and may improve in the future, is a
+ good idea. It would be better to wait until the peer returns
+ its MSS (probably 65535 too) and then advertise something like
+ 65535, or at least the first hop device mtu.
+ Is it clear what I mean?
+ We should tell the peer what maximal mss we expect to RECEIVE;
+ it has nothing to do with pmtu.
+ I am afraid someone will be confused by such a huge value.
+ --ANK (980731)
+ */
+ if (tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr) < tp->mss_clamp )
+ tp->mss_clamp = tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr);
+
+ TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
+ TCP_SKB_CB(buff)->sacked = 0;
+ TCP_SKB_CB(buff)->urg_ptr = 0;
+ buff->csum = 0;
+ TCP_SKB_CB(buff)->seq = tp->write_seq++;
+ TCP_SKB_CB(buff)->end_seq = tp->write_seq;
+ tp->snd_nxt = TCP_SKB_CB(buff)->end_seq;
+
+ tp->window_clamp = dst->window;
+ tcp_select_initial_window(sock_rspace(sk)/2,tp->mss_clamp,
+ &tp->rcv_wnd,
+ &tp->window_clamp,
+ sysctl_tcp_window_scaling,
+ &tp->rcv_wscale);
+ /* Ok, now lock the socket before we make it visible to
+ * the incoming packet engine.
+ */
+ lock_sock(sk);
+
+ /* Socket identity change complete, no longer
+ * in TCP_CLOSE, so enter ourselves into the
+ * hash tables.
+ */
+ tcp_set_state(sk,TCP_SYN_SENT);
+ sk->prot->hash(sk);
+
+ tp->rto = dst->rtt;
+ tcp_init_xmit_timers(sk);
+ tp->retransmits = 0;
+ tp->fackets_out = 0;
+ tp->retrans_out = 0;
+
+ /* Send it off. */
+ __skb_queue_tail(&sk->write_queue, buff);
+ TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ tp->packets_out++;
+ tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
+ tcp_statistics.TcpActiveOpens++;
+
+ /* Timer for repeating the SYN until an answer. */
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+
+ /* Now, it is safe to release the socket. */
+ release_sock(sk);
+}
+
+/* Send out a delayed ack, the caller does the policy checking
+ * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
+ * for details.
+ */
+void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
+{
+ unsigned long timeout;
+
+ /* Stay within the limit we were given */
+ timeout = tp->ato;
+ if (timeout > max_timeout)
+ timeout = max_timeout;
+ timeout += jiffies;
+
+ /* Use the new timeout only if there isn't an older one already pending. */
+ if (!tp->delack_timer.prev) {
+ tp->delack_timer.expires = timeout;
+ add_timer(&tp->delack_timer);
+ } else {
+ if (time_before(timeout, tp->delack_timer.expires))
+ mod_timer(&tp->delack_timer, timeout);
+ }
+}
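+
+/* Example (illustrative): called with tp->ato = HZ/4 and max_timeout
+ * = HZ/2, the delayed-ack timer is armed HZ/4 jiffies out; with
+ * tp->ato = HZ it would be clamped to HZ/2.  A later call can only
+ * pull a pending expiry earlier, never push it back.
+ */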
+
+/* This routine sends an ack and also updates the window. */
+void tcp_send_ack(struct sock *sk)
+{
+ /* If we have been reset, we may not send again. */
+ if(!sk->zapped) {
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sk_buff *buff;
+
+ /* We are not putting this on the write queue, so
+ * tcp_transmit_skb() will set the ownership to this
+ * sock.
+ */
+ buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC);
+ if (buff == NULL) {
+ /* Force it to send an ack. We don't have to do this
+ * (ACK is unreliable) but it's much better use of
+ * bandwidth on slow links to send a spare ack than
+ * resend packets.
+ *
+ * This is the one possible way that we can delay an
+ * ACK and have tp->ato indicate that we are in
+ * quick ack mode, so clear it.
+ */
+ if(tcp_in_quickack_mode(tp))
+ tcp_exit_quickack_mode(tp);
+ tcp_send_delayed_ack(tp, HZ/2);
+ return;
+ }
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
+ buff->csum = 0;
+ TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
+ TCP_SKB_CB(buff)->sacked = 0;
+ TCP_SKB_CB(buff)->urg_ptr = 0;
+
+ /* Send it off, this clears delayed acks for us. */
+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt;
+ TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ tcp_transmit_skb(sk, buff);
+ }
+}
+
+/* This routine sends a packet with an out of date sequence
+ * number. It assumes the other end will try to ack it.
+ */
+void tcp_write_wakeup(struct sock *sk)
+{
+ /* After a valid reset we can send no more. */
+ if (!sk->zapped) {
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sk_buff *skb;
+
+ /* Write data can still be transmitted/retransmitted in the
+ * following states. If any other state is encountered, return.
+ * [listen/close will never occur here anyway]
+ */
+ if ((1 << sk->state) &
+ ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
+ TCPF_LAST_ACK|TCPF_CLOSING))
+ return;
+
+ if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) &&
+ ((skb = tp->send_head) != NULL)) {
+ unsigned long win_size;
+
+ /* We are probing the opening of a window
+ * but the window size is != 0; this must have
+ * been a result of sender-side SWS avoidance.
+ */
+ win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
+ if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
+ if (tcp_fragment(sk, skb, win_size))
+ return; /* Let a retransmit get it. */
+ }
+ update_send_head(sk);
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ tp->packets_out++;
+ tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+ if (!tcp_timer_is_set(sk, TIME_RETRANS))
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ } else {
+ /* We don't queue it, tcp_transmit_skb() sets ownership. */
+ skb = alloc_skb(MAX_HEADER + sk->prot->max_header,
+ GFP_ATOMIC);
+ if (skb == NULL)
+ return;
+
+ /* Reserve space for headers and set control bits. */
+ skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+ skb->csum = 0;
+ TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
+ TCP_SKB_CB(skb)->sacked = 0;
+ TCP_SKB_CB(skb)->urg_ptr = 0;
+
+ /* Use a previous sequence. This should cause the other
+ * end to send an ack. Don't queue or clone SKB, just
+ * send it.
+ */
+ TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+ tcp_transmit_skb(sk, skb);
+ }
+ }
+}
+
+/* A window probe timeout has occurred.  If the window is not closed,
+ * send a partial packet, else send a zero-window probe.
+ */
+void tcp_send_probe0(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ tcp_write_wakeup(sk);
+ tp->pending = TIME_PROBE0;
+ tp->backoff++;
+ tp->probes_out++;
+ tcp_reset_xmit_timer (sk, TIME_PROBE0,
+ min(tp->rto << tp->backoff, 120*HZ));
+}
diff --git a/pfinet/linux-src/net/ipv4/tcp_timer.c b/pfinet/linux-src/net/ipv4/tcp_timer.c
new file mode 100644
index 00000000..21029f8e
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/tcp_timer.c
@@ -0,0 +1,595 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version: $Id: tcp_timer.c,v 1.62.2.3 1999/06/20 20:14:30 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <net/tcp.h>
+
+int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
+int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
+int sysctl_tcp_retries1 = TCP_RETR1;
+int sysctl_tcp_retries2 = TCP_RETR2;
+
+static void tcp_sltimer_handler(unsigned long);
+static void tcp_syn_recv_timer(unsigned long);
+static void tcp_keepalive(unsigned long data);
+static void tcp_twkill(unsigned long);
+
+struct timer_list tcp_slow_timer = {
+ NULL, NULL,
+ 0, 0,
+ tcp_sltimer_handler,
+};
+
+
+struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
+ {ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */
+ {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive}, /* KEEPALIVE */
+ {ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill} /* TWKILL */
+};
+
+const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
+
+/*
+ * We use different timers for retransmit, delayed acks and probes,
+ * though we may wish to use just one timer maintaining a list of
+ * expiry jiffies as an optimization.
+ */
+
+void tcp_init_xmit_timers(struct sock *sk)
+{
+ init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer);
+ sk->tp_pinfo.af_tcp.retransmit_timer.function=&tcp_retransmit_timer;
+ sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk;
+
+ init_timer(&sk->tp_pinfo.af_tcp.delack_timer);
+ sk->tp_pinfo.af_tcp.delack_timer.function=&tcp_delack_timer;
+ sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk;
+
+ init_timer(&sk->tp_pinfo.af_tcp.probe_timer);
+ sk->tp_pinfo.af_tcp.probe_timer.function=&tcp_probe_timer;
+ sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk;
+}
+
+/*
+ * Reset the retransmission timer
+ */
+
+void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ switch (what) {
+ case TIME_RETRANS:
+ /* When setting the retransmit timer the probe timer
+ * should not be set.
+ * The delayed ack timer can be set if we are changing the
+ * retransmit timer when removing acked frames.
+ */
+ if(tp->probe_timer.prev)
+ del_timer(&tp->probe_timer);
+ mod_timer(&tp->retransmit_timer, jiffies+when);
+ break;
+
+ case TIME_DACK:
+ mod_timer(&tp->delack_timer, jiffies+when);
+ break;
+
+ case TIME_PROBE0:
+ mod_timer(&tp->probe_timer, jiffies+when);
+ break;
+
+ case TIME_WRITE:
+ printk(KERN_DEBUG "bug: tcp_reset_xmit_timer TIME_WRITE\n");
+ break;
+
+ default:
+ printk(KERN_DEBUG "bug: unknown timer value\n");
+ };
+}
+
+void tcp_clear_xmit_timers(struct sock *sk)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ if(tp->retransmit_timer.prev)
+ del_timer(&tp->retransmit_timer);
+ if(tp->delack_timer.prev)
+ del_timer(&tp->delack_timer);
+ if(tp->probe_timer.prev)
+ del_timer(&tp->probe_timer);
+}
+
+static int tcp_write_err(struct sock *sk, int force)
+{
+ sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT;
+ sk->error_report(sk);
+
+ tcp_clear_xmit_timers(sk);
+
+ /* Time wait the socket. */
+ if (!force && ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING))) {
+ tcp_time_wait(sk);
+ } else {
+ /* Clean up time. */
+ tcp_set_state(sk, TCP_CLOSE);
+ return 0;
+ }
+ return 1;
+}
+
+/* A write timeout has occurred. Process the after effects. */
+static int tcp_write_timeout(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* Look for a 'soft' timeout. */
+ if ((sk->state == TCP_ESTABLISHED &&
+ tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) ||
+ (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) {
+ dst_negative_advice(&sk->dst_cache);
+ }
+
+ /* Have we tried to SYN too many times (repent repent 8)) */
+ if(tp->retransmits > sysctl_tcp_syn_retries && sk->state==TCP_SYN_SENT) {
+ tcp_write_err(sk, 1);
+ /* Don't FIN, we got nothing back */
+ return 0;
+ }
+
+ /* Has it gone just too far? */
+ if (tp->retransmits > sysctl_tcp_retries2)
+ return tcp_write_err(sk, 0);
+
+ return 1;
+}
+
+void tcp_delack_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock*)data;
+
+ if(!sk->zapped &&
+ sk->tp_pinfo.af_tcp.delayed_acks &&
+ sk->state != TCP_CLOSE) {
+ /* If socket is currently locked, defer the ACK. */
+ if (!atomic_read(&sk->sock_readers))
+ tcp_send_ack(sk);
+ else
+ tcp_send_delayed_ack(&(sk->tp_pinfo.af_tcp), HZ/10);
+ }
+}
+
+void tcp_probe_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock*)data;
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ if(sk->zapped)
+ return;
+
+ if (atomic_read(&sk->sock_readers)) {
+ /* Try again later. */
+ tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5);
+ return;
+ }
+
+ /* *WARNING* RFC 1122 forbids this
+ * It doesn't AFAIK, because we kill the retransmit timer -AK
+ * FIXME: We ought not to do it; Solaris 2.5 actually lists
+ * fixing this behaviour as a bug fix. [AC]
+ */
+ if (tp->probes_out > sysctl_tcp_retries2) {
+ if(sk->err_soft)
+ sk->err = sk->err_soft;
+ else
+ sk->err = ETIMEDOUT;
+ sk->error_report(sk);
+
+ if ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) {
+ /* Time wait the socket. */
+ tcp_time_wait(sk);
+ } else {
+ /* Clean up time. */
+ tcp_set_state(sk, TCP_CLOSE);
+ }
+ } else {
+ /* Only send another probe if we didn't close things up. */
+ tcp_send_probe0(sk);
+ }
+}
+
+static __inline__ int tcp_keepopen_proc(struct sock *sk)
+{
+ int res = 0;
+
+ if ((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2)) {
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
+
+ if (elapsed >= sysctl_tcp_keepalive_time) {
+ if (tp->probes_out > sysctl_tcp_keepalive_probes) {
+ if(sk->err_soft)
+ sk->err = sk->err_soft;
+ else
+ sk->err = ETIMEDOUT;
+
+ tcp_set_state(sk, TCP_CLOSE);
+ sk->shutdown = SHUTDOWN_MASK;
+ if (!sk->dead)
+ sk->state_change(sk);
+ } else {
+ tp->probes_out++;
+ tp->pending = TIME_KEEPOPEN;
+ tcp_write_wakeup(sk);
+ res = 1;
+ }
+ }
+ }
+ return res;
+}
+
+/* Kill off TIME_WAIT sockets once their lifetime has expired. */
+int tcp_tw_death_row_slot = 0;
+static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] =
+ { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
+
+extern void tcp_timewait_kill(struct tcp_tw_bucket *tw);
+
+static void tcp_twkill(unsigned long data)
+{
+ struct tcp_tw_bucket *tw;
+ int killed = 0;
+
+ tw = tcp_tw_death_row[tcp_tw_death_row_slot];
+ tcp_tw_death_row[tcp_tw_death_row_slot] = NULL;
+ while(tw != NULL) {
+ struct tcp_tw_bucket *next = tw->next_death;
+
+ tcp_timewait_kill(tw);
+ killed++;
+ tw = next;
+ }
+ if(killed != 0) {
+ struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data;
+ atomic_sub(killed, &slt->count);
+ }
+ tcp_tw_death_row_slot =
+ ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
+}
+
+/* These are always called from BH context. See callers in
+ * tcp_input.c to verify this.
+ */
+void tcp_tw_schedule(struct tcp_tw_bucket *tw)
+{
+ int slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
+ struct tcp_tw_bucket **tpp = &tcp_tw_death_row[slot];
+
+ if((tw->next_death = *tpp) != NULL)
+ (*tpp)->pprev_death = &tw->next_death;
+ *tpp = tw;
+ tw->pprev_death = tpp;
+
+ tw->death_slot = slot;
+
+ tcp_inc_slow_timer(TCP_SLT_TWKILL);
+}
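+
+/* Timing sketch (illustrative, assuming a 60 second TIME_WAIT
+ * lifetime): tcp_twkill() advances one of the 8 slots every
+ * 60s/8 = 7.5s, so a bucket parked in the slot just behind the
+ * current one is reaped after 7 full periods -- a shade under the
+ * nominal 60 seconds, and never more than it.
+ */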
+
+/* Happens rarely if at all, no care about scalability here. */
+void tcp_tw_reschedule(struct tcp_tw_bucket *tw)
+{
+ struct tcp_tw_bucket **tpp;
+ int slot;
+
+ if(tw->next_death)
+ tw->next_death->pprev_death = tw->pprev_death;
+ *tw->pprev_death = tw->next_death;
+ tw->pprev_death = NULL;
+
+ slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
+ tpp = &tcp_tw_death_row[slot];
+ if((tw->next_death = *tpp) != NULL)
+ (*tpp)->pprev_death = &tw->next_death;
+ *tpp = tw;
+ tw->pprev_death = tpp;
+
+ tw->death_slot = slot;
+ /* Timer was incremented when we first entered the table. */
+}
+
+/* This is for handling early-kills of TIME_WAIT sockets. */
+void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
+{
+ if(tw->next_death)
+ tw->next_death->pprev_death = tw->pprev_death;
+ *tw->pprev_death = tw->next_death;
+ tw->pprev_death = NULL;
+ tcp_dec_slow_timer(TCP_SLT_TWKILL);
+}
+
+/*
+ * Check all sockets for keepalive timer
+ * Called every 75 seconds
+ * This timer is started by af_inet init routine and is constantly
+ * running.
+ *
+ * It might be better to maintain a count of sockets that need it using
+ * setsockopt/tcp_destroy_sk and only set the timer when needed.
+ */
+
+/*
+ * don't send over 5 keepopens at a time to avoid burstiness
+ * on big servers [AC]
+ */
+#define MAX_KA_PROBES 5
+
+int sysctl_tcp_max_ka_probes = MAX_KA_PROBES;
+
+/* Keepopen's are only valid for "established" TCP's, nicely our listener
+ * hash gets rid of most of the useless testing, so we run through a couple
+ * of the established hash chains each clock tick. -DaveM
+ *
+ * And now, even more magic... TIME_WAIT TCP's cannot have keepalive probes
+ * going off for them, so we only need check the first half of the established
+ * hash table, even less testing under heavy load.
+ *
+ * I _really_ would rather do this by adding a new timer_struct to struct
+ * sock, so that only those who set the keepalive option pay the overhead.
+ * The idea is that you set it for 2 hours when the sock is first connected;
+ * when it does fire off (if at all -- most sockets die earlier) you check
+ * for the keepalive option and also whether the sock has been idle long
+ * enough to start probing.
+ */
+static void tcp_keepalive(unsigned long data)
+{
+ static int chain_start = 0;
+ int count = 0;
+ int i;
+
+ for(i = chain_start; i < (chain_start + ((TCP_HTABLE_SIZE/2) >> 2)); i++) {
+ struct sock *sk = tcp_established_hash[i];
+ while(sk) {
+ if(!atomic_read(&sk->sock_readers) && sk->keepopen) {
+ count += tcp_keepopen_proc(sk);
+ if(count == sysctl_tcp_max_ka_probes)
+ goto out;
+ }
+ sk = sk->next;
+ }
+ }
+out:
+ chain_start = ((chain_start + ((TCP_HTABLE_SIZE/2)>>2)) &
+ ((TCP_HTABLE_SIZE/2) - 1));
+}
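+
+/* Scan-rate example (illustrative, assuming TCP_HTABLE_SIZE is 256):
+ * each invocation walks (256/2) >> 2 = 32 chains, so the relevant
+ * half of the established hash is covered every four timer ticks,
+ * while never sending more than sysctl_tcp_max_ka_probes probes per
+ * tick.
+ */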
+
+/*
+ * The TCP retransmit timer. This lacks a few small details.
+ *
+ * 1. An initial rtt timeout on the probe0 should cause as much as
+ *    we can of the first write queue buffer to be split and sent.
+ * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report
+ *    ETIMEDOUT if we know an additional 'soft' error caused this.
+ *    tcp_err should save a 'soft error' for us.
+ *    [Unless someone has broken it, it does -- except for one broken
+ *    2.0 case of a send when the route/device is directly unreachable,
+ *    where we error but should retry! - FIXME] [AC]
+ */
+
+void tcp_retransmit_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock*)data;
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ /* We are reset. We will send no more retransmits. */
+ if(sk->zapped) {
+ tcp_clear_xmit_timer(sk, TIME_RETRANS);
+ return;
+ }
+
+ if (atomic_read(&sk->sock_readers)) {
+ /* Try again later */
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20);
+ return;
+ }
+
+ /* Clear delay ack timer. */
+ tcp_clear_xmit_timer(sk, TIME_DACK);
+
+ /* RFC 2018, clear all 'sacked' flags in retransmission queue,
+ * the sender may have dropped out of order frames and we must
+ * send them out should this timer fire on us.
+ */
+ if(tp->sack_ok) {
+ struct sk_buff *skb = skb_peek(&sk->write_queue);
+
+ while((skb != NULL) &&
+ (skb != tp->send_head) &&
+ (skb != (struct sk_buff *)&sk->write_queue)) {
+ TCP_SKB_CB(skb)->sacked &=
+ ~(TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS);
+ skb = skb->next;
+ }
+ }
+
+ /* Retransmission. */
+ tp->retrans_head = NULL;
+ tp->rexmt_done = 0;
+ tp->fackets_out = 0;
+ tp->retrans_out = 0;
+ if (tp->retransmits == 0) {
+ /* Remember window where we lost:
+ * "one half of the current window but at least 2 segments"
+ *
+ * Here "current window" means the effective one, which
+ * means it must be an accurate representation of our current
+ * sending rate _and_ the snd_wnd.
+ */
+ tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+ tp->snd_cwnd_cnt = 0;
+ tp->snd_cwnd = 1;
+ }
+
+ tp->retransmits++;
+
+ tp->dup_acks = 0;
+ tp->high_seq = tp->snd_nxt;
+ tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
+
+ /* Increase the timeout each time we retransmit. Note that
+ * we do not increase the rtt estimate. rto is initialized
+ * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
+ * that doubling rto each time is the least we can get away with.
+ * In KA9Q, Karn uses this for the first few times, and then
+ * goes to quadratic. netBSD doubles, but only goes up to *64,
+ * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
+ * defined in the protocol as the maximum possible RTT. I guess
+ * we'll have to use something other than TCP to talk to the
+ * University of Mars.
+ *
+ * PAWS allows us longer timeouts and large windows, so once
+ * implemented ftp to mars will work nicely. We will have to fix
+ * the 120 second clamps though!
+ */
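+ /* Example (illustrative, assuming the usual 3*HZ initial rto):
+ * the timeouts fire 3s, 6s, 12s, 24s, 48s and 96s apart, then
+ * stay pinned at the 120 second clamp on every further backoff.
+ */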
+ tp->backoff++;
+ tp->rto = min(tp->rto << 1, 120*HZ);
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+
+ tcp_write_timeout(sk);
+}
+
+/*
+ * Slow timer for SYN-RECV sockets
+ */
+
+/* This now scales very nicely. -DaveM */
+static void tcp_syn_recv_timer(unsigned long data)
+{
+ struct sock *sk;
+ unsigned long now = jiffies;
+ int i;
+
+ for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
+ sk = tcp_listening_hash[i];
+
+ while(sk) {
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ /* TCP_LISTEN is implied. */
+ if (!atomic_read(&sk->sock_readers) && tp->syn_wait_queue) {
+ struct open_request *prev = (struct open_request *)(&tp->syn_wait_queue);
+ struct open_request *req = tp->syn_wait_queue;
+ do {
+ struct open_request *conn;
+
+ conn = req;
+ req = req->dl_next;
+
+ if (conn->sk ||
+ ((long)(now - conn->expires)) <= 0) {
+ prev = conn;
+ continue;
+ }
+
+ tcp_synq_unlink(tp, conn, prev);
+ if (conn->retrans >= sysctl_tcp_retries1) {
+#ifdef TCP_DEBUG
+ printk(KERN_DEBUG "syn_recv: "
+ "too many retransmits\n");
+#endif
+ (*conn->class->destructor)(conn);
+ tcp_dec_slow_timer(TCP_SLT_SYNACK);
+ tp->syn_backlog--;
+ tcp_openreq_free(conn);
+
+ if (!tp->syn_wait_queue)
+ break;
+ } else {
+ unsigned long timeo;
+ struct open_request *op;
+
+ (*conn->class->rtx_syn_ack)(sk, conn);
+
+ conn->retrans++;
+#ifdef TCP_DEBUG
+ printk(KERN_DEBUG "syn_ack rtx %d\n",
+ conn->retrans);
+#endif
+ timeo = min((TCP_TIMEOUT_INIT
+ << conn->retrans),
+ 120*HZ);
+ conn->expires = now + timeo;
+ op = prev->dl_next;
+ tcp_synq_queue(tp, conn);
+ if (op != prev->dl_next)
+ prev = prev->dl_next;
+ }
+ /* old prev still valid here */
+ } while (req);
+ }
+ sk = sk->next;
+ }
+ }
+}
+
+void tcp_sltimer_handler(unsigned long data)
+{
+ struct tcp_sl_timer *slt = tcp_slt_array;
+ unsigned long next = ~0UL;
+ unsigned long now = jiffies;
+ int i;
+
+ for (i=0; i < TCP_SLT_MAX; i++, slt++) {
+ if (atomic_read(&slt->count)) {
+ long trigger;
+
+ trigger = slt->period - ((long)(now - slt->last));
+
+ if (trigger <= 0) {
+ (*slt->handler)((unsigned long) slt);
+ slt->last = now;
+ trigger = slt->period;
+ }
+
+ /* Only reschedule if some events remain. */
+ if (atomic_read(&slt->count))
+ next = min(next, trigger);
+ }
+ }
+ if (next != ~0UL)
+ mod_timer(&tcp_slow_timer, (now + next));
+}
+
+void __tcp_inc_slow_timer(struct tcp_sl_timer *slt)
+{
+ unsigned long now = jiffies;
+ unsigned long when;
+
+ slt->last = now;
+
+ when = now + slt->period;
+
+ if (tcp_slow_timer.prev) {
+ if ((long)(tcp_slow_timer.expires - when) >= 0)
+ mod_timer(&tcp_slow_timer, when);
+ } else {
+ tcp_slow_timer.expires = when;
+ add_timer(&tcp_slow_timer);
+ }
+}
diff --git a/pfinet/linux-src/net/ipv4/timer.c b/pfinet/linux-src/net/ipv4/timer.c
new file mode 100644
index 00000000..3821a7c4
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/timer.c
@@ -0,0 +1,127 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * TIMER - implementation of software timers for IP.
+ *
+ * Version: $Id: timer.c,v 1.15 1999/02/22 13:54:29 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ *
+ * Fixes:
+ * Alan Cox : To avoid destroying a wait queue as we use it
+ * we defer destruction until the destroy timer goes
+ * off.
+ * Alan Cox : Destroy socket doesn't write a status value to the
+ * socket buffer _AFTER_ freeing it! Also sock ensures
+ * the socket will get removed BEFORE this is called
+ * otherwise if the timer TIME_DESTROY occurs inside
+ * of inet_bh() with this socket being handled it goes
+ * BOOM! Have to stop timer going off if net_bh is
+ * active or the destroy causes crashes.
+ * Alan Cox : Cleaned up unused code.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <asm/system.h>
+#include <linux/interrupt.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+
+void net_delete_timer (struct sock *t)
+{
+ if(t->timer.prev)
+ del_timer (&t->timer);
+ t->timeout = 0;
+}
+
+void net_reset_timer (struct sock *t, int timeout, unsigned long len)
+{
+ t->timeout = timeout;
+ mod_timer(&t->timer, jiffies+len);
+}
+
+/* Now we will only be called whenever we need to do
+ * something, but we must be sure to process all of the
+ * sockets that need it.
+ */
+void net_timer (unsigned long data)
+{
+ struct sock *sk = (struct sock*)data;
+ int why = sk->timeout;
+
+ /* Only process if socket is not in use. */
+ if (atomic_read(&sk->sock_readers)) {
+ /* Try again later. */
+ mod_timer(&sk->timer, jiffies+HZ/20);
+ return;
+ }
+
+ /* Always see if we need to send an ack. */
+ if (sk->tp_pinfo.af_tcp.delayed_acks && !sk->zapped) {
+ sk->prot->read_wakeup (sk);
+ if (!sk->dead)
+ sk->data_ready(sk,0);
+ }
+
+ /* Now we need to figure out why the socket was on the timer. */
+ switch (why) {
+ case TIME_DONE:
+ /* If the socket hasn't been closed off, re-try a bit later. */
+ if (!sk->dead) {
+ net_reset_timer(sk, TIME_DONE, TCP_DONE_TIME);
+ break;
+ }
+
+ if (sk->state != TCP_CLOSE) {
+ printk (KERN_DEBUG "non CLOSE socket in time_done\n");
+ break;
+ }
+ destroy_sock (sk);
+ break;
+
+ case TIME_DESTROY:
+ /* We've waited for a while for all the memory associated with
+ * the socket to be freed.
+ */
+ destroy_sock(sk);
+ break;
+
+ case TIME_CLOSE:
+ /* We've waited long enough, close the socket. */
+ tcp_set_state(sk, TCP_CLOSE);
+ sk->shutdown = SHUTDOWN_MASK;
+ if (!sk->dead)
+ sk->state_change(sk);
+ net_reset_timer (sk, TIME_DONE, TCP_DONE_TIME);
+ break;
+
+ default:
+ /* I want to see these... */
+ printk ("net_timer: timer expired - reason %d is unknown\n", why);
+ break;
+ }
+}
+
diff --git a/pfinet/linux-src/net/ipv4/udp.c b/pfinet/linux-src/net/ipv4/udp.c
new file mode 100644
index 00000000..909e858f
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/udp.c
@@ -0,0 +1,1191 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The User Datagram Protocol (UDP).
+ *
+ * Version: $Id: udp.c,v 1.66.2.3 1999/08/07 10:56:36 davem Exp $
+ *
+ * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Alan Cox, <Alan.Cox@linux.org>
+ *
+ * Fixes:
+ * Alan Cox : verify_area() calls
+ * Alan Cox : stopped close while in use off icmp
+ * messages. Not a fix but a botch that
+ * for udp at least is 'valid'.
+ * Alan Cox : Fixed icmp handling properly
+ * Alan Cox : Correct error for oversized datagrams
+ * Alan Cox : Tidied select() semantics.
+ * Alan Cox : udp_err() fixed properly, also now
+ * select and read wake correctly on errors
+ * Alan Cox : udp_send verify_area moved to avoid mem leak
+ * Alan Cox : UDP can count its memory
+ * Alan Cox : send to an unknown connection causes
+ * an ECONNREFUSED off the icmp, but
+ * does NOT close.
+ * Alan Cox : Switched to new sk_buff handlers. No more backlog!
+ * Alan Cox : Using generic datagram code. Even smaller and the PEEK
+ * bug no longer crashes it.
+ * Fred Van Kempen : Net2e support for sk->broadcast.
+ * Alan Cox : Uses skb_free_datagram
+ * Alan Cox : Added get/set sockopt support.
+ * Alan Cox : Broadcasting without option set returns EACCES.
+ * Alan Cox : No wakeup calls. Instead we now use the callbacks.
+ * Alan Cox : Use ip_tos and ip_ttl
+ * Alan Cox : SNMP Mibs
+ * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support.
+ * Matt Dillon : UDP length checks.
+ * Alan Cox : Smarter af_inet used properly.
+ * Alan Cox : Use new kernel side addressing.
+ * Alan Cox : Incorrect return on truncated datagram receive.
+ * Arnt Gulbrandsen : New udp_send and stuff
+ * Alan Cox : Cache last socket
+ * Alan Cox : Route cache
+ * Jon Peatfield : Minor efficiency fix to sendto().
+ * Mike Shaver : RFC1122 checks.
+ * Alan Cox : Nonblocking error fix.
+ * Willy Konynenberg : Transparent proxying support.
+ * Mike McLagan : Routing by source
+ * David S. Miller : New socket lookup architecture.
+ * Last socket cache retained as it
+ * does have a high hit rate.
+ * Olaf Kirch : Don't linearise iovec on sendmsg.
+ * Andi Kleen : Some cleanups, cache destination entry
+ * for connect.
+ * Vitaly E. Lavrov : Transparent proxy revived after year coma.
+ * Melvin Smith : Check msg_name not msg_namelen in sendto(),
+ * return ENOTCONN for unconnected sockets (POSIX)
+ * Janos Farkas : don't deliver multi/broadcasts to a different
+ * bound-to-device socket
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* RFC1122 Status:
+ 4.1.3.1 (Ports):
+ SHOULD send ICMP_PORT_UNREACHABLE in response to datagrams to
+ an un-listened port. (OK)
+ 4.1.3.2 (IP Options)
+ MUST pass IP options from IP -> application (OK)
+ MUST allow application to specify IP options (OK)
+ 4.1.3.3 (ICMP Messages)
+ MUST pass ICMP error messages to application (OK -- except when SO_BSDCOMPAT is set)
+ 4.1.3.4 (UDP Checksums)
+ MUST provide facility for checksumming (OK)
+ MAY allow application to control checksumming (OK)
+ MUST default to checksumming on (OK)
+ MUST discard silently datagrams with bad csums (OK, except during debugging)
+ 4.1.3.5 (UDP Multihoming)
+ MUST allow application to specify source address (OK)
+ SHOULD be able to communicate the chosen src addr up to application
+ when application doesn't choose (DOES - use recvmsg cmsgs)
+ 4.1.3.6 (Invalid Addresses)
+ MUST discard invalid source addresses (OK -- done in the new routing code)
+ MUST only send datagrams with one of our addresses (OK)
+*/
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/config.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/checksum.h>
+
+/*
+ * Snmp MIB for the UDP layer
+ */
+
+struct udp_mib udp_statistics;
+
+struct sock *udp_hash[UDP_HTABLE_SIZE];
+
+/* Shared by v4/v6 udp. */
+int udp_port_rover = 0;
+
+static int udp_v4_get_port(struct sock *sk, unsigned short snum)
+{
+ SOCKHASH_LOCK();
+ if (snum == 0) {
+ int best_size_so_far, best, result, i;
+
+ if (udp_port_rover > sysctl_local_port_range[1] ||
+ udp_port_rover < sysctl_local_port_range[0])
+ udp_port_rover = sysctl_local_port_range[0];
+ best_size_so_far = 32767;
+ best = result = udp_port_rover;
+ for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
+ struct sock *sk;
+ int size;
+
+ sk = udp_hash[result & (UDP_HTABLE_SIZE - 1)];
+ if (!sk) {
+ if (result > sysctl_local_port_range[1])
+ result = sysctl_local_port_range[0] +
+ ((result - sysctl_local_port_range[0]) &
+ (UDP_HTABLE_SIZE - 1));
+ goto gotit;
+ }
+ size = 0;
+ do {
+ if (++size >= best_size_so_far)
+ goto next;
+ } while ((sk = sk->next) != NULL);
+ best_size_so_far = size;
+ best = result;
+	next:;
+ }
+ result = best;
+ for(;; result += UDP_HTABLE_SIZE) {
+ if (result > sysctl_local_port_range[1])
+ result = sysctl_local_port_range[0]
+ + ((result - sysctl_local_port_range[0]) &
+ (UDP_HTABLE_SIZE - 1));
+ if (!udp_lport_inuse(result))
+ break;
+ }
+gotit:
+ udp_port_rover = snum = result;
+ } else {
+ struct sock *sk2;
+
+ for (sk2 = udp_hash[snum & (UDP_HTABLE_SIZE - 1)];
+ sk2 != NULL;
+ sk2 = sk2->next) {
+ if (sk2->num == snum &&
+ sk2 != sk &&
+ sk2->bound_dev_if == sk->bound_dev_if &&
+ (!sk2->rcv_saddr ||
+ !sk->rcv_saddr ||
+ sk2->rcv_saddr == sk->rcv_saddr) &&
+ (!sk2->reuse || !sk->reuse))
+ goto fail;
+ }
+ }
+ sk->num = snum;
+ SOCKHASH_UNLOCK();
+ return 0;
+
+fail:
+ SOCKHASH_UNLOCK();
+ return 1;
+}
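+
+/*
+ * A minimal sketch of the stride arithmetic used above (HTABLE, PORT_LO
+ * and PORT_HI are illustrative stand-ins for UDP_HTABLE_SIZE and
+ * sysctl_local_port_range[]): stepping by the hash table size keeps every
+ * candidate port in the same hash chain, and the wrap expression folds an
+ * overflowing candidate back into range without changing its chain,
+ * because (x & (HTABLE - 1)) == x % HTABLE for a power-of-two HTABLE.
+ *
+ *	#define HTABLE	128
+ *	#define PORT_LO	1024
+ *	#define PORT_HI	4999
+ *
+ *	static int next_candidate(int result)
+ *	{
+ *		result += HTABLE;
+ *		if (result > PORT_HI)
+ *			result = PORT_LO +
+ *				((result - PORT_LO) & (HTABLE - 1));
+ *		return result;
+ *	}
+ */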
+
+/* Last hit UDP socket cache, this is ipv4 specific so make it static. */
+static u32 uh_cache_saddr, uh_cache_daddr;
+static u16 uh_cache_dport, uh_cache_sport;
+static struct sock *uh_cache_sk = NULL;
+
+static void udp_v4_hash(struct sock *sk)
+{
+ struct sock **skp = &udp_hash[sk->num & (UDP_HTABLE_SIZE - 1)];
+
+ SOCKHASH_LOCK();
+ if ((sk->next = *skp) != NULL)
+ (*skp)->pprev = &sk->next;
+ *skp = sk;
+ sk->pprev = skp;
+ SOCKHASH_UNLOCK();
+}
+
+static void udp_v4_unhash(struct sock *sk)
+{
+ SOCKHASH_LOCK();
+ if (sk->pprev) {
+ if (sk->next)
+ sk->next->pprev = sk->pprev;
+ *sk->pprev = sk->next;
+ sk->pprev = NULL;
+ if(uh_cache_sk == sk)
+ uh_cache_sk = NULL;
+ }
+ SOCKHASH_UNLOCK();
+}
+
+/* UDP lookups are nearly always wildcards out the wazoo, so it makes no
+ * sense to try harder than this hash walk plus the last-hit cache. -DaveM
+ */
+struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
+{
+ struct sock *sk, *result = NULL;
+ unsigned short hnum = ntohs(dport);
+ int badness = -1;
+
+ for(sk = udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]; sk != NULL; sk = sk->next) {
+ if((sk->num == hnum) && !(sk->dead && (sk->state == TCP_CLOSE))) {
+ int score = 0;
+ if(sk->rcv_saddr) {
+ if(sk->rcv_saddr != daddr)
+ continue;
+ score++;
+ }
+ if(sk->daddr) {
+ if(sk->daddr != saddr)
+ continue;
+ score++;
+ }
+ if(sk->dport) {
+ if(sk->dport != sport)
+ continue;
+ score++;
+ }
+ if(sk->bound_dev_if) {
+ if(sk->bound_dev_if != dif)
+ continue;
+ score++;
+ }
+ if(score == 4) {
+ result = sk;
+ break;
+ } else if(score > badness) {
+ result = sk;
+ badness = score;
+ }
+ }
+ }
+ return result;
+}
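+
+/*
+ * A worked example of the scoring walk above (addresses illustrative):
+ * for a datagram 10.0.0.1:53 -> 10.0.0.2:1025 arriving on ifindex 2,
+ * matched against three sockets all bound to local port 1025:
+ *
+ *	A: nothing else bound                            -> score 0
+ *	B: rcv_saddr=10.0.0.2                            -> score 1
+ *	C: rcv_saddr=10.0.0.2, daddr=10.0.0.1, dport=53  -> score 3
+ *
+ * C wins as the most specific match. Only a socket with all four of
+ * rcv_saddr, daddr, dport and bound_dev_if matching reaches score 4
+ * and terminates the walk early.
+ */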
+
+__inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
+{
+ struct sock *sk;
+
+ if(!dif && uh_cache_sk &&
+ uh_cache_saddr == saddr &&
+ uh_cache_sport == sport &&
+ uh_cache_dport == dport &&
+ uh_cache_daddr == daddr)
+ return uh_cache_sk;
+
+ sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif);
+ if(!dif) {
+ uh_cache_sk = sk;
+ uh_cache_saddr = saddr;
+ uh_cache_daddr = daddr;
+ uh_cache_sport = sport;
+ uh_cache_dport = dport;
+ }
+ return sk;
+}
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+#define secondlist(hpnum, sk, fpass) \
+({ struct sock *s1; if(!(sk) && (fpass)--) \
+ s1 = udp_hash[(hpnum) & (UDP_HTABLE_SIZE - 1)]; \
+ else \
+ s1 = (sk); \
+ s1; \
+})
+
+#define udp_v4_proxy_loop_init(hnum, hpnum, sk, fpass) \
+ secondlist((hpnum), udp_hash[(hnum)&(UDP_HTABLE_SIZE-1)],(fpass))
+
+#define udp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \
+ secondlist((hpnum),(sk)->next,(fpass))
+
+static struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
+ unsigned short rnum, unsigned long laddr,
+ struct device *dev, unsigned short pnum,
+ int dif)
+{
+ struct sock *s, *result = NULL;
+ int badness = -1;
+ u32 paddr = 0;
+ unsigned short hnum = ntohs(num);
+ unsigned short hpnum = ntohs(pnum);
+ int firstpass = 1;
+
+ if(dev && dev->ip_ptr) {
+ struct in_device *idev = dev->ip_ptr;
+
+ if(idev->ifa_list)
+ paddr = idev->ifa_list->ifa_local;
+ }
+
+ SOCKHASH_LOCK();
+ for(s = udp_v4_proxy_loop_init(hnum, hpnum, s, firstpass);
+ s != NULL;
+ s = udp_v4_proxy_loop_next(hnum, hpnum, s, firstpass)) {
+ if(s->num == hnum || s->num == hpnum) {
+ int score = 0;
+ if(s->dead && (s->state == TCP_CLOSE))
+ continue;
+ if(s->rcv_saddr) {
+ if((s->num != hpnum || s->rcv_saddr != paddr) &&
+ (s->num != hnum || s->rcv_saddr != laddr))
+ continue;
+ score++;
+ }
+ if(s->daddr) {
+ if(s->daddr != raddr)
+ continue;
+ score++;
+ }
+ if(s->dport) {
+ if(s->dport != rnum)
+ continue;
+ score++;
+ }
+ if(s->bound_dev_if) {
+ if(s->bound_dev_if != dif)
+ continue;
+ score++;
+ }
+ if(score == 4 && s->num == hnum) {
+ result = s;
+ break;
+ } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
+ result = s;
+ badness = score;
+ }
+ }
+ }
+ SOCKHASH_UNLOCK();
+ return result;
+}
+
+#undef secondlist
+#undef udp_v4_proxy_loop_init
+#undef udp_v4_proxy_loop_next
+
+#endif
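+
+/*
+ * The secondlist trickery above lets a single for-loop walk two hash
+ * chains back to back: when the chain for the real port (hnum) runs
+ * out, the one-shot firstpass counter is consumed and the walk resumes
+ * at the head of the redirect-port (hpnum) chain. Structurally it is a
+ * fold of
+ *
+ *	for (s = chain(hnum);  s; s = s->next) score(s);
+ *	for (s = chain(hpnum); s; s = s->next) score(s);
+ *
+ * into one loop body, so the scoring code is not written twice.
+ */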
+
+static inline struct sock *udp_v4_mcast_next(struct sock *sk,
+ unsigned short num,
+ unsigned long raddr,
+ unsigned short rnum,
+ unsigned long laddr,
+ int dif)
+{
+ struct sock *s = sk;
+ unsigned short hnum = ntohs(num);
+ for(; s; s = s->next) {
+ if ((s->num != hnum) ||
+ (s->dead && (s->state == TCP_CLOSE)) ||
+ (s->daddr && s->daddr!=raddr) ||
+ (s->dport != rnum && s->dport != 0) ||
+ (s->rcv_saddr && s->rcv_saddr != laddr) ||
+ (s->bound_dev_if && s->bound_dev_if != dif))
+ continue;
+ break;
+ }
+ return s;
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition. If err < 0 then the socket should
+ * be closed and the error returned to the user. If err > 0
+ * it's just the icmp type << 8 | icmp code.
+ * Header points to the ip header of the error packet. We move
+ * on past this. Then (as it used to claim before adjustment)
+ * header points to the first 8 bytes of the udp header. We need
+ * to find the appropriate port.
+ */
+
+void udp_err(struct sk_buff *skb, unsigned char *dp, int len)
+{
+ struct iphdr *iph = (struct iphdr*)dp;
+ struct udphdr *uh = (struct udphdr*)(dp+(iph->ihl<<2));
+ int type = skb->h.icmph->type;
+ int code = skb->h.icmph->code;
+ struct sock *sk;
+ int harderr;
+ u32 info;
+ int err;
+
+ if (len < (iph->ihl<<2)+sizeof(struct udphdr)) {
+ icmp_statistics.IcmpInErrors++;
+ return;
+ }
+
+ sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex);
+ if (sk == NULL) {
+ icmp_statistics.IcmpInErrors++;
+ return; /* No socket for error */
+ }
+
+ err = 0;
+ info = 0;
+ harderr = 0;
+
+ switch (type) {
+ default:
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ case ICMP_SOURCE_QUENCH:
+ return;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ info = ntohl(skb->h.icmph->un.gateway)>>24;
+ harderr = 1;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+ if (sk->ip_pmtudisc != IP_PMTUDISC_DONT) {
+ err = EMSGSIZE;
+ info = ntohs(skb->h.icmph->un.frag.mtu);
+ harderr = 1;
+ break;
+ }
+ return;
+ }
+ err = EHOSTUNREACH;
+ if (code <= NR_ICMP_UNREACH) {
+ harderr = icmp_err_convert[code].fatal;
+ err = icmp_err_convert[code].errno;
+ }
+ break;
+ }
+
+	/*
+	 * Various people wanted BSD UDP semantics. Well, they've come
+	 * back out again because they slow down response to stuff like
+	 * dead or unreachable name servers and they screw terminal users
+	 * something chronic. Oh, and it violates RFC1122. So basically
+	 * fix your client code, people.
+	 */
+
+ /*
+ * RFC1122: OK. Passes ICMP errors back to application, as per
+ * 4.1.3.3. After the comment above, that should be no surprise.
+ */
+
+ if (!harderr && !sk->ip_recverr)
+ return;
+
+ /*
+ * 4.x BSD compatibility item. Break RFC1122 to
+ * get BSD socket semantics.
+ */
+ if(sk->bsdism && sk->state!=TCP_ESTABLISHED)
+ return;
+
+ if (sk->ip_recverr)
+ ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1));
+ sk->err = err;
+ sk->error_report(sk);
+}
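+
+/*
+ * A minimal sketch (not from this file) of how a caller would unpack a
+ * positive error of the form (icmp type << 8) | icmp code, as described
+ * in the comment above:
+ *
+ *	static void decode_icmp_err(int err, int *type, int *code)
+ *	{
+ *		*type = (err >> 8) & 0xff;	// e.g. ICMP_DEST_UNREACH
+ *		*code = err & 0xff;		// e.g. ICMP_PORT_UNREACH
+ *	}
+ */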
+
+
+static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base)
+{
+ return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base));
+}
+
+struct udpfakehdr
+{
+ struct udphdr uh;
+ u32 saddr;
+ u32 daddr;
+ struct iovec *iov;
+ u32 wcheck;
+};
+
+/*
+ * Copy and checksum a UDP packet from user space into a buffer. We still
+ * have to do the planning to get ip_build_xmit to spot direct transfers to
+ * the network card and to provide an additional callback mode for direct
+ * user->board I/O transfers. That one will be fun.
+ */
+
+static int udp_getfrag(const void *p, char * to, unsigned int offset, unsigned int fraglen)
+{
+ struct udpfakehdr *ufh = (struct udpfakehdr *)p;
+ if (offset==0) {
+ if (csum_partial_copy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset,
+ fraglen-sizeof(struct udphdr), &ufh->wcheck))
+ return -EFAULT;
+ ufh->wcheck = csum_partial((char *)ufh, sizeof(struct udphdr),
+ ufh->wcheck);
+ ufh->uh.check = csum_tcpudp_magic(ufh->saddr, ufh->daddr,
+ ntohs(ufh->uh.len),
+ IPPROTO_UDP, ufh->wcheck);
+ if (ufh->uh.check == 0)
+ ufh->uh.check = -1;
+ memcpy(to, ufh, sizeof(struct udphdr));
+ return 0;
+ }
+ if (csum_partial_copy_fromiovecend(to, ufh->iov, offset-sizeof(struct udphdr),
+ fraglen, &ufh->wcheck))
+ return -EFAULT;
+ return 0;
+}
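+
+/*
+ * The check == 0 special case above exists because in UDP a transmitted
+ * checksum of zero means "no checksum computed" (RFC 768), so a genuinely
+ * computed zero must be sent as its one's-complement twin, all ones
+ * (hence the -1). A minimal sketch of the 16-bit one's-complement fold
+ * that csum_fold-style helpers perform (illustrative, not the
+ * arch-specific implementation):
+ *
+ *	static unsigned short fold32(unsigned int sum)
+ *	{
+ *		sum = (sum & 0xffff) + (sum >> 16);	// fold the carries
+ *		sum = (sum & 0xffff) + (sum >> 16);	// and any new carry
+ *		return (unsigned short)~sum;
+ *	}
+ */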
+
+/*
+ * Unchecksummed UDP is sufficiently critical to stuff like ATM video
+ * conferencing that we use two routines for it, for speed. Probably we
+ * ought to have a CONFIG_FAST_NET option for >10Mb/second boards to
+ * activate this sort of coding. Timing is needed to verify whether this
+ * is a valid decision.
+ */
+
+static int udp_getfrag_nosum(const void *p, char * to, unsigned int offset, unsigned int fraglen)
+{
+ struct udpfakehdr *ufh = (struct udpfakehdr *)p;
+
+ if (offset==0) {
+ memcpy(to, ufh, sizeof(struct udphdr));
+ return memcpy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset,
+ fraglen-sizeof(struct udphdr));
+ }
+ return memcpy_fromiovecend(to, ufh->iov, offset-sizeof(struct udphdr),
+ fraglen);
+}
+
+int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
+{
+ int ulen = len + sizeof(struct udphdr);
+ struct ipcm_cookie ipc;
+ struct udpfakehdr ufh;
+ struct rtable *rt = NULL;
+ int free = 0;
+ int connected = 0;
+ u32 daddr;
+ u8 tos;
+ int err;
+
+	/* This check is ONLY to check for arithmetic overflow
+	   on integer(!) len. Not more! The real check will be made
+	   in ip_build_xmit --ANK
+
+	   BTW, socket.c -> af_*.c -> ... make multiple invalid
+	   conversions size_t -> int. We MUST repair them, e.g. by
+	   replacing all of them with size_t and revising all the
+	   places sort of len += sizeof(struct iphdr).
+	   If len were ULONG_MAX-10 it would be a catastrophe --ANK
+	 */
+
+ if (len < 0 || len > 0xFFFF)
+ return -EMSGSIZE;
+
+ /*
+ * Check the flags.
+ */
+
+ if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */
+ return -EOPNOTSUPP;
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT|MSG_PROXY|MSG_NOSIGNAL))
+ return -EINVAL;
+ if ((msg->msg_flags&MSG_PROXY) && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+#else
+ if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL))
+ return -EINVAL;
+#endif
+
+ /*
+ * Get and verify the address.
+ */
+
+ if (msg->msg_name) {
+ struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name;
+ if (msg->msg_namelen < sizeof(*usin))
+ return(-EINVAL);
+ if (usin->sin_family != AF_INET) {
+ static int complained;
+ if (!complained++)
+ printk(KERN_WARNING "%s forgot to set AF_INET in udp sendmsg. Fix it!\n", current->comm);
+ if (usin->sin_family)
+ return -EINVAL;
+ }
+ ufh.daddr = usin->sin_addr.s_addr;
+ ufh.uh.dest = usin->sin_port;
+ if (ufh.uh.dest == 0)
+ return -EINVAL;
+ } else {
+ if (sk->state != TCP_ESTABLISHED)
+ return -ENOTCONN;
+ ufh.daddr = sk->daddr;
+ ufh.uh.dest = sk->dport;
+		/* Open the fast path for a connected socket.
+		   The route will not be used if at least one option is set.
+		 */
+ connected = 1;
+ }
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ if (msg->msg_flags&MSG_PROXY) {
+ /*
+ * We map the first 8 bytes of a second sockaddr_in
+ * into the last 8 (unused) bytes of a sockaddr_in.
+ */
+ struct sockaddr_in *from = (struct sockaddr_in *)msg->msg_name;
+ from = (struct sockaddr_in *)&from->sin_zero;
+ if (from->sin_family != AF_INET)
+ return -EINVAL;
+ ipc.addr = from->sin_addr.s_addr;
+ ufh.uh.source = from->sin_port;
+ if (ipc.addr == 0)
+ ipc.addr = sk->saddr;
+ connected = 0;
+ } else
+#endif
+ {
+ ipc.addr = sk->saddr;
+ ufh.uh.source = sk->sport;
+ }
+
+ ipc.opt = NULL;
+ ipc.oif = sk->bound_dev_if;
+ if (msg->msg_controllen) {
+ err = ip_cmsg_send(msg, &ipc);
+ if (err)
+ return err;
+ if (ipc.opt)
+ free = 1;
+ connected = 0;
+ }
+ if (!ipc.opt)
+ ipc.opt = sk->opt;
+
+ ufh.saddr = ipc.addr;
+ ipc.addr = daddr = ufh.daddr;
+
+ if (ipc.opt && ipc.opt->srr) {
+ if (!daddr)
+ return -EINVAL;
+ daddr = ipc.opt->faddr;
+ connected = 0;
+ }
+ tos = RT_TOS(sk->ip_tos);
+ if (sk->localroute || (msg->msg_flags&MSG_DONTROUTE) ||
+ (ipc.opt && ipc.opt->is_strictroute)) {
+ tos |= RTO_ONLINK;
+ connected = 0;
+ }
+
+ if (MULTICAST(daddr)) {
+ if (!ipc.oif)
+ ipc.oif = sk->ip_mc_index;
+ if (!ufh.saddr)
+ ufh.saddr = sk->ip_mc_addr;
+ connected = 0;
+ }
+
+ if (connected && sk->dst_cache) {
+ rt = (struct rtable*)sk->dst_cache;
+ if (rt->u.dst.obsolete) {
+ sk->dst_cache = NULL;
+ dst_release(&rt->u.dst);
+ rt = NULL;
+ } else
+ dst_clone(&rt->u.dst);
+ }
+
+ if (rt == NULL) {
+ err = ip_route_output(&rt, daddr, ufh.saddr,
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ (msg->msg_flags&MSG_PROXY ? RTO_TPROXY : 0) |
+#endif
+ tos, ipc.oif);
+ if (err)
+ goto out;
+
+ err = -EACCES;
+ if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast)
+ goto out;
+ if (connected && sk->dst_cache == NULL)
+ sk->dst_cache = dst_clone(&rt->u.dst);
+ }
+
+ ufh.saddr = rt->rt_src;
+ if (!ipc.addr)
+ ufh.daddr = ipc.addr = rt->rt_dst;
+ ufh.uh.len = htons(ulen);
+ ufh.uh.check = 0;
+ ufh.iov = msg->msg_iov;
+ ufh.wcheck = 0;
+
+ /* RFC1122: OK. Provides the checksumming facility (MUST) as per */
+ /* 4.1.3.4. It's configurable by the application via setsockopt() */
+ /* (MAY) and it defaults to on (MUST). */
+
+ err = ip_build_xmit(sk,sk->no_check ? udp_getfrag_nosum : udp_getfrag,
+ &ufh, ulen, &ipc, rt, msg->msg_flags);
+
+out:
+ ip_rt_put(rt);
+ if (free)
+ kfree(ipc.opt);
+ if (!err) {
+ udp_statistics.UdpOutDatagrams++;
+ return len;
+ }
+ return err;
+}
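+
+/*
+ * A minimal user-space sketch of the unconnected path above (fd, address
+ * and port are illustrative): sendto() supplies msg_name, and errors such
+ * as the EMSGSIZE overflow check surface as -1 with errno set.
+ *
+ *	#include <string.h>
+ *	#include <sys/socket.h>
+ *	#include <arpa/inet.h>
+ *
+ *	struct sockaddr_in dst;
+ *	memset(&dst, 0, sizeof(dst));
+ *	dst.sin_family = AF_INET;	// leaving this 0 triggers the
+ *					// printk warning above
+ *	dst.sin_port = htons(9);	// discard service
+ *	dst.sin_addr.s_addr = inet_addr("192.0.2.1");
+ *	sendto(fd, buf, buflen, 0, (struct sockaddr *)&dst, sizeof(dst));
+ */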
+
+/*
+ * IOCTL requests applicable to the UDP protocol
+ */
+
+int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ switch(cmd)
+ {
+ case TIOCOUTQ:
+ {
+ unsigned long amount;
+
+ amount = sock_wspace(sk);
+ return put_user(amount, (int *)arg);
+ }
+
+ case TIOCINQ:
+ {
+ struct sk_buff *skb;
+ unsigned long amount;
+
+ amount = 0;
+ /* N.B. Is this interrupt safe??
+ -> Yes. Interrupts do not remove skbs. --ANK (980725)
+ */
+ skb = skb_peek(&sk->receive_queue);
+ if (skb != NULL) {
+ /*
+ * We will only return the amount
+ * of this packet since that is all
+ * that will be read.
+ */
+ amount = skb->len - sizeof(struct udphdr);
+ }
+ return put_user(amount, (int *)arg);
+ }
+
+ default:
+ return(-ENOIOCTLCMD);
+ }
+ return(0);
+}
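+
+/*
+ * A minimal user-space sketch of the TIOCINQ semantics implemented above
+ * (fd is illustrative): the value reported is the payload size of the
+ * next datagram only, not the total number of bytes queued on the socket.
+ *
+ *	#include <sys/ioctl.h>
+ *	#include <stdio.h>
+ *
+ *	int pending = 0;
+ *	if (ioctl(fd, TIOCINQ, &pending) == 0)
+ *		printf("next datagram carries %d payload bytes\n", pending);
+ */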
+
+#ifndef HAVE_CSUM_COPY_USER
+#undef CONFIG_UDP_DELAY_CSUM
+#endif
+
+/*
+ * This should be easy: if there is something there, we
+ * return it; otherwise we block.
+ */
+
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len,
+ int noblock, int flags, int *addr_len)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+ struct sk_buff *skb;
+ int copied, err;
+
+ if (flags & MSG_ERRQUEUE)
+ return ip_recv_error(sk, msg, len);
+
+	/*
+	 * From here the generic datagram code does a lot of the work.
+	 * Come the finished NET3, it will do _ALL_ the work!
+	 */
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ goto out;
+
+ copied = skb->len - sizeof(struct udphdr);
+ if (copied > len) {
+ copied = len;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+#ifndef CONFIG_UDP_DELAY_CSUM
+ err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
+ copied);
+#else
+ if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
+ err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
+ copied);
+ } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) {
+ if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum)))
+ goto csum_copy_err;
+ err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
+ copied);
+ } else {
+ unsigned int csum;
+
+ err = 0;
+ csum = csum_partial(skb->h.raw, sizeof(struct udphdr), skb->csum);
+ csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base,
+ copied, csum, &err);
+ if (err)
+ goto out_free;
+ if ((unsigned short)csum_fold(csum))
+ goto csum_copy_err;
+ }
+#endif
+ if (err)
+ goto out_free;
+ sk->stamp=skb->stamp;
+
+ /* Copy the address. */
+ if (sin)
+ {
+ /*
+ * Check any passed addresses
+ */
+ if (addr_len)
+ *addr_len=sizeof(*sin);
+
+ sin->sin_family = AF_INET;
+ sin->sin_port = skb->h.uh->source;
+ sin->sin_addr.s_addr = skb->nh.iph->saddr;
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ if (flags&MSG_PROXY)
+ {
+ /*
+ * We map the first 8 bytes of a second sockaddr_in
+ * into the last 8 (unused) bytes of a sockaddr_in.
+ * This _is_ ugly, but it's the only way to do it
+ * easily, without adding system calls.
+ */
+ struct sockaddr_in *sinto =
+ (struct sockaddr_in *) sin->sin_zero;
+
+ sinto->sin_family = AF_INET;
+ sinto->sin_port = skb->h.uh->dest;
+ sinto->sin_addr.s_addr = skb->nh.iph->daddr;
+ }
+#endif
+ }
+ if (sk->ip_cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+ err = copied;
+
+out_free:
+ skb_free_datagram(sk, skb);
+out:
+ return err;
+
+#ifdef CONFIG_UDP_DELAY_CSUM
+csum_copy_err:
+ udp_statistics.UdpInErrors++;
+ skb_free_datagram(sk, skb);
+
+	/*
+	 * The error for the blocking case is chosen to masquerade
+	 * as some normal condition.
+	 */
+ return (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
+#endif
+}
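+
+/*
+ * The CONFIG_UDP_DELAY_CSUM branches above defer checksum verification
+ * to the copy to user space so the payload is traversed only once where
+ * possible. In outline:
+ *
+ *	if (hardware already verified it)          plain copy;
+ *	else if (truncated, or scattered iovecs)   verify, then copy;
+ *	else                                       copy and verify in one pass;
+ *
+ * with the single-pass case using csum_and_copy_to_user on the first
+ * (and only) iovec.
+ */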
+
+int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
+ struct rtable *rt;
+ int err;
+
+
+ if (addr_len < sizeof(*usin))
+ return(-EINVAL);
+
+ /*
+ * 1003.1g - break association.
+ */
+
+ if (usin->sin_family==AF_UNSPEC)
+ {
+ sk->saddr=INADDR_ANY;
+ sk->rcv_saddr=INADDR_ANY;
+ sk->daddr=INADDR_ANY;
+ sk->state = TCP_CLOSE;
+ if(uh_cache_sk == sk)
+ uh_cache_sk = NULL;
+ return 0;
+ }
+
+ if (usin->sin_family && usin->sin_family != AF_INET)
+ return(-EAFNOSUPPORT);
+
+ dst_release(xchg(&sk->dst_cache, NULL));
+
+ err = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr,
+ sk->ip_tos|sk->localroute, sk->bound_dev_if);
+ if (err)
+ return err;
+ if ((rt->rt_flags&RTCF_BROADCAST) && !sk->broadcast) {
+ ip_rt_put(rt);
+ return -EACCES;
+ }
+ if(!sk->saddr)
+ sk->saddr = rt->rt_src; /* Update source address */
+ if(!sk->rcv_saddr)
+ sk->rcv_saddr = rt->rt_src;
+ sk->daddr = rt->rt_dst;
+ sk->dport = usin->sin_port;
+ sk->state = TCP_ESTABLISHED;
+
+ if(uh_cache_sk == sk)
+ uh_cache_sk = NULL;
+
+ sk->dst_cache = &rt->u.dst;
+ return(0);
+}
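+
+/*
+ * A minimal user-space sketch of the AF_UNSPEC case handled above: per
+ * 1003.1g, connecting to an AF_UNSPEC address dissolves the association
+ * and returns the socket to the unconnected state (fd is illustrative).
+ *
+ *	#include <string.h>
+ *	#include <sys/socket.h>
+ *
+ *	struct sockaddr sa;
+ *	memset(&sa, 0, sizeof(sa));
+ *	sa.sa_family = AF_UNSPEC;
+ *	connect(fd, &sa, sizeof(sa));	// break the UDP association
+ */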
+
+
+static void udp_close(struct sock *sk, long timeout)
+{
+	/* For an explanation, see raw_close in ipv4/raw.c */
+ sk->state = TCP_CLOSE;
+ udp_v4_unhash(sk);
+ sk->dead = 1;
+ destroy_sock(sk);
+}
+
+static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
+{
+ /*
+ * Charge it to the socket, dropping if the queue is full.
+ */
+
+#if defined(CONFIG_FILTER) && defined(CONFIG_UDP_DELAY_CSUM)
+ if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {
+ if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) {
+ udp_statistics.UdpInErrors++;
+ ip_statistics.IpInDiscards++;
+ ip_statistics.IpInDelivers--;
+ kfree_skb(skb);
+ return -1;
+ }
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+#endif
+
+ if (sock_queue_rcv_skb(sk,skb)<0) {
+ udp_statistics.UdpInErrors++;
+ ip_statistics.IpInDiscards++;
+ ip_statistics.IpInDelivers--;
+ kfree_skb(skb);
+ return -1;
+ }
+ udp_statistics.UdpInDatagrams++;
+ return 0;
+}
+
+
+static inline void udp_deliver(struct sock *sk, struct sk_buff *skb)
+{
+ udp_queue_rcv_skb(sk, skb);
+}
+
+/*
+ * Multicasts and broadcasts go to each listener.
+ *
+ * Note: called only from the BH handler context,
+ * so we don't need to lock the hashes.
+ */
+static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
+ u32 saddr, u32 daddr)
+{
+ struct sock *sk;
+ int dif;
+
+ sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)];
+ dif = skb->dev->ifindex;
+ sk = udp_v4_mcast_next(sk, uh->dest, saddr, uh->source, daddr, dif);
+ if (sk) {
+ struct sock *sknext = NULL;
+
+ do {
+ struct sk_buff *skb1 = skb;
+
+ sknext = udp_v4_mcast_next(sk->next, uh->dest, saddr,
+ uh->source, daddr, dif);
+ if(sknext)
+ skb1 = skb_clone(skb, GFP_ATOMIC);
+
+ if(skb1)
+ udp_deliver(sk, skb1);
+ sk = sknext;
+ } while(sknext);
+ } else
+ kfree_skb(skb);
+ return 0;
+}
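+
+/*
+ * The loop above is the usual "clone for all but the last receiver"
+ * pattern: while another matching socket is known to follow, a clone of
+ * the skb is delivered and the original is kept back, so the final
+ * listener consumes the original and exactly one buffer is used per
+ * listener. A structural sketch (types elided):
+ *
+ *	for (cur = first; cur; cur = next) {
+ *		next = find_next_match(cur);
+ *		buf = next ? clone(orig) : orig;	// last one gets orig
+ *		if (buf)
+ *			deliver(cur, buf);
+ *	}
+ */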
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+/*
+ * Check whether a received UDP packet might be for one of our
+ * sockets.
+ */
+
+int udp_chkaddr(struct sk_buff *skb)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct udphdr *uh = (struct udphdr *)(skb->nh.raw + iph->ihl*4);
+ struct sock *sk;
+
+ sk = udp_v4_lookup(iph->saddr, uh->source, iph->daddr, uh->dest, skb->dev->ifindex);
+ if (!sk)
+ return 0;
+
+ /* 0 means accept all LOCAL addresses here, not all the world... */
+ if (sk->rcv_saddr == 0)
+ return 0;
+
+ return 1;
+}
+#endif
+
+/*
+ * All we need to do is get the socket, and then do a checksum.
+ */
+
+int udp_rcv(struct sk_buff *skb, unsigned short len)
+{
+ struct sock *sk;
+ struct udphdr *uh;
+ unsigned short ulen;
+ struct rtable *rt = (struct rtable*)skb->dst;
+ u32 saddr = skb->nh.iph->saddr;
+ u32 daddr = skb->nh.iph->daddr;
+
+	/*
+	 *	First time through the loop: do all the setup stuff,
+	 *	including finding out which socket the packet goes to, etc.
+	 */
+
+ /*
+ * Get the header.
+ */
+
+ uh = skb->h.uh;
+ __skb_pull(skb, skb->h.raw - skb->data);
+
+ ip_statistics.IpInDelivers++;
+
+ /*
+ * Validate the packet and the UDP length.
+ */
+
+ ulen = ntohs(uh->len);
+
+ if (ulen > len || ulen < sizeof(*uh)) {
+ NETDEBUG(printk(KERN_DEBUG "UDP: short packet: %d/%d\n", ulen, len));
+ udp_statistics.UdpInErrors++;
+ kfree_skb(skb);
+ return(0);
+ }
+ skb_trim(skb, ulen);
+
+#ifndef CONFIG_UDP_DELAY_CSUM
+ if (uh->check &&
+ (((skb->ip_summed==CHECKSUM_HW)&&udp_check(uh,ulen,saddr,daddr,skb->csum)) ||
+ ((skb->ip_summed==CHECKSUM_NONE) &&
+ (udp_check(uh,ulen,saddr,daddr, csum_partial((char*)uh, ulen, 0))))))
+ goto csum_error;
+#else
+ if (uh->check==0)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ else if (skb->ip_summed==CHECKSUM_HW) {
+ if (udp_check(uh,ulen,saddr,daddr,skb->csum))
+ goto csum_error;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ } else if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+#endif
+
+ if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
+ return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
+
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ if (IPCB(skb)->redirport)
+ sk = udp_v4_proxy_lookup(uh->dest, saddr, uh->source,
+ daddr, skb->dev, IPCB(skb)->redirport,
+ skb->dev->ifindex);
+ else
+#endif
+ sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
+
+ if (sk == NULL) {
+#ifdef CONFIG_UDP_DELAY_CSUM
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
+ (unsigned short)csum_fold(csum_partial((char*)uh, ulen, skb->csum)))
+ goto csum_error;
+#endif
+ udp_statistics.UdpNoPorts++;
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+		/*
+		 * Hmm. We got a UDP broadcast to a port on which we
+		 * don't want to listen. Ignore it.
+		 */
+ kfree_skb(skb);
+ return(0);
+ }
+ udp_deliver(sk, skb);
+ return 0;
+
+csum_error:
+ /*
+ * RFC1122: OK. Discards the bad packet silently (as far as
+ * the network is concerned, anyway) as per 4.1.3.4 (MUST).
+ */
+ NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
+ NIPQUAD(saddr),
+ ntohs(uh->source),
+ NIPQUAD(daddr),
+ ntohs(uh->dest),
+ ulen));
+ udp_statistics.UdpInErrors++;
+ kfree_skb(skb);
+ return(0);
+}
+
+struct proto udp_prot = {
+ (struct sock *)&udp_prot, /* sklist_next */
+ (struct sock *)&udp_prot, /* sklist_prev */
+ udp_close, /* close */
+ udp_connect, /* connect */
+ NULL, /* accept */
+ NULL, /* retransmit */
+ NULL, /* write_wakeup */
+ NULL, /* read_wakeup */
+ datagram_poll, /* poll */
+ udp_ioctl, /* ioctl */
+ NULL, /* init */
+ NULL, /* destroy */
+ NULL, /* shutdown */
+ ip_setsockopt, /* setsockopt */
+ ip_getsockopt, /* getsockopt */
+ udp_sendmsg, /* sendmsg */
+ udp_recvmsg, /* recvmsg */
+ NULL, /* bind */
+ udp_queue_rcv_skb, /* backlog_rcv */
+ udp_v4_hash, /* hash */
+ udp_v4_unhash, /* unhash */
+ udp_v4_get_port, /* good_socknum */
+ 128, /* max_header */
+ 0, /* retransmits */
+ "UDP", /* name */
+ 0, /* inuse */
+ 0 /* highestinuse */
+};
diff --git a/pfinet/linux-src/net/ipv4/utils.c b/pfinet/linux-src/net/ipv4/utils.c
new file mode 100644
index 00000000..ce74ade2
--- /dev/null
+++ b/pfinet/linux-src/net/ipv4/utils.c
@@ -0,0 +1,91 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Various kernel-resident INET utility functions; mainly
+ * for format conversion and debugging output.
+ *
+ * Version: $Id: utils.c,v 1.6 1997/12/13 21:53:03 kuznet Exp $
+ *
+ * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ * Alan Cox : verify_area check.
+ * Alan Cox : removed old debugging.
+ * Andi Kleen : add net_ratelimit()
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/stat.h>
+#include <stdarg.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <linux/skbuff.h>
+
+
+/*
+ * Display an IP address in readable format.
+ */
+
+char *in_ntoa(__u32 in)
+{
+ static char buff[18];
+ char *p;
+
+ p = (char *) &in;
+ sprintf(buff, "%d.%d.%d.%d",
+ (p[0] & 255), (p[1] & 255), (p[2] & 255), (p[3] & 255));
+ return(buff);
+}
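+
+/*
+ * Usage caution, worth spelling out: the result lives in a single static
+ * buffer, so a second call overwrites the first and concurrent callers
+ * race. For example,
+ *
+ *	printk("%s -> %s\n", in_ntoa(a), in_ntoa(b));
+ *
+ * prints the same address twice; copy the string out before calling
+ * in_ntoa again.
+ */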
+
+
+/*
+ * Convert an ASCII string to binary IP.
+ */
+
+__u32 in_aton(const char *str)
+{
+ unsigned long l;
+ unsigned int val;
+ int i;
+
+ l = 0;
+ for (i = 0; i < 4; i++)
+ {
+ l <<= 8;
+ if (*str != '\0')
+ {
+ val = 0;
+ while (*str != '\0' && *str != '.')
+ {
+ val *= 10;
+ val += *str - '0';
+ str++;
+ }
+ l |= val;
+ if (*str != '\0')
+ str++;
+ }
+ }
+ return(htonl(l));
+}
+
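+/*
+ * A minimal usage sketch: in_aton parses dotted-quad notation and
+ * returns the address in network byte order, e.g.
+ *
+ *	__u32 a = in_aton("192.168.0.1");	// == htonl(0xC0A80001)
+ *
+ * Unlike user-space inet_aton(3), nothing here rejects malformed input;
+ * a short string simply leaves the trailing octets zero.
+ */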