From 9fd51e9b0ad33a89a83fdbbb66bd20d85f7893fb Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 4 Feb 2000 03:21:18 +0000 Subject: Import of Linux 2.2.12 subset (ipv4 stack and related) --- pfinet/linux-src/net/core/Makefile | 41 + pfinet/linux-src/net/core/datagram.c | 249 ++++ pfinet/linux-src/net/core/dev.c | 2026 +++++++++++++++++++++++++++ pfinet/linux-src/net/core/dev_mcast.c | 252 ++++ pfinet/linux-src/net/core/dst.c | 145 ++ pfinet/linux-src/net/core/filter.c | 454 ++++++ pfinet/linux-src/net/core/firewall.c | 160 +++ pfinet/linux-src/net/core/iovec.c | 278 ++++ pfinet/linux-src/net/core/neighbour.c | 1394 ++++++++++++++++++ pfinet/linux-src/net/core/profile.c | 305 ++++ pfinet/linux-src/net/core/rtnetlink.c | 512 +++++++ pfinet/linux-src/net/core/scm.c | 280 ++++ pfinet/linux-src/net/core/skbuff.c | 385 +++++ pfinet/linux-src/net/core/sock.c | 1051 ++++++++++++++ pfinet/linux-src/net/core/sysctl_net_core.c | 61 + pfinet/linux-src/net/core/utils.c | 66 + 16 files changed, 7659 insertions(+) create mode 100644 pfinet/linux-src/net/core/Makefile create mode 100644 pfinet/linux-src/net/core/datagram.c create mode 100644 pfinet/linux-src/net/core/dev.c create mode 100644 pfinet/linux-src/net/core/dev_mcast.c create mode 100644 pfinet/linux-src/net/core/dst.c create mode 100644 pfinet/linux-src/net/core/filter.c create mode 100644 pfinet/linux-src/net/core/firewall.c create mode 100644 pfinet/linux-src/net/core/iovec.c create mode 100644 pfinet/linux-src/net/core/neighbour.c create mode 100644 pfinet/linux-src/net/core/profile.c create mode 100644 pfinet/linux-src/net/core/rtnetlink.c create mode 100644 pfinet/linux-src/net/core/scm.c create mode 100644 pfinet/linux-src/net/core/skbuff.c create mode 100644 pfinet/linux-src/net/core/sock.c create mode 100644 pfinet/linux-src/net/core/sysctl_net_core.c create mode 100644 pfinet/linux-src/net/core/utils.c (limited to 'pfinet/linux-src/net/core') diff --git a/pfinet/linux-src/net/core/Makefile b/pfinet/linux-src/net/core/Makefile new file mode 100644 index 00000000..5df65cd2 --- /dev/null +++ b/pfinet/linux-src/net/core/Makefile @@ -0,0 +1,41 @@ +# +# Makefile for the Linux networking core. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +O_TARGET := core.o + +O_OBJS := sock.o skbuff.o iovec.o datagram.o scm.o + +ifeq ($(CONFIG_SYSCTL),y) +ifeq ($(CONFIG_NET),y) +O_OBJS += sysctl_net_core.o +endif +endif + +ifdef CONFIG_FILTER +O_OBJS += filter.o +endif + +ifdef CONFIG_NET + +O_OBJS += dev.o dev_mcast.o dst.o neighbour.o rtnetlink.o utils.o + +ifdef CONFIG_FIREWALL +OX_OBJS += firewall.o +endif + +endif + +ifdef CONFIG_NET_PROFILE +OX_OBJS += profile.o +endif + +include $(TOPDIR)/Rules.make + +tar: + tar -cvf /dev/f1 . diff --git a/pfinet/linux-src/net/core/datagram.c b/pfinet/linux-src/net/core/datagram.c new file mode 100644 index 00000000..9bb68fa4 --- /dev/null +++ b/pfinet/linux-src/net/core/datagram.c @@ -0,0 +1,249 @@ +/* + * SUCS NET3: + * + * Generic datagram handling routines. These are generic for all protocols. Possibly a generic IP version on top + * of these would make sense. Not tonight however 8-). + * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and NetROM layer all have identical poll code and mostly + * identical recvmsg() code. So we share it here. 
The poll was shared before but buried in udp.c so I moved it. + * + * Authors: Alan Cox . (datagram_poll() from old udp.c code) + * + * Fixes: + * Alan Cox : NULL return from skb_peek_copy() understood + * Alan Cox : Rewrote skb_read_datagram to avoid the skb_peek_copy stuff. + * Alan Cox : Added support for SOCK_SEQPACKET. IPX can no longer use the SO_TYPE hack but + * AX.25 now works right, and SPX is feasible. + * Alan Cox : Fixed write poll of non IP protocol crash. + * Florian La Roche: Changed for my new skbuff handling. + * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET. + * Linus Torvalds : BSD semantic fixes. + * Alan Cox : Datagram iovec handling + * Darryl Miles : Fixed non-blocking SOCK_STREAM. + * Alan Cox : POSIXisms + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + + +/* + * Wait for a packet.. + * + * Interrupts off so that no packet arrives before we begin sleeping. + * Otherwise we might miss our wake up + */ + +static inline void wait_for_packet(struct sock * sk) +{ + struct wait_queue wait = { current, NULL }; + + add_wait_queue(sk->sleep, &wait); + current->state = TASK_INTERRUPTIBLE; + + if (skb_peek(&sk->receive_queue) == NULL) + schedule(); + + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); +} + +/* + * Is a socket 'connection oriented' ? + */ + +static inline int connection_based(struct sock *sk) +{ + return (sk->type==SOCK_SEQPACKET || sk->type==SOCK_STREAM); +} + +/* + * Get a datagram skbuff, understands the peeking, nonblocking wakeups and possible + * races. This replaces identical code in packet,raw and udp, as well as the IPX + * AX.25 and Appletalk. It also finally fixes the long standing peek and read + * race for datagram sockets. If you alter this routine remember it must be + * re-entrant. + * + * This function will lock the socket if a skb is returned, so the caller + * needs to unlock the socket in that case (usually by calling skb_free_datagram) + * + * * It does not lock socket since today. This function is + * * free of race conditions. This measure should/can improve + * * significantly datagram socket latencies at high loads, + * * when data copying to user space takes lots of time. + * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet + * * 8) Great win.) + * * --ANK (980729) + * + * The order of the tests when we find no data waiting are specified + * quite explicitly by POSIX 1003.1g, don't change them without having + * the standard around please. + */ + +struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err) +{ + int error; + struct sk_buff *skb; + + /* Caller is allowed not to check sk->err before skb_recv_datagram() */ + error = sock_error(sk); + if (error) + goto no_packet; + +restart: + while(skb_queue_empty(&sk->receive_queue)) /* No data */ + { + /* Socket errors? */ + error = sock_error(sk); + if (error) + goto no_packet; + + /* Socket shut down? */ + if (sk->shutdown & RCV_SHUTDOWN) + goto no_packet; + + /* Sequenced packets can come disconnected. 
If so we report the problem */ + error = -ENOTCONN; + if(connection_based(sk) && sk->state!=TCP_ESTABLISHED) + goto no_packet; + + /* handle signals */ + error = -ERESTARTSYS; + if (signal_pending(current)) + goto no_packet; + + /* User doesn't want to wait */ + error = -EAGAIN; + if (noblock) + goto no_packet; + + wait_for_packet(sk); + } + + /* Again only user level code calls this function, so nothing interrupt level + will suddenly eat the receive_queue */ + if (flags & MSG_PEEK) + { + unsigned long cpu_flags; + + /* It is the only POTENTIAL race condition + in this function. skb may be stolen by + another receiver after peek, but before + incrementing use count, provided kernel + is reentearble (it is not) or this function + is called by interrupts. + + Protect it with global skb spinlock, + though for now even this is overkill. + --ANK (980728) + */ + spin_lock_irqsave(&skb_queue_lock, cpu_flags); + skb = skb_peek(&sk->receive_queue); + if(skb!=NULL) + atomic_inc(&skb->users); + spin_unlock_irqrestore(&skb_queue_lock, cpu_flags); + } else + skb = skb_dequeue(&sk->receive_queue); + + if (!skb) /* Avoid race if someone beats us to the data */ + goto restart; + return skb; + +no_packet: + *err = error; + return NULL; +} + +void skb_free_datagram(struct sock * sk, struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/* + * Copy a datagram to a linear buffer. + */ + +int skb_copy_datagram(struct sk_buff *skb, int offset, char *to, int size) +{ + int err = -EFAULT; + + if (!copy_to_user(to, skb->h.raw + offset, size)) + err = 0; + return err; +} + + +/* + * Copy a datagram to an iovec. + * Note: the iovec is modified during the copy. + */ + +int skb_copy_datagram_iovec(struct sk_buff *skb, int offset, struct iovec *to, + int size) +{ + return memcpy_toiovec(to, skb->h.raw + offset, size); +} + +/* + * Datagram poll: Again totally generic. This also handles + * sequenced packet sockets providing the socket receive queue + * is only ever holding data ready to receive. + * + * Note: when you _don't_ use this routine for this protocol, + * and you use a different write policy from sock_writeable() + * then please supply your own write_space callback. + */ + +unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask; + + poll_wait(file, sk->sleep, wait); + mask = 0; + + /* exceptional events? */ + if (sk->err || !skb_queue_empty(&sk->error_queue)) + mask |= POLLERR; + if (sk->shutdown & RCV_SHUTDOWN) + mask |= POLLHUP; + + /* readable? */ + if (!skb_queue_empty(&sk->receive_queue)) + mask |= POLLIN | POLLRDNORM; + + /* Connection-based need to check for termination and startup */ + if (connection_based(sk)) { + if (sk->state==TCP_CLOSE) + mask |= POLLHUP; + /* connection hasn't started yet? */ + if (sk->state == TCP_SYN_SENT) + return mask; + } + + /* writable? */ + if (sock_writeable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + else + sk->socket->flags |= SO_NOSPACE; + + return mask; +} diff --git a/pfinet/linux-src/net/core/dev.c b/pfinet/linux-src/net/core/dev.c new file mode 100644 index 00000000..cc9584a1 --- /dev/null +++ b/pfinet/linux-src/net/core/dev.c @@ -0,0 +1,2026 @@ +/* + * NET3 Protocol independent device support routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Derived from the non IP parts of dev.c 1.0.19 + * Authors: Ross Biro, + * Fred N. van Kempen, + * Mark Evans, + * + * Additional Authors: + * Florian la Roche + * Alan Cox + * David Hinds + * Alexey Kuznetsov + * Adam Sulmicki + * + * Changes: + * Marcelo Tosatti : dont accept mtu 0 or < + * Alan Cox : device private ioctl copies fields back. + * Alan Cox : Transmit queue code does relevant stunts to + * keep the queue safe. + * Alan Cox : Fixed double lock. + * Alan Cox : Fixed promisc NULL pointer trap + * ???????? : Support the full private ioctl range + * Alan Cox : Moved ioctl permission check into drivers + * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI + * Alan Cox : 100 backlog just doesn't cut it when + * you start doing multicast video 8) + * Alan Cox : Rewrote net_bh and list manager. + * Alan Cox : Fix ETH_P_ALL echoback lengths. + * Alan Cox : Took out transmit every packet pass + * Saved a few bytes in the ioctl handler + * Alan Cox : Network driver sets packet type before calling netif_rx. Saves + * a function call a packet. + * Alan Cox : Hashed net_bh() + * Richard Kooijman: Timestamp fixes. + * Alan Cox : Wrong field in SIOCGIFDSTADDR + * Alan Cox : Device lock protection. + * Alan Cox : Fixed nasty side effect of device close changes. + * Rudi Cilibrasi : Pass the right thing to set_mac_address() + * Dave Miller : 32bit quantity for the device lock to make it work out + * on a Sparc. + * Bjorn Ekwall : Added KERNELD hack. + * Alan Cox : Cleaned up the backlog initialise. + * Craig Metz : SIOCGIFCONF fix if space for under + * 1 device. + * Thomas Bogendoerfer : Return ENODEV for dev_open, if there + * is no device open function. + * Andi Kleen : Fix error reporting for SIOCGIFCONF + * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF + * Cyrus Durgin : Cleaned for KMOD + * Adam Sulmicki : Bug Fix : Network Device Unload + * A network device unload needs to purge + * the backlog queue. + * Paul Rusty Russel : SIOCSIFNAME + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_NET_RADIO +#include +#endif /* CONFIG_NET_RADIO */ +#ifdef CONFIG_PLIP +extern int plip_init(void); +#endif + +NET_PROFILE_DEFINE(dev_queue_xmit) +NET_PROFILE_DEFINE(net_bh) +NET_PROFILE_DEFINE(net_bh_skb) + + +const char *if_port_text[] = { + "unknown", + "BNC", + "10baseT", + "AUI", + "100baseT", + "100baseTX", + "100baseFX" +}; + +/* + * The list of packet types we will receive (as opposed to discard) + * and the routines to invoke. + * + * Why 16. Because with 16 the only overlap we get on a hash of the + * low nibble of the protocol value is RARP/SNAP/X.25. + * + * 0800 IP + * 0001 802.3 + * 0002 AX.25 + * 0004 802.2 + * 8035 RARP + * 0005 SNAP + * 0805 X.25 + * 0806 ARP + * 8137 IPX + * 0009 Localtalk + * 86DD IPv6 + */ + +struct packet_type *ptype_base[16]; /* 16 way hashed list */ +struct packet_type *ptype_all = NULL; /* Taps */ + +/* + * Device list lock. Setting it provides that interface + * will not disappear unexpectedly while kernel sleeps. + */ + +atomic_t dev_lockct = ATOMIC_INIT(0); + +/* + * Our notifier list + */ + +static struct notifier_block *netdev_chain=NULL; + +/* + * Device drivers call our routines to queue packets here. We empty the + * queue in the bottom half handler. 
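+ *	(netif_rx() below appends incoming skbs to this queue and marks NET_BH;
+ *	net_bh() then drains it and hands each packet to its protocol handlers.)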
+ */ + +static struct sk_buff_head backlog; + +#ifdef CONFIG_NET_FASTROUTE +int netdev_fastroute; +int netdev_fastroute_obstacles; +struct net_fastroute_stats dev_fastroute_stat; +#endif + +static void dev_clear_backlog(struct device *dev); + + +/****************************************************************************************** + + Protocol management and registration routines + +*******************************************************************************************/ + +/* + * For efficiency + */ + +int netdev_nit=0; + +/* + * Add a protocol ID to the list. Now that the input handler is + * smarter we can dispense with all the messy stuff that used to be + * here. + * + * BEWARE!!! Protocol handlers, mangling input packets, + * MUST BE last in hash buckets and checking protocol handlers + * MUST start from promiscous ptype_all chain in net_bh. + * It is true now, do not change it. + * Explantion follows: if protocol handler, mangling packet, will + * be the first on list, it is not able to sense, that packet + * is cloned and should be copied-on-write, so that it will + * change it and subsequent readers will get broken packet. + * --ANK (980803) + */ + +void dev_add_pack(struct packet_type *pt) +{ + int hash; +#ifdef CONFIG_NET_FASTROUTE + /* Hack to detect packet socket */ + if (pt->data) { + netdev_fastroute_obstacles++; + dev_clear_fastroute(pt->dev); + } +#endif + if(pt->type==htons(ETH_P_ALL)) + { + netdev_nit++; + pt->next=ptype_all; + ptype_all=pt; + } + else + { + hash=ntohs(pt->type)&15; + pt->next = ptype_base[hash]; + ptype_base[hash] = pt; + } +} + + +/* + * Remove a protocol ID from the list. + */ + +void dev_remove_pack(struct packet_type *pt) +{ + struct packet_type **pt1; + if(pt->type==htons(ETH_P_ALL)) + { + netdev_nit--; + pt1=&ptype_all; + } + else + pt1=&ptype_base[ntohs(pt->type)&15]; + for(; (*pt1)!=NULL; pt1=&((*pt1)->next)) + { + if(pt==(*pt1)) + { + *pt1=pt->next; + synchronize_bh(); +#ifdef CONFIG_NET_FASTROUTE + if (pt->data) + netdev_fastroute_obstacles--; +#endif + return; + } + } + printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); +} + +/***************************************************************************************** + + Device Interface Subroutines + +******************************************************************************************/ + +/* + * Find an interface by name. + */ + +struct device *dev_get(const char *name) +{ + struct device *dev; + + for (dev = dev_base; dev != NULL; dev = dev->next) + { + if (strcmp(dev->name, name) == 0) + return(dev); + } + return NULL; +} + +struct device * dev_get_by_index(int ifindex) +{ + struct device *dev; + + for (dev = dev_base; dev != NULL; dev = dev->next) + { + if (dev->ifindex == ifindex) + return(dev); + } + return NULL; +} + +struct device *dev_getbyhwaddr(unsigned short type, char *ha) +{ + struct device *dev; + + for (dev = dev_base; dev != NULL; dev = dev->next) + { + if (dev->type == type && + memcmp(dev->dev_addr, ha, dev->addr_len) == 0) + return(dev); + } + return(NULL); +} + +/* + * Passed a format string - eg "lt%d" it will try and find a suitable + * id. Not efficient for many devices, not called a lot.. + */ + +int dev_alloc_name(struct device *dev, const char *name) +{ + int i; + /* + * If you need over 100 please also fix the algorithm... + */ + for(i=0;i<100;i++) + { + sprintf(dev->name,name,i); + if(dev_get(dev->name)==NULL) + return i; + } + return -ENFILE; /* Over 100 of the things .. bail out! 
*/ +} + +struct device *dev_alloc(const char *name, int *err) +{ + struct device *dev=kmalloc(sizeof(struct device)+16, GFP_KERNEL); + if(dev==NULL) + { + *err=-ENOBUFS; + return NULL; + } + dev->name=(char *)(dev+1); /* Name string space */ + *err=dev_alloc_name(dev,name); + if(*err<0) + { + kfree(dev); + return NULL; + } + return dev; +} + +void netdev_state_change(struct device *dev) +{ + if (dev->flags&IFF_UP) + notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); +} + + +/* + * Find and possibly load an interface. + */ + +#ifdef CONFIG_KMOD + +void dev_load(const char *name) +{ + if(!dev_get(name) && capable(CAP_SYS_MODULE)) + request_module(name); +} + +#else + +extern inline void dev_load(const char *unused){;} + +#endif + +static int default_rebuild_header(struct sk_buff *skb) +{ + printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", skb->dev ? skb->dev->name : "NULL!!!"); + kfree_skb(skb); + return 1; +} + +/* + * Prepare an interface for use. + */ + +int dev_open(struct device *dev) +{ + int ret = 0; + + /* + * Is it already up? + */ + + if (dev->flags&IFF_UP) + return 0; + + /* + * Call device private open method + */ + + if (dev->open) + ret = dev->open(dev); + + /* + * If it went open OK then: + */ + + if (ret == 0) + { + /* + * nil rebuild_header routine, + * that should be never called and used as just bug trap. + */ + + if (dev->rebuild_header == NULL) + dev->rebuild_header = default_rebuild_header; + + /* + * Set the flags. + */ + dev->flags |= (IFF_UP | IFF_RUNNING); + + /* + * Initialize multicasting status + */ + dev_mc_upload(dev); + + /* + * Wakeup transmit queue engine + */ + dev_activate(dev); + + /* + * ... and announce new interface. + */ + notifier_call_chain(&netdev_chain, NETDEV_UP, dev); + + } + return(ret); +} + +#ifdef CONFIG_NET_FASTROUTE + +static __inline__ void dev_do_clear_fastroute(struct device *dev) +{ + if (dev->accept_fastpath) { + int i; + + for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++) + dst_release_irqwait(xchg(dev->fastpath+i, NULL)); + } +} + +void dev_clear_fastroute(struct device *dev) +{ + if (dev) { + dev_do_clear_fastroute(dev); + } else { + for (dev = dev_base; dev; dev = dev->next) + dev_do_clear_fastroute(dev); + } +} +#endif + +/* + * Completely shutdown an interface. + */ + +int dev_close(struct device *dev) +{ + if (!(dev->flags&IFF_UP)) + return 0; + + dev_deactivate(dev); + + dev_lock_wait(); + + /* + * Call the device specific close. This cannot fail. + * Only if device is UP + */ + + if (dev->stop) + dev->stop(dev); + + if (dev->start) + printk("dev_close: bug %s still running\n", dev->name); + + /* + * Device is now down. + */ + dev_clear_backlog(dev); + + dev->flags&=~(IFF_UP|IFF_RUNNING); +#ifdef CONFIG_NET_FASTROUTE + dev_clear_fastroute(dev); +#endif + + /* + * Tell people we are going down + */ + notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); + + return(0); +} + + +/* + * Device change register/unregister. These are not inline or static + * as we export them to the world. + */ + +int register_netdevice_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&netdev_chain, nb); +} + +int unregister_netdevice_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&netdev_chain,nb); +} + +/* + * Support routine. Sends outgoing frames to any network + * taps currently in use. 
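+ *	(A tap is a packet_type registered with type ETH_P_ALL, kept on the
+ *	ptype_all list; each tap receives its own clone of the frame, so the
+ *	original skb still reaches the driver untouched.)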
+ */ + +void dev_queue_xmit_nit(struct sk_buff *skb, struct device *dev) +{ + struct packet_type *ptype; + get_fast_time(&skb->stamp); + + for (ptype = ptype_all; ptype!=NULL; ptype = ptype->next) + { + /* Never send packets back to the socket + * they originated from - MvS (miquels@drinkel.ow.org) + */ + if ((ptype->dev == dev || !ptype->dev) && + ((struct sock *)ptype->data != skb->sk)) + { + struct sk_buff *skb2; + if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) + break; + + /* Code, following below is wrong. + + The only reason, why it does work is that + ONLY packet sockets receive outgoing + packets. If such a packet will be (occasionally) + received by normal packet handler, which expects + that mac header is pulled... + */ + + /* More sensible variant. skb->nh should be correctly + set by sender, so that the second statement is + just protection against buggy protocols. + */ + skb2->mac.raw = skb2->data; + + if (skb2->nh.raw < skb2->data || skb2->nh.raw >= skb2->tail) { + if (net_ratelimit()) + printk(KERN_DEBUG "protocol %04x is buggy, dev %s\n", skb2->protocol, dev->name); + skb2->nh.raw = skb2->data; + if (dev->hard_header) + skb2->nh.raw += dev->hard_header_len; + } + + skb2->h.raw = skb2->nh.raw; + skb2->pkt_type = PACKET_OUTGOING; + ptype->func(skb2, skb->dev, ptype); + } + } +} + +/* + * Fast path for loopback frames. + */ + +void dev_loopback_xmit(struct sk_buff *skb) +{ + struct sk_buff *newskb=skb_clone(skb, GFP_ATOMIC); + if (newskb==NULL) + return; + + newskb->mac.raw = newskb->data; + skb_pull(newskb, newskb->nh.raw - newskb->data); + newskb->pkt_type = PACKET_LOOPBACK; + newskb->ip_summed = CHECKSUM_UNNECESSARY; + if (newskb->dst==NULL) + printk(KERN_DEBUG "BUG: packet without dst looped back 1\n"); + netif_rx(newskb); +} + +int dev_queue_xmit(struct sk_buff *skb) +{ + struct device *dev = skb->dev; + struct Qdisc *q; + +#ifdef CONFIG_NET_PROFILE + start_bh_atomic(); + NET_PROFILE_ENTER(dev_queue_xmit); +#endif + + start_bh_atomic(); + q = dev->qdisc; + if (q->enqueue) { + q->enqueue(skb, q); + qdisc_wakeup(dev); + end_bh_atomic(); + +#ifdef CONFIG_NET_PROFILE + NET_PROFILE_LEAVE(dev_queue_xmit); + end_bh_atomic(); +#endif + + return 0; + } + + /* The device has no queue. Common case for software devices: + loopback, all the sorts of tunnels... + + Really, it is unlikely that bh protection is necessary here: + virtual devices do not generate EOI events. + However, it is possible, that they rely on bh protection + made by us here. 
+ */ + if (dev->flags&IFF_UP) { + if (netdev_nit) + dev_queue_xmit_nit(skb,dev); + if (dev->hard_start_xmit(skb, dev) == 0) { + end_bh_atomic(); + +#ifdef CONFIG_NET_PROFILE + NET_PROFILE_LEAVE(dev_queue_xmit); + end_bh_atomic(); +#endif + + return 0; + } + if (net_ratelimit()) + printk(KERN_DEBUG "Virtual device %s asks to queue packet!\n", dev->name); + } + end_bh_atomic(); + + kfree_skb(skb); + +#ifdef CONFIG_NET_PROFILE + NET_PROFILE_LEAVE(dev_queue_xmit); + end_bh_atomic(); +#endif + + return 0; +} + + +/*======================================================================= + Receiver rotutines + =======================================================================*/ + +int netdev_dropping = 0; +int netdev_max_backlog = 300; +atomic_t netdev_rx_dropped; +#ifdef CONFIG_CPU_IS_SLOW +int net_cpu_congestion; +#endif + +#ifdef CONFIG_NET_HW_FLOWCONTROL +int netdev_throttle_events; +static unsigned long netdev_fc_mask = 1; +unsigned long netdev_fc_xoff = 0; + +static struct +{ + void (*stimul)(struct device *); + struct device *dev; +} netdev_fc_slots[32]; + +int netdev_register_fc(struct device *dev, void (*stimul)(struct device *dev)) +{ + int bit = 0; + unsigned long flags; + + save_flags(flags); + cli(); + if (netdev_fc_mask != ~0UL) { + bit = ffz(netdev_fc_mask); + netdev_fc_slots[bit].stimul = stimul; + netdev_fc_slots[bit].dev = dev; + set_bit(bit, &netdev_fc_mask); + clear_bit(bit, &netdev_fc_xoff); + } + restore_flags(flags); + return bit; +} + +void netdev_unregister_fc(int bit) +{ + unsigned long flags; + + save_flags(flags); + cli(); + if (bit > 0) { + netdev_fc_slots[bit].stimul = NULL; + netdev_fc_slots[bit].dev = NULL; + clear_bit(bit, &netdev_fc_mask); + clear_bit(bit, &netdev_fc_xoff); + } + restore_flags(flags); +} + +static void netdev_wakeup(void) +{ + unsigned long xoff; + + cli(); + xoff = netdev_fc_xoff; + netdev_fc_xoff = 0; + netdev_dropping = 0; + netdev_throttle_events++; + while (xoff) { + int i = ffz(~xoff); + xoff &= ~(1<next; + if ( curr->prev->dev == dev ) { + prev = curr->prev; + spin_lock_irqsave(&skb_queue_lock, flags); + __skb_unlink(prev, &backlog); + spin_unlock_irqrestore(&skb_queue_lock, flags); + kfree_skb(prev); + } + } + end_bh_atomic(); +#ifdef CONFIG_NET_HW_FLOWCONTROL + if (netdev_dropping) + netdev_wakeup(); +#else + netdev_dropping = 0; +#endif + } +} + +/* + * Receive a packet from a device driver and queue it for the upper + * (protocol) levels. It always succeeds. + */ + +void netif_rx(struct sk_buff *skb) +{ +#ifndef CONFIG_CPU_IS_SLOW + if(skb->stamp.tv_sec==0) + get_fast_time(&skb->stamp); +#else + skb->stamp = xtime; +#endif + + /* The code is rearranged so that the path is the most + short when CPU is congested, but is still operating. + */ + + if (backlog.qlen <= netdev_max_backlog) { + if (backlog.qlen) { + if (netdev_dropping == 0) { + skb_queue_tail(&backlog,skb); + mark_bh(NET_BH); + return; + } + atomic_inc(&netdev_rx_dropped); + kfree_skb(skb); + return; + } +#ifdef CONFIG_NET_HW_FLOWCONTROL + if (netdev_dropping) + netdev_wakeup(); +#else + netdev_dropping = 0; +#endif + skb_queue_tail(&backlog,skb); + mark_bh(NET_BH); + return; + } + netdev_dropping = 1; + atomic_inc(&netdev_rx_dropped); + kfree_skb(skb); +} + +#ifdef CONFIG_BRIDGE +static inline void handle_bridge(struct sk_buff *skb, unsigned short type) +{ + if (br_stats.flags & BR_UP && br_protocol_ok(ntohs(type))) + { + /* + * We pass the bridge a complete frame. This means + * recovering the MAC header first. 
+ */ + + int offset; + + skb=skb_clone(skb, GFP_ATOMIC); + if(skb==NULL) + return; + + offset=skb->data-skb->mac.raw; + skb_push(skb,offset); /* Put header back on for bridge */ + + if(br_receive_frame(skb)) + return; + kfree_skb(skb); + } + return; +} +#endif + + +/* + * When we are called the queue is ready to grab, the interrupts are + * on and hardware can interrupt and queue to the receive queue as we + * run with no problems. + * This is run as a bottom half after an interrupt handler that does + * mark_bh(NET_BH); + */ + +void net_bh(void) +{ + struct packet_type *ptype; + struct packet_type *pt_prev; + unsigned short type; + unsigned long start_time = jiffies; +#ifdef CONFIG_CPU_IS_SLOW + static unsigned long start_busy = 0; + static unsigned long ave_busy = 0; + + if (start_busy == 0) + start_busy = start_time; + net_cpu_congestion = ave_busy>>8; +#endif + + NET_PROFILE_ENTER(net_bh); + /* + * Can we send anything now? We want to clear the + * decks for any more sends that get done as we + * process the input. This also minimises the + * latency on a transmit interrupt bh. + */ + + if (qdisc_head.forw != &qdisc_head) + qdisc_run_queues(); + + /* + * Any data left to process. This may occur because a + * mark_bh() is done after we empty the queue including + * that from the device which does a mark_bh() just after + */ + + /* + * While the queue is not empty.. + * + * Note that the queue never shrinks due to + * an interrupt, so we can do this test without + * disabling interrupts. + */ + + while (!skb_queue_empty(&backlog)) + { + struct sk_buff * skb; + + /* Give chance to other bottom halves to run */ + if (jiffies - start_time > 1) + goto net_bh_break; + + /* + * We have a packet. Therefore the queue has shrunk + */ + skb = skb_dequeue(&backlog); + +#ifdef CONFIG_CPU_IS_SLOW + if (ave_busy > 128*16) { + kfree_skb(skb); + while ((skb = skb_dequeue(&backlog)) != NULL) + kfree_skb(skb); + break; + } +#endif + + +#if 0 + NET_PROFILE_SKB_PASSED(skb, net_bh_skb); +#endif +#ifdef CONFIG_NET_FASTROUTE + if (skb->pkt_type == PACKET_FASTROUTE) { + dev_queue_xmit(skb); + continue; + } +#endif + + /* + * Bump the pointer to the next structure. + * + * On entry to the protocol layer. skb->data and + * skb->nh.raw point to the MAC and encapsulated data + */ + + /* XXX until we figure out every place to modify.. */ + skb->h.raw = skb->nh.raw = skb->data; + + if (skb->mac.raw < skb->head || skb->mac.raw > skb->data) { + printk(KERN_CRIT "%s: wrong mac.raw ptr, proto=%04x\n", skb->dev->name, skb->protocol); + kfree_skb(skb); + continue; + } + + /* + * Fetch the packet protocol ID. + */ + + type = skb->protocol; + +#ifdef CONFIG_BRIDGE + /* + * If we are bridging then pass the frame up to the + * bridging code (if this protocol is to be bridged). + * If it is bridged then move on + */ + handle_bridge(skb, type); +#endif + + /* + * We got a packet ID. Now loop over the "known protocols" + * list. There are two lists. The ptype_all list of taps (normally empty) + * and the main protocol list which is hashed perfectly for normal protocols. 
+ */ + + pt_prev = NULL; + for (ptype = ptype_all; ptype!=NULL; ptype=ptype->next) + { + if (!ptype->dev || ptype->dev == skb->dev) { + if(pt_prev) + { + struct sk_buff *skb2=skb_clone(skb, GFP_ATOMIC); + if(skb2) + pt_prev->func(skb2,skb->dev, pt_prev); + } + pt_prev=ptype; + } + } + + for (ptype = ptype_base[ntohs(type)&15]; ptype != NULL; ptype = ptype->next) + { + if (ptype->type == type && (!ptype->dev || ptype->dev==skb->dev)) + { + /* + * We already have a match queued. Deliver + * to it and then remember the new match + */ + if(pt_prev) + { + struct sk_buff *skb2; + + skb2=skb_clone(skb, GFP_ATOMIC); + + /* + * Kick the protocol handler. This should be fast + * and efficient code. + */ + + if(skb2) + pt_prev->func(skb2, skb->dev, pt_prev); + } + /* Remember the current last to do */ + pt_prev=ptype; + } + } /* End of protocol list loop */ + + /* + * Is there a last item to send to ? + */ + + if(pt_prev) + pt_prev->func(skb, skb->dev, pt_prev); + /* + * Has an unknown packet has been received ? + */ + + else { + kfree_skb(skb); + } + } /* End of queue loop */ + + /* + * We have emptied the queue + */ + + /* + * One last output flush. + */ + + if (qdisc_head.forw != &qdisc_head) + qdisc_run_queues(); + +#ifdef CONFIG_CPU_IS_SLOW + if (1) { + unsigned long start_idle = jiffies; + ave_busy += ((start_idle - start_busy)<<3) - (ave_busy>>4); + start_busy = 0; + } +#endif +#ifdef CONFIG_NET_HW_FLOWCONTROL + if (netdev_dropping) + netdev_wakeup(); +#else + netdev_dropping = 0; +#endif + NET_PROFILE_LEAVE(net_bh); + return; + +net_bh_break: + mark_bh(NET_BH); + NET_PROFILE_LEAVE(net_bh); + return; +} + +/* Protocol dependent address dumping routines */ + +static gifconf_func_t * gifconf_list [NPROTO]; + +int register_gifconf(unsigned int family, gifconf_func_t * gifconf) +{ + if (family>=NPROTO) + return -EINVAL; + gifconf_list[family] = gifconf; + return 0; +} + + +/* + * Map an interface index to its name (SIOCGIFNAME) + */ + +/* + * This call is useful, but I'd remove it too. + * + * The reason is purely aestetical, it is the only call + * from SIOC* family using struct ifreq in reversed manner. + * Besides that, it is pretty silly to put "drawing" facility + * to kernel, it is useful only to print ifindices + * in readable form, is not it? --ANK + * + * We need this ioctl for efficient implementation of the + * if_indextoname() function required by the IPv6 API. Without + * it, we would have to search all the interfaces to find a + * match. --pb + */ + +static int dev_ifname(struct ifreq *arg) +{ + struct device *dev; + struct ifreq ifr; + int err; + + /* + * Fetch the caller's info block. + */ + + err = copy_from_user(&ifr, arg, sizeof(struct ifreq)); + if (err) + return -EFAULT; + + dev = dev_get_by_index(ifr.ifr_ifindex); + if (!dev) + return -ENODEV; + + strcpy(ifr.ifr_name, dev->name); + + err = copy_to_user(arg, &ifr, sizeof(struct ifreq)); + return (err)?-EFAULT:0; +} + +/* + * Perform a SIOCGIFCONF call. This structure will change + * size eventually, and there is nothing I can do about it. + * Thus we will need a 'compatibility mode'. + */ + +static int dev_ifconf(char *arg) +{ + struct ifconf ifc; + struct device *dev; + char *pos; + int len; + int total; + int i; + + /* + * Fetch the caller's info block. + */ + + if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) + return -EFAULT; + + pos = ifc.ifc_buf; + len = ifc.ifc_len; + + /* + * Loop over the interfaces, and write an info block for each. 
+ */ + + total = 0; + for (dev = dev_base; dev != NULL; dev = dev->next) { + for (i=0; iget_stats ? dev->get_stats(dev): NULL); + int size; + + if (stats) + size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu %8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", + dev->name, + stats->rx_bytes, + stats->rx_packets, stats->rx_errors, + stats->rx_dropped + stats->rx_missed_errors, + stats->rx_fifo_errors, + stats->rx_length_errors + stats->rx_over_errors + + stats->rx_crc_errors + stats->rx_frame_errors, + stats->rx_compressed, stats->multicast, + stats->tx_bytes, + stats->tx_packets, stats->tx_errors, stats->tx_dropped, + stats->tx_fifo_errors, stats->collisions, + stats->tx_carrier_errors + stats->tx_aborted_errors + + stats->tx_window_errors + stats->tx_heartbeat_errors, + stats->tx_compressed); + else + size = sprintf(buffer, "%6s: No statistics available.\n", dev->name); + + return size; +} + +/* + * Called from the PROCfs module. This now uses the new arbitrary sized /proc/net interface + * to create /proc/net/dev + */ + +int dev_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + int len=0; + off_t begin=0; + off_t pos=0; + int size; + + struct device *dev; + + + size = sprintf(buffer, + "Inter-| Receive | Transmit\n" + " face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n"); + + pos+=size; + len+=size; + + + for (dev = dev_base; dev != NULL; dev = dev->next) + { + size = sprintf_stats(buffer+len, dev); + len+=size; + pos=begin+len; + + if(posoffset+length) + break; + } + + *start=buffer+(offset-begin); /* Start of wanted data */ + len-=(offset-begin); /* Start slop */ + if(len>length) + len=length; /* Ending slop */ + return len; +} + +static int dev_proc_stats(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + int len; + + len = sprintf(buffer, "%08x %08x %08x %08x %08x\n", + atomic_read(&netdev_rx_dropped), +#ifdef CONFIG_NET_HW_FLOWCONTROL + netdev_throttle_events, +#else + 0, +#endif +#ifdef CONFIG_NET_FASTROUTE + dev_fastroute_stat.hits, + dev_fastroute_stat.succeed, + dev_fastroute_stat.deferred +#else + 0, 0, 0 +#endif + ); + + len -= offset; + + if (len > length) + len = length; + if(len < 0) + len = 0; + + *start = buffer + offset; + *eof = 1; + + return len; +} + +#endif /* CONFIG_PROC_FS */ + + +#ifdef CONFIG_NET_RADIO +#ifdef CONFIG_PROC_FS + +/* + * Print one entry of /proc/net/wireless + * This is a clone of /proc/net/dev (just above) + */ +static int sprintf_wireless_stats(char *buffer, struct device *dev) +{ + /* Get stats from the driver */ + struct iw_statistics *stats = (dev->get_wireless_stats ? + dev->get_wireless_stats(dev) : + (struct iw_statistics *) NULL); + int size; + + if(stats != (struct iw_statistics *) NULL) + size = sprintf(buffer, + "%6s: %02x %3d%c %3d%c %3d%c %5d %5d %5d\n", + dev->name, + stats->status, + stats->qual.qual, + stats->qual.updated & 1 ? '.' : ' ', + stats->qual.level, + stats->qual.updated & 2 ? '.' : ' ', + stats->qual.noise, + stats->qual.updated & 3 ? '.' 
: ' ', + stats->discard.nwid, + stats->discard.code, + stats->discard.misc); + else + size = 0; + + return size; +} + +/* + * Print info for /proc/net/wireless (print all entries) + * This is a clone of /proc/net/dev (just above) + */ +int dev_get_wireless_info(char * buffer, char **start, off_t offset, + int length, int dummy) +{ + int len = 0; + off_t begin = 0; + off_t pos = 0; + int size; + + struct device * dev; + + size = sprintf(buffer, + "Inter-|sta| Quality | Discarded packets\n" + " face |tus|link level noise| nwid crypt misc\n"); + + pos+=size; + len+=size; + + for(dev = dev_base; dev != NULL; dev = dev->next) + { + size = sprintf_wireless_stats(buffer+len, dev); + len+=size; + pos=begin+len; + + if(pos < offset) + { + len=0; + begin=pos; + } + if(pos > offset + length) + break; + } + + *start = buffer + (offset - begin); /* Start of wanted data */ + len -= (offset - begin); /* Start slop */ + if(len > length) + len = length; /* Ending slop */ + + return len; +} +#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_NET_RADIO */ + +void dev_set_promiscuity(struct device *dev, int inc) +{ + unsigned short old_flags = dev->flags; + + dev->flags |= IFF_PROMISC; + if ((dev->promiscuity += inc) == 0) + dev->flags &= ~IFF_PROMISC; + if (dev->flags^old_flags) { +#ifdef CONFIG_NET_FASTROUTE + if (dev->flags&IFF_PROMISC) { + netdev_fastroute_obstacles++; + dev_clear_fastroute(dev); + } else + netdev_fastroute_obstacles--; +#endif + dev_mc_upload(dev); + printk(KERN_INFO "device %s %s promiscuous mode\n", + dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "left"); + } +} + +void dev_set_allmulti(struct device *dev, int inc) +{ + unsigned short old_flags = dev->flags; + + dev->flags |= IFF_ALLMULTI; + if ((dev->allmulti += inc) == 0) + dev->flags &= ~IFF_ALLMULTI; + if (dev->flags^old_flags) + dev_mc_upload(dev); +} + +int dev_change_flags(struct device *dev, unsigned flags) +{ + int ret; + int old_flags = dev->flags; + + /* + * Set the flags on our device. + */ + + dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_RUNNING|IFF_NOARP| + IFF_SLAVE|IFF_MASTER|IFF_DYNAMIC| + IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) | + (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI)); + + /* + * Load in the correct multicast list now the flags have changed. + */ + + dev_mc_upload(dev); + + /* + * Have we downed the interface. We handle IFF_UP ourselves + * according to user attempts to set it, rather than blindly + * setting it. + */ + + ret = 0; + if ((old_flags^flags)&IFF_UP) /* Bit is different ? */ + { + ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); + + if (ret == 0) + dev_mc_upload(dev); + } + + if (dev->flags&IFF_UP && + ((old_flags^dev->flags)&~(IFF_UP|IFF_RUNNING|IFF_PROMISC|IFF_ALLMULTI|IFF_VOLATILE))) + notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); + + if ((flags^dev->gflags)&IFF_PROMISC) { + int inc = (flags&IFF_PROMISC) ? +1 : -1; + dev->gflags ^= IFF_PROMISC; + dev_set_promiscuity(dev, inc); + } + + /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI + is important. Some (broken) drivers set IFF_PROMISC, when + IFF_ALLMULTI is requested not asking us and not reporting. + */ + if ((flags^dev->gflags)&IFF_ALLMULTI) { + int inc = (flags&IFF_ALLMULTI) ? +1 : -1; + dev->gflags ^= IFF_ALLMULTI; + dev_set_allmulti(dev, inc); + } + + return ret; +} + +/* + * Perform the SIOCxIFxxx calls. 
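+ *	(dev_ioctl() below performs the capability checks and locking before
+ *	dispatching to this helper.)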
+ */ + +static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) +{ + struct device *dev; + int err; + + if ((dev = dev_get(ifr->ifr_name)) == NULL) + return -ENODEV; + + switch(cmd) + { + case SIOCGIFFLAGS: /* Get interface flags */ + ifr->ifr_flags = (dev->flags&~(IFF_PROMISC|IFF_ALLMULTI)) + |(dev->gflags&(IFF_PROMISC|IFF_ALLMULTI)); + return 0; + + case SIOCSIFFLAGS: /* Set interface flags */ + return dev_change_flags(dev, ifr->ifr_flags); + + case SIOCGIFMETRIC: /* Get the metric on the interface (currently unused) */ + ifr->ifr_metric = 0; + return 0; + + case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ + return -EOPNOTSUPP; + + case SIOCGIFMTU: /* Get the MTU of a device */ + ifr->ifr_mtu = dev->mtu; + return 0; + + case SIOCSIFMTU: /* Set the MTU of a device */ + if (ifr->ifr_mtu == dev->mtu) + return 0; + + /* + * MTU must be positive. + */ + + if (ifr->ifr_mtu<=0) + return -EINVAL; + + if (dev->change_mtu) + err = dev->change_mtu(dev, ifr->ifr_mtu); + else { + dev->mtu = ifr->ifr_mtu; + err = 0; + } + if (!err && dev->flags&IFF_UP) + notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev); + return err; + + case SIOCGIFHWADDR: + memcpy(ifr->ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN); + ifr->ifr_hwaddr.sa_family=dev->type; + return 0; + + case SIOCSIFHWADDR: + if(dev->set_mac_address==NULL) + return -EOPNOTSUPP; + if(ifr->ifr_hwaddr.sa_family!=dev->type) + return -EINVAL; + err=dev->set_mac_address(dev,&ifr->ifr_hwaddr); + if (!err) + notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); + return err; + + case SIOCSIFHWBROADCAST: + if(ifr->ifr_hwaddr.sa_family!=dev->type) + return -EINVAL; + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, MAX_ADDR_LEN); + notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); + return 0; + + case SIOCGIFMAP: + ifr->ifr_map.mem_start=dev->mem_start; + ifr->ifr_map.mem_end=dev->mem_end; + ifr->ifr_map.base_addr=dev->base_addr; + ifr->ifr_map.irq=dev->irq; + ifr->ifr_map.dma=dev->dma; + ifr->ifr_map.port=dev->if_port; + return 0; + + case SIOCSIFMAP: + if (dev->set_config) + return dev->set_config(dev,&ifr->ifr_map); + return -EOPNOTSUPP; + + case SIOCADDMULTI: + if(dev->set_multicast_list==NULL || + ifr->ifr_hwaddr.sa_family!=AF_UNSPEC) + return -EINVAL; + dev_mc_add(dev,ifr->ifr_hwaddr.sa_data, dev->addr_len, 1); + return 0; + + case SIOCDELMULTI: + if(dev->set_multicast_list==NULL || + ifr->ifr_hwaddr.sa_family!=AF_UNSPEC) + return -EINVAL; + dev_mc_delete(dev,ifr->ifr_hwaddr.sa_data,dev->addr_len, 1); + return 0; + + case SIOCGIFINDEX: + ifr->ifr_ifindex = dev->ifindex; + return 0; + + case SIOCGIFTXQLEN: + ifr->ifr_qlen = dev->tx_queue_len; + return 0; + + case SIOCSIFTXQLEN: + if(ifr->ifr_qlen<0) + return -EINVAL; + dev->tx_queue_len = ifr->ifr_qlen; + return 0; + + case SIOCSIFNAME: + if (dev->flags&IFF_UP) + return -EBUSY; + if (dev_get(ifr->ifr_newname)) + return -EEXIST; + memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ); + dev->name[IFNAMSIZ-1] = 0; + notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); + return 0; + + /* + * Unknown or private ioctl + */ + + default: + if(cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) { + if (dev->do_ioctl) + return dev->do_ioctl(dev, ifr, cmd); + return -EOPNOTSUPP; + } + +#ifdef CONFIG_NET_RADIO + if(cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + if (dev->do_ioctl) + return dev->do_ioctl(dev, ifr, cmd); + return -EOPNOTSUPP; + } +#endif /* CONFIG_NET_RADIO */ + + } + return -EINVAL; +} + + +/* + * This function handles all "interface"-type I/O 
control requests. The actual + * 'doing' part of this is dev_ifsioc above. + */ + +int dev_ioctl(unsigned int cmd, void *arg) +{ + struct ifreq ifr; + int ret; + char *colon; + + /* One special case: SIOCGIFCONF takes ifconf argument + and requires shared lock, because it sleeps writing + to user space. + */ + + if (cmd == SIOCGIFCONF) { + rtnl_shlock(); + ret = dev_ifconf((char *) arg); + rtnl_shunlock(); + return ret; + } + if (cmd == SIOCGIFNAME) { + return dev_ifname((struct ifreq *)arg); + } + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ-1] = 0; + + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + + /* + * See which interface the caller is talking about. + */ + + switch(cmd) + { + /* + * These ioctl calls: + * - can be done by all. + * - atomic and do not require locking. + * - return a value + */ + + case SIOCGIFFLAGS: + case SIOCGIFMETRIC: + case SIOCGIFMTU: + case SIOCGIFHWADDR: + case SIOCGIFSLAVE: + case SIOCGIFMAP: + case SIOCGIFINDEX: + case SIOCGIFTXQLEN: + dev_load(ifr.ifr_name); + ret = dev_ifsioc(&ifr, cmd); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - do not return a value + */ + + case SIOCSIFFLAGS: + case SIOCSIFMETRIC: + case SIOCSIFMTU: + case SIOCSIFMAP: + case SIOCSIFHWADDR: + case SIOCSIFSLAVE: + case SIOCADDMULTI: + case SIOCDELMULTI: + case SIOCSIFHWBROADCAST: + case SIOCSIFTXQLEN: + case SIOCSIFNAME: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + dev_load(ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(&ifr, cmd); + rtnl_unlock(); + return ret; + + case SIOCGIFMEM: + /* Get the per device memory space. We can add this but currently + do not support it */ + case SIOCSIFMEM: + /* Set the per device memory buffer space. Not applicable in our case */ + case SIOCSIFLINK: + return -EINVAL; + + /* + * Unknown or private ioctl. + */ + + default: + if (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) { + dev_load(ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(&ifr, cmd); + rtnl_unlock(); + if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return ret; + } +#ifdef CONFIG_NET_RADIO + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + dev_load(ifr.ifr_name); + if (IW_IS_SET(cmd)) { + if (!suser()) + return -EPERM; + rtnl_lock(); + } + ret = dev_ifsioc(&ifr, cmd); + if (IW_IS_SET(cmd)) + rtnl_unlock(); + if (!ret && IW_IS_GET(cmd) && + copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return ret; + } +#endif /* CONFIG_NET_RADIO */ + return -EINVAL; + } +} + +int dev_new_index(void) +{ + static int ifindex; + for (;;) { + if (++ifindex <= 0) + ifindex=1; + if (dev_get_by_index(ifindex) == NULL) + return ifindex; + } +} + +static int dev_boot_phase = 1; + + +int register_netdevice(struct device *dev) +{ + struct device *d, **dp; + + if (dev_boot_phase) { + /* This is NOT bug, but I am not sure, that all the + devices, initialized before netdev module is started + are sane. + + Now they are chained to device boot list + and probed later. If a module is initialized + before netdev, but assumes that dev->init + is really called by register_netdev(), it will fail. + + So that this message should be printed for a while. 
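+		   (During the boot phase the device is merely linked onto dev_base
+		   here; its init routine is invoked later from net_dev_init().)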
+ */ + printk(KERN_INFO "early initialization of device %s is deferred\n", dev->name); + + /* Check for existence, and append to tail of chain */ + for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) { + if (d == dev || strcmp(d->name, dev->name) == 0) + return -EEXIST; + } + dev->next = NULL; + *dp = dev; + return 0; + } + + dev->iflink = -1; + + /* Init, if this function is available */ + if (dev->init && dev->init(dev) != 0) + return -EIO; + + /* Check for existence, and append to tail of chain */ + for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) { + if (d == dev || strcmp(d->name, dev->name) == 0) + return -EEXIST; + } + dev->next = NULL; + dev_init_scheduler(dev); + dev->ifindex = dev_new_index(); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + *dp = dev; + + /* Notify protocols, that a new device appeared. */ + notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); + + return 0; +} + +int unregister_netdevice(struct device *dev) +{ + struct device *d, **dp; + + if (dev_boot_phase == 0) { + /* If device is running, close it. + It is very bad idea, really we should + complain loudly here, but random hackery + in linux/drivers/net likes it. + */ + if (dev->flags & IFF_UP) + dev_close(dev); + +#ifdef CONFIG_NET_FASTROUTE + dev_clear_fastroute(dev); +#endif + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + */ + notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); + + /* + * Flush the multicast chain + */ + dev_mc_discard(dev); + + /* To avoid pointers looking to nowhere, + we wait for end of critical section */ + dev_lock_wait(); + } + + /* And unlink it from device chain. */ + for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) { + if (d == dev) { + *dp = d->next; + synchronize_bh(); + d->next = NULL; + + if (dev->destructor) + dev->destructor(dev); + return 0; + } + } + return -ENODEV; +} + + +/* + * Initialize the DEV module. At boot time this walks the device list and + * unhooks any devices that fail to initialise (normally hardware not + * present) and leaves us with a valid list of present and active devices. + * + */ +extern int lance_init(void); +extern int bpq_init(void); +extern int scc_init(void); +extern void sdla_setup(void); +extern void dlci_setup(void); +extern int dmascc_init(void); +extern int sm_init(void); + +extern int baycom_ser_fdx_init(void); +extern int baycom_ser_hdx_init(void); +extern int baycom_par_init(void); + +extern int lapbeth_init(void); +extern void arcnet_init(void); +extern void ip_auto_config(void); +#ifdef CONFIG_8xx +extern int cpm_enet_init(void); +#endif /* CONFIG_8xx */ + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_dev = { + PROC_NET_DEV, 3, "dev", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + dev_get_info +}; +#endif + +#ifdef CONFIG_NET_RADIO +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_wireless = { + PROC_NET_WIRELESS, 8, "wireless", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + dev_get_wireless_info +}; +#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_NET_RADIO */ + +__initfunc(int net_dev_init(void)) +{ + struct device *dev, **dp; + +#ifdef CONFIG_NET_SCHED + pktsched_init(); +#endif + + /* + * Initialise the packet receive queue. + */ + + skb_queue_head_init(&backlog); + + /* + * The bridge has to be up before the devices + */ + +#ifdef CONFIG_BRIDGE + br_init(); +#endif + + /* + * This is Very Ugly(tm). 
+ * + * Some devices want to be initialized early.. + */ + +#if defined(CONFIG_SCC) + scc_init(); +#endif +#if defined(CONFIG_DMASCC) + dmascc_init(); +#endif +#if defined(CONFIG_BPQETHER) + bpq_init(); +#endif +#if defined(CONFIG_DLCI) + dlci_setup(); +#endif +#if defined(CONFIG_SDLA) + sdla_setup(); +#endif +#if defined(CONFIG_BAYCOM_PAR) + baycom_par_init(); +#endif +#if defined(CONFIG_BAYCOM_SER_FDX) + baycom_ser_fdx_init(); +#endif +#if defined(CONFIG_BAYCOM_SER_HDX) + baycom_ser_hdx_init(); +#endif +#if defined(CONFIG_SOUNDMODEM) + sm_init(); +#endif +#if defined(CONFIG_LAPBETHER) + lapbeth_init(); +#endif +#if defined(CONFIG_PLIP) + plip_init(); +#endif +#if defined(CONFIG_ARCNET) + arcnet_init(); +#endif +#if defined(CONFIG_8xx) + cpm_enet_init(); +#endif + /* + * SLHC if present needs attaching so other people see it + * even if not opened. + */ + +#ifdef CONFIG_INET +#if (defined(CONFIG_SLIP) && defined(CONFIG_SLIP_COMPRESSED)) \ + || defined(CONFIG_PPP) \ + || (defined(CONFIG_ISDN) && defined(CONFIG_ISDN_PPP)) + slhc_install(); +#endif +#endif + +#ifdef CONFIG_NET_PROFILE + net_profile_init(); + NET_PROFILE_REGISTER(dev_queue_xmit); + NET_PROFILE_REGISTER(net_bh); +#if 0 + NET_PROFILE_REGISTER(net_bh_skb); +#endif +#endif + /* + * Add the devices. + * If the call to dev->init fails, the dev is removed + * from the chain disconnecting the device until the + * next reboot. + */ + + dp = &dev_base; + while ((dev = *dp) != NULL) + { + dev->iflink = -1; + if (dev->init && dev->init(dev)) + { + /* + * It failed to come up. Unhook it. + */ + *dp = dev->next; + synchronize_bh(); + } + else + { + dp = &dev->next; + dev->ifindex = dev_new_index(); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + dev_init_scheduler(dev); + } + } + +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_dev); + { + struct proc_dir_entry *ent = create_proc_entry("net/dev_stat", 0, 0); + ent->read_proc = dev_proc_stats; + } +#endif + +#ifdef CONFIG_NET_RADIO +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_wireless); +#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_NET_RADIO */ + + init_bh(NET_BH, net_bh); + + dev_boot_phase = 0; + + dev_mcast_init(); + +#ifdef CONFIG_IP_PNP + ip_auto_config(); +#endif + + return 0; +} diff --git a/pfinet/linux-src/net/core/dev_mcast.c b/pfinet/linux-src/net/core/dev_mcast.c new file mode 100644 index 00000000..bce3f4a4 --- /dev/null +++ b/pfinet/linux-src/net/core/dev_mcast.c @@ -0,0 +1,252 @@ +/* + * Linux NET3: Multicast List maintenance. + * + * Authors: + * Tim Kordas + * Richard Underwood + * + * Stir fried together from the IP multicast and CAP patches above + * Alan Cox + * + * Fixes: + * Alan Cox : Update the device on a real delete + * rather than any time but... + * Alan Cox : IFF_ALLMULTI support. + * Alan Cox : New format set_multicast_list() calls. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * Device multicast list maintenance. + * + * This is used both by IP and by the user level maintenance functions. 
+ * Unlike BSD we maintain a usage count on a given multicast address so + * that a casual user application can add/delete multicasts used by + * protocols without doing damage to the protocols when it deletes the + * entries. It also helps IP as it tracks overlapping maps. + * + * Device mc lists are changed by bh at least if IPv6 is enabled, + * so that it must be bh protected. + */ + +/* + * Update the multicast list into the physical NIC controller. + */ + +void dev_mc_upload(struct device *dev) +{ + /* Don't do anything till we up the interface + [dev_open will call this function so the list will + stay sane] */ + + if(!(dev->flags&IFF_UP)) + return; + + /* + * Devices with no set multicast don't get set + */ + + if(dev->set_multicast_list==NULL) + return; + + start_bh_atomic(); + dev->set_multicast_list(dev); + end_bh_atomic(); +} + +/* + * Delete a device level multicast + */ + +int dev_mc_delete(struct device *dev, void *addr, int alen, int glbl) +{ + int err = 0; + struct dev_mc_list *dmi, **dmip; + + start_bh_atomic(); + for (dmip=&dev->mc_list; (dmi=*dmip)!=NULL; dmip=&dmi->next) { + /* + * Find the entry we want to delete. The device could + * have variable length entries so check these too. + */ + if (memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && alen==dmi->dmi_addrlen) { + if (glbl) { + int old_glbl = dmi->dmi_gusers; + dmi->dmi_gusers = 0; + if (old_glbl == 0) + break; + } + if(--dmi->dmi_users) + goto done; + + /* + * Last user. So delete the entry. + */ + *dmip = dmi->next; + dev->mc_count--; + kfree_s(dmi,sizeof(*dmi)); + /* + * We have altered the list, so the card + * loaded filter is now wrong. Fix it + */ + end_bh_atomic(); + dev_mc_upload(dev); + return 0; + } + } + err = -ENOENT; +done: + end_bh_atomic(); + return err; +} + +/* + * Add a device level multicast + */ + +int dev_mc_add(struct device *dev, void *addr, int alen, int glbl) +{ + int err = 0; + struct dev_mc_list *dmi, *dmi1; + + dmi1 = (struct dev_mc_list *)kmalloc(sizeof(*dmi), gfp_any()); + + start_bh_atomic(); + for(dmi=dev->mc_list; dmi!=NULL; dmi=dmi->next) { + if (memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && dmi->dmi_addrlen==alen) { + if (glbl) { + int old_glbl = dmi->dmi_gusers; + dmi->dmi_gusers = 1; + if (old_glbl) + goto done; + } + dmi->dmi_users++; + goto done; + } + } + + if ((dmi=dmi1)==NULL) + return -ENOMEM; + memcpy(dmi->dmi_addr, addr, alen); + dmi->dmi_addrlen=alen; + dmi->next=dev->mc_list; + dmi->dmi_users=1; + dmi->dmi_gusers=glbl ? 1 : 0; + dev->mc_list=dmi; + dev->mc_count++; + end_bh_atomic(); + dev_mc_upload(dev); + return 0; + +done: + end_bh_atomic(); + if (dmi1) + kfree(dmi1); + return err; +} + +/* + * Discard multicast list when a device is downed + */ + +void dev_mc_discard(struct device *dev) +{ + start_bh_atomic(); + while (dev->mc_list!=NULL) { + struct dev_mc_list *tmp=dev->mc_list; + dev->mc_list=tmp->next; + if (tmp->dmi_users > tmp->dmi_gusers) + printk("dev_mc_discard: multicast leakage! 
dmi_users=%d\n", tmp->dmi_users); + kfree_s(tmp,sizeof(*tmp)); + } + dev->mc_count=0; + end_bh_atomic(); +} + +#ifdef CONFIG_PROC_FS +static int dev_mc_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos=0, begin=0; + struct dev_mc_list *m; + int len=0; + struct device *dev; + + start_bh_atomic(); + + for (dev = dev_base; dev; dev = dev->next) { + for (m = dev->mc_list; m; m = m->next) { + int i; + + len += sprintf(buffer+len,"%-4d %-15s %-5d %-5d ", dev->ifindex, dev->name, + m->dmi_users, m->dmi_gusers); + + for (i=0; idmi_addrlen; i++) + len += sprintf(buffer+len, "%02x", m->dmi_addr[i]); + + len+=sprintf(buffer+len, "\n"); + + pos=begin+len; + if (pos < offset) { + len=0; + begin=pos; + } + if (pos > offset+length) + goto done; + } + } + *eof = 1; + +done: + end_bh_atomic(); + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + if(len<0) + len=0; + return len; +} +#endif + +__initfunc(void dev_mcast_init(void)) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *ent; + + ent = create_proc_entry("net/dev_mcast", 0, 0); + ent->read_proc = dev_mc_read_proc; +#endif +} + diff --git a/pfinet/linux-src/net/core/dst.c b/pfinet/linux-src/net/core/dst.c new file mode 100644 index 00000000..9007dde6 --- /dev/null +++ b/pfinet/linux-src/net/core/dst.c @@ -0,0 +1,145 @@ +/* + * net/dst.c Protocol independent destination cache. + * + * Authors: Alexey Kuznetsov, + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct dst_entry * dst_garbage_list; +atomic_t dst_total = ATOMIC_INIT(0); + +static unsigned long dst_gc_timer_expires; +static unsigned long dst_gc_timer_inc = DST_GC_MAX; +static void dst_run_gc(unsigned long); + +static struct timer_list dst_gc_timer = + { NULL, NULL, DST_GC_MIN, 0L, dst_run_gc }; + +#if RT_CACHE_DEBUG >= 2 +atomic_t hh_count; +#endif + +static void dst_run_gc(unsigned long dummy) +{ + int delayed = 0; + struct dst_entry * dst, **dstp; + + del_timer(&dst_gc_timer); + dstp = &dst_garbage_list; + while ((dst = *dstp) != NULL) { + if (atomic_read(&dst->use)) { + dstp = &dst->next; + delayed++; + continue; + } + *dstp = dst->next; + dst_destroy(dst); + } + if (!dst_garbage_list) { + dst_gc_timer_inc = DST_GC_MAX; + return; + } + if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX) + dst_gc_timer_expires = DST_GC_MAX; + dst_gc_timer_inc += DST_GC_INC; + dst_gc_timer.expires = jiffies + dst_gc_timer_expires; +#if RT_CACHE_DEBUG >= 2 + printk("dst_total: %d/%d %ld\n", + atomic_read(&dst_total), delayed, dst_gc_timer_expires); +#endif + add_timer(&dst_gc_timer); +} + +static int dst_discard(struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +static int dst_blackhole(struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +void * dst_alloc(int size, struct dst_ops * ops) +{ + struct dst_entry * dst; + + if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { + if (ops->gc()) + return NULL; + } + dst = kmalloc(size, GFP_ATOMIC); + if (!dst) + return NULL; + memset(dst, 0, size); + dst->ops = ops; + atomic_set(&dst->refcnt, 0); + dst->lastuse = jiffies; + dst->input = dst_discard; + dst->output = dst_blackhole; + atomic_inc(&dst_total); + atomic_inc(&ops->entries); + return dst; +} + +void __dst_free(struct dst_entry * dst) +{ + start_bh_atomic(); + /* The first case (dev==NULL) is required, when + protocol module is unloaded. 
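+	   (Redirecting input/output to dst_discard/dst_blackhole and pointing
+	   dev at loopback_dev keeps straggling packets away from a device that
+	   may no longer exist.)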
+ */ + if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { + dst->input = dst_discard; + dst->output = dst_blackhole; + dst->dev = &loopback_dev; + } + dst->obsolete = 2; + dst->next = dst_garbage_list; + dst_garbage_list = dst; + if (dst_gc_timer_inc > DST_GC_INC) { + del_timer(&dst_gc_timer); + dst_gc_timer_inc = DST_GC_INC; + dst_gc_timer_expires = DST_GC_MIN; + dst_gc_timer.expires = jiffies + dst_gc_timer_expires; + add_timer(&dst_gc_timer); + } + end_bh_atomic(); +} + +void dst_destroy(struct dst_entry * dst) +{ + struct neighbour *neigh = dst->neighbour; + struct hh_cache *hh = dst->hh; + + dst->hh = NULL; + if (hh && atomic_dec_and_test(&hh->hh_refcnt)) + kfree(hh); + + if (neigh) { + dst->neighbour = NULL; + neigh_release(neigh); + } + + atomic_dec(&dst->ops->entries); + + if (dst->ops->destroy) + dst->ops->destroy(dst); + atomic_dec(&dst_total); + kfree(dst); +} diff --git a/pfinet/linux-src/net/core/filter.c b/pfinet/linux-src/net/core/filter.c new file mode 100644 index 00000000..8e1ffb62 --- /dev/null +++ b/pfinet/linux-src/net/core/filter.c @@ -0,0 +1,454 @@ +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Author: + * Jay Schulist + * + * Based on the design of: + * - The Berkeley Packet Filter + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Andi Kleen - Fix a few bad bugs and races. + */ + +#include +#if defined(CONFIG_FILTER) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* No hurry in this branch */ + +static u8 *load_pointer(struct sk_buff *skb, int k) +{ + u8 *ptr = NULL; + + if (k>=SKF_NET_OFF) + ptr = skb->nh.raw + k - SKF_NET_OFF; + else if (k>=SKF_LL_OFF) + ptr = skb->mac.raw + k - SKF_LL_OFF; + + if (ptrhead && ptr < skb->tail) + return ptr; + return NULL; +} + +/* + * Decode and apply filter instructions to the skb->data. + * Return length to keep, 0 for none. skb is the data we are + * filtering, filter is the array of filter instructions, and + * len is the number of filter blocks in the array. + */ + +int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) +{ + unsigned char *data = skb->data; + /* len is UNSIGNED. Byte wide insns relies only on implicit + type casts to prevent reading arbitrary memory locations. + */ + unsigned int len = skb->len; + struct sock_filter *fentry; /* We walk down these */ + u32 A = 0; /* Accumulator */ + u32 X = 0; /* Index Register */ + u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ + int k; + int pc; + + /* + * Process array of filter instructions. 
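+ *
+ * Each sock_filter is { u16 code; u8 jt; u8 jf; u32 k }.  A purely
+ * illustrative program that this interpreter could run, accepting
+ * IPv4 frames and dropping everything else (assuming skb->data
+ * begins with an Ethernet header), might look like:
+ *
+ *   { BPF_LD|BPF_H|BPF_ABS,  0, 0, 12     }   load the ethertype
+ *   { BPF_JMP|BPF_JEQ|BPF_K, 0, 1, 0x0800 }   is it ETH_P_IP ?
+ *   { BPF_RET|BPF_K,         0, 0, 0xffff }   yes: accept
+ *   { BPF_RET|BPF_K,         0, 0, 0      }   no: drop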
+ */ + + for(pc = 0; pc < flen; pc++) + { + fentry = &filter[pc]; + + switch(fentry->code) + { + case BPF_ALU|BPF_ADD|BPF_X: + A += X; + continue; + + case BPF_ALU|BPF_ADD|BPF_K: + A += fentry->k; + continue; + + case BPF_ALU|BPF_SUB|BPF_X: + A -= X; + continue; + + case BPF_ALU|BPF_SUB|BPF_K: + A -= fentry->k; + continue; + + case BPF_ALU|BPF_MUL|BPF_X: + A *= X; + continue; + + case BPF_ALU|BPF_MUL|BPF_K: + A *= fentry->k; + continue; + + case BPF_ALU|BPF_DIV|BPF_X: + if(X == 0) + return (0); + A /= X; + continue; + + case BPF_ALU|BPF_DIV|BPF_K: + if(fentry->k == 0) + return (0); + A /= fentry->k; + continue; + + case BPF_ALU|BPF_AND|BPF_X: + A &= X; + continue; + + case BPF_ALU|BPF_AND|BPF_K: + A &= fentry->k; + continue; + + case BPF_ALU|BPF_OR|BPF_X: + A |= X; + continue; + + case BPF_ALU|BPF_OR|BPF_K: + A |= fentry->k; + continue; + + case BPF_ALU|BPF_LSH|BPF_X: + A <<= X; + continue; + + case BPF_ALU|BPF_LSH|BPF_K: + A <<= fentry->k; + continue; + + case BPF_ALU|BPF_RSH|BPF_X: + A >>= X; + continue; + + case BPF_ALU|BPF_RSH|BPF_K: + A >>= fentry->k; + continue; + + case BPF_ALU|BPF_NEG: + A = -A; + continue; + + case BPF_JMP|BPF_JA: + pc += fentry->k; + continue; + + case BPF_JMP|BPF_JGT|BPF_K: + pc += (A > fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_K: + pc += (A >= fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_K: + pc += (A == fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_K: + pc += (A & fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JGT|BPF_X: + pc += (A > X) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_X: + pc += (A >= X) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_X: + pc += (A == X) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_X: + pc += (A & X) ? 
fentry->jt : fentry->jf; + continue; + + case BPF_LD|BPF_W|BPF_ABS: + k = fentry->k; +load_w: + if(k+sizeof(u32) <= len) { + A = ntohl(*(u32*)&data[k]); + continue; + } + if (k<0) { + u8 *ptr; + + if (k>=SKF_AD_OFF) + break; + if ((ptr = load_pointer(skb, k)) != NULL) { + A = ntohl(*(u32*)ptr); + continue; + } + } + return 0; + + case BPF_LD|BPF_H|BPF_ABS: + k = fentry->k; +load_h: + if(k + sizeof(u16) <= len) { + A = ntohs(*(u16*)&data[k]); + continue; + } + if (k<0) { + u8 *ptr; + + if (k>=SKF_AD_OFF) + break; + if ((ptr = load_pointer(skb, k)) != NULL) { + A = ntohs(*(u16*)ptr); + continue; + } + } + return 0; + + case BPF_LD|BPF_B|BPF_ABS: + k = fentry->k; +load_b: + if(k < len) { + A = data[k]; + continue; + } + if (k<0) { + u8 *ptr; + + if (k>=SKF_AD_OFF) + break; + if ((ptr = load_pointer(skb, k)) != NULL) { + A = *ptr; + continue; + } + } + + case BPF_LD|BPF_W|BPF_LEN: + A = len; + continue; + + case BPF_LDX|BPF_W|BPF_LEN: + X = len; + continue; + + case BPF_LD|BPF_W|BPF_IND: + k = X + fentry->k; + goto load_w; + + case BPF_LD|BPF_H|BPF_IND: + k = X + fentry->k; + goto load_h; + + case BPF_LD|BPF_B|BPF_IND: + k = X + fentry->k; + goto load_b; + + case BPF_LDX|BPF_B|BPF_MSH: + k = fentry->k; + if(k >= len) + return (0); + X = (data[k] & 0xf) << 2; + continue; + + case BPF_LD|BPF_IMM: + A = fentry->k; + continue; + + case BPF_LDX|BPF_IMM: + X = fentry->k; + continue; + + case BPF_LD|BPF_MEM: + A = mem[fentry->k]; + continue; + + case BPF_LDX|BPF_MEM: + X = mem[fentry->k]; + continue; + + case BPF_MISC|BPF_TAX: + X = A; + continue; + + case BPF_MISC|BPF_TXA: + A = X; + continue; + + case BPF_RET|BPF_K: + return ((unsigned int)fentry->k); + + case BPF_RET|BPF_A: + return ((unsigned int)A); + + case BPF_ST: + mem[fentry->k] = A; + continue; + + case BPF_STX: + mem[fentry->k] = X; + continue; + + default: + /* Invalid instruction counts as RET */ + return (0); + } + + /* Handle ancillary data, which are impossible + (or very difficult) to get parsing packet contents. + */ + switch (k-SKF_AD_OFF) { + case SKF_AD_PROTOCOL: + A = htons(skb->protocol); + continue; + case SKF_AD_PKTTYPE: + A = skb->pkt_type; + continue; + case SKF_AD_IFINDEX: + A = skb->dev->ifindex; + continue; + default: + return 0; + } + } + + return (0); +} + +/* + * Check the user's filter code. If we let some ugly + * filter code slip through kaboom! + */ + +int sk_chk_filter(struct sock_filter *filter, int flen) +{ + struct sock_filter *ftest; + int pc; + + /* + * Check the filter code now. + */ + for(pc = 0; pc < flen; pc++) + { + /* + * All jumps are forward as they are not signed + */ + + ftest = &filter[pc]; + if(BPF_CLASS(ftest->code) == BPF_JMP) + { + /* + * But they mustn't jump off the end. + */ + if(BPF_OP(ftest->code) == BPF_JA) + { + /* Note, the large ftest->k might cause + loops. Compare this with conditional + jumps below, where offsets are limited. --ANK (981016) + */ + if (ftest->k >= (unsigned)(flen-pc-1)) + return (-EINVAL); + } + else + { + /* + * For conditionals both must be safe + */ + if(pc + ftest->jt +1 >= flen || pc + ftest->jf +1 >= flen) + return (-EINVAL); + } + } + + /* + * Check that memory operations use valid addresses. + */ + + if (ftest->k >= BPF_MEMWORDS) + { + /* + * But it might not be a memory operation... + */ + switch (ftest->code) { + case BPF_ST: + case BPF_STX: + case BPF_LD|BPF_MEM: + case BPF_LDX|BPF_MEM: + return -EINVAL; + } + } + } + + /* + * The program must end with a return. 
We don't care where they + * jumped within the script (its always forwards) but in the + * end they _will_ hit this. + */ + + return (BPF_CLASS(filter[flen - 1].code) == BPF_RET)?0:-EINVAL; +} + +/* + * Attach the user's filter code. We first run some sanity checks on + * it to make sure it does not explode on us later. + */ + +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + struct sk_filter *fp; + unsigned int fsize = sizeof(struct sock_filter) * fprog->len; + int err; + + /* Make sure new filter is there and in the right amounts. */ + if (fprog->filter == NULL || fprog->len > BPF_MAXINSNS) + return (-EINVAL); + + fp = (struct sk_filter *)sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); + if(fp == NULL) + return (-ENOMEM); + + if (copy_from_user(fp->insns, fprog->filter, fsize)) { + sock_kfree_s(sk, fp, fsize+sizeof(*fp)); + return -EFAULT; + } + + atomic_set(&fp->refcnt, 1); + fp->len = fprog->len; + + if ((err = sk_chk_filter(fp->insns, fp->len))==0) { + struct sk_filter *old_fp = sk->filter; + sk->filter = fp; + synchronize_bh(); + fp = old_fp; + } + + if (fp) + sk_filter_release(sk, fp); + + return (err); +} +#endif /* CONFIG_FILTER */ diff --git a/pfinet/linux-src/net/core/firewall.c b/pfinet/linux-src/net/core/firewall.c new file mode 100644 index 00000000..fc7b1a51 --- /dev/null +++ b/pfinet/linux-src/net/core/firewall.c @@ -0,0 +1,160 @@ +/* + * Generic loadable firewalls. At the moment only IP will actually + * use these, but people can add the others as they are needed. + * + * Authors: Dave Bonn (for IP) + * much hacked by: Alan Cox + */ + +#include +#include +#include +#include +#include +#include + +struct semaphore firewall_sem = MUTEX; +static int firewall_policy[NPROTO]; +static struct firewall_ops *firewall_chain[NPROTO]; + +/* + * Register a firewall + */ + +int register_firewall(int pf, struct firewall_ops *fw) +{ + struct firewall_ops **p; + + if(pf<0||pf>=NPROTO) + return -EINVAL; + + /* + * Don't allow two people to adjust at once. + */ + + down(&firewall_sem); + + p=&firewall_chain[pf]; + + while(*p) + { + if(fw->fw_priority > (*p)->fw_priority) + break; + p=&((*p)->next); + } + + /* + * We need to use a memory barrier to make sure that this + * works correctly even in SMP with weakly ordered writes. + * + * This is atomic wrt interrupts (and generally walking the + * chain), but not wrt itself (so you can't call this from + * an interrupt. Not that you'd want to). + */ + + fw->next=*p; + mb(); + *p = fw; + + /* + * And release the sleep lock + */ + + up(&firewall_sem); + return 0; +} + +/* + * Unregister a firewall + */ + +int unregister_firewall(int pf, struct firewall_ops *fw) +{ + struct firewall_ops **nl; + + if(pf<0||pf>=NPROTO) + return -EINVAL; + + /* + * Don't allow two people to adjust at once. 
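+ *
+ * Writers serialise on firewall_sem; readers (call_in_firewall()
+ * and friends, typically running from bottom halves) walk the
+ * chain without any lock.  Hence the entry is unlinked first and
+ * synchronize_bh() is called afterwards, so that no reader can
+ * still be holding a pointer to the removed firewall_ops when
+ * the caller frees it.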
+ */ + + down(&firewall_sem); + + nl=&firewall_chain[pf]; + + while(*nl!=NULL) + { + if(*nl==fw) + { + struct firewall_ops *f=fw->next; + *nl = f; + up(&firewall_sem); + synchronize_bh(); + return 0; + } + nl=&((*nl)->next); + } + up(&firewall_sem); + return -ENOENT; +} + +int call_fw_firewall(int pf, struct device *dev, void *phdr, void *arg, struct sk_buff **skb) +{ + struct firewall_ops *fw=firewall_chain[pf]; + + while(fw!=NULL) + { + int rc=fw->fw_forward(fw,pf,dev,phdr,arg,skb); + if(rc!=FW_SKIP) + return rc; + fw=fw->next; + } + return firewall_policy[pf]; +} + +/* + * Actual invocation of the chains + */ + +int call_in_firewall(int pf, struct device *dev, void *phdr, void *arg, struct sk_buff **skb) +{ + struct firewall_ops *fw=firewall_chain[pf]; + + while(fw!=NULL) + { + int rc=fw->fw_input(fw,pf,dev,phdr,arg,skb); + if(rc!=FW_SKIP) + return rc; + fw=fw->next; + } + return firewall_policy[pf]; +} + +int call_out_firewall(int pf, struct device *dev, void *phdr, void *arg, struct sk_buff **skb) +{ + struct firewall_ops *fw=firewall_chain[pf]; + + while(fw!=NULL) + { + int rc=fw->fw_output(fw,pf,dev,phdr,arg,skb); + if(rc!=FW_SKIP) + return rc; + fw=fw->next; + } + /* alan, is this right? */ + return firewall_policy[pf]; +} + +EXPORT_SYMBOL(register_firewall); +EXPORT_SYMBOL(unregister_firewall); +EXPORT_SYMBOL(call_in_firewall); +EXPORT_SYMBOL(call_out_firewall); +EXPORT_SYMBOL(call_fw_firewall); + +__initfunc(void fwchain_init(void)) +{ + int i; + for(i=0;i +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Verify iovec. The caller must ensure that the iovec is big enough + * to hold the message iovec. + * + * Save time not doing verify_area. copy_*_user will make this work + * in any case. + */ + +int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) +{ + int size, err, ct; + + if(m->msg_namelen) + { + if(mode==VERIFY_READ) + { + err=move_addr_to_kernel(m->msg_name, m->msg_namelen, address); + if(err<0) + goto out; + } + + m->msg_name = address; + } else + m->msg_name = NULL; + + err = -EFAULT; + size = m->msg_iovlen * sizeof(struct iovec); + if (copy_from_user(iov, m->msg_iov, size)) + goto out; + m->msg_iov=iov; + + for (err = 0, ct = 0; ct < m->msg_iovlen; ct++) { + err += iov[ct].iov_len; + /* Goal is not to verify user data, but to prevent returning + negative value, which is interpreted as errno. + Overflow is still possible, but it is harmless. + */ + if (err < 0) + return -EMSGSIZE; + } +out: + return err; +} + +/* + * Copy kernel to iovec. Returns -EFAULT on error. + * + * Note: this modifies the original iovec. + */ + +int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) +{ + int err = -EFAULT; + + while(len>0) + { + if(iov->iov_len) + { + int copy = min(iov->iov_len, len); + if (copy_to_user(iov->iov_base, kdata, copy)) + goto out; + kdata+=copy; + len-=copy; + iov->iov_len-=copy; + iov->iov_base+=copy; + } + iov++; + } + err = 0; +out: + return err; +} + +/* + * In kernel copy to iovec. Returns -EFAULT on error. + * + * Note: this modifies the original iovec. + */ + +void memcpy_tokerneliovec(struct iovec *iov, unsigned char *kdata, int len) +{ + while(len>0) + { + if(iov->iov_len) + { + int copy = min(iov->iov_len, len); + memcpy(iov->iov_base, kdata, copy); + kdata+=copy; + len-=copy; + iov->iov_len-=copy; + iov->iov_base+=copy; + } + iov++; + } +} + + +/* + * Copy iovec to kernel. Returns -EFAULT on error. + * + * Note: this modifies the original iovec. 
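+ * "Modifies" means iov_base is advanced and iov_len decremented
+ * as data is consumed, so successive calls pick up where the
+ * previous one stopped.  Callers in the socket code therefore
+ * work on the kernel scratch copy set up by verify_iovec(), not
+ * on the array the user passed in.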
+ */ + +int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) +{ + int err = -EFAULT; + + while(len>0) + { + if(iov->iov_len) + { + int copy = min(len, iov->iov_len); + if (copy_from_user(kdata, iov->iov_base, copy)) + goto out; + len-=copy; + kdata+=copy; + iov->iov_base+=copy; + iov->iov_len-=copy; + } + iov++; + } + err = 0; +out: + return err; +} + + +/* + * For use with ip_build_xmit + */ + +int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, + int len) +{ + int err = -EFAULT; + + /* Skip over the finished iovecs */ + while(offset >= iov->iov_len) + { + offset -= iov->iov_len; + iov++; + } + + while (len > 0) + { + u8 *base = iov->iov_base + offset; + int copy = min(len, iov->iov_len - offset); + + offset = 0; + if (copy_from_user(kdata, base, copy)) + goto out; + len -= copy; + kdata += copy; + iov++; + } + err = 0; +out: + return err; +} + +/* + * And now for the all-in-one: copy and checksum from a user iovec + * directly to a datagram + * Calls to csum_partial but the last must be in 32 bit chunks + * + * ip_build_xmit must ensure that when fragmenting only the last + * call to this function will be unaligned also. + */ + +int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, + int offset, unsigned int len, int *csump) +{ + int csum = *csump; + int partial_cnt = 0, err = 0; + + /* Skip over the finished iovecs */ + while (offset >= iov->iov_len) + { + offset -= iov->iov_len; + iov++; + } + + while (len > 0) + { + u8 *base = iov->iov_base + offset; + unsigned int copy = min(len, iov->iov_len - offset); + + offset = 0; + /* There is a remnant from previous iov. */ + if (partial_cnt) + { + int par_len = 4 - partial_cnt; + + /* iov component is too short ... */ + if (par_len > copy) { + if (copy_from_user(kdata, base, copy)) + goto out_fault; + kdata += copy; + base += copy; + partial_cnt += copy; + len -= copy; + iov++; + if (len) + continue; + *csump = csum_partial(kdata - partial_cnt, + partial_cnt, csum); + goto out; + } + if (copy_from_user(kdata, base, par_len)) + goto out_fault; + csum = csum_partial(kdata - partial_cnt, 4, csum); + kdata += par_len; + base += par_len; + copy -= par_len; + len -= par_len; + partial_cnt = 0; + } + + if (len > copy) + { + partial_cnt = copy % 4; + if (partial_cnt) + { + copy -= partial_cnt; + if (copy_from_user(kdata + copy, base + copy, + partial_cnt)) + goto out_fault; + } + } + + if (copy) { + csum = csum_and_copy_from_user(base, kdata, copy, + csum, &err); + if (err) + goto out; + } + len -= copy + partial_cnt; + kdata += copy + partial_cnt; + iov++; + } + *csump = csum; +out: + return err; + +out_fault: + err = -EFAULT; + goto out; +} diff --git a/pfinet/linux-src/net/core/neighbour.c b/pfinet/linux-src/net/core/neighbour.c new file mode 100644 index 00000000..6afbfdcc --- /dev/null +++ b/pfinet/linux-src/net/core/neighbour.c @@ -0,0 +1,1394 @@ +/* + * Generic address resolution entity + * + * Authors: + * Pedro Roque + * Alexey Kuznetsov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Vitaly E. Lavrov releasing NULL neighbor in neigh_add. + */ + +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif +#include +#include +#include +#include + +/* + NOTE. 
The most unpleasent question is serialization of + accesses to resolved addresses. The problem is that addresses + are modified by bh, but they are referenced from normal + kernel thread. Before today no locking was made. + My reasoning was that corrupted address token will be copied + to packet with cosmologically small probability + (it is even difficult to estimate such small number) + and it is very silly to waste cycles in fast path to lock them. + + But now I changed my mind, but not because previous statement + is wrong. Actually, neigh->ha MAY BE not opaque byte array, + but reference to some private data. In this case even neglibible + corruption probability becomes bug. + + - hh cache is protected by rwlock. It assumes that + hh cache update procedure is short and fast, and that + read_lock is cheaper than start_bh_atomic(). + - ha tokens, saved in neighbour entries, are protected + by bh_atomic(). + - no protection is made in /proc reading. It is OK, because + /proc is broken by design in any case, and + corrupted output is normal behaviour there. + + --ANK (981025) + */ + +#define NEIGH_DEBUG 1 + +#define NEIGH_PRINTK(x...) printk(x) +#define NEIGH_NOPRINTK(x...) do { ; } while(0) +#define NEIGH_PRINTK0 NEIGH_PRINTK +#define NEIGH_PRINTK1 NEIGH_NOPRINTK +#define NEIGH_PRINTK2 NEIGH_NOPRINTK + +#if NEIGH_DEBUG >= 1 +#undef NEIGH_PRINTK1 +#define NEIGH_PRINTK1 NEIGH_PRINTK +#endif +#if NEIGH_DEBUG >= 2 +#undef NEIGH_PRINTK2 +#define NEIGH_PRINTK2 NEIGH_PRINTK +#endif + +static void neigh_timer_handler(unsigned long arg); +#ifdef CONFIG_ARPD +static void neigh_app_notify(struct neighbour *n); +#endif +static int pneigh_ifdown(struct neigh_table *tbl, struct device *dev); + +static int neigh_glbl_allocs; +static struct neigh_table *neigh_tables; + +static int neigh_blackhole(struct sk_buff *skb) +{ + kfree_skb(skb); + return -ENETDOWN; +} + +/* + * It is random distribution in the interval (1/2)*base...(3/2)*base. + * It corresponds to default IPv6 settings and is not overridable, + * because it is really reasonbale choice. + */ + +unsigned long neigh_rand_reach_time(unsigned long base) +{ + return (net_random() % base) + (base>>1); +} + + +static int neigh_forced_gc(struct neigh_table *tbl) +{ + int shrunk = 0; + int i; + + if (atomic_read(&tbl->lock)) + return 0; + + for (i=0; i<=NEIGH_HASHMASK; i++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + /* Neighbour record may be discarded if: + - nobody refers to it. + - it is not premanent + - (NEW and probably wrong) + INCOMPLETE entries are kept at least for + n->parms->retrans_time, otherwise we could + flood network with resolution requests. + It is not clear, what is better table overflow + or flooding. 
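+ In code terms, an entry is reclaimed only when
+   refcnt == 0 && !(nud_state & NUD_PERMANENT) &&
+   (nud_state != NUD_INCOMPLETE ||
+    jiffies - used > parms->retrans_time)
+ which is exactly the test below.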
+ */ + if (atomic_read(&n->refcnt) == 0 && + !(n->nud_state&NUD_PERMANENT) && + (n->nud_state != NUD_INCOMPLETE || + jiffies - n->used > n->parms->retrans_time)) { + *np = n->next; + n->tbl = NULL; + tbl->entries--; + shrunk = 1; + neigh_destroy(n); + continue; + } + np = &n->next; + } + } + + tbl->last_flush = jiffies; + return shrunk; +} + +int neigh_ifdown(struct neigh_table *tbl, struct device *dev) +{ + int i; + + if (atomic_read(&tbl->lock)) { + NEIGH_PRINTK1("neigh_ifdown: impossible event 1763\n"); + return -EBUSY; + } + + start_bh_atomic(); + for (i=0; i<=NEIGH_HASHMASK; i++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + if (dev && n->dev != dev) { + np = &n->next; + continue; + } + *np = n->next; + n->tbl = NULL; + tbl->entries--; + if (atomic_read(&n->refcnt)) { + /* The most unpleasant situation. + We must destroy neighbour entry, + but someone still uses it. + + The destroy will be delayed until + the last user releases us, but + we must kill timers etc. and move + it to safe state. + */ + if (n->nud_state & NUD_IN_TIMER) + del_timer(&n->timer); + n->parms = &tbl->parms; + skb_queue_purge(&n->arp_queue); + n->output = neigh_blackhole; + if (n->nud_state&NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + NEIGH_PRINTK2("neigh %p is stray.\n", n); + } else + neigh_destroy(n); + } + } + + del_timer(&tbl->proxy_timer); + skb_queue_purge(&tbl->proxy_queue); + pneigh_ifdown(tbl, dev); + end_bh_atomic(); + return 0; +} + +static struct neighbour *neigh_alloc(struct neigh_table *tbl, int creat) +{ + struct neighbour *n; + unsigned long now = jiffies; + + if (tbl->entries > tbl->gc_thresh1) { + if (creat < 0) + return NULL; + if (tbl->entries > tbl->gc_thresh3 || + (tbl->entries > tbl->gc_thresh2 && + now - tbl->last_flush > 5*HZ)) { + if (neigh_forced_gc(tbl) == 0 && + tbl->entries > tbl->gc_thresh3) + return NULL; + } + } + + n = kmalloc(tbl->entry_size, GFP_ATOMIC); + if (n == NULL) + return NULL; + + memset(n, 0, tbl->entry_size); + + skb_queue_head_init(&n->arp_queue); + n->updated = n->used = now; + n->nud_state = NUD_NONE; + n->output = neigh_blackhole; + n->parms = &tbl->parms; + init_timer(&n->timer); + n->timer.function = neigh_timer_handler; + n->timer.data = (unsigned long)n; + tbl->stats.allocs++; + neigh_glbl_allocs++; + return n; +} + + +struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey, + struct device *dev, int creat) +{ + struct neighbour *n; + u32 hash_val; + int key_len = tbl->key_len; + + hash_val = *(u32*)(pkey + key_len - 4); + hash_val ^= (hash_val>>16); + hash_val ^= hash_val>>8; + hash_val ^= hash_val>>3; + hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK; + + for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { + if (dev == n->dev && + memcmp(n->primary_key, pkey, key_len) == 0) { + atomic_inc(&n->refcnt); + return n; + } + } + if (!creat) + return NULL; + + n = neigh_alloc(tbl, creat); + if (n == NULL) + return NULL; + + memcpy(n->primary_key, pkey, key_len); + n->dev = dev; + + /* Protocol specific setup. */ + if (tbl->constructor && tbl->constructor(n) < 0) { + neigh_destroy(n); + return NULL; + } + + /* Device specific setup. 
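+ If the optional parms->neigh_setup() hook is present and fails,
+ the half-built entry is destroyed and the lookup returns NULL,
+ just as a failing protocol constructor is handled above.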
*/ + if (n->parms && n->parms->neigh_setup && n->parms->neigh_setup(n) < 0) { + neigh_destroy(n); + return NULL; + } + + n->confirmed = jiffies - (n->parms->base_reachable_time<<1); + atomic_set(&n->refcnt, 1); + tbl->entries++; + n->next = tbl->hash_buckets[hash_val]; + tbl->hash_buckets[hash_val] = n; + n->tbl = tbl; + NEIGH_PRINTK2("neigh %p is created.\n", n); + return n; +} + +struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, + struct device *dev, int creat) +{ + struct pneigh_entry *n; + u32 hash_val; + int key_len = tbl->key_len; + + hash_val = *(u32*)(pkey + key_len - 4); + hash_val ^= (hash_val>>16); + hash_val ^= hash_val>>8; + hash_val ^= hash_val>>4; + hash_val &= PNEIGH_HASHMASK; + + for (n = tbl->phash_buckets[hash_val]; n; n = n->next) { + if (memcmp(n->key, pkey, key_len) == 0 && + (n->dev == dev || !n->dev)) + return n; + } + if (!creat) + return NULL; + + n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL); + if (n == NULL) + return NULL; + + memcpy(n->key, pkey, key_len); + n->dev = dev; + + if (tbl->pconstructor && tbl->pconstructor(n)) { + kfree(n); + return NULL; + } + + n->next = tbl->phash_buckets[hash_val]; + tbl->phash_buckets[hash_val] = n; + return n; +} + + +int pneigh_delete(struct neigh_table *tbl, const void *pkey, struct device *dev) +{ + struct pneigh_entry *n, **np; + u32 hash_val; + int key_len = tbl->key_len; + + hash_val = *(u32*)(pkey + key_len - 4); + hash_val ^= (hash_val>>16); + hash_val ^= hash_val>>8; + hash_val ^= hash_val>>4; + hash_val &= PNEIGH_HASHMASK; + + for (np = &tbl->phash_buckets[hash_val]; (n=*np) != NULL; np = &n->next) { + if (memcmp(n->key, pkey, key_len) == 0 && n->dev == dev) { + *np = n->next; + synchronize_bh(); + if (tbl->pdestructor) + tbl->pdestructor(n); + kfree(n); + return 0; + } + } + return -ENOENT; +} + +static int pneigh_ifdown(struct neigh_table *tbl, struct device *dev) +{ + struct pneigh_entry *n, **np; + u32 h; + + for (h=0; h<=PNEIGH_HASHMASK; h++) { + np = &tbl->phash_buckets[h]; + while ((n=*np) != NULL) { + if (n->dev == dev || dev == NULL) { + *np = n->next; + synchronize_bh(); + if (tbl->pdestructor) + tbl->pdestructor(n); + kfree(n); + continue; + } + np = &n->next; + } + } + return -ENOENT; +} + + +/* + * neighbour must already be out of the table; + * + */ +void neigh_destroy(struct neighbour *neigh) +{ + struct hh_cache *hh; + + if (neigh->tbl || atomic_read(&neigh->refcnt)) { + NEIGH_PRINTK1("neigh_destroy: neighbour is use tbl=%p, ref=%d: " + "called from %p\n", neigh->tbl, atomic_read(&neigh->refcnt), __builtin_return_address(0)); + return; + } + + if (neigh->nud_state&NUD_IN_TIMER) + del_timer(&neigh->timer); + + while ((hh = neigh->hh) != NULL) { + neigh->hh = hh->hh_next; + hh->hh_next = NULL; + hh->hh_output = neigh_blackhole; + if (atomic_dec_and_test(&hh->hh_refcnt)) + kfree(hh); + } + + if (neigh->ops && neigh->ops->destructor) + (neigh->ops->destructor)(neigh); + + skb_queue_purge(&neigh->arp_queue); + + NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); + + neigh_glbl_allocs--; + kfree(neigh); +} + +/* Neighbour state is suspicious; + disable fast path. + */ +static void neigh_suspect(struct neighbour *neigh) +{ + struct hh_cache *hh; + + NEIGH_PRINTK2("neigh %p is suspecteded.\n", neigh); + + neigh->output = neigh->ops->output; + + for (hh = neigh->hh; hh; hh = hh->hh_next) + hh->hh_output = neigh->ops->output; +} + +/* Neighbour state is OK; + enable fast path. 
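+ Connected means packets go straight out through
+ ops->connected_output and, for cached link layer headers,
+ ops->hh_output, with no further reachability checks;
+ neigh_suspect() above is the inverse and routes everything back
+ through ops->output so the entry gets re-validated.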
+ */ +static void neigh_connect(struct neighbour *neigh) +{ + struct hh_cache *hh; + + NEIGH_PRINTK2("neigh %p is connected.\n", neigh); + + neigh->output = neigh->ops->connected_output; + + for (hh = neigh->hh; hh; hh = hh->hh_next) + hh->hh_output = neigh->ops->hh_output; +} + +/* + Transitions NUD_STALE <-> NUD_REACHABLE do not occur + when fast path is built: we have no timers assotiated with + these states, we do not have time to check state when sending. + neigh_periodic_timer check periodically neigh->confirmed + time and moves NUD_REACHABLE -> NUD_STALE. + + If a routine wants to know TRUE entry state, it calls + neigh_sync before checking state. + */ + +static void neigh_sync(struct neighbour *n) +{ + unsigned long now = jiffies; + u8 state = n->nud_state; + + if (state&(NUD_NOARP|NUD_PERMANENT)) + return; + if (state&NUD_REACHABLE) { + if (now - n->confirmed > n->parms->reachable_time) { + n->nud_state = NUD_STALE; + neigh_suspect(n); + } + } else if (state&NUD_VALID) { + if (now - n->confirmed < n->parms->reachable_time) { + if (state&NUD_IN_TIMER) + del_timer(&n->timer); + n->nud_state = NUD_REACHABLE; + neigh_connect(n); + } + } +} + +static void neigh_periodic_timer(unsigned long arg) +{ + struct neigh_table *tbl = (struct neigh_table*)arg; + unsigned long now = jiffies; + int i; + + if (atomic_read(&tbl->lock)) { + tbl->gc_timer.expires = now + 1*HZ; + add_timer(&tbl->gc_timer); + return; + } + + /* + * periodicly recompute ReachableTime from random function + */ + + if (now - tbl->last_rand > 300*HZ) { + struct neigh_parms *p; + tbl->last_rand = now; + for (p=&tbl->parms; p; p = p->next) + p->reachable_time = neigh_rand_reach_time(p->base_reachable_time); + } + + for (i=0; i <= NEIGH_HASHMASK; i++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + unsigned state = n->nud_state; + + if (state&(NUD_PERMANENT|NUD_IN_TIMER)) + goto next_elt; + + if ((long)(n->used - n->confirmed) < 0) + n->used = n->confirmed; + + if (atomic_read(&n->refcnt) == 0 && + (state == NUD_FAILED || now - n->used > n->parms->gc_staletime)) { + *np = n->next; + n->tbl = NULL; + n->next = NULL; + tbl->entries--; + neigh_destroy(n); + continue; + } + + if (n->nud_state&NUD_REACHABLE && + now - n->confirmed > n->parms->reachable_time) { + n->nud_state = NUD_STALE; + neigh_suspect(n); + } + +next_elt: + np = &n->next; + } + } + + tbl->gc_timer.expires = now + tbl->gc_interval; + add_timer(&tbl->gc_timer); +} + +static __inline__ int neigh_max_probes(struct neighbour *n) +{ + struct neigh_parms *p = n->parms; + return p->ucast_probes + p->app_probes + p->mcast_probes; +} + + +/* Called when a timer expires for a neighbour entry. */ + +static void neigh_timer_handler(unsigned long arg) +{ + unsigned long now = jiffies; + struct neighbour *neigh = (struct neighbour*)arg; + unsigned state = neigh->nud_state; + + if (!(state&NUD_IN_TIMER)) { + NEIGH_PRINTK1("neigh: timer & !nud_in_timer\n"); + return; + } + + if ((state&NUD_VALID) && + now - neigh->confirmed < neigh->parms->reachable_time) { + neigh->nud_state = NUD_REACHABLE; + NEIGH_PRINTK2("neigh %p is still alive.\n", neigh); + neigh_connect(neigh); + return; + } + if (state == NUD_DELAY) { + NEIGH_PRINTK2("neigh %p is probed.\n", neigh); + neigh->nud_state = NUD_PROBE; + neigh->probes = 0; + } + + if (neigh->probes >= neigh_max_probes(neigh)) { + struct sk_buff *skb; + + neigh->nud_state = NUD_FAILED; + neigh->tbl->stats.res_failed++; + NEIGH_PRINTK2("neigh %p is failed.\n", neigh); + + /* It is very thin place. 
report_unreachable is very complicated + routine. Particularly, it can hit the same neighbour entry! + + So that, we try to be accurate and avoid dead loop. --ANK + */ + while(neigh->nud_state==NUD_FAILED && (skb=__skb_dequeue(&neigh->arp_queue)) != NULL) + neigh->ops->error_report(neigh, skb); + skb_queue_purge(&neigh->arp_queue); + return; + } + + neigh->timer.expires = now + neigh->parms->retrans_time; + add_timer(&neigh->timer); + + neigh->ops->solicit(neigh, skb_peek(&neigh->arp_queue)); + neigh->probes++; +} + +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ + start_bh_atomic(); + if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) { + if (!(neigh->nud_state&(NUD_STALE|NUD_INCOMPLETE))) { + if (neigh->tbl == NULL) { + NEIGH_PRINTK2("neigh %p used after death.\n", neigh); + if (skb) + kfree_skb(skb); + end_bh_atomic(); + return 1; + } + if (neigh->parms->mcast_probes + neigh->parms->app_probes) { + neigh->probes = neigh->parms->ucast_probes; + neigh->nud_state = NUD_INCOMPLETE; + neigh->timer.expires = jiffies + neigh->parms->retrans_time; + add_timer(&neigh->timer); + + neigh->ops->solicit(neigh, skb); + neigh->probes++; + } else { + neigh->nud_state = NUD_FAILED; + if (skb) + kfree_skb(skb); + end_bh_atomic(); + return 1; + } + } + if (neigh->nud_state == NUD_INCOMPLETE) { + if (skb) { + if (skb_queue_len(&neigh->arp_queue) >= neigh->parms->queue_len) { + struct sk_buff *buff; + buff = neigh->arp_queue.prev; + __skb_unlink(buff, &neigh->arp_queue); + kfree_skb(buff); + } + __skb_queue_head(&neigh->arp_queue, skb); + } + end_bh_atomic(); + return 1; + } + if (neigh->nud_state == NUD_STALE) { + NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh->nud_state = NUD_DELAY; + neigh->timer.expires = jiffies + neigh->parms->delay_probe_time; + add_timer(&neigh->timer); + } + } + end_bh_atomic(); + return 0; +} + +static __inline__ void neigh_update_hhs(struct neighbour *neigh) +{ + struct hh_cache *hh; + void (*update)(struct hh_cache*, struct device*, unsigned char*) = + neigh->dev->header_cache_update; + + if (update) { + for (hh=neigh->hh; hh; hh=hh->hh_next) { + write_lock_irq(&hh->hh_lock); + update(hh, neigh->dev, neigh->ha); + write_unlock_irq(&hh->hh_lock); + } + } +} + + + +/* Generic update routine. + -- lladdr is new lladdr or NULL, if it is not supplied. + -- new is new state. + -- override==1 allows to override existing lladdr, if it is different. + -- arp==0 means that the change is administrative. + */ + +int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int arp) +{ + u8 old = neigh->nud_state; + struct device *dev = neigh->dev; + + if (arp && (old&(NUD_NOARP|NUD_PERMANENT))) + return -EPERM; + + if (!(new&NUD_VALID)) { + if (old&NUD_IN_TIMER) + del_timer(&neigh->timer); + if (old&NUD_CONNECTED) + neigh_suspect(neigh); + neigh->nud_state = new; + return 0; + } + + /* Compare new lladdr with cached one */ + if (dev->addr_len == 0) { + /* First case: device needs no address. */ + lladdr = neigh->ha; + } else if (lladdr) { + /* The second case: if something is already cached + and a new address is proposed: + - compare new & old + - if they are different, check override flag + */ + if (old&NUD_VALID) { + if (memcmp(lladdr, neigh->ha, dev->addr_len) == 0) + lladdr = neigh->ha; + else if (!override) + return -EPERM; + } + } else { + /* No address is supplied; if we know something, + use it, otherwise discard the request. 
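+ To summarise the three cases: a device with addr_len 0 always
+ reuses neigh->ha, a supplied address may replace a different
+ cached one only when override is set, and a missing address is
+ accepted only if the entry already holds a valid one.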
+ */ + if (!(old&NUD_VALID)) + return -EINVAL; + lladdr = neigh->ha; + } + + neigh_sync(neigh); + old = neigh->nud_state; + if (new&NUD_CONNECTED) + neigh->confirmed = jiffies; + neigh->updated = jiffies; + + /* If entry was valid and address is not changed, + do not change entry state, if new one is STALE. + */ + if (old&NUD_VALID) { + if (lladdr == neigh->ha) + if (new == old || (new == NUD_STALE && (old&NUD_CONNECTED))) + return 0; + } + if (old&NUD_IN_TIMER) + del_timer(&neigh->timer); + neigh->nud_state = new; + if (lladdr != neigh->ha) { + memcpy(&neigh->ha, lladdr, dev->addr_len); + neigh_update_hhs(neigh); + neigh->confirmed = jiffies - (neigh->parms->base_reachable_time<<1); +#ifdef CONFIG_ARPD + if (neigh->parms->app_probes) + neigh_app_notify(neigh); +#endif + } + if (new == old) + return 0; + if (new&NUD_CONNECTED) + neigh_connect(neigh); + else + neigh_suspect(neigh); + if (!(old&NUD_VALID)) { + struct sk_buff *skb; + + /* Again: avoid dead loop if something went wrong */ + + while (neigh->nud_state&NUD_VALID && + (skb=__skb_dequeue(&neigh->arp_queue)) != NULL) { + struct neighbour *n1 = neigh; + /* On shaper/eql skb->dst->neighbour != neigh :( */ + if (skb->dst && skb->dst->neighbour) + n1 = skb->dst->neighbour; + n1->output(skb); + } + skb_queue_purge(&neigh->arp_queue); + } + return 0; +} + +struct neighbour * neigh_event_ns(struct neigh_table *tbl, + u8 *lladdr, void *saddr, + struct device *dev) +{ + struct neighbour *neigh; + + neigh = __neigh_lookup(tbl, saddr, dev, lladdr || !dev->addr_len); + if (neigh) + neigh_update(neigh, lladdr, NUD_STALE, 1, 1); + return neigh; +} + +static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, u16 protocol) +{ + struct hh_cache *hh = NULL; + struct device *dev = dst->dev; + + for (hh=n->hh; hh; hh = hh->hh_next) + if (hh->hh_type == protocol) + break; + + if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) { + memset(hh, 0, sizeof(struct hh_cache)); + hh->hh_type = protocol; + atomic_set(&hh->hh_refcnt, 0); + hh->hh_next = NULL; + if (dev->hard_header_cache(n, hh)) { + kfree(hh); + hh = NULL; + } else { + atomic_inc(&hh->hh_refcnt); + hh->hh_next = n->hh; + n->hh = hh; + if (n->nud_state&NUD_CONNECTED) + hh->hh_output = n->ops->hh_output; + else + hh->hh_output = n->ops->output; + } + } + if (hh) { + atomic_inc(&hh->hh_refcnt); + dst->hh = hh; + } +} + +/* This function can be used in contexts, where only old dev_queue_xmit + worked, f.e. if you want to override normal output path (eql, shaper), + but resoltution is not made yet. + */ + +int neigh_compat_output(struct sk_buff *skb) +{ + struct device *dev = skb->dev; + + __skb_pull(skb, skb->nh.raw - skb->data); + + if (dev->hard_header && + dev->hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, skb->len) < 0 && + dev->rebuild_header(skb)) + return 0; + + return dev_queue_xmit(skb); +} + +/* Slow and careful. 
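+ This is the output method used while the entry is not known to
+ be reachable: neigh_event_send() may queue the skb and kick off
+ resolution, and only when it returns 0 is the hardware header
+ built and the packet handed to ops->queue_xmit().  On the first
+ transmit it also populates the hh cache if the device has one.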
*/ + +int neigh_resolve_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct neighbour *neigh; + + if (!dst || !(neigh = dst->neighbour)) + goto discard; + + __skb_pull(skb, skb->nh.raw - skb->data); + + if (neigh_event_send(neigh, skb) == 0) { + int err; + struct device *dev = neigh->dev; + if (dev->hard_header_cache && dst->hh == NULL) { + start_bh_atomic(); + if (dst->hh == NULL) + neigh_hh_init(neigh, dst, dst->ops->protocol); + err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); + end_bh_atomic(); + } else { + start_bh_atomic(); + err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); + end_bh_atomic(); + } + if (err >= 0) + return neigh->ops->queue_xmit(skb); + kfree_skb(skb); + return -EINVAL; + } + return 0; + +discard: + NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", dst, dst ? dst->neighbour : NULL); + kfree_skb(skb); + return -EINVAL; +} + +/* As fast as possible without hh cache */ + +int neigh_connected_output(struct sk_buff *skb) +{ + int err; + struct dst_entry *dst = skb->dst; + struct neighbour *neigh = dst->neighbour; + struct device *dev = neigh->dev; + + __skb_pull(skb, skb->nh.raw - skb->data); + + start_bh_atomic(); + err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); + end_bh_atomic(); + if (err >= 0) + return neigh->ops->queue_xmit(skb); + kfree_skb(skb); + return -EINVAL; +} + +static void neigh_proxy_process(unsigned long arg) +{ + struct neigh_table *tbl = (struct neigh_table *)arg; + long sched_next = 0; + unsigned long now = jiffies; + struct sk_buff *skb = tbl->proxy_queue.next; + + while (skb != (struct sk_buff*)&tbl->proxy_queue) { + struct sk_buff *back = skb; + long tdif = back->stamp.tv_usec - now; + + skb = skb->next; + if (tdif <= 0) { + __skb_unlink(back, &tbl->proxy_queue); + if (tbl->proxy_redo) + tbl->proxy_redo(back); + else + kfree_skb(back); + } else if (!sched_next || tdif < sched_next) + sched_next = tdif; + } + del_timer(&tbl->proxy_timer); + if (sched_next) { + tbl->proxy_timer.expires = jiffies + sched_next; + add_timer(&tbl->proxy_timer); + } +} + +void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, + struct sk_buff *skb) +{ + unsigned long now = jiffies; + long sched_next = net_random()%p->proxy_delay; + + if (tbl->proxy_queue.qlen > p->proxy_qlen) { + kfree_skb(skb); + return; + } + skb->stamp.tv_sec = 0; + skb->stamp.tv_usec = now + sched_next; + if (del_timer(&tbl->proxy_timer)) { + long tval = tbl->proxy_timer.expires - now; + if (tval < sched_next) + sched_next = tval; + } + tbl->proxy_timer.expires = now + sched_next; + dst_release(skb->dst); + skb->dst = NULL; + __skb_queue_tail(&tbl->proxy_queue, skb); + add_timer(&tbl->proxy_timer); +} + + +struct neigh_parms *neigh_parms_alloc(struct device *dev, struct neigh_table *tbl) +{ + struct neigh_parms *p; + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p) { + memcpy(p, &tbl->parms, sizeof(*p)); + p->tbl = tbl; + p->reachable_time = neigh_rand_reach_time(p->base_reachable_time); + if (dev && dev->neigh_setup) { + if (dev->neigh_setup(dev, p)) { + kfree(p); + return NULL; + } + } + p->next = tbl->parms.next; + tbl->parms.next = p; + } + return p; +} + +void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) +{ + struct neigh_parms **p; + + if (parms == NULL || parms == &tbl->parms) + return; + for (p = &tbl->parms.next; *p; p = &(*p)->next) { + if (*p == parms) { + *p = parms->next; + synchronize_bh(); +#ifdef CONFIG_SYSCTL + 
neigh_sysctl_unregister(parms); +#endif + kfree(parms); + return; + } + } + NEIGH_PRINTK1("neigh_release_parms: not found\n"); +} + + +void neigh_table_init(struct neigh_table *tbl) +{ + unsigned long now = jiffies; + + tbl->parms.reachable_time = neigh_rand_reach_time(tbl->parms.base_reachable_time); + + init_timer(&tbl->gc_timer); + tbl->gc_timer.data = (unsigned long)tbl; + tbl->gc_timer.function = neigh_periodic_timer; + tbl->gc_timer.expires = now + tbl->gc_interval + tbl->parms.reachable_time; + add_timer(&tbl->gc_timer); + + init_timer(&tbl->proxy_timer); + tbl->proxy_timer.data = (unsigned long)tbl; + tbl->proxy_timer.function = neigh_proxy_process; + skb_queue_head_init(&tbl->proxy_queue); + + tbl->last_flush = now; + tbl->last_rand = now + tbl->parms.reachable_time*20; + tbl->next = neigh_tables; + neigh_tables = tbl; +} + +int neigh_table_clear(struct neigh_table *tbl) +{ + struct neigh_table **tp; + + start_bh_atomic(); + del_timer(&tbl->gc_timer); + del_timer(&tbl->proxy_timer); + skb_queue_purge(&tbl->proxy_queue); + neigh_ifdown(tbl, NULL); + end_bh_atomic(); + if (tbl->entries) + printk(KERN_CRIT "neighbour leakage\n"); + for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { + if (*tp == tbl) { + *tp = tbl->next; + synchronize_bh(); + break; + } + } +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&tbl->parms); +#endif + return 0; +} + +#ifdef CONFIG_RTNETLINK + + +int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct ndmsg *ndm = NLMSG_DATA(nlh); + struct rtattr **nda = arg; + struct neigh_table *tbl; + struct device *dev = NULL; + + if (ndm->ndm_ifindex) { + if ((dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + return -ENODEV; + } + + for (tbl=neigh_tables; tbl; tbl = tbl->next) { + int err = 0; + struct neighbour *n; + + if (tbl->family != ndm->ndm_family) + continue; + + if (nda[NDA_DST-1] == NULL || + nda[NDA_DST-1]->rta_len != RTA_LENGTH(tbl->key_len)) + return -EINVAL; + + if (ndm->ndm_flags&NTF_PROXY) + return pneigh_delete(tbl, RTA_DATA(nda[NDA_DST-1]), dev); + + if (dev == NULL) + return -EINVAL; + + start_bh_atomic(); + n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 0); + if (n) { + err = neigh_update(n, NULL, NUD_FAILED, 1, 0); + neigh_release(n); + } + end_bh_atomic(); + return err; + } + + return -EADDRNOTAVAIL; +} + +int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct ndmsg *ndm = NLMSG_DATA(nlh); + struct rtattr **nda = arg; + struct neigh_table *tbl; + struct device *dev = NULL; + + if (ndm->ndm_ifindex) { + if ((dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + return -ENODEV; + } + + for (tbl=neigh_tables; tbl; tbl = tbl->next) { + int err = 0; + struct neighbour *n; + + if (tbl->family != ndm->ndm_family) + continue; + if (nda[NDA_DST-1] == NULL || + nda[NDA_DST-1]->rta_len != RTA_LENGTH(tbl->key_len)) + return -EINVAL; + if (ndm->ndm_flags&NTF_PROXY) { + if (pneigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 1)) + return 0; + return -ENOBUFS; + } + if (dev == NULL) + return -EINVAL; + if (nda[NDA_LLADDR-1] != NULL && + nda[NDA_LLADDR-1]->rta_len != RTA_LENGTH(dev->addr_len)) + return -EINVAL; + start_bh_atomic(); + n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 0); + if (n) { + if (nlh->nlmsg_flags&NLM_F_EXCL) + err = -EEXIST; + } else if (!(nlh->nlmsg_flags&NLM_F_CREATE)) + err = -ENOENT; + else { + n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 1); + if (n == NULL) + err = -ENOBUFS; + } + if (err == 0) { + err = neigh_update(n, nda[NDA_LLADDR-1] ? 
RTA_DATA(nda[NDA_LLADDR-1]) : NULL, + ndm->ndm_state, + nlh->nlmsg_flags&NLM_F_REPLACE, 0); + } + if (n) + neigh_release(n); + end_bh_atomic(); + return err; + } + + return -EADDRNOTAVAIL; +} + + +static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n, + u32 pid, u32 seq, int event) +{ + unsigned long now = jiffies; + struct ndmsg *ndm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct nda_cacheinfo ci; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ndm)); + ndm = NLMSG_DATA(nlh); + ndm->ndm_family = n->ops->family; + ndm->ndm_flags = n->flags; + ndm->ndm_type = n->type; + ndm->ndm_state = n->nud_state; + ndm->ndm_ifindex = n->dev->ifindex; + RTA_PUT(skb, NDA_DST, n->tbl->key_len, n->primary_key); + if (n->nud_state&NUD_VALID) + RTA_PUT(skb, NDA_LLADDR, n->dev->addr_len, n->ha); + ci.ndm_used = now - n->used; + ci.ndm_confirmed = now - n->confirmed; + ci.ndm_updated = now - n->updated; + ci.ndm_refcnt = atomic_read(&n->refcnt); + RTA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + + +static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) +{ + struct neighbour *n; + int h, s_h; + int idx, s_idx; + + s_h = cb->args[1]; + s_idx = idx = cb->args[2]; + for (h=0; h <= NEIGH_HASHMASK; h++) { + if (h < s_h) continue; + if (h > s_h) + s_idx = 0; + start_bh_atomic(); + for (n = tbl->hash_buckets[h], idx = 0; n; + n = n->next, idx++) { + if (idx < s_idx) + continue; + if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWNEIGH) <= 0) { + end_bh_atomic(); + cb->args[1] = h; + cb->args[2] = idx; + return -1; + } + } + end_bh_atomic(); + } + + cb->args[1] = h; + cb->args[2] = idx; + return skb->len; +} + +int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct neigh_table *tbl; + int family = ((struct rtgenmsg*)NLMSG_DATA(cb->nlh))->rtgen_family; + + s_t = cb->args[0]; + + for (tbl=neigh_tables, t=0; tbl; tbl = tbl->next, t++) { + if (t < s_t) continue; + if (family && tbl->family != family) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + if (neigh_dump_table(tbl, skb, cb) < 0) + break; + } + + cb->args[0] = t; + + return skb->len; +} + +#ifdef CONFIG_ARPD +void neigh_app_ns(struct neighbour *n) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + int size = NLMSG_SPACE(sizeof(struct ndmsg)+256); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + return; + + if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH) < 0) { + kfree_skb(skb); + return; + } + nlh = (struct nlmsghdr*)skb->data; + nlh->nlmsg_flags = NLM_F_REQUEST; + NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); +} + +static void neigh_app_notify(struct neighbour *n) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + int size = NLMSG_SPACE(sizeof(struct ndmsg)+256); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + return; + + if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH) < 0) { + kfree_skb(skb); + return; + } + nlh = (struct nlmsghdr*)skb->data; + NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); +} + + + +#endif + + +#endif + +#ifdef CONFIG_SYSCTL + +struct neigh_sysctl_table +{ + struct ctl_table_header *sysctl_header; + ctl_table neigh_vars[17]; + ctl_table neigh_dev[2]; + ctl_table neigh_neigh_dir[2]; + 
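+ /* These directory tables are chained together in
+ neigh_sysctl_register() below to form the sysctl path
+ net.(protocol).neigh.(device or "default").(variable),
+ e.g. /proc/sys/net/ipv4/neigh/eth0/retrans_time. */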
ctl_table neigh_proto_dir[2]; + ctl_table neigh_root_dir[2]; +} neigh_sysctl_template = { + NULL, + {{NET_NEIGH_MCAST_SOLICIT, "mcast_solicit", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_UCAST_SOLICIT, "ucast_solicit", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_APP_SOLICIT, "app_solicit", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_RETRANS_TIME, "retrans_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_REACHABLE_TIME, "base_reachable_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_GC_STALE_TIME, "gc_stale_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_UNRES_QLEN, "unres_qlen", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_PROXY_QLEN, "proxy_qlen", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_ANYCAST_DELAY, "anycast_delay", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_PROXY_DELAY, "proxy_delay", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_LOCKTIME, "locktime", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_GC_INTERVAL, "gc_interval", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_GC_THRESH1, "gc_thresh1", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_GC_THRESH2, "gc_thresh2", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_GC_THRESH3, "gc_thresh3", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {0}}, + + {{NET_PROTO_CONF_DEFAULT, "default", NULL, 0, 0555, NULL},{0}}, + {{0, "neigh", NULL, 0, 0555, NULL},{0}}, + {{0, NULL, NULL, 0, 0555, NULL},{0}}, + {{CTL_NET, "net", NULL, 0, 0555, NULL},{0}} +}; + +int neigh_sysctl_register(struct device *dev, struct neigh_parms *p, + int p_id, int pdev_id, char *p_name) +{ + struct neigh_sysctl_table *t; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (t == NULL) + return -ENOBUFS; + memcpy(t, &neigh_sysctl_template, sizeof(*t)); + t->neigh_vars[0].data = &p->mcast_probes; + t->neigh_vars[1].data = &p->ucast_probes; + t->neigh_vars[2].data = &p->app_probes; + t->neigh_vars[3].data = &p->retrans_time; + t->neigh_vars[4].data = &p->base_reachable_time; + t->neigh_vars[5].data = &p->delay_probe_time; + t->neigh_vars[6].data = &p->gc_staletime; + t->neigh_vars[7].data = &p->queue_len; + t->neigh_vars[8].data = &p->proxy_qlen; + t->neigh_vars[9].data = &p->anycast_delay; + t->neigh_vars[10].data = &p->proxy_delay; + t->neigh_vars[11].data = &p->locktime; + if (dev) { + t->neigh_dev[0].procname = dev->name; + t->neigh_dev[0].ctl_name = dev->ifindex; + memset(&t->neigh_vars[12], 0, sizeof(ctl_table)); + } else { + t->neigh_vars[12].data = (int*)(p+1); + t->neigh_vars[13].data = (int*)(p+1) + 1; + t->neigh_vars[14].data = (int*)(p+1) + 2; + t->neigh_vars[15].data = (int*)(p+1) + 3; + } + t->neigh_neigh_dir[0].ctl_name = pdev_id; + + t->neigh_proto_dir[0].procname = p_name; + t->neigh_proto_dir[0].ctl_name = p_id; + + t->neigh_dev[0].child = t->neigh_vars; + t->neigh_neigh_dir[0].child = t->neigh_dev; + t->neigh_proto_dir[0].child = t->neigh_neigh_dir; + t->neigh_root_dir[0].child = t->neigh_proto_dir; + + t->sysctl_header = register_sysctl_table(t->neigh_root_dir, 0); + if (t->sysctl_header == NULL) { + kfree(t); + return -ENOBUFS; + } + p->sysctl_table = t; + return 0; +} + +void neigh_sysctl_unregister(struct neigh_parms *p) +{ + if 
(p->sysctl_table) { + struct neigh_sysctl_table *t = p->sysctl_table; + p->sysctl_table = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t); + } +} + +#endif /* CONFIG_SYSCTL */ diff --git a/pfinet/linux-src/net/core/profile.c b/pfinet/linux-src/net/core/profile.c new file mode 100644 index 00000000..fc7464b7 --- /dev/null +++ b/pfinet/linux-src/net/core/profile.c @@ -0,0 +1,305 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#ifdef CONFIG_NET_PROFILE + +atomic_t net_profile_active; +struct timeval net_profile_adjust; + +NET_PROFILE_DEFINE(total); + +struct net_profile_slot *net_profile_chain = &net_prof_total; + +#ifdef __alpha__ +__u32 alpha_lo; +long alpha_hi; + +static void alpha_tick(unsigned long); + +static struct timer_list alpha_timer = + { NULL, NULL, 0, 0L, alpha_tick }; + +void alpha_tick(unsigned long dummy) +{ + struct timeval dummy_stamp; + net_profile_stamp(&dummy_stamp); + alpha_timer.expires = jiffies + 4*HZ; + add_timer(&alpha_timer); +} + +#endif + +void net_profile_irq_adjust(struct timeval *entered, struct timeval* leaved) +{ + struct net_profile_slot *s; + + net_profile_sub(entered, leaved); + for (s = net_profile_chain; s; s = s->next) { + if (s->active) + net_profile_add(leaved, &s->irq); + } +} + + +#ifdef CONFIG_PROC_FS +static int profile_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos=0; + off_t begin=0; + int len=0; + struct net_profile_slot *s; + + len+= sprintf(buffer, "Slot Hits Hi Lo OnIrqHi OnIrqLo Ufl\n"); + + if (offset == 0) { + cli(); + net_prof_total.active = 1; + atomic_inc(&net_profile_active); + NET_PROFILE_LEAVE(total); + sti(); + } + for (s = net_profile_chain; s; s = s->next) { + struct net_profile_slot tmp; + + cli(); + tmp = *s; + + /* Wrong, but pretty close to truth */ + + s->accumulator.tv_sec = 0; + s->accumulator.tv_usec = 0; + s->irq.tv_sec = 0; + s->irq.tv_usec = 0; + s->hits = 0; + s->underflow = 0; + /* Repair active count, it is possible, only if code has a bug */ + if (s->active) { + s->active = 0; + atomic_dec(&net_profile_active); + } + sti(); + + net_profile_sub(&tmp.irq, &tmp.accumulator); + + len += sprintf(buffer+len,"%-15s %-10d %-10ld %-10lu %-10lu %-10lu %d/%d", + tmp.id, + tmp.hits, + tmp.accumulator.tv_sec, + tmp.accumulator.tv_usec, + tmp.irq.tv_sec, + tmp.irq.tv_usec, + tmp.underflow, tmp.active); + + buffer[len++]='\n'; + + pos=begin+len; + if(posoffset+length) + goto done; + } + *eof = 1; + +done: + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + if (len < 0) { + len = 0; + printk(KERN_CRIT "Yep, guys... 
our template for proc_*_read is crappy :-)\n"); + } + if (offset == 0) { + cli(); + net_prof_total.active = 0; + net_prof_total.hits = 0; + net_profile_stamp(&net_prof_total.entered); + sti(); + } + return len; +} +#endif + +struct iphdr whitehole_iph; +int whitehole_count; + +static int whitehole_xmit(struct sk_buff *skb, struct device *dev) +{ + struct net_device_stats *stats; + dev_kfree_skb(skb); + stats = (struct net_device_stats *)dev->priv; + stats->tx_packets++; + stats->tx_bytes+=skb->len; + + return 0; +} + +static void whitehole_inject(unsigned long); +int whitehole_init(struct device *dev); + +static struct timer_list whitehole_timer = + { NULL, NULL, 0, 0L, whitehole_inject }; + +static struct device whitehole_dev = { + "whitehole", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, whitehole_init, }; + +static int whitehole_open(struct device *dev) +{ + whitehole_count = 100000; + whitehole_timer.expires = jiffies + 5*HZ; + add_timer(&whitehole_timer); + return 0; +} + +static int whitehole_close(struct device *dev) +{ + del_timer(&whitehole_timer); + return 0; +} + +static void whitehole_inject(unsigned long dummy) +{ + struct net_device_stats *stats = (struct net_device_stats *)whitehole_dev.priv; + extern int netdev_dropping; + + do { + struct iphdr *iph; + struct sk_buff *skb = alloc_skb(128, GFP_ATOMIC); + if (!skb) + break; + skb_reserve(skb, 32); + iph = (struct iphdr*)skb_put(skb, sizeof(*iph)); + skb->mac.raw = ((u8*)iph) - 14; + memcpy(iph, &whitehole_iph, sizeof(*iph)); + skb->protocol = __constant_htons(ETH_P_IP); + skb->dev = &whitehole_dev; + skb->pkt_type = PACKET_HOST; + stats->rx_packets++; + stats->rx_bytes += skb->len; + netif_rx(skb); + whitehole_count--; + } while (netdev_dropping == 0 && whitehole_count>0); + if (whitehole_count > 0) { + whitehole_timer.expires = jiffies + 1; + add_timer(&whitehole_timer); + } +} + +static struct net_device_stats *whitehole_get_stats(struct device *dev) +{ + struct net_device_stats *stats = (struct net_device_stats *) dev->priv; + return stats; +} + +__initfunc(int whitehole_init(struct device *dev)) +{ + dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); + if (dev->priv == NULL) + return -ENOBUFS; + memset(dev->priv, 0, sizeof(struct net_device_stats)); + dev->get_stats = whitehole_get_stats; + dev->hard_start_xmit = whitehole_xmit; + dev->open = whitehole_open; + dev->stop = whitehole_close; + ether_setup(dev); + dev->tx_queue_len = 0; + dev->flags |= IFF_NOARP; + dev->flags &= ~(IFF_BROADCAST|IFF_MULTICAST); + dev->iflink = 0; + whitehole_iph.ihl = 5; + whitehole_iph.version = 4; + whitehole_iph.ttl = 2; + whitehole_iph.saddr = in_aton("193.233.7.21"); + whitehole_iph.daddr = in_aton("193.233.7.10"); + whitehole_iph.tot_len = htons(20); + whitehole_iph.check = ip_compute_csum((void *)&whitehole_iph, 20); + return 0; +} + +int net_profile_register(struct net_profile_slot *slot) +{ + cli(); + slot->next = net_profile_chain; + net_profile_chain = slot; + sti(); + return 0; +} + +int net_profile_unregister(struct net_profile_slot *slot) +{ + struct net_profile_slot **sp, *s; + + for (sp = &net_profile_chain; (s = *sp) != NULL; sp = &s->next) { + if (s == slot) { + cli(); + *sp = s->next; + sti(); + return 0; + } + } + return -ESRCH; +} + + +__initfunc(int net_profile_init(void)) +{ + int i; + +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *ent; + + ent = create_proc_entry("net/profile", 0, 0); + ent->read_proc = profile_read_proc; +#endif + + register_netdevice(&whitehole_dev); + + printk("Evaluating net profiler 
cost ..."); +#if CPU == 586 || CPU == 686 + if (!(boot_cpu_data.x86_capability & X86_FEATURE_TSC)) { + printk(KERN_ERR "Sorry, your CPU does not support TSC. Net profiler disabled.\n"); + return -1; + } +#endif + start_bh_atomic(); +#ifdef __alpha__ + alpha_tick(0); +#endif + for (i=0; i<1024; i++) { + NET_PROFILE_ENTER(total); + NET_PROFILE_LEAVE(total); + } + if (net_prof_total.accumulator.tv_sec) { + printk(" too high!\n"); + } else { + net_profile_adjust.tv_usec = net_prof_total.accumulator.tv_usec>>10; + printk("%ld units\n", net_profile_adjust.tv_usec); + } + net_prof_total.hits = 0; + net_profile_stamp(&net_prof_total.entered); + end_bh_atomic(); + return 0; +} + +#endif diff --git a/pfinet/linux-src/net/core/rtnetlink.c b/pfinet/linux-src/net/core/rtnetlink.c new file mode 100644 index 00000000..7f89e54a --- /dev/null +++ b/pfinet/linux-src/net/core/rtnetlink.c @@ -0,0 +1,512 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Routing netlink socket interface: protocol independent part. + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Vitaly E. Lavrov RTA_OK arithmetics was wrong. + * Alexey Zhuravlev ifi_change does something useful + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +atomic_t rtnl_rlockct; +struct wait_queue *rtnl_wait; + + +void rtnl_lock() +{ + rtnl_shlock(); + rtnl_exlock(); +} + +void rtnl_unlock() +{ + rtnl_exunlock(); + rtnl_shunlock(); +} + +int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len) +{ + memset(tb, 0, sizeof(struct rtattr*)*maxattr); + + while (RTA_OK(rta, len)) { + unsigned flavor = rta->rta_type; + if (flavor && flavor <= maxattr) + tb[flavor-1] = rta; + rta = RTA_NEXT(rta, len); + } + return 0; +} + +#ifdef CONFIG_RTNETLINK +struct sock *rtnl; + +unsigned long rtnl_wlockct; + +struct rtnetlink_link * rtnetlink_links[NPROTO]; + +#define _S 1 /* superuser privileges required */ +#define _X 2 /* exclusive access to tables required */ +#define _G 4 /* GET request */ + +static const int rtm_min[(RTM_MAX+1-RTM_BASE)/4] = +{ + NLMSG_LENGTH(sizeof(struct ifinfomsg)), + NLMSG_LENGTH(sizeof(struct ifaddrmsg)), + NLMSG_LENGTH(sizeof(struct rtmsg)), + NLMSG_LENGTH(sizeof(struct ndmsg)), + NLMSG_LENGTH(sizeof(struct rtmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)) +}; + +static const int rta_max[(RTM_MAX+1-RTM_BASE)/4] = +{ + IFLA_MAX, + IFA_MAX, + RTA_MAX, + NDA_MAX, + RTA_MAX, + TCA_MAX, + TCA_MAX, + TCA_MAX +}; + +void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) +{ + struct rtattr *rta; + int size = RTA_LENGTH(attrlen); + + rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); + rta->rta_type = attrtype; + rta->rta_len = size; + memcpy(RTA_DATA(rta), data, attrlen); +} + +int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int 
echo) +{ + int err = 0; + + NETLINK_CB(skb).dst_groups = group; + if (echo) + atomic_inc(&skb->users); + netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); + if (echo) + err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); + return err; +} + +static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, + int type, u32 pid, u32 seq, u32 change) +{ + struct ifinfomsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + r = NLMSG_DATA(nlh); + r->ifi_family = AF_UNSPEC; + r->ifi_type = dev->type; + r->ifi_index = dev->ifindex; + r->ifi_flags = dev->flags; + r->ifi_change = change; + + RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name); + if (dev->addr_len) { + RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + RTA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast); + } + if (1) { + unsigned mtu = dev->mtu; + RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu); + } + if (dev->ifindex != dev->iflink) + RTA_PUT(skb, IFLA_LINK, sizeof(int), &dev->iflink); + if (dev->qdisc_sleeping) + RTA_PUT(skb, IFLA_QDISC, + strlen(dev->qdisc_sleeping->ops->id) + 1, + dev->qdisc_sleeping->ops->id); + if (dev->get_stats) { + struct net_device_stats *stats = dev->get_stats(dev); + if (stats) + RTA_PUT(skb, IFLA_STATS, sizeof(*stats), stats); + } + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->args[0]; + struct device *dev; + + for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0) <= 0) + break; + } + cb->args[0] = idx; + + return skb->len; +} + +int rtnetlink_dump_all(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->family; + + if (s_idx == 0) + s_idx = 1; + for (idx=1; idxnlh->nlmsg_type-RTM_BASE; + if (idx < s_idx || idx == PF_PACKET) + continue; + if (rtnetlink_links[idx] == NULL || + rtnetlink_links[idx][type].dumpit == NULL) + continue; + if (idx > s_idx) + memset(&cb->args[0], 0, sizeof(cb->args)); + if (rtnetlink_links[idx][type].dumpit(skb, cb) == 0) + continue; + if (skb_tailroom(skb) < 256) + break; + } + cb->family = idx; + + return skb->len; +} + +void rtmsg_ifinfo(int type, struct device *dev) +{ + struct sk_buff *skb; + int size = NLMSG_GOODSIZE; + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return; + + if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0, ~0U) < 0) { + kfree_skb(skb); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_LINK; + netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_KERNEL); +} + +static int rtnetlink_done(struct netlink_callback *cb) +{ + if (cap_raised(NETLINK_CB(cb->skb).eff_cap, CAP_NET_ADMIN) && cb->nlh->nlmsg_flags&NLM_F_ATOMIC) + rtnl_shunlock(); + return 0; +} + +/* Process one rtnetlink message. 
*/ + +extern __inline__ int +rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) +{ + struct rtnetlink_link *link; + struct rtnetlink_link *link_tab; + struct rtattr *rta[RTATTR_MAX]; + + int exclusive = 0; + int sz_idx, kind; + int min_len; + int family; + int type; + int err; + + /* Only requests are handled by kernel now */ + if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) + return 0; + + type = nlh->nlmsg_type; + + /* A control message: ignore them */ + if (type < RTM_BASE) + return 0; + + /* Unknown message: reply with EINVAL */ + if (type > RTM_MAX) + goto err_inval; + + type -= RTM_BASE; + + /* All the messages must have at least 1 byte length */ + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) + return 0; + + family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; + if (family > NPROTO) { + *errp = -EAFNOSUPPORT; + return -1; + } + + link_tab = rtnetlink_links[family]; + if (link_tab == NULL) + link_tab = rtnetlink_links[PF_UNSPEC]; + link = &link_tab[type]; + + sz_idx = type>>2; + kind = type&3; + + if (kind != 2 && !cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) { + *errp = -EPERM; + return -1; + } + + if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { + u32 rlen; + + if (link->dumpit == NULL) + link = &(rtnetlink_links[PF_UNSPEC][type]); + + if (link->dumpit == NULL) + goto err_inval; + + /* Super-user locks all the tables to get atomic snapshot */ + if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN) + && nlh->nlmsg_flags&NLM_F_ATOMIC) + atomic_inc(&rtnl_rlockct); + if ((*errp = netlink_dump_start(rtnl, skb, nlh, + link->dumpit, + rtnetlink_done)) != 0) { + if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN) && nlh->nlmsg_flags&NLM_F_ATOMIC) + atomic_dec(&rtnl_rlockct); + return -1; + } + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return -1; + } + + if (kind != 2) { + if (rtnl_exlock_nowait()) { + *errp = 0; + return -1; + } + exclusive = 1; + } + + memset(&rta, 0, sizeof(rta)); + + min_len = rtm_min[sz_idx]; + if (nlh->nlmsg_len < min_len) + goto err_inval; + + if (nlh->nlmsg_len > min_len) { + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > rta_max[sz_idx]) + goto err_inval; + rta[flavor-1] = attr; + } + attr = RTA_NEXT(attr, attrlen); + } + } + + if (link->doit == NULL) + link = &(rtnetlink_links[PF_UNSPEC][type]); + if (link->doit == NULL) + goto err_inval; + err = link->doit(skb, nlh, (void *)&rta); + + if (exclusive) + rtnl_exunlock(); + *errp = err; + return err; + +err_inval: + if (exclusive) + rtnl_exunlock(); + *errp = -EINVAL; + return -1; +} + +/* + * Process one packet of messages. + * Malformed skbs with wrong lengths of messages are discarded silently. + */ + +extern __inline__ int rtnetlink_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr * nlh; + + while (skb->len >= NLMSG_SPACE(0)) { + u32 rlen; + + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return 0; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + if (rtnetlink_rcv_msg(skb, nlh, &err)) { + /* Not error, but we must interrupt processing here: + * Note, that in this case we do not pull message + * from skb, it will be processed later. 
+ */ + if (err == 0) + return -1; + netlink_ack(skb, nlh, err); + } else if (nlh->nlmsg_flags&NLM_F_ACK) + netlink_ack(skb, nlh, 0); + skb_pull(skb, rlen); + } + + return 0; +} + +/* + * rtnetlink input queue processing routine: + * - try to acquire shared lock. If it is failed, defer processing. + * - feed skbs to rtnetlink_rcv_skb, until it refuse a message, + * that will occur, when a dump started and/or acquisition of + * exclusive lock failed. + */ + +static void rtnetlink_rcv(struct sock *sk, int len) +{ + struct sk_buff *skb; + + if (rtnl_shlock_nowait()) + return; + + while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { + if (rtnetlink_rcv_skb(skb)) { + if (skb->len) + skb_queue_head(&sk->receive_queue, skb); + else + kfree_skb(skb); + break; + } + kfree_skb(skb); + } + + rtnl_shunlock(); +} + +static struct rtnetlink_link link_rtnetlink_table[RTM_MAX-RTM_BASE+1] = +{ + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, rtnetlink_dump_ifinfo, }, + { NULL, NULL, }, + + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, rtnetlink_dump_all, }, + { NULL, NULL, }, + + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, rtnetlink_dump_all, }, + { NULL, NULL, }, + + { neigh_add, NULL, }, + { neigh_delete, NULL, }, + { NULL, neigh_dump_info, }, + { NULL, NULL, }, + + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, +}; + + +static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev = ptr; + switch (event) { + case NETDEV_UNREGISTER: + rtmsg_ifinfo(RTM_DELLINK, dev); + break; + default: + rtmsg_ifinfo(RTM_NEWLINK, dev); + break; + } + return NOTIFY_DONE; +} + +struct notifier_block rtnetlink_dev_notifier = { + rtnetlink_event, + NULL, + 0 +}; + + +__initfunc(void rtnetlink_init(void)) +{ +#ifdef RTNL_DEBUG + printk("Initializing RT netlink socket\n"); +#endif + rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv); + if (rtnl == NULL) + panic("rtnetlink_init: cannot initialize rtnetlink\n"); + register_netdevice_notifier(&rtnetlink_dev_notifier); + rtnetlink_links[PF_UNSPEC] = link_rtnetlink_table; + rtnetlink_links[PF_PACKET] = link_rtnetlink_table; +} + + + +#endif diff --git a/pfinet/linux-src/net/core/scm.c b/pfinet/linux-src/net/core/scm.c new file mode 100644 index 00000000..cdb5f3d0 --- /dev/null +++ b/pfinet/linux-src/net/core/scm.c @@ -0,0 +1,280 @@ +/* scm.c - Socket level control messages processing. + * + * Author: Alexey Kuznetsov, + * Alignment and value checking mods by Craig Metz + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * Only allow a user to send credentials, that they could set with + * setu(g)id. 
+ */ + +static __inline__ int scm_check_creds(struct ucred *creds) +{ + if ((creds->pid == current->pid || capable(CAP_SYS_ADMIN)) && + ((creds->uid == current->uid || creds->uid == current->euid || + creds->uid == current->suid) || capable(CAP_SETUID)) && + ((creds->gid == current->gid || creds->gid == current->egid || + creds->gid == current->sgid) || capable(CAP_SETGID))) { + return 0; + } + return -EPERM; +} + +static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) +{ + int *fdp = (int*)CMSG_DATA(cmsg); + struct scm_fp_list *fpl = *fplp; + struct file **fpp; + int i, num; + + num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); + + if (num <= 0) + return 0; + + if (num > SCM_MAX_FD) + return -EINVAL; + + if (!fpl) + { + fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + *fplp = fpl; + fpl->count = 0; + } + fpp = &fpl->fp[fpl->count]; + + if (fpl->count + num > SCM_MAX_FD) + return -EINVAL; + + /* + * Verify the descriptors and increment the usage count. + */ + + for (i=0; i< num; i++) + { + int fd = fdp[i]; + struct file *file; + + if (fd < 0 || !(file = fget(fd))) + return -EBADF; + *fpp++ = file; + fpl->count++; + } + return num; +} + +void __scm_destroy(struct scm_cookie *scm) +{ + struct scm_fp_list *fpl = scm->fp; + int i; + + if (fpl) { + scm->fp = NULL; + for (i=fpl->count-1; i>=0; i--) + fput(fpl->fp[i]); + kfree(fpl); + } +} + +int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) +{ + struct cmsghdr *cmsg; + int err; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) + { + err = -EINVAL; + + /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ + /* The first check was omitted in <= 2.2.5. The reasoning was + that parser checks cmsg_len in any case, so that + additional check would be work duplication. + But if cmsg_level is not SOL_SOCKET, we do not check + for too short ancillary data object at all! Oops. + OK, let's add it... + */ + if (cmsg->cmsg_len < sizeof(struct cmsghdr) || + (unsigned long)(((char*)cmsg - (char*)msg->msg_control) + + cmsg->cmsg_len) > msg->msg_controllen) + goto error; + + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + + switch (cmsg->cmsg_type) + { + case SCM_RIGHTS: + err=scm_fp_copy(cmsg, &p->fp); + if (err<0) + goto error; + break; + case SCM_CREDENTIALS: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred))) + goto error; + memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred)); + err = scm_check_creds(&p->creds); + if (err) + goto error; + break; + default: + goto error; + } + } + + if (p->fp && !p->fp->count) + { + kfree(p->fp); + p->fp = NULL; + } + + err = -EINVAL; + if (msg->msg_flags & MSG_CTLFLAGS) + goto error; + + return 0; + +error: + scm_destroy(p); + return err; +} + +int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) +{ + struct cmsghdr *cm = (struct cmsghdr*)msg->msg_control; + struct cmsghdr cmhdr; + int cmlen = CMSG_LEN(len); + int err; + + if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { + msg->msg_flags |= MSG_CTRUNC; + return 0; /* XXX: return error? check spec. 
*/ + } + if (msg->msg_controllen < cmlen) { + msg->msg_flags |= MSG_CTRUNC; + cmlen = msg->msg_controllen; + } + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + + err = -EFAULT; + if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) + goto out; + if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) + goto out; + cmlen = CMSG_SPACE(len); + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + err = 0; +out: + return err; +} + +void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) +{ + struct cmsghdr *cm = (struct cmsghdr*)msg->msg_control; + + int fdmax = (msg->msg_controllen - sizeof(struct cmsghdr))/sizeof(int); + int fdnum = scm->fp->count; + struct file **fp = scm->fp->fp; + int *cmfptr; + int err = 0, i; + + if (fdnum < fdmax) + fdmax = fdnum; + + for (i=0, cmfptr=(int*)CMSG_DATA(cm); if_count++; + current->files->fd[new_fd] = fp[i]; + } + + if (i > 0) + { + int cmlen = CMSG_LEN(i*sizeof(int)); + if (!err) + err = put_user(SOL_SOCKET, &cm->cmsg_level); + if (!err) + err = put_user(SCM_RIGHTS, &cm->cmsg_type); + if (!err) + err = put_user(cmlen, &cm->cmsg_len); + if (!err) { + cmlen = CMSG_SPACE(i*sizeof(int)); + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + } + } + if (i < fdnum) + msg->msg_flags |= MSG_CTRUNC; + + /* + * All of the files that fit in the message have had their + * usage counts incremented, so we just free the list. + */ + __scm_destroy(scm); +} + +struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) +{ + struct scm_fp_list *new_fpl; + int i; + + if (!fpl) + return NULL; + + new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); + if (new_fpl) { + memcpy(new_fpl, fpl, sizeof(*fpl)); + + for (i=fpl->count-1; i>=0; i--) + fpl->fp[i]->f_count++; + } + return new_fpl; +} diff --git a/pfinet/linux-src/net/core/skbuff.c b/pfinet/linux-src/net/core/skbuff.c new file mode 100644 index 00000000..b7636437 --- /dev/null +++ b/pfinet/linux-src/net/core/skbuff.c @@ -0,0 +1,385 @@ +/* + * Routines having to do with the 'struct sk_buff' memory handlers. + * + * Authors: Alan Cox + * Florian La Roche + * + * Version: $Id: skbuff.c,v 1.55 1999/02/23 08:12:27 davem Exp $ + * + * Fixes: + * Alan Cox : Fixed the worst of the load balancer bugs. + * Dave Platt : Interrupt stacking fix. + * Richard Kooijman : Timestamp fixes. + * Alan Cox : Changed buffer format. + * Alan Cox : destructor hook for AF_UNIX etc. + * Linus Torvalds : Better skb_clone. + * Alan Cox : Added skb_copy. + * Alan Cox : Added all the changed routines Linus + * only put in the headers + * Ray VanTassle : Fixed --skb->lock in free + * Alan Cox : skb_copy copy arp field + * Andi Kleen : slabified it. + * + * NOTE: + * The __skb_ routines should be called with interrupts + * disabled, or you better be *real* sure that the operation is atomic + * with respect to whatever list is being frobbed (e.g. via lock_sock() + * or via disabling bottom half handlers, etc). + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +/* + * The functions in this file will not compile correctly with gcc 2.4.x + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Skb list spinlock + */ +spinlock_t skb_queue_lock = SPIN_LOCK_UNLOCKED; + +/* + * Resource tracking variables + */ + +static atomic_t net_skbcount = ATOMIC_INIT(0); +static atomic_t net_allocs = ATOMIC_INIT(0); +static atomic_t net_fails = ATOMIC_INIT(0); + +extern atomic_t ip_frag_mem; + +static kmem_cache_t *skbuff_head_cache; + +/* + * Keep out-of-line to prevent kernel bloat. + * __builtin_return_address is not used because it is not always + * reliable. + */ + +void skb_over_panic(struct sk_buff *skb, int sz, void *here) +{ + panic("skput:over: %p:%d put:%d dev:%s", + here, skb->len, sz, skb->dev ? skb->dev->name : ""); +} + +void skb_under_panic(struct sk_buff *skb, int sz, void *here) +{ + panic("skput:under: %p:%d put:%d dev:%s", + here, skb->len, sz, skb->dev ? skb->dev->name : ""); +} + +void show_net_buffers(void) +{ + printk("Networking buffers in use : %u\n", + atomic_read(&net_skbcount)); + printk("Total network buffer allocations : %u\n", + atomic_read(&net_allocs)); + printk("Total failed network buffer allocs : %u\n", + atomic_read(&net_fails)); +#ifdef CONFIG_INET + printk("IP fragment buffer size : %u\n", + atomic_read(&ip_frag_mem)); +#endif +} + +/* Allocate a new skbuff. We do this ourselves so we can fill in a few + * 'private' fields and also do memory statistics to find all the + * [BEEP] leaks. + * + */ + +struct sk_buff *alloc_skb(unsigned int size,int gfp_mask) +{ + struct sk_buff *skb; + u8 *data; + + if (in_interrupt() && (gfp_mask & __GFP_WAIT)) { + static int count = 0; + if (++count < 5) { + printk(KERN_ERR "alloc_skb called nonatomically " + "from interrupt %p\n", __builtin_return_address(0)); + } + gfp_mask &= ~__GFP_WAIT; + } + + /* Get the HEAD */ + skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + if (skb == NULL) + goto nohead; + + /* Get the DATA. Size must match skb_add_mtu(). */ + size = ((size + 15) & ~15); + data = kmalloc(size + sizeof(atomic_t), gfp_mask); + if (data == NULL) + goto nodata; + + /* Note that this counter is useless now - you can just look in the + * skbuff_head entry in /proc/slabinfo. We keep it only for emergency + * cases. + */ + atomic_inc(&net_allocs); + + skb->truesize = size; + + atomic_inc(&net_skbcount); + + /* Load the data pointers. */ + skb->head = data; + skb->data = data; + skb->tail = data; + skb->end = data + size; + + /* Set up other state */ + skb->len = 0; + skb->is_clone = 0; + skb->cloned = 0; + + atomic_set(&skb->users, 1); + atomic_set(skb_datarefp(skb), 1); + return skb; + +nodata: + kmem_cache_free(skbuff_head_cache, skb); +nohead: + atomic_inc(&net_fails); + return NULL; +} + + +/* + * Slab constructor for a skb head. 
+ */ +static inline void skb_headerinit(void *p, kmem_cache_t *cache, + unsigned long flags) +{ + struct sk_buff *skb = p; + + skb->destructor = NULL; + skb->pkt_type = PACKET_HOST; /* Default type */ + skb->pkt_bridged = 0; /* Not bridged */ + skb->prev = skb->next = NULL; + skb->list = NULL; + skb->sk = NULL; + skb->stamp.tv_sec=0; /* No idea about time */ + skb->ip_summed = 0; + skb->security = 0; /* By default packets are insecure */ + skb->dst = NULL; +#ifdef CONFIG_IP_FIREWALL + skb->fwmark = 0; +#endif + memset(skb->cb, 0, sizeof(skb->cb)); + skb->priority = 0; +} + +/* + * Free an skbuff by memory without cleaning the state. + */ +void kfree_skbmem(struct sk_buff *skb) +{ + if (!skb->cloned || atomic_dec_and_test(skb_datarefp(skb))) + kfree(skb->head); + + kmem_cache_free(skbuff_head_cache, skb); + atomic_dec(&net_skbcount); +} + +/* + * Free an sk_buff. Release anything attached to the buffer. Clean the state. + */ + +void __kfree_skb(struct sk_buff *skb) +{ + if (skb->list) + printk(KERN_WARNING "Warning: kfree_skb passed an skb still " + "on a list (from %p).\n", __builtin_return_address(0)); + + dst_release(skb->dst); + if(skb->destructor) + skb->destructor(skb); + skb_headerinit(skb, NULL, 0); /* clean state */ + kfree_skbmem(skb); +} + +/* + * Duplicate an sk_buff. The new one is not owned by a socket. + */ + +struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) +{ + struct sk_buff *n; + + n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + if (!n) + return NULL; + + memcpy(n, skb, sizeof(*n)); + atomic_inc(skb_datarefp(skb)); + skb->cloned = 1; + + atomic_inc(&net_allocs); + atomic_inc(&net_skbcount); + dst_clone(n->dst); + n->cloned = 1; + n->next = n->prev = NULL; + n->list = NULL; + n->sk = NULL; + n->is_clone = 1; + atomic_set(&n->users, 1); + n->destructor = NULL; + return n; +} + +/* + * This is slower, and copies the whole data area + */ + +struct sk_buff *skb_copy(struct sk_buff *skb, int gfp_mask) +{ + struct sk_buff *n; + unsigned long offset; + + /* + * Allocate the copy buffer + */ + + n=alloc_skb(skb->end - skb->head, gfp_mask); + if(n==NULL) + return NULL; + + /* + * Shift between the two data areas in bytes + */ + + offset=n->head-skb->head; + + /* Set the data pointer */ + skb_reserve(n,skb->data-skb->head); + /* Set the tail pointer and length */ + skb_put(n,skb->len); + /* Copy the bytes */ + memcpy(n->head,skb->head,skb->end-skb->head); + n->csum = skb->csum; + n->list=NULL; + n->sk=NULL; + n->dev=skb->dev; + n->priority=skb->priority; + n->protocol=skb->protocol; + n->dst=dst_clone(skb->dst); + n->h.raw=skb->h.raw+offset; + n->nh.raw=skb->nh.raw+offset; + n->mac.raw=skb->mac.raw+offset; + memcpy(n->cb, skb->cb, sizeof(skb->cb)); + n->used=skb->used; + n->is_clone=0; + atomic_set(&n->users, 1); + n->pkt_type=skb->pkt_type; + n->stamp=skb->stamp; + n->destructor = NULL; + n->security=skb->security; +#ifdef CONFIG_IP_FIREWALL + n->fwmark = skb->fwmark; +#endif + return n; +} + +struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, int newheadroom) +{ + struct sk_buff *n; + unsigned long offset; + int headroom = skb_headroom(skb); + + /* + * Allocate the copy buffer + */ + + n=alloc_skb(skb->truesize+newheadroom-headroom, GFP_ATOMIC); + if(n==NULL) + return NULL; + + skb_reserve(n,newheadroom); + + /* + * Shift between the two data areas in bytes + */ + + offset=n->data-skb->data; + + /* Set the tail pointer and length */ + skb_put(n,skb->len); + /* Copy the bytes */ + memcpy(n->data,skb->data,skb->len); + n->list=NULL; + n->sk=NULL; + 
n->priority=skb->priority; + n->protocol=skb->protocol; + n->dev=skb->dev; + n->dst=dst_clone(skb->dst); + n->h.raw=skb->h.raw+offset; + n->nh.raw=skb->nh.raw+offset; + n->mac.raw=skb->mac.raw+offset; + memcpy(n->cb, skb->cb, sizeof(skb->cb)); + n->used=skb->used; + n->is_clone=0; + atomic_set(&n->users, 1); + n->pkt_type=skb->pkt_type; + n->stamp=skb->stamp; + n->destructor = NULL; + n->security=skb->security; +#ifdef CONFIG_IP_FIREWALL + n->fwmark = skb->fwmark; +#endif + + return n; +} + +#if 0 +/* + * Tune the memory allocator for a new MTU size. + */ +void skb_add_mtu(int mtu) +{ + /* Must match allocation in alloc_skb */ + mtu = ((mtu + 15) & ~15) + sizeof(atomic_t); + + kmem_add_cache_size(mtu); +} +#endif + +void __init skb_init(void) +{ + skbuff_head_cache = kmem_cache_create("skbuff_head_cache", + sizeof(struct sk_buff), + 0, + SLAB_HWCACHE_ALIGN, + skb_headerinit, NULL); + if (!skbuff_head_cache) + panic("cannot create skbuff cache"); +} diff --git a/pfinet/linux-src/net/core/sock.c b/pfinet/linux-src/net/core/sock.c new file mode 100644 index 00000000..e0eb41a0 --- /dev/null +++ b/pfinet/linux-src/net/core/sock.c @@ -0,0 +1,1051 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic socket support routines. Memory allocators, socket lock/release + * handler for protocols to use and generic option handler. + * + * + * Version: $Id: sock.c,v 1.80 1999/05/08 03:04:34 davem Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Florian La Roche, + * Alan Cox, + * + * Fixes: + * Alan Cox : Numerous verify_area() problems + * Alan Cox : Connecting on a connecting socket + * now returns an error for tcp. + * Alan Cox : sock->protocol is set correctly. + * and is not sometimes left as 0. + * Alan Cox : connect handles icmp errors on a + * connect properly. Unfortunately there + * is a restart syscall nasty there. I + * can't match BSD without hacking the C + * library. Ideas urgently sought! + * Alan Cox : Disallow bind() to addresses that are + * not ours - especially broadcast ones!! + * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) + * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, + * instead they leave that for the DESTROY timer. + * Alan Cox : Clean up error flag in accept + * Alan Cox : TCP ack handling is buggy, the DESTROY timer + * was buggy. Put a remove_sock() in the handler + * for memory when we hit 0. Also altered the timer + * code. The ACK stuff can wait and needs major + * TCP layer surgery. + * Alan Cox : Fixed TCP ack bug, removed remove sock + * and fixed timer/inet_bh race. + * Alan Cox : Added zapped flag for TCP + * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code + * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb + * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources + * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. + * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... + * Rick Sladkey : Relaxed UDP rules for matching packets. + * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support + * Pauline Middelink : identd support + * Alan Cox : Fixed connect() taking signals I think. 
+ * Alan Cox : SO_LINGER supported + * Alan Cox : Error reporting fixes + * Anonymous : inet_create tidied up (sk->reuse setting) + * Alan Cox : inet sockets don't set sk->type! + * Alan Cox : Split socket option code + * Alan Cox : Callbacks + * Alan Cox : Nagle flag for Charles & Johannes stuff + * Alex : Removed restriction on inet fioctl + * Alan Cox : Splitting INET from NET core + * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() + * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code + * Alan Cox : Split IP from generic code + * Alan Cox : New kfree_skbmem() + * Alan Cox : Make SO_DEBUG superuser only. + * Alan Cox : Allow anyone to clear SO_DEBUG + * (compatibility fix) + * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. + * Alan Cox : Allocator for a socket is settable. + * Alan Cox : SO_ERROR includes soft errors. + * Alan Cox : Allow NULL arguments on some SO_ opts + * Alan Cox : Generic socket allocation to make hooks + * easier (suggested by Craig Metz). + * Michael Pall : SO_ERROR returns positive errno again + * Steve Whitehouse: Added default destructor to free + * protocol private data. + * Steve Whitehouse: Added various other default routines + * common to several socket families. + * Chris Evans : Call suser() check last on F_SETOWN + * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. + * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() + * Andi Kleen : Fix write_space callback + * + * To Fix: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_FILTER +#include +#endif + +#define min(a,b) ((a)<(b)?(a):(b)) + +/* Run time adjustable parameters. */ +__u32 sysctl_wmem_max = SK_WMEM_MAX; +__u32 sysctl_rmem_max = SK_RMEM_MAX; +__u32 sysctl_wmem_default = SK_WMEM_MAX; +__u32 sysctl_rmem_default = SK_RMEM_MAX; + +/* Maximal space eaten by iovec or ancilliary data plus some space */ +int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512); + +/* + * This is meant for all protocols to use and covers goings on + * at the socket level. Everything here is generic. + */ + +int sock_setsockopt(struct socket *sock, int level, int optname, + char *optval, int optlen) +{ + struct sock *sk=sock->sk; +#ifdef CONFIG_FILTER + struct sk_filter *filter; +#endif + int val; + int valbool; + int err; + struct linger ling; + int ret = 0; + + /* + * Options without arguments + */ + +#ifdef SO_DONTLINGER /* Compatibility item... */ + switch(optname) + { + case SO_DONTLINGER: + sk->linger=0; + return 0; + } +#endif + + if(optlendebug=valbool; + break; + case SO_REUSEADDR: + sk->reuse = valbool; + break; + case SO_TYPE: + case SO_ERROR: + ret = -ENOPROTOOPT; + break; + case SO_DONTROUTE: + sk->localroute=valbool; + break; + case SO_BROADCAST: + sk->broadcast=valbool; + break; + case SO_SNDBUF: + /* Don't error on this BSD doesn't and if you think + about it this is right. Otherwise apps have to + play 'guess the biggest size' games. 
RCVBUF/SNDBUF + are treated in BSD as hints */ + + if (val > sysctl_wmem_max) + val = sysctl_wmem_max; + + sk->sndbuf = max(val*2,2048); + + /* + * Wake up sending tasks if we + * upped the value. + */ + sk->write_space(sk); + break; + + case SO_RCVBUF: + /* Don't error on this BSD doesn't and if you think + about it this is right. Otherwise apps have to + play 'guess the biggest size' games. RCVBUF/SNDBUF + are treated in BSD as hints */ + + if (val > sysctl_rmem_max) + val = sysctl_rmem_max; + + /* FIXME: is this lower bound the right one? */ + sk->rcvbuf = max(val*2,256); + break; + + case SO_KEEPALIVE: +#ifdef CONFIG_INET + if (sk->protocol == IPPROTO_TCP) + { + tcp_set_keepalive(sk, valbool); + } +#endif + sk->keepopen = valbool; + break; + + case SO_OOBINLINE: + sk->urginline = valbool; + break; + + case SO_NO_CHECK: + sk->no_check = valbool; + break; + + case SO_PRIORITY: + if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) + sk->priority = val; + else + return(-EPERM); + break; + + case SO_LINGER: + if(optlenlinger=0; + else + { + sk->lingertime=ling.l_linger; + sk->linger=1; + } + break; + + case SO_BSDCOMPAT: + sk->bsdism = valbool; + break; + + case SO_PASSCRED: + sock->passcred = valbool; + break; + + +#ifdef CONFIG_NETDEVICES + case SO_BINDTODEVICE: + { + char devname[IFNAMSIZ]; + + /* Sorry... */ + if (!capable(CAP_NET_RAW)) + return -EPERM; + + /* Bind this socket to a particular device like "eth0", + * as specified in the passed interface name. If the + * name is "" or the option length is zero the socket + * is not bound. + */ + + if (!valbool) { + sk->bound_dev_if = 0; + } else { + if (optlen > IFNAMSIZ) + optlen = IFNAMSIZ; + if (copy_from_user(devname, optval, optlen)) + return -EFAULT; + + /* Remove any cached route for this socket. 
*/ + lock_sock(sk); + dst_release(xchg(&sk->dst_cache, NULL)); + release_sock(sk); + + if (devname[0] == '\0') { + sk->bound_dev_if = 0; + } else { + struct device *dev = dev_get(devname); + if (!dev) + return -EINVAL; + sk->bound_dev_if = dev->ifindex; + } + return 0; + } + } +#endif + + +#ifdef CONFIG_FILTER + case SO_ATTACH_FILTER: + ret = -EINVAL; + if (optlen == sizeof(struct sock_fprog)) { + struct sock_fprog fprog; + + ret = -EFAULT; + if (copy_from_user(&fprog, optval, sizeof(fprog))) + break; + + ret = sk_attach_filter(&fprog, sk); + } + break; + + case SO_DETACH_FILTER: + filter = sk->filter; + if(filter) { + sk->filter = NULL; + synchronize_bh(); + sk_filter_release(sk, filter); + return 0; + } + return -ENOENT; +#endif + /* We implement the SO_SNDLOWAT etc to + not be settable (1003.1g 5.3) */ + default: + return(-ENOPROTOOPT); + } + return ret; +} + + +int sock_getsockopt(struct socket *sock, int level, int optname, + char *optval, int *optlen) +{ + struct sock *sk = sock->sk; + + union + { + int val; + struct linger ling; + struct timeval tm; + } v; + + int lv=sizeof(int),len; + + if(get_user(len,optlen)) + return -EFAULT; + + switch(optname) + { + case SO_DEBUG: + v.val = sk->debug; + break; + + case SO_DONTROUTE: + v.val = sk->localroute; + break; + + case SO_BROADCAST: + v.val= sk->broadcast; + break; + + case SO_SNDBUF: + v.val=sk->sndbuf; + break; + + case SO_RCVBUF: + v.val =sk->rcvbuf; + break; + + case SO_REUSEADDR: + v.val = sk->reuse; + break; + + case SO_KEEPALIVE: + v.val = sk->keepopen; + break; + + case SO_TYPE: + v.val = sk->type; + break; + + case SO_ERROR: + v.val = -sock_error(sk); + if(v.val==0) + v.val=xchg(&sk->err_soft,0); + break; + + case SO_OOBINLINE: + v.val = sk->urginline; + break; + + case SO_NO_CHECK: + v.val = sk->no_check; + break; + + case SO_PRIORITY: + v.val = sk->priority; + break; + + case SO_LINGER: + lv=sizeof(v.ling); + v.ling.l_onoff=sk->linger; + v.ling.l_linger=sk->lingertime; + break; + + case SO_BSDCOMPAT: + v.val = sk->bsdism; + break; + + case SO_RCVTIMEO: + case SO_SNDTIMEO: + lv=sizeof(struct timeval); + v.tm.tv_sec=0; + v.tm.tv_usec=0; + break; + + case SO_RCVLOWAT: + case SO_SNDLOWAT: + v.val=1; + break; + + case SO_PASSCRED: + v.val = sock->passcred; + break; + + case SO_PEERCRED: + lv=sizeof(sk->peercred); + len=min(len, lv); + if(copy_to_user((void*)optval, &sk->peercred, len)) + return -EFAULT; + goto lenout; + + default: + return(-ENOPROTOOPT); + } + len=min(len,lv); + if(copy_to_user(optval,&v,len)) + return -EFAULT; +lenout: + if(put_user(len, optlen)) + return -EFAULT; + return 0; +} + +static kmem_cache_t *sk_cachep; + +/* + * All socket objects are allocated here. This is for future + * usage. 
+ */ + +struct sock *sk_alloc(int family, int priority, int zero_it) +{ + struct sock *sk = kmem_cache_alloc(sk_cachep, priority); + + if(sk) { + if (zero_it) + memset(sk, 0, sizeof(struct sock)); + sk->family = family; + } + + return sk; +} + +void sk_free(struct sock *sk) +{ +#ifdef CONFIG_FILTER + struct sk_filter *filter; +#endif + if (sk->destruct) + sk->destruct(sk); + +#ifdef CONFIG_FILTER + filter = sk->filter; + if (filter) { + sk_filter_release(sk, filter); + sk->filter = NULL; + } +#endif + + if (atomic_read(&sk->omem_alloc)) + printk(KERN_DEBUG "sk_free: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc)); + + kmem_cache_free(sk_cachep, sk); +} + +void __init sk_init(void) +{ + sk_cachep = kmem_cache_create("sock", sizeof(struct sock), 0, + SLAB_HWCACHE_ALIGN, 0, 0); + +} + +/* + * Simple resource managers for sockets. + */ + + +/* + * Write buffer destructor automatically called from kfree_skb. + */ +void sock_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + /* In case it might be waiting for more memory. */ + atomic_sub(skb->truesize, &sk->wmem_alloc); + sk->write_space(sk); +} + +/* + * Read buffer destructor automatically called from kfree_skb. + */ +void sock_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->rmem_alloc); +} + + +/* + * Allocate a skb from the socket's send buffer. + */ +struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority) +{ + if (force || atomic_read(&sk->wmem_alloc) < sk->sndbuf) { + struct sk_buff * skb = alloc_skb(size, priority); + if (skb) { + atomic_add(skb->truesize, &sk->wmem_alloc); + skb->destructor = sock_wfree; + skb->sk = sk; + return skb; + } + } + return NULL; +} + +/* + * Allocate a skb from the socket's receive buffer. + */ +struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority) +{ + if (force || atomic_read(&sk->rmem_alloc) < sk->rcvbuf) { + struct sk_buff *skb = alloc_skb(size, priority); + if (skb) { + atomic_add(skb->truesize, &sk->rmem_alloc); + skb->destructor = sock_rfree; + skb->sk = sk; + return skb; + } + } + return NULL; +} + +/* + * Allocate a memory block from the socket's option memory buffer. + */ +void *sock_kmalloc(struct sock *sk, int size, int priority) +{ + if (atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) { + void *mem; + /* First do the add, to avoid the race if kmalloc + * might sleep. + */ + atomic_add(size, &sk->omem_alloc); + mem = kmalloc(size, priority); + if (mem) + return mem; + atomic_sub(size, &sk->omem_alloc); + } + return NULL; +} + +/* + * Free an option memory block. + */ +void sock_kfree_s(struct sock *sk, void *mem, int size) +{ + kfree_s(mem, size); + atomic_sub(size, &sk->omem_alloc); +} + +/* FIXME: this is insane. We are trying suppose to be controlling how + * how much space we have for data bytes, not packet headers. + * This really points out that we need a better system for doing the + * receive buffer. -- erics + * WARNING: This is currently ONLY used in tcp. If you need it else where + * this will probably not be what you want. Possibly these two routines + * should move over to the ipv4 directory. + */ +unsigned long sock_rspace(struct sock *sk) +{ + int amt = 0; + + if (sk != NULL) { + /* This used to have some bizarre complications that + * to attempt to reserve some amount of space. 
This doesn't + * make sense, since the number returned here does not + * actually reflect allocated space, but rather the amount + * of space we committed to. We gamble that we won't + * run out of memory, and returning a smaller number does + * not change the gamble. If we lose the gamble tcp still + * works, it may just slow down for retransmissions. + */ + amt = sk->rcvbuf - atomic_read(&sk->rmem_alloc); + if (amt < 0) + amt = 0; + } + return amt; +} + + +/* It is almost wait_for_tcp_memory minus release_sock/lock_sock. + I think, these locks should be removed for datagram sockets. + */ +static void sock_wait_for_wmem(struct sock * sk) +{ + struct wait_queue wait = { current, NULL }; + + sk->socket->flags &= ~SO_NOSPACE; + add_wait_queue(sk->sleep, &wait); + for (;;) { + if (signal_pending(current)) + break; + current->state = TASK_INTERRUPTIBLE; + if (atomic_read(&sk->wmem_alloc) < sk->sndbuf) + break; + if (sk->shutdown & SEND_SHUTDOWN) + break; + if (sk->err) + break; + schedule(); + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); +} + + +/* + * Generic send/receive buffer handlers + */ + +struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, + unsigned long fallback, int noblock, int *errcode) +{ + int err; + struct sk_buff *skb; + + while (1) { + unsigned long try_size = size; + + err = sock_error(sk); + if (err != 0) + goto failure; + + /* + * We should send SIGPIPE in these cases according to + * 1003.1g draft 6.4. If we (the user) did a shutdown() + * call however we should not. + * + * Note: This routine isnt just used for datagrams and + * anyway some datagram protocols have a notion of + * close down. + */ + + err = -EPIPE; + if (sk->shutdown&SEND_SHUTDOWN) + goto failure; + + if (fallback) { + /* The buffer get won't block, or use the atomic queue. + * It does produce annoying no free page messages still. + */ + skb = sock_wmalloc(sk, size, 0, GFP_BUFFER); + if (skb) + break; + try_size = fallback; + } + skb = sock_wmalloc(sk, try_size, 0, sk->allocation); + if (skb) + break; + + /* + * This means we have too many buffers for this socket already. + */ + + sk->socket->flags |= SO_NOSPACE; + err = -EAGAIN; + if (noblock) + goto failure; + err = -ERESTARTSYS; + if (signal_pending(current)) + goto failure; + sock_wait_for_wmem(sk); + } + + return skb; + +failure: + *errcode = err; + return NULL; +} + + +void __release_sock(struct sock *sk) +{ +#ifdef CONFIG_INET + if (!sk->prot || !sk->backlog_rcv) + return; + + /* See if we have any packets built up. */ + start_bh_atomic(); + while (!skb_queue_empty(&sk->back_log)) { + struct sk_buff * skb = sk->back_log.next; + __skb_unlink(skb, &sk->back_log); + sk->backlog_rcv(sk, skb); + } + end_bh_atomic(); +#endif +} + + +/* + * Generic socket manager library. Most simpler socket families + * use this to manage their socket lists. At some point we should + * hash these. By making this generic we get the lot hashed for free. + */ + +void sklist_remove_socket(struct sock **list, struct sock *sk) +{ + struct sock *s; + + start_bh_atomic(); + + s= *list; + if(s==sk) + { + *list = s->next; + end_bh_atomic(); + return; + } + while(s && s->next) + { + if(s->next==sk) + { + s->next=sk->next; + break; + } + s=s->next; + } + end_bh_atomic(); +} + +void sklist_insert_socket(struct sock **list, struct sock *sk) +{ + start_bh_atomic(); + sk->next= *list; + *list=sk; + end_bh_atomic(); +} + +/* + * This is only called from user mode. 
Thus it protects itself against + * interrupt users but doesn't worry about being called during work. + * Once it is removed from the queue no interrupt or bottom half will + * touch it and we are (fairly 8-) ) safe. + */ + +void sklist_destroy_socket(struct sock **list, struct sock *sk); + +/* + * Handler for deferred kills. + */ + +static void sklist_destroy_timer(unsigned long data) +{ + struct sock *sk=(struct sock *)data; + sklist_destroy_socket(NULL,sk); +} + +/* + * Destroy a socket. We pass NULL for a list if we know the + * socket is not on a list. + */ + +void sklist_destroy_socket(struct sock **list,struct sock *sk) +{ + struct sk_buff *skb; + if(list) + sklist_remove_socket(list, sk); + + while((skb=skb_dequeue(&sk->receive_queue))!=NULL) + { + kfree_skb(skb); + } + + if(atomic_read(&sk->wmem_alloc) == 0 && + atomic_read(&sk->rmem_alloc) == 0 && + sk->dead) + { + sk_free(sk); + } + else + { + /* + * Someone is using our buffers still.. defer + */ + init_timer(&sk->timer); + sk->timer.expires=jiffies+SOCK_DESTROY_TIME; + sk->timer.function=sklist_destroy_timer; + sk->timer.data = (unsigned long)sk; + add_timer(&sk->timer); + } +} + +/* + * Set of default routines for initialising struct proto_ops when + * the protocol does not support a particular function. In certain + * cases where it makes no sense for a protocol to have a "do nothing" + * function, some default processing is provided. + */ + +int sock_no_dup(struct socket *newsock, struct socket *oldsock) +{ + struct sock *sk = oldsock->sk; + + return net_families[sk->family]->create(newsock, sk->protocol); +} + +int sock_no_release(struct socket *sock, struct socket *peersock) +{ + return 0; +} + +int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) +{ + return -EOPNOTSUPP; +} + +int sock_no_connect(struct socket *sock, struct sockaddr *saddr, + int len, int flags) +{ + return -EOPNOTSUPP; +} + +int sock_no_socketpair(struct socket *sock1, struct socket *sock2) +{ + return -EOPNOTSUPP; +} + +int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) +{ + return -EOPNOTSUPP; +} + +int sock_no_getname(struct socket *sock, struct sockaddr *saddr, + int *len, int peer) +{ + return -EOPNOTSUPP; +} + +unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt) +{ + return 0; +} + +int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return -EOPNOTSUPP; +} + +int sock_no_listen(struct socket *sock, int backlog) +{ + return -EOPNOTSUPP; +} + +int sock_no_shutdown(struct socket *sock, int how) +{ + return -EOPNOTSUPP; +} + +int sock_no_setsockopt(struct socket *sock, int level, int optname, + char *optval, int optlen) +{ + return -EOPNOTSUPP; +} + +int sock_no_getsockopt(struct socket *sock, int level, int optname, + char *optval, int *optlen) +{ + return -EOPNOTSUPP; +} + +/* + * Note: if you add something that sleeps here then change sock_fcntl() + * to do proper fd locking. + */ +int sock_no_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + + switch(cmd) + { + case F_SETOWN: + /* + * This is a little restrictive, but it's the only + * way to make sure that you can't send a sigurg to + * another process. 
+ */ + if (current->pgrp != -arg && + current->pid != arg && + !capable(CAP_KILL)) return(-EPERM); + sk->proc = arg; + return(0); + case F_GETOWN: + return(sk->proc); + default: + return(-EINVAL); + } +} + +int sock_no_sendmsg(struct socket *sock, struct msghdr *m, int flags, + struct scm_cookie *scm) +{ + return -EOPNOTSUPP; +} + +int sock_no_recvmsg(struct socket *sock, struct msghdr *m, int flags, + struct scm_cookie *scm) +{ + return -EOPNOTSUPP; +} + + + +/* + * Default Socket Callbacks + */ + +void sock_def_wakeup(struct sock *sk) +{ + if(!sk->dead) + wake_up_interruptible(sk->sleep); +} + +void sock_def_error_report(struct sock *sk) +{ + if (!sk->dead) { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket,0); + } +} + +void sock_def_readable(struct sock *sk, int len) +{ + if(!sk->dead) { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket,1); + } +} + +void sock_def_write_space(struct sock *sk) +{ + /* Do not wake up a writer until he can make "significant" + * progress. --DaveM + */ + if(!sk->dead && + ((atomic_read(&sk->wmem_alloc) << 1) <= sk->sndbuf)) { + wake_up_interruptible(sk->sleep); + + /* Should agree with poll, otherwise some programs break */ + if (sock_writeable(sk)) + sock_wake_async(sk->socket, 2); + } +} + +void sock_def_destruct(struct sock *sk) +{ + if (sk->protinfo.destruct_hook) + kfree(sk->protinfo.destruct_hook); +} + +void sock_init_data(struct socket *sock, struct sock *sk) +{ + skb_queue_head_init(&sk->receive_queue); + skb_queue_head_init(&sk->write_queue); + skb_queue_head_init(&sk->back_log); + skb_queue_head_init(&sk->error_queue); + + init_timer(&sk->timer); + + sk->allocation = GFP_KERNEL; + sk->rcvbuf = sysctl_rmem_default; + sk->sndbuf = sysctl_wmem_default; + sk->state = TCP_CLOSE; + sk->zapped = 1; + sk->socket = sock; + + if(sock) + { + sk->type = sock->type; + sk->sleep = &sock->wait; + sock->sk = sk; + } + + sk->state_change = sock_def_wakeup; + sk->data_ready = sock_def_readable; + sk->write_space = sock_def_write_space; + sk->error_report = sock_def_error_report; + sk->destruct = sock_def_destruct; + + sk->peercred.pid = 0; + sk->peercred.uid = -1; + sk->peercred.gid = -1; + +} diff --git a/pfinet/linux-src/net/core/sysctl_net_core.c b/pfinet/linux-src/net/core/sysctl_net_core.c new file mode 100644 index 00000000..446ca145 --- /dev/null +++ b/pfinet/linux-src/net/core/sysctl_net_core.c @@ -0,0 +1,61 @@ +/* -*- linux-c -*- + * sysctl_net_core.c: sysctl interface to net core subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/core directory entry (empty =) ). 
[MS] + */ + +#include +#include +#include + +#ifdef CONFIG_SYSCTL + +extern int netdev_max_backlog; +extern int netdev_fastroute; +extern int net_msg_cost; +extern int net_msg_burst; + +extern __u32 sysctl_wmem_max; +extern __u32 sysctl_rmem_max; +extern __u32 sysctl_wmem_default; +extern __u32 sysctl_rmem_default; + +extern int sysctl_core_destroy_delay; +extern int sysctl_optmem_max; + +ctl_table core_table[] = { +#ifdef CONFIG_NET + {NET_CORE_WMEM_MAX, "wmem_max", + &sysctl_wmem_max, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_CORE_RMEM_MAX, "rmem_max", + &sysctl_rmem_max, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_CORE_WMEM_DEFAULT, "wmem_default", + &sysctl_wmem_default, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_CORE_RMEM_DEFAULT, "rmem_default", + &sysctl_rmem_default, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_CORE_MAX_BACKLOG, "netdev_max_backlog", + &netdev_max_backlog, sizeof(int), 0644, NULL, + &proc_dointvec}, +#ifdef CONFIG_NET_FASTROUTE + {NET_CORE_FASTROUTE, "netdev_fastroute", + &netdev_fastroute, sizeof(int), 0644, NULL, + &proc_dointvec}, +#endif + {NET_CORE_MSG_COST, "message_cost", + &net_msg_cost, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_CORE_MSG_BURST, "message_burst", + &net_msg_burst, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_CORE_OPTMEM_MAX, "optmem_max", + &sysctl_optmem_max, sizeof(int), 0644, NULL, + &proc_dointvec}, +#endif /* CONFIG_NET */ + { 0 } +}; +#endif diff --git a/pfinet/linux-src/net/core/utils.c b/pfinet/linux-src/net/core/utils.c new file mode 100644 index 00000000..415926b8 --- /dev/null +++ b/pfinet/linux-src/net/core/utils.c @@ -0,0 +1,66 @@ +/* + * Generic address resultion entity + * + * Authors: + * net_random Alan Cox + * net_ratelimit Andy Kleen + * + * Created by Alexey Kuznetsov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +static unsigned long net_rand_seed = 152L; + +unsigned long net_random(void) +{ + net_rand_seed=net_rand_seed*69069L+1; + return net_rand_seed^jiffies; +} + +void net_srandom(unsigned long entropy) +{ + net_rand_seed ^= entropy; + net_random(); +} + +int net_msg_cost = 5*HZ; +int net_msg_burst = 10*5*HZ; + +/* + * This enforces a rate limit: not more than one kernel message + * every 5secs to make a denial-of-service attack impossible. + * + * All warning printk()s should be guarded by this function. + */ +int net_ratelimit(void) +{ + static unsigned long toks = 10*5*HZ; + static unsigned long last_msg; + static int missed; + unsigned long now = jiffies; + + toks += now - xchg(&last_msg, now); + if (toks > net_msg_burst) + toks = net_msg_burst; + if (toks >= net_msg_cost) { + toks -= net_msg_cost; + if (missed) + printk(KERN_WARNING "NET: %d messages suppressed.\n", missed); + missed = 0; + return 1; + } + missed++; + return 0; +} -- cgit v1.2.3