path: root/device/net_io.c
author		Thomas Bushnell <thomas@gnu.org>	1997-02-25 21:28:37 +0000
committer	Thomas Bushnell <thomas@gnu.org>	1997-02-25 21:28:37 +0000
commit		f07a4c844da9f0ecae5bbee1ab94be56505f26f7 (patch)
tree		12b07c7e578fc1a5f53dbfde2632408491ff2a70	/device/net_io.c
Initial source
Diffstat (limited to 'device/net_io.c')
-rw-r--r--	device/net_io.c	2168
1 files changed, 2168 insertions, 0 deletions
diff --git a/device/net_io.c b/device/net_io.c
new file mode 100644
index 0000000..7714ebe
--- /dev/null
+++ b/device/net_io.c
@@ -0,0 +1,2168 @@
+ /*
+ * Mach Operating System
+ * Copyright (c) 1993-1989 Carnegie Mellon University
+ * All Rights Reserved.
+ *
+ * Permission to use, copy, modify and distribute this software and its
+ * documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie Mellon
+ * the rights to redistribute these changes.
+ */
+/*
+ * Author: David B. Golub, Carnegie Mellon University
+ * Date: 3/98
+ *
+ * Network IO.
+ *
+ * Packet filter code taken from vaxif/enet.c written at
+ * CMU and Stanford.
+ */
+
+/*
+ * Note: don't depend on anything in this file.
+ * It may change a lot real soon. -cmaeda 11 June 1993
+ */
+
+#include <norma_ether.h>
+#include <mach_ttd.h>
+
+#include <sys/types.h>
+#include <device/net_status.h>
+#include <machine/machspl.h> /* spl definitions */
+#include <device/net_io.h>
+#include <device/if_hdr.h>
+#include <device/io_req.h>
+#include <device/ds_routines.h>
+
+#include <mach/boolean.h>
+#include <mach/vm_param.h>
+
+#include <ipc/ipc_port.h>
+#include <ipc/ipc_kmsg.h>
+#include <ipc/ipc_mqueue.h>
+
+#include <kern/counters.h>
+#include <kern/lock.h>
+#include <kern/queue.h>
+#include <kern/sched_prim.h>
+#include <kern/thread.h>
+
+#if NORMA_ETHER
+#include <norma/ipc_ether.h>
+#endif /*NORMA_ETHER*/
+
+#include <machine/machspl.h>
+
+#if MACH_TTD
+#include <ttd/ttd_stub.h>
+#endif /* MACH_TTD */
+
+#if MACH_TTD
+int kttd_async_counter= 0;
+#endif /* MACH_TTD */
+
+
+/*
+ * Packet Buffer Management
+ *
+ * This module manages a private pool of kmsg buffers.
+ */
+
+/*
+ * List of net kmsgs queued to be sent to users.
+ * Messages can be high priority or low priority.
+ * The network thread processes high priority messages first.
+ */
+decl_simple_lock_data(,net_queue_lock)
+boolean_t net_thread_awake = FALSE;
+struct ipc_kmsg_queue net_queue_high;
+int net_queue_high_size = 0;
+int net_queue_high_max = 0; /* for debugging */
+struct ipc_kmsg_queue net_queue_low;
+int net_queue_low_size = 0;
+int net_queue_low_max = 0; /* for debugging */
+
+/*
+ * List of net kmsgs that can be touched at interrupt level.
+ * If it is empty, we will also steal low priority messages.
+ */
+decl_simple_lock_data(,net_queue_free_lock)
+struct ipc_kmsg_queue net_queue_free;
+int net_queue_free_size = 0; /* on free list */
+int net_queue_free_max = 0; /* for debugging */
+
+/*
+ * This value is critical to network performance.
+ * At least this many buffers should be sitting in net_queue_free.
+ * If this is set too small, we will drop network packets.
+ * Even a low drop rate (<1%) can cause severe network throughput problems.
+ * We add one to net_queue_free_min for every filter.
+ */
+int net_queue_free_min = 3;
+
+int net_queue_free_hits = 0; /* for debugging */
+int net_queue_free_steals = 0; /* for debugging */
+int net_queue_free_misses = 0; /* for debugging */
+
+int net_kmsg_send_high_hits = 0; /* for debugging */
+int net_kmsg_send_low_hits = 0; /* for debugging */
+int net_kmsg_send_high_misses = 0; /* for debugging */
+int net_kmsg_send_low_misses = 0; /* for debugging */
+
+int net_thread_awaken = 0; /* for debugging */
+int net_ast_taken = 0; /* for debugging */
+
+decl_simple_lock_data(,net_kmsg_total_lock)
+int net_kmsg_total = 0; /* total allocated */
+int net_kmsg_max; /* initialized below */
+
+vm_size_t net_kmsg_size; /* initialized below */
+
+/*
+ * We want more buffers when there aren't enough in the free queue
+ * and the low priority queue. However, we don't want to allocate
+ * more than net_kmsg_max.
+ */
+
+#define net_kmsg_want_more() \
+ (((net_queue_free_size + net_queue_low_size) < net_queue_free_min) && \
+ (net_kmsg_total < net_kmsg_max))
+
+ipc_kmsg_t
+net_kmsg_get(void)
+{
+ register ipc_kmsg_t kmsg;
+ spl_t s;
+
+ /*
+ * First check the list of free buffers.
+ */
+ s = splimp();
+ simple_lock(&net_queue_free_lock);
+ kmsg = ipc_kmsg_queue_first(&net_queue_free);
+ if (kmsg != IKM_NULL) {
+ ipc_kmsg_rmqueue_first_macro(&net_queue_free, kmsg);
+ net_queue_free_size--;
+ net_queue_free_hits++;
+ }
+ simple_unlock(&net_queue_free_lock);
+
+ if (kmsg == IKM_NULL) {
+ /*
+ * Try to steal from the low priority queue.
+ */
+ simple_lock(&net_queue_lock);
+ kmsg = ipc_kmsg_queue_first(&net_queue_low);
+ if (kmsg != IKM_NULL) {
+ ipc_kmsg_rmqueue_first_macro(&net_queue_low, kmsg);
+ net_queue_low_size--;
+ net_queue_free_steals++;
+ }
+ simple_unlock(&net_queue_lock);
+ }
+
+ if (kmsg == IKM_NULL)
+ net_queue_free_misses++;
+ (void) splx(s);
+
+ if (net_kmsg_want_more() || (kmsg == IKM_NULL)) {
+ boolean_t awake;
+
+ s = splimp();
+ simple_lock(&net_queue_lock);
+ awake = net_thread_awake;
+ net_thread_awake = TRUE;
+ simple_unlock(&net_queue_lock);
+ (void) splx(s);
+
+ if (!awake)
+ thread_wakeup((event_t) &net_thread_awake);
+ }
+
+ return kmsg;
+}
+
+void
+net_kmsg_put(register ipc_kmsg_t kmsg)
+{
+ spl_t s;
+
+ s = splimp();
+ simple_lock(&net_queue_free_lock);
+ ipc_kmsg_enqueue_macro(&net_queue_free, kmsg);
+ if (++net_queue_free_size > net_queue_free_max)
+ net_queue_free_max = net_queue_free_size;
+ simple_unlock(&net_queue_free_lock);
+ (void) splx(s);
+}
+
+void
+net_kmsg_collect(void)
+{
+ register ipc_kmsg_t kmsg;
+ spl_t s;
+
+ s = splimp();
+ simple_lock(&net_queue_free_lock);
+ while (net_queue_free_size > net_queue_free_min) {
+ kmsg = ipc_kmsg_dequeue(&net_queue_free);
+ net_queue_free_size--;
+ simple_unlock(&net_queue_free_lock);
+ (void) splx(s);
+
+ net_kmsg_free(kmsg);
+ simple_lock(&net_kmsg_total_lock);
+ net_kmsg_total--;
+ simple_unlock(&net_kmsg_total_lock);
+
+ s = splimp();
+ simple_lock(&net_queue_free_lock);
+ }
+ simple_unlock(&net_queue_free_lock);
+ (void) splx(s);
+}
+
+void
+net_kmsg_more(void)
+{
+ register ipc_kmsg_t kmsg;
+
+ /*
+ * Replenish net kmsg pool if low. We don't have the locks
+ * necessary to look at these variables, but that's OK because
+ * misread values aren't critical. The danger in this code is
+ * that while we allocate buffers, interrupts are happening
+ * which take buffers out of the free list. If we are not
+ * careful, we will sit in the loop and allocate a zillion
+ * buffers while a burst of packets arrives. So we count
+ * buffers in the low priority queue as available, because
+ * net_kmsg_get will make use of them, and we cap the total
+ * number of buffers we are willing to allocate.
+ */
+
+ while (net_kmsg_want_more()) {
+ simple_lock(&net_kmsg_total_lock);
+ net_kmsg_total++;
+ simple_unlock(&net_kmsg_total_lock);
+ kmsg = net_kmsg_alloc();
+ net_kmsg_put(kmsg);
+ }
+}
+
+/*
+ * Packet Filter Data Structures
+ *
+ * Each network interface has a set of packet filters
+ * that are run on incoming packets.
+ *
+ * Each packet filter may represent a single network
+ * session or multiple network sessions. For example,
+ * all application level TCP sessions would be represented
+ * by a single packet filter data structure.
+ *
+ * If a packet filter has a single session, we use a
+ * struct net_rcv_port to represent it. If the packet
+ * filter represents multiple sessions, we use a
+ * struct net_hash_header to represent it.
+ */
+
+/*
+ * Each interface has a write port and a set of read ports.
+ * Each read port has one or more filters to determine what packets
+ * should go to that port.
+ */
+
+/*
+ * Receive port for net, with packet filter.
+ * This data structure by itself represents a packet
+ * filter for a single session.
+ */
+struct net_rcv_port {
+ queue_chain_t chain; /* list of open_descriptors */
+ ipc_port_t rcv_port; /* port to send packet to */
+ int rcv_qlimit; /* port's qlimit */
+ int rcv_count; /* number of packets received */
+ int priority; /* priority for filter */
+ filter_t *filter_end; /* pointer to end of filter */
+ filter_t filter[NET_MAX_FILTER];
+ /* filter operations */
+};
+typedef struct net_rcv_port *net_rcv_port_t;
+
+zone_t net_rcv_zone; /* zone of net_rcv_port structs */
+
+
+#define NET_HASH_SIZE 256
+#define N_NET_HASH 4
+#define N_NET_HASH_KEYS 4
+
+unsigned int bpf_hash (int, unsigned int *);
+
+/*
+ * A single hash entry.
+ */
+struct net_hash_entry {
+ queue_chain_t chain; /* list of entries with same hval */
+#define he_next chain.next
+#define he_prev chain.prev
+ ipc_port_t rcv_port; /* destination port */
+ int rcv_qlimit; /* qlimit for the port */
+ unsigned int keys[N_NET_HASH_KEYS];
+};
+typedef struct net_hash_entry *net_hash_entry_t;
+
+zone_t net_hash_entry_zone;
+
+/*
+ * This structure represents a packet filter with multiple sessions.
+ *
+ * For example, all application level TCP sessions might be
+ * represented by one of these structures. It looks like a
+ * net_rcv_port struct so that both types can live on the
+ * same packet filter queues.
+ */
+struct net_hash_header {
+ struct net_rcv_port rcv;
+ int n_keys; /* zero if not used */
+ int ref_count; /* reference count */
+ net_hash_entry_t table[NET_HASH_SIZE];
+} filter_hash_header[N_NET_HASH];
+
+typedef struct net_hash_header *net_hash_header_t;
+
+decl_simple_lock_data(,net_hash_header_lock)
+
+#define HASH_ITERATE(head, elt) (elt) = (net_hash_entry_t) (head); do {
+#define HASH_ITERATE_END(head, elt) \
+ (elt) = (net_hash_entry_t) queue_next((queue_entry_t) (elt)); \
+ } while ((elt) != (head));
+
+
+#define FILTER_ITERATE(ifp, fp, nextfp) \
+ for ((fp) = (net_rcv_port_t) queue_first(&(ifp)->if_rcv_port_list);\
+ !queue_end(&(ifp)->if_rcv_port_list, (queue_entry_t)(fp)); \
+ (fp) = (nextfp)) { \
+ (nextfp) = (net_rcv_port_t) queue_next(&(fp)->chain);
+#define FILTER_ITERATE_END }
+
+/* entry_p must be net_rcv_port_t or net_hash_entry_t */
+#define ENQUEUE_DEAD(dead, entry_p) { \
+ queue_next(&(entry_p)->chain) = (queue_entry_t) (dead); \
+ (dead) = (queue_entry_t)(entry_p); \
+}
+
+extern boolean_t net_do_filter(); /* CSPF */
+extern int bpf_do_filter(); /* BPF */
+
+
+/*
+ * ethernet_priority:
+ *
+ * This function properly belongs in the ethernet interfaces;
+ * it should not be called by this module. (We get packet
+ * priorities as an argument to net_packet.) It is here
+ * to avoid massive code duplication.
+ *
+ * Returns TRUE for high-priority packets.
+ */
+
+boolean_t ethernet_priority(kmsg)
+ ipc_kmsg_t kmsg;
+{
+ register unsigned char *addr =
+ (unsigned char *) net_kmsg(kmsg)->header;
+
+ /*
+ * A simplistic check for broadcast packets.
+ */
+
+ if ((addr[0] == 0xff) && (addr[1] == 0xff) &&
+ (addr[2] == 0xff) && (addr[3] == 0xff) &&
+ (addr[4] == 0xff) && (addr[5] == 0xff))
+ return FALSE;
+ else
+ return TRUE;
+}
+
+mach_msg_type_t header_type = {
+	MACH_MSG_TYPE_BYTE,	/* name */
+	8,			/* size */
+	NET_HDW_HDR_MAX,	/* number */
+	TRUE,			/* inline */
+	FALSE,			/* longform */
+	FALSE,			/* deallocate */
+	0			/* unused */
+};
+
+mach_msg_type_t packet_type = {
+ MACH_MSG_TYPE_BYTE, /* name */
+ 8, /* size */
+ 0, /* number */
+ TRUE, /* inline */
+ FALSE, /* longform */
+ FALSE /* deallocate */
+};
+
+/*
+ * net_deliver:
+ *
+ * Called and returns holding net_queue_lock, at splimp.
+ * Dequeues a message and delivers it at spl0.
+ * Returns FALSE if no messages.
+ */
+boolean_t net_deliver(nonblocking)
+ boolean_t nonblocking;
+{
+ register ipc_kmsg_t kmsg;
+ boolean_t high_priority;
+ struct ipc_kmsg_queue send_list;
+
+ /*
+ * Pick up a pending network message and deliver it.
+ * Deliver high priority messages before low priority.
+ */
+
+ if ((kmsg = ipc_kmsg_dequeue(&net_queue_high)) != IKM_NULL) {
+ net_queue_high_size--;
+ high_priority = TRUE;
+ } else if ((kmsg = ipc_kmsg_dequeue(&net_queue_low)) != IKM_NULL) {
+ net_queue_low_size--;
+ high_priority = FALSE;
+ } else
+ return FALSE;
+ simple_unlock(&net_queue_lock);
+ (void) spl0();
+
+ /*
+ * Run the packet through the filters,
+ * getting back a queue of packets to send.
+ */
+ net_filter(kmsg, &send_list);
+
+ if (!nonblocking) {
+ /*
+ * There is a danger of running out of available buffers
+ * because they all get moved into the high priority queue
+ * or a port queue. In particular, we might need to
+ * allocate more buffers as we pull (previously available)
+ * buffers out of the low priority queue. But we can only
+ * allocate if we are allowed to block.
+ */
+ net_kmsg_more();
+ }
+
+ while ((kmsg = ipc_kmsg_dequeue(&send_list)) != IKM_NULL) {
+ int count;
+
+ /*
+ * Fill in the rest of the kmsg.
+ */
+ count = net_kmsg(kmsg)->net_rcv_msg_packet_count;
+
+ ikm_init_special(kmsg, IKM_SIZE_NETWORK);
+
+ kmsg->ikm_header.msgh_bits =
+ MACH_MSGH_BITS(MACH_MSG_TYPE_PORT_SEND, 0);
+ /* remember message sizes must be rounded up */
+ kmsg->ikm_header.msgh_size =
+ ((mach_msg_size_t) (sizeof(struct net_rcv_msg)
+ - NET_RCV_MAX + count))+3 &~ 3;
+ kmsg->ikm_header.msgh_local_port = MACH_PORT_NULL;
+ kmsg->ikm_header.msgh_kind = MACH_MSGH_KIND_NORMAL;
+ kmsg->ikm_header.msgh_id = NET_RCV_MSG_ID;
+
+ net_kmsg(kmsg)->header_type = header_type;
+ net_kmsg(kmsg)->packet_type = packet_type;
+ net_kmsg(kmsg)->net_rcv_msg_packet_count = count;
+
+ /*
+ * Send the packet to the destination port. Drop it
+ * if the destination port is over its backlog.
+ */
+
+ if (ipc_mqueue_send(kmsg, MACH_SEND_TIMEOUT, 0) ==
+ MACH_MSG_SUCCESS) {
+ if (high_priority)
+ net_kmsg_send_high_hits++;
+ else
+ net_kmsg_send_low_hits++;
+ /* the receiver is responsible for the message now */
+ } else {
+ if (high_priority)
+ net_kmsg_send_high_misses++;
+ else
+ net_kmsg_send_low_misses++;
+ ipc_kmsg_destroy(kmsg);
+ }
+ }
+
+ (void) splimp();
+ simple_lock(&net_queue_lock);
+ return TRUE;
+}
+
+/*
+ * We want to deliver packets using ASTs, so we can avoid the
+ * thread_wakeup/thread_block needed to get to the network
+ * thread. However, we can't allocate memory in the AST handler,
+ * because memory allocation might block. Hence we have the
+ * network thread to allocate memory. The network thread also
+ * delivers packets, so it can be allocating and delivering for a
+ * burst. net_thread_awake is protected by net_queue_lock
+ * (instead of net_queue_free_lock) so that net_packet and
+ * net_ast can safely determine if the network thread is running.
+ * This prevents a race that might leave a packet sitting without
+ * being delivered. It is possible for net_kmsg_get to think
+ * the network thread is awake, and so avoid a wakeup, and then
+ * have the network thread sleep without allocating. The next
+ * net_kmsg_get will do a wakeup.
+ */
+
+void net_ast()
+{
+ spl_t s;
+
+ net_ast_taken++;
+
+ /*
+ * If the network thread is awake, then we would
+ * rather deliver messages from it, because
+ * it can also allocate memory.
+ */
+
+ s = splimp();
+ simple_lock(&net_queue_lock);
+ while (!net_thread_awake && net_deliver(TRUE))
+ continue;
+
+ /*
+ * Prevent an unnecessary AST. Either the network
+ * thread will deliver the messages, or there are
+ * no messages left to deliver.
+ */
+
+ simple_unlock(&net_queue_lock);
+ (void) splsched();
+ ast_off(cpu_number(), AST_NETWORK);
+ (void) splx(s);
+}
+
+void net_thread_continue()
+{
+ for (;;) {
+ spl_t s;
+
+ net_thread_awaken++;
+
+ /*
+ * First get more buffers.
+ */
+ net_kmsg_more();
+
+ s = splimp();
+ simple_lock(&net_queue_lock);
+ while (net_deliver(FALSE))
+ continue;
+
+ net_thread_awake = FALSE;
+ assert_wait(&net_thread_awake, FALSE);
+ simple_unlock(&net_queue_lock);
+ (void) splx(s);
+ counter(c_net_thread_block++);
+ thread_block(net_thread_continue);
+ }
+}
+
+void net_thread()
+{
+ spl_t s;
+
+ /*
+ * We should be very high priority.
+ */
+
+ thread_set_own_priority(0);
+
+ /*
+ * We sleep initially, so that we don't allocate any buffers
+ * unless the network is really in use and they are needed.
+ */
+
+ s = splimp();
+ simple_lock(&net_queue_lock);
+ net_thread_awake = FALSE;
+ assert_wait(&net_thread_awake, FALSE);
+ simple_unlock(&net_queue_lock);
+ (void) splx(s);
+ counter(c_net_thread_block++);
+ thread_block(net_thread_continue);
+ net_thread_continue();
+ /*NOTREACHED*/
+}
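+
+/*
+ * Editor's note (illustrative, not part of the original source):
+ * net_thread is started once at boot by the kernel startup code,
+ * roughly as
+ *
+ *	(void) kernel_thread(kernel_task, net_thread, (char *) 0);
+ *
+ * after which it sleeps until net_kmsg_get or net_packet wakes it up.
+ */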
+
+void
+reorder_queue(first, last)
+ register queue_t first, last;
+{
+ register queue_entry_t prev, next;
+
+ prev = first->prev;
+ next = last->next;
+
+ prev->next = last;
+ next->prev = first;
+
+ last->prev = prev;
+ last->next = first;
+
+ first->next = next;
+ first->prev = last;
+}
+
+/*
+ * Incoming packet. Header has already been moved to proper place.
+ * We are already at splimp.
+ */
+void
+net_packet(ifp, kmsg, count, priority)
+ register struct ifnet *ifp;
+ register ipc_kmsg_t kmsg;
+ unsigned int count;
+ boolean_t priority;
+{
+ boolean_t awake;
+
+#if NORMA_ETHER
+ if (netipc_net_packet(kmsg, count)) {
+ return;
+ }
+#endif /* NORMA_ETHER */
+
+#if MACH_TTD
+ /*
+ * Do a quick check to see if it is a kernel TTD packet.
+ *
+	 * Only check if KernelTTD is enabled, i.e. the current
+	 * device driver supports TTD and the bootp succeeded.
+ */
+ if (kttd_enabled && kttd_handle_async(kmsg)) {
+ /*
+ * Packet was a valid ttd packet and
+ * doesn't need to be passed up to filter.
+ * The ttd code put the used kmsg buffer
+ * back onto the free list.
+ */
+ if (kttd_debug)
+ printf("**%x**", kttd_async_counter++);
+ return;
+ }
+#endif /* MACH_TTD */
+
+ kmsg->ikm_header.msgh_remote_port = (mach_port_t) ifp;
+ net_kmsg(kmsg)->net_rcv_msg_packet_count = count;
+
+ simple_lock(&net_queue_lock);
+ if (priority) {
+ ipc_kmsg_enqueue(&net_queue_high, kmsg);
+ if (++net_queue_high_size > net_queue_high_max)
+ net_queue_high_max = net_queue_high_size;
+ } else {
+ ipc_kmsg_enqueue(&net_queue_low, kmsg);
+ if (++net_queue_low_size > net_queue_low_max)
+ net_queue_low_max = net_queue_low_size;
+ }
+ /*
+ * If the network thread is awake, then we don't
+ * need to take an AST, because the thread will
+ * deliver the packet.
+ */
+ awake = net_thread_awake;
+ simple_unlock(&net_queue_lock);
+
+ if (!awake) {
+ spl_t s = splsched();
+ ast_on(cpu_number(), AST_NETWORK);
+ (void) splx(s);
+ }
+}
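+
+/*
+ * Illustrative sketch (editor's addition, not part of the original
+ * source): roughly how an ethernet driver hands a received frame to
+ * this module.  The function name, "rxbuf" and "rxlen" are hypothetical
+ * driver-side names; only net_kmsg_get, net_kmsg, ethernet_priority and
+ * net_packet come from this file, and struct packet_header from
+ * device/net_status.h.
+ */
+#if 0	/* example only, never compiled */
+void example_driver_rx(ifp, rxbuf, rxlen)
+	struct ifnet *ifp;
+	char *rxbuf;		/* raw frame: 14-byte ether header + payload */
+	unsigned int rxlen;
+{
+	ipc_kmsg_t new_kmsg;
+	struct packet_header *pkt;
+
+	new_kmsg = net_kmsg_get();
+	if (new_kmsg == IKM_NULL) {
+		/* Out of buffers: the frame has to be dropped. */
+		return;
+	}
+	/* The link-level header goes into the header area... */
+	bcopy(rxbuf, net_kmsg(new_kmsg)->header, 14);
+	/* ...and the payload goes after a packet_header in the data area. */
+	pkt = (struct packet_header *) net_kmsg(new_kmsg)->packet;
+	bcopy(rxbuf + 14, (char *)(pkt + 1), rxlen - 14);
+	pkt->type = ((unsigned short *) rxbuf)[6];	/* ethertype, net order */
+	pkt->length = rxlen - 14 + sizeof(struct packet_header);
+	net_packet(ifp, new_kmsg, pkt->length, ethernet_priority(new_kmsg));
+}
+#endif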
+
+int net_filter_queue_reorder = 0; /* non-zero to enable reordering */
+
+/*
+ * Run a packet through the filters, returning a list of messages.
+ * We are *not* called at interrupt level.
+ */
+void
+net_filter(kmsg, send_list)
+ register ipc_kmsg_t kmsg;
+ ipc_kmsg_queue_t send_list;
+{
+ register struct ifnet *ifp;
+ register net_rcv_port_t infp, nextfp;
+ register ipc_kmsg_t new_kmsg;
+
+ net_hash_entry_t entp, *hash_headp;
+ ipc_port_t dest;
+ queue_entry_t dead_infp = (queue_entry_t) 0;
+ queue_entry_t dead_entp = (queue_entry_t) 0;
+ unsigned int ret_count;
+
+ int count = net_kmsg(kmsg)->net_rcv_msg_packet_count;
+ ifp = (struct ifnet *) kmsg->ikm_header.msgh_remote_port;
+ ipc_kmsg_queue_init(send_list);
+
+ /*
+ * Unfortunately we can't allocate or deallocate memory
+ * while holding this lock. And we can't drop the lock
+ * while examining the filter list.
+ */
+ simple_lock(&ifp->if_rcv_port_list_lock);
+ FILTER_ITERATE(ifp, infp, nextfp)
+ {
+ entp = (net_hash_entry_t) 0;
+ if (infp->filter[0] == NETF_BPF) {
+ ret_count = bpf_do_filter(infp, net_kmsg(kmsg)->packet, count,
+ net_kmsg(kmsg)->header,
+ &hash_headp, &entp);
+ if (entp == (net_hash_entry_t) 0)
+ dest = infp->rcv_port;
+ else
+ dest = entp->rcv_port;
+ } else {
+ ret_count = net_do_filter(infp, net_kmsg(kmsg)->packet, count,
+ net_kmsg(kmsg)->header);
+ if (ret_count)
+ ret_count = count;
+ dest = infp->rcv_port;
+ }
+
+ if (ret_count) {
+
+ /*
+ * Make a send right for the destination.
+ */
+
+ dest = ipc_port_copy_send(dest);
+ if (!IP_VALID(dest)) {
+ /*
+ * This filter is dead. We remove it from the
+ * filter list and set it aside for deallocation.
+ */
+
+ if (entp == (net_hash_entry_t) 0) {
+ queue_remove(&ifp->if_rcv_port_list, infp,
+ net_rcv_port_t, chain);
+ ENQUEUE_DEAD(dead_infp, infp);
+ continue;
+ } else {
+ hash_ent_remove (ifp,
+ (net_hash_header_t)infp,
+ FALSE, /* no longer used */
+ hash_headp,
+ entp,
+ &dead_entp);
+ continue;
+ }
+ }
+
+ /*
+ * Deliver copy of packet to this channel.
+ */
+ if (ipc_kmsg_queue_empty(send_list)) {
+ /*
+ * Only receiver, so far
+ */
+ new_kmsg = kmsg;
+ } else {
+ /*
+ * Other receivers - must allocate message and copy.
+ */
+ new_kmsg = net_kmsg_get();
+ if (new_kmsg == IKM_NULL) {
+ ipc_port_release_send(dest);
+ break;
+ }
+
+ bcopy(
+ net_kmsg(kmsg)->packet,
+ net_kmsg(new_kmsg)->packet,
+ ret_count);
+ bcopy(
+ net_kmsg(kmsg)->header,
+ net_kmsg(new_kmsg)->header,
+ NET_HDW_HDR_MAX);
+ }
+ net_kmsg(new_kmsg)->net_rcv_msg_packet_count = ret_count;
+ new_kmsg->ikm_header.msgh_remote_port = (mach_port_t) dest;
+ ipc_kmsg_enqueue(send_list, new_kmsg);
+
+ {
+ register net_rcv_port_t prevfp;
+ int rcount = ++infp->rcv_count;
+
+ /*
+ * See if ordering of filters is wrong
+ */
+ if (infp->priority >= NET_HI_PRI) {
+ prevfp = (net_rcv_port_t) queue_prev(&infp->chain);
+ /*
+ * If infp is not the first element on the queue,
+ * and the previous element is at equal priority
+ * but has a lower count, then promote infp to
+ * be in front of prevfp.
+ */
+ if ((queue_t)prevfp != &ifp->if_rcv_port_list &&
+ infp->priority == prevfp->priority) {
+ /*
+ * Threshold difference to prevent thrashing
+ */
+ if (net_filter_queue_reorder
+ && (100 + prevfp->rcv_count < rcount))
+ reorder_queue(&prevfp->chain, &infp->chain);
+ }
+ /*
+ * High-priority filter -> no more deliveries
+ */
+ break;
+ }
+ }
+ }
+ }
+ FILTER_ITERATE_END
+
+ simple_unlock(&ifp->if_rcv_port_list_lock);
+
+ /*
+ * Deallocate dead filters.
+ */
+ if (dead_infp != 0)
+ net_free_dead_infp(dead_infp);
+ if (dead_entp != 0)
+ net_free_dead_entp(dead_entp);
+
+ if (ipc_kmsg_queue_empty(send_list)) {
+ /* Not sent - recycle */
+ net_kmsg_put(kmsg);
+ }
+}
+
+boolean_t
+net_do_filter(infp, data, data_count, header)
+ net_rcv_port_t infp;
+ char * data;
+ unsigned int data_count;
+ char * header;
+{
+ int stack[NET_FILTER_STACK_DEPTH+1];
+ register int *sp;
+ register filter_t *fp, *fpe;
+ register unsigned int op, arg;
+
+ /*
+ * The filter accesses the header and data
+ * as unsigned short words.
+ */
+ data_count /= sizeof(unsigned short);
+
+#define data_word ((unsigned short *)data)
+#define header_word ((unsigned short *)header)
+
+ sp = &stack[NET_FILTER_STACK_DEPTH];
+ fp = &infp->filter[0];
+ fpe = infp->filter_end;
+
+ *sp = TRUE;
+
+ while (fp < fpe) {
+ arg = *fp++;
+ op = NETF_OP(arg);
+ arg = NETF_ARG(arg);
+
+ switch (arg) {
+ case NETF_NOPUSH:
+ arg = *sp++;
+ break;
+ case NETF_PUSHZERO:
+ arg = 0;
+ break;
+ case NETF_PUSHLIT:
+ arg = *fp++;
+ break;
+ case NETF_PUSHIND:
+ arg = *sp++;
+ if (arg >= data_count)
+ return FALSE;
+ arg = data_word[arg];
+ break;
+ case NETF_PUSHHDRIND:
+ arg = *sp++;
+ if (arg >= NET_HDW_HDR_MAX/sizeof(unsigned short))
+ return FALSE;
+ arg = header_word[arg];
+ break;
+ default:
+ if (arg >= NETF_PUSHSTK) {
+ arg = sp[arg - NETF_PUSHSTK];
+ }
+ else if (arg >= NETF_PUSHHDR) {
+ arg = header_word[arg - NETF_PUSHHDR];
+ }
+ else {
+ arg -= NETF_PUSHWORD;
+ if (arg >= data_count)
+ return FALSE;
+ arg = data_word[arg];
+ }
+ break;
+
+ }
+ switch (op) {
+ case NETF_OP(NETF_NOP):
+ *--sp = arg;
+ break;
+ case NETF_OP(NETF_AND):
+ *sp &= arg;
+ break;
+ case NETF_OP(NETF_OR):
+ *sp |= arg;
+ break;
+ case NETF_OP(NETF_XOR):
+ *sp ^= arg;
+ break;
+ case NETF_OP(NETF_EQ):
+ *sp = (*sp == arg);
+ break;
+ case NETF_OP(NETF_NEQ):
+ *sp = (*sp != arg);
+ break;
+ case NETF_OP(NETF_LT):
+ *sp = (*sp < arg);
+ break;
+ case NETF_OP(NETF_LE):
+ *sp = (*sp <= arg);
+ break;
+ case NETF_OP(NETF_GT):
+ *sp = (*sp > arg);
+ break;
+ case NETF_OP(NETF_GE):
+ *sp = (*sp >= arg);
+ break;
+ case NETF_OP(NETF_COR):
+ if (*sp++ == arg)
+ return (TRUE);
+ break;
+ case NETF_OP(NETF_CAND):
+ if (*sp++ != arg)
+ return (FALSE);
+ break;
+ case NETF_OP(NETF_CNOR):
+ if (*sp++ == arg)
+ return (FALSE);
+ break;
+ case NETF_OP(NETF_CNAND):
+ if (*sp++ != arg)
+ return (TRUE);
+ break;
+ case NETF_OP(NETF_LSH):
+ *sp <<= arg;
+ break;
+ case NETF_OP(NETF_RSH):
+ *sp >>= arg;
+ break;
+ case NETF_OP(NETF_ADD):
+ *sp += arg;
+ break;
+ case NETF_OP(NETF_SUB):
+ *sp -= arg;
+ break;
+ }
+ }
+ return ((*sp) ? TRUE : FALSE);
+
+#undef data_word
+#undef header_word
+}
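+
+/*
+ * Illustrative sketch (editor's addition, not part of the original
+ * source): what a CSPF filter program looks like to the interpreter
+ * above.  Each word combines an argument with an operation; a literal
+ * follows NETF_PUSHLIT in the next word.  The NETF_* encodings are
+ * assumed to be the ones from device/net_status.h; the ethertype
+ * 0x0800 (IP) is just an example value.
+ */
+#if 0	/* example only, never compiled */
+	/* Accept every packet: push the literal TRUE and fall off the end. */
+	filter_t accept_all[] = {
+		NETF_PUSHLIT | NETF_NOP,
+		TRUE
+	};
+
+	/* Accept only IP: push header word 6 (the ethertype on ethernet),
+	   then compare it for equality against a pushed literal. */
+	filter_t ip_only[] = {
+		NETF_PUSHHDR + 6 | NETF_NOP,
+		NETF_PUSHLIT | NETF_EQ,
+		htons(0x0800)
+	};
+#endif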
+
+/*
+ * Check filter for invalid operations or stack over/under-flow.
+ */
+boolean_t
+parse_net_filter(filter, count)
+ register filter_t *filter;
+ unsigned int count;
+{
+ register int sp;
+ register filter_t *fpe = &filter[count];
+ register filter_t op, arg;
+
+ sp = NET_FILTER_STACK_DEPTH;
+
+ for (; filter < fpe; filter++) {
+ op = NETF_OP(*filter);
+ arg = NETF_ARG(*filter);
+
+ switch (arg) {
+ case NETF_NOPUSH:
+ break;
+ case NETF_PUSHZERO:
+ sp--;
+ break;
+ case NETF_PUSHLIT:
+ filter++;
+ if (filter >= fpe)
+ return (FALSE); /* literal value not in filter */
+ sp--;
+ break;
+ case NETF_PUSHIND:
+ case NETF_PUSHHDRIND:
+ break;
+ default:
+ if (arg >= NETF_PUSHSTK) {
+ if (arg - NETF_PUSHSTK + sp > NET_FILTER_STACK_DEPTH)
+ return FALSE;
+ }
+ else if (arg >= NETF_PUSHHDR) {
+ if (arg - NETF_PUSHHDR >=
+ NET_HDW_HDR_MAX/sizeof(unsigned short))
+ return FALSE;
+ }
+ /* else... cannot check for packet bounds
+ without packet */
+ sp--;
+ break;
+ }
+ if (sp < 2) {
+ return (FALSE); /* stack overflow */
+ }
+ if (op == NETF_OP(NETF_NOP))
+ continue;
+
+ /*
+ * all non-NOP operators are binary.
+ */
+ if (sp > NET_MAX_FILTER-2)
+ return (FALSE);
+
+ sp++;
+ switch (op) {
+ case NETF_OP(NETF_AND):
+ case NETF_OP(NETF_OR):
+ case NETF_OP(NETF_XOR):
+ case NETF_OP(NETF_EQ):
+ case NETF_OP(NETF_NEQ):
+ case NETF_OP(NETF_LT):
+ case NETF_OP(NETF_LE):
+ case NETF_OP(NETF_GT):
+ case NETF_OP(NETF_GE):
+ case NETF_OP(NETF_COR):
+ case NETF_OP(NETF_CAND):
+ case NETF_OP(NETF_CNOR):
+ case NETF_OP(NETF_CNAND):
+ case NETF_OP(NETF_LSH):
+ case NETF_OP(NETF_RSH):
+ case NETF_OP(NETF_ADD):
+ case NETF_OP(NETF_SUB):
+ break;
+ default:
+ return (FALSE);
+ }
+ }
+ return (TRUE);
+}
+
+/*
+ * Set a filter for a network interface.
+ *
+ * We are given a naked send right for the rcv_port.
+ * If we are successful, we must consume that right.
+ */
+io_return_t
+net_set_filter(ifp, rcv_port, priority, filter, filter_count)
+ struct ifnet *ifp;
+ ipc_port_t rcv_port;
+ int priority;
+ filter_t *filter;
+ unsigned int filter_count;
+{
+ int filter_bytes;
+ bpf_insn_t match;
+ register net_rcv_port_t infp, my_infp;
+ net_rcv_port_t nextfp;
+ net_hash_header_t hhp;
+ register net_hash_entry_t entp, hash_entp;
+ net_hash_entry_t *head, nextentp;
+ queue_entry_t dead_infp, dead_entp;
+ int i;
+ int ret, is_new_infp;
+ io_return_t rval;
+
+ /*
+ * Check the filter syntax.
+ */
+
+ filter_bytes = CSPF_BYTES(filter_count);
+ match = (bpf_insn_t) 0;
+
+ if (filter_count > 0 && filter[0] == NETF_BPF) {
+ ret = bpf_validate((bpf_insn_t)filter, filter_bytes, &match);
+ if (!ret)
+ return (D_INVALID_OPERATION);
+ } else {
+ if (!parse_net_filter(filter, filter_count))
+ return (D_INVALID_OPERATION);
+ }
+
+ rval = D_SUCCESS; /* default return value */
+ dead_infp = dead_entp = 0;
+
+ if (match == (bpf_insn_t) 0) {
+ /*
+ * If there is no match instruction, we allocate
+ * a normal packet filter structure.
+ */
+ my_infp = (net_rcv_port_t) zalloc(net_rcv_zone);
+ my_infp->rcv_port = rcv_port;
+ is_new_infp = TRUE;
+ } else {
+ /*
+	 * If there is a match instruction, we assume there will be
+	 * multiple sessions with a common substructure and allocate
+ * a hash table to deal with them.
+ */
+ my_infp = 0;
+ hash_entp = (net_hash_entry_t) zalloc(net_hash_entry_zone);
+ is_new_infp = FALSE;
+ }
+
+ /*
+ * Look for an existing filter on the same reply port.
+ * Look for filters with dead ports (for GC).
+ * Look for a filter with the same code except KEY insns.
+ */
+
+ simple_lock(&ifp->if_rcv_port_list_lock);
+
+ FILTER_ITERATE(ifp, infp, nextfp)
+ {
+ if (infp->rcv_port == MACH_PORT_NULL) {
+ if (match != 0
+ && infp->priority == priority
+ && my_infp == 0
+ && (infp->filter_end - infp->filter) == filter_count
+ && bpf_eq((bpf_insn_t)infp->filter,
+ filter, filter_bytes))
+ {
+ my_infp = infp;
+ }
+
+ for (i = 0; i < NET_HASH_SIZE; i++) {
+ head = &((net_hash_header_t) infp)->table[i];
+ if (*head == 0)
+ continue;
+
+ /*
+ * Check each hash entry to make sure the
+ * destination port is still valid. Remove
+ * any invalid entries.
+ */
+ entp = *head;
+ do {
+ nextentp = (net_hash_entry_t) entp->he_next;
+
+ /* checked without
+ ip_lock(entp->rcv_port) */
+ if (entp->rcv_port == rcv_port
+ || !IP_VALID(entp->rcv_port)
+ || !ip_active(entp->rcv_port)) {
+
+ ret = hash_ent_remove (ifp,
+ (net_hash_header_t)infp,
+ (my_infp == infp),
+ head,
+ entp,
+ &dead_entp);
+ if (ret)
+ goto hash_loop_end;
+ }
+
+ entp = nextentp;
+			/* The while test checks *head, since hash_ent_remove
+			   might modify it.
+			 */
+ } while (*head != 0 && entp != *head);
+ }
+ hash_loop_end:
+ ;
+
+ } else if (infp->rcv_port == rcv_port
+ || !IP_VALID(infp->rcv_port)
+ || !ip_active(infp->rcv_port)) {
+ /* Remove the old filter from list */
+ remqueue(&ifp->if_rcv_port_list, (queue_entry_t)infp);
+ ENQUEUE_DEAD(dead_infp, infp);
+ }
+ }
+ FILTER_ITERATE_END
+
+ if (my_infp == 0) {
+ /* Allocate a dummy infp */
+ simple_lock(&net_hash_header_lock);
+ for (i = 0; i < N_NET_HASH; i++) {
+ if (filter_hash_header[i].n_keys == 0)
+ break;
+ }
+ if (i == N_NET_HASH) {
+ simple_unlock(&net_hash_header_lock);
+ simple_unlock(&ifp->if_rcv_port_list_lock);
+
+ ipc_port_release_send(rcv_port);
+ if (match != 0)
+ zfree (net_hash_entry_zone, (vm_offset_t)hash_entp);
+
+ rval = D_NO_MEMORY;
+ goto clean_and_return;
+ }
+
+ hhp = &filter_hash_header[i];
+ hhp->n_keys = match->jt;
+ simple_unlock(&net_hash_header_lock);
+
+ hhp->ref_count = 0;
+ for (i = 0; i < NET_HASH_SIZE; i++)
+ hhp->table[i] = 0;
+
+ my_infp = (net_rcv_port_t)hhp;
+ my_infp->rcv_port = MACH_PORT_NULL; /* indication of dummy */
+ is_new_infp = TRUE;
+ }
+
+ if (is_new_infp) {
+ my_infp->priority = priority;
+ my_infp->rcv_count = 0;
+
+ /* Copy filter program. */
+ bcopy ((vm_offset_t)filter, (vm_offset_t)my_infp->filter,
+ filter_bytes);
+ my_infp->filter_end =
+ (filter_t *)((char *)my_infp->filter + filter_bytes);
+
+ if (match == 0) {
+ my_infp->rcv_qlimit = net_add_q_info(rcv_port);
+ } else {
+ my_infp->rcv_qlimit = 0;
+ }
+
+ /* Insert my_infp according to priority */
+ queue_iterate(&ifp->if_rcv_port_list, infp, net_rcv_port_t, chain)
+ if (priority > infp->priority)
+ break;
+ enqueue_tail((queue_t)&infp->chain, (queue_entry_t)my_infp);
+ }
+
+ if (match != 0)
+ { /* Insert to hash list */
+ net_hash_entry_t *p;
+ int j;
+
+ hash_entp->rcv_port = rcv_port;
+ for (i = 0; i < match->jt; i++) /* match->jt is n_keys */
+ hash_entp->keys[i] = match[i+1].k;
+ p = &((net_hash_header_t)my_infp)->
+ table[bpf_hash(match->jt, hash_entp->keys)];
+
+ /* Not checking for the same key values */
+ if (*p == 0) {
+ queue_init ((queue_t) hash_entp);
+ *p = hash_entp;
+ } else {
+ enqueue_tail((queue_t)*p, hash_entp);
+ }
+
+ ((net_hash_header_t)my_infp)->ref_count++;
+ hash_entp->rcv_qlimit = net_add_q_info(rcv_port);
+
+ }
+
+ simple_unlock(&ifp->if_rcv_port_list_lock);
+
+clean_and_return:
+ /* No locks are held at this point. */
+
+ if (dead_infp != 0)
+ net_free_dead_infp(dead_infp);
+ if (dead_entp != 0)
+ net_free_dead_entp(dead_entp);
+
+ return (rval);
+}
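+
+/*
+ * Illustrative sketch (editor's addition, not part of the original
+ * source): how a user-level program typically reaches this routine,
+ * via the device interface's device_set_filter call; the in-kernel
+ * work is done by net_set_filter above.  "master_device", the device
+ * name "ne0" and the error handling (omitted) are assumptions.
+ */
+#if 0	/* user-level example, never compiled here */
+void example_open_and_filter(mach_port_t master_device)
+{
+	mach_port_t dev, pkt_port;
+	filter_t accept_all[2];
+
+	accept_all[0] = NETF_PUSHLIT | NETF_NOP;
+	accept_all[1] = TRUE;
+
+	device_open(master_device, D_READ, "ne0", &dev);
+	mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE,
+			   &pkt_port);
+	/* Matching packets arrive on pkt_port as net_rcv_msg messages
+	   (NET_RCV_MSG_ID), built by net_deliver above. */
+	device_set_filter(dev, pkt_port, MACH_MSG_TYPE_MAKE_SEND,
+			  0 /* priority */, accept_all, 2);
+}
+#endif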
+
+/*
+ * Other network operations
+ */
+io_return_t
+net_getstat(ifp, flavor, status, count)
+ struct ifnet *ifp;
+ dev_flavor_t flavor;
+ dev_status_t status; /* pointer to OUT array */
+ natural_t *count; /* OUT */
+{
+ switch (flavor) {
+ case NET_STATUS:
+ {
+ register struct net_status *ns = (struct net_status *)status;
+
+ if (*count < NET_STATUS_COUNT)
+ return (D_INVALID_OPERATION);
+
+ ns->min_packet_size = ifp->if_header_size;
+ ns->max_packet_size = ifp->if_header_size + ifp->if_mtu;
+ ns->header_format = ifp->if_header_format;
+ ns->header_size = ifp->if_header_size;
+ ns->address_size = ifp->if_address_size;
+ ns->flags = ifp->if_flags;
+ ns->mapped_size = 0;
+
+ *count = NET_STATUS_COUNT;
+ break;
+ }
+ case NET_ADDRESS:
+ {
+ register int addr_byte_count;
+ register int addr_int_count;
+ register int i;
+
+ addr_byte_count = ifp->if_address_size;
+ addr_int_count = (addr_byte_count + (sizeof(int)-1))
+ / sizeof(int);
+
+ if (*count < addr_int_count)
+ {
+/* XXX debug hack. */
+printf ("net_getstat: count: %d, addr_int_count: %d\n",
+ *count, addr_int_count);
+ return (D_INVALID_OPERATION);
+ }
+
+ bcopy((char *)ifp->if_address,
+ (char *)status,
+ (unsigned) addr_byte_count);
+ if (addr_byte_count < addr_int_count * sizeof(int))
+ bzero((char *)status + addr_byte_count,
+ (unsigned) (addr_int_count * sizeof(int)
+ - addr_byte_count));
+
+ for (i = 0; i < addr_int_count; i++) {
+ register int word;
+
+ word = status[i];
+ status[i] = htonl(word);
+ }
+ *count = addr_int_count;
+ break;
+ }
+ default:
+ return (D_INVALID_OPERATION);
+ }
+ return (D_SUCCESS);
+}
+
+io_return_t
+net_write(ifp, start, ior)
+ register struct ifnet *ifp;
+ int (*start)();
+ io_req_t ior;
+{
+ spl_t s;
+ kern_return_t rc;
+ boolean_t wait;
+
+ /*
+ * Reject the write if the interface is down.
+ */
+ if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING))
+ return (D_DEVICE_DOWN);
+
+ /*
+ * Reject the write if the packet is too large or too small.
+ */
+ if (ior->io_count < ifp->if_header_size ||
+ ior->io_count > ifp->if_header_size + ifp->if_mtu)
+ return (D_INVALID_SIZE);
+
+ /*
+ * Wire down the memory.
+ */
+
+ rc = device_write_get(ior, &wait);
+ if (rc != KERN_SUCCESS)
+ return (rc);
+
+ /*
+ * Network interfaces can't cope with VM continuations.
+ * If wait is set, just panic.
+ */
+ if (wait) {
+ panic("net_write: VM continuation");
+ }
+
+ /*
+ * Queue the packet on the output queue, and
+ * start the device.
+ */
+ s = splimp();
+ IF_ENQUEUE(&ifp->if_snd, ior);
+ (*start)(ifp->if_unit);
+ splx(s);
+
+ return (D_IO_QUEUED);
+}
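+
+/*
+ * Illustrative sketch (editor's addition, not part of the original
+ * source): a network driver's write entry point usually just
+ * delegates to net_write, passing its own start routine.  The names
+ * example_output, example_start and example_unit_to_ifnet are
+ * hypothetical.
+ */
+#if 0	/* example only, never compiled */
+io_return_t example_output(dev, ior)
+	dev_t dev;
+	io_req_t ior;
+{
+	struct ifnet *ifp = example_unit_to_ifnet(dev);	/* hypothetical lookup */
+
+	return (net_write(ifp, example_start, ior));
+}
+#endif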
+
+#ifdef FIPC
+/* This gets called by nefoutput for dev_ops->d_port_death ... */
+
+io_return_t
+net_fwrite(ifp, start, ior)
+ register struct ifnet *ifp;
+ int (*start)();
+ io_req_t ior;
+{
+ spl_t s;
+ kern_return_t rc;
+ boolean_t wait;
+
+ /*
+ * Reject the write if the interface is down.
+ */
+ if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING))
+ return (D_DEVICE_DOWN);
+
+ /*
+ * Reject the write if the packet is too large or too small.
+ */
+ if (ior->io_count < ifp->if_header_size ||
+ ior->io_count > ifp->if_header_size + ifp->if_mtu)
+ return (D_INVALID_SIZE);
+
+ /*
+ * DON'T Wire down the memory.
+ */
+#if 0
+ rc = device_write_get(ior, &wait);
+ if (rc != KERN_SUCCESS)
+ return (rc);
+#endif
+ /*
+ * Network interfaces can't cope with VM continuations.
+ * If wait is set, just panic.
+ */
+ /* I'll have to figure out who was setting wait...*/
+#if 0
+ if (wait) {
+ panic("net_write: VM continuation");
+ }
+#endif
+ /*
+ * Queue the packet on the output queue, and
+ * start the device.
+ */
+ s = splimp();
+ IF_ENQUEUE(&ifp->if_snd, ior);
+ (*start)(ifp->if_unit);
+ splx(s);
+
+ return (D_IO_QUEUED);
+}
+#endif /* FIPC */
+
+/*
+ * Initialize the whole package.
+ */
+void
+net_io_init()
+{
+ register vm_size_t size;
+
+ size = sizeof(struct net_rcv_port);
+ net_rcv_zone = zinit(size,
+ size * 1000,
+ PAGE_SIZE,
+ FALSE,
+ "net_rcv_port");
+
+ size = sizeof(struct net_hash_entry);
+ net_hash_entry_zone = zinit(size,
+ size * 100,
+ PAGE_SIZE,
+ FALSE,
+ "net_hash_entry");
+
+ size = ikm_plus_overhead(sizeof(struct net_rcv_msg));
+ net_kmsg_size = round_page(size);
+
+ /*
+ * net_kmsg_max caps the number of buffers
+ * we are willing to allocate. By default,
+ * we allow for net_queue_free_min plus
+ * the queue limit for each filter.
+ * (Added as the filters are added.)
+ */
+
+ simple_lock_init(&net_kmsg_total_lock);
+ if (net_kmsg_max == 0)
+ net_kmsg_max = net_queue_free_min;
+
+ simple_lock_init(&net_queue_free_lock);
+ ipc_kmsg_queue_init(&net_queue_free);
+
+ simple_lock_init(&net_queue_lock);
+ ipc_kmsg_queue_init(&net_queue_high);
+ ipc_kmsg_queue_init(&net_queue_low);
+
+ simple_lock_init(&net_hash_header_lock);
+}
+
+
+/* ======== BPF: Berkeley Packet Filter ======== */
+
+/*-
+ * Copyright (c) 1990-1991 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from the Stanford/CMU enet packet filter,
+ * (net/enet.c) distributed as part of 4.3BSD, and code contributed
+ * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
+ * Berkeley Laboratory.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)bpf.c 7.5 (Berkeley) 7/15/91
+ */
+
+#if defined(sparc) || defined(mips) || defined(ibm032) || defined(alpha)
+#define BPF_ALIGN
+#endif
+
+#ifndef BPF_ALIGN
+#define EXTRACT_SHORT(p) ((u_short)ntohs(*(u_short *)p))
+#define EXTRACT_LONG(p) (ntohl(*(u_long *)p))
+#else
+#define EXTRACT_SHORT(p)\
+ ((u_short)\
+ ((u_short)*((u_char *)p+0)<<8|\
+ (u_short)*((u_char *)p+1)<<0))
+#define EXTRACT_LONG(p)\
+ ((u_long)*((u_char *)p+0)<<24|\
+ (u_long)*((u_char *)p+1)<<16|\
+ (u_long)*((u_char *)p+2)<<8|\
+ (u_long)*((u_char *)p+3)<<0)
+#endif
+
+/*
+ * Execute the filter program starting at pc on the packet p
+ * wirelen is the length of the original packet
+ * buflen is the amount of data present
+ */
+
+int
+bpf_do_filter(infp, p, wirelen, header, hash_headpp, entpp)
+ net_rcv_port_t infp;
+ char * p; /* packet data */
+ unsigned int wirelen; /* data_count (in bytes) */
+ char * header;
+ net_hash_entry_t **hash_headpp, *entpp; /* out */
+{
+ register bpf_insn_t pc, pc_end;
+ register unsigned int buflen;
+
+ register unsigned long A, X;
+ register int k;
+ long mem[BPF_MEMWORDS];
+
+ pc = ((bpf_insn_t) infp->filter) + 1;
+ /* filter[0].code is BPF_BEGIN */
+ pc_end = (bpf_insn_t)infp->filter_end;
+ buflen = NET_RCV_MAX;
+ *entpp = 0; /* default */
+
+#ifdef lint
+ A = 0;
+ X = 0;
+#endif
+ for (; pc < pc_end; ++pc) {
+ switch (pc->code) {
+
+ default:
+#ifdef KERNEL
+ return 0;
+#else
+ abort();
+#endif
+ case BPF_RET|BPF_K:
+ if (infp->rcv_port == MACH_PORT_NULL &&
+ *entpp == 0) {
+ return 0;
+ }
+ return ((u_int)pc->k <= wirelen) ?
+ pc->k : wirelen;
+
+ case BPF_RET|BPF_A:
+ if (infp->rcv_port == MACH_PORT_NULL &&
+ *entpp == 0) {
+ return 0;
+ }
+ return ((u_int)A <= wirelen) ?
+ A : wirelen;
+
+ case BPF_RET|BPF_MATCH_IMM:
+ if (bpf_match ((net_hash_header_t)infp, pc->jt, mem,
+ hash_headpp, entpp)) {
+ return ((u_int)pc->k <= wirelen) ?
+ pc->k : wirelen;
+ }
+ return 0;
+
+ case BPF_LD|BPF_W|BPF_ABS:
+ k = pc->k;
+ if ((u_int)k + sizeof(long) <= buflen) {
+#ifdef BPF_ALIGN
+ if (((int)(p + k) & 3) != 0)
+ A = EXTRACT_LONG(&p[k]);
+ else
+#endif
+ A = ntohl(*(long *)(p + k));
+ continue;
+ }
+
+ k -= BPF_DLBASE;
+ if ((u_int)k + sizeof(long) <= NET_HDW_HDR_MAX) {
+#ifdef BPF_ALIGN
+ if (((int)(header + k) & 3) != 0)
+ A = EXTRACT_LONG(&header[k]);
+ else
+#endif
+ A = ntohl(*(long *)(header + k));
+ continue;
+ } else {
+ return 0;
+ }
+
+ case BPF_LD|BPF_H|BPF_ABS:
+ k = pc->k;
+ if ((u_int)k + sizeof(short) <= buflen) {
+ A = EXTRACT_SHORT(&p[k]);
+ continue;
+ }
+
+ k -= BPF_DLBASE;
+ if ((u_int)k + sizeof(short) <= NET_HDW_HDR_MAX) {
+ A = EXTRACT_SHORT(&header[k]);
+ continue;
+ } else {
+ return 0;
+ }
+
+ case BPF_LD|BPF_B|BPF_ABS:
+ k = pc->k;
+ if ((u_int)k < buflen) {
+ A = p[k];
+ continue;
+ }
+
+ k -= BPF_DLBASE;
+ if ((u_int)k < NET_HDW_HDR_MAX) {
+ A = header[k];
+ continue;
+ } else {
+ return 0;
+ }
+
+ case BPF_LD|BPF_W|BPF_LEN:
+ A = wirelen;
+ continue;
+
+ case BPF_LDX|BPF_W|BPF_LEN:
+ X = wirelen;
+ continue;
+
+ case BPF_LD|BPF_W|BPF_IND:
+ k = X + pc->k;
+ if (k + sizeof(long) > buflen)
+ return 0;
+#ifdef BPF_ALIGN
+ if (((int)(p + k) & 3) != 0)
+ A = EXTRACT_LONG(&p[k]);
+ else
+#endif
+ A = ntohl(*(long *)(p + k));
+ continue;
+
+ case BPF_LD|BPF_H|BPF_IND:
+ k = X + pc->k;
+ if (k + sizeof(short) > buflen)
+ return 0;
+ A = EXTRACT_SHORT(&p[k]);
+ continue;
+
+ case BPF_LD|BPF_B|BPF_IND:
+ k = X + pc->k;
+ if (k >= buflen)
+ return 0;
+ A = p[k];
+ continue;
+
+ case BPF_LDX|BPF_MSH|BPF_B:
+ k = pc->k;
+ if (k >= buflen)
+ return 0;
+ X = (p[pc->k] & 0xf) << 2;
+ continue;
+
+ case BPF_LD|BPF_IMM:
+ A = pc->k;
+ continue;
+
+ case BPF_LDX|BPF_IMM:
+ X = pc->k;
+ continue;
+
+ case BPF_LD|BPF_MEM:
+ A = mem[pc->k];
+ continue;
+
+ case BPF_LDX|BPF_MEM:
+ X = mem[pc->k];
+ continue;
+
+ case BPF_ST:
+ mem[pc->k] = A;
+ continue;
+
+ case BPF_STX:
+ mem[pc->k] = X;
+ continue;
+
+ case BPF_JMP|BPF_JA:
+ pc += pc->k;
+ continue;
+
+ case BPF_JMP|BPF_JGT|BPF_K:
+ pc += (A > pc->k) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_JMP|BPF_JGE|BPF_K:
+ pc += (A >= pc->k) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_JMP|BPF_JEQ|BPF_K:
+ pc += (A == pc->k) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_JMP|BPF_JSET|BPF_K:
+ pc += (A & pc->k) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_JMP|BPF_JGT|BPF_X:
+ pc += (A > X) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_JMP|BPF_JGE|BPF_X:
+ pc += (A >= X) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_JMP|BPF_JEQ|BPF_X:
+ pc += (A == X) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_JMP|BPF_JSET|BPF_X:
+ pc += (A & X) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_ALU|BPF_ADD|BPF_X:
+ A += X;
+ continue;
+
+ case BPF_ALU|BPF_SUB|BPF_X:
+ A -= X;
+ continue;
+
+ case BPF_ALU|BPF_MUL|BPF_X:
+ A *= X;
+ continue;
+
+ case BPF_ALU|BPF_DIV|BPF_X:
+ if (X == 0)
+ return 0;
+ A /= X;
+ continue;
+
+ case BPF_ALU|BPF_AND|BPF_X:
+ A &= X;
+ continue;
+
+ case BPF_ALU|BPF_OR|BPF_X:
+ A |= X;
+ continue;
+
+ case BPF_ALU|BPF_LSH|BPF_X:
+ A <<= X;
+ continue;
+
+ case BPF_ALU|BPF_RSH|BPF_X:
+ A >>= X;
+ continue;
+
+ case BPF_ALU|BPF_ADD|BPF_K:
+ A += pc->k;
+ continue;
+
+ case BPF_ALU|BPF_SUB|BPF_K:
+ A -= pc->k;
+ continue;
+
+ case BPF_ALU|BPF_MUL|BPF_K:
+ A *= pc->k;
+ continue;
+
+ case BPF_ALU|BPF_DIV|BPF_K:
+ A /= pc->k;
+ continue;
+
+ case BPF_ALU|BPF_AND|BPF_K:
+ A &= pc->k;
+ continue;
+
+ case BPF_ALU|BPF_OR|BPF_K:
+ A |= pc->k;
+ continue;
+
+ case BPF_ALU|BPF_LSH|BPF_K:
+ A <<= pc->k;
+ continue;
+
+ case BPF_ALU|BPF_RSH|BPF_K:
+ A >>= pc->k;
+ continue;
+
+ case BPF_ALU|BPF_NEG:
+ A = -A;
+ continue;
+
+ case BPF_MISC|BPF_TAX:
+ X = A;
+ continue;
+
+ case BPF_MISC|BPF_TXA:
+ A = X;
+ continue;
+ }
+ }
+
+ return 0;
+}
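+
+/*
+ * Illustrative sketch (editor's addition, not part of the original
+ * source): a small BPF program as the interpreter above consumes it,
+ * assuming the usual (code, jt, jf, k) layout of struct bpf_insn.
+ * BPF_DLBASE-relative offsets address the link-level header area, as
+ * the BPF_ABS loads above show; when installed through net_set_filter
+ * the program is preceded by one instruction whose code is BPF_BEGIN
+ * (the NETF_BPF marker), which the interpreter skips.  Offset 12 and
+ * the value 0x0800 (IP) are example choices.
+ */
+#if 0	/* example only, never compiled */
+	struct bpf_insn ip_only[] = {
+		/* A <- ethertype (2 bytes at offset 12 of the ether header) */
+		{ BPF_LD|BPF_H|BPF_ABS,  0, 0, BPF_DLBASE + 12 },
+		/* if (A == 0x0800) accept, else reject */
+		{ BPF_JMP|BPF_JEQ|BPF_K, 0, 1, 0x0800 },
+		{ BPF_RET|BPF_K,         0, 0, 0xffffffff },	/* whole packet */
+		{ BPF_RET|BPF_K,         0, 0, 0 }		/* reject */
+	};
+#endif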
+
+/*
+ * Return 1 if 'f' is a valid filter program without a MATCH
+ * instruction, 2 if it is a valid filter program with a MATCH
+ * instruction, and 0 otherwise.
+ * The constraints are that each jump be forward and within the
+ * code block, and that the program terminate with either an
+ * accept or a reject.
+ *
+ * The kernel needs to be able to verify an application's filter code.
+ * Otherwise, a bogus program could easily crash the system.
+ */
+int
+bpf_validate(f, bytes, match)
+ bpf_insn_t f;
+ int bytes;
+ bpf_insn_t *match;
+{
+ register int i, j, len;
+ register bpf_insn_t p;
+
+ len = BPF_BYTES2LEN(bytes);
+ /* f[0].code is already checked to be BPF_BEGIN. So skip f[0]. */
+
+ for (i = 1; i < len; ++i) {
+ /*
+	 * Check that jumps are forward, and within
+ * the code block.
+ */
+ p = &f[i];
+ if (BPF_CLASS(p->code) == BPF_JMP) {
+ register int from = i + 1;
+
+ if (BPF_OP(p->code) == BPF_JA) {
+ if (from + p->k >= len)
+ return 0;
+ }
+ else if (from + p->jt >= len || from + p->jf >= len)
+ return 0;
+ }
+ /*
+ * Check that memory operations use valid addresses.
+ */
+ if ((BPF_CLASS(p->code) == BPF_ST ||
+ (BPF_CLASS(p->code) == BPF_LD &&
+ (p->code & 0xe0) == BPF_MEM)) &&
+ (p->k >= BPF_MEMWORDS || p->k < 0))
+ return 0;
+ /*
+ * Check for constant division by 0.
+ */
+ if (p->code == (BPF_ALU|BPF_DIV|BPF_K) && p->k == 0)
+ return 0;
+ /*
+ * Check for match instruction.
+ * Only one match instruction per filter is allowed.
+ */
+ if (p->code == (BPF_RET|BPF_MATCH_IMM)) {
+ if (*match != 0 ||
+ p->jt == 0 ||
+ p->jt > N_NET_HASH_KEYS)
+ return 0;
+ i += p->jt; /* skip keys */
+ if (i + 1 > len)
+ return 0;
+
+ for (j = 1; j <= p->jt; j++) {
+ if (p[j].code != (BPF_MISC|BPF_KEY))
+ return 0;
+ }
+
+ *match = p;
+ }
+ }
+ if (BPF_CLASS(f[len - 1].code) == BPF_RET)
+ return ((*match == 0) ? 1 : 2);
+ else
+ return 0;
+}
+
+int
+bpf_eq (f1, f2, bytes)
+ register bpf_insn_t f1, f2;
+ register int bytes;
+{
+ register int count;
+
+ count = BPF_BYTES2LEN(bytes);
+ for (; count--; f1++, f2++) {
+ if (!BPF_INSN_EQ(f1, f2)) {
+ if ( f1->code == (BPF_MISC|BPF_KEY) &&
+ f2->code == (BPF_MISC|BPF_KEY) )
+ continue;
+ return FALSE;
+ }
+ };
+ return TRUE;
+}
+
+unsigned int
+bpf_hash (n, keys)
+ register int n;
+ register unsigned int *keys;
+{
+ register unsigned int hval = 0;
+
+ while (n--) {
+ hval += *keys++;
+ }
+ return (hval % NET_HASH_SIZE);
+}
+
+
+int
+bpf_match (hash, n_keys, keys, hash_headpp, entpp)
+ net_hash_header_t hash;
+ register int n_keys;
+ register unsigned int *keys;
+ net_hash_entry_t **hash_headpp, *entpp;
+{
+ register net_hash_entry_t head, entp;
+ register int i;
+
+ if (n_keys != hash->n_keys)
+ return FALSE;
+
+ *hash_headpp = &hash->table[bpf_hash(n_keys, keys)];
+ head = **hash_headpp;
+
+ if (head == 0)
+ return FALSE;
+
+ HASH_ITERATE (head, entp)
+ {
+ for (i = 0; i < n_keys; i++) {
+ if (keys[i] != entp->keys[i])
+ break;
+ }
+ if (i == n_keys) {
+ *entpp = entp;
+ return TRUE;
+ }
+ }
+ HASH_ITERATE_END (head, entp)
+ return FALSE;
+}
+
+
+/*
+ * Removes a hash entry (ENTP) from its queue (HEAD).
+ * If the reference count of the filter (HP) drops to zero and it is
+ * not USED, HP is removed from ifp->if_rcv_port_list and freed.
+ */
+
+int
+hash_ent_remove (ifp, hp, used, head, entp, dead_p)
+ struct ifnet *ifp;
+ net_hash_header_t hp;
+ int used;
+ net_hash_entry_t *head, entp;
+ queue_entry_t *dead_p;
+{
+ hp->ref_count--;
+
+ if (*head == entp) {
+
+ if (queue_empty((queue_t) entp)) {
+ *head = 0;
+ ENQUEUE_DEAD(*dead_p, entp);
+ if (hp->ref_count == 0 && !used) {
+ remqueue((queue_t) &ifp->if_rcv_port_list,
+ (queue_entry_t)hp);
+ hp->n_keys = 0;
+ return TRUE;
+ }
+ return FALSE;
+ } else {
+ *head = (net_hash_entry_t)queue_next((queue_t) entp);
+ }
+ }
+
+ remqueue((queue_t)*head, (queue_entry_t)entp);
+ ENQUEUE_DEAD(*dead_p, entp);
+ return FALSE;
+}
+
+int
+net_add_q_info (rcv_port)
+ ipc_port_t rcv_port;
+{
+ mach_port_msgcount_t qlimit = 0;
+
+ /*
+ * We use a new port, so increase net_queue_free_min
+ * and net_kmsg_max to allow for more queued messages.
+ */
+
+ if (IP_VALID(rcv_port)) {
+ ip_lock(rcv_port);
+ if (ip_active(rcv_port))
+ qlimit = rcv_port->ip_qlimit;
+ ip_unlock(rcv_port);
+ }
+
+ simple_lock(&net_kmsg_total_lock);
+ net_queue_free_min++;
+ net_kmsg_max += qlimit + 1;
+ simple_unlock(&net_kmsg_total_lock);
+
+ return (int)qlimit;
+}
+
+void
+net_del_q_info (qlimit)
+ int qlimit;
+{
+ simple_lock(&net_kmsg_total_lock);
+ net_queue_free_min--;
+ net_kmsg_max -= qlimit + 1;
+ simple_unlock(&net_kmsg_total_lock);
+}
+
+
+/*
+ * net_free_dead_infp (dead_infp)
+ * queue_entry_t dead_infp; list of dead net_rcv_port_t.
+ *
+ * Deallocates dead net_rcv_port_t.
+ * No locks should be held when called.
+ */
+void
+net_free_dead_infp (dead_infp)
+ queue_entry_t dead_infp;
+{
+ register net_rcv_port_t infp, nextfp;
+
+ for (infp = (net_rcv_port_t) dead_infp; infp != 0; infp = nextfp)
+ {
+ nextfp = (net_rcv_port_t) queue_next(&infp->chain);
+ ipc_port_release_send(infp->rcv_port);
+ net_del_q_info(infp->rcv_qlimit);
+ zfree(net_rcv_zone, (vm_offset_t) infp);
+ }
+}
+
+/*
+ * net_free_dead_entp (dead_entp)
+ * queue_entry_t dead_entp; list of dead net_hash_entry_t.
+ *
+ * Deallocates dead net_hash_entry_t.
+ * No locks should be held when called.
+ */
+void
+net_free_dead_entp (dead_entp)
+ queue_entry_t dead_entp;
+{
+ register net_hash_entry_t entp, nextentp;
+
+ for (entp = (net_hash_entry_t)dead_entp; entp != 0; entp = nextentp)
+ {
+ nextentp = (net_hash_entry_t) queue_next(&entp->chain);
+
+ ipc_port_release_send(entp->rcv_port);
+ net_del_q_info(entp->rcv_qlimit);
+ zfree(net_hash_entry_zone, (vm_offset_t) entp);
+ }
+}
+