12 files changed, 330 insertions, 13 deletions
diff --git a/linux/dev/arch/i386/kernel/irq.c b/linux/dev/arch/i386/kernel/irq.c
index e9dfe6a..4bed353 100644
--- a/linux/dev/arch/i386/kernel/irq.c
+++ b/linux/dev/arch/i386/kernel/irq.c
@@ -45,12 +45,19 @@
 #include <asm/bitops.h>
 #include <asm/irq.h>
 #include <asm/io.h>
+#include <asm/hardirq.h>
 
 extern int linux_timer_intr (void);
 extern spl_t splhigh (void);
 extern spl_t spl0 (void);
 extern void form_pic_mask (void);
 
+#if 0
+/* XXX: This is how Linux 2.2 does it. GNU Mach currently uses intr_count; it should be converted to local_{bh,irq}_count (updated through hardirq_enter/exit) for SMP support. */
+unsigned int local_bh_count[NR_CPUS];
+unsigned int local_irq_count[NR_CPUS];
+#endif
+
 /*
  * XXX Move this into more suitable place...
  * Set if the machine has an EISA bus.
@@ -407,6 +414,277 @@ reserve_mach_irqs (void)
     }
 }
 
+#ifdef __SMP__
+unsigned char global_irq_holder = NO_PROC_ID;	/* CPU id stored by get_irqlock() once it owns global_irq_lock */
+unsigned volatile int global_irq_lock;	/* bit 0 is the lock bit, taken with test_and_set_bit(0, ...) */
+atomic_t global_irq_count;	/* read by wait_on_irq() and synchronize_irq(); not modified in this hunk */
+
+atomic_t global_bh_count;	/* read by wait_on_bh(), wait_on_irq() and synchronize_bh(); not modified in this hunk */
+atomic_t global_bh_lock;	/* not referenced by the code added in this hunk */
+
+/*
+ * "global_cli()" is a special case, in that it can hold the
+ * interrupts disabled for a longish time, and also because
+ * we may be doing TLB invalidates when holding the global
+ * IRQ lock for historical reasons. Thus we may need to check
+ * SMP invalidate events specially by hand here (but not in
+ * any normal spinlocks)
+ */
+#if 0
+/* XXX: check how Mach handles this */
+static inline void check_smp_invalidate(int cpu)	/* compiled out by the surrounding #if 0 */
+{
+	if (test_bit(cpu, &smp_invalidate_needed)) {	/* this CPU's bit is set in smp_invalidate_needed */
+		clear_bit(cpu, &smp_invalidate_needed);	/* clear the bit before acting on it */
+		local_flush_tlb();
+	}
+}
+#endif
+
+static void show(char * str)	/* debug dump tagged with str: counters, then 40 words read from the stack */
+{
+	int i;
+	unsigned long *stack;
+	int cpu = smp_processor_id();
+	extern char *get_options(char *str, int *ints);	/* referenced only by the commented-out filter below */
+
+	printk("\n%s, CPU %d:\n", str, cpu);
+	printk("irq:  %d [%d %d]\n",
+		atomic_read(&global_irq_count), local_irq_count[0], local_irq_count[1]);	/* NOTE(review): indexes 0 and 1 are hard-coded; assumes NR_CPUS >= 2 */
+	printk("bh:   %d [%d %d]\n",
+		atomic_read(&global_bh_count), local_bh_count[0], local_bh_count[1]);
+	stack = (unsigned long *) &stack;	/* start at the address of this local variable */
+	for (i = 40; i ; i--) {
+		unsigned long x = *++stack;	/* advance one word, then read it */
+		//if (x > (unsigned long) &get_options && x < (unsigned long) &vsprintf) {
+			printk("<[%08lx]> ", x);
+		//}
+	}
+}
+	
+#define MAXCOUNT 100000000
+
+static inline void wait_on_bh(void)	/* spin until global_bh_count reads zero */
+{
+	int count = MAXCOUNT;
+	do {
+		if (!--count) {
+			show("wait_on_bh");	/* MAXCOUNT iterations without progress: dump state */
+			count = ~0;	/* ~0 is -1 for an int: !--count will not fire again unless count wraps */
+		}
+		/* nothing .. wait for the other bh's to go away */
+	} while (atomic_read(&global_bh_count) != 0);
+}
+
+/*
+ * I had a lockup scenario where a tight loop doing
+ * spin_unlock()/spin_lock() on CPU#1 was racing with
+ * spin_lock() on CPU#0. CPU#0 should have noticed spin_unlock(), but
+ * apparently the spin_unlock() information did not make it
+ * through to CPU#0 ... nasty, is this by design, do we have to limit
+ * 'memory update oscillation frequency' artificially like here?
+ *
+ * Such 'high frequency update' races can be avoided by careful design, but
+ * some of our major constructs like spinlocks use similar techniques,
+ * it would be nice to clarify this issue. Set this define to 0 if you
+ * want to check whether your system freezes.  I suspect the delay done
+ * by SYNC_OTHER_CORES() is in correlation with 'snooping latency', but
+ * i thought that such things are guaranteed by design, since we use
+ * the 'LOCK' prefix.
+ */
+#define SUSPECTED_CPU_OR_CHIPSET_BUG_WORKAROUND 1
+
+#if SUSPECTED_CPU_OR_CHIPSET_BUG_WORKAROUND
+# define SYNC_OTHER_CORES(x) udelay((x)+1)	/* delay of argument + 1; wait_on_irq() passes the CPU number */
+#else
+/*
+ * We have to allow irqs to arrive between __sti and __cli
+ */
+# define SYNC_OTHER_CORES(x) __asm__ __volatile__ ("nop")
+#endif
+
+static inline void wait_on_irq(int cpu)	/* called by get_irqlock() with bit 0 of global_irq_lock already set */
+{
+	int count = MAXCOUNT;
+
+	for (;;) {
+
+		/*
+		 * Wait until all interrupts are gone. Wait
+		 * for bottom half handlers unless we're
+		 * already executing in one..
+		 */
+		if (!atomic_read(&global_irq_count)) {
+			if (local_bh_count[cpu] || !atomic_read(&global_bh_count))
+				break;	/* quiescent: return with the lock bit still set */
+		}
+
+		/* Duh, we have to loop. Release the lock to avoid deadlocks */
+		clear_bit(0,&global_irq_lock);
+
+		for (;;) {
+			if (!--count) {
+				show("wait_on_irq");	/* MAXCOUNT iterations without progress: dump state */
+				count = ~0;
+			}
+			__sti();	/* window between __sti and __cli lets pending irqs arrive */
+			SYNC_OTHER_CORES(cpu);
+			__cli();
+			//check_smp_invalidate(cpu);
+			if (atomic_read(&global_irq_count))
+				continue;
+			if (global_irq_lock)
+				continue;
+			if (!local_bh_count[cpu] && atomic_read(&global_bh_count))
+				continue;
+			if (!test_and_set_bit(0,&global_irq_lock))
+				break;	/* lock bit re-acquired; the outer loop re-checks the counters */
+		}
+	}
+}
+
+/*
+ * This is called when we want to synchronize with
+ * bottom half handlers. We need to wait until
+ * no other CPU is executing any bottom half handler.
+ *
+ * Don't wait if we're already running in an interrupt
+ * context or are inside a bh handler. 
+ */
+void synchronize_bh(void)
+{
+	if (atomic_read(&global_bh_count) && !in_interrupt())	/* no wait when in_interrupt() is true */
+		wait_on_bh();
+}
+
+/*
+ * This is called when we want to synchronize with
+ * interrupts. We may for example tell a device to
+ * stop sending interrupts: but to make sure there
+ * are no interrupts that are executing on another
+ * CPU we need to call this function.
+ */
+void synchronize_irq(void)
+{
+	if (atomic_read(&global_irq_count)) {	/* nothing to do when the global count reads zero */
+		/* Stupid approach */
+		cli();	/* NOTE(review): presumably maps to __global_cli() on SMP, which waits in wait_on_irq() - confirm */
+		sti();
+	}
+}
+
+static inline void get_irqlock(int cpu)	/* take bit 0 of global_irq_lock for cpu, then wait in wait_on_irq() */
+{
+	if (test_and_set_bit(0,&global_irq_lock)) {
+		/* do we already hold the lock? */
+		if ((unsigned char) cpu == global_irq_holder)
+			return;
+		/* Uhhuh.. Somebody else got it. Wait.. */
+		do {
+			do {
+				//check_smp_invalidate(cpu);
+			} while (test_bit(0,&global_irq_lock));	/* spin on plain reads before retrying the locked set */
+		} while (test_and_set_bit(0,&global_irq_lock));		
+	}
+	/* 
+	 * We also have to make sure that nobody else is running
+	 * in an interrupt context. 
+	 */
+	wait_on_irq(cpu);
+
+	/*
+	 * Ok, finally..
+	 */
+	global_irq_holder = cpu;
+}
+
+#define EFLAGS_IF_SHIFT 9
+
+/*
+ * A global "cli()" while in an interrupt context
+ * turns into just a local cli(). Interrupts
+ * should use spinlocks for the (very unlikely)
+ * case that they ever want to protect against
+ * each other.
+ *
+ * If we already have local interrupts disabled,
+ * this will not turn a local disable into a
+ * global one (problems with spinlocks: this makes
+ * save_flags+cli+sti usable inside a spinlock).
+ */
+void __global_cli(void)
+{
+	unsigned int flags;
+
+	__save_flags(flags);
+	if (flags & (1 << EFLAGS_IF_SHIFT)) {	/* act only if bit EFLAGS_IF_SHIFT was set in the saved flags */
+		int cpu = smp_processor_id();
+		__cli();
+		if (!local_irq_count[cpu])	/* zero count on this CPU: take the global lock too */
+			get_irqlock(cpu);
+	}
+}
+
+void __global_sti(void)
+{
+	int cpu = smp_processor_id();
+
+	if (!local_irq_count[cpu])	/* same condition under which __global_cli() calls get_irqlock() */
+		release_irqlock(cpu);	/* defined outside this hunk; presumably drops global_irq_lock if cpu holds it - confirm */
+	__sti();
+}
+
+/*
+ * SMP flags value to restore to:
+ * 0 - global cli
+ * 1 - global sti
+ * 2 - local cli
+ * 3 - local sti
+ */
+unsigned long __global_save_flags(void)	/* returns a code in the range 0-3 */
+{
+	int retval;
+	int local_enabled;
+	unsigned long flags;
+
+	__save_flags(flags);
+	local_enabled = (flags >> EFLAGS_IF_SHIFT) & 1;	/* 1 if bit EFLAGS_IF_SHIFT is set, else 0 */
+	/* default to local */
+	retval = 2 + local_enabled;
+
+	/* check for global flags if we're not in an interrupt */
+	if (!local_irq_count[smp_processor_id()]) {
+		if (local_enabled)
+			retval = 1;
+		if (global_irq_holder == (unsigned char) smp_processor_id())
+			retval = 0;	/* this CPU is the recorded lock holder: overrides the value chosen above */
+	}
+	return retval;
+}
+
+void __global_restore_flags(unsigned long flags)	/* flags: a 0-3 code as produced by __global_save_flags() */
+{
+	switch (flags) {
+	case 0:
+		__global_cli();
+		break;
+	case 1:
+		__global_sti();
+		break;
+	case 2:
+		__cli();
+		break;
+	case 3:
+		__sti();
+		break;
+	default:	/* any other value is only reported; interrupt state is left unchanged */
+		printk("global_restore_flags: %08lx (%08lx)\n",
+			flags, (&flags)[-1]);	/* NOTE(review): (&flags)[-1] presumably reads the caller's return address on i386 - confirm */
+	}
+}
+
+#endif
+
 static int (*old_clock_handler) ();
 static int old_clock_pri;
 
diff --git a/linux/dev/include/asm-i386/smp.h b/linux/dev/include/asm-i386/smp.h
index 96423ae..fabe01d 100644
--- a/linux/dev/include/asm-i386/smp.h
+++ b/linux/dev/include/asm-i386/smp.h
@@ -1,4 +1,8 @@
 #ifndef _I386_SMP_H
 #define _I386_SMP_H
 
+#include <machine/cpu_number.h>
+
+#define smp_processor_id() cpu_number()	/* Linux name mapped onto cpu_number(); presumably declared by <machine/cpu_number.h> */
+
 #endif /* _I386_SMP_H */
diff --git a/linux/src/drivers/net/3c515.c b/linux/src/drivers/net/3c515.c
index a283bab..fd6ec50 100644
--- a/linux/src/drivers/net/3c515.c
+++ b/linux/src/drivers/net/3c515.c
@@ -85,7 +85,7 @@ static int max_interrupt_work = 20;
 #define IRQ(irq, dev_id, pt_regs) (irq, dev_id, pt_regs)
 
 #if (LINUX_VERSION_CODE < 0x20123)
-#define test_and_set_bit(val, addr) set_bit(val, addr)
+//#define test_and_set_bit(val, addr) set_bit(val, addr)
 #elif defined(MODULE)
 MODULE_AUTHOR("Donald Becker <becker@cesdis.gsfc.nasa.gov>");
 MODULE_DESCRIPTION("3Com 3c515 Corkscrew driver");
diff --git a/linux/src/drivers/net/de4x5.c b/linux/src/drivers/net/de4x5.c
index b1ab417..114f6a7 100644
--- a/linux/src/drivers/net/de4x5.c
+++ b/linux/src/drivers/net/de4x5.c
@@ -445,7 +445,7 @@ static const char *version = "de4x5.c:V0.5351 1998/10/4 davies@maniac.ultranet.c
 #include <linux/version.h>
 #if	LINUX_VERSION_CODE < LinuxVersionCode(2,1,0)
 #  define __initfunc(__arginit) __arginit
-#  define test_and_set_bit      set_bit
+//#  define test_and_set_bit      set_bit
 #  define net_device_stats      enet_statistics
 #  define copy_to_user(a,b,c)   memcpy_tofs(a,b,c)
 #  define copy_from_user(a,b,c) memcpy_fromfs(a,b,c)
diff --git a/linux/src/drivers/net/eth16i.c b/linux/src/drivers/net/eth16i.c
index 903b3ec..244c3e7 100644
--- a/linux/src/drivers/net/eth16i.c
+++ b/linux/src/drivers/net/eth16i.c
@@ -175,7 +175,7 @@ static char *version =
 #endif
 
 #if LINUX_VERSION_CODE < 0x20138
-#define test_and_set_bit(val,addr) set_bit(val,addr)
+//#define test_and_set_bit(val,addr) set_bit(val,addr)
 #endif
 
 #if LINUX_VERSION_CODE < 0x020100
diff --git a/linux/src/drivers/net/kern_compat.h b/linux/src/drivers/net/kern_compat.h
index 75c34b0..39e1934 100644
--- a/linux/src/drivers/net/kern_compat.h
+++ b/linux/src/drivers/net/kern_compat.h
@@ -2,7 +2,7 @@
 #define _KERN_COMPAT_H
 /* kern_compat.h: Linux PCI network adapter backward compatibility code. */
 /*
-	$Revision: 1.1.2.1 $ $Date: 2006/01/22 15:54:41 $
+	$Revision: 1.1.2.2 $ $Date: 2007/08/04 21:02:21 $
 
 	Kernel compatibility defines.
 	This file provides macros to mask the difference between kernel versions.
@@ -76,7 +76,7 @@ __attribute__((section(".modinfo"))) =		\
 */
 #if LINUX_VERSION_CODE < 0x20123
 #define hard_smp_processor_id() smp_processor_id()
-#define test_and_set_bit(val, addr) set_bit(val, addr)
+//#define test_and_set_bit(val, addr) set_bit(val, addr)
 #define cpu_to_le16(val) (val)
 #define cpu_to_le32(val) (val)
 #define le16_to_cpu(val) (val)
diff --git a/linux/src/drivers/net/pcnet32.c b/linux/src/drivers/net/pcnet32.c
index acc5ce7..02e7098 100644
--- a/linux/src/drivers/net/pcnet32.c
+++ b/linux/src/drivers/net/pcnet32.c
@@ -105,7 +105,7 @@ static int pcnet32_debug = 1;
 #define le32_to_cpu(val)  (val)
 #endif
 #if (LINUX_VERSION_CODE < 0x20123)
-#define test_and_set_bit(val, addr) set_bit(val, addr)
+//#define test_and_set_bit(val, addr) set_bit(val, addr)
 #endif
 
 #define TX_RING_SIZE			(1 << (PCNET_LOG_TX_BUFFERS))
diff --git a/linux/src/include/asm-i386/atomic.h b/linux/src/include/asm-i386/atomic.h
index 1b9d99f..7e5dd06 100644
--- a/linux/src/include/asm-i386/atomic.h
+++ b/linux/src/include/asm-i386/atomic.h
@@ -21,6 +21,8 @@
 
 typedef int atomic_t;
 
+#define atomic_read(v) (*(volatile atomic_t *)(v))	/* volatile: each use is a real load, so spin loops polling a counter see updates */
+
 static __inline__ void atomic_add(atomic_t i, atomic_t *v)
 {
 	__asm__ __volatile__(
diff --git a/linux/src/include/asm-i386/bitops.h b/linux/src/include/asm-i386/bitops.h
index d3ed1fb..fc4cf19 100644
--- a/linux/src/include/asm-i386/bitops.h
+++ b/linux/src/include/asm-i386/bitops.h
@@ -61,6 +61,40 @@ extern __inline__ int change_bit(int nr, SMPVOL void * addr)
 	return oldbit;
 }
 
+extern __inline__ int test_and_set_bit(int nr, volatile void * addr)	/* set bit nr; returns nonzero iff it was already set */
+{
+	int oldbit;
+
+	__asm__ __volatile__( LOCK_PREFIX
+		"btsl %2,%1\n\tsbbl %0,%0"	/* bts leaves the old bit in CF; sbb turns CF into 0 or -1 */
+		:"=r" (oldbit),"=m" (ADDR)	/* NOTE(review): the word is read and written but declared output-only, with no memory clobber */
+		:"Ir" (nr));
+	return oldbit;
+}
+
+extern __inline__ int test_and_clear_bit(int nr, volatile void * addr)	/* clear bit nr; returns nonzero iff it was set */
+{
+	int oldbit;
+
+	__asm__ __volatile__( LOCK_PREFIX
+		"btrl %2,%1\n\tsbbl %0,%0"	/* btr leaves the old bit in CF; sbb turns CF into 0 or -1 */
+		:"=r" (oldbit),"=m" (ADDR)	/* NOTE(review): the word is read and written but declared output-only, with no memory clobber */
+		:"Ir" (nr));
+	return oldbit;
+}
+
+extern __inline__ int test_and_change_bit(int nr, volatile void * addr)	/* toggle bit nr; returns nonzero iff it was set before */
+{
+	int oldbit;
+
+	__asm__ __volatile__( LOCK_PREFIX
+		"btcl %2,%1\n\tsbbl %0,%0"	/* btc leaves the old bit in CF; sbb turns CF into 0 or -1 */
+		:"=r" (oldbit),"=m" (ADDR)	/* NOTE(review): the word is read and written but declared output-only, with no memory clobber */
+		:"Ir" (nr));
+	return oldbit;
+}
+
+
 /*
  * This routine doesn't need to be atomic.
  */
diff --git a/linux/src/include/asm-i386/hardirq.h b/linux/src/include/asm-i386/hardirq.h
index 5339613..10dae41 100644
--- a/linux/src/include/asm-i386/hardirq.h
+++ b/linux/src/include/asm-i386/hardirq.h
@@ -4,6 +4,7 @@
 #include <linux/tasks.h>
 
 extern unsigned int local_irq_count[NR_CPUS];
+extern unsigned int local_bh_count[NR_CPUS];
 
 /*
  * Are we in an interrupt context? Either doing bottom half
diff --git a/linux/src/include/linux/compatmac.h b/linux/src/include/linux/compatmac.h
index 5f9175d..b9a4112 100644
--- a/linux/src/include/linux/compatmac.h
+++ b/linux/src/include/linux/compatmac.h
@@ -111,8 +111,8 @@ static inline void *ioremap(unsigned long base, long length)
 #define time_after(t1,t2)            (((long)t1-t2) > 0)
 
 
-#define test_and_set_bit(nr, addr)   set_bit(nr, addr)
-#define test_and_clear_bit(nr, addr) clear_bit(nr, addr)
+//#define test_and_set_bit(nr, addr)   set_bit(nr, addr)
+//#define test_and_clear_bit(nr, addr) clear_bit(nr, addr)
 
 /* Not yet implemented on 2.0 */
 #define ASYNC_SPD_SHI  -1
diff --git a/linux/src/include/linux/tasks.h b/linux/src/include/linux/tasks.h
index 4540e34..466560e 100644
--- a/linux/src/include/linux/tasks.h
+++ b/linux/src/include/linux/tasks.h
@@ -5,14 +5,12 @@
  * This is the maximum nr of tasks - change it if you need to
  */
  
-#ifdef __SMP__
-#define NR_CPUS	32		/* Max processors that can be running in SMP */
-#else
-#define NR_CPUS 1
-#endif
+#define NR_CPUS	NCPUS		/* Max processors that can be running in SMP */
 
 #define NR_TASKS	512
 
+#define NO_PROC_ID -1	/* "no CPU" marker; becomes 255 when stored in unsigned char global_irq_holder, so do not compare that variable against this macro directly */
+
 #define MAX_TASKS_PER_USER (NR_TASKS/2)
 #define MIN_TASKS_LEFT_FOR_ROOT 4