author    | Roland McGrath <roland@gnu.org> | 2000-02-04 03:21:18 +0000
committer | Roland McGrath <roland@gnu.org> | 2000-02-04 03:21:18 +0000
commit    | 9fd51e9b0ad33a89a83fdbbb66bd20d85f7893fb (patch)
tree      | 8845b79f170028cb4380045c50277bbf075b5b7d | /pfinet/linux-src/arch
Import of Linux 2.2.12 subset (ipv4 stack and related)
Diffstat (limited to 'pfinet/linux-src/arch')
-rw-r--r-- | pfinet/linux-src/arch/alpha/lib/checksum.c    | 169
-rw-r--r-- | pfinet/linux-src/arch/arm/lib/checksum.S      | 730
-rw-r--r-- | pfinet/linux-src/arch/i386/lib/checksum.S     | 447
-rw-r--r-- | pfinet/linux-src/arch/i386/lib/old-checksum.c |  19
-rw-r--r-- | pfinet/linux-src/arch/m68k/lib/checksum.c     | 420
-rw-r--r-- | pfinet/linux-src/arch/ppc/lib/checksum.S      | 194
-rw-r--r-- | pfinet/linux-src/arch/sparc/lib/checksum.S    | 581
-rw-r--r-- | pfinet/linux-src/arch/sparc64/lib/checksum.S  | 278
8 files changed, 2838 insertions, 0 deletions
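
All eight files below implement the same RFC 1071 one's-complement Internet checksum, each hand-tuned for its architecture: a 64-bit accumulator on Alpha, unrolled add-with-carry loops in ARM, i386, and SPARC assembly, and m68k inline asm. For orientation only, here is a minimal, unoptimized sketch of that algorithm in portable C; the function name `internet_checksum` is illustrative and does not appear in the imported sources.

```c
#include <stddef.h>
#include <stdint.h>

/* Illustrative, unoptimized RFC 1071 Internet checksum in portable C.
 * Sums the buffer as big-endian 16-bit words, folds the carries back
 * into the low 16 bits, and returns the one's complement.  A sketch
 * only: very long buffers would need a wider accumulator or periodic
 * folding, which is what the architecture-specific routines do. */
static uint16_t internet_checksum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	/* Accumulate 16-bit words in network byte order. */
	while (len > 1) {
		sum += ((uint32_t) buf[0] << 8) | buf[1];
		buf += 2;
		len -= 2;
	}

	/* An odd trailing byte is treated as if padded with a zero byte. */
	if (len == 1)
		sum += (uint32_t) buf[0] << 8;

	/* Fold any carries above bit 15 back into the low 16 bits. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t) ~sum;
}
```

The imported routines instead accumulate in host byte order with wider registers and fold only at the end (see from64to16 in the Alpha file); RFC 1071's byte-order property is what lets them do so and still produce the correct checksum bytes.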
diff --git a/pfinet/linux-src/arch/alpha/lib/checksum.c b/pfinet/linux-src/arch/alpha/lib/checksum.c new file mode 100644 index 00000000..5165279f --- /dev/null +++ b/pfinet/linux-src/arch/alpha/lib/checksum.c @@ -0,0 +1,169 @@ +/* + * arch/alpha/lib/checksum.c + * + * This file contains network checksum routines that are better done + * in an architecture-specific manner due to speed.. + */ + +#include <linux/string.h> + +#include <asm/byteorder.h> + +static inline unsigned short from64to16(unsigned long x) +{ + /* add up 32-bit words for 33 bits */ + x = (x & 0xffffffff) + (x >> 32); + /* add up 16-bit and 17-bit words for 17+c bits */ + x = (x & 0xffff) + (x >> 16); + /* add up 16-bit and 2-bit for 16+c bit */ + x = (x & 0xffff) + (x >> 16); + /* add up carry.. */ + x = (x & 0xffff) + (x >> 16); + return x; +} + +/* + * computes the checksum of the TCP/UDP pseudo-header + * returns a 16-bit checksum, already complemented. + */ +unsigned short int csum_tcpudp_magic(unsigned long saddr, + unsigned long daddr, + unsigned short len, + unsigned short proto, + unsigned int sum) +{ + return ~from64to16(saddr + daddr + sum + + ((unsigned long) ntohs(len) << 16) + + ((unsigned long) proto << 8)); +} + +unsigned int csum_tcpudp_nofold(unsigned long saddr, + unsigned long daddr, + unsigned short len, + unsigned short proto, + unsigned int sum) +{ + unsigned long result; + + result = (saddr + daddr + sum + + ((unsigned long) ntohs(len) << 16) + + ((unsigned long) proto << 8)); + + /* Fold down to 32-bits so we don't loose in the typedef-less + network stack. */ + /* 64 to 33 */ + result = (result & 0xffffffff) + (result >> 32); + /* 33 to 32 */ + result = (result & 0xffffffff) + (result >> 32); + return result; +} + +/* + * Do a 64-bit checksum on an arbitrary memory area.. + * + * This isn't a great routine, but it's not _horrible_ either. The + * inner loop could be unrolled a bit further, and there are better + * ways to do the carry, but this is reasonable. + */ +static inline unsigned long do_csum(const unsigned char * buff, int len) +{ + int odd, count; + unsigned long result = 0; + + if (len <= 0) + goto out; + odd = 1 & (unsigned long) buff; + if (odd) { + result = *buff << 8; + len--; + buff++; + } + count = len >> 1; /* nr of 16-bit words.. */ + if (count) { + if (2 & (unsigned long) buff) { + result += *(unsigned short *) buff; + count--; + len -= 2; + buff += 2; + } + count >>= 1; /* nr of 32-bit words.. */ + if (count) { + if (4 & (unsigned long) buff) { + result += *(unsigned int *) buff; + count--; + len -= 4; + buff += 4; + } + count >>= 1; /* nr of 64-bit words.. */ + if (count) { + unsigned long carry = 0; + do { + unsigned long w = *(unsigned long *) buff; + count--; + buff += 8; + result += carry; + result += w; + carry = (w > result); + } while (count); + result += carry; + result = (result & 0xffffffff) + (result >> 32); + } + if (len & 4) { + result += *(unsigned int *) buff; + buff += 4; + } + } + if (len & 2) { + result += *(unsigned short *) buff; + buff += 2; + } + } + if (len & 1) + result += *buff; + result = from64to16(result); + if (odd) + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); +out: + return result; +} + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. 
+ */ +unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl) +{ + return ~do_csum(iph,ihl*4); +} + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 32-bit boundary + */ +unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) +{ + unsigned long result = do_csum(buff, len); + + /* add in old sum, and carry.. */ + result += sum; + /* 32+c bits -> 32 bits */ + result = (result & 0xffffffff) + (result >> 32); + return result; +} + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ +unsigned short ip_compute_csum(unsigned char * buff, int len) +{ + return ~from64to16(do_csum(buff,len)); +} diff --git a/pfinet/linux-src/arch/arm/lib/checksum.S b/pfinet/linux-src/arch/arm/lib/checksum.S new file mode 100644 index 00000000..bd5c78d3 --- /dev/null +++ b/pfinet/linux-src/arch/arm/lib/checksum.S @@ -0,0 +1,730 @@ +/* + * linux/arch/arm/lib/checksum.S + * + * Copyright (C) 1995, 1996, 1997, 1998 Russell King + */ +#include <linux/config.h> +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/errno.h> +#include "constants.h" + + .text + +/* Function: __u32 csum_partial(const char *src, int len, __u32) + * Params : r0 = buffer, r1 = len, r2 = checksum + * Returns : r0 = new checksum + */ + +ENTRY(csum_partial) + tst r0, #2 + beq 1f + subs r1, r1, #2 + addmi r1, r1, #2 + bmi 3f + bic r0, r0, #3 + ldr r3, [r0], #4 + adds r2, r2, r3, lsr #16 + adcs r2, r2, #0 +1: adds r2, r2, #0 + bics ip, r1, #31 + beq 3f + stmfd sp!, {r4 - r6} +2: ldmia r0!, {r3 - r6} + adcs r2, r2, r3 + adcs r2, r2, r4 + adcs r2, r2, r5 + adcs r2, r2, r6 + ldmia r0!, {r3 - r6} + adcs r2, r2, r3 + adcs r2, r2, r4 + adcs r2, r2, r5 + adcs r2, r2, r6 + sub ip, ip, #32 + teq ip, #0 + bne 2b + adcs r2, r2, #0 + ldmfd sp!, {r4 - r6} +3: ands ip, r1, #0x1c + beq 5f +4: ldr r3, [r0], #4 + adcs r2, r2, r3 + sub ip, ip, #4 + teq ip, #0 + bne 4b + adcs r2, r2, #0 +5: ands ip, r1, #3 + moveq r0, r2 + RETINSTR(moveq,pc,lr) + mov ip, ip, lsl #3 + rsb ip, ip, #32 + ldr r3, [r0] + mov r3, r3, lsl ip + adds r2, r2, r3, lsr ip + adc r0, r2, #0 + RETINSTR(mov,pc,lr) + +/* Function: __u32 csum_partial_copy_from_user (const char *src, char *dst, int len, __u32 sum, int *err_ptr) + * Params : r0 = src, r1 = dst, r2 = len, r3 = sum, [sp, #0] = &err + * Returns : r0 = checksum, [[sp, #0], #0] = 0 or -EFAULT + */ +#if defined(CONFIG_CPU_32) + + .macro save_regs + stmfd sp!, {r1 - r2, r4 - r8, fp, ip, lr, pc} + .endm + +#define LOAD_REGS(cond) \ + LOADREGS(##cond##ea,fp,{r1 - r2, r4 - r8, fp, sp, pc}) + + .macro load1b, reg1 +9999: ldrbt \reg1, [r0], $1 + .section __ex_table, "a" + .align 3 + .long 9999b, 6001f + .previous + .endm + + .macro load2b, reg1, reg2 +9999: ldrbt \reg1, [r0], $1 +9998: ldrbt \reg2, [r0], $1 + .section __ex_table, "a" + .long 9999b, 6001f + .long 9998b, 6001f + .previous + .endm + + .macro load1l, reg1 +9999: ldrt \reg1, [r0], $4 + .section __ex_table, "a" + .align 3 + .long 9999b, 6001f + .previous + .endm + + .macro load2l, reg1, reg2 +9999: ldrt \reg1, [r0], $4 +9998: ldrt \reg2, [r0], $4 + .section __ex_table, "a" + .long 9999b, 6001f + .long 9998b, 6001f + .previous + .endm + + .macro load4l, reg1, reg2, reg3, reg4 +9999: ldrt \reg1, [r0], $4 +9998: 
ldrt \reg2, [r0], $4 +9997: ldrt \reg3, [r0], $4 +9996: ldrt \reg4, [r0], $4 + .section __ex_table, "a" + .long 9999b, 6001f + .long 9998b, 6001f + .long 9997b, 6001f + .long 9996b, 6001f + .previous + .endm + +#elif defined(CONFIG_CPU_26) + + .macro save_regs + stmfd sp!, {r1 - r2, r4 - r9, fp, ip, lr, pc} + mov r9, sp, lsr #13 + mov r9, r9, lsl #13 + ldr r9, [r9, #TSK_ADDR_LIMIT] + mov r9, r9, lsr #24 + .endm + +#define LOAD_REGS(cond) \ + LOADREGS(##cond##ea,fp,{r1 - r2, r4 - r9, fp, sp, pc}) + + .macro load1b, reg1 + tst r9, #0x01 +9999: ldreqbt \reg1, [r0], #1 + ldrneb \reg1, [r0], #1 + .section __ex_table, "a" + .align 3 + .long 9999b, 6001f + .previous + .endm + + .macro load2b, reg1, reg2 + tst r9, #0x01 +9999: ldreqbt \reg1, [r0], #1 + ldrneb \reg1, [r0], #1 +9998: ldreqbt \reg2, [r0], #1 + ldrneb \reg2, [r0], #1 + .section __ex_table, "a" + .long 9999b, 6001f + .long 9998b, 6001f + .previous + .endm + + .macro load1l, reg1 + tst r9, #0x01 +9999: ldreqt \reg1, [r0], #4 + ldrne \reg1, [r0], #4 + .section __ex_table, "a" + .align 3 + .long 9999b, 6001f + .previous + .endm + + .macro load2l, reg1, reg2 + tst r9, #0x01 + ldmneia r0!, {\reg1, \reg2} +9999: ldreqt \reg1, [r0], #4 +9998: ldreqt \reg2, [r0], #4 + .section __ex_table, "a" + .long 9999b, 6001f + .long 9998b, 6001f + .previous + .endm + + .macro load4l, reg1, reg2, reg3, reg4 + tst r9, #0x01 + ldmneia r0!, {\reg1, \reg2, \reg3, \reg4} +9999: ldreqt \reg1, [r0], #4 +9998: ldreqt \reg2, [r0], #4 +9997: ldreqt \reg3, [r0], #4 +9996: ldreqt \reg4, [r0], #4 + .section __ex_table, "a" + .long 9999b, 6001f + .long 9998b, 6001f + .long 9997b, 6001f + .long 9996b, 6001f + .previous + .endm + +#else +#error Unknown CPU architecture +#endif + +ENTRY(csum_partial_copy_from_user) + mov ip, sp + save_regs + sub fp, ip, #4 + cmp r2, #4 + blt .too_small_user + tst r1, #2 @ Test destination alignment + beq .dst_aligned_user + subs r2, r2, #2 @ We do not know if SRC is aligned... 
+ load2b ip, r8 + orr ip, ip, r8, lsl #8 + adds r3, r3, ip + adcs r3, r3, #0 + strb ip, [r1], #1 + mov ip, ip, lsr #8 + strb ip, [r1], #1 @ Destination now aligned +.dst_aligned_user: + tst r0, #3 + bne .src_not_aligned_user + adds r3, r3, #0 + bics ip, r2, #15 @ Routine for src & dst aligned + beq 2f +1: load4l r4, r5, r6, r7 + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + sub ip, ip, #16 + teq ip, #0 + bne 1b +2: ands ip, r2, #12 + beq 4f + tst ip, #8 + beq 3f + load2l r4, r5 + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + tst ip, #4 + beq 4f +3: load1l r4 + str r4, [r1], #4 + adcs r3, r3, r4 +4: ands r2, r2, #3 + adceq r0, r3, #0 + LOAD_REGS(eq) + load1l r4 + tst r2, #2 + beq .exit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + strb r4, [r1], #1 + mov r4, r4, lsr #8 +.exit: tst r2, #1 + strneb r4, [r1], #1 + andne r4, r4, #255 + adcnes r3, r3, r4 + adcs r0, r3, #0 + LOAD_REGS(al) + +.too_small_user: + teq r2, #0 + LOAD_REGS(eq) + cmp r2, #2 + blt .too_small_user1 + load2b ip, r8 + orr ip, ip, r8, lsl #8 + adds r3, r3, ip + strb ip, [r1], #1 + strb r8, [r1], #1 + tst r2, #1 +.too_small_user1: @ C = 0 + beq .csum_exit + load1b ip + strb ip, [r1], #1 + adcs r3, r3, ip +.csum_exit: adc r0, r3, #0 + LOAD_REGS(al) + +.src_not_aligned_user: + cmp r2, #4 + blt .too_small_user + and ip, r0, #3 + bic r0, r0, #3 + load1l r4 + cmp ip, #2 + beq .src2_aligned_user + bhi .src3_aligned_user + mov r4, r4, lsr #8 + adds r3, r3, #0 + bics ip, r2, #15 + beq 2f +1: load4l r5, r6, r7, r8 + orr r4, r4, r5, lsl #24 + mov r5, r5, lsr #8 + orr r5, r5, r6, lsl #24 + mov r6, r6, lsr #8 + orr r6, r6, r7, lsl #24 + mov r7, r7, lsr #8 + orr r7, r7, r8, lsl #24 + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + mov r4, r8, lsr #8 + sub ip, ip, #16 + teq ip, #0 + bne 1b +2: ands ip, r2, #12 + beq 4f + tst ip, #8 + beq 3f + load2l r5, r6 + orr r4, r4, r5, lsl #24 + mov r5, r5, lsr #8 + orr r5, r5, r6, lsl #24 + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + mov r4, r6, lsr #8 + tst ip, #4 + beq 4f +3: load1l r5 + orr r4, r4, r5, lsl #24 + str r4, [r1], #4 + adcs r3, r3, r4 + mov r4, r5, lsr #8 +4: ands r2, r2, #3 + adceq r0, r3, #0 + LOAD_REGS(eq) + tst r2, #2 + beq .exit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + b .exit + +.src2_aligned_user: + mov r4, r4, lsr #16 + adds r3, r3, #0 + bics ip, r2, #15 + beq 2f +1: load4l r5, r6, r7, r8 + orr r4, r4, r5, lsl #16 + mov r5, r5, lsr #16 + orr r5, r5, r6, lsl #16 + mov r6, r6, lsr #16 + orr r6, r6, r7, lsl #16 + mov r7, r7, lsr #16 + orr r7, r7, r8, lsl #16 + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + mov r4, r8, lsr #16 + sub ip, ip, #16 + teq ip, #0 + bne 1b +2: ands ip, r2, #12 + beq 4f + tst ip, #8 + beq 3f + load2l r5, r6 + orr r4, r4, r5, lsl #16 + mov r5, r5, lsr #16 + orr r5, r5, r6, lsl #16 + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + mov r4, r6, lsr #16 + tst ip, #4 + beq 4f +3: load1l r5 + orr r4, r4, r5, lsl #16 + str r4, [r1], #4 + adcs r3, r3, r4 + mov r4, r5, lsr #16 +4: ands r2, r2, #3 + adceq r0, r3, #0 + LOAD_REGS(eq) + tst r2, #2 + beq .exit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + strb r4, [r1], #1 + load1b r4 + b .exit + +.src3_aligned_user: + mov r4, r4, lsr #24 + adds r3, r3, #0 + bics ip, r2, #15 + beq 2f +1: load4l r5, r6, r7, r8 
+ orr r4, r4, r5, lsl #8 + mov r5, r5, lsr #24 + orr r5, r5, r6, lsl #8 + mov r6, r6, lsr #24 + orr r6, r6, r7, lsl #8 + mov r7, r7, lsr #24 + orr r7, r7, r8, lsl #8 + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + mov r4, r8, lsr #24 + sub ip, ip, #16 + teq ip, #0 + bne 1b +2: ands ip, r2, #12 + beq 4f + tst ip, #8 + beq 3f + load2l r5, r6 + orr r4, r4, r5, lsl #8 + mov r5, r5, lsr #24 + orr r5, r5, r6, lsl #8 + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + mov r4, r6, lsr #24 + tst ip, #4 + beq 4f +3: load1l r5 + orr r4, r4, r5, lsl #8 + str r4, [r1], #4 + adcs r3, r3, r4 + mov r4, r5, lsr #24 +4: ands r2, r2, #3 + adceq r0, r3, #0 + LOAD_REGS(eq) + tst r2, #2 + beq .exit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + load1l r4 + strb r4, [r1], #1 + adcs r3, r3, r4, lsl #24 + mov r4, r4, lsr #8 + b .exit + +#if defined(CONFIG_CPU_32) + .section .fixup,"ax" +#endif + .align 4 +6001: mov r4, #-EFAULT + ldr r5, [fp, #4] + str r4, [r5] + ldmia sp, {r1, r2} @ retrieve original arguments + add r2, r2, r1 + mov r3, #0 @ zero the buffer +6002: teq r2, r1 + strneb r3, [r1], #1 + bne 6002b + LOAD_REGS(al) +#if defined(CONFIG_CPU_32) + .previous +#endif + +/* Function: __u32 csum_partial_copy (const char *src, char *dst, int len, __u32 sum) + * Params : r0 = src, r1 = dst, r2 = len, r3 = checksum + * Returns : r0 = new checksum + */ +ENTRY(csum_partial_copy_nocheck) +ENTRY(csum_partial_copy) + mov ip, sp + stmfd sp!, {r4 - r8, fp, ip, lr, pc} + sub fp, ip, #4 + cmp r2, #4 + blt Ltoo_small + tst r1, #2 @ Test destination alignment + beq Ldst_aligned + subs r2, r2, #2 @ We do not know if SRC is aligned... + ldrb ip, [r0], #1 + ldrb r8, [r0], #1 + orr ip, ip, r8, lsl #8 + adds r3, r3, ip + adcs r3, r3, #0 + strb ip, [r1], #1 + mov ip, ip, lsr #8 + strb ip, [r1], #1 @ Destination now aligned +Ldst_aligned: tst r0, #3 + bne Lsrc_not_aligned + adds r3, r3, #0 + bics ip, r2, #15 @ Routine for src & dst aligned + beq 3f +1: ldmia r0!, {r4, r5, r6, r7} + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + sub ip, ip, #16 + teq ip, #0 + bne 1b +3: ands ip, r2, #12 + beq 5f + tst ip, #8 + beq 4f + ldmia r0!, {r4, r5} + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + tst ip, #4 + beq 5f +4: ldr r4, [r0], #4 + str r4, [r1], #4 + adcs r3, r3, r4 +5: ands r2, r2, #3 + adceq r0, r3, #0 + LOADREGS(eqea,fp,{r4 - r8, fp, sp, pc}) + ldr r4, [r0], #4 + tst r2, #2 + beq Lexit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + b Lexit + +Ltoo_small: teq r2, #0 + LOADREGS(eqea,fp,{r4 - r8, fp, sp, pc}) + cmp r2, #2 + blt Ltoo_small1 + ldrb ip, [r0], #1 + ldrb r8, [r0], #1 + orr ip, ip, r8, lsl #8 + adds r3, r3, ip + strb ip, [r1], #1 + strb r8, [r1], #1 +Lexit: tst r2, #1 +Ltoo_small1: ldrneb ip, [r0], #1 + strneb ip, [r1], #1 + adcnes r3, r3, ip + adcs r0, r3, #0 + LOADREGS(ea,fp,{r4 - r8, fp, sp, pc}) + +Lsrc_not_aligned: + cmp r2, #4 + blt Ltoo_small + and ip, r0, #3 + bic r0, r0, #3 + ldr r4, [r0], #4 + cmp ip, #2 + beq Lsrc2_aligned + bhi Lsrc3_aligned + mov r4, r4, lsr #8 + adds r3, r3, #0 + bics ip, r2, #15 + beq 2f +1: ldmia r0!, {r5, r6, r7, r8} + orr r4, r4, r5, lsl #24 + mov r5, r5, lsr #8 + orr r5, r5, r6, lsl #24 + mov r6, r6, lsr #8 + orr r6, r6, r7, lsl #24 + mov r7, r7, lsr #8 + orr r7, r7, r8, lsl #24 + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + mov r4, r8, lsr 
#8 + sub ip, ip, #16 + teq ip, #0 + bne 1b +2: ands ip, r2, #12 + beq 4f + tst ip, #8 + beq 3f + ldmia r0!, {r5, r6} + orr r4, r4, r5, lsl #24 + mov r5, r5, lsr #8 + orr r5, r5, r6, lsl #24 + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + mov r4, r6, lsr #8 + tst ip, #4 + beq 4f +3: ldr r5, [r0], #4 + orr r4, r4, r5, lsl #24 + str r4, [r1], #4 + adcs r3, r3, r4 + mov r4, r5, lsr #8 +4: ands r2, r2, #3 + adceq r0, r3, #0 + LOADREGS(eqea,fp,{r4 - r8, fp, sp, pc}) + tst r2, #2 + beq Lexit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + b Lexit + +Lsrc2_aligned: mov r4, r4, lsr #16 + adds r3, r3, #0 + bics ip, r2, #15 + beq 2f +1: ldmia r0!, {r5, r6, r7, r8} + orr r4, r4, r5, lsl #16 + mov r5, r5, lsr #16 + orr r5, r5, r6, lsl #16 + mov r6, r6, lsr #16 + orr r6, r6, r7, lsl #16 + mov r7, r7, lsr #16 + orr r7, r7, r8, lsl #16 + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + mov r4, r8, lsr #16 + sub ip, ip, #16 + teq ip, #0 + bne 1b +2: ands ip, r2, #12 + beq 4f + tst ip, #8 + beq 3f + ldmia r0!, {r5, r6} + orr r4, r4, r5, lsl #16 + mov r5, r5, lsr #16 + orr r5, r5, r6, lsl #16 + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + mov r4, r6, lsr #16 + tst ip, #4 + beq 4f +3: ldr r5, [r0], #4 + orr r4, r4, r5, lsl #16 + str r4, [r1], #4 + adcs r3, r3, r4 + mov r4, r5, lsr #16 +4: ands r2, r2, #3 + adceq r0, r3, #0 + LOADREGS(eqea,fp,{r4 - r8, fp, sp, pc}) + tst r2, #2 + beq Lexit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + strb r4, [r1], #1 + ldrb r4, [r0], #1 + b Lexit + +Lsrc3_aligned: mov r4, r4, lsr #24 + adds r3, r3, #0 + bics ip, r2, #15 + beq 2f +1: ldmia r0!, {r5, r6, r7, r8} + orr r4, r4, r5, lsl #8 + mov r5, r5, lsr #24 + orr r5, r5, r6, lsl #8 + mov r6, r6, lsr #24 + orr r6, r6, r7, lsl #8 + mov r7, r7, lsr #24 + orr r7, r7, r8, lsl #8 + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + mov r4, r8, lsr #24 + sub ip, ip, #16 + teq ip, #0 + bne 1b +2: ands ip, r2, #12 + beq 4f + tst ip, #8 + beq 3f + ldmia r0!, {r5, r6} + orr r4, r4, r5, lsl #8 + mov r5, r5, lsr #24 + orr r5, r5, r6, lsl #8 + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + mov r4, r6, lsr #24 + tst ip, #4 + beq 4f +3: ldr r5, [r0], #4 + orr r4, r4, r5, lsl #8 + str r4, [r1], #4 + adcs r3, r3, r4 + mov r4, r5, lsr #24 +4: ands r2, r2, #3 + adceq r0, r3, #0 + LOADREGS(eqea,fp,{r4 - r8, fp, sp, pc}) + tst r2, #2 + beq Lexit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + ldr r4, [r0], #4 + strb r4, [r1], #1 + adcs r3, r3, r4, lsl #24 + mov r4, r4, lsr #8 + b Lexit + +ENTRY(__csum_ipv6_magic) + stmfd sp!, {lr} + adds ip, r2, r3 + ldmia r1, {r1 - r3, lr} + adcs ip, ip, r1 + adcs ip, ip, r2 + adcs ip, ip, r3 + adcs ip, ip, lr + ldmia r0, {r0 - r3} + adcs r0, ip, r0 + adcs r0, r0, r1 + adcs r0, r0, r2 + adcs r0, r0, r3 + ldr r3, [sp, #4] + adcs r0, r0, r3 + adcs r0, r0, #0 + LOADREGS(fd, sp!, {pc}) diff --git a/pfinet/linux-src/arch/i386/lib/checksum.S b/pfinet/linux-src/arch/i386/lib/checksum.S new file mode 100644 index 00000000..af10dc7c --- /dev/null +++ b/pfinet/linux-src/arch/i386/lib/checksum.S @@ -0,0 +1,447 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. 
+ * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Tom May, <ftom@netcom.com> + * Pentium Pro/II routines: + * Alexander Kjeldaas <astor@guardian.no> + * Finn Arne Gangstad <finnag@guardian.no> + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception + * handling. + * Andi Kleen, add zeroing on error + * converted to pure assembler + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/errno.h> + +/* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ + +/* +unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) + */ + +.text +.align 4 +.globl csum_partial + +#if CPU!=686 + + /* + * Experiments with Ethernet and SLIP connections show that buff + * is aligned on either a 2-byte or 4-byte boundary. We get at + * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. + * Fortunately, it is easy to convert 2-byte alignment to 4-byte + * alignment for the unrolled loop. + */ +csum_partial: + pushl %esi + pushl %ebx + movl 20(%esp),%eax # Function arg: unsigned int sum + movl 16(%esp),%ecx # Function arg: int len + movl 12(%esp),%esi # Function arg: unsigned char *buff + testl $2, %esi # Check alignment. + jz 2f # Jump if alignment is ok. + subl $2, %ecx # Alignment uses up two bytes. + jae 1f # Jump if we had at least two bytes. + addl $2, %ecx # ecx was < 2. Deal with it. + jmp 4f +1: movw (%esi), %bx + addl $2, %esi + addw %bx, %ax + adcl $0, %eax +2: + movl %ecx, %edx + shrl $5, %ecx + jz 2f + testl %esi, %esi +1: movl (%esi), %ebx + adcl %ebx, %eax + movl 4(%esi), %ebx + adcl %ebx, %eax + movl 8(%esi), %ebx + adcl %ebx, %eax + movl 12(%esi), %ebx + adcl %ebx, %eax + movl 16(%esi), %ebx + adcl %ebx, %eax + movl 20(%esi), %ebx + adcl %ebx, %eax + movl 24(%esi), %ebx + adcl %ebx, %eax + movl 28(%esi), %ebx + adcl %ebx, %eax + lea 32(%esi), %esi + dec %ecx + jne 1b + adcl $0, %eax +2: movl %edx, %ecx + andl $0x1c, %edx + je 4f + shrl $2, %edx # This clears CF +3: adcl (%esi), %eax + lea 4(%esi), %esi + dec %edx + jne 3b + adcl $0, %eax +4: andl $3, %ecx + jz 7f + cmpl $2, %ecx + jb 5f + movw (%esi),%cx + leal 2(%esi),%esi + je 6f + shll $16,%ecx +5: movb (%esi),%cl +6: addl %ecx,%eax + adcl $0, %eax +7: + popl %ebx + popl %esi + ret + +#else /* CPU==686 */ + +csum_partial: + movl 12(%esp),%eax # Function arg: unsigned int sum + movl 8(%esp),%ecx # Function arg: int len + movl 4(%esp),%esi # Function arg: const unsigned char *buf + + testl $2, %esi + jnz 30f +10: + movl %ecx, %edx + movl %ecx, %ebx + andl $0x7c, %ebx + shrl $7, %ecx + addl %ebx,%esi + shrl $2, %ebx + negl %ebx + lea 45f(%ebx,%ebx,2), %ebx + testl %esi, %esi + jmp *%ebx + + # Handle 2-byte-aligned regions +20: addw (%esi), %ax + lea 2(%esi), %esi + adcl $0, %eax + jmp 10b + +30: subl $2, %ecx + ja 20b + je 32f + movzbl (%esi),%ebx # csumming 1 byte, 2-aligned + addl %ebx, %eax + adcl $0, %eax + jmp 80f +32: + addw (%esi), %ax # csumming 2 bytes, 2-aligned + adcl $0, %eax + jmp 80f + +40: + addl -128(%esi), %eax + adcl -124(%esi), %eax + adcl -120(%esi), %eax + adcl -116(%esi), %eax + adcl -112(%esi), %eax + adcl -108(%esi), %eax + adcl -104(%esi), %eax + adcl 
-100(%esi), %eax + adcl -96(%esi), %eax + adcl -92(%esi), %eax + adcl -88(%esi), %eax + adcl -84(%esi), %eax + adcl -80(%esi), %eax + adcl -76(%esi), %eax + adcl -72(%esi), %eax + adcl -68(%esi), %eax + adcl -64(%esi), %eax + adcl -60(%esi), %eax + adcl -56(%esi), %eax + adcl -52(%esi), %eax + adcl -48(%esi), %eax + adcl -44(%esi), %eax + adcl -40(%esi), %eax + adcl -36(%esi), %eax + adcl -32(%esi), %eax + adcl -28(%esi), %eax + adcl -24(%esi), %eax + adcl -20(%esi), %eax + adcl -16(%esi), %eax + adcl -12(%esi), %eax + adcl -8(%esi), %eax + adcl -4(%esi), %eax +45: + lea 128(%esi), %esi + adcl $0, %eax + dec %ecx + jge 40b + movl %edx, %ecx +50: andl $3, %ecx + jz 80f + + # Handle the last 1-3 bytes without jumping + notl %ecx # 1->2, 2->1, 3->0, higher bits are masked + movl $0xffffff,%ebx # by the shll and shrl instructions + shll $3,%ecx + shrl %cl,%ebx + andl -128(%esi),%ebx # esi is 4-aligned so should be ok + addl %ebx,%eax + adcl $0,%eax +80: + ret + +#endif /* CPU==686 */ + +/* +unsigned int csum_partial_copy_generic (const char *src, char *dst, + int len, int sum, int *src_err_ptr, int *dst_err_ptr) + */ + +/* + * Copy from ds while checksumming, otherwise like csum_partial + * + * The macros SRC and DST specify the type of access for the instruction. + * thus we can call a custom exception handler for all access types. + * + * FIXME: could someone double-check whether I haven't mixed up some SRC and + * DST definitions? It's damn hard to trigger all cases. I hope I got + * them all but there's no guarantee. + */ + +#define SRC(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6001f ; \ + .previous + +#define DST(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6002f ; \ + .previous + +.align 4 +.globl csum_partial_copy_generic + +#if CPU!=686 + +#define ARGBASE 16 +#define FP 12 + +csum_partial_copy_generic: + subl $4,%esp + pushl %edi + pushl %esi + pushl %ebx + movl ARGBASE+16(%esp),%eax # sum + movl ARGBASE+12(%esp),%ecx # len + movl ARGBASE+4(%esp),%esi # src + movl ARGBASE+8(%esp),%edi # dst + + testl $2, %edi # Check alignment. + jz 2f # Jump if alignment is ok. + subl $2, %ecx # Alignment uses up two bytes. + jae 1f # Jump if we had at least two bytes. + addl $2, %ecx # ecx was < 2. Deal with it. 
+ jmp 4f +SRC(1: movw (%esi), %bx ) + addl $2, %esi +DST( movw %bx, (%edi) ) + addl $2, %edi + addw %bx, %ax + adcl $0, %eax +2: + movl %ecx, FP(%esp) + shrl $5, %ecx + jz 2f + testl %esi, %esi +SRC(1: movl (%esi), %ebx ) +SRC( movl 4(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + adcl %edx, %eax +DST( movl %edx, 4(%edi) ) + +SRC( movl 8(%esi), %ebx ) +SRC( movl 12(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 8(%edi) ) + adcl %edx, %eax +DST( movl %edx, 12(%edi) ) + +SRC( movl 16(%esi), %ebx ) +SRC( movl 20(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 16(%edi) ) + adcl %edx, %eax +DST( movl %edx, 20(%edi) ) + +SRC( movl 24(%esi), %ebx ) +SRC( movl 28(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 24(%edi) ) + adcl %edx, %eax +DST( movl %edx, 28(%edi) ) + + lea 32(%esi), %esi + lea 32(%edi), %edi + dec %ecx + jne 1b + adcl $0, %eax +2: movl FP(%esp), %edx + movl %edx, %ecx + andl $0x1c, %edx + je 4f + shrl $2, %edx # This clears CF +SRC(3: movl (%esi), %ebx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + lea 4(%esi), %esi + lea 4(%edi), %edi + dec %edx + jne 3b + adcl $0, %eax +4: andl $3, %ecx + jz 7f + cmpl $2, %ecx + jb 5f +SRC( movw (%esi), %cx ) + leal 2(%esi), %esi +DST( movw %cx, (%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%ecx +SRC(5: movb (%esi), %cl ) +DST( movb %cl, (%edi) ) +6: addl %ecx, %eax + adcl $0, %eax +7: +5000: + +# Exception handler: +.section .fixup, "ax" + +6001: + movl ARGBASE+20(%esp), %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + + # zero the complete destination - computing the rest + # is too much work + movl ARGBASE+8(%esp), %edi # dst + movl ARGBASE+12(%esp), %ecx # len + xorl %eax,%eax + rep ; stosb + + jmp 5000b + +6002: + movl ARGBASE+24(%esp), %ebx # dst_err_ptr + movl $-EFAULT,(%ebx) + jmp 5000b + +.previous + + popl %ebx + popl %esi + popl %edi + popl %ecx # equivalent to addl $4,%esp + ret + +#else + +/* Version for PentiumII/PPro */ + +#define ROUND1(x) \ + SRC(movl x(%esi), %ebx ) ; \ + addl %ebx, %eax ; \ + DST(movl %ebx, x(%edi) ) ; + +#define ROUND(x) \ + SRC(movl x(%esi), %ebx ) ; \ + adcl %ebx, %eax ; \ + DST(movl %ebx, x(%edi) ) ; + +#define ARGBASE 12 + +csum_partial_copy_generic: + pushl %ebx + pushl %edi + pushl %esi + movl ARGBASE+4(%esp),%esi #src + movl ARGBASE+8(%esp),%edi #dst + movl ARGBASE+12(%esp),%ecx #len + movl ARGBASE+16(%esp),%eax #sum + movl %ecx, %edx + movl %ecx, %ebx + shrl $6, %ecx + andl $0x3c, %ebx + negl %ebx + subl %ebx, %esi + subl %ebx, %edi + lea 3f(%ebx,%ebx), %ebx + testl %esi, %esi + jmp *%ebx +1: addl $64,%esi + addl $64,%edi + ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52) + ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36) + ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20) + ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4) +3: adcl $0,%eax + dec %ecx + jge 1b +4: andl $3, %edx + jz 7f + cmpl $2, %edx + jb 5f +SRC( movw (%esi), %dx ) + leal 2(%esi), %esi +DST( movw %dx, (%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%edx +5: +SRC( movb (%esi), %dl ) +DST( movb %dl, (%edi) ) +6: addl %edx, %eax + adcl $0, %eax +7: +.section .fixup, "ax" +6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + # zero the complete destination (computing the rest is too much work) + movl ARGBASE+8(%esp),%edi # dst + movl ARGBASE+12(%esp),%ecx # len + xorl %eax,%eax + rep; stosb + jmp 7b +6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr + movl $-EFAULT, (%ebx) + jmp 7b +.previous + + popl %esi + popl %edi + popl %ebx + ret + +#undef ROUND +#undef ROUND1 + +#endif /* CPU==i686 */ diff --git 
a/pfinet/linux-src/arch/i386/lib/old-checksum.c b/pfinet/linux-src/arch/i386/lib/old-checksum.c new file mode 100644 index 00000000..ae3a3804 --- /dev/null +++ b/pfinet/linux-src/arch/i386/lib/old-checksum.c @@ -0,0 +1,19 @@ +/* + * FIXME: old compatibility stuff, will be removed soon. + */ + +#include <net/checksum.h> + +unsigned int csum_partial_copy( const char *src, char *dst, int len, int sum) +{ + int src_err=0, dst_err=0; + + sum = csum_partial_copy_generic ( src, dst, len, sum, &src_err, &dst_err); + + if (src_err || dst_err) + printk("old csum_partial_copy_fromuser(), tell mingo to convert me.\n"); + + return sum; +} + + diff --git a/pfinet/linux-src/arch/m68k/lib/checksum.c b/pfinet/linux-src/arch/m68k/lib/checksum.c new file mode 100644 index 00000000..5110cac4 --- /dev/null +++ b/pfinet/linux-src/arch/m68k/lib/checksum.c @@ -0,0 +1,420 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Tom May, <ftom@netcom.com> + * Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de> + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * 03/02/96 Jes Sorensen, Andreas Schwab, Roman Hodek: + * Fixed some nasty bugs, causing some horrible crashes. + * A: At some points, the sum (%0) was used as + * length-counter instead of the length counter + * (%1). Thanks to Roman Hodek for pointing this out. + * B: GCC seems to mess up if one uses too many + * data-registers to hold input values and one tries to + * specify d0 and d1 as scratch registers. Letting gcc + * choose these registers itself solves the problem. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1998/8/31 Andreas Schwab: + * Zero out rest of buffer on exception in + * csum_partial_copy_from_user. + */ + +#include <net/checksum.h> + +/* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ + +unsigned int +csum_partial (const unsigned char *buff, int len, unsigned int sum) +{ + unsigned long tmp1, tmp2; + /* + * Experiments with ethernet and slip connections show that buff + * is aligned on either a 2-byte or 4-byte boundary. + */ + __asm__("movel %2,%3\n\t" + "btst #1,%3\n\t" /* Check alignment */ + "jeq 2f\n\t" + "subql #2,%1\n\t" /* buff%4==2: treat first word */ + "jgt 1f\n\t" + "addql #2,%1\n\t" /* len was == 2, treat only rest */ + "jra 4f\n" + "1:\t" + "addw %2@+,%0\n\t" /* add first word to sum */ + "clrl %3\n\t" + "addxl %3,%0\n" /* add X bit */ + "2:\t" + /* unrolled loop for the main part: do 8 longs at once */ + "movel %1,%3\n\t" /* save len in tmp1 */ + "lsrl #5,%1\n\t" /* len/32 */ + "jeq 2f\n\t" /* not enough... 
*/ + "subql #1,%1\n" + "1:\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "dbra %1,1b\n\t" + "clrl %4\n\t" + "addxl %4,%0\n\t" /* add X bit */ + "clrw %1\n\t" + "subql #1,%1\n\t" + "jcc 1b\n" + "2:\t" + "movel %3,%1\n\t" /* restore len from tmp1 */ + "andw #0x1c,%3\n\t" /* number of rest longs */ + "jeq 4f\n\t" + "lsrw #2,%3\n\t" + "subqw #1,%3\n" + "3:\t" + /* loop for rest longs */ + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "dbra %3,3b\n\t" + "clrl %4\n\t" + "addxl %4,%0\n" /* add X bit */ + "4:\t" + /* now check for rest bytes that do not fit into longs */ + "andw #3,%1\n\t" + "jeq 7f\n\t" + "clrl %4\n\t" /* clear tmp2 for rest bytes */ + "subqw #2,%1\n\t" + "jlt 5f\n\t" + "movew %2@+,%4\n\t" /* have rest >= 2: get word */ + "swap %4\n\t" /* into bits 16..31 */ + "tstw %1\n\t" /* another byte? */ + "jeq 6f\n" + "5:\t" + "moveb %2@,%4\n\t" /* have odd rest: get byte */ + "lslw #8,%4\n\t" /* into bits 8..15; 16..31 untouched */ + "6:\t" + "addl %4,%0\n\t" /* now add rest long to sum */ + "clrl %4\n\t" + "addxl %4,%0\n" /* add X bit */ + "7:\t" + : "=d" (sum), "=d" (len), "=a" (buff), + "=&d" (tmp1), "=&d" (tmp2) + : "0" (sum), "1" (len), "2" (buff) + ); + return(sum); +} + + + +/* + * copy from user space while checksumming, with exception handling. + */ + +unsigned int +csum_partial_copy_from_user(const char *src, char *dst, int len, + int sum, int *csum_err) +{ + /* + * GCC doesn't like more than 10 operands for the asm + * statements so we have to use tmp2 for the error + * code. + */ + unsigned long tmp1, tmp2; + + __asm__("movel %2,%4\n\t" + "btst #1,%4\n\t" /* Check alignment */ + "jeq 2f\n\t" + "subql #2,%1\n\t" /* buff%4==2: treat first word */ + "jgt 1f\n\t" + "addql #2,%1\n\t" /* len was == 2, treat only rest */ + "jra 4f\n" + "1:\n" + "10:\t" + "movesw %2@+,%4\n\t" /* add first word to sum */ + "addw %4,%0\n\t" + "movew %4,%3@+\n\t" + "clrl %4\n\t" + "addxl %4,%0\n" /* add X bit */ + "2:\t" + /* unrolled loop for the main part: do 8 longs at once */ + "movel %1,%4\n\t" /* save len in tmp1 */ + "lsrl #5,%1\n\t" /* len/32 */ + "jeq 2f\n\t" /* not enough... 
*/ + "subql #1,%1\n" + "1:\n" + "11:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "12:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "13:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "14:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "15:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "16:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "17:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "18:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "dbra %1,1b\n\t" + "clrl %5\n\t" + "addxl %5,%0\n\t" /* add X bit */ + "clrw %1\n\t" + "subql #1,%1\n\t" + "jcc 1b\n" + "2:\t" + "movel %4,%1\n\t" /* restore len from tmp1 */ + "andw #0x1c,%4\n\t" /* number of rest longs */ + "jeq 4f\n\t" + "lsrw #2,%4\n\t" + "subqw #1,%4\n" + "3:\n" + /* loop for rest longs */ + "19:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "dbra %4,3b\n\t" + "clrl %5\n\t" + "addxl %5,%0\n" /* add X bit */ + "4:\t" + /* now check for rest bytes that do not fit into longs */ + "andw #3,%1\n\t" + "jeq 7f\n\t" + "clrl %5\n\t" /* clear tmp2 for rest bytes */ + "subqw #2,%1\n\t" + "jlt 5f\n\t" + "20:\t" + "movesw %2@+,%5\n\t" /* have rest >= 2: get word */ + "movew %5,%3@+\n\t" + "swap %5\n\t" /* into bits 16..31 */ + "tstw %1\n\t" /* another byte? */ + "jeq 6f\n" + "5:\n" + "21:\t" + "movesb %2@,%5\n\t" /* have odd rest: get byte */ + "moveb %5,%3@+\n\t" + "lslw #8,%5\n\t" /* into bits 8..15; 16..31 untouched */ + "6:\t" + "addl %5,%0\n\t" /* now add rest long to sum */ + "clrl %5\n\t" + "addxl %5,%0\n\t" /* add X bit */ + "7:\t" + "clrl %5\n" /* no error - clear return value */ + "8:\n" + ".section .fixup,\"ax\"\n" + ".even\n" + /* If any execption occurs zero out the rest. 
+ Similarities with the code above are intentional :-) */ + "90:\t" + "clrw %3@+\n\t" + "movel %1,%4\n\t" + "lsrl #5,%1\n\t" + "jeq 1f\n\t" + "subql #1,%1\n" + "91:\t" + "clrl %3@+\n" + "92:\t" + "clrl %3@+\n" + "93:\t" + "clrl %3@+\n" + "94:\t" + "clrl %3@+\n" + "95:\t" + "clrl %3@+\n" + "96:\t" + "clrl %3@+\n" + "97:\t" + "clrl %3@+\n" + "98:\t" + "clrl %3@+\n\t" + "dbra %1,91b\n\t" + "clrw %1\n\t" + "subql #1,%1\n\t" + "jcc 91b\n" + "1:\t" + "movel %4,%1\n\t" + "andw #0x1c,%4\n\t" + "jeq 1f\n\t" + "lsrw #2,%4\n\t" + "subqw #1,%4\n" + "99:\t" + "clrl %3@+\n\t" + "dbra %4,99b\n\t" + "1:\t" + "andw #3,%1\n\t" + "jeq 9f\n" + "100:\t" + "clrw %3@+\n\t" + "tstw %1\n\t" + "jeq 9f\n" + "101:\t" + "clrb %3@+\n" + "9:\t" +#define STR(X) STR1(X) +#define STR1(X) #X + "moveq #-" STR(EFAULT) ",%5\n\t" + "jra 8b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + ".long 10b,90b\n" + ".long 11b,91b\n" + ".long 12b,92b\n" + ".long 13b,93b\n" + ".long 14b,94b\n" + ".long 15b,95b\n" + ".long 16b,96b\n" + ".long 17b,97b\n" + ".long 18b,98b\n" + ".long 19b,99b\n" + ".long 20b,100b\n" + ".long 21b,101b\n" + ".previous" + : "=d" (sum), "=d" (len), "=a" (src), "=a" (dst), + "=&d" (tmp1), "=d" (tmp2) + : "0" (sum), "1" (len), "2" (src), "3" (dst) + ); + + *csum_err = tmp2; + + return(sum); +} + +/* + * copy from kernel space while checksumming, otherwise like csum_partial + */ + +unsigned int +csum_partial_copy(const char *src, char *dst, int len, int sum) +{ + unsigned long tmp1, tmp2; + __asm__("movel %2,%4\n\t" + "btst #1,%4\n\t" /* Check alignment */ + "jeq 2f\n\t" + "subql #2,%1\n\t" /* buff%4==2: treat first word */ + "jgt 1f\n\t" + "addql #2,%1\n\t" /* len was == 2, treat only rest */ + "jra 4f\n" + "1:\t" + "movew %2@+,%4\n\t" /* add first word to sum */ + "addw %4,%0\n\t" + "movew %4,%3@+\n\t" + "clrl %4\n\t" + "addxl %4,%0\n" /* add X bit */ + "2:\t" + /* unrolled loop for the main part: do 8 longs at once */ + "movel %1,%4\n\t" /* save len in tmp1 */ + "lsrl #5,%1\n\t" /* len/32 */ + "jeq 2f\n\t" /* not enough... */ + "subql #1,%1\n" + "1:\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "dbra %1,1b\n\t" + "clrl %5\n\t" + "addxl %5,%0\n\t" /* add X bit */ + "clrw %1\n\t" + "subql #1,%1\n\t" + "jcc 1b\n" + "2:\t" + "movel %4,%1\n\t" /* restore len from tmp1 */ + "andw #0x1c,%4\n\t" /* number of rest longs */ + "jeq 4f\n\t" + "lsrw #2,%4\n\t" + "subqw #1,%4\n" + "3:\t" + /* loop for rest longs */ + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "dbra %4,3b\n\t" + "clrl %5\n\t" + "addxl %5,%0\n" /* add X bit */ + "4:\t" + /* now check for rest bytes that do not fit into longs */ + "andw #3,%1\n\t" + "jeq 7f\n\t" + "clrl %5\n\t" /* clear tmp2 for rest bytes */ + "subqw #2,%1\n\t" + "jlt 5f\n\t" + "movew %2@+,%5\n\t" /* have rest >= 2: get word */ + "movew %5,%3@+\n\t" + "swap %5\n\t" /* into bits 16..31 */ + "tstw %1\n\t" /* another byte? 
*/ + "jeq 6f\n" + "5:\t" + "moveb %2@,%5\n\t" /* have odd rest: get byte */ + "moveb %5,%3@+\n\t" + "lslw #8,%5\n" /* into bits 8..15; 16..31 untouched */ + "6:\t" + "addl %5,%0\n\t" /* now add rest long to sum */ + "clrl %5\n\t" + "addxl %5,%0\n" /* add X bit */ + "7:\t" + : "=d" (sum), "=d" (len), "=a" (src), "=a" (dst), + "=&d" (tmp1), "=&d" (tmp2) + : "0" (sum), "1" (len), "2" (src), "3" (dst) + ); + return(sum); +} diff --git a/pfinet/linux-src/arch/ppc/lib/checksum.S b/pfinet/linux-src/arch/ppc/lib/checksum.S new file mode 100644 index 00000000..66a2e3aa --- /dev/null +++ b/pfinet/linux-src/arch/ppc/lib/checksum.S @@ -0,0 +1,194 @@ +/* + * This file contains assembly-language implementations + * of IP-style 1's complement checksum routines. + * + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). + */ + +#include <linux/sys.h> +#include <asm/processor.h> +#include <asm/errno.h> +#include "../kernel/ppc_asm.tmpl" + + .text + +/* + * ip_fast_csum(buf, len) -- Optimized for IP header + * len is in words and is always >= 5. + */ +_GLOBAL(ip_fast_csum) + lwz r0,0(r3) + lwzu r5,4(r3) + addi r4,r4,-2 + addc r0,r0,r5 + mtctr r4 +1: lwzu r4,4(r3) + adde r0,r0,r4 + bdnz 1b + addze r0,r0 /* add in final carry */ + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * Compute checksum of TCP or UDP pseudo-header: + * csum_tcpudp_magic(saddr, daddr, len, proto, sum) + */ +_GLOBAL(csum_tcpudp_magic) + rlwimi r5,r6,16,0,15 /* put proto in upper half of len */ + addc r0,r3,r4 /* add 4 32-bit words together */ + adde r0,r0,r5 + adde r0,r0,r7 + addze r0,r0 /* add in final carry */ + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * csum_partial(buff, len, sum) + */ +_GLOBAL(csum_partial) + addic r0,r5,0 + subi r3,r3,4 + srwi. r6,r4,2 + beq 3f /* if we're doing < 4 bytes */ + andi. r5,r3,2 /* Align buffer to longword boundary */ + beq+ 1f + lhz r5,4(r3) /* do 2 bytes to get aligned */ + addi r3,r3,2 + subi r4,r4,2 + addc r0,r0,r5 + srwi. r6,r4,2 /* # words to do */ + beq 3f +1: mtctr r6 +2: lwzu r5,4(r3) /* the bdnz has zero overhead, so it should */ + adde r0,r0,r5 /* be unnecessary to unroll this loop */ + bdnz 2b + andi. r4,r4,3 +3: cmpi 0,r4,2 + blt+ 4f + lhz r5,4(r3) + addi r3,r3,2 + subi r4,r4,2 + adde r0,r0,r5 +4: cmpi 0,r4,1 + bne+ 5f + lbz r5,4(r3) + slwi r5,r5,8 /* Upper byte of word */ + adde r0,r0,r5 +5: addze r3,r0 /* add in final carry */ + blr + +/* + * Computes the checksum of a memory block at src, length len, + * and adds in "sum" (32-bit), while copying the block to dst. + * If an access exception occurs on src or dst, it stores -EFAULT + * to *src_err or *dst_err respectively, and (for an error on + * src) zeroes the rest of dst. + * + * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err) + */ +_GLOBAL(csum_partial_copy_generic) + addic r0,r6,0 + subi r3,r3,4 + subi r4,r4,4 + srwi. r6,r5,2 + beq 3f /* if we're doing < 4 bytes */ + andi. 
r9,r4,2 /* Align dst to longword boundary */ + beq+ 1f +81: lhz r6,4(r3) /* do 2 bytes to get aligned */ + addi r3,r3,2 + subi r5,r5,2 +91: sth r6,4(r4) + addi r4,r4,2 + addc r0,r0,r6 + srwi. r6,r5,2 /* # words to do */ + beq 3f +1: mtctr r6 +82: lwzu r6,4(r3) /* the bdnz has zero overhead, so it should */ +92: stwu r6,4(r4) /* be unnecessary to unroll this loop */ + adde r0,r0,r6 + bdnz 82b + andi. r5,r5,3 +3: cmpi 0,r5,2 + blt+ 4f +83: lhz r6,4(r3) + addi r3,r3,2 + subi r5,r5,2 +93: sth r6,4(r4) + addi r4,r4,2 + adde r0,r0,r6 +4: cmpi 0,r5,1 + bne+ 5f +84: lbz r6,4(r3) +94: stb r6,4(r4) + slwi r6,r6,8 /* Upper byte of word */ + adde r0,r0,r6 +5: addze r3,r0 /* add in final carry */ + blr + +/* These shouldn't go in the fixup section, since that would + cause the ex_table addresses to get out of order. */ + +src_error_1: + li r6,0 + subi r5,r5,2 +95: sth r6,4(r4) + addi r4,r4,2 + srwi. r6,r5,2 + beq 3f + mtctr r6 +src_error_2: + li r6,0 +96: stwu r6,4(r4) + bdnz 96b +3: andi. r5,r5,3 + beq src_error +src_error_3: + li r6,0 + mtctr r5 + addi r4,r4,3 +97: stbu r6,1(r4) + bdnz 97b +src_error: + cmpi 0,r7,0 + beq 1f + li r6,-EFAULT + stw r6,0(r7) +1: addze r3,r0 + blr + +dst_error: + cmpi 0,r8,0 + beq 1f + li r6,-EFAULT + stw r6,0(r8) +1: addze r3,r0 + blr + +.section __ex_table,"a" + .long 81b,src_error_1 + .long 91b,dst_error + .long 82b,src_error_2 + .long 92b,dst_error + .long 83b,src_error_3 + .long 93b,dst_error + .long 84b,src_error_3 + .long 94b,dst_error + .long 95b,dst_error + .long 96b,dst_error + .long 97b,dst_error diff --git a/pfinet/linux-src/arch/sparc/lib/checksum.S b/pfinet/linux-src/arch/sparc/lib/checksum.S new file mode 100644 index 00000000..d02b6dfb --- /dev/null +++ b/pfinet/linux-src/arch/sparc/lib/checksum.S @@ -0,0 +1,581 @@ +/* checksum.S: Sparc optimized checksum code. + * + * Copyright(C) 1995 Linus Torvalds + * Copyright(C) 1995 Miguel de Icaza + * Copyright(C) 1996 David S. Miller + * Copyright(C) 1997 Jakub Jelinek + * + * derived from: + * Linux/Alpha checksum c-code + * Linux/ix86 inline checksum assembly + * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code) + * David Mosberger-Tang for optimized reference c-code + * BSD4.4 portable checksum routine + */ + +#include <asm/cprefix.h> +#include <asm/errno.h> + +#define CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5) \ + ldd [buf + offset + 0x00], t0; \ + ldd [buf + offset + 0x08], t2; \ + addxcc t0, sum, sum; \ + addxcc t1, sum, sum; \ + ldd [buf + offset + 0x10], t4; \ + addxcc t2, sum, sum; \ + addxcc t3, sum, sum; \ + ldd [buf + offset + 0x18], t0; \ + addxcc t4, sum, sum; \ + addxcc t5, sum, sum; \ + addxcc t0, sum, sum; \ + addxcc t1, sum, sum; + +#define CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3) \ + ldd [buf - offset - 0x08], t0; \ + ldd [buf - offset - 0x00], t2; \ + addxcc t0, sum, sum; \ + addxcc t1, sum, sum; \ + addxcc t2, sum, sum; \ + addxcc t3, sum, sum; + + /* Do end cruft out of band to get better cache patterns. */ +csum_partial_end_cruft: + be 1f ! caller asks %o1 & 0x8 + andcc %o1, 4, %g0 ! nope, check for word remaining + ldd [%o0], %g2 ! load two + addcc %g2, %o2, %o2 ! add first word to sum + addxcc %g3, %o2, %o2 ! add second word as well + add %o0, 8, %o0 ! advance buf ptr + addx %g0, %o2, %o2 ! add in final carry + andcc %o1, 4, %g0 ! check again for word remaining +1: be 1f ! nope, skip this code + andcc %o1, 3, %o1 ! check for trailing bytes + ld [%o0], %g2 ! load it + addcc %g2, %o2, %o2 ! add to sum + add %o0, 4, %o0 ! 
advance buf ptr + addx %g0, %o2, %o2 ! add in final carry + andcc %o1, 3, %g0 ! check again for trailing bytes +1: be 1f ! no trailing bytes, return + addcc %o1, -1, %g0 ! only one byte remains? + bne 2f ! at least two bytes more + subcc %o1, 2, %o1 ! only two bytes more? + b 4f ! only one byte remains + or %g0, %g0, %o4 ! clear fake hword value +2: lduh [%o0], %o4 ! get hword + be 6f ! jmp if only hword remains + add %o0, 2, %o0 ! advance buf ptr either way + sll %o4, 16, %o4 ! create upper hword +4: ldub [%o0], %o5 ! get final byte + sll %o5, 8, %o5 ! put into place + or %o5, %o4, %o4 ! coalese with hword (if any) +6: addcc %o4, %o2, %o2 ! add to sum +1: retl ! get outta here + addx %g0, %o2, %o0 ! add final carry into retval + + /* Also do alignment out of band to get better cache patterns. */ +csum_partial_fix_alignment: + cmp %o1, 6 + bl cpte - 0x4 + andcc %o0, 0x2, %g0 + be 1f + andcc %o0, 0x4, %g0 + lduh [%o0 + 0x00], %g2 + sub %o1, 2, %o1 + add %o0, 2, %o0 + sll %g2, 16, %g2 + addcc %g2, %o2, %o2 + srl %o2, 16, %g3 + addx %g0, %g3, %g2 + sll %o2, 16, %o2 + sll %g2, 16, %g3 + srl %o2, 16, %o2 + andcc %o0, 0x4, %g0 + or %g3, %o2, %o2 +1: be cpa + andcc %o1, 0xffffff80, %o3 + ld [%o0 + 0x00], %g2 + sub %o1, 4, %o1 + addcc %g2, %o2, %o2 + add %o0, 4, %o0 + addx %g0, %o2, %o2 + b cpa + andcc %o1, 0xffffff80, %o3 + + /* The common case is to get called with a nicely aligned + * buffer of size 0x20. Follow the code path for that case. + */ + .globl C_LABEL(csum_partial) +C_LABEL(csum_partial): /* %o0=buf, %o1=len, %o2=sum */ + andcc %o0, 0x7, %g0 ! alignment problems? + bne csum_partial_fix_alignment ! yep, handle it + sethi %hi(cpte - 8), %g7 ! prepare table jmp ptr + andcc %o1, 0xffffff80, %o3 ! num loop iterations +cpa: be 3f ! none to do + andcc %o1, 0x70, %g1 ! clears carry flag too +5: CSUM_BIGCHUNK(%o0, 0x00, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + CSUM_BIGCHUNK(%o0, 0x20, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + CSUM_BIGCHUNK(%o0, 0x40, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + CSUM_BIGCHUNK(%o0, 0x60, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + addx %g0, %o2, %o2 ! sink in final carry + subcc %o3, 128, %o3 ! detract from loop iters + bne 5b ! more to do + add %o0, 128, %o0 ! advance buf ptr + andcc %o1, 0x70, %g1 ! clears carry flag too +3: be cpte ! nope + andcc %o1, 0xf, %g0 ! anything left at all? + srl %g1, 1, %o4 ! compute offset + sub %g7, %g1, %g7 ! adjust jmp ptr + sub %g7, %o4, %g7 ! final jmp ptr adjust + jmp %g7 + %lo(cpte - 8) ! enter the table + add %o0, %g1, %o0 ! advance buf ptr +cptbl: CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3, %g4, %g5) + addx %g0, %o2, %o2 ! fetch final carry + andcc %o1, 0xf, %g0 ! anything left at all? +cpte: bne csum_partial_end_cruft ! yep, handle it + andcc %o1, 8, %g0 ! check how much +cpout: retl ! get outta here + mov %o2, %o0 ! 
return computed csum + + .globl C_LABEL(__csum_partial_copy_start), C_LABEL(__csum_partial_copy_end) +C_LABEL(__csum_partial_copy_start): + +#define EX(x,y,a,b,z) \ +98: x,y; \ + .section .fixup,z##alloc,z##execinstr; \ + .align 4; \ +99: ba 30f; \ + a, b, %o3; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4 + +#define EX2(x,y,z) \ +98: x,y; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 30f; \ + .text; \ + .align 4 + +#define EX3(x,y,z) \ +98: x,y; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 96f; \ + .text; \ + .align 4 + +#define EXT(start,end,handler,z) \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word start, 0, end, handler; \ + .text; \ + .align 4 + + /* This aligned version executes typically in 8.5 superscalar cycles, this + * is the best I can do. I say 8.5 because the final add will pair with + * the next ldd in the main unrolled loop. Thus the pipe is always full. + * If you change these macros (including order of instructions), + * please check the fixup code below as well. + */ +#define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [src + off + 0x00], t0; \ + ldd [src + off + 0x08], t2; \ + addxcc t0, sum, sum; \ + ldd [src + off + 0x10], t4; \ + addxcc t1, sum, sum; \ + ldd [src + off + 0x18], t6; \ + addxcc t2, sum, sum; \ + std t0, [dst + off + 0x00]; \ + addxcc t3, sum, sum; \ + std t2, [dst + off + 0x08]; \ + addxcc t4, sum, sum; \ + std t4, [dst + off + 0x10]; \ + addxcc t5, sum, sum; \ + std t6, [dst + off + 0x18]; \ + addxcc t6, sum, sum; \ + addxcc t7, sum, sum; + + /* 12 superscalar cycles seems to be the limit for this case, + * because of this we thus do all the ldd's together to get + * Viking MXCC into streaming mode. Ho hum... + */ +#define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [src + off + 0x00], t0; \ + ldd [src + off + 0x08], t2; \ + ldd [src + off + 0x10], t4; \ + ldd [src + off + 0x18], t6; \ + st t0, [dst + off + 0x00]; \ + addxcc t0, sum, sum; \ + st t1, [dst + off + 0x04]; \ + addxcc t1, sum, sum; \ + st t2, [dst + off + 0x08]; \ + addxcc t2, sum, sum; \ + st t3, [dst + off + 0x0c]; \ + addxcc t3, sum, sum; \ + st t4, [dst + off + 0x10]; \ + addxcc t4, sum, sum; \ + st t5, [dst + off + 0x14]; \ + addxcc t5, sum, sum; \ + st t6, [dst + off + 0x18]; \ + addxcc t6, sum, sum; \ + st t7, [dst + off + 0x1c]; \ + addxcc t7, sum, sum; + + /* Yuck, 6 superscalar cycles... */ +#define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3) \ + ldd [src - off - 0x08], t0; \ + ldd [src - off - 0x00], t2; \ + addxcc t0, sum, sum; \ + st t0, [dst - off - 0x08]; \ + addxcc t1, sum, sum; \ + st t1, [dst - off - 0x04]; \ + addxcc t2, sum, sum; \ + st t2, [dst - off - 0x00]; \ + addxcc t3, sum, sum; \ + st t3, [dst - off + 0x04]; + + /* Handle the end cruft code out of band for better cache patterns. 
*/ +cc_end_cruft: + be 1f + andcc %o3, 4, %g0 + EX(ldd [%o0 + 0x00], %g2, and %o3, 0xf,#) + add %o1, 8, %o1 + addcc %g2, %g7, %g7 + add %o0, 8, %o0 + addxcc %g3, %g7, %g7 + EX2(st %g2, [%o1 - 0x08],#) + addx %g0, %g7, %g7 + andcc %o3, 4, %g0 + EX2(st %g3, [%o1 - 0x04],#) +1: be 1f + andcc %o3, 3, %o3 + EX(ld [%o0 + 0x00], %g2, add %o3, 4,#) + add %o1, 4, %o1 + addcc %g2, %g7, %g7 + EX2(st %g2, [%o1 - 0x04],#) + addx %g0, %g7, %g7 + andcc %o3, 3, %g0 + add %o0, 4, %o0 +1: be 1f + addcc %o3, -1, %g0 + bne 2f + subcc %o3, 2, %o3 + b 4f + or %g0, %g0, %o4 +2: EX(lduh [%o0 + 0x00], %o4, add %o3, 2,#) + add %o0, 2, %o0 + EX2(sth %o4, [%o1 + 0x00],#) + be 6f + add %o1, 2, %o1 + sll %o4, 16, %o4 +4: EX(ldub [%o0 + 0x00], %o5, add %g0, 1,#) + EX2(stb %o5, [%o1 + 0x00],#) + sll %o5, 8, %o5 + or %o5, %o4, %o4 +6: addcc %o4, %g7, %g7 +1: retl + addx %g0, %g7, %o0 + + /* Also, handle the alignment code out of band. */ +cc_dword_align: + cmp %g1, 6 + bl,a ccte + andcc %g1, 0xf, %o3 + andcc %o0, 0x1, %g0 + bne ccslow + andcc %o0, 0x2, %g0 + be 1f + andcc %o0, 0x4, %g0 + EX(lduh [%o0 + 0x00], %g4, add %g1, 0,#) + sub %g1, 2, %g1 + EX2(sth %g4, [%o1 + 0x00],#) + add %o0, 2, %o0 + sll %g4, 16, %g4 + addcc %g4, %g7, %g7 + add %o1, 2, %o1 + srl %g7, 16, %g3 + addx %g0, %g3, %g4 + sll %g7, 16, %g7 + sll %g4, 16, %g3 + srl %g7, 16, %g7 + andcc %o0, 0x4, %g0 + or %g3, %g7, %g7 +1: be 3f + andcc %g1, 0xffffff80, %g0 + EX(ld [%o0 + 0x00], %g4, add %g1, 0,#) + sub %g1, 4, %g1 + EX2(st %g4, [%o1 + 0x00],#) + add %o0, 4, %o0 + addcc %g4, %g7, %g7 + add %o1, 4, %o1 + addx %g0, %g7, %g7 + b 3f + andcc %g1, 0xffffff80, %g0 + + /* Sun, you just can't beat me, you just can't. Stop trying, + * give up. I'm serious, I am going to kick the living shit + * out of you, game over, lights out. + */ + .align 8 + .globl C_LABEL(__csum_partial_copy_sparc_generic) +C_LABEL(__csum_partial_copy_sparc_generic): + /* %o0=src, %o1=dest, %g1=len, %g7=sum */ + xor %o0, %o1, %o4 ! get changing bits + andcc %o4, 3, %g0 ! check for mismatched alignment + bne ccslow ! better this than unaligned/fixups + andcc %o0, 7, %g0 ! need to align things? + bne cc_dword_align ! yes, we check for short lengths there + andcc %g1, 0xffffff80, %g0 ! can we use unrolled loop? +3: be 3f ! nope, less than one loop remains + andcc %o1, 4, %g0 ! dest aligned on 4 or 8 byte boundry? + be ccdbl + 4 ! 8 byte aligned, kick ass +5: CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) +10: EXT(5b, 10b, 20f,#) ! note for exception handling + sub %g1, 128, %g1 ! detract from length + addx %g0, %g7, %g7 ! add in last carry bit + andcc %g1, 0xffffff80, %g0 ! more to csum? + add %o0, 128, %o0 ! advance src ptr + bne 5b ! we did not go negative, continue looping + add %o1, 128, %o1 ! advance dest ptr +3: andcc %g1, 0x70, %o2 ! can use table? +ccmerge:be ccte ! nope, go and check for end cruft + andcc %g1, 0xf, %o3 ! get low bits of length (clears carry btw) + srl %o2, 1, %o4 ! begin negative offset computation + sethi %hi(12f), %o5 ! set up table ptr end + add %o0, %o2, %o0 ! advance src ptr + sub %o5, %o4, %o5 ! continue table calculation + sll %o2, 1, %g2 ! constant multiplies are fun... + sub %o5, %g2, %o5 ! some more adjustments + jmp %o5 + %lo(12f) ! jump into it, duff style, wheee... + add %o1, %o2, %o1 ! 
advance dest ptr (carry is clear btw) +cctbl: CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5) +12: EXT(cctbl, 12b, 22f,#) ! note for exception table handling + addx %g0, %g7, %g7 + andcc %o3, 0xf, %g0 ! check for low bits set +ccte: bne cc_end_cruft ! something left, handle it out of band + andcc %o3, 8, %g0 ! begin checks for that code + retl ! return + mov %g7, %o0 ! give em the computed checksum +ccdbl: CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) +11: EXT(ccdbl, 11b, 21f,#) ! note for exception table handling + sub %g1, 128, %g1 ! detract from length + addx %g0, %g7, %g7 ! add in last carry bit + andcc %g1, 0xffffff80, %g0 ! more to csum? + add %o0, 128, %o0 ! advance src ptr + bne ccdbl ! we did not go negative, continue looping + add %o1, 128, %o1 ! advance dest ptr + b ccmerge ! finish it off, above + andcc %g1, 0x70, %o2 ! can use table? (clears carry btw) + +ccslow: cmp %g1, 0 + mov 0, %g5 + bleu 4f + andcc %o0, 1, %o5 + be,a 1f + srl %g1, 1, %g4 + sub %g1, 1, %g1 + EX(ldub [%o0], %g5, add %g1, 1,#) + add %o0, 1, %o0 + EX2(stb %g5, [%o1],#) + srl %g1, 1, %g4 + add %o1, 1, %o1 +1: cmp %g4, 0 + be,a 3f + andcc %g1, 1, %g0 + andcc %o0, 2, %g0 + be,a 1f + srl %g4, 1, %g4 + EX(lduh [%o0], %o4, add %g1, 0,#) + sub %g1, 2, %g1 + srl %o4, 8, %g2 + sub %g4, 1, %g4 + EX2(stb %g2, [%o1],#) + add %o4, %g5, %g5 + EX2(stb %o4, [%o1 + 1],#) + add %o0, 2, %o0 + srl %g4, 1, %g4 + add %o1, 2, %o1 +1: cmp %g4, 0 + be,a 2f + andcc %g1, 2, %g0 + EX3(ld [%o0], %o4,#) +5: srl %o4, 24, %g2 + srl %o4, 16, %g3 + EX2(stb %g2, [%o1],#) + srl %o4, 8, %g2 + EX2(stb %g3, [%o1 + 1],#) + add %o0, 4, %o0 + EX2(stb %g2, [%o1 + 2],#) + addcc %o4, %g5, %g5 + EX2(stb %o4, [%o1 + 3],#) + addx %g5, %g0, %g5 ! I am now to lazy to optimize this (question it + add %o1, 4, %o1 ! is worthy). Maybe some day - with the sll/srl + subcc %g4, 1, %g4 ! tricks + bne,a 5b + EX3(ld [%o0], %o4,#) + sll %g5, 16, %g2 + srl %g5, 16, %g5 + srl %g2, 16, %g2 + andcc %g1, 2, %g0 + add %g2, %g5, %g5 +2: be,a 3f + andcc %g1, 1, %g0 + EX(lduh [%o0], %o4, and %g1, 3,#) + andcc %g1, 1, %g0 + srl %o4, 8, %g2 + add %o0, 2, %o0 + EX2(stb %g2, [%o1],#) + add %g5, %o4, %g5 + EX2(stb %o4, [%o1 + 1],#) + add %o1, 2, %o1 +3: be,a 1f + sll %g5, 16, %o4 + EX(ldub [%o0], %g2, add %g0, 1,#) + sll %g2, 8, %o4 + EX2(stb %g2, [%o1],#) + add %g5, %o4, %g5 + sll %g5, 16, %o4 +1: addcc %o4, %g5, %g5 + srl %g5, 16, %o4 + addx %g0, %o4, %g5 + orcc %o5, %g0, %g0 + be 4f + srl %g5, 8, %o4 + and %g5, 0xff, %g2 + and %o4, 0xff, %o4 + sll %g2, 8, %g2 + or %g2, %o4, %g5 +4: addcc %g7, %g5, %g7 + retl + addx %g0, %g7, %o0 +C_LABEL(__csum_partial_copy_end): + +/* We do these strange calculations for the csum_*_from_user case only, ie. + * we only bother with faults on loads... 
*/ + +/* o2 = ((g2%20)&3)*8 + * o3 = g1 - (g2/20)*32 - o2 */ +20: + cmp %g2, 20 + blu,a 1f + and %g2, 3, %o2 + sub %g1, 32, %g1 + b 20b + sub %g2, 20, %g2 +1: + sll %o2, 3, %o2 + b 31f + sub %g1, %o2, %o3 + +/* o2 = (!(g2 & 15) ? 0 : (((g2 & 15) + 1) & ~1)*8) + * o3 = g1 - (g2/16)*32 - o2 */ +21: + andcc %g2, 15, %o3 + srl %g2, 4, %g2 + be,a 1f + clr %o2 + add %o3, 1, %o3 + and %o3, 14, %o3 + sll %o3, 3, %o2 +1: + sll %g2, 5, %g2 + sub %g1, %g2, %o3 + b 31f + sub %o3, %o2, %o3 + +/* o0 += (g2/10)*16 - 0x70 + * 01 += (g2/10)*16 - 0x70 + * o2 = (g2 % 10) ? 8 : 0 + * o3 += 0x70 - (g2/10)*16 - o2 */ +22: + cmp %g2, 10 + blu,a 1f + sub %o0, 0x70, %o0 + add %o0, 16, %o0 + add %o1, 16, %o1 + sub %o3, 16, %o3 + b 22b + sub %g2, 10, %g2 +1: + sub %o1, 0x70, %o1 + add %o3, 0x70, %o3 + clr %o2 + tst %g2 + bne,a 1f + mov 8, %o2 +1: + b 31f + sub %o3, %o2, %o3 +96: + and %g1, 3, %g1 + sll %g4, 2, %g4 + add %g1, %g4, %o3 +30: +/* %o1 is dst + * %o3 is # bytes to zero out + * %o4 is faulting address + * %o5 is %pc where fault occurred */ + clr %o2 +31: +/* %o0 is src + * %o1 is dst + * %o2 is # of bytes to copy from src to dst + * %o3 is # bytes to zero out + * %o4 is faulting address + * %o5 is %pc where fault occurred */ + save %sp, -104, %sp + mov %i5, %o0 + mov %i7, %o1 + mov %i4, %o2 + call C_LABEL(lookup_fault) + mov %g7, %i4 + cmp %o0, 2 + bne 1f + add %g0, -EFAULT, %i5 + tst %i2 + be 2f + mov %i0, %o1 + mov %i1, %o0 +5: + call C_LABEL(__memcpy) + mov %i2, %o2 + tst %o0 + bne,a 2f + add %i3, %i2, %i3 + add %i1, %i2, %i1 +2: + mov %i1, %o0 +6: + call C_LABEL(__bzero) + mov %i3, %o1 +1: + ld [%sp + 168], %o2 ! struct_ptr of parent + st %i5, [%o2] + ret + restore + + .section __ex_table,#alloc + .align 4 + .word 5b,2 + .word 6b,2 diff --git a/pfinet/linux-src/arch/sparc64/lib/checksum.S b/pfinet/linux-src/arch/sparc64/lib/checksum.S new file mode 100644 index 00000000..ea732b36 --- /dev/null +++ b/pfinet/linux-src/arch/sparc64/lib/checksum.S @@ -0,0 +1,278 @@ +/* checksum.S: Sparc V9 optimized checksum code. + * + * Copyright(C) 1995 Linus Torvalds + * Copyright(C) 1995 Miguel de Icaza + * Copyright(C) 1996 David S. Miller + * Copyright(C) 1997 Jakub Jelinek + * + * derived from: + * Linux/Alpha checksum c-code + * Linux/ix86 inline checksum assembly + * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code) + * David Mosberger-Tang for optimized reference c-code + * BSD4.4 portable checksum routine + */ + +#include <asm/errno.h> +#include <asm/head.h> +#include <asm/ptrace.h> +#include <asm/asi.h> +#include <asm/page.h> + + /* The problem with the "add with carry" instructions on Ultra + * are two fold. Firstly, they cannot pair with jack shit, + * and also they only add in the 32-bit carry condition bit + * into the accumulated sum. The following is much better. + * For larger chunks we use VIS code, which is faster ;) + */ + +#define src o0 +#define dst o1 +#define len o2 +#define sum o3 + + .text + /* I think I have an erection... Once _AGAIN_ the SunSoft + * engineers are caught asleep at the keyboard, tsk tsk... 
+ */ + +#define CSUMCOPY_LASTCHUNK(off, t0, t1) \ + ldxa [%src - off - 0x08] %asi, t0; \ + ldxa [%src - off - 0x00] %asi, t1; \ + nop; nop; \ + addcc t0, %sum, %sum; \ + stw t0, [%dst - off - 0x04]; \ + srlx t0, 32, t0; \ + bcc,pt %xcc, 51f; \ + stw t0, [%dst - off - 0x08]; \ + add %sum, 1, %sum; \ +51: addcc t1, %sum, %sum; \ + stw t1, [%dst - off + 0x04]; \ + srlx t1, 32, t1; \ + bcc,pt %xcc, 52f; \ + stw t1, [%dst - off - 0x00]; \ + add %sum, 1, %sum; \ +52: + +cpc_start: +cc_end_cruft: + andcc %g7, 8, %g0 ! IEU1 Group + be,pn %icc, 1f ! CTI + and %g7, 4, %g5 ! IEU0 + ldxa [%src + 0x00] %asi, %g2 ! Load Group + add %dst, 8, %dst ! IEU0 + add %src, 8, %src ! IEU1 + addcc %g2, %sum, %sum ! IEU1 Group + 2 bubbles + stw %g2, [%dst - 0x04] ! Store + srlx %g2, 32, %g2 ! IEU0 + bcc,pt %xcc, 1f ! CTI Group + stw %g2, [%dst - 0x08] ! Store + add %sum, 1, %sum ! IEU0 +1: brz,pt %g5, 1f ! CTI Group + clr %g2 ! IEU0 + lduwa [%src + 0x00] %asi, %g2 ! Load + add %dst, 4, %dst ! IEU0 Group + add %src, 4, %src ! IEU1 + stw %g2, [%dst - 0x04] ! Store Group + 2 bubbles + sllx %g2, 32, %g2 ! IEU0 +1: andcc %g7, 2, %g0 ! IEU1 + be,pn %icc, 1f ! CTI Group + clr %o4 ! IEU1 + lduha [%src + 0x00] %asi, %o4 ! Load + add %src, 2, %src ! IEU0 Group + add %dst, 2, %dst ! IEU1 + sth %o4, [%dst - 0x2] ! Store Group + 2 bubbles + sll %o4, 16, %o4 ! IEU0 +1: andcc %g7, 1, %g0 ! IEU1 + be,pn %icc, 1f ! CTI Group + clr %o5 ! IEU0 + lduba [%src + 0x00] %asi, %o5 ! Load + stb %o5, [%dst + 0x00] ! Store Group + 2 bubbles + sll %o5, 8, %o5 ! IEU0 +1: or %g2, %o4, %o4 ! IEU1 + or %o5, %o4, %o4 ! IEU0 Group + addcc %o4, %sum, %sum ! IEU1 + bcc,pt %xcc, ccfold ! CTI + sethi %uhi(PAGE_OFFSET), %g4 ! IEU0 Group + b,pt %xcc, ccfold ! CTI + add %sum, 1, %sum ! IEU1 + +cc_fixit: + cmp %len, 6 ! IEU1 Group + bl,a,pn %icc, ccte ! CTI + andcc %len, 0xf, %g7 ! IEU1 Group + andcc %src, 2, %g0 ! IEU1 Group + be,pn %icc, 1f ! CTI + andcc %src, 0x4, %g0 ! IEU1 Group + lduha [%src + 0x00] %asi, %g4 ! Load + sub %len, 2, %len ! IEU0 + add %src, 2, %src ! IEU0 Group + add %dst, 2, %dst ! IEU1 + sll %g4, 16, %g3 ! IEU0 Group + 1 bubble + addcc %g3, %sum, %sum ! IEU1 + bcc,pt %xcc, 0f ! CTI + srl %sum, 16, %g3 ! IEU0 Group + add %g3, 1, %g3 ! IEU0 4 clocks (mispredict) +0: andcc %src, 0x4, %g0 ! IEU1 Group + sth %g4, [%dst - 0x2] ! Store + sll %sum, 16, %sum ! IEU0 + sll %g3, 16, %g3 ! IEU0 Group + srl %sum, 16, %sum ! IEU0 Group + or %g3, %sum, %sum ! IEU0 Group (regdep) +1: be,pt %icc, ccmerge ! CTI + andcc %len, 0xf0, %g1 ! IEU1 + lduwa [%src + 0x00] %asi, %g4 ! Load Group + sub %len, 4, %len ! IEU0 + add %src, 4, %src ! IEU1 + add %dst, 4, %dst ! IEU0 Group + addcc %g4, %sum, %sum ! IEU1 Group + 1 bubble + stw %g4, [%dst - 0x4] ! Store + bcc,pt %xcc, ccmerge ! CTI + andcc %len, 0xf0, %g1 ! IEU1 Group + b,pt %xcc, ccmerge ! CTI 4 clocks (mispredict) + add %sum, 1, %sum ! IEU0 + + .align 32 + .globl csum_partial_copy_sparc64 +csum_partial_copy_sparc64: /* %o0=src, %o1=dest, %o2=len, %o3=sum */ + xorcc %src, %dst, %o4 ! IEU1 Group + srl %sum, 0, %sum ! IEU0 + andcc %o4, 3, %g0 ! IEU1 Group + srl %len, 0, %len ! IEU0 + bne,pn %icc, ccslow ! CTI + andcc %src, 1, %g0 ! IEU1 Group + bne,pn %icc, ccslow ! CTI + cmp %len, 256 ! IEU1 Group + bgeu,pt %icc, csum_partial_copy_vis ! CTI + andcc %src, 7, %g0 ! IEU1 Group + bne,pn %icc, cc_fixit ! CTI + andcc %len, 0xf0, %g1 ! IEU1 Group +ccmerge:be,pn %icc, ccte ! CTI + andcc %len, 0xf, %g7 ! IEU1 Group + sll %g1, 2, %o4 ! IEU0 +13: sethi %hi(12f), %o5 ! IEU0 Group + add %src, %g1, %src ! 
IEU1 + sub %o5, %o4, %o5 ! IEU0 Group + jmpl %o5 + %lo(12f), %g0 ! CTI Group brk forced + add %dst, %g1, %dst ! IEU0 Group +cctbl: CSUMCOPY_LASTCHUNK(0xe8,%g2,%g3) + CSUMCOPY_LASTCHUNK(0xd8,%g2,%g3) + CSUMCOPY_LASTCHUNK(0xc8,%g2,%g3) + CSUMCOPY_LASTCHUNK(0xb8,%g2,%g3) + CSUMCOPY_LASTCHUNK(0xa8,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x98,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x88,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x78,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x68,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x58,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x48,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x38,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x28,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x18,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x08,%g2,%g3) +12: + andcc %len, 0xf, %g7 ! IEU1 Group +ccte: bne,pn %icc, cc_end_cruft ! CTI + sethi %uhi(PAGE_OFFSET), %g4 ! IEU0 +ccfold: sllx %sum, 32, %o0 ! IEU0 Group + addcc %sum, %o0, %o0 ! IEU1 Group (regdep) + srlx %o0, 32, %o0 ! IEU0 Group (regdep) + bcs,a,pn %xcc, 1f ! CTI + add %o0, 1, %o0 ! IEU1 4 clocks (mispredict) +1: retl ! CTI Group brk forced + sllx %g4, 32, %g4 ! IEU0 Group + +ccslow: mov 0, %g5 + brlez,pn %len, 4f + andcc %src, 1, %o5 + be,a,pt %icc, 1f + srl %len, 1, %g7 + sub %len, 1, %len + lduba [%src] %asi, %g5 + add %src, 1, %src + stb %g5, [%dst] + srl %len, 1, %g7 + add %dst, 1, %dst +1: brz,a,pn %g7, 3f + andcc %len, 1, %g0 + andcc %src, 2, %g0 + be,a,pt %icc, 1f + srl %g7, 1, %g7 + lduha [%src] %asi, %o4 + sub %len, 2, %len + srl %o4, 8, %g2 + sub %g7, 1, %g7 + stb %g2, [%dst] + add %o4, %g5, %g5 + stb %o4, [%dst + 1] + add %src, 2, %src + srl %g7, 1, %g7 + add %dst, 2, %dst +1: brz,a,pn %g7, 2f + andcc %len, 2, %g0 + lduwa [%src] %asi, %o4 +5: srl %o4, 24, %g2 + srl %o4, 16, %g3 + stb %g2, [%dst] + srl %o4, 8, %g2 + stb %g3, [%dst + 1] + add %src, 4, %src + stb %g2, [%dst + 2] + addcc %o4, %g5, %g5 + stb %o4, [%dst + 3] + addc %g5, %g0, %g5 + add %dst, 4, %dst + subcc %g7, 1, %g7 + bne,a,pt %icc, 5b + lduwa [%src] %asi, %o4 + sll %g5, 16, %g2 + srl %g5, 16, %g5 + srl %g2, 16, %g2 + andcc %len, 2, %g0 + add %g2, %g5, %g5 +2: be,a,pt %icc, 3f + andcc %len, 1, %g0 + lduha [%src] %asi, %o4 + andcc %len, 1, %g0 + srl %o4, 8, %g2 + add %src, 2, %src + stb %g2, [%dst] + add %g5, %o4, %g5 + stb %o4, [%dst + 1] + add %dst, 2, %dst +3: be,a,pt %icc, 1f + sll %g5, 16, %o4 + lduba [%src] %asi, %g2 + sll %g2, 8, %o4 + stb %g2, [%dst] + add %g5, %o4, %g5 + sll %g5, 16, %o4 +1: addcc %o4, %g5, %g5 + srl %g5, 16, %o4 + addc %g0, %o4, %g5 + brz,pt %o5, 4f + srl %g5, 8, %o4 + and %g5, 0xff, %g2 + and %o4, 0xff, %o4 + sll %g2, 8, %g2 + or %g2, %o4, %g5 +4: addcc %sum, %g5, %sum + addc %g0, %sum, %o0 + retl + srl %o0, 0, %o0 +cpc_end: + + .globl cpc_handler +cpc_handler: + ldx [%sp + 0x7ff + 128], %g1 + sub %g0, EFAULT, %g2 + brnz,a,pt %g1, 1f + st %g2, [%g1] +1: sethi %uhi(PAGE_OFFSET), %g4 + retl + sllx %g4, 32, %g4 + + .section __ex_table + .align 4 + .word cpc_start, 0, cpc_end, cpc_handler + |
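Both SPARC variants above dispose of the short tail the same way: the 0x70 bits of the remaining length (0xf0 on sparc64, whose table simply has 15 entries instead of 7) say how many 16-byte CSUMCOPY_LASTCHUNK expansions are still needed, the source and destination pointers are bumped past the whole tail, and the code computes a negative offset into the unrolled table and jumps into the middle of it ("duff style") instead of looping. The following C sketch shows the same dispatch with a fall-through switch standing in for the computed jump; copy_sum_block and copy_sum_tail are illustrative names, not routines from this tree.

    #include <stddef.h>
    #include <stdint.h>

    /* One 16-byte unit: copy four 32-bit words, add them into the running
     * sum, and advance both pointers (roughly one CSUMCOPY_LASTCHUNK). */
    static void copy_sum_block(const uint32_t **src, uint32_t **dst,
                               uint64_t *sum)
    {
        for (int i = 0; i < 4; i++) {
            uint32_t w = (*src)[i];
            (*dst)[i] = w;
            *sum += w;
        }
        *src += 4;
        *dst += 4;
    }

    /* Enter the unrolled chain part-way down and fall straight through:
     * the C analogue of "jmp %o5 + %lo(12f)". */
    static uint64_t copy_sum_tail(const uint32_t *src, uint32_t *dst,
                                  size_t len, uint64_t sum)
    {
        switch ((len & 0x70) >> 4) {              /* 0..7 blocks left */
        case 7: copy_sum_block(&src, &dst, &sum); /* fall through */
        case 6: copy_sum_block(&src, &dst, &sum); /* fall through */
        case 5: copy_sum_block(&src, &dst, &sum); /* fall through */
        case 4: copy_sum_block(&src, &dst, &sum); /* fall through */
        case 3: copy_sum_block(&src, &dst, &sum); /* fall through */
        case 2: copy_sum_block(&src, &dst, &sum); /* fall through */
        case 1: copy_sum_block(&src, &dst, &sum); /* fall through */
        case 0: break;
        }
        return sum;
    }

The assembly gets the same effect with no per-block control flow at all: each macro expansion addresses src and dst at fixed negative offsets, which is why both pointers are advanced past the entire tail before the jump is taken.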
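Whatever path the data takes, the arithmetic is the RFC 1071 Internet checksum cited in the file header: the buffer is summed in 16-bit (here 32- and 64-bit) pieces, every carry is added back into the low end (the addxcc chains on sparc, the bcc,pt / add %sum, 1 pairs on sparc64), and the accumulator is folded down; the final header checksum is the one's complement of that folded sum. A short C sketch of the fold and of a straight-line accumulation follows; csum_fold64 and ip_checksum_sketch are illustrative names assumed for this example, not anything exported by this code.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Fold a 64-bit one's-complement accumulator down to 16 bits with
     * end-around carry, the same kind of reduction the ccfold block
     * above begins. */
    static uint16_t csum_fold64(uint64_t sum)
    {
        sum = (sum & 0xffffffffULL) + (sum >> 32); /* fold 64 -> 32, may carry */
        sum = (sum & 0xffffffffULL) + (sum >> 32); /* absorb that carry */
        sum = (sum & 0xffff) + (sum >> 16);        /* fold 32 -> 16, may carry */
        sum = (sum & 0xffff) + (sum >> 16);        /* absorb that carry */
        return (uint16_t)sum;
    }

    /* Sum a buffer as network-order 16-bit words; the header checksum is
     * the one's complement of the folded total. */
    static uint16_t ip_checksum_sketch(const uint8_t *buf, size_t len)
    {
        uint64_t sum = 0;

        while (len > 1) {
            sum += (uint64_t)((buf[0] << 8) | buf[1]);
            buf += 2;
            len -= 2;
        }
        if (len)                      /* odd trailing byte, zero-padded */
            sum += (uint64_t)buf[0] << 8;
        return (uint16_t)~csum_fold64(sum);
    }

    int main(void)
    {
        /* A 20-byte IPv4-style header with its checksum field zeroed. */
        const uint8_t hdr[20] = {
            0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x40, 0x00,
            0x40, 0x01, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x01,
            0x7f, 0x00, 0x00, 0x01
        };

        printf("checksum = 0x%04x\n", ip_checksum_sketch(hdr, sizeof hdr));
        return 0;
    }

Folding twice at each width covers the case where the first fold itself carries out; the unrolled loops above handle the same situation by adding the carry bit back into the sum after every addcc.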