path: root/pfinet/linux-src/arch
author    Roland McGrath <roland@gnu.org>    2000-02-04 03:21:18 +0000
committer Roland McGrath <roland@gnu.org>    2000-02-04 03:21:18 +0000
commit    9fd51e9b0ad33a89a83fdbbb66bd20d85f7893fb (patch)
tree      8845b79f170028cb4380045c50277bbf075b5b7d /pfinet/linux-src/arch
Import of Linux 2.2.12 subset (ipv4 stack and related)
Diffstat (limited to 'pfinet/linux-src/arch')
-rw-r--r--  pfinet/linux-src/arch/alpha/lib/checksum.c       169
-rw-r--r--  pfinet/linux-src/arch/arm/lib/checksum.S         730
-rw-r--r--  pfinet/linux-src/arch/i386/lib/checksum.S        447
-rw-r--r--  pfinet/linux-src/arch/i386/lib/old-checksum.c     19
-rw-r--r--  pfinet/linux-src/arch/m68k/lib/checksum.c        420
-rw-r--r--  pfinet/linux-src/arch/ppc/lib/checksum.S         194
-rw-r--r--  pfinet/linux-src/arch/sparc/lib/checksum.S       581
-rw-r--r--  pfinet/linux-src/arch/sparc64/lib/checksum.S     278
8 files changed, 2838 insertions, 0 deletions
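
All of the imported files implement the same RFC 1071 one's-complement Internet checksum; they differ only in how the inner loops are unrolled per architecture and how carries are folded back in. As a rough point of reference, here is a minimal portable C sketch of the underlying algorithm (the helper name is illustrative and not part of this import); the per-architecture code below exploits the fact that the same sum can be accumulated on wider native words and folded afterwards:

#include <stddef.h>
#include <stdint.h>

/* Portable sketch of the RFC 1071 checksum: sum 16-bit words, fold the
 * carries back in, and return the one's complement.  The imported code
 * computes the same value far faster by summing 32- or 64-bit words. */
static uint16_t rfc1071_csum(const uint8_t *buf, size_t len)
{
        uint32_t sum = 0;

        while (len > 1) {                       /* whole 16-bit words */
                sum += ((uint32_t) buf[0] << 8) | buf[1];
                buf += 2;
                len -= 2;
        }
        if (len)                                /* odd trailing byte */
                sum += (uint32_t) buf[0] << 8;
        while (sum >> 16)                       /* end-around carry */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t) ~sum;
}
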
diff --git a/pfinet/linux-src/arch/alpha/lib/checksum.c b/pfinet/linux-src/arch/alpha/lib/checksum.c
new file mode 100644
index 00000000..5165279f
--- /dev/null
+++ b/pfinet/linux-src/arch/alpha/lib/checksum.c
@@ -0,0 +1,169 @@
+/*
+ * arch/alpha/lib/checksum.c
+ *
+ * This file contains network checksum routines that are better done
+ * in an architecture-specific manner due to speed..
+ */
+
+#include <linux/string.h>
+
+#include <asm/byteorder.h>
+
+static inline unsigned short from64to16(unsigned long x)
+{
+ /* add up 32-bit words for 33 bits */
+ x = (x & 0xffffffff) + (x >> 32);
+ /* add up 16-bit and 17-bit words for 17+c bits */
+ x = (x & 0xffff) + (x >> 16);
+ /* add up 16-bit and 2-bit for 16+c bit */
+ x = (x & 0xffff) + (x >> 16);
+ /* add up carry.. */
+ x = (x & 0xffff) + (x >> 16);
+ return x;
+}
+
+/*
+ * computes the checksum of the TCP/UDP pseudo-header
+ * returns a 16-bit checksum, already complemented.
+ */
+unsigned short int csum_tcpudp_magic(unsigned long saddr,
+ unsigned long daddr,
+ unsigned short len,
+ unsigned short proto,
+ unsigned int sum)
+{
+ return ~from64to16(saddr + daddr + sum +
+ ((unsigned long) ntohs(len) << 16) +
+ ((unsigned long) proto << 8));
+}
+
+unsigned int csum_tcpudp_nofold(unsigned long saddr,
+ unsigned long daddr,
+ unsigned short len,
+ unsigned short proto,
+ unsigned int sum)
+{
+ unsigned long result;
+
+ result = (saddr + daddr + sum +
+ ((unsigned long) ntohs(len) << 16) +
+ ((unsigned long) proto << 8));
+
+ /* Fold down to 32-bits so we don't lose in the typedef-less
+ network stack. */
+ /* 64 to 33 */
+ result = (result & 0xffffffff) + (result >> 32);
+ /* 33 to 32 */
+ result = (result & 0xffffffff) + (result >> 32);
+ return result;
+}
+
+/*
+ * Do a 64-bit checksum on an arbitrary memory area..
+ *
+ * This isn't a great routine, but it's not _horrible_ either. The
+ * inner loop could be unrolled a bit further, and there are better
+ * ways to do the carry, but this is reasonable.
+ */
+static inline unsigned long do_csum(const unsigned char * buff, int len)
+{
+ int odd, count;
+ unsigned long result = 0;
+
+ if (len <= 0)
+ goto out;
+ odd = 1 & (unsigned long) buff;
+ if (odd) {
+ result = *buff << 8;
+ len--;
+ buff++;
+ }
+ count = len >> 1; /* nr of 16-bit words.. */
+ if (count) {
+ if (2 & (unsigned long) buff) {
+ result += *(unsigned short *) buff;
+ count--;
+ len -= 2;
+ buff += 2;
+ }
+ count >>= 1; /* nr of 32-bit words.. */
+ if (count) {
+ if (4 & (unsigned long) buff) {
+ result += *(unsigned int *) buff;
+ count--;
+ len -= 4;
+ buff += 4;
+ }
+ count >>= 1; /* nr of 64-bit words.. */
+ if (count) {
+ unsigned long carry = 0;
+ do {
+ unsigned long w = *(unsigned long *) buff;
+ count--;
+ buff += 8;
+ result += carry;
+ result += w;
+ carry = (w > result);
+ } while (count);
+ result += carry;
+ result = (result & 0xffffffff) + (result >> 32);
+ }
+ if (len & 4) {
+ result += *(unsigned int *) buff;
+ buff += 4;
+ }
+ }
+ if (len & 2) {
+ result += *(unsigned short *) buff;
+ buff += 2;
+ }
+ }
+ if (len & 1)
+ result += *buff;
+ result = from64to16(result);
+ if (odd)
+ result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+out:
+ return result;
+}
+
+/*
+ * This is a version of ip_compute_csum() optimized for IP headers,
+ * which always checksum on 4 octet boundaries.
+ */
+unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl)
+{
+ return ~do_csum(iph,ihl*4);
+}
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 32-bit boundary
+ */
+unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
+{
+ unsigned long result = do_csum(buff, len);
+
+ /* add in old sum, and carry.. */
+ result += sum;
+ /* 32+c bits -> 32 bits */
+ result = (result & 0xffffffff) + (result >> 32);
+ return result;
+}
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+unsigned short ip_compute_csum(unsigned char * buff, int len)
+{
+ return ~from64to16(do_csum(buff,len));
+}
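
A property worth noting about ip_compute_csum() and ip_fast_csum() above: because the checksum is the one's complement of the sum, summing a message that already contains a correct checksum field yields 0xffff, so the complemented result is zero. A verification-side sketch (the caller name is illustrative only):

/* Sketch of a receive-side caller: an ICMP-style message whose checksum
 * field is already filled in is valid iff the routine above returns zero
 * over the whole message. */
extern unsigned short ip_compute_csum(unsigned char *buff, int len);

static int checksum_is_valid(unsigned char *msg, int len)
{
        return ip_compute_csum(msg, len) == 0;
}
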
diff --git a/pfinet/linux-src/arch/arm/lib/checksum.S b/pfinet/linux-src/arch/arm/lib/checksum.S
new file mode 100644
index 00000000..bd5c78d3
--- /dev/null
+++ b/pfinet/linux-src/arch/arm/lib/checksum.S
@@ -0,0 +1,730 @@
+/*
+ * linux/arch/arm/lib/checksum.S
+ *
+ * Copyright (C) 1995, 1996, 1997, 1998 Russell King
+ */
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/errno.h>
+#include "constants.h"
+
+ .text
+
+/* Function: __u32 csum_partial(const char *src, int len, __u32)
+ * Params : r0 = buffer, r1 = len, r2 = checksum
+ * Returns : r0 = new checksum
+ */
+
+ENTRY(csum_partial)
+ tst r0, #2
+ beq 1f
+ subs r1, r1, #2
+ addmi r1, r1, #2
+ bmi 3f
+ bic r0, r0, #3
+ ldr r3, [r0], #4
+ adds r2, r2, r3, lsr #16
+ adcs r2, r2, #0
+1: adds r2, r2, #0
+ bics ip, r1, #31
+ beq 3f
+ stmfd sp!, {r4 - r6}
+2: ldmia r0!, {r3 - r6}
+ adcs r2, r2, r3
+ adcs r2, r2, r4
+ adcs r2, r2, r5
+ adcs r2, r2, r6
+ ldmia r0!, {r3 - r6}
+ adcs r2, r2, r3
+ adcs r2, r2, r4
+ adcs r2, r2, r5
+ adcs r2, r2, r6
+ sub ip, ip, #32
+ teq ip, #0
+ bne 2b
+ adcs r2, r2, #0
+ ldmfd sp!, {r4 - r6}
+3: ands ip, r1, #0x1c
+ beq 5f
+4: ldr r3, [r0], #4
+ adcs r2, r2, r3
+ sub ip, ip, #4
+ teq ip, #0
+ bne 4b
+ adcs r2, r2, #0
+5: ands ip, r1, #3
+ moveq r0, r2
+ RETINSTR(moveq,pc,lr)
+ mov ip, ip, lsl #3
+ rsb ip, ip, #32
+ ldr r3, [r0]
+ mov r3, r3, lsl ip
+ adds r2, r2, r3, lsr ip
+ adc r0, r2, #0
+ RETINSTR(mov,pc,lr)
+
+/* Function: __u32 csum_partial_copy_from_user (const char *src, char *dst, int len, __u32 sum, int *err_ptr)
+ * Params : r0 = src, r1 = dst, r2 = len, r3 = sum, [sp, #0] = &err
+ * Returns : r0 = checksum, [[sp, #0], #0] = 0 or -EFAULT
+ */
+#if defined(CONFIG_CPU_32)
+
+ .macro save_regs
+ stmfd sp!, {r1 - r2, r4 - r8, fp, ip, lr, pc}
+ .endm
+
+#define LOAD_REGS(cond) \
+ LOADREGS(##cond##ea,fp,{r1 - r2, r4 - r8, fp, sp, pc})
+
+ .macro load1b, reg1
+9999: ldrbt \reg1, [r0], $1
+ .section __ex_table, "a"
+ .align 3
+ .long 9999b, 6001f
+ .previous
+ .endm
+
+ .macro load2b, reg1, reg2
+9999: ldrbt \reg1, [r0], $1
+9998: ldrbt \reg2, [r0], $1
+ .section __ex_table, "a"
+ .long 9999b, 6001f
+ .long 9998b, 6001f
+ .previous
+ .endm
+
+ .macro load1l, reg1
+9999: ldrt \reg1, [r0], $4
+ .section __ex_table, "a"
+ .align 3
+ .long 9999b, 6001f
+ .previous
+ .endm
+
+ .macro load2l, reg1, reg2
+9999: ldrt \reg1, [r0], $4
+9998: ldrt \reg2, [r0], $4
+ .section __ex_table, "a"
+ .long 9999b, 6001f
+ .long 9998b, 6001f
+ .previous
+ .endm
+
+ .macro load4l, reg1, reg2, reg3, reg4
+9999: ldrt \reg1, [r0], $4
+9998: ldrt \reg2, [r0], $4
+9997: ldrt \reg3, [r0], $4
+9996: ldrt \reg4, [r0], $4
+ .section __ex_table, "a"
+ .long 9999b, 6001f
+ .long 9998b, 6001f
+ .long 9997b, 6001f
+ .long 9996b, 6001f
+ .previous
+ .endm
+
+#elif defined(CONFIG_CPU_26)
+
+ .macro save_regs
+ stmfd sp!, {r1 - r2, r4 - r9, fp, ip, lr, pc}
+ mov r9, sp, lsr #13
+ mov r9, r9, lsl #13
+ ldr r9, [r9, #TSK_ADDR_LIMIT]
+ mov r9, r9, lsr #24
+ .endm
+
+#define LOAD_REGS(cond) \
+ LOADREGS(##cond##ea,fp,{r1 - r2, r4 - r9, fp, sp, pc})
+
+ .macro load1b, reg1
+ tst r9, #0x01
+9999: ldreqbt \reg1, [r0], #1
+ ldrneb \reg1, [r0], #1
+ .section __ex_table, "a"
+ .align 3
+ .long 9999b, 6001f
+ .previous
+ .endm
+
+ .macro load2b, reg1, reg2
+ tst r9, #0x01
+9999: ldreqbt \reg1, [r0], #1
+ ldrneb \reg1, [r0], #1
+9998: ldreqbt \reg2, [r0], #1
+ ldrneb \reg2, [r0], #1
+ .section __ex_table, "a"
+ .long 9999b, 6001f
+ .long 9998b, 6001f
+ .previous
+ .endm
+
+ .macro load1l, reg1
+ tst r9, #0x01
+9999: ldreqt \reg1, [r0], #4
+ ldrne \reg1, [r0], #4
+ .section __ex_table, "a"
+ .align 3
+ .long 9999b, 6001f
+ .previous
+ .endm
+
+ .macro load2l, reg1, reg2
+ tst r9, #0x01
+ ldmneia r0!, {\reg1, \reg2}
+9999: ldreqt \reg1, [r0], #4
+9998: ldreqt \reg2, [r0], #4
+ .section __ex_table, "a"
+ .long 9999b, 6001f
+ .long 9998b, 6001f
+ .previous
+ .endm
+
+ .macro load4l, reg1, reg2, reg3, reg4
+ tst r9, #0x01
+ ldmneia r0!, {\reg1, \reg2, \reg3, \reg4}
+9999: ldreqt \reg1, [r0], #4
+9998: ldreqt \reg2, [r0], #4
+9997: ldreqt \reg3, [r0], #4
+9996: ldreqt \reg4, [r0], #4
+ .section __ex_table, "a"
+ .long 9999b, 6001f
+ .long 9998b, 6001f
+ .long 9997b, 6001f
+ .long 9996b, 6001f
+ .previous
+ .endm
+
+#else
+#error Unknown CPU architecture
+#endif
+
+ENTRY(csum_partial_copy_from_user)
+ mov ip, sp
+ save_regs
+ sub fp, ip, #4
+ cmp r2, #4
+ blt .too_small_user
+ tst r1, #2 @ Test destination alignment
+ beq .dst_aligned_user
+ subs r2, r2, #2 @ We do not know if SRC is aligned...
+ load2b ip, r8
+ orr ip, ip, r8, lsl #8
+ adds r3, r3, ip
+ adcs r3, r3, #0
+ strb ip, [r1], #1
+ mov ip, ip, lsr #8
+ strb ip, [r1], #1 @ Destination now aligned
+.dst_aligned_user:
+ tst r0, #3
+ bne .src_not_aligned_user
+ adds r3, r3, #0
+ bics ip, r2, #15 @ Routine for src & dst aligned
+ beq 2f
+1: load4l r4, r5, r6, r7
+ stmia r1!, {r4, r5, r6, r7}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ adcs r3, r3, r6
+ adcs r3, r3, r7
+ sub ip, ip, #16
+ teq ip, #0
+ bne 1b
+2: ands ip, r2, #12
+ beq 4f
+ tst ip, #8
+ beq 3f
+ load2l r4, r5
+ stmia r1!, {r4, r5}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ tst ip, #4
+ beq 4f
+3: load1l r4
+ str r4, [r1], #4
+ adcs r3, r3, r4
+4: ands r2, r2, #3
+ adceq r0, r3, #0
+ LOAD_REGS(eq)
+ load1l r4
+ tst r2, #2
+ beq .exit
+ adcs r3, r3, r4, lsl #16
+ strb r4, [r1], #1
+ mov r4, r4, lsr #8
+ strb r4, [r1], #1
+ mov r4, r4, lsr #8
+.exit: tst r2, #1
+ strneb r4, [r1], #1
+ andne r4, r4, #255
+ adcnes r3, r3, r4
+ adcs r0, r3, #0
+ LOAD_REGS(al)
+
+.too_small_user:
+ teq r2, #0
+ LOAD_REGS(eq)
+ cmp r2, #2
+ blt .too_small_user1
+ load2b ip, r8
+ orr ip, ip, r8, lsl #8
+ adds r3, r3, ip
+ strb ip, [r1], #1
+ strb r8, [r1], #1
+ tst r2, #1
+.too_small_user1: @ C = 0
+ beq .csum_exit
+ load1b ip
+ strb ip, [r1], #1
+ adcs r3, r3, ip
+.csum_exit: adc r0, r3, #0
+ LOAD_REGS(al)
+
+.src_not_aligned_user:
+ cmp r2, #4
+ blt .too_small_user
+ and ip, r0, #3
+ bic r0, r0, #3
+ load1l r4
+ cmp ip, #2
+ beq .src2_aligned_user
+ bhi .src3_aligned_user
+ mov r4, r4, lsr #8
+ adds r3, r3, #0
+ bics ip, r2, #15
+ beq 2f
+1: load4l r5, r6, r7, r8
+ orr r4, r4, r5, lsl #24
+ mov r5, r5, lsr #8
+ orr r5, r5, r6, lsl #24
+ mov r6, r6, lsr #8
+ orr r6, r6, r7, lsl #24
+ mov r7, r7, lsr #8
+ orr r7, r7, r8, lsl #24
+ stmia r1!, {r4, r5, r6, r7}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ adcs r3, r3, r6
+ adcs r3, r3, r7
+ mov r4, r8, lsr #8
+ sub ip, ip, #16
+ teq ip, #0
+ bne 1b
+2: ands ip, r2, #12
+ beq 4f
+ tst ip, #8
+ beq 3f
+ load2l r5, r6
+ orr r4, r4, r5, lsl #24
+ mov r5, r5, lsr #8
+ orr r5, r5, r6, lsl #24
+ stmia r1!, {r4, r5}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ mov r4, r6, lsr #8
+ tst ip, #4
+ beq 4f
+3: load1l r5
+ orr r4, r4, r5, lsl #24
+ str r4, [r1], #4
+ adcs r3, r3, r4
+ mov r4, r5, lsr #8
+4: ands r2, r2, #3
+ adceq r0, r3, #0
+ LOAD_REGS(eq)
+ tst r2, #2
+ beq .exit
+ adcs r3, r3, r4, lsl #16
+ strb r4, [r1], #1
+ mov r4, r4, lsr #8
+ strb r4, [r1], #1
+ mov r4, r4, lsr #8
+ b .exit
+
+.src2_aligned_user:
+ mov r4, r4, lsr #16
+ adds r3, r3, #0
+ bics ip, r2, #15
+ beq 2f
+1: load4l r5, r6, r7, r8
+ orr r4, r4, r5, lsl #16
+ mov r5, r5, lsr #16
+ orr r5, r5, r6, lsl #16
+ mov r6, r6, lsr #16
+ orr r6, r6, r7, lsl #16
+ mov r7, r7, lsr #16
+ orr r7, r7, r8, lsl #16
+ stmia r1!, {r4, r5, r6, r7}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ adcs r3, r3, r6
+ adcs r3, r3, r7
+ mov r4, r8, lsr #16
+ sub ip, ip, #16
+ teq ip, #0
+ bne 1b
+2: ands ip, r2, #12
+ beq 4f
+ tst ip, #8
+ beq 3f
+ load2l r5, r6
+ orr r4, r4, r5, lsl #16
+ mov r5, r5, lsr #16
+ orr r5, r5, r6, lsl #16
+ stmia r1!, {r4, r5}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ mov r4, r6, lsr #16
+ tst ip, #4
+ beq 4f
+3: load1l r5
+ orr r4, r4, r5, lsl #16
+ str r4, [r1], #4
+ adcs r3, r3, r4
+ mov r4, r5, lsr #16
+4: ands r2, r2, #3
+ adceq r0, r3, #0
+ LOAD_REGS(eq)
+ tst r2, #2
+ beq .exit
+ adcs r3, r3, r4, lsl #16
+ strb r4, [r1], #1
+ mov r4, r4, lsr #8
+ strb r4, [r1], #1
+ load1b r4
+ b .exit
+
+.src3_aligned_user:
+ mov r4, r4, lsr #24
+ adds r3, r3, #0
+ bics ip, r2, #15
+ beq 2f
+1: load4l r5, r6, r7, r8
+ orr r4, r4, r5, lsl #8
+ mov r5, r5, lsr #24
+ orr r5, r5, r6, lsl #8
+ mov r6, r6, lsr #24
+ orr r6, r6, r7, lsl #8
+ mov r7, r7, lsr #24
+ orr r7, r7, r8, lsl #8
+ stmia r1!, {r4, r5, r6, r7}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ adcs r3, r3, r6
+ adcs r3, r3, r7
+ mov r4, r8, lsr #24
+ sub ip, ip, #16
+ teq ip, #0
+ bne 1b
+2: ands ip, r2, #12
+ beq 4f
+ tst ip, #8
+ beq 3f
+ load2l r5, r6
+ orr r4, r4, r5, lsl #8
+ mov r5, r5, lsr #24
+ orr r5, r5, r6, lsl #8
+ stmia r1!, {r4, r5}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ mov r4, r6, lsr #24
+ tst ip, #4
+ beq 4f
+3: load1l r5
+ orr r4, r4, r5, lsl #8
+ str r4, [r1], #4
+ adcs r3, r3, r4
+ mov r4, r5, lsr #24
+4: ands r2, r2, #3
+ adceq r0, r3, #0
+ LOAD_REGS(eq)
+ tst r2, #2
+ beq .exit
+ adcs r3, r3, r4, lsl #16
+ strb r4, [r1], #1
+ load1l r4
+ strb r4, [r1], #1
+ adcs r3, r3, r4, lsl #24
+ mov r4, r4, lsr #8
+ b .exit
+
+#if defined(CONFIG_CPU_32)
+ .section .fixup,"ax"
+#endif
+ .align 4
+6001: mov r4, #-EFAULT
+ ldr r5, [fp, #4]
+ str r4, [r5]
+ ldmia sp, {r1, r2} @ retrieve original arguments
+ add r2, r2, r1
+ mov r3, #0 @ zero the buffer
+6002: teq r2, r1
+ strneb r3, [r1], #1
+ bne 6002b
+ LOAD_REGS(al)
+#if defined(CONFIG_CPU_32)
+ .previous
+#endif
+
+/* Function: __u32 csum_partial_copy (const char *src, char *dst, int len, __u32 sum)
+ * Params : r0 = src, r1 = dst, r2 = len, r3 = checksum
+ * Returns : r0 = new checksum
+ */
+ENTRY(csum_partial_copy_nocheck)
+ENTRY(csum_partial_copy)
+ mov ip, sp
+ stmfd sp!, {r4 - r8, fp, ip, lr, pc}
+ sub fp, ip, #4
+ cmp r2, #4
+ blt Ltoo_small
+ tst r1, #2 @ Test destination alignment
+ beq Ldst_aligned
+ subs r2, r2, #2 @ We do not know if SRC is aligned...
+ ldrb ip, [r0], #1
+ ldrb r8, [r0], #1
+ orr ip, ip, r8, lsl #8
+ adds r3, r3, ip
+ adcs r3, r3, #0
+ strb ip, [r1], #1
+ mov ip, ip, lsr #8
+ strb ip, [r1], #1 @ Destination now aligned
+Ldst_aligned: tst r0, #3
+ bne Lsrc_not_aligned
+ adds r3, r3, #0
+ bics ip, r2, #15 @ Routine for src & dst aligned
+ beq 3f
+1: ldmia r0!, {r4, r5, r6, r7}
+ stmia r1!, {r4, r5, r6, r7}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ adcs r3, r3, r6
+ adcs r3, r3, r7
+ sub ip, ip, #16
+ teq ip, #0
+ bne 1b
+3: ands ip, r2, #12
+ beq 5f
+ tst ip, #8
+ beq 4f
+ ldmia r0!, {r4, r5}
+ stmia r1!, {r4, r5}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ tst ip, #4
+ beq 5f
+4: ldr r4, [r0], #4
+ str r4, [r1], #4
+ adcs r3, r3, r4
+5: ands r2, r2, #3
+ adceq r0, r3, #0
+ LOADREGS(eqea,fp,{r4 - r8, fp, sp, pc})
+ ldr r4, [r0], #4
+ tst r2, #2
+ beq Lexit
+ adcs r3, r3, r4, lsl #16
+ strb r4, [r1], #1
+ mov r4, r4, lsr #8
+ strb r4, [r1], #1
+ mov r4, r4, lsr #8
+ b Lexit
+
+Ltoo_small: teq r2, #0
+ LOADREGS(eqea,fp,{r4 - r8, fp, sp, pc})
+ cmp r2, #2
+ blt Ltoo_small1
+ ldrb ip, [r0], #1
+ ldrb r8, [r0], #1
+ orr ip, ip, r8, lsl #8
+ adds r3, r3, ip
+ strb ip, [r1], #1
+ strb r8, [r1], #1
+Lexit: tst r2, #1
+Ltoo_small1: ldrneb ip, [r0], #1
+ strneb ip, [r1], #1
+ adcnes r3, r3, ip
+ adcs r0, r3, #0
+ LOADREGS(ea,fp,{r4 - r8, fp, sp, pc})
+
+Lsrc_not_aligned:
+ cmp r2, #4
+ blt Ltoo_small
+ and ip, r0, #3
+ bic r0, r0, #3
+ ldr r4, [r0], #4
+ cmp ip, #2
+ beq Lsrc2_aligned
+ bhi Lsrc3_aligned
+ mov r4, r4, lsr #8
+ adds r3, r3, #0
+ bics ip, r2, #15
+ beq 2f
+1: ldmia r0!, {r5, r6, r7, r8}
+ orr r4, r4, r5, lsl #24
+ mov r5, r5, lsr #8
+ orr r5, r5, r6, lsl #24
+ mov r6, r6, lsr #8
+ orr r6, r6, r7, lsl #24
+ mov r7, r7, lsr #8
+ orr r7, r7, r8, lsl #24
+ stmia r1!, {r4, r5, r6, r7}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ adcs r3, r3, r6
+ adcs r3, r3, r7
+ mov r4, r8, lsr #8
+ sub ip, ip, #16
+ teq ip, #0
+ bne 1b
+2: ands ip, r2, #12
+ beq 4f
+ tst ip, #8
+ beq 3f
+ ldmia r0!, {r5, r6}
+ orr r4, r4, r5, lsl #24
+ mov r5, r5, lsr #8
+ orr r5, r5, r6, lsl #24
+ stmia r1!, {r4, r5}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ mov r4, r6, lsr #8
+ tst ip, #4
+ beq 4f
+3: ldr r5, [r0], #4
+ orr r4, r4, r5, lsl #24
+ str r4, [r1], #4
+ adcs r3, r3, r4
+ mov r4, r5, lsr #8
+4: ands r2, r2, #3
+ adceq r0, r3, #0
+ LOADREGS(eqea,fp,{r4 - r8, fp, sp, pc})
+ tst r2, #2
+ beq Lexit
+ adcs r3, r3, r4, lsl #16
+ strb r4, [r1], #1
+ mov r4, r4, lsr #8
+ strb r4, [r1], #1
+ mov r4, r4, lsr #8
+ b Lexit
+
+Lsrc2_aligned: mov r4, r4, lsr #16
+ adds r3, r3, #0
+ bics ip, r2, #15
+ beq 2f
+1: ldmia r0!, {r5, r6, r7, r8}
+ orr r4, r4, r5, lsl #16
+ mov r5, r5, lsr #16
+ orr r5, r5, r6, lsl #16
+ mov r6, r6, lsr #16
+ orr r6, r6, r7, lsl #16
+ mov r7, r7, lsr #16
+ orr r7, r7, r8, lsl #16
+ stmia r1!, {r4, r5, r6, r7}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ adcs r3, r3, r6
+ adcs r3, r3, r7
+ mov r4, r8, lsr #16
+ sub ip, ip, #16
+ teq ip, #0
+ bne 1b
+2: ands ip, r2, #12
+ beq 4f
+ tst ip, #8
+ beq 3f
+ ldmia r0!, {r5, r6}
+ orr r4, r4, r5, lsl #16
+ mov r5, r5, lsr #16
+ orr r5, r5, r6, lsl #16
+ stmia r1!, {r4, r5}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ mov r4, r6, lsr #16
+ tst ip, #4
+ beq 4f
+3: ldr r5, [r0], #4
+ orr r4, r4, r5, lsl #16
+ str r4, [r1], #4
+ adcs r3, r3, r4
+ mov r4, r5, lsr #16
+4: ands r2, r2, #3
+ adceq r0, r3, #0
+ LOADREGS(eqea,fp,{r4 - r8, fp, sp, pc})
+ tst r2, #2
+ beq Lexit
+ adcs r3, r3, r4, lsl #16
+ strb r4, [r1], #1
+ mov r4, r4, lsr #8
+ strb r4, [r1], #1
+ ldrb r4, [r0], #1
+ b Lexit
+
+Lsrc3_aligned: mov r4, r4, lsr #24
+ adds r3, r3, #0
+ bics ip, r2, #15
+ beq 2f
+1: ldmia r0!, {r5, r6, r7, r8}
+ orr r4, r4, r5, lsl #8
+ mov r5, r5, lsr #24
+ orr r5, r5, r6, lsl #8
+ mov r6, r6, lsr #24
+ orr r6, r6, r7, lsl #8
+ mov r7, r7, lsr #24
+ orr r7, r7, r8, lsl #8
+ stmia r1!, {r4, r5, r6, r7}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ adcs r3, r3, r6
+ adcs r3, r3, r7
+ mov r4, r8, lsr #24
+ sub ip, ip, #16
+ teq ip, #0
+ bne 1b
+2: ands ip, r2, #12
+ beq 4f
+ tst ip, #8
+ beq 3f
+ ldmia r0!, {r5, r6}
+ orr r4, r4, r5, lsl #8
+ mov r5, r5, lsr #24
+ orr r5, r5, r6, lsl #8
+ stmia r1!, {r4, r5}
+ adcs r3, r3, r4
+ adcs r3, r3, r5
+ mov r4, r6, lsr #24
+ tst ip, #4
+ beq 4f
+3: ldr r5, [r0], #4
+ orr r4, r4, r5, lsl #8
+ str r4, [r1], #4
+ adcs r3, r3, r4
+ mov r4, r5, lsr #24
+4: ands r2, r2, #3
+ adceq r0, r3, #0
+ LOADREGS(eqea,fp,{r4 - r8, fp, sp, pc})
+ tst r2, #2
+ beq Lexit
+ adcs r3, r3, r4, lsl #16
+ strb r4, [r1], #1
+ ldr r4, [r0], #4
+ strb r4, [r1], #1
+ adcs r3, r3, r4, lsl #24
+ mov r4, r4, lsr #8
+ b Lexit
+
+ENTRY(__csum_ipv6_magic)
+ stmfd sp!, {lr}
+ adds ip, r2, r3
+ ldmia r1, {r1 - r3, lr}
+ adcs ip, ip, r1
+ adcs ip, ip, r2
+ adcs ip, ip, r3
+ adcs ip, ip, lr
+ ldmia r0, {r0 - r3}
+ adcs r0, ip, r0
+ adcs r0, r0, r1
+ adcs r0, r0, r2
+ adcs r0, r0, r3
+ ldr r3, [sp, #4]
+ adcs r0, r0, r3
+ adcs r0, r0, #0
+ LOADREGS(fd, sp!, {pc})
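
The load1b/load1l macros above pair each potentially faulting user-space load (ldrbt/ldrt) with an __ex_table entry that points at the fixup code at label 6001, which stores -EFAULT and zeroes the destination buffer. Conceptually, the trap handler consumes those entries roughly as in the sketch below (hypothetical types and names; not the kernel's actual fault path):

/* Conceptual sketch of how __ex_table entries are used on a fault:
 * look up the faulting instruction address and resume at its recorded
 * fixup (here, label 6001).  Hypothetical names. */
struct ex_entry {
        unsigned long insn;     /* address of the faulting load */
        unsigned long fixup;    /* address to continue at */
};

static unsigned long find_fixup(const struct ex_entry *table,
                                unsigned long entries,
                                unsigned long fault_pc)
{
        unsigned long i;

        for (i = 0; i < entries; i++)
                if (table[i].insn == fault_pc)
                        return table[i].fixup;
        return 0;               /* no fixup: genuine kernel fault */
}
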
diff --git a/pfinet/linux-src/arch/i386/lib/checksum.S b/pfinet/linux-src/arch/i386/lib/checksum.S
new file mode 100644
index 00000000..af10dc7c
--- /dev/null
+++ b/pfinet/linux-src/arch/i386/lib/checksum.S
@@ -0,0 +1,447 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IP/TCP/UDP checksumming routines
+ *
+ * Authors: Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Tom May, <ftom@netcom.com>
+ * Pentium Pro/II routines:
+ * Alexander Kjeldaas <astor@guardian.no>
+ * Finn Arne Gangstad <finnag@guardian.no>
+ * Lots of code moved from tcp.c and ip.c; see those files
+ * for more names.
+ *
+ * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
+ * handling.
+ * Andi Kleen, add zeroing on error
+ * converted to pure assembler
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/errno.h>
+
+/*
+ * computes a partial checksum, e.g. for TCP/UDP fragments
+ */
+
+/*
+unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
+ */
+
+.text
+.align 4
+.globl csum_partial
+
+#if CPU!=686
+
+ /*
+ * Experiments with Ethernet and SLIP connections show that buff
+ * is aligned on either a 2-byte or 4-byte boundary. We get at
+ * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
+ * Fortunately, it is easy to convert 2-byte alignment to 4-byte
+ * alignment for the unrolled loop.
+ */
+csum_partial:
+ pushl %esi
+ pushl %ebx
+ movl 20(%esp),%eax # Function arg: unsigned int sum
+ movl 16(%esp),%ecx # Function arg: int len
+ movl 12(%esp),%esi # Function arg: unsigned char *buff
+ testl $2, %esi # Check alignment.
+ jz 2f # Jump if alignment is ok.
+ subl $2, %ecx # Alignment uses up two bytes.
+ jae 1f # Jump if we had at least two bytes.
+ addl $2, %ecx # ecx was < 2. Deal with it.
+ jmp 4f
+1: movw (%esi), %bx
+ addl $2, %esi
+ addw %bx, %ax
+ adcl $0, %eax
+2:
+ movl %ecx, %edx
+ shrl $5, %ecx
+ jz 2f
+ testl %esi, %esi
+1: movl (%esi), %ebx
+ adcl %ebx, %eax
+ movl 4(%esi), %ebx
+ adcl %ebx, %eax
+ movl 8(%esi), %ebx
+ adcl %ebx, %eax
+ movl 12(%esi), %ebx
+ adcl %ebx, %eax
+ movl 16(%esi), %ebx
+ adcl %ebx, %eax
+ movl 20(%esi), %ebx
+ adcl %ebx, %eax
+ movl 24(%esi), %ebx
+ adcl %ebx, %eax
+ movl 28(%esi), %ebx
+ adcl %ebx, %eax
+ lea 32(%esi), %esi
+ dec %ecx
+ jne 1b
+ adcl $0, %eax
+2: movl %edx, %ecx
+ andl $0x1c, %edx
+ je 4f
+ shrl $2, %edx # This clears CF
+3: adcl (%esi), %eax
+ lea 4(%esi), %esi
+ dec %edx
+ jne 3b
+ adcl $0, %eax
+4: andl $3, %ecx
+ jz 7f
+ cmpl $2, %ecx
+ jb 5f
+ movw (%esi),%cx
+ leal 2(%esi),%esi
+ je 6f
+ shll $16,%ecx
+5: movb (%esi),%cl
+6: addl %ecx,%eax
+ adcl $0, %eax
+7:
+ popl %ebx
+ popl %esi
+ ret
+
+#else /* CPU==686 */
+
+csum_partial:
+ movl 12(%esp),%eax # Function arg: unsigned int sum
+ movl 8(%esp),%ecx # Function arg: int len
+ movl 4(%esp),%esi # Function arg: const unsigned char *buf
+
+ testl $2, %esi
+ jnz 30f
+10:
+ movl %ecx, %edx
+ movl %ecx, %ebx
+ andl $0x7c, %ebx
+ shrl $7, %ecx
+ addl %ebx,%esi
+ shrl $2, %ebx
+ negl %ebx
+ lea 45f(%ebx,%ebx,2), %ebx
+ testl %esi, %esi
+ jmp *%ebx
+
+ # Handle 2-byte-aligned regions
+20: addw (%esi), %ax
+ lea 2(%esi), %esi
+ adcl $0, %eax
+ jmp 10b
+
+30: subl $2, %ecx
+ ja 20b
+ je 32f
+ movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
+ addl %ebx, %eax
+ adcl $0, %eax
+ jmp 80f
+32:
+ addw (%esi), %ax # csumming 2 bytes, 2-aligned
+ adcl $0, %eax
+ jmp 80f
+
+40:
+ addl -128(%esi), %eax
+ adcl -124(%esi), %eax
+ adcl -120(%esi), %eax
+ adcl -116(%esi), %eax
+ adcl -112(%esi), %eax
+ adcl -108(%esi), %eax
+ adcl -104(%esi), %eax
+ adcl -100(%esi), %eax
+ adcl -96(%esi), %eax
+ adcl -92(%esi), %eax
+ adcl -88(%esi), %eax
+ adcl -84(%esi), %eax
+ adcl -80(%esi), %eax
+ adcl -76(%esi), %eax
+ adcl -72(%esi), %eax
+ adcl -68(%esi), %eax
+ adcl -64(%esi), %eax
+ adcl -60(%esi), %eax
+ adcl -56(%esi), %eax
+ adcl -52(%esi), %eax
+ adcl -48(%esi), %eax
+ adcl -44(%esi), %eax
+ adcl -40(%esi), %eax
+ adcl -36(%esi), %eax
+ adcl -32(%esi), %eax
+ adcl -28(%esi), %eax
+ adcl -24(%esi), %eax
+ adcl -20(%esi), %eax
+ adcl -16(%esi), %eax
+ adcl -12(%esi), %eax
+ adcl -8(%esi), %eax
+ adcl -4(%esi), %eax
+45:
+ lea 128(%esi), %esi
+ adcl $0, %eax
+ dec %ecx
+ jge 40b
+ movl %edx, %ecx
+50: andl $3, %ecx
+ jz 80f
+
+ # Handle the last 1-3 bytes without jumping
+ notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
+ movl $0xffffff,%ebx # by the shll and shrl instructions
+ shll $3,%ecx
+ shrl %cl,%ebx
+ andl -128(%esi),%ebx # esi is 4-aligned so should be ok
+ addl %ebx,%eax
+ adcl $0,%eax
+80:
+ ret
+
+#endif /* CPU==686 */
+
+/*
+unsigned int csum_partial_copy_generic (const char *src, char *dst,
+ int len, int sum, int *src_err_ptr, int *dst_err_ptr)
+ */
+
+/*
+ * Copy from ds while checksumming, otherwise like csum_partial
+ *
+ * The macros SRC and DST specify the type of access for the instruction.
+ * thus we can call a custom exception handler for all access types.
+ *
+ * FIXME: could someone double-check whether I haven't mixed up some SRC and
+ * DST definitions? It's damn hard to trigger all cases. I hope I got
+ * them all but there's no guarantee.
+ */
+
+#define SRC(y...) \
+ 9999: y; \
+ .section __ex_table, "a"; \
+ .long 9999b, 6001f ; \
+ .previous
+
+#define DST(y...) \
+ 9999: y; \
+ .section __ex_table, "a"; \
+ .long 9999b, 6002f ; \
+ .previous
+
+.align 4
+.globl csum_partial_copy_generic
+
+#if CPU!=686
+
+#define ARGBASE 16
+#define FP 12
+
+csum_partial_copy_generic:
+ subl $4,%esp
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ movl ARGBASE+16(%esp),%eax # sum
+ movl ARGBASE+12(%esp),%ecx # len
+ movl ARGBASE+4(%esp),%esi # src
+ movl ARGBASE+8(%esp),%edi # dst
+
+ testl $2, %edi # Check alignment.
+ jz 2f # Jump if alignment is ok.
+ subl $2, %ecx # Alignment uses up two bytes.
+ jae 1f # Jump if we had at least two bytes.
+ addl $2, %ecx # ecx was < 2. Deal with it.
+ jmp 4f
+SRC(1: movw (%esi), %bx )
+ addl $2, %esi
+DST( movw %bx, (%edi) )
+ addl $2, %edi
+ addw %bx, %ax
+ adcl $0, %eax
+2:
+ movl %ecx, FP(%esp)
+ shrl $5, %ecx
+ jz 2f
+ testl %esi, %esi
+SRC(1: movl (%esi), %ebx )
+SRC( movl 4(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, (%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 4(%edi) )
+
+SRC( movl 8(%esi), %ebx )
+SRC( movl 12(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, 8(%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 12(%edi) )
+
+SRC( movl 16(%esi), %ebx )
+SRC( movl 20(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, 16(%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 20(%edi) )
+
+SRC( movl 24(%esi), %ebx )
+SRC( movl 28(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, 24(%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 28(%edi) )
+
+ lea 32(%esi), %esi
+ lea 32(%edi), %edi
+ dec %ecx
+ jne 1b
+ adcl $0, %eax
+2: movl FP(%esp), %edx
+ movl %edx, %ecx
+ andl $0x1c, %edx
+ je 4f
+ shrl $2, %edx # This clears CF
+SRC(3: movl (%esi), %ebx )
+ adcl %ebx, %eax
+DST( movl %ebx, (%edi) )
+ lea 4(%esi), %esi
+ lea 4(%edi), %edi
+ dec %edx
+ jne 3b
+ adcl $0, %eax
+4: andl $3, %ecx
+ jz 7f
+ cmpl $2, %ecx
+ jb 5f
+SRC( movw (%esi), %cx )
+ leal 2(%esi), %esi
+DST( movw %cx, (%edi) )
+ leal 2(%edi), %edi
+ je 6f
+ shll $16,%ecx
+SRC(5: movb (%esi), %cl )
+DST( movb %cl, (%edi) )
+6: addl %ecx, %eax
+ adcl $0, %eax
+7:
+5000:
+
+# Exception handler:
+.section .fixup, "ax"
+
+6001:
+ movl ARGBASE+20(%esp), %ebx # src_err_ptr
+ movl $-EFAULT, (%ebx)
+
+ # zero the complete destination - computing the rest
+ # is too much work
+ movl ARGBASE+8(%esp), %edi # dst
+ movl ARGBASE+12(%esp), %ecx # len
+ xorl %eax,%eax
+ rep ; stosb
+
+ jmp 5000b
+
+6002:
+ movl ARGBASE+24(%esp), %ebx # dst_err_ptr
+ movl $-EFAULT,(%ebx)
+ jmp 5000b
+
+.previous
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ popl %ecx # equivalent to addl $4,%esp
+ ret
+
+#else
+
+/* Version for PentiumII/PPro */
+
+#define ROUND1(x) \
+ SRC(movl x(%esi), %ebx ) ; \
+ addl %ebx, %eax ; \
+ DST(movl %ebx, x(%edi) ) ;
+
+#define ROUND(x) \
+ SRC(movl x(%esi), %ebx ) ; \
+ adcl %ebx, %eax ; \
+ DST(movl %ebx, x(%edi) ) ;
+
+#define ARGBASE 12
+
+csum_partial_copy_generic:
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+ movl ARGBASE+4(%esp),%esi #src
+ movl ARGBASE+8(%esp),%edi #dst
+ movl ARGBASE+12(%esp),%ecx #len
+ movl ARGBASE+16(%esp),%eax #sum
+ movl %ecx, %edx
+ movl %ecx, %ebx
+ shrl $6, %ecx
+ andl $0x3c, %ebx
+ negl %ebx
+ subl %ebx, %esi
+ subl %ebx, %edi
+ lea 3f(%ebx,%ebx), %ebx
+ testl %esi, %esi
+ jmp *%ebx
+1: addl $64,%esi
+ addl $64,%edi
+ ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
+ ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
+ ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
+ ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
+3: adcl $0,%eax
+ dec %ecx
+ jge 1b
+4: andl $3, %edx
+ jz 7f
+ cmpl $2, %edx
+ jb 5f
+SRC( movw (%esi), %dx )
+ leal 2(%esi), %esi
+DST( movw %dx, (%edi) )
+ leal 2(%edi), %edi
+ je 6f
+ shll $16,%edx
+5:
+SRC( movb (%esi), %dl )
+DST( movb %dl, (%edi) )
+6: addl %edx, %eax
+ adcl $0, %eax
+7:
+.section .fixup, "ax"
+6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
+ movl $-EFAULT, (%ebx)
+ # zero the complete destination (computing the rest is too much work)
+ movl ARGBASE+8(%esp),%edi # dst
+ movl ARGBASE+12(%esp),%ecx # len
+ xorl %eax,%eax
+ rep; stosb
+ jmp 7b
+6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
+ movl $-EFAULT, (%ebx)
+ jmp 7b
+.previous
+
+ popl %esi
+ popl %edi
+ popl %ebx
+ ret
+
+#undef ROUND
+#undef ROUND1
+
+#endif /* CPU==i686 */
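
The PentiumPro/II csum_partial path above finishes the last 1-3 bytes without branching: it builds a byte mask from len & 3 with the notl/shll/shrl sequence and adds the masked final aligned word. In C the same trick looks roughly like this (little-endian, illustrative names; like the assembly, it reads the whole aligned word and masks off the bytes beyond the buffer):

#include <string.h>

/* Branch-free tail handling, mirroring the notl/shll/shrl sequence in the
 * 686 csum_partial above: keep only the low `tail` bytes of the final
 * aligned 32-bit word, then add with end-around carry. */
static unsigned int add_tail(unsigned int sum, const unsigned char *p,
                             unsigned int tail /* 1..3 */)
{
        unsigned int word, mask = 0xffffffffu >> (8 * (4 - tail));

        memcpy(&word, p, 4);            /* p is 4-byte aligned here */
        word &= mask;
        sum += word;
        if (sum < word)                 /* fold the carry back in */
                sum++;
        return sum;
}
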
diff --git a/pfinet/linux-src/arch/i386/lib/old-checksum.c b/pfinet/linux-src/arch/i386/lib/old-checksum.c
new file mode 100644
index 00000000..ae3a3804
--- /dev/null
+++ b/pfinet/linux-src/arch/i386/lib/old-checksum.c
@@ -0,0 +1,19 @@
+/*
+ * FIXME: old compatibility stuff, will be removed soon.
+ */
+
+#include <net/checksum.h>
+
+unsigned int csum_partial_copy( const char *src, char *dst, int len, int sum)
+{
+ int src_err=0, dst_err=0;
+
+ sum = csum_partial_copy_generic ( src, dst, len, sum, &src_err, &dst_err);
+
+ if (src_err || dst_err)
+ printk("old csum_partial_copy_fromuser(), tell mingo to convert me.\n");
+
+ return sum;
+}
+
+
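
The compatibility wrapper above discards the per-direction error information that csum_partial_copy_generic reports; a converted caller would pass its own error pointers and check them, roughly as in this sketch (hypothetical caller name, declaration taken from the i386 file above):

#include <asm/errno.h>

extern unsigned int csum_partial_copy_generic(const char *src, char *dst,
                                              int len, int sum,
                                              int *src_err_ptr,
                                              int *dst_err_ptr);

/* Sketch of a converted caller: return -EFAULT instead of printk()ing. */
static int copy_and_csum(const char *src, char *dst, int len,
                         unsigned int *csum_out)
{
        int src_err = 0, dst_err = 0;

        *csum_out = csum_partial_copy_generic(src, dst, len, 0,
                                              &src_err, &dst_err);
        if (src_err || dst_err)
                return -EFAULT;
        return 0;
}
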
diff --git a/pfinet/linux-src/arch/m68k/lib/checksum.c b/pfinet/linux-src/arch/m68k/lib/checksum.c
new file mode 100644
index 00000000..5110cac4
--- /dev/null
+++ b/pfinet/linux-src/arch/m68k/lib/checksum.c
@@ -0,0 +1,420 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IP/TCP/UDP checksumming routines
+ *
+ * Authors: Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Tom May, <ftom@netcom.com>
+ * Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de>
+ * Lots of code moved from tcp.c and ip.c; see those files
+ * for more names.
+ *
+ * 03/02/96 Jes Sorensen, Andreas Schwab, Roman Hodek:
+ * Fixed some nasty bugs, causing some horrible crashes.
+ * A: At some points, the sum (%0) was used as
+ * length-counter instead of the length counter
+ * (%1). Thanks to Roman Hodek for pointing this out.
+ * B: GCC seems to mess up if one uses too many
+ * data-registers to hold input values and one tries to
+ * specify d0 and d1 as scratch registers. Letting gcc
+ * choose these registers itself solves the problem.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * 1998/8/31 Andreas Schwab:
+ * Zero out rest of buffer on exception in
+ * csum_partial_copy_from_user.
+ */
+
+#include <net/checksum.h>
+
+/*
+ * computes a partial checksum, e.g. for TCP/UDP fragments
+ */
+
+unsigned int
+csum_partial (const unsigned char *buff, int len, unsigned int sum)
+{
+ unsigned long tmp1, tmp2;
+ /*
+ * Experiments with ethernet and slip connections show that buff
+ * is aligned on either a 2-byte or 4-byte boundary.
+ */
+ __asm__("movel %2,%3\n\t"
+ "btst #1,%3\n\t" /* Check alignment */
+ "jeq 2f\n\t"
+ "subql #2,%1\n\t" /* buff%4==2: treat first word */
+ "jgt 1f\n\t"
+ "addql #2,%1\n\t" /* len was == 2, treat only rest */
+ "jra 4f\n"
+ "1:\t"
+ "addw %2@+,%0\n\t" /* add first word to sum */
+ "clrl %3\n\t"
+ "addxl %3,%0\n" /* add X bit */
+ "2:\t"
+ /* unrolled loop for the main part: do 8 longs at once */
+ "movel %1,%3\n\t" /* save len in tmp1 */
+ "lsrl #5,%1\n\t" /* len/32 */
+ "jeq 2f\n\t" /* not enough... */
+ "subql #1,%1\n"
+ "1:\t"
+ "movel %2@+,%4\n\t"
+ "addxl %4,%0\n\t"
+ "movel %2@+,%4\n\t"
+ "addxl %4,%0\n\t"
+ "movel %2@+,%4\n\t"
+ "addxl %4,%0\n\t"
+ "movel %2@+,%4\n\t"
+ "addxl %4,%0\n\t"
+ "movel %2@+,%4\n\t"
+ "addxl %4,%0\n\t"
+ "movel %2@+,%4\n\t"
+ "addxl %4,%0\n\t"
+ "movel %2@+,%4\n\t"
+ "addxl %4,%0\n\t"
+ "movel %2@+,%4\n\t"
+ "addxl %4,%0\n\t"
+ "dbra %1,1b\n\t"
+ "clrl %4\n\t"
+ "addxl %4,%0\n\t" /* add X bit */
+ "clrw %1\n\t"
+ "subql #1,%1\n\t"
+ "jcc 1b\n"
+ "2:\t"
+ "movel %3,%1\n\t" /* restore len from tmp1 */
+ "andw #0x1c,%3\n\t" /* number of rest longs */
+ "jeq 4f\n\t"
+ "lsrw #2,%3\n\t"
+ "subqw #1,%3\n"
+ "3:\t"
+ /* loop for rest longs */
+ "movel %2@+,%4\n\t"
+ "addxl %4,%0\n\t"
+ "dbra %3,3b\n\t"
+ "clrl %4\n\t"
+ "addxl %4,%0\n" /* add X bit */
+ "4:\t"
+ /* now check for rest bytes that do not fit into longs */
+ "andw #3,%1\n\t"
+ "jeq 7f\n\t"
+ "clrl %4\n\t" /* clear tmp2 for rest bytes */
+ "subqw #2,%1\n\t"
+ "jlt 5f\n\t"
+ "movew %2@+,%4\n\t" /* have rest >= 2: get word */
+ "swap %4\n\t" /* into bits 16..31 */
+ "tstw %1\n\t" /* another byte? */
+ "jeq 6f\n"
+ "5:\t"
+ "moveb %2@,%4\n\t" /* have odd rest: get byte */
+ "lslw #8,%4\n\t" /* into bits 8..15; 16..31 untouched */
+ "6:\t"
+ "addl %4,%0\n\t" /* now add rest long to sum */
+ "clrl %4\n\t"
+ "addxl %4,%0\n" /* add X bit */
+ "7:\t"
+ : "=d" (sum), "=d" (len), "=a" (buff),
+ "=&d" (tmp1), "=&d" (tmp2)
+ : "0" (sum), "1" (len), "2" (buff)
+ );
+ return(sum);
+}
+
+
+
+/*
+ * copy from user space while checksumming, with exception handling.
+ */
+
+unsigned int
+csum_partial_copy_from_user(const char *src, char *dst, int len,
+ int sum, int *csum_err)
+{
+ /*
+ * GCC doesn't like more than 10 operands for the asm
+ * statements so we have to use tmp2 for the error
+ * code.
+ */
+ unsigned long tmp1, tmp2;
+
+ __asm__("movel %2,%4\n\t"
+ "btst #1,%4\n\t" /* Check alignment */
+ "jeq 2f\n\t"
+ "subql #2,%1\n\t" /* buff%4==2: treat first word */
+ "jgt 1f\n\t"
+ "addql #2,%1\n\t" /* len was == 2, treat only rest */
+ "jra 4f\n"
+ "1:\n"
+ "10:\t"
+ "movesw %2@+,%4\n\t" /* add first word to sum */
+ "addw %4,%0\n\t"
+ "movew %4,%3@+\n\t"
+ "clrl %4\n\t"
+ "addxl %4,%0\n" /* add X bit */
+ "2:\t"
+ /* unrolled loop for the main part: do 8 longs at once */
+ "movel %1,%4\n\t" /* save len in tmp1 */
+ "lsrl #5,%1\n\t" /* len/32 */
+ "jeq 2f\n\t" /* not enough... */
+ "subql #1,%1\n"
+ "1:\n"
+ "11:\t"
+ "movesl %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "12:\t"
+ "movesl %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "13:\t"
+ "movesl %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "14:\t"
+ "movesl %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "15:\t"
+ "movesl %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "16:\t"
+ "movesl %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "17:\t"
+ "movesl %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "18:\t"
+ "movesl %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "dbra %1,1b\n\t"
+ "clrl %5\n\t"
+ "addxl %5,%0\n\t" /* add X bit */
+ "clrw %1\n\t"
+ "subql #1,%1\n\t"
+ "jcc 1b\n"
+ "2:\t"
+ "movel %4,%1\n\t" /* restore len from tmp1 */
+ "andw #0x1c,%4\n\t" /* number of rest longs */
+ "jeq 4f\n\t"
+ "lsrw #2,%4\n\t"
+ "subqw #1,%4\n"
+ "3:\n"
+ /* loop for rest longs */
+ "19:\t"
+ "movesl %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "dbra %4,3b\n\t"
+ "clrl %5\n\t"
+ "addxl %5,%0\n" /* add X bit */
+ "4:\t"
+ /* now check for rest bytes that do not fit into longs */
+ "andw #3,%1\n\t"
+ "jeq 7f\n\t"
+ "clrl %5\n\t" /* clear tmp2 for rest bytes */
+ "subqw #2,%1\n\t"
+ "jlt 5f\n\t"
+ "20:\t"
+ "movesw %2@+,%5\n\t" /* have rest >= 2: get word */
+ "movew %5,%3@+\n\t"
+ "swap %5\n\t" /* into bits 16..31 */
+ "tstw %1\n\t" /* another byte? */
+ "jeq 6f\n"
+ "5:\n"
+ "21:\t"
+ "movesb %2@,%5\n\t" /* have odd rest: get byte */
+ "moveb %5,%3@+\n\t"
+ "lslw #8,%5\n\t" /* into bits 8..15; 16..31 untouched */
+ "6:\t"
+ "addl %5,%0\n\t" /* now add rest long to sum */
+ "clrl %5\n\t"
+ "addxl %5,%0\n\t" /* add X bit */
+ "7:\t"
+ "clrl %5\n" /* no error - clear return value */
+ "8:\n"
+ ".section .fixup,\"ax\"\n"
+ ".even\n"
+ /* If any exception occurs zero out the rest.
+ Similarities with the code above are intentional :-) */
+ "90:\t"
+ "clrw %3@+\n\t"
+ "movel %1,%4\n\t"
+ "lsrl #5,%1\n\t"
+ "jeq 1f\n\t"
+ "subql #1,%1\n"
+ "91:\t"
+ "clrl %3@+\n"
+ "92:\t"
+ "clrl %3@+\n"
+ "93:\t"
+ "clrl %3@+\n"
+ "94:\t"
+ "clrl %3@+\n"
+ "95:\t"
+ "clrl %3@+\n"
+ "96:\t"
+ "clrl %3@+\n"
+ "97:\t"
+ "clrl %3@+\n"
+ "98:\t"
+ "clrl %3@+\n\t"
+ "dbra %1,91b\n\t"
+ "clrw %1\n\t"
+ "subql #1,%1\n\t"
+ "jcc 91b\n"
+ "1:\t"
+ "movel %4,%1\n\t"
+ "andw #0x1c,%4\n\t"
+ "jeq 1f\n\t"
+ "lsrw #2,%4\n\t"
+ "subqw #1,%4\n"
+ "99:\t"
+ "clrl %3@+\n\t"
+ "dbra %4,99b\n\t"
+ "1:\t"
+ "andw #3,%1\n\t"
+ "jeq 9f\n"
+ "100:\t"
+ "clrw %3@+\n\t"
+ "tstw %1\n\t"
+ "jeq 9f\n"
+ "101:\t"
+ "clrb %3@+\n"
+ "9:\t"
+#define STR(X) STR1(X)
+#define STR1(X) #X
+ "moveq #-" STR(EFAULT) ",%5\n\t"
+ "jra 8b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ ".long 10b,90b\n"
+ ".long 11b,91b\n"
+ ".long 12b,92b\n"
+ ".long 13b,93b\n"
+ ".long 14b,94b\n"
+ ".long 15b,95b\n"
+ ".long 16b,96b\n"
+ ".long 17b,97b\n"
+ ".long 18b,98b\n"
+ ".long 19b,99b\n"
+ ".long 20b,100b\n"
+ ".long 21b,101b\n"
+ ".previous"
+ : "=d" (sum), "=d" (len), "=a" (src), "=a" (dst),
+ "=&d" (tmp1), "=d" (tmp2)
+ : "0" (sum), "1" (len), "2" (src), "3" (dst)
+ );
+
+ *csum_err = tmp2;
+
+ return(sum);
+}
+
+/*
+ * copy from kernel space while checksumming, otherwise like csum_partial
+ */
+
+unsigned int
+csum_partial_copy(const char *src, char *dst, int len, int sum)
+{
+ unsigned long tmp1, tmp2;
+ __asm__("movel %2,%4\n\t"
+ "btst #1,%4\n\t" /* Check alignment */
+ "jeq 2f\n\t"
+ "subql #2,%1\n\t" /* buff%4==2: treat first word */
+ "jgt 1f\n\t"
+ "addql #2,%1\n\t" /* len was == 2, treat only rest */
+ "jra 4f\n"
+ "1:\t"
+ "movew %2@+,%4\n\t" /* add first word to sum */
+ "addw %4,%0\n\t"
+ "movew %4,%3@+\n\t"
+ "clrl %4\n\t"
+ "addxl %4,%0\n" /* add X bit */
+ "2:\t"
+ /* unrolled loop for the main part: do 8 longs at once */
+ "movel %1,%4\n\t" /* save len in tmp1 */
+ "lsrl #5,%1\n\t" /* len/32 */
+ "jeq 2f\n\t" /* not enough... */
+ "subql #1,%1\n"
+ "1:\t"
+ "movel %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "movel %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "movel %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "movel %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "movel %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "movel %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "movel %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "movel %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "dbra %1,1b\n\t"
+ "clrl %5\n\t"
+ "addxl %5,%0\n\t" /* add X bit */
+ "clrw %1\n\t"
+ "subql #1,%1\n\t"
+ "jcc 1b\n"
+ "2:\t"
+ "movel %4,%1\n\t" /* restore len from tmp1 */
+ "andw #0x1c,%4\n\t" /* number of rest longs */
+ "jeq 4f\n\t"
+ "lsrw #2,%4\n\t"
+ "subqw #1,%4\n"
+ "3:\t"
+ /* loop for rest longs */
+ "movel %2@+,%5\n\t"
+ "addxl %5,%0\n\t"
+ "movel %5,%3@+\n\t"
+ "dbra %4,3b\n\t"
+ "clrl %5\n\t"
+ "addxl %5,%0\n" /* add X bit */
+ "4:\t"
+ /* now check for rest bytes that do not fit into longs */
+ "andw #3,%1\n\t"
+ "jeq 7f\n\t"
+ "clrl %5\n\t" /* clear tmp2 for rest bytes */
+ "subqw #2,%1\n\t"
+ "jlt 5f\n\t"
+ "movew %2@+,%5\n\t" /* have rest >= 2: get word */
+ "movew %5,%3@+\n\t"
+ "swap %5\n\t" /* into bits 16..31 */
+ "tstw %1\n\t" /* another byte? */
+ "jeq 6f\n"
+ "5:\t"
+ "moveb %2@,%5\n\t" /* have odd rest: get byte */
+ "moveb %5,%3@+\n\t"
+ "lslw #8,%5\n" /* into bits 8..15; 16..31 untouched */
+ "6:\t"
+ "addl %5,%0\n\t" /* now add rest long to sum */
+ "clrl %5\n\t"
+ "addxl %5,%0\n" /* add X bit */
+ "7:\t"
+ : "=d" (sum), "=d" (len), "=a" (src), "=a" (dst),
+ "=&d" (tmp1), "=&d" (tmp2)
+ : "0" (sum), "1" (len), "2" (src), "3" (dst)
+ );
+ return(sum);
+}
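
csum_partial_copy_from_user() above reports faults through *csum_err (set to -EFAULT by the fixup code, cleared to zero on success) rather than through its return value, so a caller has to check it separately — roughly as in this sketch (hypothetical caller name, declaration matching the function above):

extern unsigned int csum_partial_copy_from_user(const char *src, char *dst,
                                                int len, int sum,
                                                int *csum_err);

/* Sketch of a caller: the checksum is always returned, but it is only
 * meaningful when *csum_err stayed zero. */
static int csum_from_user(const char *usrc, char *kdst, int len,
                          unsigned int *sum)
{
        int err = 0;

        *sum = csum_partial_copy_from_user(usrc, kdst, len, *sum, &err);
        return err;             /* 0, or -EFAULT on a faulting load */
}
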
diff --git a/pfinet/linux-src/arch/ppc/lib/checksum.S b/pfinet/linux-src/arch/ppc/lib/checksum.S
new file mode 100644
index 00000000..66a2e3aa
--- /dev/null
+++ b/pfinet/linux-src/arch/ppc/lib/checksum.S
@@ -0,0 +1,194 @@
+/*
+ * This file contains assembly-language implementations
+ * of IP-style 1's complement checksum routines.
+ *
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
+ */
+
+#include <linux/sys.h>
+#include <asm/processor.h>
+#include <asm/errno.h>
+#include "../kernel/ppc_asm.tmpl"
+
+ .text
+
+/*
+ * ip_fast_csum(buf, len) -- Optimized for IP header
+ * len is in words and is always >= 5.
+ */
+_GLOBAL(ip_fast_csum)
+ lwz r0,0(r3)
+ lwzu r5,4(r3)
+ addi r4,r4,-2
+ addc r0,r0,r5
+ mtctr r4
+1: lwzu r4,4(r3)
+ adde r0,r0,r4
+ bdnz 1b
+ addze r0,r0 /* add in final carry */
+ rlwinm r3,r0,16,0,31 /* fold two halves together */
+ add r3,r0,r3
+ not r3,r3
+ srwi r3,r3,16
+ blr
+
+/*
+ * Compute checksum of TCP or UDP pseudo-header:
+ * csum_tcpudp_magic(saddr, daddr, len, proto, sum)
+ */
+_GLOBAL(csum_tcpudp_magic)
+ rlwimi r5,r6,16,0,15 /* put proto in upper half of len */
+ addc r0,r3,r4 /* add 4 32-bit words together */
+ adde r0,r0,r5
+ adde r0,r0,r7
+ addze r0,r0 /* add in final carry */
+ rlwinm r3,r0,16,0,31 /* fold two halves together */
+ add r3,r0,r3
+ not r3,r3
+ srwi r3,r3,16
+ blr
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * csum_partial(buff, len, sum)
+ */
+_GLOBAL(csum_partial)
+ addic r0,r5,0
+ subi r3,r3,4
+ srwi. r6,r4,2
+ beq 3f /* if we're doing < 4 bytes */
+ andi. r5,r3,2 /* Align buffer to longword boundary */
+ beq+ 1f
+ lhz r5,4(r3) /* do 2 bytes to get aligned */
+ addi r3,r3,2
+ subi r4,r4,2
+ addc r0,r0,r5
+ srwi. r6,r4,2 /* # words to do */
+ beq 3f
+1: mtctr r6
+2: lwzu r5,4(r3) /* the bdnz has zero overhead, so it should */
+ adde r0,r0,r5 /* be unnecessary to unroll this loop */
+ bdnz 2b
+ andi. r4,r4,3
+3: cmpi 0,r4,2
+ blt+ 4f
+ lhz r5,4(r3)
+ addi r3,r3,2
+ subi r4,r4,2
+ adde r0,r0,r5
+4: cmpi 0,r4,1
+ bne+ 5f
+ lbz r5,4(r3)
+ slwi r5,r5,8 /* Upper byte of word */
+ adde r0,r0,r5
+5: addze r3,r0 /* add in final carry */
+ blr
+
+/*
+ * Computes the checksum of a memory block at src, length len,
+ * and adds in "sum" (32-bit), while copying the block to dst.
+ * If an access exception occurs on src or dst, it stores -EFAULT
+ * to *src_err or *dst_err respectively, and (for an error on
+ * src) zeroes the rest of dst.
+ *
+ * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
+ */
+_GLOBAL(csum_partial_copy_generic)
+ addic r0,r6,0
+ subi r3,r3,4
+ subi r4,r4,4
+ srwi. r6,r5,2
+ beq 3f /* if we're doing < 4 bytes */
+ andi. r9,r4,2 /* Align dst to longword boundary */
+ beq+ 1f
+81: lhz r6,4(r3) /* do 2 bytes to get aligned */
+ addi r3,r3,2
+ subi r5,r5,2
+91: sth r6,4(r4)
+ addi r4,r4,2
+ addc r0,r0,r6
+ srwi. r6,r5,2 /* # words to do */
+ beq 3f
+1: mtctr r6
+82: lwzu r6,4(r3) /* the bdnz has zero overhead, so it should */
+92: stwu r6,4(r4) /* be unnecessary to unroll this loop */
+ adde r0,r0,r6
+ bdnz 82b
+ andi. r5,r5,3
+3: cmpi 0,r5,2
+ blt+ 4f
+83: lhz r6,4(r3)
+ addi r3,r3,2
+ subi r5,r5,2
+93: sth r6,4(r4)
+ addi r4,r4,2
+ adde r0,r0,r6
+4: cmpi 0,r5,1
+ bne+ 5f
+84: lbz r6,4(r3)
+94: stb r6,4(r4)
+ slwi r6,r6,8 /* Upper byte of word */
+ adde r0,r0,r6
+5: addze r3,r0 /* add in final carry */
+ blr
+
+/* These shouldn't go in the fixup section, since that would
+ cause the ex_table addresses to get out of order. */
+
+src_error_1:
+ li r6,0
+ subi r5,r5,2
+95: sth r6,4(r4)
+ addi r4,r4,2
+ srwi. r6,r5,2
+ beq 3f
+ mtctr r6
+src_error_2:
+ li r6,0
+96: stwu r6,4(r4)
+ bdnz 96b
+3: andi. r5,r5,3
+ beq src_error
+src_error_3:
+ li r6,0
+ mtctr r5
+ addi r4,r4,3
+97: stbu r6,1(r4)
+ bdnz 97b
+src_error:
+ cmpi 0,r7,0
+ beq 1f
+ li r6,-EFAULT
+ stw r6,0(r7)
+1: addze r3,r0
+ blr
+
+dst_error:
+ cmpi 0,r8,0
+ beq 1f
+ li r6,-EFAULT
+ stw r6,0(r8)
+1: addze r3,r0
+ blr
+
+.section __ex_table,"a"
+ .long 81b,src_error_1
+ .long 91b,dst_error
+ .long 82b,src_error_2
+ .long 92b,dst_error
+ .long 83b,src_error_3
+ .long 93b,dst_error
+ .long 84b,src_error_3
+ .long 94b,dst_error
+ .long 95b,dst_error
+ .long 96b,dst_error
+ .long 97b,dst_error
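
The rlwinm/add/not/srwi sequence used by ip_fast_csum and csum_tcpudp_magic above folds the 32-bit accumulator into the final 16-bit checksum in a handful of instructions: rotating the sum by 16 bits and adding performs the end-around-carry fold, leaving the folded value in the upper half. A C rendering of the same fold (illustrative helper name):

#include <stdint.h>

/* Equivalent of the PowerPC fold above: rotate by 16 and add (the carry of
 * the low half propagates into the high half, i.e. the end-around carry),
 * then complement and take the upper 16 bits. */
static uint16_t csum_fold32(uint32_t sum)
{
        uint32_t rot = (sum << 16) | (sum >> 16);

        return (uint16_t) (~(sum + rot) >> 16);
}
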
diff --git a/pfinet/linux-src/arch/sparc/lib/checksum.S b/pfinet/linux-src/arch/sparc/lib/checksum.S
new file mode 100644
index 00000000..d02b6dfb
--- /dev/null
+++ b/pfinet/linux-src/arch/sparc/lib/checksum.S
@@ -0,0 +1,581 @@
+/* checksum.S: Sparc optimized checksum code.
+ *
+ * Copyright(C) 1995 Linus Torvalds
+ * Copyright(C) 1995 Miguel de Icaza
+ * Copyright(C) 1996 David S. Miller
+ * Copyright(C) 1997 Jakub Jelinek
+ *
+ * derived from:
+ * Linux/Alpha checksum c-code
+ * Linux/ix86 inline checksum assembly
+ * RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
+ * David Mosberger-Tang for optimized reference c-code
+ * BSD4.4 portable checksum routine
+ */
+
+#include <asm/cprefix.h>
+#include <asm/errno.h>
+
+#define CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5) \
+ ldd [buf + offset + 0x00], t0; \
+ ldd [buf + offset + 0x08], t2; \
+ addxcc t0, sum, sum; \
+ addxcc t1, sum, sum; \
+ ldd [buf + offset + 0x10], t4; \
+ addxcc t2, sum, sum; \
+ addxcc t3, sum, sum; \
+ ldd [buf + offset + 0x18], t0; \
+ addxcc t4, sum, sum; \
+ addxcc t5, sum, sum; \
+ addxcc t0, sum, sum; \
+ addxcc t1, sum, sum;
+
+#define CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3) \
+ ldd [buf - offset - 0x08], t0; \
+ ldd [buf - offset - 0x00], t2; \
+ addxcc t0, sum, sum; \
+ addxcc t1, sum, sum; \
+ addxcc t2, sum, sum; \
+ addxcc t3, sum, sum;
+
+ /* Do end cruft out of band to get better cache patterns. */
+csum_partial_end_cruft:
+ be 1f ! caller asks %o1 & 0x8
+ andcc %o1, 4, %g0 ! nope, check for word remaining
+ ldd [%o0], %g2 ! load two
+ addcc %g2, %o2, %o2 ! add first word to sum
+ addxcc %g3, %o2, %o2 ! add second word as well
+ add %o0, 8, %o0 ! advance buf ptr
+ addx %g0, %o2, %o2 ! add in final carry
+ andcc %o1, 4, %g0 ! check again for word remaining
+1: be 1f ! nope, skip this code
+ andcc %o1, 3, %o1 ! check for trailing bytes
+ ld [%o0], %g2 ! load it
+ addcc %g2, %o2, %o2 ! add to sum
+ add %o0, 4, %o0 ! advance buf ptr
+ addx %g0, %o2, %o2 ! add in final carry
+ andcc %o1, 3, %g0 ! check again for trailing bytes
+1: be 1f ! no trailing bytes, return
+ addcc %o1, -1, %g0 ! only one byte remains?
+ bne 2f ! at least two bytes more
+ subcc %o1, 2, %o1 ! only two bytes more?
+ b 4f ! only one byte remains
+ or %g0, %g0, %o4 ! clear fake hword value
+2: lduh [%o0], %o4 ! get hword
+ be 6f ! jmp if only hword remains
+ add %o0, 2, %o0 ! advance buf ptr either way
+ sll %o4, 16, %o4 ! create upper hword
+4: ldub [%o0], %o5 ! get final byte
+ sll %o5, 8, %o5 ! put into place
+ or %o5, %o4, %o4 ! coalesce with hword (if any)
+6: addcc %o4, %o2, %o2 ! add to sum
+1: retl ! get outta here
+ addx %g0, %o2, %o0 ! add final carry into retval
+
+ /* Also do alignment out of band to get better cache patterns. */
+csum_partial_fix_alignment:
+ cmp %o1, 6
+ bl cpte - 0x4
+ andcc %o0, 0x2, %g0
+ be 1f
+ andcc %o0, 0x4, %g0
+ lduh [%o0 + 0x00], %g2
+ sub %o1, 2, %o1
+ add %o0, 2, %o0
+ sll %g2, 16, %g2
+ addcc %g2, %o2, %o2
+ srl %o2, 16, %g3
+ addx %g0, %g3, %g2
+ sll %o2, 16, %o2
+ sll %g2, 16, %g3
+ srl %o2, 16, %o2
+ andcc %o0, 0x4, %g0
+ or %g3, %o2, %o2
+1: be cpa
+ andcc %o1, 0xffffff80, %o3
+ ld [%o0 + 0x00], %g2
+ sub %o1, 4, %o1
+ addcc %g2, %o2, %o2
+ add %o0, 4, %o0
+ addx %g0, %o2, %o2
+ b cpa
+ andcc %o1, 0xffffff80, %o3
+
+ /* The common case is to get called with a nicely aligned
+ * buffer of size 0x20. Follow the code path for that case.
+ */
+ .globl C_LABEL(csum_partial)
+C_LABEL(csum_partial): /* %o0=buf, %o1=len, %o2=sum */
+ andcc %o0, 0x7, %g0 ! alignment problems?
+ bne csum_partial_fix_alignment ! yep, handle it
+ sethi %hi(cpte - 8), %g7 ! prepare table jmp ptr
+ andcc %o1, 0xffffff80, %o3 ! num loop iterations
+cpa: be 3f ! none to do
+ andcc %o1, 0x70, %g1 ! clears carry flag too
+5: CSUM_BIGCHUNK(%o0, 0x00, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ CSUM_BIGCHUNK(%o0, 0x20, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ CSUM_BIGCHUNK(%o0, 0x40, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ CSUM_BIGCHUNK(%o0, 0x60, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
+ addx %g0, %o2, %o2 ! sink in final carry
+ subcc %o3, 128, %o3 ! detract from loop iters
+ bne 5b ! more to do
+ add %o0, 128, %o0 ! advance buf ptr
+ andcc %o1, 0x70, %g1 ! clears carry flag too
+3: be cpte ! nope
+ andcc %o1, 0xf, %g0 ! anything left at all?
+ srl %g1, 1, %o4 ! compute offset
+ sub %g7, %g1, %g7 ! adjust jmp ptr
+ sub %g7, %o4, %g7 ! final jmp ptr adjust
+ jmp %g7 + %lo(cpte - 8) ! enter the table
+ add %o0, %g1, %o0 ! advance buf ptr
+cptbl: CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3, %g4, %g5)
+ CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3, %g4, %g5)
+ addx %g0, %o2, %o2 ! fetch final carry
+ andcc %o1, 0xf, %g0 ! anything left at all?
+cpte: bne csum_partial_end_cruft ! yep, handle it
+ andcc %o1, 8, %g0 ! check how much
+cpout: retl ! get outta here
+ mov %o2, %o0 ! return computed csum
+
+ .globl C_LABEL(__csum_partial_copy_start), C_LABEL(__csum_partial_copy_end)
+C_LABEL(__csum_partial_copy_start):
+
+#define EX(x,y,a,b,z) \
+98: x,y; \
+ .section .fixup,z##alloc,z##execinstr; \
+ .align 4; \
+99: ba 30f; \
+ a, b, %o3; \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4
+
+#define EX2(x,y,z) \
+98: x,y; \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word 98b, 30f; \
+ .text; \
+ .align 4
+
+#define EX3(x,y,z) \
+98: x,y; \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word 98b, 96f; \
+ .text; \
+ .align 4
+
+#define EXT(start,end,handler,z) \
+ .section __ex_table,z##alloc; \
+ .align 4; \
+ .word start, 0, end, handler; \
+ .text; \
+ .align 4
+
+ /* This aligned version executes typically in 8.5 superscalar cycles, this
+ * is the best I can do. I say 8.5 because the final add will pair with
+ * the next ldd in the main unrolled loop. Thus the pipe is always full.
+ * If you change these macros (including order of instructions),
+ * please check the fixup code below as well.
+ */
+#define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [src + off + 0x00], t0; \
+ ldd [src + off + 0x08], t2; \
+ addxcc t0, sum, sum; \
+ ldd [src + off + 0x10], t4; \
+ addxcc t1, sum, sum; \
+ ldd [src + off + 0x18], t6; \
+ addxcc t2, sum, sum; \
+ std t0, [dst + off + 0x00]; \
+ addxcc t3, sum, sum; \
+ std t2, [dst + off + 0x08]; \
+ addxcc t4, sum, sum; \
+ std t4, [dst + off + 0x10]; \
+ addxcc t5, sum, sum; \
+ std t6, [dst + off + 0x18]; \
+ addxcc t6, sum, sum; \
+ addxcc t7, sum, sum;
+
+ /* 12 superscalar cycles seems to be the limit for this case,
+ * because of this we thus do all the ldd's together to get
+ * Viking MXCC into streaming mode. Ho hum...
+ */
+#define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \
+ ldd [src + off + 0x00], t0; \
+ ldd [src + off + 0x08], t2; \
+ ldd [src + off + 0x10], t4; \
+ ldd [src + off + 0x18], t6; \
+ st t0, [dst + off + 0x00]; \
+ addxcc t0, sum, sum; \
+ st t1, [dst + off + 0x04]; \
+ addxcc t1, sum, sum; \
+ st t2, [dst + off + 0x08]; \
+ addxcc t2, sum, sum; \
+ st t3, [dst + off + 0x0c]; \
+ addxcc t3, sum, sum; \
+ st t4, [dst + off + 0x10]; \
+ addxcc t4, sum, sum; \
+ st t5, [dst + off + 0x14]; \
+ addxcc t5, sum, sum; \
+ st t6, [dst + off + 0x18]; \
+ addxcc t6, sum, sum; \
+ st t7, [dst + off + 0x1c]; \
+ addxcc t7, sum, sum;
+
+ /* Yuck, 6 superscalar cycles... */
+#define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3) \
+ ldd [src - off - 0x08], t0; \
+ ldd [src - off - 0x00], t2; \
+ addxcc t0, sum, sum; \
+ st t0, [dst - off - 0x08]; \
+ addxcc t1, sum, sum; \
+ st t1, [dst - off - 0x04]; \
+ addxcc t2, sum, sum; \
+ st t2, [dst - off - 0x00]; \
+ addxcc t3, sum, sum; \
+ st t3, [dst - off + 0x04];
+
+ /* Handle the end cruft code out of band for better cache patterns. */
+cc_end_cruft:
+ be 1f
+ andcc %o3, 4, %g0
+ EX(ldd [%o0 + 0x00], %g2, and %o3, 0xf,#)
+ add %o1, 8, %o1
+ addcc %g2, %g7, %g7
+ add %o0, 8, %o0
+ addxcc %g3, %g7, %g7
+ EX2(st %g2, [%o1 - 0x08],#)
+ addx %g0, %g7, %g7
+ andcc %o3, 4, %g0
+ EX2(st %g3, [%o1 - 0x04],#)
+1: be 1f
+ andcc %o3, 3, %o3
+ EX(ld [%o0 + 0x00], %g2, add %o3, 4,#)
+ add %o1, 4, %o1
+ addcc %g2, %g7, %g7
+ EX2(st %g2, [%o1 - 0x04],#)
+ addx %g0, %g7, %g7
+ andcc %o3, 3, %g0
+ add %o0, 4, %o0
+1: be 1f
+ addcc %o3, -1, %g0
+ bne 2f
+ subcc %o3, 2, %o3
+ b 4f
+ or %g0, %g0, %o4
+2: EX(lduh [%o0 + 0x00], %o4, add %o3, 2,#)
+ add %o0, 2, %o0
+ EX2(sth %o4, [%o1 + 0x00],#)
+ be 6f
+ add %o1, 2, %o1
+ sll %o4, 16, %o4
+4: EX(ldub [%o0 + 0x00], %o5, add %g0, 1,#)
+ EX2(stb %o5, [%o1 + 0x00],#)
+ sll %o5, 8, %o5
+ or %o5, %o4, %o4
+6: addcc %o4, %g7, %g7
+1: retl
+ addx %g0, %g7, %o0
+
+ /* Also, handle the alignment code out of band. */
+cc_dword_align:
+ cmp %g1, 6
+ bl,a ccte
+ andcc %g1, 0xf, %o3
+ andcc %o0, 0x1, %g0
+ bne ccslow
+ andcc %o0, 0x2, %g0
+ be 1f
+ andcc %o0, 0x4, %g0
+ EX(lduh [%o0 + 0x00], %g4, add %g1, 0,#)
+ sub %g1, 2, %g1
+ EX2(sth %g4, [%o1 + 0x00],#)
+ add %o0, 2, %o0
+ sll %g4, 16, %g4
+ addcc %g4, %g7, %g7
+ add %o1, 2, %o1
+ srl %g7, 16, %g3
+ addx %g0, %g3, %g4
+ sll %g7, 16, %g7
+ sll %g4, 16, %g3
+ srl %g7, 16, %g7
+ andcc %o0, 0x4, %g0
+ or %g3, %g7, %g7
+1: be 3f
+ andcc %g1, 0xffffff80, %g0
+ EX(ld [%o0 + 0x00], %g4, add %g1, 0,#)
+ sub %g1, 4, %g1
+ EX2(st %g4, [%o1 + 0x00],#)
+ add %o0, 4, %o0
+ addcc %g4, %g7, %g7
+ add %o1, 4, %o1
+ addx %g0, %g7, %g7
+ b 3f
+ andcc %g1, 0xffffff80, %g0
+
+ /* Sun, you just can't beat me, you just can't. Stop trying,
+ * give up. I'm serious, I am going to kick the living shit
+ * out of you, game over, lights out.
+ */
+ .align 8
+ .globl C_LABEL(__csum_partial_copy_sparc_generic)
+C_LABEL(__csum_partial_copy_sparc_generic):
+ /* %o0=src, %o1=dest, %g1=len, %g7=sum */
+ xor %o0, %o1, %o4 ! get changing bits
+ andcc %o4, 3, %g0 ! check for mismatched alignment
+ bne ccslow ! better this than unaligned/fixups
+ andcc %o0, 7, %g0 ! need to align things?
+ bne cc_dword_align ! yes, we check for short lengths there
+ andcc %g1, 0xffffff80, %g0 ! can we use unrolled loop?
+3: be 3f ! nope, less than one loop remains
+ andcc %o1, 4, %g0 ! dest aligned on 4 or 8 byte boundary?
+ be ccdbl + 4 ! 8 byte aligned, kick ass
+5: CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+10: EXT(5b, 10b, 20f,#) ! note for exception handling
+ sub %g1, 128, %g1 ! detract from length
+ addx %g0, %g7, %g7 ! add in last carry bit
+ andcc %g1, 0xffffff80, %g0 ! more to csum?
+ add %o0, 128, %o0 ! advance src ptr
+ bne 5b ! we did not go negative, continue looping
+ add %o1, 128, %o1 ! advance dest ptr
+3: andcc %g1, 0x70, %o2 ! can use table?
+ccmerge:be ccte ! nope, go and check for end cruft
+ andcc %g1, 0xf, %o3 ! get low bits of length (clears carry btw)
+ srl %o2, 1, %o4 ! begin negative offset computation
+ sethi %hi(12f), %o5 ! set up table ptr end
+ add %o0, %o2, %o0 ! advance src ptr
+ sub %o5, %o4, %o5 ! continue table calculation
+ sll %o2, 1, %g2 ! constant multiplies are fun...
+ sub %o5, %g2, %o5 ! some more adjustments
+ jmp %o5 + %lo(12f) ! jump into it, duff style, wheee...
+ add %o1, %o2, %o1 ! advance dest ptr (carry is clear btw)
+cctbl: CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5)
+ CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5)
+12: EXT(cctbl, 12b, 22f,#) ! note for exception table handling
+ addx %g0, %g7, %g7
+ andcc %o3, 0xf, %g0 ! check for low bits set
+ccte: bne cc_end_cruft ! something left, handle it out of band
+ andcc %o3, 8, %g0 ! begin checks for that code
+ retl ! return
+ mov %g7, %o0 ! give em the computed checksum
+ccdbl: CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+ CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
+11: EXT(ccdbl, 11b, 21f,#) ! note for exception table handling
+ sub %g1, 128, %g1 ! detract from length
+ addx %g0, %g7, %g7 ! add in last carry bit
+ andcc %g1, 0xffffff80, %g0 ! more to csum?
+ add %o0, 128, %o0 ! advance src ptr
+ bne ccdbl ! we did not go negative, continue looping
+ add %o1, 128, %o1 ! advance dest ptr
+ b ccmerge ! finish it off, above
+ andcc %g1, 0x70, %o2 ! can use table? (clears carry btw)
+
+ccslow: cmp %g1, 0
+ mov 0, %g5
+ bleu 4f
+ andcc %o0, 1, %o5
+ be,a 1f
+ srl %g1, 1, %g4
+ sub %g1, 1, %g1
+ EX(ldub [%o0], %g5, add %g1, 1,#)
+ add %o0, 1, %o0
+ EX2(stb %g5, [%o1],#)
+ srl %g1, 1, %g4
+ add %o1, 1, %o1
+1: cmp %g4, 0
+ be,a 3f
+ andcc %g1, 1, %g0
+ andcc %o0, 2, %g0
+ be,a 1f
+ srl %g4, 1, %g4
+ EX(lduh [%o0], %o4, add %g1, 0,#)
+ sub %g1, 2, %g1
+ srl %o4, 8, %g2
+ sub %g4, 1, %g4
+ EX2(stb %g2, [%o1],#)
+ add %o4, %g5, %g5
+ EX2(stb %o4, [%o1 + 1],#)
+ add %o0, 2, %o0
+ srl %g4, 1, %g4
+ add %o1, 2, %o1
+1: cmp %g4, 0
+ be,a 2f
+ andcc %g1, 2, %g0
+ EX3(ld [%o0], %o4,#)
+5: srl %o4, 24, %g2
+ srl %o4, 16, %g3
+ EX2(stb %g2, [%o1],#)
+ srl %o4, 8, %g2
+ EX2(stb %g3, [%o1 + 1],#)
+ add %o0, 4, %o0
+ EX2(stb %g2, [%o1 + 2],#)
+ addcc %o4, %g5, %g5
+ EX2(stb %o4, [%o1 + 3],#)
+	 addx	%g5, %g0, %g5	! too lazy to optimize this further (questionable
+	 add	%o1, 4, %o1	! whether it would be worth it); maybe some day,
+	 subcc	%g4, 1, %g4	! with the sll/srl tricks
+ bne,a 5b
+ EX3(ld [%o0], %o4,#)
+ sll %g5, 16, %g2
+ srl %g5, 16, %g5
+ srl %g2, 16, %g2
+ andcc %g1, 2, %g0
+ add %g2, %g5, %g5
+2: be,a 3f
+ andcc %g1, 1, %g0
+ EX(lduh [%o0], %o4, and %g1, 3,#)
+ andcc %g1, 1, %g0
+ srl %o4, 8, %g2
+ add %o0, 2, %o0
+ EX2(stb %g2, [%o1],#)
+ add %g5, %o4, %g5
+ EX2(stb %o4, [%o1 + 1],#)
+ add %o1, 2, %o1
+3: be,a 1f
+ sll %g5, 16, %o4
+ EX(ldub [%o0], %g2, add %g0, 1,#)
+ sll %g2, 8, %o4
+ EX2(stb %g2, [%o1],#)
+ add %g5, %o4, %g5
+ sll %g5, 16, %o4
+1: addcc %o4, %g5, %g5
+ srl %g5, 16, %o4
+ addx %g0, %o4, %g5
+ orcc %o5, %g0, %g0
+ be 4f
+ srl %g5, 8, %o4
+ and %g5, 0xff, %g2
+ and %o4, 0xff, %o4
+ sll %g2, 8, %g2
+ or %g2, %o4, %g5
+4: addcc %g7, %g5, %g7
+ retl
+ addx %g0, %g7, %o0
+C_LABEL(__csum_partial_copy_end):
+
+/* We do these strange calculations for the csum_*_from_user case only, i.e.
+ * we only bother with faults on loads... */
+
+/* o2 = ((g2%20)&3)*8
+ * o3 = g1 - (g2/20)*32 - o2 */
+20:
+ cmp %g2, 20
+ blu,a 1f
+ and %g2, 3, %o2
+ sub %g1, 32, %g1
+ b 20b
+ sub %g2, 20, %g2
+1:
+ sll %o2, 3, %o2
+ b 31f
+ sub %g1, %o2, %o3
+
+/* o2 = (!(g2 & 15) ? 0 : (((g2 & 15) + 1) & ~1)*8)
+ * o3 = g1 - (g2/16)*32 - o2 */
+21:
+ andcc %g2, 15, %o3
+ srl %g2, 4, %g2
+ be,a 1f
+ clr %o2
+ add %o3, 1, %o3
+ and %o3, 14, %o3
+ sll %o3, 3, %o2
+1:
+ sll %g2, 5, %g2
+ sub %g1, %g2, %o3
+ b 31f
+ sub %o3, %o2, %o3
+
+/* o0 += (g2/10)*16 - 0x70
+ * o1 += (g2/10)*16 - 0x70
+ * o2 = (g2 % 10) ? 8 : 0
+ * o3 += 0x70 - (g2/10)*16 - o2 */
+22:
+ cmp %g2, 10
+ blu,a 1f
+ sub %o0, 0x70, %o0
+ add %o0, 16, %o0
+ add %o1, 16, %o1
+ sub %o3, 16, %o3
+ b 22b
+ sub %g2, 10, %g2
+1:
+ sub %o1, 0x70, %o1
+ add %o3, 0x70, %o3
+ clr %o2
+ tst %g2
+ bne,a 1f
+ mov 8, %o2
+1:
+ b 31f
+ sub %o3, %o2, %o3
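The three fixup calculations above (labels 20, 21 and 22) are easier to follow when written out in C. A sketch only: g2 is the value the exception table hands back (see the EXT() annotations in the copy loops), g1 the remaining length; o2 ends up as the number of bytes that were loaded but not yet stored, o3 as the number of destination bytes left to zero. The function names are invented for illustration and mirror the comment formulas verbatim.

static void fixup_bigchunk(long g2, long g1, long *o2, long *o3)
{
        /* label 20: fault inside the CSUMCOPY_BIGCHUNK loop */
        *o2 = ((g2 % 20) & 3) * 8;
        *o3 = g1 - (g2 / 20) * 32 - *o2;
}

static void fixup_bigchunk_aligned(long g2, long g1, long *o2, long *o3)
{
        /* label 21: fault inside the CSUMCOPY_BIGCHUNK_ALIGNED loop */
        *o2 = (g2 & 15) ? (((g2 & 15) + 1) & ~1L) * 8 : 0;
        *o3 = g1 - (g2 / 16) * 32 - *o2;
}

static void fixup_lastchunk(long g2, long *o0, long *o1, long *o2, long *o3)
{
        /* label 22: fault inside a CSUMCOPY_LASTCHUNK entry */
        long adv = (g2 / 10) * 16 - 0x70;
        *o0 += adv;                     /* advance src */
        *o1 += adv;                     /* advance dst */
        *o2  = (g2 % 10) ? 8 : 0;
        *o3 += 0x70 - (g2 / 10) * 16 - *o2;
}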
+96:
+ and %g1, 3, %g1
+ sll %g4, 2, %g4
+ add %g1, %g4, %o3
+30:
+/* %o1 is dst
+ * %o3 is # bytes to zero out
+ * %o4 is faulting address
+ * %o5 is %pc where fault occurred */
+ clr %o2
+31:
+/* %o0 is src
+ * %o1 is dst
+ * %o2 is # of bytes to copy from src to dst
+ * %o3 is # bytes to zero out
+ * %o4 is faulting address
+ * %o5 is %pc where fault occurred */
+ save %sp, -104, %sp
+ mov %i5, %o0
+ mov %i7, %o1
+ mov %i4, %o2
+ call C_LABEL(lookup_fault)
+ mov %g7, %i4
+ cmp %o0, 2
+ bne 1f
+ add %g0, -EFAULT, %i5
+ tst %i2
+ be 2f
+ mov %i0, %o1
+ mov %i1, %o0
+5:
+ call C_LABEL(__memcpy)
+ mov %i2, %o2
+ tst %o0
+ bne,a 2f
+ add %i3, %i2, %i3
+ add %i1, %i2, %i1
+2:
+ mov %i1, %o0
+6:
+ call C_LABEL(__bzero)
+ mov %i3, %o1
+1:
+ ld [%sp + 168], %o2 ! struct_ptr of parent
+ st %i5, [%o2]
+ ret
+ restore
+
+ .section __ex_table,#alloc
+ .align 4
+ .word 5b,2
+ .word 6b,2
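The fixup entry point at label 31 above boils down to: flush the bytes that were already loaded safely, zero the remainder of the destination, and report -EFAULT through the caller-supplied pointer. A loose C sketch of that behaviour; csum_copy_fixup and its parameter names are invented for illustration.

#include <string.h>
#include <errno.h>

/* Sketch only: the recovery performed after a faulting load.  'copy' is
 * the number of already-loaded bytes still to be written out, 'zero' the
 * number of destination bytes to clear, 'err_ptr' the caller's error
 * cell (all names hypothetical). */
static void csum_copy_fixup(const void *src, void *dst,
                            size_t copy, size_t zero, int *err_ptr)
{
        if (copy)
                memcpy(dst, src, copy);         /* flush the safe part */
        memset((char *) dst + copy, 0, zero);   /* zero what was never read */
        if (err_ptr)
                *err_ptr = -EFAULT;             /* signal the fault */
}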
diff --git a/pfinet/linux-src/arch/sparc64/lib/checksum.S b/pfinet/linux-src/arch/sparc64/lib/checksum.S
new file mode 100644
index 00000000..ea732b36
--- /dev/null
+++ b/pfinet/linux-src/arch/sparc64/lib/checksum.S
@@ -0,0 +1,278 @@
+/* checksum.S: Sparc V9 optimized checksum code.
+ *
+ * Copyright(C) 1995 Linus Torvalds
+ * Copyright(C) 1995 Miguel de Icaza
+ * Copyright(C) 1996 David S. Miller
+ * Copyright(C) 1997 Jakub Jelinek
+ *
+ * derived from:
+ * Linux/Alpha checksum c-code
+ * Linux/ix86 inline checksum assembly
+ *	RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
+ * David Mosberger-Tang for optimized reference c-code
+ * BSD4.4 portable checksum routine
+ */
+
+#include <asm/errno.h>
+#include <asm/head.h>
+#include <asm/ptrace.h>
+#include <asm/asi.h>
+#include <asm/page.h>
+
+	/* The problem with the "add with carry" instructions on Ultra
+	 * is twofold.  First, they cannot pair with any other
+	 * instruction, and second, they only add the 32-bit carry
+	 * condition bit into the accumulated sum.  The following is
+	 * much better.  For larger chunks we use VIS code, which is
+	 * faster ;)
+	 */
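In C terms, the approach described in the comment above amounts to adding 64-bit words with an explicit compare-after-add carry step instead of relying on the carry chain, then folding down to 32 bits at the end. A minimal sketch, assuming a 64-bit unsigned long as on sparc64; csum_words64 is an invented name.

/* Sketch only: accumulate 64-bit words, propagating the carry by hand
 * the way the addcc / bcc,pt / add sequences below do, then fold the
 * 64-bit total down to 32 bits. */
static unsigned int csum_words64(const unsigned long *p, int nwords,
                                 unsigned long sum)
{
        while (nwords--) {
                unsigned long w = *p++;
                sum += w;
                if (sum < w)                    /* carry out of bit 63 */
                        sum++;
        }
        sum = (sum & 0xffffffffUL) + (sum >> 32);   /* fold 64 -> 32 */
        sum = (sum & 0xffffffffUL) + (sum >> 32);   /* fold in any carry */
        return (unsigned int) sum;
}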
+
+#define src o0
+#define dst o1
+#define len o2
+#define sum o3
+
+ .text
+	/* Once again this handily beats the SunSoft implementation
+	 * of the same routine.
+	 */
+
+#define CSUMCOPY_LASTCHUNK(off, t0, t1) \
+ ldxa [%src - off - 0x08] %asi, t0; \
+ ldxa [%src - off - 0x00] %asi, t1; \
+ nop; nop; \
+ addcc t0, %sum, %sum; \
+ stw t0, [%dst - off - 0x04]; \
+ srlx t0, 32, t0; \
+ bcc,pt %xcc, 51f; \
+ stw t0, [%dst - off - 0x08]; \
+ add %sum, 1, %sum; \
+51: addcc t1, %sum, %sum; \
+ stw t1, [%dst - off + 0x04]; \
+ srlx t1, 32, t1; \
+ bcc,pt %xcc, 52f; \
+ stw t1, [%dst - off - 0x00]; \
+ add %sum, 1, %sum; \
+52:
+
+cpc_start:
+cc_end_cruft:
+ andcc %g7, 8, %g0 ! IEU1 Group
+ be,pn %icc, 1f ! CTI
+ and %g7, 4, %g5 ! IEU0
+ ldxa [%src + 0x00] %asi, %g2 ! Load Group
+ add %dst, 8, %dst ! IEU0
+ add %src, 8, %src ! IEU1
+ addcc %g2, %sum, %sum ! IEU1 Group + 2 bubbles
+ stw %g2, [%dst - 0x04] ! Store
+ srlx %g2, 32, %g2 ! IEU0
+ bcc,pt %xcc, 1f ! CTI Group
+ stw %g2, [%dst - 0x08] ! Store
+ add %sum, 1, %sum ! IEU0
+1: brz,pt %g5, 1f ! CTI Group
+ clr %g2 ! IEU0
+ lduwa [%src + 0x00] %asi, %g2 ! Load
+ add %dst, 4, %dst ! IEU0 Group
+ add %src, 4, %src ! IEU1
+ stw %g2, [%dst - 0x04] ! Store Group + 2 bubbles
+ sllx %g2, 32, %g2 ! IEU0
+1: andcc %g7, 2, %g0 ! IEU1
+ be,pn %icc, 1f ! CTI Group
+ clr %o4 ! IEU1
+ lduha [%src + 0x00] %asi, %o4 ! Load
+ add %src, 2, %src ! IEU0 Group
+ add %dst, 2, %dst ! IEU1
+ sth %o4, [%dst - 0x2] ! Store Group + 2 bubbles
+ sll %o4, 16, %o4 ! IEU0
+1: andcc %g7, 1, %g0 ! IEU1
+ be,pn %icc, 1f ! CTI Group
+ clr %o5 ! IEU0
+ lduba [%src + 0x00] %asi, %o5 ! Load
+ stb %o5, [%dst + 0x00] ! Store Group + 2 bubbles
+ sll %o5, 8, %o5 ! IEU0
+1: or %g2, %o4, %o4 ! IEU1
+ or %o5, %o4, %o4 ! IEU0 Group
+ addcc %o4, %sum, %sum ! IEU1
+ bcc,pt %xcc, ccfold ! CTI
+ sethi %uhi(PAGE_OFFSET), %g4 ! IEU0 Group
+ b,pt %xcc, ccfold ! CTI
+ add %sum, 1, %sum ! IEU1
+
+cc_fixit:
+ cmp %len, 6 ! IEU1 Group
+ bl,a,pn %icc, ccte ! CTI
+ andcc %len, 0xf, %g7 ! IEU1 Group
+ andcc %src, 2, %g0 ! IEU1 Group
+ be,pn %icc, 1f ! CTI
+ andcc %src, 0x4, %g0 ! IEU1 Group
+ lduha [%src + 0x00] %asi, %g4 ! Load
+ sub %len, 2, %len ! IEU0
+ add %src, 2, %src ! IEU0 Group
+ add %dst, 2, %dst ! IEU1
+ sll %g4, 16, %g3 ! IEU0 Group + 1 bubble
+ addcc %g3, %sum, %sum ! IEU1
+ bcc,pt %xcc, 0f ! CTI
+ srl %sum, 16, %g3 ! IEU0 Group
+ add %g3, 1, %g3 ! IEU0 4 clocks (mispredict)
+0: andcc %src, 0x4, %g0 ! IEU1 Group
+ sth %g4, [%dst - 0x2] ! Store
+ sll %sum, 16, %sum ! IEU0
+ sll %g3, 16, %g3 ! IEU0 Group
+ srl %sum, 16, %sum ! IEU0 Group
+ or %g3, %sum, %sum ! IEU0 Group (regdep)
+1: be,pt %icc, ccmerge ! CTI
+ andcc %len, 0xf0, %g1 ! IEU1
+ lduwa [%src + 0x00] %asi, %g4 ! Load Group
+ sub %len, 4, %len ! IEU0
+ add %src, 4, %src ! IEU1
+ add %dst, 4, %dst ! IEU0 Group
+ addcc %g4, %sum, %sum ! IEU1 Group + 1 bubble
+ stw %g4, [%dst - 0x4] ! Store
+ bcc,pt %xcc, ccmerge ! CTI
+ andcc %len, 0xf0, %g1 ! IEU1 Group
+ b,pt %xcc, ccmerge ! CTI 4 clocks (mispredict)
+ add %sum, 1, %sum ! IEU0
+
+ .align 32
+ .globl csum_partial_copy_sparc64
+csum_partial_copy_sparc64: /* %o0=src, %o1=dest, %o2=len, %o3=sum */
+ xorcc %src, %dst, %o4 ! IEU1 Group
+ srl %sum, 0, %sum ! IEU0
+ andcc %o4, 3, %g0 ! IEU1 Group
+ srl %len, 0, %len ! IEU0
+ bne,pn %icc, ccslow ! CTI
+ andcc %src, 1, %g0 ! IEU1 Group
+ bne,pn %icc, ccslow ! CTI
+ cmp %len, 256 ! IEU1 Group
+ bgeu,pt %icc, csum_partial_copy_vis ! CTI
+ andcc %src, 7, %g0 ! IEU1 Group
+ bne,pn %icc, cc_fixit ! CTI
+ andcc %len, 0xf0, %g1 ! IEU1 Group
+ccmerge:be,pn %icc, ccte ! CTI
+ andcc %len, 0xf, %g7 ! IEU1 Group
+ sll %g1, 2, %o4 ! IEU0
+13: sethi %hi(12f), %o5 ! IEU0 Group
+ add %src, %g1, %src ! IEU1
+ sub %o5, %o4, %o5 ! IEU0 Group
+ jmpl %o5 + %lo(12f), %g0 ! CTI Group brk forced
+ add %dst, %g1, %dst ! IEU0 Group
+cctbl: CSUMCOPY_LASTCHUNK(0xe8,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0xd8,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0xc8,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0xb8,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0xa8,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x98,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x88,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x78,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x68,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x58,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x48,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x38,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x28,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x18,%g2,%g3)
+ CSUMCOPY_LASTCHUNK(0x08,%g2,%g3)
+12:
+ andcc %len, 0xf, %g7 ! IEU1 Group
+ccte: bne,pn %icc, cc_end_cruft ! CTI
+ sethi %uhi(PAGE_OFFSET), %g4 ! IEU0
+ccfold: sllx %sum, 32, %o0 ! IEU0 Group
+ addcc %sum, %o0, %o0 ! IEU1 Group (regdep)
+ srlx %o0, 32, %o0 ! IEU0 Group (regdep)
+ bcs,a,pn %xcc, 1f ! CTI
+ add %o0, 1, %o0 ! IEU1 4 clocks (mispredict)
+1: retl ! CTI Group brk forced
+ sllx %g4, 32, %g4 ! IEU0 Group
+
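The ccfold sequence just above performs the standard 64-to-32-bit fold with a shift: adding sum to sum << 32 leaves low32 + high32 in the upper half, which is then taken and the end-around carry added back in. As C, assuming a 64-bit unsigned long; csum_fold64 is an invented name.

/* Sketch only: what ccfold computes -- (low32 + high32) of the running
 * sum, with the carry out of that addition folded back in. */
static unsigned int csum_fold64(unsigned long sum)
{
        unsigned long t = sum + (sum << 32);    /* upper half = lo + hi */
        unsigned int r = (unsigned int) (t >> 32);

        if (t < sum)                            /* carry out of bit 63 */
                r += 1;
        return r;
}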
+ccslow: mov 0, %g5
+ brlez,pn %len, 4f
+ andcc %src, 1, %o5
+ be,a,pt %icc, 1f
+ srl %len, 1, %g7
+ sub %len, 1, %len
+ lduba [%src] %asi, %g5
+ add %src, 1, %src
+ stb %g5, [%dst]
+ srl %len, 1, %g7
+ add %dst, 1, %dst
+1: brz,a,pn %g7, 3f
+ andcc %len, 1, %g0
+ andcc %src, 2, %g0
+ be,a,pt %icc, 1f
+ srl %g7, 1, %g7
+ lduha [%src] %asi, %o4
+ sub %len, 2, %len
+ srl %o4, 8, %g2
+ sub %g7, 1, %g7
+ stb %g2, [%dst]
+ add %o4, %g5, %g5
+ stb %o4, [%dst + 1]
+ add %src, 2, %src
+ srl %g7, 1, %g7
+ add %dst, 2, %dst
+1: brz,a,pn %g7, 2f
+ andcc %len, 2, %g0
+ lduwa [%src] %asi, %o4
+5: srl %o4, 24, %g2
+ srl %o4, 16, %g3
+ stb %g2, [%dst]
+ srl %o4, 8, %g2
+ stb %g3, [%dst + 1]
+ add %src, 4, %src
+ stb %g2, [%dst + 2]
+ addcc %o4, %g5, %g5
+ stb %o4, [%dst + 3]
+ addc %g5, %g0, %g5
+ add %dst, 4, %dst
+ subcc %g7, 1, %g7
+ bne,a,pt %icc, 5b
+ lduwa [%src] %asi, %o4
+ sll %g5, 16, %g2
+ srl %g5, 16, %g5
+ srl %g2, 16, %g2
+ andcc %len, 2, %g0
+ add %g2, %g5, %g5
+2: be,a,pt %icc, 3f
+ andcc %len, 1, %g0
+ lduha [%src] %asi, %o4
+ andcc %len, 1, %g0
+ srl %o4, 8, %g2
+ add %src, 2, %src
+ stb %g2, [%dst]
+ add %g5, %o4, %g5
+ stb %o4, [%dst + 1]
+ add %dst, 2, %dst
+3: be,a,pt %icc, 1f
+ sll %g5, 16, %o4
+ lduba [%src] %asi, %g2
+ sll %g2, 8, %o4
+ stb %g2, [%dst]
+ add %g5, %o4, %g5
+ sll %g5, 16, %o4
+1: addcc %o4, %g5, %g5
+ srl %g5, 16, %o4
+ addc %g0, %o4, %g5
+ brz,pt %o5, 4f
+ srl %g5, 8, %o4
+ and %g5, 0xff, %g2
+ and %o4, 0xff, %o4
+ sll %g2, 8, %g2
+ or %g2, %o4, %g5
+4: addcc %sum, %g5, %sum
+ addc %g0, %sum, %o0
+ retl
+ srl %o0, 0, %o0
+cpc_end:
+
+ .globl cpc_handler
+cpc_handler:
+ ldx [%sp + 0x7ff + 128], %g1
+ sub %g0, EFAULT, %g2
+ brnz,a,pt %g1, 1f
+ st %g2, [%g1]
+1: sethi %uhi(PAGE_OFFSET), %g4
+ retl
+ sllx %g4, 32, %g4
+
+ .section __ex_table
+ .align 4
+ .word cpc_start, 0, cpc_end, cpc_handler
+