author    | Roland McGrath <roland@gnu.org> | 2000-02-04 03:21:18 +0000
committer | Roland McGrath <roland@gnu.org> | 2000-02-04 03:21:18 +0000
commit    | 9fd51e9b0ad33a89a83fdbbb66bd20d85f7893fb (patch)
tree      | 8845b79f170028cb4380045c50277bbf075b5b7d | /pfinet/linux-src/arch
Import of Linux 2.2.12 subset (ipv4 stack and related)
Diffstat (limited to 'pfinet/linux-src/arch')
-rw-r--r-- | pfinet/linux-src/arch/alpha/lib/checksum.c    | 169
-rw-r--r-- | pfinet/linux-src/arch/arm/lib/checksum.S      | 730
-rw-r--r-- | pfinet/linux-src/arch/i386/lib/checksum.S     | 447
-rw-r--r-- | pfinet/linux-src/arch/i386/lib/old-checksum.c |  19
-rw-r--r-- | pfinet/linux-src/arch/m68k/lib/checksum.c     | 420
-rw-r--r-- | pfinet/linux-src/arch/ppc/lib/checksum.S      | 194
-rw-r--r-- | pfinet/linux-src/arch/sparc/lib/checksum.S    | 581
-rw-r--r-- | pfinet/linux-src/arch/sparc64/lib/checksum.S  | 278
8 files changed, 2838 insertions, 0 deletions
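
All eight files below implement the same RFC 1071 one's-complement Internet checksum, each hand-tuned for its architecture: a 64-bit accumulator on Alpha, unrolled add-with-carry loops in ARM, i386, and SPARC assembly, and m68k inline asm. For orientation only, here is a minimal, unoptimized sketch of that algorithm in portable C; the function name `internet_checksum` is illustrative and does not appear in the imported sources.

```c
#include <stddef.h>
#include <stdint.h>

/* Illustrative, unoptimized RFC 1071 Internet checksum in portable C.
 * Sums the buffer as big-endian 16-bit words, folds the carries back
 * into the low 16 bits, and returns the one's complement.  A sketch
 * only: very long buffers would need a wider accumulator or periodic
 * folding, which is what the architecture-specific routines do. */
static uint16_t internet_checksum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	/* Accumulate 16-bit words in network byte order. */
	while (len > 1) {
		sum += ((uint32_t) buf[0] << 8) | buf[1];
		buf += 2;
		len -= 2;
	}

	/* An odd trailing byte is treated as if padded with a zero byte. */
	if (len == 1)
		sum += (uint32_t) buf[0] << 8;

	/* Fold any carries above bit 15 back into the low 16 bits. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t) ~sum;
}
```

The imported routines instead accumulate in host byte order with wider registers and fold only at the end (see from64to16 in the Alpha file); RFC 1071's byte-order property is what lets them do so and still produce the correct checksum bytes.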
diff --git a/pfinet/linux-src/arch/alpha/lib/checksum.c b/pfinet/linux-src/arch/alpha/lib/checksum.c new file mode 100644 index 00000000..5165279f --- /dev/null +++ b/pfinet/linux-src/arch/alpha/lib/checksum.c @@ -0,0 +1,169 @@ +/* + * arch/alpha/lib/checksum.c + * + * This file contains network checksum routines that are better done + * in an architecture-specific manner due to speed.. + */ + +#include <linux/string.h> + +#include <asm/byteorder.h> + +static inline unsigned short from64to16(unsigned long x) +{ + /* add up 32-bit words for 33 bits */ + x = (x & 0xffffffff) + (x >> 32); + /* add up 16-bit and 17-bit words for 17+c bits */ + x = (x & 0xffff) + (x >> 16); + /* add up 16-bit and 2-bit for 16+c bit */ + x = (x & 0xffff) + (x >> 16); + /* add up carry.. */ + x = (x & 0xffff) + (x >> 16); + return x; +} + +/* + * computes the checksum of the TCP/UDP pseudo-header + * returns a 16-bit checksum, already complemented. + */ +unsigned short int csum_tcpudp_magic(unsigned long saddr, + unsigned long daddr, + unsigned short len, + unsigned short proto, + unsigned int sum) +{ + return ~from64to16(saddr + daddr + sum + + ((unsigned long) ntohs(len) << 16) + + ((unsigned long) proto << 8)); +} + +unsigned int csum_tcpudp_nofold(unsigned long saddr, + unsigned long daddr, + unsigned short len, + unsigned short proto, + unsigned int sum) +{ + unsigned long result; + + result = (saddr + daddr + sum + + ((unsigned long) ntohs(len) << 16) + + ((unsigned long) proto << 8)); + + /* Fold down to 32-bits so we don't loose in the typedef-less + network stack. */ + /* 64 to 33 */ + result = (result & 0xffffffff) + (result >> 32); + /* 33 to 32 */ + result = (result & 0xffffffff) + (result >> 32); + return result; +} + +/* + * Do a 64-bit checksum on an arbitrary memory area.. + * + * This isn't a great routine, but it's not _horrible_ either. The + * inner loop could be unrolled a bit further, and there are better + * ways to do the carry, but this is reasonable. + */ +static inline unsigned long do_csum(const unsigned char * buff, int len) +{ + int odd, count; + unsigned long result = 0; + + if (len <= 0) + goto out; + odd = 1 & (unsigned long) buff; + if (odd) { + result = *buff << 8; + len--; + buff++; + } + count = len >> 1; /* nr of 16-bit words.. */ + if (count) { + if (2 & (unsigned long) buff) { + result += *(unsigned short *) buff; + count--; + len -= 2; + buff += 2; + } + count >>= 1; /* nr of 32-bit words.. */ + if (count) { + if (4 & (unsigned long) buff) { + result += *(unsigned int *) buff; + count--; + len -= 4; + buff += 4; + } + count >>= 1; /* nr of 64-bit words.. */ + if (count) { + unsigned long carry = 0; + do { + unsigned long w = *(unsigned long *) buff; + count--; + buff += 8; + result += carry; + result += w; + carry = (w > result); + } while (count); + result += carry; + result = (result & 0xffffffff) + (result >> 32); + } + if (len & 4) { + result += *(unsigned int *) buff; + buff += 4; + } + } + if (len & 2) { + result += *(unsigned short *) buff; + buff += 2; + } + } + if (len & 1) + result += *buff; + result = from64to16(result); + if (odd) + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); +out: + return result; +} + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. 
+ */ +unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl) +{ + return ~do_csum(iph,ihl*4); +} + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 32-bit boundary + */ +unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) +{ + unsigned long result = do_csum(buff, len); + + /* add in old sum, and carry.. */ + result += sum; + /* 32+c bits -> 32 bits */ + result = (result & 0xffffffff) + (result >> 32); + return result; +} + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ +unsigned short ip_compute_csum(unsigned char * buff, int len) +{ + return ~from64to16(do_csum(buff,len)); +} diff --git a/pfinet/linux-src/arch/arm/lib/checksum.S b/pfinet/linux-src/arch/arm/lib/checksum.S new file mode 100644 index 00000000..bd5c78d3 --- /dev/null +++ b/pfinet/linux-src/arch/arm/lib/checksum.S @@ -0,0 +1,730 @@ +/* + * linux/arch/arm/lib/checksum.S + * + * Copyright (C) 1995, 1996, 1997, 1998 Russell King + */ +#include <linux/config.h> +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/errno.h> +#include "constants.h" + + .text + +/* Function: __u32 csum_partial(const char *src, int len, __u32) + * Params : r0 = buffer, r1 = len, r2 = checksum + * Returns : r0 = new checksum + */ + +ENTRY(csum_partial) + tst r0, #2 + beq 1f + subs r1, r1, #2 + addmi r1, r1, #2 + bmi 3f + bic r0, r0, #3 + ldr r3, [r0], #4 + adds r2, r2, r3, lsr #16 + adcs r2, r2, #0 +1: adds r2, r2, #0 + bics ip, r1, #31 + beq 3f + stmfd sp!, {r4 - r6} +2: ldmia r0!, {r3 - r6} + adcs r2, r2, r3 + adcs r2, r2, r4 + adcs r2, r2, r5 + adcs r2, r2, r6 + ldmia r0!, {r3 - r6} + adcs r2, r2, r3 + adcs r2, r2, r4 + adcs r2, r2, r5 + adcs r2, r2, r6 + sub ip, ip, #32 + teq ip, #0 + bne 2b + adcs r2, r2, #0 + ldmfd sp!, {r4 - r6} +3: ands ip, r1, #0x1c + beq 5f +4: ldr r3, [r0], #4 + adcs r2, r2, r3 + sub ip, ip, #4 + teq ip, #0 + bne 4b + adcs r2, r2, #0 +5: ands ip, r1, #3 + moveq r0, r2 + RETINSTR(moveq,pc,lr) + mov ip, ip, lsl #3 + rsb ip, ip, #32 + ldr r3, [r0] + mov r3, r3, lsl ip + adds r2, r2, r3, lsr ip + adc r0, r2, #0 + RETINSTR(mov,pc,lr) + +/* Function: __u32 csum_partial_copy_from_user (const char *src, char *dst, int len, __u32 sum, int *err_ptr) + * Params : r0 = src, r1 = dst, r2 = len, r3 = sum, [sp, #0] = &err + * Returns : r0 = checksum, [[sp, #0], #0] = 0 or -EFAULT + */ +#if defined(CONFIG_CPU_32) + + .macro save_regs + stmfd sp!, {r1 - r2, r4 - r8, fp, ip, lr, pc} + .endm + +#define LOAD_REGS(cond) \ + LOADREGS(##cond##ea,fp,{r1 - r2, r4 - r8, fp, sp, pc}) + + .macro load1b, reg1 +9999: ldrbt \reg1, [r0], $1 + .section __ex_table, "a" + .align 3 + .long 9999b, 6001f + .previous + .endm + + .macro load2b, reg1, reg2 +9999: ldrbt \reg1, [r0], $1 +9998: ldrbt \reg2, [r0], $1 + .section __ex_table, "a" + .long 9999b, 6001f + .long 9998b, 6001f + .previous + .endm + + .macro load1l, reg1 +9999: ldrt \reg1, [r0], $4 + .section __ex_table, "a" + .align 3 + .long 9999b, 6001f + .previous + .endm + + .macro load2l, reg1, reg2 +9999: ldrt \reg1, [r0], $4 +9998: ldrt \reg2, [r0], $4 + .section __ex_table, "a" + .long 9999b, 6001f + .long 9998b, 6001f + .previous + .endm + + .macro load4l, reg1, reg2, reg3, reg4 +9999: ldrt \reg1, [r0], $4 +9998: 
ldrt \reg2, [r0], $4 +9997: ldrt \reg3, [r0], $4 +9996: ldrt \reg4, [r0], $4 + .section __ex_table, "a" + .long 9999b, 6001f + .long 9998b, 6001f + .long 9997b, 6001f + .long 9996b, 6001f + .previous + .endm + +#elif defined(CONFIG_CPU_26) + + .macro save_regs + stmfd sp!, {r1 - r2, r4 - r9, fp, ip, lr, pc} + mov r9, sp, lsr #13 + mov r9, r9, lsl #13 + ldr r9, [r9, #TSK_ADDR_LIMIT] + mov r9, r9, lsr #24 + .endm + +#define LOAD_REGS(cond) \ + LOADREGS(##cond##ea,fp,{r1 - r2, r4 - r9, fp, sp, pc}) + + .macro load1b, reg1 + tst r9, #0x01 +9999: ldreqbt \reg1, [r0], #1 + ldrneb \reg1, [r0], #1 + .section __ex_table, "a" + .align 3 + .long 9999b, 6001f + .previous + .endm + + .macro load2b, reg1, reg2 + tst r9, #0x01 +9999: ldreqbt \reg1, [r0], #1 + ldrneb \reg1, [r0], #1 +9998: ldreqbt \reg2, [r0], #1 + ldrneb \reg2, [r0], #1 + .section __ex_table, "a" + .long 9999b, 6001f + .long 9998b, 6001f + .previous + .endm + + .macro load1l, reg1 + tst r9, #0x01 +9999: ldreqt \reg1, [r0], #4 + ldrne \reg1, [r0], #4 + .section __ex_table, "a" + .align 3 + .long 9999b, 6001f + .previous + .endm + + .macro load2l, reg1, reg2 + tst r9, #0x01 + ldmneia r0!, {\reg1, \reg2} +9999: ldreqt \reg1, [r0], #4 +9998: ldreqt \reg2, [r0], #4 + .section __ex_table, "a" + .long 9999b, 6001f + .long 9998b, 6001f + .previous + .endm + + .macro load4l, reg1, reg2, reg3, reg4 + tst r9, #0x01 + ldmneia r0!, {\reg1, \reg2, \reg3, \reg4} +9999: ldreqt \reg1, [r0], #4 +9998: ldreqt \reg2, [r0], #4 +9997: ldreqt \reg3, [r0], #4 +9996: ldreqt \reg4, [r0], #4 + .section __ex_table, "a" + .long 9999b, 6001f + .long 9998b, 6001f + .long 9997b, 6001f + .long 9996b, 6001f + .previous + .endm + +#else +#error Unknown CPU architecture +#endif + +ENTRY(csum_partial_copy_from_user) + mov ip, sp + save_regs + sub fp, ip, #4 + cmp r2, #4 + blt .too_small_user + tst r1, #2 @ Test destination alignment + beq .dst_aligned_user + subs r2, r2, #2 @ We do not know if SRC is aligned... 
+ load2b ip, r8 + orr ip, ip, r8, lsl #8 + adds r3, r3, ip + adcs r3, r3, #0 + strb ip, [r1], #1 + mov ip, ip, lsr #8 + strb ip, [r1], #1 @ Destination now aligned +.dst_aligned_user: + tst r0, #3 + bne .src_not_aligned_user + adds r3, r3, #0 + bics ip, r2, #15 @ Routine for src & dst aligned + beq 2f +1: load4l r4, r5, r6, r7 + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + sub ip, ip, #16 + teq ip, #0 + bne 1b +2: ands ip, r2, #12 + beq 4f + tst ip, #8 + beq 3f + load2l r4, r5 + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + tst ip, #4 + beq 4f +3: load1l r4 + str r4, [r1], #4 + adcs r3, r3, r4 +4: ands r2, r2, #3 + adceq r0, r3, #0 + LOAD_REGS(eq) + load1l r4 + tst r2, #2 + beq .exit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + strb r4, [r1], #1 + mov r4, r4, lsr #8 +.exit: tst r2, #1 + strneb r4, [r1], #1 + andne r4, r4, #255 + adcnes r3, r3, r4 + adcs r0, r3, #0 + LOAD_REGS(al) + +.too_small_user: + teq r2, #0 + LOAD_REGS(eq) + cmp r2, #2 + blt .too_small_user1 + load2b ip, r8 + orr ip, ip, r8, lsl #8 + adds r3, r3, ip + strb ip, [r1], #1 + strb r8, [r1], #1 + tst r2, #1 +.too_small_user1: @ C = 0 + beq .csum_exit + load1b ip + strb ip, [r1], #1 + adcs r3, r3, ip +.csum_exit: adc r0, r3, #0 + LOAD_REGS(al) + +.src_not_aligned_user: + cmp r2, #4 + blt .too_small_user + and ip, r0, #3 + bic r0, r0, #3 + load1l r4 + cmp ip, #2 + beq .src2_aligned_user + bhi .src3_aligned_user + mov r4, r4, lsr #8 + adds r3, r3, #0 + bics ip, r2, #15 + beq 2f +1: load4l r5, r6, r7, r8 + orr r4, r4, r5, lsl #24 + mov r5, r5, lsr #8 + orr r5, r5, r6, lsl #24 + mov r6, r6, lsr #8 + orr r6, r6, r7, lsl #24 + mov r7, r7, lsr #8 + orr r7, r7, r8, lsl #24 + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + mov r4, r8, lsr #8 + sub ip, ip, #16 + teq ip, #0 + bne 1b +2: ands ip, r2, #12 + beq 4f + tst ip, #8 + beq 3f + load2l r5, r6 + orr r4, r4, r5, lsl #24 + mov r5, r5, lsr #8 + orr r5, r5, r6, lsl #24 + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + mov r4, r6, lsr #8 + tst ip, #4 + beq 4f +3: load1l r5 + orr r4, r4, r5, lsl #24 + str r4, [r1], #4 + adcs r3, r3, r4 + mov r4, r5, lsr #8 +4: ands r2, r2, #3 + adceq r0, r3, #0 + LOAD_REGS(eq) + tst r2, #2 + beq .exit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + b .exit + +.src2_aligned_user: + mov r4, r4, lsr #16 + adds r3, r3, #0 + bics ip, r2, #15 + beq 2f +1: load4l r5, r6, r7, r8 + orr r4, r4, r5, lsl #16 + mov r5, r5, lsr #16 + orr r5, r5, r6, lsl #16 + mov r6, r6, lsr #16 + orr r6, r6, r7, lsl #16 + mov r7, r7, lsr #16 + orr r7, r7, r8, lsl #16 + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + mov r4, r8, lsr #16 + sub ip, ip, #16 + teq ip, #0 + bne 1b +2: ands ip, r2, #12 + beq 4f + tst ip, #8 + beq 3f + load2l r5, r6 + orr r4, r4, r5, lsl #16 + mov r5, r5, lsr #16 + orr r5, r5, r6, lsl #16 + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + mov r4, r6, lsr #16 + tst ip, #4 + beq 4f +3: load1l r5 + orr r4, r4, r5, lsl #16 + str r4, [r1], #4 + adcs r3, r3, r4 + mov r4, r5, lsr #16 +4: ands r2, r2, #3 + adceq r0, r3, #0 + LOAD_REGS(eq) + tst r2, #2 + beq .exit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + strb r4, [r1], #1 + load1b r4 + b .exit + +.src3_aligned_user: + mov r4, r4, lsr #24 + adds r3, r3, #0 + bics ip, r2, #15 + beq 2f +1: load4l r5, r6, r7, r8 
+ orr r4, r4, r5, lsl #8 + mov r5, r5, lsr #24 + orr r5, r5, r6, lsl #8 + mov r6, r6, lsr #24 + orr r6, r6, r7, lsl #8 + mov r7, r7, lsr #24 + orr r7, r7, r8, lsl #8 + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + mov r4, r8, lsr #24 + sub ip, ip, #16 + teq ip, #0 + bne 1b +2: ands ip, r2, #12 + beq 4f + tst ip, #8 + beq 3f + load2l r5, r6 + orr r4, r4, r5, lsl #8 + mov r5, r5, lsr #24 + orr r5, r5, r6, lsl #8 + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + mov r4, r6, lsr #24 + tst ip, #4 + beq 4f +3: load1l r5 + orr r4, r4, r5, lsl #8 + str r4, [r1], #4 + adcs r3, r3, r4 + mov r4, r5, lsr #24 +4: ands r2, r2, #3 + adceq r0, r3, #0 + LOAD_REGS(eq) + tst r2, #2 + beq .exit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + load1l r4 + strb r4, [r1], #1 + adcs r3, r3, r4, lsl #24 + mov r4, r4, lsr #8 + b .exit + +#if defined(CONFIG_CPU_32) + .section .fixup,"ax" +#endif + .align 4 +6001: mov r4, #-EFAULT + ldr r5, [fp, #4] + str r4, [r5] + ldmia sp, {r1, r2} @ retrieve original arguments + add r2, r2, r1 + mov r3, #0 @ zero the buffer +6002: teq r2, r1 + strneb r3, [r1], #1 + bne 6002b + LOAD_REGS(al) +#if defined(CONFIG_CPU_32) + .previous +#endif + +/* Function: __u32 csum_partial_copy (const char *src, char *dst, int len, __u32 sum) + * Params : r0 = src, r1 = dst, r2 = len, r3 = checksum + * Returns : r0 = new checksum + */ +ENTRY(csum_partial_copy_nocheck) +ENTRY(csum_partial_copy) + mov ip, sp + stmfd sp!, {r4 - r8, fp, ip, lr, pc} + sub fp, ip, #4 + cmp r2, #4 + blt Ltoo_small + tst r1, #2 @ Test destination alignment + beq Ldst_aligned + subs r2, r2, #2 @ We do not know if SRC is aligned... + ldrb ip, [r0], #1 + ldrb r8, [r0], #1 + orr ip, ip, r8, lsl #8 + adds r3, r3, ip + adcs r3, r3, #0 + strb ip, [r1], #1 + mov ip, ip, lsr #8 + strb ip, [r1], #1 @ Destination now aligned +Ldst_aligned: tst r0, #3 + bne Lsrc_not_aligned + adds r3, r3, #0 + bics ip, r2, #15 @ Routine for src & dst aligned + beq 3f +1: ldmia r0!, {r4, r5, r6, r7} + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + sub ip, ip, #16 + teq ip, #0 + bne 1b +3: ands ip, r2, #12 + beq 5f + tst ip, #8 + beq 4f + ldmia r0!, {r4, r5} + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + tst ip, #4 + beq 5f +4: ldr r4, [r0], #4 + str r4, [r1], #4 + adcs r3, r3, r4 +5: ands r2, r2, #3 + adceq r0, r3, #0 + LOADREGS(eqea,fp,{r4 - r8, fp, sp, pc}) + ldr r4, [r0], #4 + tst r2, #2 + beq Lexit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + b Lexit + +Ltoo_small: teq r2, #0 + LOADREGS(eqea,fp,{r4 - r8, fp, sp, pc}) + cmp r2, #2 + blt Ltoo_small1 + ldrb ip, [r0], #1 + ldrb r8, [r0], #1 + orr ip, ip, r8, lsl #8 + adds r3, r3, ip + strb ip, [r1], #1 + strb r8, [r1], #1 +Lexit: tst r2, #1 +Ltoo_small1: ldrneb ip, [r0], #1 + strneb ip, [r1], #1 + adcnes r3, r3, ip + adcs r0, r3, #0 + LOADREGS(ea,fp,{r4 - r8, fp, sp, pc}) + +Lsrc_not_aligned: + cmp r2, #4 + blt Ltoo_small + and ip, r0, #3 + bic r0, r0, #3 + ldr r4, [r0], #4 + cmp ip, #2 + beq Lsrc2_aligned + bhi Lsrc3_aligned + mov r4, r4, lsr #8 + adds r3, r3, #0 + bics ip, r2, #15 + beq 2f +1: ldmia r0!, {r5, r6, r7, r8} + orr r4, r4, r5, lsl #24 + mov r5, r5, lsr #8 + orr r5, r5, r6, lsl #24 + mov r6, r6, lsr #8 + orr r6, r6, r7, lsl #24 + mov r7, r7, lsr #8 + orr r7, r7, r8, lsl #24 + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + mov r4, r8, lsr 
#8 + sub ip, ip, #16 + teq ip, #0 + bne 1b +2: ands ip, r2, #12 + beq 4f + tst ip, #8 + beq 3f + ldmia r0!, {r5, r6} + orr r4, r4, r5, lsl #24 + mov r5, r5, lsr #8 + orr r5, r5, r6, lsl #24 + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + mov r4, r6, lsr #8 + tst ip, #4 + beq 4f +3: ldr r5, [r0], #4 + orr r4, r4, r5, lsl #24 + str r4, [r1], #4 + adcs r3, r3, r4 + mov r4, r5, lsr #8 +4: ands r2, r2, #3 + adceq r0, r3, #0 + LOADREGS(eqea,fp,{r4 - r8, fp, sp, pc}) + tst r2, #2 + beq Lexit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + b Lexit + +Lsrc2_aligned: mov r4, r4, lsr #16 + adds r3, r3, #0 + bics ip, r2, #15 + beq 2f +1: ldmia r0!, {r5, r6, r7, r8} + orr r4, r4, r5, lsl #16 + mov r5, r5, lsr #16 + orr r5, r5, r6, lsl #16 + mov r6, r6, lsr #16 + orr r6, r6, r7, lsl #16 + mov r7, r7, lsr #16 + orr r7, r7, r8, lsl #16 + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + mov r4, r8, lsr #16 + sub ip, ip, #16 + teq ip, #0 + bne 1b +2: ands ip, r2, #12 + beq 4f + tst ip, #8 + beq 3f + ldmia r0!, {r5, r6} + orr r4, r4, r5, lsl #16 + mov r5, r5, lsr #16 + orr r5, r5, r6, lsl #16 + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + mov r4, r6, lsr #16 + tst ip, #4 + beq 4f +3: ldr r5, [r0], #4 + orr r4, r4, r5, lsl #16 + str r4, [r1], #4 + adcs r3, r3, r4 + mov r4, r5, lsr #16 +4: ands r2, r2, #3 + adceq r0, r3, #0 + LOADREGS(eqea,fp,{r4 - r8, fp, sp, pc}) + tst r2, #2 + beq Lexit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + mov r4, r4, lsr #8 + strb r4, [r1], #1 + ldrb r4, [r0], #1 + b Lexit + +Lsrc3_aligned: mov r4, r4, lsr #24 + adds r3, r3, #0 + bics ip, r2, #15 + beq 2f +1: ldmia r0!, {r5, r6, r7, r8} + orr r4, r4, r5, lsl #8 + mov r5, r5, lsr #24 + orr r5, r5, r6, lsl #8 + mov r6, r6, lsr #24 + orr r6, r6, r7, lsl #8 + mov r7, r7, lsr #24 + orr r7, r7, r8, lsl #8 + stmia r1!, {r4, r5, r6, r7} + adcs r3, r3, r4 + adcs r3, r3, r5 + adcs r3, r3, r6 + adcs r3, r3, r7 + mov r4, r8, lsr #24 + sub ip, ip, #16 + teq ip, #0 + bne 1b +2: ands ip, r2, #12 + beq 4f + tst ip, #8 + beq 3f + ldmia r0!, {r5, r6} + orr r4, r4, r5, lsl #8 + mov r5, r5, lsr #24 + orr r5, r5, r6, lsl #8 + stmia r1!, {r4, r5} + adcs r3, r3, r4 + adcs r3, r3, r5 + mov r4, r6, lsr #24 + tst ip, #4 + beq 4f +3: ldr r5, [r0], #4 + orr r4, r4, r5, lsl #8 + str r4, [r1], #4 + adcs r3, r3, r4 + mov r4, r5, lsr #24 +4: ands r2, r2, #3 + adceq r0, r3, #0 + LOADREGS(eqea,fp,{r4 - r8, fp, sp, pc}) + tst r2, #2 + beq Lexit + adcs r3, r3, r4, lsl #16 + strb r4, [r1], #1 + ldr r4, [r0], #4 + strb r4, [r1], #1 + adcs r3, r3, r4, lsl #24 + mov r4, r4, lsr #8 + b Lexit + +ENTRY(__csum_ipv6_magic) + stmfd sp!, {lr} + adds ip, r2, r3 + ldmia r1, {r1 - r3, lr} + adcs ip, ip, r1 + adcs ip, ip, r2 + adcs ip, ip, r3 + adcs ip, ip, lr + ldmia r0, {r0 - r3} + adcs r0, ip, r0 + adcs r0, r0, r1 + adcs r0, r0, r2 + adcs r0, r0, r3 + ldr r3, [sp, #4] + adcs r0, r0, r3 + adcs r0, r0, #0 + LOADREGS(fd, sp!, {pc}) diff --git a/pfinet/linux-src/arch/i386/lib/checksum.S b/pfinet/linux-src/arch/i386/lib/checksum.S new file mode 100644 index 00000000..af10dc7c --- /dev/null +++ b/pfinet/linux-src/arch/i386/lib/checksum.S @@ -0,0 +1,447 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. 
+ * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Tom May, <ftom@netcom.com> + * Pentium Pro/II routines: + * Alexander Kjeldaas <astor@guardian.no> + * Finn Arne Gangstad <finnag@guardian.no> + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception + * handling. + * Andi Kleen, add zeroing on error + * converted to pure assembler + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/errno.h> + +/* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ + +/* +unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) + */ + +.text +.align 4 +.globl csum_partial + +#if CPU!=686 + + /* + * Experiments with Ethernet and SLIP connections show that buff + * is aligned on either a 2-byte or 4-byte boundary. We get at + * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. + * Fortunately, it is easy to convert 2-byte alignment to 4-byte + * alignment for the unrolled loop. + */ +csum_partial: + pushl %esi + pushl %ebx + movl 20(%esp),%eax # Function arg: unsigned int sum + movl 16(%esp),%ecx # Function arg: int len + movl 12(%esp),%esi # Function arg: unsigned char *buff + testl $2, %esi # Check alignment. + jz 2f # Jump if alignment is ok. + subl $2, %ecx # Alignment uses up two bytes. + jae 1f # Jump if we had at least two bytes. + addl $2, %ecx # ecx was < 2. Deal with it. + jmp 4f +1: movw (%esi), %bx + addl $2, %esi + addw %bx, %ax + adcl $0, %eax +2: + movl %ecx, %edx + shrl $5, %ecx + jz 2f + testl %esi, %esi +1: movl (%esi), %ebx + adcl %ebx, %eax + movl 4(%esi), %ebx + adcl %ebx, %eax + movl 8(%esi), %ebx + adcl %ebx, %eax + movl 12(%esi), %ebx + adcl %ebx, %eax + movl 16(%esi), %ebx + adcl %ebx, %eax + movl 20(%esi), %ebx + adcl %ebx, %eax + movl 24(%esi), %ebx + adcl %ebx, %eax + movl 28(%esi), %ebx + adcl %ebx, %eax + lea 32(%esi), %esi + dec %ecx + jne 1b + adcl $0, %eax +2: movl %edx, %ecx + andl $0x1c, %edx + je 4f + shrl $2, %edx # This clears CF +3: adcl (%esi), %eax + lea 4(%esi), %esi + dec %edx + jne 3b + adcl $0, %eax +4: andl $3, %ecx + jz 7f + cmpl $2, %ecx + jb 5f + movw (%esi),%cx + leal 2(%esi),%esi + je 6f + shll $16,%ecx +5: movb (%esi),%cl +6: addl %ecx,%eax + adcl $0, %eax +7: + popl %ebx + popl %esi + ret + +#else /* CPU==686 */ + +csum_partial: + movl 12(%esp),%eax # Function arg: unsigned int sum + movl 8(%esp),%ecx # Function arg: int len + movl 4(%esp),%esi # Function arg: const unsigned char *buf + + testl $2, %esi + jnz 30f +10: + movl %ecx, %edx + movl %ecx, %ebx + andl $0x7c, %ebx + shrl $7, %ecx + addl %ebx,%esi + shrl $2, %ebx + negl %ebx + lea 45f(%ebx,%ebx,2), %ebx + testl %esi, %esi + jmp *%ebx + + # Handle 2-byte-aligned regions +20: addw (%esi), %ax + lea 2(%esi), %esi + adcl $0, %eax + jmp 10b + +30: subl $2, %ecx + ja 20b + je 32f + movzbl (%esi),%ebx # csumming 1 byte, 2-aligned + addl %ebx, %eax + adcl $0, %eax + jmp 80f +32: + addw (%esi), %ax # csumming 2 bytes, 2-aligned + adcl $0, %eax + jmp 80f + +40: + addl -128(%esi), %eax + adcl -124(%esi), %eax + adcl -120(%esi), %eax + adcl -116(%esi), %eax + adcl -112(%esi), %eax + adcl -108(%esi), %eax + adcl -104(%esi), %eax + adcl 
-100(%esi), %eax + adcl -96(%esi), %eax + adcl -92(%esi), %eax + adcl -88(%esi), %eax + adcl -84(%esi), %eax + adcl -80(%esi), %eax + adcl -76(%esi), %eax + adcl -72(%esi), %eax + adcl -68(%esi), %eax + adcl -64(%esi), %eax + adcl -60(%esi), %eax + adcl -56(%esi), %eax + adcl -52(%esi), %eax + adcl -48(%esi), %eax + adcl -44(%esi), %eax + adcl -40(%esi), %eax + adcl -36(%esi), %eax + adcl -32(%esi), %eax + adcl -28(%esi), %eax + adcl -24(%esi), %eax + adcl -20(%esi), %eax + adcl -16(%esi), %eax + adcl -12(%esi), %eax + adcl -8(%esi), %eax + adcl -4(%esi), %eax +45: + lea 128(%esi), %esi + adcl $0, %eax + dec %ecx + jge 40b + movl %edx, %ecx +50: andl $3, %ecx + jz 80f + + # Handle the last 1-3 bytes without jumping + notl %ecx # 1->2, 2->1, 3->0, higher bits are masked + movl $0xffffff,%ebx # by the shll and shrl instructions + shll $3,%ecx + shrl %cl,%ebx + andl -128(%esi),%ebx # esi is 4-aligned so should be ok + addl %ebx,%eax + adcl $0,%eax +80: + ret + +#endif /* CPU==686 */ + +/* +unsigned int csum_partial_copy_generic (const char *src, char *dst, + int len, int sum, int *src_err_ptr, int *dst_err_ptr) + */ + +/* + * Copy from ds while checksumming, otherwise like csum_partial + * + * The macros SRC and DST specify the type of access for the instruction. + * thus we can call a custom exception handler for all access types. + * + * FIXME: could someone double-check whether I haven't mixed up some SRC and + * DST definitions? It's damn hard to trigger all cases. I hope I got + * them all but there's no guarantee. + */ + +#define SRC(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6001f ; \ + .previous + +#define DST(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6002f ; \ + .previous + +.align 4 +.globl csum_partial_copy_generic + +#if CPU!=686 + +#define ARGBASE 16 +#define FP 12 + +csum_partial_copy_generic: + subl $4,%esp + pushl %edi + pushl %esi + pushl %ebx + movl ARGBASE+16(%esp),%eax # sum + movl ARGBASE+12(%esp),%ecx # len + movl ARGBASE+4(%esp),%esi # src + movl ARGBASE+8(%esp),%edi # dst + + testl $2, %edi # Check alignment. + jz 2f # Jump if alignment is ok. + subl $2, %ecx # Alignment uses up two bytes. + jae 1f # Jump if we had at least two bytes. + addl $2, %ecx # ecx was < 2. Deal with it. 
+ jmp 4f +SRC(1: movw (%esi), %bx ) + addl $2, %esi +DST( movw %bx, (%edi) ) + addl $2, %edi + addw %bx, %ax + adcl $0, %eax +2: + movl %ecx, FP(%esp) + shrl $5, %ecx + jz 2f + testl %esi, %esi +SRC(1: movl (%esi), %ebx ) +SRC( movl 4(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + adcl %edx, %eax +DST( movl %edx, 4(%edi) ) + +SRC( movl 8(%esi), %ebx ) +SRC( movl 12(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 8(%edi) ) + adcl %edx, %eax +DST( movl %edx, 12(%edi) ) + +SRC( movl 16(%esi), %ebx ) +SRC( movl 20(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 16(%edi) ) + adcl %edx, %eax +DST( movl %edx, 20(%edi) ) + +SRC( movl 24(%esi), %ebx ) +SRC( movl 28(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 24(%edi) ) + adcl %edx, %eax +DST( movl %edx, 28(%edi) ) + + lea 32(%esi), %esi + lea 32(%edi), %edi + dec %ecx + jne 1b + adcl $0, %eax +2: movl FP(%esp), %edx + movl %edx, %ecx + andl $0x1c, %edx + je 4f + shrl $2, %edx # This clears CF +SRC(3: movl (%esi), %ebx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + lea 4(%esi), %esi + lea 4(%edi), %edi + dec %edx + jne 3b + adcl $0, %eax +4: andl $3, %ecx + jz 7f + cmpl $2, %ecx + jb 5f +SRC( movw (%esi), %cx ) + leal 2(%esi), %esi +DST( movw %cx, (%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%ecx +SRC(5: movb (%esi), %cl ) +DST( movb %cl, (%edi) ) +6: addl %ecx, %eax + adcl $0, %eax +7: +5000: + +# Exception handler: +.section .fixup, "ax" + +6001: + movl ARGBASE+20(%esp), %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + + # zero the complete destination - computing the rest + # is too much work + movl ARGBASE+8(%esp), %edi # dst + movl ARGBASE+12(%esp), %ecx # len + xorl %eax,%eax + rep ; stosb + + jmp 5000b + +6002: + movl ARGBASE+24(%esp), %ebx # dst_err_ptr + movl $-EFAULT,(%ebx) + jmp 5000b + +.previous + + popl %ebx + popl %esi + popl %edi + popl %ecx # equivalent to addl $4,%esp + ret + +#else + +/* Version for PentiumII/PPro */ + +#define ROUND1(x) \ + SRC(movl x(%esi), %ebx ) ; \ + addl %ebx, %eax ; \ + DST(movl %ebx, x(%edi) ) ; + +#define ROUND(x) \ + SRC(movl x(%esi), %ebx ) ; \ + adcl %ebx, %eax ; \ + DST(movl %ebx, x(%edi) ) ; + +#define ARGBASE 12 + +csum_partial_copy_generic: + pushl %ebx + pushl %edi + pushl %esi + movl ARGBASE+4(%esp),%esi #src + movl ARGBASE+8(%esp),%edi #dst + movl ARGBASE+12(%esp),%ecx #len + movl ARGBASE+16(%esp),%eax #sum + movl %ecx, %edx + movl %ecx, %ebx + shrl $6, %ecx + andl $0x3c, %ebx + negl %ebx + subl %ebx, %esi + subl %ebx, %edi + lea 3f(%ebx,%ebx), %ebx + testl %esi, %esi + jmp *%ebx +1: addl $64,%esi + addl $64,%edi + ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52) + ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36) + ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20) + ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4) +3: adcl $0,%eax + dec %ecx + jge 1b +4: andl $3, %edx + jz 7f + cmpl $2, %edx + jb 5f +SRC( movw (%esi), %dx ) + leal 2(%esi), %esi +DST( movw %dx, (%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%edx +5: +SRC( movb (%esi), %dl ) +DST( movb %dl, (%edi) ) +6: addl %edx, %eax + adcl $0, %eax +7: +.section .fixup, "ax" +6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + # zero the complete destination (computing the rest is too much work) + movl ARGBASE+8(%esp),%edi # dst + movl ARGBASE+12(%esp),%ecx # len + xorl %eax,%eax + rep; stosb + jmp 7b +6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr + movl $-EFAULT, (%ebx) + jmp 7b +.previous + + popl %esi + popl %edi + popl %ebx + ret + +#undef ROUND +#undef ROUND1 + +#endif /* CPU==i686 */ diff --git 
a/pfinet/linux-src/arch/i386/lib/old-checksum.c b/pfinet/linux-src/arch/i386/lib/old-checksum.c new file mode 100644 index 00000000..ae3a3804 --- /dev/null +++ b/pfinet/linux-src/arch/i386/lib/old-checksum.c @@ -0,0 +1,19 @@ +/* + * FIXME: old compatibility stuff, will be removed soon. + */ + +#include <net/checksum.h> + +unsigned int csum_partial_copy( const char *src, char *dst, int len, int sum) +{ + int src_err=0, dst_err=0; + + sum = csum_partial_copy_generic ( src, dst, len, sum, &src_err, &dst_err); + + if (src_err || dst_err) + printk("old csum_partial_copy_fromuser(), tell mingo to convert me.\n"); + + return sum; +} + + diff --git a/pfinet/linux-src/arch/m68k/lib/checksum.c b/pfinet/linux-src/arch/m68k/lib/checksum.c new file mode 100644 index 00000000..5110cac4 --- /dev/null +++ b/pfinet/linux-src/arch/m68k/lib/checksum.c @@ -0,0 +1,420 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Tom May, <ftom@netcom.com> + * Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de> + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * 03/02/96 Jes Sorensen, Andreas Schwab, Roman Hodek: + * Fixed some nasty bugs, causing some horrible crashes. + * A: At some points, the sum (%0) was used as + * length-counter instead of the length counter + * (%1). Thanks to Roman Hodek for pointing this out. + * B: GCC seems to mess up if one uses too many + * data-registers to hold input values and one tries to + * specify d0 and d1 as scratch registers. Letting gcc + * choose these registers itself solves the problem. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1998/8/31 Andreas Schwab: + * Zero out rest of buffer on exception in + * csum_partial_copy_from_user. + */ + +#include <net/checksum.h> + +/* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ + +unsigned int +csum_partial (const unsigned char *buff, int len, unsigned int sum) +{ + unsigned long tmp1, tmp2; + /* + * Experiments with ethernet and slip connections show that buff + * is aligned on either a 2-byte or 4-byte boundary. + */ + __asm__("movel %2,%3\n\t" + "btst #1,%3\n\t" /* Check alignment */ + "jeq 2f\n\t" + "subql #2,%1\n\t" /* buff%4==2: treat first word */ + "jgt 1f\n\t" + "addql #2,%1\n\t" /* len was == 2, treat only rest */ + "jra 4f\n" + "1:\t" + "addw %2@+,%0\n\t" /* add first word to sum */ + "clrl %3\n\t" + "addxl %3,%0\n" /* add X bit */ + "2:\t" + /* unrolled loop for the main part: do 8 longs at once */ + "movel %1,%3\n\t" /* save len in tmp1 */ + "lsrl #5,%1\n\t" /* len/32 */ + "jeq 2f\n\t" /* not enough... 
*/ + "subql #1,%1\n" + "1:\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "dbra %1,1b\n\t" + "clrl %4\n\t" + "addxl %4,%0\n\t" /* add X bit */ + "clrw %1\n\t" + "subql #1,%1\n\t" + "jcc 1b\n" + "2:\t" + "movel %3,%1\n\t" /* restore len from tmp1 */ + "andw #0x1c,%3\n\t" /* number of rest longs */ + "jeq 4f\n\t" + "lsrw #2,%3\n\t" + "subqw #1,%3\n" + "3:\t" + /* loop for rest longs */ + "movel %2@+,%4\n\t" + "addxl %4,%0\n\t" + "dbra %3,3b\n\t" + "clrl %4\n\t" + "addxl %4,%0\n" /* add X bit */ + "4:\t" + /* now check for rest bytes that do not fit into longs */ + "andw #3,%1\n\t" + "jeq 7f\n\t" + "clrl %4\n\t" /* clear tmp2 for rest bytes */ + "subqw #2,%1\n\t" + "jlt 5f\n\t" + "movew %2@+,%4\n\t" /* have rest >= 2: get word */ + "swap %4\n\t" /* into bits 16..31 */ + "tstw %1\n\t" /* another byte? */ + "jeq 6f\n" + "5:\t" + "moveb %2@,%4\n\t" /* have odd rest: get byte */ + "lslw #8,%4\n\t" /* into bits 8..15; 16..31 untouched */ + "6:\t" + "addl %4,%0\n\t" /* now add rest long to sum */ + "clrl %4\n\t" + "addxl %4,%0\n" /* add X bit */ + "7:\t" + : "=d" (sum), "=d" (len), "=a" (buff), + "=&d" (tmp1), "=&d" (tmp2) + : "0" (sum), "1" (len), "2" (buff) + ); + return(sum); +} + + + +/* + * copy from user space while checksumming, with exception handling. + */ + +unsigned int +csum_partial_copy_from_user(const char *src, char *dst, int len, + int sum, int *csum_err) +{ + /* + * GCC doesn't like more than 10 operands for the asm + * statements so we have to use tmp2 for the error + * code. + */ + unsigned long tmp1, tmp2; + + __asm__("movel %2,%4\n\t" + "btst #1,%4\n\t" /* Check alignment */ + "jeq 2f\n\t" + "subql #2,%1\n\t" /* buff%4==2: treat first word */ + "jgt 1f\n\t" + "addql #2,%1\n\t" /* len was == 2, treat only rest */ + "jra 4f\n" + "1:\n" + "10:\t" + "movesw %2@+,%4\n\t" /* add first word to sum */ + "addw %4,%0\n\t" + "movew %4,%3@+\n\t" + "clrl %4\n\t" + "addxl %4,%0\n" /* add X bit */ + "2:\t" + /* unrolled loop for the main part: do 8 longs at once */ + "movel %1,%4\n\t" /* save len in tmp1 */ + "lsrl #5,%1\n\t" /* len/32 */ + "jeq 2f\n\t" /* not enough... 
*/ + "subql #1,%1\n" + "1:\n" + "11:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "12:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "13:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "14:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "15:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "16:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "17:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "18:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "dbra %1,1b\n\t" + "clrl %5\n\t" + "addxl %5,%0\n\t" /* add X bit */ + "clrw %1\n\t" + "subql #1,%1\n\t" + "jcc 1b\n" + "2:\t" + "movel %4,%1\n\t" /* restore len from tmp1 */ + "andw #0x1c,%4\n\t" /* number of rest longs */ + "jeq 4f\n\t" + "lsrw #2,%4\n\t" + "subqw #1,%4\n" + "3:\n" + /* loop for rest longs */ + "19:\t" + "movesl %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "dbra %4,3b\n\t" + "clrl %5\n\t" + "addxl %5,%0\n" /* add X bit */ + "4:\t" + /* now check for rest bytes that do not fit into longs */ + "andw #3,%1\n\t" + "jeq 7f\n\t" + "clrl %5\n\t" /* clear tmp2 for rest bytes */ + "subqw #2,%1\n\t" + "jlt 5f\n\t" + "20:\t" + "movesw %2@+,%5\n\t" /* have rest >= 2: get word */ + "movew %5,%3@+\n\t" + "swap %5\n\t" /* into bits 16..31 */ + "tstw %1\n\t" /* another byte? */ + "jeq 6f\n" + "5:\n" + "21:\t" + "movesb %2@,%5\n\t" /* have odd rest: get byte */ + "moveb %5,%3@+\n\t" + "lslw #8,%5\n\t" /* into bits 8..15; 16..31 untouched */ + "6:\t" + "addl %5,%0\n\t" /* now add rest long to sum */ + "clrl %5\n\t" + "addxl %5,%0\n\t" /* add X bit */ + "7:\t" + "clrl %5\n" /* no error - clear return value */ + "8:\n" + ".section .fixup,\"ax\"\n" + ".even\n" + /* If any execption occurs zero out the rest. 
+ Similarities with the code above are intentional :-) */ + "90:\t" + "clrw %3@+\n\t" + "movel %1,%4\n\t" + "lsrl #5,%1\n\t" + "jeq 1f\n\t" + "subql #1,%1\n" + "91:\t" + "clrl %3@+\n" + "92:\t" + "clrl %3@+\n" + "93:\t" + "clrl %3@+\n" + "94:\t" + "clrl %3@+\n" + "95:\t" + "clrl %3@+\n" + "96:\t" + "clrl %3@+\n" + "97:\t" + "clrl %3@+\n" + "98:\t" + "clrl %3@+\n\t" + "dbra %1,91b\n\t" + "clrw %1\n\t" + "subql #1,%1\n\t" + "jcc 91b\n" + "1:\t" + "movel %4,%1\n\t" + "andw #0x1c,%4\n\t" + "jeq 1f\n\t" + "lsrw #2,%4\n\t" + "subqw #1,%4\n" + "99:\t" + "clrl %3@+\n\t" + "dbra %4,99b\n\t" + "1:\t" + "andw #3,%1\n\t" + "jeq 9f\n" + "100:\t" + "clrw %3@+\n\t" + "tstw %1\n\t" + "jeq 9f\n" + "101:\t" + "clrb %3@+\n" + "9:\t" +#define STR(X) STR1(X) +#define STR1(X) #X + "moveq #-" STR(EFAULT) ",%5\n\t" + "jra 8b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + ".long 10b,90b\n" + ".long 11b,91b\n" + ".long 12b,92b\n" + ".long 13b,93b\n" + ".long 14b,94b\n" + ".long 15b,95b\n" + ".long 16b,96b\n" + ".long 17b,97b\n" + ".long 18b,98b\n" + ".long 19b,99b\n" + ".long 20b,100b\n" + ".long 21b,101b\n" + ".previous" + : "=d" (sum), "=d" (len), "=a" (src), "=a" (dst), + "=&d" (tmp1), "=d" (tmp2) + : "0" (sum), "1" (len), "2" (src), "3" (dst) + ); + + *csum_err = tmp2; + + return(sum); +} + +/* + * copy from kernel space while checksumming, otherwise like csum_partial + */ + +unsigned int +csum_partial_copy(const char *src, char *dst, int len, int sum) +{ + unsigned long tmp1, tmp2; + __asm__("movel %2,%4\n\t" + "btst #1,%4\n\t" /* Check alignment */ + "jeq 2f\n\t" + "subql #2,%1\n\t" /* buff%4==2: treat first word */ + "jgt 1f\n\t" + "addql #2,%1\n\t" /* len was == 2, treat only rest */ + "jra 4f\n" + "1:\t" + "movew %2@+,%4\n\t" /* add first word to sum */ + "addw %4,%0\n\t" + "movew %4,%3@+\n\t" + "clrl %4\n\t" + "addxl %4,%0\n" /* add X bit */ + "2:\t" + /* unrolled loop for the main part: do 8 longs at once */ + "movel %1,%4\n\t" /* save len in tmp1 */ + "lsrl #5,%1\n\t" /* len/32 */ + "jeq 2f\n\t" /* not enough... */ + "subql #1,%1\n" + "1:\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "dbra %1,1b\n\t" + "clrl %5\n\t" + "addxl %5,%0\n\t" /* add X bit */ + "clrw %1\n\t" + "subql #1,%1\n\t" + "jcc 1b\n" + "2:\t" + "movel %4,%1\n\t" /* restore len from tmp1 */ + "andw #0x1c,%4\n\t" /* number of rest longs */ + "jeq 4f\n\t" + "lsrw #2,%4\n\t" + "subqw #1,%4\n" + "3:\t" + /* loop for rest longs */ + "movel %2@+,%5\n\t" + "addxl %5,%0\n\t" + "movel %5,%3@+\n\t" + "dbra %4,3b\n\t" + "clrl %5\n\t" + "addxl %5,%0\n" /* add X bit */ + "4:\t" + /* now check for rest bytes that do not fit into longs */ + "andw #3,%1\n\t" + "jeq 7f\n\t" + "clrl %5\n\t" /* clear tmp2 for rest bytes */ + "subqw #2,%1\n\t" + "jlt 5f\n\t" + "movew %2@+,%5\n\t" /* have rest >= 2: get word */ + "movew %5,%3@+\n\t" + "swap %5\n\t" /* into bits 16..31 */ + "tstw %1\n\t" /* another byte? 
*/ + "jeq 6f\n" + "5:\t" + "moveb %2@,%5\n\t" /* have odd rest: get byte */ + "moveb %5,%3@+\n\t" + "lslw #8,%5\n" /* into bits 8..15; 16..31 untouched */ + "6:\t" + "addl %5,%0\n\t" /* now add rest long to sum */ + "clrl %5\n\t" + "addxl %5,%0\n" /* add X bit */ + "7:\t" + : "=d" (sum), "=d" (len), "=a" (src), "=a" (dst), + "=&d" (tmp1), "=&d" (tmp2) + : "0" (sum), "1" (len), "2" (src), "3" (dst) + ); + return(sum); +} diff --git a/pfinet/linux-src/arch/ppc/lib/checksum.S b/pfinet/linux-src/arch/ppc/lib/checksum.S new file mode 100644 index 00000000..66a2e3aa --- /dev/null +++ b/pfinet/linux-src/arch/ppc/lib/checksum.S @@ -0,0 +1,194 @@ +/* + * This file contains assembly-language implementations + * of IP-style 1's complement checksum routines. + * + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). + */ + +#include <linux/sys.h> +#include <asm/processor.h> +#include <asm/errno.h> +#include "../kernel/ppc_asm.tmpl" + + .text + +/* + * ip_fast_csum(buf, len) -- Optimized for IP header + * len is in words and is always >= 5. + */ +_GLOBAL(ip_fast_csum) + lwz r0,0(r3) + lwzu r5,4(r3) + addi r4,r4,-2 + addc r0,r0,r5 + mtctr r4 +1: lwzu r4,4(r3) + adde r0,r0,r4 + bdnz 1b + addze r0,r0 /* add in final carry */ + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * Compute checksum of TCP or UDP pseudo-header: + * csum_tcpudp_magic(saddr, daddr, len, proto, sum) + */ +_GLOBAL(csum_tcpudp_magic) + rlwimi r5,r6,16,0,15 /* put proto in upper half of len */ + addc r0,r3,r4 /* add 4 32-bit words together */ + adde r0,r0,r5 + adde r0,r0,r7 + addze r0,r0 /* add in final carry */ + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * csum_partial(buff, len, sum) + */ +_GLOBAL(csum_partial) + addic r0,r5,0 + subi r3,r3,4 + srwi. r6,r4,2 + beq 3f /* if we're doing < 4 bytes */ + andi. r5,r3,2 /* Align buffer to longword boundary */ + beq+ 1f + lhz r5,4(r3) /* do 2 bytes to get aligned */ + addi r3,r3,2 + subi r4,r4,2 + addc r0,r0,r5 + srwi. r6,r4,2 /* # words to do */ + beq 3f +1: mtctr r6 +2: lwzu r5,4(r3) /* the bdnz has zero overhead, so it should */ + adde r0,r0,r5 /* be unnecessary to unroll this loop */ + bdnz 2b + andi. r4,r4,3 +3: cmpi 0,r4,2 + blt+ 4f + lhz r5,4(r3) + addi r3,r3,2 + subi r4,r4,2 + adde r0,r0,r5 +4: cmpi 0,r4,1 + bne+ 5f + lbz r5,4(r3) + slwi r5,r5,8 /* Upper byte of word */ + adde r0,r0,r5 +5: addze r3,r0 /* add in final carry */ + blr + +/* + * Computes the checksum of a memory block at src, length len, + * and adds in "sum" (32-bit), while copying the block to dst. + * If an access exception occurs on src or dst, it stores -EFAULT + * to *src_err or *dst_err respectively, and (for an error on + * src) zeroes the rest of dst. + * + * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err) + */ +_GLOBAL(csum_partial_copy_generic) + addic r0,r6,0 + subi r3,r3,4 + subi r4,r4,4 + srwi. r6,r5,2 + beq 3f /* if we're doing < 4 bytes */ + andi. 
r9,r4,2 /* Align dst to longword boundary */ + beq+ 1f +81: lhz r6,4(r3) /* do 2 bytes to get aligned */ + addi r3,r3,2 + subi r5,r5,2 +91: sth r6,4(r4) + addi r4,r4,2 + addc r0,r0,r6 + srwi. r6,r5,2 /* # words to do */ + beq 3f +1: mtctr r6 +82: lwzu r6,4(r3) /* the bdnz has zero overhead, so it should */ +92: stwu r6,4(r4) /* be unnecessary to unroll this loop */ + adde r0,r0,r6 + bdnz 82b + andi. r5,r5,3 +3: cmpi 0,r5,2 + blt+ 4f +83: lhz r6,4(r3) + addi r3,r3,2 + subi r5,r5,2 +93: sth r6,4(r4) + addi r4,r4,2 + adde r0,r0,r6 +4: cmpi 0,r5,1 + bne+ 5f +84: lbz r6,4(r3) +94: stb r6,4(r4) + slwi r6,r6,8 /* Upper byte of word */ + adde r0,r0,r6 +5: addze r3,r0 /* add in final carry */ + blr + +/* These shouldn't go in the fixup section, since that would + cause the ex_table addresses to get out of order. */ + +src_error_1: + li r6,0 + subi r5,r5,2 +95: sth r6,4(r4) + addi r4,r4,2 + srwi. r6,r5,2 + beq 3f + mtctr r6 +src_error_2: + li r6,0 +96: stwu r6,4(r4) + bdnz 96b +3: andi. r5,r5,3 + beq src_error +src_error_3: + li r6,0 + mtctr r5 + addi r4,r4,3 +97: stbu r6,1(r4) + bdnz 97b +src_error: + cmpi 0,r7,0 + beq 1f + li r6,-EFAULT + stw r6,0(r7) +1: addze r3,r0 + blr + +dst_error: + cmpi 0,r8,0 + beq 1f + li r6,-EFAULT + stw r6,0(r8) +1: addze r3,r0 + blr + +.section __ex_table,"a" + .long 81b,src_error_1 + .long 91b,dst_error + .long 82b,src_error_2 + .long 92b,dst_error + .long 83b,src_error_3 + .long 93b,dst_error + .long 84b,src_error_3 + .long 94b,dst_error + .long 95b,dst_error + .long 96b,dst_error + .long 97b,dst_error diff --git a/pfinet/linux-src/arch/sparc/lib/checksum.S b/pfinet/linux-src/arch/sparc/lib/checksum.S new file mode 100644 index 00000000..d02b6dfb --- /dev/null +++ b/pfinet/linux-src/arch/sparc/lib/checksum.S @@ -0,0 +1,581 @@ +/* checksum.S: Sparc optimized checksum code. + * + * Copyright(C) 1995 Linus Torvalds + * Copyright(C) 1995 Miguel de Icaza + * Copyright(C) 1996 David S. Miller + * Copyright(C) 1997 Jakub Jelinek + * + * derived from: + * Linux/Alpha checksum c-code + * Linux/ix86 inline checksum assembly + * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code) + * David Mosberger-Tang for optimized reference c-code + * BSD4.4 portable checksum routine + */ + +#include <asm/cprefix.h> +#include <asm/errno.h> + +#define CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5) \ + ldd [buf + offset + 0x00], t0; \ + ldd [buf + offset + 0x08], t2; \ + addxcc t0, sum, sum; \ + addxcc t1, sum, sum; \ + ldd [buf + offset + 0x10], t4; \ + addxcc t2, sum, sum; \ + addxcc t3, sum, sum; \ + ldd [buf + offset + 0x18], t0; \ + addxcc t4, sum, sum; \ + addxcc t5, sum, sum; \ + addxcc t0, sum, sum; \ + addxcc t1, sum, sum; + +#define CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3) \ + ldd [buf - offset - 0x08], t0; \ + ldd [buf - offset - 0x00], t2; \ + addxcc t0, sum, sum; \ + addxcc t1, sum, sum; \ + addxcc t2, sum, sum; \ + addxcc t3, sum, sum; + + /* Do end cruft out of band to get better cache patterns. */ +csum_partial_end_cruft: + be 1f ! caller asks %o1 & 0x8 + andcc %o1, 4, %g0 ! nope, check for word remaining + ldd [%o0], %g2 ! load two + addcc %g2, %o2, %o2 ! add first word to sum + addxcc %g3, %o2, %o2 ! add second word as well + add %o0, 8, %o0 ! advance buf ptr + addx %g0, %o2, %o2 ! add in final carry + andcc %o1, 4, %g0 ! check again for word remaining +1: be 1f ! nope, skip this code + andcc %o1, 3, %o1 ! check for trailing bytes + ld [%o0], %g2 ! load it + addcc %g2, %o2, %o2 ! add to sum + add %o0, 4, %o0 ! 
advance buf ptr + addx %g0, %o2, %o2 ! add in final carry + andcc %o1, 3, %g0 ! check again for trailing bytes +1: be 1f ! no trailing bytes, return + addcc %o1, -1, %g0 ! only one byte remains? + bne 2f ! at least two bytes more + subcc %o1, 2, %o1 ! only two bytes more? + b 4f ! only one byte remains + or %g0, %g0, %o4 ! clear fake hword value +2: lduh [%o0], %o4 ! get hword + be 6f ! jmp if only hword remains + add %o0, 2, %o0 ! advance buf ptr either way + sll %o4, 16, %o4 ! create upper hword +4: ldub [%o0], %o5 ! get final byte + sll %o5, 8, %o5 ! put into place + or %o5, %o4, %o4 ! coalese with hword (if any) +6: addcc %o4, %o2, %o2 ! add to sum +1: retl ! get outta here + addx %g0, %o2, %o0 ! add final carry into retval + + /* Also do alignment out of band to get better cache patterns. */ +csum_partial_fix_alignment: + cmp %o1, 6 + bl cpte - 0x4 + andcc %o0, 0x2, %g0 + be 1f + andcc %o0, 0x4, %g0 + lduh [%o0 + 0x00], %g2 + sub %o1, 2, %o1 + add %o0, 2, %o0 + sll %g2, 16, %g2 + addcc %g2, %o2, %o2 + srl %o2, 16, %g3 + addx %g0, %g3, %g2 + sll %o2, 16, %o2 + sll %g2, 16, %g3 + srl %o2, 16, %o2 + andcc %o0, 0x4, %g0 + or %g3, %o2, %o2 +1: be cpa + andcc %o1, 0xffffff80, %o3 + ld [%o0 + 0x00], %g2 + sub %o1, 4, %o1 + addcc %g2, %o2, %o2 + add %o0, 4, %o0 + addx %g0, %o2, %o2 + b cpa + andcc %o1, 0xffffff80, %o3 + + /* The common case is to get called with a nicely aligned + * buffer of size 0x20. Follow the code path for that case. + */ + .globl C_LABEL(csum_partial) +C_LABEL(csum_partial): /* %o0=buf, %o1=len, %o2=sum */ + andcc %o0, 0x7, %g0 ! alignment problems? + bne csum_partial_fix_alignment ! yep, handle it + sethi %hi(cpte - 8), %g7 ! prepare table jmp ptr + andcc %o1, 0xffffff80, %o3 ! num loop iterations +cpa: be 3f ! none to do + andcc %o1, 0x70, %g1 ! clears carry flag too +5: CSUM_BIGCHUNK(%o0, 0x00, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + CSUM_BIGCHUNK(%o0, 0x20, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + CSUM_BIGCHUNK(%o0, 0x40, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + CSUM_BIGCHUNK(%o0, 0x60, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + addx %g0, %o2, %o2 ! sink in final carry + subcc %o3, 128, %o3 ! detract from loop iters + bne 5b ! more to do + add %o0, 128, %o0 ! advance buf ptr + andcc %o1, 0x70, %g1 ! clears carry flag too +3: be cpte ! nope + andcc %o1, 0xf, %g0 ! anything left at all? + srl %g1, 1, %o4 ! compute offset + sub %g7, %g1, %g7 ! adjust jmp ptr + sub %g7, %o4, %g7 ! final jmp ptr adjust + jmp %g7 + %lo(cpte - 8) ! enter the table + add %o0, %g1, %o0 ! advance buf ptr +cptbl: CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3, %g4, %g5) + addx %g0, %o2, %o2 ! fetch final carry + andcc %o1, 0xf, %g0 ! anything left at all? +cpte: bne csum_partial_end_cruft ! yep, handle it + andcc %o1, 8, %g0 ! check how much +cpout: retl ! get outta here + mov %o2, %o0 ! 
return computed csum + + .globl C_LABEL(__csum_partial_copy_start), C_LABEL(__csum_partial_copy_end) +C_LABEL(__csum_partial_copy_start): + +#define EX(x,y,a,b,z) \ +98: x,y; \ + .section .fixup,z##alloc,z##execinstr; \ + .align 4; \ +99: ba 30f; \ + a, b, %o3; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4 + +#define EX2(x,y,z) \ +98: x,y; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 30f; \ + .text; \ + .align 4 + +#define EX3(x,y,z) \ +98: x,y; \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word 98b, 96f; \ + .text; \ + .align 4 + +#define EXT(start,end,handler,z) \ + .section __ex_table,z##alloc; \ + .align 4; \ + .word start, 0, end, handler; \ + .text; \ + .align 4 + + /* This aligned version executes typically in 8.5 superscalar cycles, this + * is the best I can do. I say 8.5 because the final add will pair with + * the next ldd in the main unrolled loop. Thus the pipe is always full. + * If you change these macros (including order of instructions), + * please check the fixup code below as well. + */ +#define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [src + off + 0x00], t0; \ + ldd [src + off + 0x08], t2; \ + addxcc t0, sum, sum; \ + ldd [src + off + 0x10], t4; \ + addxcc t1, sum, sum; \ + ldd [src + off + 0x18], t6; \ + addxcc t2, sum, sum; \ + std t0, [dst + off + 0x00]; \ + addxcc t3, sum, sum; \ + std t2, [dst + off + 0x08]; \ + addxcc t4, sum, sum; \ + std t4, [dst + off + 0x10]; \ + addxcc t5, sum, sum; \ + std t6, [dst + off + 0x18]; \ + addxcc t6, sum, sum; \ + addxcc t7, sum, sum; + + /* 12 superscalar cycles seems to be the limit for this case, + * because of this we thus do all the ldd's together to get + * Viking MXCC into streaming mode. Ho hum... + */ +#define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [src + off + 0x00], t0; \ + ldd [src + off + 0x08], t2; \ + ldd [src + off + 0x10], t4; \ + ldd [src + off + 0x18], t6; \ + st t0, [dst + off + 0x00]; \ + addxcc t0, sum, sum; \ + st t1, [dst + off + 0x04]; \ + addxcc t1, sum, sum; \ + st t2, [dst + off + 0x08]; \ + addxcc t2, sum, sum; \ + st t3, [dst + off + 0x0c]; \ + addxcc t3, sum, sum; \ + st t4, [dst + off + 0x10]; \ + addxcc t4, sum, sum; \ + st t5, [dst + off + 0x14]; \ + addxcc t5, sum, sum; \ + st t6, [dst + off + 0x18]; \ + addxcc t6, sum, sum; \ + st t7, [dst + off + 0x1c]; \ + addxcc t7, sum, sum; + + /* Yuck, 6 superscalar cycles... */ +#define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3) \ + ldd [src - off - 0x08], t0; \ + ldd [src - off - 0x00], t2; \ + addxcc t0, sum, sum; \ + st t0, [dst - off - 0x08]; \ + addxcc t1, sum, sum; \ + st t1, [dst - off - 0x04]; \ + addxcc t2, sum, sum; \ + st t2, [dst - off - 0x00]; \ + addxcc t3, sum, sum; \ + st t3, [dst - off + 0x04]; + + /* Handle the end cruft code out of band for better cache patterns. 
*/ +cc_end_cruft: + be 1f + andcc %o3, 4, %g0 + EX(ldd [%o0 + 0x00], %g2, and %o3, 0xf,#) + add %o1, 8, %o1 + addcc %g2, %g7, %g7 + add %o0, 8, %o0 + addxcc %g3, %g7, %g7 + EX2(st %g2, [%o1 - 0x08],#) + addx %g0, %g7, %g7 + andcc %o3, 4, %g0 + EX2(st %g3, [%o1 - 0x04],#) +1: be 1f + andcc %o3, 3, %o3 + EX(ld [%o0 + 0x00], %g2, add %o3, 4,#) + add %o1, 4, %o1 + addcc %g2, %g7, %g7 + EX2(st %g2, [%o1 - 0x04],#) + addx %g0, %g7, %g7 + andcc %o3, 3, %g0 + add %o0, 4, %o0 +1: be 1f + addcc %o3, -1, %g0 + bne 2f + subcc %o3, 2, %o3 + b 4f + or %g0, %g0, %o4 +2: EX(lduh [%o0 + 0x00], %o4, add %o3, 2,#) + add %o0, 2, %o0 + EX2(sth %o4, [%o1 + 0x00],#) + be 6f + add %o1, 2, %o1 + sll %o4, 16, %o4 +4: EX(ldub [%o0 + 0x00], %o5, add %g0, 1,#) + EX2(stb %o5, [%o1 + 0x00],#) + sll %o5, 8, %o5 + or %o5, %o4, %o4 +6: addcc %o4, %g7, %g7 +1: retl + addx %g0, %g7, %o0 + + /* Also, handle the alignment code out of band. */ +cc_dword_align: + cmp %g1, 6 + bl,a ccte + andcc %g1, 0xf, %o3 + andcc %o0, 0x1, %g0 + bne ccslow + andcc %o0, 0x2, %g0 + be 1f + andcc %o0, 0x4, %g0 + EX(lduh [%o0 + 0x00], %g4, add %g1, 0,#) + sub %g1, 2, %g1 + EX2(sth %g4, [%o1 + 0x00],#) + add %o0, 2, %o0 + sll %g4, 16, %g4 + addcc %g4, %g7, %g7 + add %o1, 2, %o1 + srl %g7, 16, %g3 + addx %g0, %g3, %g4 + sll %g7, 16, %g7 + sll %g4, 16, %g3 + srl %g7, 16, %g7 + andcc %o0, 0x4, %g0 + or %g3, %g7, %g7 +1: be 3f + andcc %g1, 0xffffff80, %g0 + EX(ld [%o0 + 0x00], %g4, add %g1, 0,#) + sub %g1, 4, %g1 + EX2(st %g4, [%o1 + 0x00],#) + add %o0, 4, %o0 + addcc %g4, %g7, %g7 + add %o1, 4, %o1 + addx %g0, %g7, %g7 + b 3f + andcc %g1, 0xffffff80, %g0 + + /* Sun, you just can't beat me, you just can't. Stop trying, + * give up. I'm serious, I am going to kick the living shit + * out of you, game over, lights out. + */ + .align 8 + .globl C_LABEL(__csum_partial_copy_sparc_generic) +C_LABEL(__csum_partial_copy_sparc_generic): + /* %o0=src, %o1=dest, %g1=len, %g7=sum */ + xor %o0, %o1, %o4 ! get changing bits + andcc %o4, 3, %g0 ! check for mismatched alignment + bne ccslow ! better this than unaligned/fixups + andcc %o0, 7, %g0 ! need to align things? + bne cc_dword_align ! yes, we check for short lengths there + andcc %g1, 0xffffff80, %g0 ! can we use unrolled loop? +3: be 3f ! nope, less than one loop remains + andcc %o1, 4, %g0 ! dest aligned on 4 or 8 byte boundry? + be ccdbl + 4 ! 8 byte aligned, kick ass +5: CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) +10: EXT(5b, 10b, 20f,#) ! note for exception handling + sub %g1, 128, %g1 ! detract from length + addx %g0, %g7, %g7 ! add in last carry bit + andcc %g1, 0xffffff80, %g0 ! more to csum? + add %o0, 128, %o0 ! advance src ptr + bne 5b ! we did not go negative, continue looping + add %o1, 128, %o1 ! advance dest ptr +3: andcc %g1, 0x70, %o2 ! can use table? +ccmerge:be ccte ! nope, go and check for end cruft + andcc %g1, 0xf, %o3 ! get low bits of length (clears carry btw) + srl %o2, 1, %o4 ! begin negative offset computation + sethi %hi(12f), %o5 ! set up table ptr end + add %o0, %o2, %o0 ! advance src ptr + sub %o5, %o4, %o5 ! continue table calculation + sll %o2, 1, %g2 ! constant multiplies are fun... + sub %o5, %g2, %o5 ! some more adjustments + jmp %o5 + %lo(12f) ! jump into it, duff style, wheee... + add %o1, %o2, %o1 ! 
advance dest ptr (carry is clear btw) +cctbl: CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5) +12: EXT(cctbl, 12b, 22f,#) ! note for exception table handling + addx %g0, %g7, %g7 + andcc %o3, 0xf, %g0 ! check for low bits set +ccte: bne cc_end_cruft ! something left, handle it out of band + andcc %o3, 8, %g0 ! begin checks for that code + retl ! return + mov %g7, %o0 ! give em the computed checksum +ccdbl: CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) +11: EXT(ccdbl, 11b, 21f,#) ! note for exception table handling + sub %g1, 128, %g1 ! detract from length + addx %g0, %g7, %g7 ! add in last carry bit + andcc %g1, 0xffffff80, %g0 ! more to csum? + add %o0, 128, %o0 ! advance src ptr + bne ccdbl ! we did not go negative, continue looping + add %o1, 128, %o1 ! advance dest ptr + b ccmerge ! finish it off, above + andcc %g1, 0x70, %o2 ! can use table? (clears carry btw) + +ccslow: cmp %g1, 0 + mov 0, %g5 + bleu 4f + andcc %o0, 1, %o5 + be,a 1f + srl %g1, 1, %g4 + sub %g1, 1, %g1 + EX(ldub [%o0], %g5, add %g1, 1,#) + add %o0, 1, %o0 + EX2(stb %g5, [%o1],#) + srl %g1, 1, %g4 + add %o1, 1, %o1 +1: cmp %g4, 0 + be,a 3f + andcc %g1, 1, %g0 + andcc %o0, 2, %g0 + be,a 1f + srl %g4, 1, %g4 + EX(lduh [%o0], %o4, add %g1, 0,#) + sub %g1, 2, %g1 + srl %o4, 8, %g2 + sub %g4, 1, %g4 + EX2(stb %g2, [%o1],#) + add %o4, %g5, %g5 + EX2(stb %o4, [%o1 + 1],#) + add %o0, 2, %o0 + srl %g4, 1, %g4 + add %o1, 2, %o1 +1: cmp %g4, 0 + be,a 2f + andcc %g1, 2, %g0 + EX3(ld [%o0], %o4,#) +5: srl %o4, 24, %g2 + srl %o4, 16, %g3 + EX2(stb %g2, [%o1],#) + srl %o4, 8, %g2 + EX2(stb %g3, [%o1 + 1],#) + add %o0, 4, %o0 + EX2(stb %g2, [%o1 + 2],#) + addcc %o4, %g5, %g5 + EX2(stb %o4, [%o1 + 3],#) + addx %g5, %g0, %g5 ! I am now to lazy to optimize this (question it + add %o1, 4, %o1 ! is worthy). Maybe some day - with the sll/srl + subcc %g4, 1, %g4 ! tricks + bne,a 5b + EX3(ld [%o0], %o4,#) + sll %g5, 16, %g2 + srl %g5, 16, %g5 + srl %g2, 16, %g2 + andcc %g1, 2, %g0 + add %g2, %g5, %g5 +2: be,a 3f + andcc %g1, 1, %g0 + EX(lduh [%o0], %o4, and %g1, 3,#) + andcc %g1, 1, %g0 + srl %o4, 8, %g2 + add %o0, 2, %o0 + EX2(stb %g2, [%o1],#) + add %g5, %o4, %g5 + EX2(stb %o4, [%o1 + 1],#) + add %o1, 2, %o1 +3: be,a 1f + sll %g5, 16, %o4 + EX(ldub [%o0], %g2, add %g0, 1,#) + sll %g2, 8, %o4 + EX2(stb %g2, [%o1],#) + add %g5, %o4, %g5 + sll %g5, 16, %o4 +1: addcc %o4, %g5, %g5 + srl %g5, 16, %o4 + addx %g0, %o4, %g5 + orcc %o5, %g0, %g0 + be 4f + srl %g5, 8, %o4 + and %g5, 0xff, %g2 + and %o4, 0xff, %o4 + sll %g2, 8, %g2 + or %g2, %o4, %g5 +4: addcc %g7, %g5, %g7 + retl + addx %g0, %g7, %o0 +C_LABEL(__csum_partial_copy_end): + +/* We do these strange calculations for the csum_*_from_user case only, ie. + * we only bother with faults on loads... 
*/ + +/* o2 = ((g2%20)&3)*8 + * o3 = g1 - (g2/20)*32 - o2 */ +20: + cmp %g2, 20 + blu,a 1f + and %g2, 3, %o2 + sub %g1, 32, %g1 + b 20b + sub %g2, 20, %g2 +1: + sll %o2, 3, %o2 + b 31f + sub %g1, %o2, %o3 + +/* o2 = (!(g2 & 15) ? 0 : (((g2 & 15) + 1) & ~1)*8) + * o3 = g1 - (g2/16)*32 - o2 */ +21: + andcc %g2, 15, %o3 + srl %g2, 4, %g2 + be,a 1f + clr %o2 + add %o3, 1, %o3 + and %o3, 14, %o3 + sll %o3, 3, %o2 +1: + sll %g2, 5, %g2 + sub %g1, %g2, %o3 + b 31f + sub %o3, %o2, %o3 + +/* o0 += (g2/10)*16 - 0x70 + * 01 += (g2/10)*16 - 0x70 + * o2 = (g2 % 10) ? 8 : 0 + * o3 += 0x70 - (g2/10)*16 - o2 */ +22: + cmp %g2, 10 + blu,a 1f + sub %o0, 0x70, %o0 + add %o0, 16, %o0 + add %o1, 16, %o1 + sub %o3, 16, %o3 + b 22b + sub %g2, 10, %g2 +1: + sub %o1, 0x70, %o1 + add %o3, 0x70, %o3 + clr %o2 + tst %g2 + bne,a 1f + mov 8, %o2 +1: + b 31f + sub %o3, %o2, %o3 +96: + and %g1, 3, %g1 + sll %g4, 2, %g4 + add %g1, %g4, %o3 +30: +/* %o1 is dst + * %o3 is # bytes to zero out + * %o4 is faulting address + * %o5 is %pc where fault occurred */ + clr %o2 +31: +/* %o0 is src + * %o1 is dst + * %o2 is # of bytes to copy from src to dst + * %o3 is # bytes to zero out + * %o4 is faulting address + * %o5 is %pc where fault occurred */ + save %sp, -104, %sp + mov %i5, %o0 + mov %i7, %o1 + mov %i4, %o2 + call C_LABEL(lookup_fault) + mov %g7, %i4 + cmp %o0, 2 + bne 1f + add %g0, -EFAULT, %i5 + tst %i2 + be 2f + mov %i0, %o1 + mov %i1, %o0 +5: + call C_LABEL(__memcpy) + mov %i2, %o2 + tst %o0 + bne,a 2f + add %i3, %i2, %i3 + add %i1, %i2, %i1 +2: + mov %i1, %o0 +6: + call C_LABEL(__bzero) + mov %i3, %o1 +1: + ld [%sp + 168], %o2 ! struct_ptr of parent + st %i5, [%o2] + ret + restore + + .section __ex_table,#alloc + .align 4 + .word 5b,2 + .word 6b,2 diff --git a/pfinet/linux-src/arch/sparc64/lib/checksum.S b/pfinet/linux-src/arch/sparc64/lib/checksum.S new file mode 100644 index 00000000..ea732b36 --- /dev/null +++ b/pfinet/linux-src/arch/sparc64/lib/checksum.S @@ -0,0 +1,278 @@ +/* checksum.S: Sparc V9 optimized checksum code. + * + * Copyright(C) 1995 Linus Torvalds + * Copyright(C) 1995 Miguel de Icaza + * Copyright(C) 1996 David S. Miller + * Copyright(C) 1997 Jakub Jelinek + * + * derived from: + * Linux/Alpha checksum c-code + * Linux/ix86 inline checksum assembly + * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code) + * David Mosberger-Tang for optimized reference c-code + * BSD4.4 portable checksum routine + */ + +#include <asm/errno.h> +#include <asm/head.h> +#include <asm/ptrace.h> +#include <asm/asi.h> +#include <asm/page.h> + + /* The problem with the "add with carry" instructions on Ultra + * are two fold. Firstly, they cannot pair with jack shit, + * and also they only add in the 32-bit carry condition bit + * into the accumulated sum. The following is much better. + * For larger chunks we use VIS code, which is faster ;) + */ + +#define src o0 +#define dst o1 +#define len o2 +#define sum o3 + + .text + /* I think I have an erection... Once _AGAIN_ the SunSoft + * engineers are caught asleep at the keyboard, tsk tsk... 
+ */ + +#define CSUMCOPY_LASTCHUNK(off, t0, t1) \ + ldxa [%src - off - 0x08] %asi, t0; \ + ldxa [%src - off - 0x00] %asi, t1; \ + nop; nop; \ + addcc t0, %sum, %sum; \ + stw t0, [%dst - off - 0x04]; \ + srlx t0, 32, t0; \ + bcc,pt %xcc, 51f; \ + stw t0, [%dst - off - 0x08]; \ + add %sum, 1, %sum; \ +51: addcc t1, %sum, %sum; \ + stw t1, [%dst - off + 0x04]; \ + srlx t1, 32, t1; \ + bcc,pt %xcc, 52f; \ + stw t1, [%dst - off - 0x00]; \ + add %sum, 1, %sum; \ +52: + +cpc_start: +cc_end_cruft: + andcc %g7, 8, %g0 ! IEU1 Group + be,pn %icc, 1f ! CTI + and %g7, 4, %g5 ! IEU0 + ldxa [%src + 0x00] %asi, %g2 ! Load Group + add %dst, 8, %dst ! IEU0 + add %src, 8, %src ! IEU1 + addcc %g2, %sum, %sum ! IEU1 Group + 2 bubbles + stw %g2, [%dst - 0x04] ! Store + srlx %g2, 32, %g2 ! IEU0 + bcc,pt %xcc, 1f ! CTI Group + stw %g2, [%dst - 0x08] ! Store + add %sum, 1, %sum ! IEU0 +1: brz,pt %g5, 1f ! CTI Group + clr %g2 ! IEU0 + lduwa [%src + 0x00] %asi, %g2 ! Load + add %dst, 4, %dst ! IEU0 Group + add %src, 4, %src ! IEU1 + stw %g2, [%dst - 0x04] ! Store Group + 2 bubbles + sllx %g2, 32, %g2 ! IEU0 +1: andcc %g7, 2, %g0 ! IEU1 + be,pn %icc, 1f ! CTI Group + clr %o4 ! IEU1 + lduha [%src + 0x00] %asi, %o4 ! Load + add %src, 2, %src ! IEU0 Group + add %dst, 2, %dst ! IEU1 + sth %o4, [%dst - 0x2] ! Store Group + 2 bubbles + sll %o4, 16, %o4 ! IEU0 +1: andcc %g7, 1, %g0 ! IEU1 + be,pn %icc, 1f ! CTI Group + clr %o5 ! IEU0 + lduba [%src + 0x00] %asi, %o5 ! Load + stb %o5, [%dst + 0x00] ! Store Group + 2 bubbles + sll %o5, 8, %o5 ! IEU0 +1: or %g2, %o4, %o4 ! IEU1 + or %o5, %o4, %o4 ! IEU0 Group + addcc %o4, %sum, %sum ! IEU1 + bcc,pt %xcc, ccfold ! CTI + sethi %uhi(PAGE_OFFSET), %g4 ! IEU0 Group + b,pt %xcc, ccfold ! CTI + add %sum, 1, %sum ! IEU1 + +cc_fixit: + cmp %len, 6 ! IEU1 Group + bl,a,pn %icc, ccte ! CTI + andcc %len, 0xf, %g7 ! IEU1 Group + andcc %src, 2, %g0 ! IEU1 Group + be,pn %icc, 1f ! CTI + andcc %src, 0x4, %g0 ! IEU1 Group + lduha [%src + 0x00] %asi, %g4 ! Load + sub %len, 2, %len ! IEU0 + add %src, 2, %src ! IEU0 Group + add %dst, 2, %dst ! IEU1 + sll %g4, 16, %g3 ! IEU0 Group + 1 bubble + addcc %g3, %sum, %sum ! IEU1 + bcc,pt %xcc, 0f ! CTI + srl %sum, 16, %g3 ! IEU0 Group + add %g3, 1, %g3 ! IEU0 4 clocks (mispredict) +0: andcc %src, 0x4, %g0 ! IEU1 Group + sth %g4, [%dst - 0x2] ! Store + sll %sum, 16, %sum ! IEU0 + sll %g3, 16, %g3 ! IEU0 Group + srl %sum, 16, %sum ! IEU0 Group + or %g3, %sum, %sum ! IEU0 Group (regdep) +1: be,pt %icc, ccmerge ! CTI + andcc %len, 0xf0, %g1 ! IEU1 + lduwa [%src + 0x00] %asi, %g4 ! Load Group + sub %len, 4, %len ! IEU0 + add %src, 4, %src ! IEU1 + add %dst, 4, %dst ! IEU0 Group + addcc %g4, %sum, %sum ! IEU1 Group + 1 bubble + stw %g4, [%dst - 0x4] ! Store + bcc,pt %xcc, ccmerge ! CTI + andcc %len, 0xf0, %g1 ! IEU1 Group + b,pt %xcc, ccmerge ! CTI 4 clocks (mispredict) + add %sum, 1, %sum ! IEU0 + + .align 32 + .globl csum_partial_copy_sparc64 +csum_partial_copy_sparc64: /* %o0=src, %o1=dest, %o2=len, %o3=sum */ + xorcc %src, %dst, %o4 ! IEU1 Group + srl %sum, 0, %sum ! IEU0 + andcc %o4, 3, %g0 ! IEU1 Group + srl %len, 0, %len ! IEU0 + bne,pn %icc, ccslow ! CTI + andcc %src, 1, %g0 ! IEU1 Group + bne,pn %icc, ccslow ! CTI + cmp %len, 256 ! IEU1 Group + bgeu,pt %icc, csum_partial_copy_vis ! CTI + andcc %src, 7, %g0 ! IEU1 Group + bne,pn %icc, cc_fixit ! CTI + andcc %len, 0xf0, %g1 ! IEU1 Group +ccmerge:be,pn %icc, ccte ! CTI + andcc %len, 0xf, %g7 ! IEU1 Group + sll %g1, 2, %o4 ! IEU0 +13: sethi %hi(12f), %o5 ! IEU0 Group + add %src, %g1, %src ! 
IEU1 + sub %o5, %o4, %o5 ! IEU0 Group + jmpl %o5 + %lo(12f), %g0 ! CTI Group brk forced + add %dst, %g1, %dst ! IEU0 Group +cctbl: CSUMCOPY_LASTCHUNK(0xe8,%g2,%g3) + CSUMCOPY_LASTCHUNK(0xd8,%g2,%g3) + CSUMCOPY_LASTCHUNK(0xc8,%g2,%g3) + CSUMCOPY_LASTCHUNK(0xb8,%g2,%g3) + CSUMCOPY_LASTCHUNK(0xa8,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x98,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x88,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x78,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x68,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x58,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x48,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x38,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x28,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x18,%g2,%g3) + CSUMCOPY_LASTCHUNK(0x08,%g2,%g3) +12: + andcc %len, 0xf, %g7 ! IEU1 Group +ccte: bne,pn %icc, cc_end_cruft ! CTI + sethi %uhi(PAGE_OFFSET), %g4 ! IEU0 +ccfold: sllx %sum, 32, %o0 ! IEU0 Group + addcc %sum, %o0, %o0 ! IEU1 Group (regdep) + srlx %o0, 32, %o0 ! IEU0 Group (regdep) + bcs,a,pn %xcc, 1f ! CTI + add %o0, 1, %o0 ! IEU1 4 clocks (mispredict) +1: retl ! CTI Group brk forced + sllx %g4, 32, %g4 ! IEU0 Group + +ccslow: mov 0, %g5 + brlez,pn %len, 4f + andcc %src, 1, %o5 + be,a,pt %icc, 1f + srl %len, 1, %g7 + sub %len, 1, %len + lduba [%src] %asi, %g5 + add %src, 1, %src + stb %g5, [%dst] + srl %len, 1, %g7 + add %dst, 1, %dst +1: brz,a,pn %g7, 3f + andcc %len, 1, %g0 + andcc %src, 2, %g0 + be,a,pt %icc, 1f + srl %g7, 1, %g7 + lduha [%src] %asi, %o4 + sub %len, 2, %len + srl %o4, 8, %g2 + sub %g7, 1, %g7 + stb %g2, [%dst] + add %o4, %g5, %g5 + stb %o4, [%dst + 1] + add %src, 2, %src + srl %g7, 1, %g7 + add %dst, 2, %dst +1: brz,a,pn %g7, 2f + andcc %len, 2, %g0 + lduwa [%src] %asi, %o4 +5: srl %o4, 24, %g2 + srl %o4, 16, %g3 + stb %g2, [%dst] + srl %o4, 8, %g2 + stb %g3, [%dst + 1] + add %src, 4, %src + stb %g2, [%dst + 2] + addcc %o4, %g5, %g5 + stb %o4, [%dst + 3] + addc %g5, %g0, %g5 + add %dst, 4, %dst + subcc %g7, 1, %g7 + bne,a,pt %icc, 5b + lduwa [%src] %asi, %o4 + sll %g5, 16, %g2 + srl %g5, 16, %g5 + srl %g2, 16, %g2 + andcc %len, 2, %g0 + add %g2, %g5, %g5 +2: be,a,pt %icc, 3f + andcc %len, 1, %g0 + lduha [%src] %asi, %o4 + andcc %len, 1, %g0 + srl %o4, 8, %g2 + add %src, 2, %src + stb %g2, [%dst] + add %g5, %o4, %g5 + stb %o4, [%dst + 1] + add %dst, 2, %dst +3: be,a,pt %icc, 1f + sll %g5, 16, %o4 + lduba [%src] %asi, %g2 + sll %g2, 8, %o4 + stb %g2, [%dst] + add %g5, %o4, %g5 + sll %g5, 16, %o4 +1: addcc %o4, %g5, %g5 + srl %g5, 16, %o4 + addc %g0, %o4, %g5 + brz,pt %o5, 4f + srl %g5, 8, %o4 + and %g5, 0xff, %g2 + and %o4, 0xff, %o4 + sll %g2, 8, %g2 + or %g2, %o4, %g5 +4: addcc %sum, %g5, %sum + addc %g0, %sum, %o0 + retl + srl %o0, 0, %o0 +cpc_end: + + .globl cpc_handler +cpc_handler: + ldx [%sp + 0x7ff + 128], %g1 + sub %g0, EFAULT, %g2 + brnz,a,pt %g1, 1f + st %g2, [%g1] +1: sethi %uhi(PAGE_OFFSET), %g4 + retl + sllx %g4, 32, %g4 + + .section __ex_table + .align 4 + .word cpc_start, 0, cpc_end, cpc_handler + |
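Both SPARC variants above dispose of the short tail the same way: the 0x70 bits of the remaining length (0xf0 on sparc64, whose table simply has 15 entries instead of 7) say how many 16-byte CSUMCOPY_LASTCHUNK expansions are still needed, the source and destination pointers are bumped past the whole tail, and the code computes a negative offset into the unrolled table and jumps into the middle of it ("duff style") instead of looping. The following C sketch shows the same dispatch with a fall-through switch standing in for the computed jump; copy_sum_block and copy_sum_tail are illustrative names, not routines from this tree.

    #include <stddef.h>
    #include <stdint.h>

    /* One 16-byte unit: copy four 32-bit words, add them into the running
     * sum, and advance both pointers (roughly one CSUMCOPY_LASTCHUNK). */
    static void copy_sum_block(const uint32_t **src, uint32_t **dst,
                               uint64_t *sum)
    {
        for (int i = 0; i < 4; i++) {
            uint32_t w = (*src)[i];
            (*dst)[i] = w;
            *sum += w;
        }
        *src += 4;
        *dst += 4;
    }

    /* Enter the unrolled chain part-way down and fall straight through:
     * the C analogue of "jmp %o5 + %lo(12f)". */
    static uint64_t copy_sum_tail(const uint32_t *src, uint32_t *dst,
                                  size_t len, uint64_t sum)
    {
        switch ((len & 0x70) >> 4) {              /* 0..7 blocks left */
        case 7: copy_sum_block(&src, &dst, &sum); /* fall through */
        case 6: copy_sum_block(&src, &dst, &sum); /* fall through */
        case 5: copy_sum_block(&src, &dst, &sum); /* fall through */
        case 4: copy_sum_block(&src, &dst, &sum); /* fall through */
        case 3: copy_sum_block(&src, &dst, &sum); /* fall through */
        case 2: copy_sum_block(&src, &dst, &sum); /* fall through */
        case 1: copy_sum_block(&src, &dst, &sum); /* fall through */
        case 0: break;
        }
        return sum;
    }

The assembly gets the same effect with no per-block control flow at all: each macro expansion addresses src and dst at fixed negative offsets, which is why both pointers are advanced past the entire tail before the jump is taken.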
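Whatever path the data takes, the arithmetic is the RFC 1071 Internet checksum cited in the file header: the buffer is summed in 16-bit (here 32- and 64-bit) pieces, every carry is added back into the low end (the addxcc chains on sparc, the bcc,pt / add %sum, 1 pairs on sparc64), and the accumulator is folded down; the final header checksum is the one's complement of that folded sum. A short C sketch of the fold and of a straight-line accumulation follows; csum_fold64 and ip_checksum_sketch are illustrative names assumed for this example, not anything exported by this code.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Fold a 64-bit one's-complement accumulator down to 16 bits with
     * end-around carry, the same kind of reduction the ccfold block
     * above begins. */
    static uint16_t csum_fold64(uint64_t sum)
    {
        sum = (sum & 0xffffffffULL) + (sum >> 32); /* fold 64 -> 32, may carry */
        sum = (sum & 0xffffffffULL) + (sum >> 32); /* absorb that carry */
        sum = (sum & 0xffff) + (sum >> 16);        /* fold 32 -> 16, may carry */
        sum = (sum & 0xffff) + (sum >> 16);        /* absorb that carry */
        return (uint16_t)sum;
    }

    /* Sum a buffer as network-order 16-bit words; the header checksum is
     * the one's complement of the folded total. */
    static uint16_t ip_checksum_sketch(const uint8_t *buf, size_t len)
    {
        uint64_t sum = 0;

        while (len > 1) {
            sum += (uint64_t)((buf[0] << 8) | buf[1]);
            buf += 2;
            len -= 2;
        }
        if (len)                      /* odd trailing byte, zero-padded */
            sum += (uint64_t)buf[0] << 8;
        return (uint16_t)~csum_fold64(sum);
    }

    int main(void)
    {
        /* A 20-byte IPv4-style header with its checksum field zeroed. */
        const uint8_t hdr[20] = {
            0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x40, 0x00,
            0x40, 0x01, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x01,
            0x7f, 0x00, 0x00, 0x01
        };

        printf("checksum = 0x%04x\n", ip_checksum_sketch(hdr, sizeof hdr));
        return 0;
    }

Folding twice at each width covers the case where the first fold itself carries out; the unrolled loops above handle the same situation by adding the carry bit back into the sum after every addcc.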