Diffstat (limited to 'pfinet/linux-src/arch/sparc64/lib')
-rw-r--r--	pfinet/linux-src/arch/sparc64/lib/checksum.S	278
1 file changed, 278 insertions, 0 deletions
diff --git a/pfinet/linux-src/arch/sparc64/lib/checksum.S b/pfinet/linux-src/arch/sparc64/lib/checksum.S
new file mode 100644
index 00000000..ea732b36
--- /dev/null
+++ b/pfinet/linux-src/arch/sparc64/lib/checksum.S
@@ -0,0 +1,278 @@
+/* checksum.S: Sparc V9 optimized checksum code.
+ *
+ * Copyright(C) 1995 Linus Torvalds
+ * Copyright(C) 1995 Miguel de Icaza
+ * Copyright(C) 1996 David S. Miller
+ * Copyright(C) 1997 Jakub Jelinek
+ *
+ * derived from:
+ *	Linux/Alpha checksum c-code
+ *	Linux/ix86 inline checksum assembly
+ *	RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
+ *	David Mosberger-Tang for optimized reference c-code
+ *	BSD4.4 portable checksum routine
+ */
+
+#include <asm/errno.h>
+#include <asm/head.h>
+#include <asm/ptrace.h>
+#include <asm/asi.h>
+#include <asm/page.h>
+
+	/* The problem with the "add with carry" instructions on Ultra
+	 * are two fold. Firstly, they cannot pair with jack shit,
+	 * and also they only add in the 32-bit carry condition bit
+	 * into the accumulated sum. The following is much better.
+	 * For larger chunks we use VIS code, which is faster ;)
+	 */
+
+#define src o0
+#define dst o1
+#define len o2
+#define sum o3
+
+	.text
+	/* I think I have an erection... Once _AGAIN_ the SunSoft
+	 * engineers are caught asleep at the keyboard, tsk tsk...
+	 */
+
+#define CSUMCOPY_LASTCHUNK(off, t0, t1)			\
+	ldxa	[%src - off - 0x08] %asi, t0;		\
+	ldxa	[%src - off - 0x00] %asi, t1;		\
+	nop; nop;					\
+	addcc	t0, %sum, %sum;				\
+	stw	t0, [%dst - off - 0x04];		\
+	srlx	t0, 32, t0;				\
+	bcc,pt	%xcc, 51f;				\
+	 stw	t0, [%dst - off - 0x08];		\
+	add	%sum, 1, %sum;				\
+51:	addcc	t1, %sum, %sum;				\
+	stw	t1, [%dst - off + 0x04];		\
+	srlx	t1, 32, t1;				\
+	bcc,pt	%xcc, 52f;				\
+	 stw	t1, [%dst - off - 0x00];		\
+	add	%sum, 1, %sum;				\
+52:
+
+cpc_start:
+cc_end_cruft:
+	andcc	%g7, 8, %g0			! IEU1	Group
+	be,pn	%icc, 1f			! CTI
+	 and	%g7, 4, %g5			! IEU0
+	ldxa	[%src + 0x00] %asi, %g2		! Load	Group
+	add	%dst, 8, %dst			! IEU0
+	add	%src, 8, %src			! IEU1
+	addcc	%g2, %sum, %sum			! IEU1	Group + 2 bubbles
+	stw	%g2, [%dst - 0x04]		! Store
+	srlx	%g2, 32, %g2			! IEU0
+	bcc,pt	%xcc, 1f			! CTI	Group
+	 stw	%g2, [%dst - 0x08]		! Store
+	add	%sum, 1, %sum			! IEU0
+1:	brz,pt	%g5, 1f				! CTI	Group
+	 clr	%g2				! IEU0
+	lduwa	[%src + 0x00] %asi, %g2		! Load
+	add	%dst, 4, %dst			! IEU0	Group
+	add	%src, 4, %src			! IEU1
+	stw	%g2, [%dst - 0x04]		! Store	Group + 2 bubbles
+	sllx	%g2, 32, %g2			! IEU0
+1:	andcc	%g7, 2, %g0			! IEU1
+	be,pn	%icc, 1f			! CTI	Group
+	 clr	%o4				! IEU1
+	lduha	[%src + 0x00] %asi, %o4		! Load
+	add	%src, 2, %src			! IEU0	Group
+	add	%dst, 2, %dst			! IEU1
+	sth	%o4, [%dst - 0x2]		! Store	Group + 2 bubbles
+	sll	%o4, 16, %o4			! IEU0
+1:	andcc	%g7, 1, %g0			! IEU1
+	be,pn	%icc, 1f			! CTI	Group
+	 clr	%o5				! IEU0
+	lduba	[%src + 0x00] %asi, %o5		! Load
+	stb	%o5, [%dst + 0x00]		! Store	Group + 2 bubbles
+	sll	%o5, 8, %o5			! IEU0
+1:	or	%g2, %o4, %o4			! IEU1
+	or	%o5, %o4, %o4			! IEU0	Group
+	addcc	%o4, %sum, %sum			! IEU1
+	bcc,pt	%xcc, ccfold			! CTI
+	 sethi	%uhi(PAGE_OFFSET), %g4		! IEU0	Group
+	b,pt	%xcc, ccfold			! CTI
+	 add	%sum, 1, %sum			! IEU1
+
+cc_fixit:
+	cmp	%len, 6				! IEU1	Group
+	bl,a,pn	%icc, ccte			! CTI
+	 andcc	%len, 0xf, %g7			! IEU1	Group
+	andcc	%src, 2, %g0			! IEU1	Group
+	be,pn	%icc, 1f			! CTI
+	 andcc	%src, 0x4, %g0			! IEU1	Group
+	lduha	[%src + 0x00] %asi, %g4		! Load
+	sub	%len, 2, %len			! IEU0
+	add	%src, 2, %src			! IEU0	Group
+	add	%dst, 2, %dst			! IEU1
+	sll	%g4, 16, %g3			! IEU0	Group + 1 bubble
+	addcc	%g3, %sum, %sum			! IEU1
+	bcc,pt	%xcc, 0f			! CTI
+	 srl	%sum, 16, %g3			! IEU0	Group
+	add	%g3, 1, %g3			! IEU0	4 clocks (mispredict)
+0:	andcc	%src, 0x4, %g0			! IEU1	Group
+	sth	%g4, [%dst - 0x2]		! Store
+	sll	%sum, 16, %sum			! IEU0
+	sll	%g3, 16, %g3			! IEU0	Group
+	srl	%sum, 16, %sum			! IEU0	Group
+	or	%g3, %sum, %sum			! IEU0	Group (regdep)
+1:	be,pt	%icc, ccmerge			! CTI
+	 andcc	%len, 0xf0, %g1			! IEU1
+	lduwa	[%src + 0x00] %asi, %g4		! Load	Group
+	sub	%len, 4, %len			! IEU0
+	add	%src, 4, %src			! IEU1
+	add	%dst, 4, %dst			! IEU0	Group
+	addcc	%g4, %sum, %sum			! IEU1	Group + 1 bubble
+	stw	%g4, [%dst - 0x4]		! Store
+	bcc,pt	%xcc, ccmerge			! CTI
+	 andcc	%len, 0xf0, %g1			! IEU1	Group
+	b,pt	%xcc, ccmerge			! CTI	4 clocks (mispredict)
+	 add	%sum, 1, %sum			! IEU0
+
+	.align	32
+	.globl	csum_partial_copy_sparc64
+csum_partial_copy_sparc64:	/* %o0=src, %o1=dest, %o2=len, %o3=sum */
+	xorcc	%src, %dst, %o4			! IEU1	Group
+	srl	%sum, 0, %sum			! IEU0
+	andcc	%o4, 3, %g0			! IEU1	Group
+	srl	%len, 0, %len			! IEU0
+	bne,pn	%icc, ccslow			! CTI
+	 andcc	%src, 1, %g0			! IEU1	Group
+	bne,pn	%icc, ccslow			! CTI
+	 cmp	%len, 256			! IEU1	Group
+	bgeu,pt	%icc, csum_partial_copy_vis	! CTI
+	 andcc	%src, 7, %g0			! IEU1	Group
+	bne,pn	%icc, cc_fixit			! CTI
+	 andcc	%len, 0xf0, %g1			! IEU1	Group
+ccmerge:be,pn	%icc, ccte			! CTI
+	 andcc	%len, 0xf, %g7			! IEU1	Group
+	sll	%g1, 2, %o4			! IEU0
+13:	sethi	%hi(12f), %o5			! IEU0	Group
+	add	%src, %g1, %src			! IEU1
+	sub	%o5, %o4, %o5			! IEU0	Group
+	jmpl	%o5 + %lo(12f), %g0		! CTI	Group brk forced
+	 add	%dst, %g1, %dst			! IEU0	Group
+cctbl:	CSUMCOPY_LASTCHUNK(0xe8,%g2,%g3)
+	CSUMCOPY_LASTCHUNK(0xd8,%g2,%g3)
+	CSUMCOPY_LASTCHUNK(0xc8,%g2,%g3)
+	CSUMCOPY_LASTCHUNK(0xb8,%g2,%g3)
+	CSUMCOPY_LASTCHUNK(0xa8,%g2,%g3)
+	CSUMCOPY_LASTCHUNK(0x98,%g2,%g3)
+	CSUMCOPY_LASTCHUNK(0x88,%g2,%g3)
+	CSUMCOPY_LASTCHUNK(0x78,%g2,%g3)
+	CSUMCOPY_LASTCHUNK(0x68,%g2,%g3)
+	CSUMCOPY_LASTCHUNK(0x58,%g2,%g3)
+	CSUMCOPY_LASTCHUNK(0x48,%g2,%g3)
+	CSUMCOPY_LASTCHUNK(0x38,%g2,%g3)
+	CSUMCOPY_LASTCHUNK(0x28,%g2,%g3)
+	CSUMCOPY_LASTCHUNK(0x18,%g2,%g3)
+	CSUMCOPY_LASTCHUNK(0x08,%g2,%g3)
+12:
+	andcc	%len, 0xf, %g7			! IEU1	Group
+ccte:	bne,pn	%icc, cc_end_cruft		! CTI
+	 sethi	%uhi(PAGE_OFFSET), %g4		! IEU0
+ccfold:	sllx	%sum, 32, %o0			! IEU0	Group
+	addcc	%sum, %o0, %o0			! IEU1	Group (regdep)
+	srlx	%o0, 32, %o0			! IEU0	Group (regdep)
+	bcs,a,pn %xcc, 1f			! CTI
+	 add	%o0, 1, %o0			! IEU1	4 clocks (mispredict)
+1:	retl					! CTI	Group brk forced
+	 sllx	%g4, 32, %g4			! IEU0	Group
+
+ccslow:	mov	0, %g5
+	brlez,pn %len, 4f
+	 andcc	%src, 1, %o5
+	be,a,pt	%icc, 1f
+	 srl	%len, 1, %g7
+	sub	%len, 1, %len
+	lduba	[%src] %asi, %g5
+	add	%src, 1, %src
+	stb	%g5, [%dst]
+	srl	%len, 1, %g7
+	add	%dst, 1, %dst
+1:	brz,a,pn %g7, 3f
+	 andcc	%len, 1, %g0
+	andcc	%src, 2, %g0
+	be,a,pt	%icc, 1f
+	 srl	%g7, 1, %g7
+	lduha	[%src] %asi, %o4
+	sub	%len, 2, %len
+	srl	%o4, 8, %g2
+	sub	%g7, 1, %g7
+	stb	%g2, [%dst]
+	add	%o4, %g5, %g5
+	stb	%o4, [%dst + 1]
+	add	%src, 2, %src
+	srl	%g7, 1, %g7
+	add	%dst, 2, %dst
+1:	brz,a,pn %g7, 2f
+	 andcc	%len, 2, %g0
+	lduwa	[%src] %asi, %o4
+5:	srl	%o4, 24, %g2
+	srl	%o4, 16, %g3
+	stb	%g2, [%dst]
+	srl	%o4, 8, %g2
+	stb	%g3, [%dst + 1]
+	add	%src, 4, %src
+	stb	%g2, [%dst + 2]
+	addcc	%o4, %g5, %g5
+	stb	%o4, [%dst + 3]
+	addc	%g5, %g0, %g5
+	add	%dst, 4, %dst
+	subcc	%g7, 1, %g7
+	bne,a,pt %icc, 5b
+	 lduwa	[%src] %asi, %o4
+	sll	%g5, 16, %g2
+	srl	%g5, 16, %g5
+	srl	%g2, 16, %g2
+	andcc	%len, 2, %g0
+	add	%g2, %g5, %g5
+2:	be,a,pt	%icc, 3f
+	 andcc	%len, 1, %g0
+	lduha	[%src] %asi, %o4
+	andcc	%len, 1, %g0
+	srl	%o4, 8, %g2
+	add	%src, 2, %src
+	stb	%g2, [%dst]
+	add	%g5, %o4, %g5
+	stb	%o4, [%dst + 1]
+	add	%dst, 2, %dst
+3:	be,a,pt	%icc, 1f
+	 sll	%g5, 16, %o4
+	lduba	[%src] %asi, %g2
+	sll	%g2, 8, %o4
+	stb	%g2, [%dst]
+	add	%g5, %o4, %g5
+	sll	%g5, 16, %o4
+1:	addcc	%o4, %g5, %g5
+	srl	%g5, 16, %o4
+	addc	%g0, %o4, %g5
+	brz,pt	%o5, 4f
+	 srl	%g5, 8, %o4
+	and	%g5, 0xff, %g2
+	and	%o4, 0xff, %o4
+	sll	%g2, 8, %g2
+	or	%g2, %o4, %g5
+4:	addcc	%sum, %g5, %sum
+	addc	%g0, %sum, %o0
+	retl
+	 srl	%o0, 0, %o0
+cpc_end:
+
+	.globl	cpc_handler
+cpc_handler:
+	ldx	[%sp + 0x7ff + 128], %g1
+	sub	%g0, EFAULT, %g2
+	brnz,a,pt %g1, 1f
+	 st	%g2, [%g1]
+1:	sethi	%uhi(PAGE_OFFSET), %g4
+	retl
+	 sllx	%g4, 32, %g4
+
+	.section	__ex_table
+	.align	4
+	.word	cpc_start, 0, cpc_end, cpc_handler
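For readers following the assembly, a minimal C reference model may help. The sketch below is not the kernel's code; csum_partial_copy_ref and fold64 are hypothetical names. It copies len bytes while accumulating the RFC 1071 ones-complement sum in a 64-bit accumulator, then folds that accumulator back to 32 bits with an end-around carry, which is what the ccfold sequence (sllx/addcc/srlx plus the carry branch) computes. The assembly streams 8 bytes per load and handles misalignment in cc_fixit/ccslow; this model uses 16-bit loads for clarity. Like the assembly, it returns a 32-bit partial sum; the final fold to 16 bits and the complement happen in the caller.

/* Reference model (assumption: semantics of csum_partial_copy_sparc64,
 * not its implementation). */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t fold64(uint64_t acc)
{
	/* End-around-carry fold of the 64-bit accumulator to 32 bits,
	 * equivalent to ccfold: add high and low halves, then add the
	 * carry back in. */
	uint64_t t = (acc >> 32) + (acc & 0xffffffffu);
	return (uint32_t)((t & 0xffffffffu) + (t >> 32));
}

static uint32_t csum_partial_copy_ref(const void *src, void *dst,
				      size_t len, uint32_t sum)
{
	const uint8_t *s = src;
	uint64_t acc = sum;

	memcpy(dst, src, len);		/* the "copy" half of copy+checksum */

	/* RFC 1071 accumulation, 16 bits at a time; the assembly does
	 * the same arithmetic 8 bytes per load for speed. */
	while (len >= 2) {
		uint16_t w;
		memcpy(&w, s, 2);	/* alignment-safe load */
		acc += w;
		s += 2;
		len -= 2;
	}
	if (len)			/* trailing odd byte */
		acc += *s;

	return fold64(acc);		/* 32-bit partial sum, as in the asm */
}

int main(void)
{
	uint8_t in[] = { 0x45, 0x00, 0x00, 0x3c, 0x1c, 0x46, 0x40, 0x00 };
	uint8_t out[sizeof in];

	printf("partial sum = 0x%08x\n",
	       (unsigned)csum_partial_copy_ref(in, out, sizeof in, 0));
	return 0;
}

Accumulating in 64 bits and folding once at the end is the design point the header comment argues for: it avoids the 32-bit add-with-carry instructions that, per the comment, cannot be paired and only fold in the 32-bit carry bit on UltraSPARC.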