1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
|
/* checksum.S: Sparc V9 optimized checksum code.
*
* Copyright(C) 1995 Linus Torvalds
* Copyright(C) 1995 Miguel de Icaza
* Copyright(C) 1996 David S. Miller
* Copyright(C) 1997 Jakub Jelinek
*
* derived from:
* Linux/Alpha checksum c-code
* Linux/ix86 inline checksum assembly
* RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
* David Mosberger-Tang for optimized reference c-code
* BSD4.4 portable checksum routine
*/
#include <asm/errno.h>
#include <asm/head.h>
#include <asm/ptrace.h>
#include <asm/asi.h>
#include <asm/page.h>
/* The problems with the "add with carry" instructions on Ultra
* are twofold. Firstly, they cannot be paired (issued together)
* with any other instruction, and secondly they only add the
* 32-bit carry condition bit into the accumulated sum. The
* following is much better.
* For larger chunks we use VIS code, which is faster ;)
*/
/* Symbolic names for the four argument registers.  The code below
 * writes them as %src, %dst, %len and %sum, which the preprocessor
 * turns into %o0, %o1, %o2 and %o3.
 */
#define src o0
#define dst o1
#define len o2
#define sum o3
/* Everything that follows is assembled into the text section. */
.text
/* I think I have an erection... Once _AGAIN_ the SunSoft
* engineers are caught asleep at the keyboard, tsk tsk...
*/
/* CSUMCOPY_LASTCHUNK(off, t0, t1): copy and checksum one 16-byte chunk.
 *
 * Loads two 64-bit words through %asi from [%src - off - 8] and
 * [%src - off] into t0 and t1.  Each word is added to %sum with addcc,
 * and when that add carries out of bit 63 (%xcc) one is added back
 * (end-around carry).  Each word is written to the destination as two
 * 32-bit stores: the low half first, then the high half after
 * "srlx 32", the latter sitting in the delay slot of the bcc.
 *
 * Clobbers t0, t1 and the condition codes; uses numeric local labels
 * 51 and 52.  One expansion is exactly 16 instructions (the two nops
 * included).  The computed jump in front of cctbl derives its target
 * from that size, so instructions must not be added or removed here.
 */
#define CSUMCOPY_LASTCHUNK(off, t0, t1) \
ldxa [%src - off - 0x08] %asi, t0; \
ldxa [%src - off - 0x00] %asi, t1; \
nop; nop; \
addcc t0, %sum, %sum; \
stw t0, [%dst - off - 0x04]; \
srlx t0, 32, t0; \
bcc,pt %xcc, 51f; \
stw t0, [%dst - off - 0x08]; \
add %sum, 1, %sum; \
51: addcc t1, %sum, %sum; \
stw t1, [%dst - off + 0x04]; \
srlx t1, 32, t1; \
bcc,pt %xcc, 52f; \
stw t1, [%dst - off - 0x00]; \
add %sum, 1, %sum; \
52:
/* cpc_start: first address of the code range named in the __ex_table
 * entry at the bottom of this file.
 */
cpc_start:
/* cc_end_cruft: copy and checksum the final 1..15 bytes.
 *
 * In:  %g7 = len & 0xf (set by the andcc that feeds ccte),
 *      %src / %dst pointing at the first remaining byte,
 *      %sum = running sum, accumulated as a 64-bit value.
 * Out: branches to ccfold with the tail added into %sum.
 * Scratch: %g2, %g5, %o4, %o5.
 *
 * The 8, 4, 2 and 1 bits of %g7 each select one piece of that size.
 * The 8-byte piece is added to %sum immediately.  The 4-, 2- and
 * 1-byte pieces are shifted to distinct bit positions (<< 32, << 16,
 * << 8), OR-ed together and added with a single addcc at the end.
 * Loads go through %asi; stores are plain.
 */
cc_end_cruft:
andcc %g7, 8, %g0 ! IEU1 Group
be,pn %icc, 1f ! CTI
/* Delay slot, always executed: %g5 = 4-byte flag for the next step. */
and %g7, 4, %g5 ! IEU0
/* 8-byte piece: load, advance both pointers, add with end-around
 * carry, store as two 32-bit words (low half, then high half).
 */
ldxa [%src + 0x00] %asi, %g2 ! Load Group
add %dst, 8, %dst ! IEU0
add %src, 8, %src ! IEU1
addcc %g2, %sum, %sum ! IEU1 Group + 2 bubbles
stw %g2, [%dst - 0x04] ! Store
srlx %g2, 32, %g2 ! IEU0
bcc,pt %xcc, 1f ! CTI Group
/* Delay slot: high-half store happens whether or not the add carried. */
stw %g2, [%dst - 0x08] ! Store
add %sum, 1, %sum ! IEU0
1: brz,pt %g5, 1f ! CTI Group
/* Delay slot, always executed: %g2 = 0 unless the load below replaces it. */
clr %g2 ! IEU0
/* 4-byte piece: copy it and park it in bits 63..32 of %g2. */
lduwa [%src + 0x00] %asi, %g2 ! Load
add %dst, 4, %dst ! IEU0 Group
add %src, 4, %src ! IEU1
stw %g2, [%dst - 0x04] ! Store Group + 2 bubbles
sllx %g2, 32, %g2 ! IEU0
1: andcc %g7, 2, %g0 ! IEU1
be,pn %icc, 1f ! CTI Group
/* Delay slot, always executed: %o4 = 0 unless the load below replaces it. */
clr %o4 ! IEU1
/* 2-byte piece: copy it and park it in bits 31..16 of %o4. */
lduha [%src + 0x00] %asi, %o4 ! Load
add %src, 2, %src ! IEU0 Group
add %dst, 2, %dst ! IEU1
sth %o4, [%dst - 0x2] ! Store Group + 2 bubbles
sll %o4, 16, %o4 ! IEU0
1: andcc %g7, 1, %g0 ! IEU1
be,pn %icc, 1f ! CTI Group
/* Delay slot, always executed: %o5 = 0 unless the load below replaces it. */
clr %o5 ! IEU0
/* 1-byte piece: copy it and park it in bits 15..8 of %o5.  The
 * pointers are not advanced; nothing after this reads them.
 */
lduba [%src + 0x00] %asi, %o5 ! Load
stb %o5, [%dst + 0x00] ! Store Group + 2 bubbles
sll %o5, 8, %o5 ! IEU0
/* Merge the three small pieces and add them with end-around carry. */
1: or %g2, %o4, %o4 ! IEU1
or %o5, %o4, %o4 ! IEU0 Group
addcc %o4, %sum, %sum ! IEU1
bcc,pt %xcc, ccfold ! CTI
/* Delay slot, always executed: ccfold's return path shifts %g4 left
 * by 32, so it must hold %uhi(PAGE_OFFSET) on arrival.
 */
sethi %uhi(PAGE_OFFSET), %g4 ! IEU0 Group
b,pt %xcc, ccfold ! CTI
/* Delay slot: add the carry back in on the way to ccfold. */
add %sum, 1, %sum ! IEU1
/* cc_fixit: advance src to an 8-byte boundary before the 16-byte chunk
 * code at ccmerge.
 *
 * Reached from csum_partial_copy_sparc64 when src & 7 != 0.  At that
 * point src is even, (src ^ dst) & 3 == 0, len < 256 and %sum has been
 * zero-extended from 32 bits.
 *
 *   - len < 6: do not align; go to ccte with %g7 = len & 0xf and the
 *     condition codes of that andcc.
 *   - src & 2: copy one halfword and add it, shifted left 16, to %sum.
 *   - src & 4 (tested after the step above): copy one word and add it
 *     to %sum.
 *
 * Every exit to ccmerge leaves %g1 = len & 0xf0 and the matching
 * condition codes, set by the andcc in the branch delay slot.
 * Scratch: %g3, %g4.  Loads go through %asi; stores are plain.
 */
cc_fixit:
cmp %len, 6 ! IEU1 Group
bl,a,pn %icc, ccte ! CTI
/* Annulled delay slot: runs only when the branch to ccte is taken. */
andcc %len, 0xf, %g7 ! IEU1 Group
andcc %src, 2, %g0 ! IEU1 Group
be,pn %icc, 1f ! CTI
/* Delay slot, always executed: sets up the src & 4 test used at 1:. */
andcc %src, 0x4, %g0 ! IEU1 Group
/* Halfword step. */
lduha [%src + 0x00] %asi, %g4 ! Load
sub %len, 2, %len ! IEU0
add %src, 2, %src ! IEU0 Group
add %dst, 2, %dst ! IEU1
sll %g4, 16, %g3 ! IEU0 Group + 1 bubble
addcc %g3, %sum, %sum ! IEU1
/* Both addends are below 2^32 here (%sum was zero-extended on entry,
 * %g3 is a halfword shifted left 16), so the carry that matters is the
 * one out of bit 31: test %icc.  %xcc can never carry for these
 * operands, and the srl/sll/srl repacking below keeps only bits 31..0
 * of %sum, so testing %xcc silently dropped the carry.
 */
bcc,pt %icc, 0f ! CTI
/* Delay slot, always executed: %g3 = bits 31..16 of the new sum. */
srl %sum, 16, %g3 ! IEU0 Group
/* End-around carry, folded into the upper halfword. */
add %g3, 1, %g3 ! IEU0 4 clocks (mispredict)
/* Re-test src & 4 now that src has moved.  None of the instructions
 * between here and the branch at 1: modify the condition codes.
 */
0: andcc %src, 0x4, %g0 ! IEU1 Group
sth %g4, [%dst - 0x2] ! Store
/* Rebuild %sum as (%g3 << 16) | (low 16 bits of %sum). */
sll %sum, 16, %sum ! IEU0
sll %g3, 16, %g3 ! IEU0 Group
srl %sum, 16, %sum ! IEU0 Group
or %g3, %sum, %sum ! IEU0 Group (regdep)
1: be,pt %icc, ccmerge ! CTI
/* Delay slot, always executed: %g1 = len & 0xf0 for ccmerge. */
andcc %len, 0xf0, %g1 ! IEU1
/* Word step.  From here on %sum is accumulated as a 64-bit value, so
 * the %xcc carry test matches the chunk code.
 */
lduwa [%src + 0x00] %asi, %g4 ! Load Group
sub %len, 4, %len ! IEU0
add %src, 4, %src ! IEU1
add %dst, 4, %dst ! IEU0 Group
addcc %g4, %sum, %sum ! IEU1 Group + 1 bubble
stw %g4, [%dst - 0x4] ! Store
bcc,pt %xcc, ccmerge ! CTI
/* Delay slot, always executed: recompute %g1 with the reduced len. */
andcc %len, 0xf0, %g1 ! IEU1 Group
b,pt %xcc, ccmerge ! CTI 4 clocks (mispredict)
/* Delay slot: end-around carry.  It does not touch the condition
 * codes, so ccmerge still sees the result of the andcc above.
 */
add %sum, 1, %sum ! IEU0
/* csum_partial_copy_sparc64: copy len bytes from src to dst while
 * accumulating an end-around-carry sum of the data.
 *
 * In:  %o0 = src (read through %asi), %o1 = dst, %o2 = len,
 *      %o3 = initial sum.  len and sum are zero-extended from 32 bits.
 * Out: %o0 = sum folded to 32 bits (via ccfold here, or via ccslow).
 *
 * Dispatch, in order:
 *   - (src ^ dst) & 3 != 0, or src odd   -> ccslow
 *   - len >= 256 (unsigned)              -> csum_partial_copy_vis
 *                                           (not defined in this view)
 *   - src & 7 != 0                       -> cc_fixit, which rejoins at
 *                                           ccmerge or ccte
 *   - otherwise fall through to ccmerge
 *
 * Each branch below has the set-up for the next test in its delay
 * slot, so the instruction order matters.
 */
.align 32
.globl csum_partial_copy_sparc64
csum_partial_copy_sparc64: /* %o0=src, %o1=dest, %o2=len, %o3=sum */
xorcc %src, %dst, %o4 ! IEU1 Group
srl %sum, 0, %sum ! IEU0
andcc %o4, 3, %g0 ! IEU1 Group
srl %len, 0, %len ! IEU0
bne,pn %icc, ccslow ! CTI
andcc %src, 1, %g0 ! IEU1 Group
bne,pn %icc, ccslow ! CTI
cmp %len, 256 ! IEU1 Group
bgeu,pt %icc, csum_partial_copy_vis ! CTI
andcc %src, 7, %g0 ! IEU1 Group
bne,pn %icc, cc_fixit ! CTI
andcc %len, 0xf0, %g1 ! IEU1 Group
/* ccmerge: on entry %g1 = len & 0xf0 (bytes to move in 16-byte chunks)
 * and the condition codes reflect that andcc.  With no whole chunk,
 * go straight to ccte; the delay slot sets %g7 = len & 0xf and the
 * condition codes ccte tests.
 */
ccmerge:be,pn %icc, ccte ! CTI
andcc %len, 0xf, %g7 ! IEU1 Group
/* %o4 = %g1 * 4 = (chunks) * 64 = bytes of table code to execute,
 * given 16 four-byte instructions per CSUMCOPY_LASTCHUNK expansion.
 */
sll %g1, 2, %o4 ! IEU0
/* Jump to 12f - %o4 so that only the last %g1/16 expansions run.
 * %src and %dst are advanced past the chunked region first, which is
 * why the table uses negative offsets.
 */
13: sethi %hi(12f), %o5 ! IEU0 Group
add %src, %g1, %src ! IEU1
sub %o5, %o4, %o5 ! IEU0 Group
jmpl %o5 + %lo(12f), %g0 ! CTI Group brk forced
/* Delay slot of the jmpl. */
add %dst, %g1, %dst ! IEU0 Group
/* cctbl: fifteen unrolled 16-byte chunks, highest offset first, so
 * that the entries closest to 12: cover the bytes just below the
 * advanced pointers.  %g2 and %g3 are the temporaries.
 */
cctbl: CSUMCOPY_LASTCHUNK(0xe8,%g2,%g3)
CSUMCOPY_LASTCHUNK(0xd8,%g2,%g3)
CSUMCOPY_LASTCHUNK(0xc8,%g2,%g3)
CSUMCOPY_LASTCHUNK(0xb8,%g2,%g3)
CSUMCOPY_LASTCHUNK(0xa8,%g2,%g3)
CSUMCOPY_LASTCHUNK(0x98,%g2,%g3)
CSUMCOPY_LASTCHUNK(0x88,%g2,%g3)
CSUMCOPY_LASTCHUNK(0x78,%g2,%g3)
CSUMCOPY_LASTCHUNK(0x68,%g2,%g3)
CSUMCOPY_LASTCHUNK(0x58,%g2,%g3)
CSUMCOPY_LASTCHUNK(0x48,%g2,%g3)
CSUMCOPY_LASTCHUNK(0x38,%g2,%g3)
CSUMCOPY_LASTCHUNK(0x28,%g2,%g3)
CSUMCOPY_LASTCHUNK(0x18,%g2,%g3)
CSUMCOPY_LASTCHUNK(0x08,%g2,%g3)
12:
/* The chunk code clobbered the condition codes; redo the tail test. */
andcc %len, 0xf, %g7 ! IEU1 Group
/* ccte: every path arrives with %g7 = len & 0xf and the condition
 * codes of that andcc.  A non-zero tail is handled by cc_end_cruft.
 */
ccte: bne,pn %icc, cc_end_cruft ! CTI
/* Delay slot, always executed: %g4 = %uhi(PAGE_OFFSET) for the
 * return path below.
 */
sethi %uhi(PAGE_OFFSET), %g4 ! IEU0
/* ccfold: fold the 64-bit %sum to 32 bits.  After the addcc the upper
 * half of %o0 holds low32(sum) + high32(sum); the carry out of that
 * add is in %xcc and is added back after the shift.
 */
ccfold: sllx %sum, 32, %o0 ! IEU0 Group
addcc %sum, %o0, %o0 ! IEU1 Group (regdep)
srlx %o0, 32, %o0 ! IEU0 Group (regdep)
bcs,a,pn %xcc, 1f ! CTI
/* Annulled delay slot: runs only when the carry was set. */
add %o0, 1, %o0 ! IEU1 4 clocks (mispredict)
1: retl ! CTI Group brk forced
/* Delay slot: %g4 = %uhi(PAGE_OFFSET) << 32.  cc_fixit uses %g4 as
 * scratch.  NOTE(review): presumably the rest of the kernel expects
 * %g4 to hold PAGE_OFFSET -- confirm.
 */
sllx %g4, 32, %g4 ! IEU0 Group
/* ccslow: fallback path, reached when src and dst differ in their low
 * two address bits or when src is odd.
 *
 * Source data is read through %asi, first a byte and a halfword as
 * needed to reach a 4-byte boundary, then whole words, then a trailing
 * halfword and byte.  Every store is a single byte (stb), so the
 * alignment of dst does not matter.  The partial sum is built in %g5
 * with 32-bit carries (addcc / addc), folded to 16 bits, byte-swapped
 * when src started on an odd address, and finally added to %sum with
 * end-around carry.
 *
 * Registers: %g5 = partial sum, %g7 = remaining halfword count and
 * later remaining word count, %o5 = src & 1 at entry,
 * %o4 / %g2 / %g3 = scratch.
 * Out: %o0 = result, zero-extended from 32 bits.  %g4 is not touched.
 */
ccslow: mov 0, %g5
/* Nothing to copy: go add the zero partial sum.  The delay slot is
 * always executed and records src & 1 in %o5 and the condition codes.
 */
brlez,pn %len, 4f
andcc %src, 1, %o5
/* Even src: annulled delay slot sets %g7 = len / 2 and skips the
 * leading-byte step.
 */
be,a,pt %icc, 1f
srl %len, 1, %g7
/* Odd src: copy one byte; it becomes the initial partial sum. */
sub %len, 1, %len
lduba [%src] %asi, %g5
add %src, 1, %src
stb %g5, [%dst]
srl %len, 1, %g7
add %dst, 1, %dst
/* No halfwords left: only the trailing-byte test at 3: remains.  The
 * annulled delay slot sets up that test.
 */
1: brz,a,pn %g7, 3f
andcc %len, 1, %g0
andcc %src, 2, %g0
/* src already 4-byte aligned: annulled delay slot turns the halfword
 * count into a word count.
 */
be,a,pt %icc, 1f
srl %g7, 1, %g7
/* Leading halfword: copy it as two bytes, add it, then convert the
 * remaining halfword count into a word count.
 */
lduha [%src] %asi, %o4
sub %len, 2, %len
srl %o4, 8, %g2
sub %g7, 1, %g7
stb %g2, [%dst]
add %o4, %g5, %g5
stb %o4, [%dst + 1]
add %src, 2, %src
srl %g7, 1, %g7
add %dst, 2, %dst
/* No whole words: go to the trailing-halfword test at 2:, whose
 * condition is set in the annulled delay slot.
 */
1: brz,a,pn %g7, 2f
andcc %len, 2, %g0
lduwa [%src] %asi, %o4
/* Word loop: store the four bytes most-significant first, add the word
 * with its 32-bit carry, and load the next word in the annulled delay
 * slot of the loop branch (so no load happens after the last word).
 */
5: srl %o4, 24, %g2
srl %o4, 16, %g3
stb %g2, [%dst]
srl %o4, 8, %g2
stb %g3, [%dst + 1]
add %src, 4, %src
stb %g2, [%dst + 2]
addcc %o4, %g5, %g5
stb %o4, [%dst + 3]
addc %g5, %g0, %g5
add %dst, 4, %dst
subcc %g7, 1, %g7
bne,a,pt %icc, 5b
lduwa [%src] %asi, %o4
/* %g5 = (low 16 bits of %g5) + (bits 31..16 of %g5).  The andcc in the
 * middle sets up the trailing-halfword test at 2:.
 */
sll %g5, 16, %g2
srl %g5, 16, %g5
srl %g2, 16, %g2
andcc %len, 2, %g0
add %g2, %g5, %g5
/* Trailing halfword, present when len & 2 is set. */
2: be,a,pt %icc, 3f
andcc %len, 1, %g0
lduha [%src] %asi, %o4
andcc %len, 1, %g0
srl %o4, 8, %g2
add %src, 2, %src
stb %g2, [%dst]
add %g5, %o4, %g5
stb %o4, [%dst + 1]
add %dst, 2, %dst
/* Trailing byte, present when len & 1 is set.  Either way %o4 ends up
 * as %g5 << 16 for the fold at 1:.
 */
3: be,a,pt %icc, 1f
sll %g5, 16, %o4
lduba [%src] %asi, %g2
/* The last byte counts as the high byte of a halfword. */
sll %g2, 8, %o4
stb %g2, [%dst]
add %g5, %o4, %g5
sll %g5, 16, %o4
/* Fold to 16 bits: add the low half, shifted up, to the whole value,
 * then take bits 31..16 plus the carry of that add.
 */
1: addcc %o4, %g5, %g5
srl %g5, 16, %o4
addc %g0, %o4, %g5
/* Even starting address: result is final.  The delay slot is always
 * executed; its result is only used by the byte swap below.
 */
brz,pt %o5, 4f
srl %g5, 8, %o4
/* Odd starting address: swap the two bytes of the 16-bit result. */
and %g5, 0xff, %g2
and %o4, 0xff, %o4
sll %g2, 8, %g2
or %g2, %o4, %g5
/* Add the partial sum to the caller's sum with 32-bit end-around
 * carry and return it in %o0.
 */
4: addcc %sum, %g5, %sum
addc %g0, %sum, %o0
retl
/* Delay slot: zero-extend the 32-bit result. */
srl %o0, 0, %o0
/* cpc_end: end of the code range named in the __ex_table entry at the
 * bottom of this file.
 */
cpc_end:
/* cpc_handler: handler address named by the __ex_table entry for the
 * cpc_start..cpc_end range.
 *
 * Loads a 64-bit value from [%sp + 0x7ff + 128].  When it is non-zero
 * it is used as a pointer and the 32-bit value -EFAULT is stored
 * through it.  %g4 is then rebuilt as %uhi(PAGE_OFFSET) << 32, the
 * same value ccfold leaves there, and the routine returns with retl.
 * %o0 is not written here.
 *
 * NOTE(review): 0x7ff looks like the V9 stack bias and 128 the size of
 * the 16-register window save area, which would make this the first
 * stack slot above the save area, presumably an error pointer left by
 * the caller -- confirm against the caller.
 */
.globl cpc_handler
cpc_handler:
ldx [%sp + 0x7ff + 128], %g1
sub %g0, EFAULT, %g2
brnz,a,pt %g1, 1f
/* Annulled delay slot: the store runs only when %g1 is non-zero. */
st %g2, [%g1]
1: sethi %uhi(PAGE_OFFSET), %g4
retl
/* Delay slot of the return. */
sllx %g4, 32, %g4
/* Exception table entry naming the cpc_start..cpc_end code range and
 * cpc_handler.  NOTE(review): the four-word layout
 * (start, 0, end, handler) is interpreted by fault-handling code that
 * is not part of this file -- confirm the format there before changing
 * it.
 */
.section __ex_table
.align 4
.word cpc_start, 0, cpc_end, cpc_handler
|