1 /* $Id: VIScsumcopy.S,v 1.8 2000/02/20 23:21:39 davem Exp $
2 * VIScsumcopy.S: High bandwidth IP checksumming with simultaneous
3 * copying utilizing the UltraSparc Visual Instruction Set.
5 * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
7 * Based on older sparc32/sparc64 checksum.S, which is:
9 * Copyright(C) 1995 Linus Torvalds
10 * Copyright(C) 1995 Miguel de Icaza
11 * Copyright(C) 1996,1997 David S. Miller
13 * Linux/Alpha checksum c-code
14 * Linux/ix86 inline checksum assembly
15 * RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
16 * David Mosberger-Tang for optimized reference c-code
17 * BSD4.4 portable checksum routine
21 #define STACKOFF 0x7ff+128
30 #include <asm/visasm.h>
31 #include <asm/thread_info.h>
33 #define ASI_BLK_XOR1 (ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P)
34 #define ASI_BLK_OR (ASI_BLK_P & ~ASI_P)
37 #define ASI_BLK_P 0xf0
41 #define ASI_BLK_XOR (ASI_BLK_P ^ ASI_P)
57 /* Good night, SunSoft engineers. Sleep sweetly.
58 * This has a couple of tricks in it, and those
59 * tricks are UltraLinux trade secrets :))
60 * Once AGAIN, the SunSoft engineers are caught
61 * asleep at the keyboard :)).
62 * The main loop takes about 20 superscalar cycles
63 * per 64 bytes checksummed/copied.
67 ldda [%src] %asi, %O0 /* Load Group */
70 stda %f48, [%dst] ASI_BLK_P /* Store */
73 std %fx, [%dst + off] /* Store */
/*
 * DO_THE_TRICK - one 64-byte step of the unrolled checksum/copy loop.
 *
 *   f0..f14    values added into the data registers (fpadd32 %Fn, %fn, %Fn)
 *   F0..F14    eight doublewords of source data; each is overwritten with
 *              its sum after it has been fed to faligndata
 *   A0..A14    faligndata outputs (data realigned for the destination);
 *              A14 is also an input, combined with F0 on entry
 *   B14        receives a copy of F14 (fmovd) before F14 is summed
 *   LOAD       statement emitted first (callers pass LDBLK(...) or SYNC)
 *   STORE1..8  optional store statements interleaved with the arithmetic
 *   BRANCH     statement(s) emitted just before the last fcmpgt32, so that
 *              compare sits in the delay slot of the caller's branch
 *   DUMMY1..3  unused; the callers leave these positions empty
 *
 * Carry handling: each "fcmpgt32 %fn, %Fn, %xm" compares an addend with
 * the sum it produced. The integer result is turned into a count with
 * inc / srl 1 (0->0, 1->1, 2->1, 3->2) and added to %sum; the results
 * produced late in one step are consumed early in the next one.
 *
 * %src and %dst advance by 64 and %len is reduced by 64 with subcc, so
 * BRANCH can test the resulting condition codes.
 *
 * Fix: %x5, %x6 and %x7 were shifted without the preceding inc that
 * %x8 and %x1..%x4 get, and the BRANCH argument was never emitted.
 */
79 #define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...) \
80 LOAD /* Load (Group) */; \
81 faligndata %A14, %F0, %A14 /* FPA Group */; \
82 inc %x5 /* IEU0 */; \
83 STORE1 /* Store (optional) */; \
84 faligndata %F0, %F2, %A0 /* FPA Group */; \
85 srl %x5, 1, %x5 /* IEU0 */; \
86 add %sum, %x4, %sum /* IEU1 */; \
87 fpadd32 %F0, %f0, %F0 /* FPA Group */; \
88 inc %x6 /* IEU0 */; \
89 STORE2 /* Store (optional) */; \
90 faligndata %F2, %F4, %A2 /* FPA Group */; \
91 srl %x6, 1, %x6 /* IEU0 */; \
92 add %sum, %x5, %sum /* IEU1 */; \
93 fpadd32 %F2, %f2, %F2 /* FPA Group */; \
94 add %src, 64, %src /* IEU0 */; \
95 fcmpgt32 %f0, %F0, %x1 /* FPM */; \
96 add %dst, 64, %dst /* IEU1 Group */; \
97 inc %x7 /* IEU0 */; \
98 STORE3 /* Store (optional) */; \
99 faligndata %F4, %F6, %A4 /* FPA */; \
100 fpadd32 %F4, %f4, %F4 /* FPA Group */; \
101 add %sum, %x6, %sum /* IEU1 */; \
102 fcmpgt32 %f2, %F2, %x2 /* FPM */; \
103 srl %x7, 1, %x7 /* IEU0 Group */; \
104 inc %x8 /* IEU1 */; \
105 STORE4 /* Store (optional) */; \
106 faligndata %F6, %F8, %A6 /* FPA */; \
107 fpadd32 %F6, %f6, %F6 /* FPA Group */; \
108 srl %x8, 1, %x8 /* IEU0 */; \
109 fcmpgt32 %f4, %F4, %x3 /* FPM */; \
110 add %sum, %x7, %sum /* IEU0 Group */; \
111 inc %x1 /* IEU1 */; \
112 STORE5 /* Store (optional) */; \
113 faligndata %F8, %F10, %A8 /* FPA */; \
114 fpadd32 %F8, %f8, %F8 /* FPA Group */; \
115 srl %x1, 1, %x1 /* IEU0 */; \
116 fcmpgt32 %f6, %F6, %x4 /* FPM */; \
117 add %sum, %x8, %sum /* IEU0 Group */; \
118 inc %x2 /* IEU1 */; \
119 STORE6 /* Store (optional) */; \
120 faligndata %F10, %F12, %A10 /* FPA */; \
121 fpadd32 %F10, %f10, %F10 /* FPA Group */; \
122 srl %x2, 1, %x2 /* IEU0 */; \
123 fcmpgt32 %f8, %F8, %x5 /* FPM */; \
124 add %sum, %x1, %sum /* IEU0 Group */; \
125 inc %x3 /* IEU1 */; \
126 STORE7 /* Store (optional) */; \
127 faligndata %F12, %F14, %A12 /* FPA */; \
128 fpadd32 %F12, %f12, %F12 /* FPA Group */; \
129 srl %x3, 1, %x3 /* IEU0 */; \
130 fcmpgt32 %f10, %F10, %x6 /* FPM */; \
131 add %sum, %x2, %sum /* IEU0 Group */; \
132 inc %x4 /* IEU1 */; \
133 STORE8 /* Store (optional) */; \
134 fmovd %F14, %B14 /* FPA */; \
135 fpadd32 %F14, %f14, %F14 /* FPA Group */; \
136 srl %x4, 1, %x4 /* IEU0 */; \
137 fcmpgt32 %f12, %F12, %x7 /* FPM */; \
138 add %sum, %x3, %sum /* IEU0 Group */; \
139 subcc %len, 64, %len /* IEU1 */; \
140 BRANCH /* CTI */; \
141 fcmpgt32 %f14, %F14, %x8 /* FPM Group */;
/*
 * END_THE_TRICK - first stage of the final reduction after the main loop.
 *
 * Adds the eight partial sums f0..f14 pairwise with fpadd32:
 *   S0 = f2 + f0,  S1 = f6 + f4,  S2 = f10 + f8,  S3 = f14 + f12,
 *   T0 = S0 + S1,  T1 = S2 + S3,  U0 = T0 + T1.
 * Alongside, the fcmpgt32 results still pending in %x1..%x8 are folded
 * into %sum (inc / srl 1 / add), and new compares are started for each
 * addition above. fz is cleared with fzero and then used as the left
 * operand of further fcmpgt32 compares against f2, f6, f10 and f14.
 * FA is copied to FB, and control leaves through "ba,pt %xcc, ett".
 *
 * NOTE(review): the last line shown still ends in a continuation
 * backslash, so the instruction in the branch delay slot is not visible
 * here - confirm against the full file.
 */
143 #define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \
144 inc %x5 /* IEU0 Group */; \
145 fpadd32 %f2, %f0, %S0 /* FPA */; \
146 add %sum, %x4, %sum /* IEU1 */; \
147 srl %x5, 1, %x5 /* IEU0 Group */; \
148 fpadd32 %f6, %f4, %S1 /* FPA */; \
149 inc %x6 /* IEU1 */; \
150 fpadd32 %f10, %f8, %S2 /* FPA Group */; \
151 add %sum, %x5, %sum /* IEU0 */; \
152 fcmpgt32 %f0, %S0, %x1 /* FPM */; \
153 fpadd32 %f14, %f12, %S3 /* FPA Group */; \
154 srl %x6, 1, %x6 /* IEU0 */; \
155 fcmpgt32 %f4, %S1, %x2 /* FPM */; \
156 add %sum, %x6, %sum /* IEU0 Group */; \
157 fzero %fz /* FPA */; \
158 fcmpgt32 %f8, %S2, %x3 /* FPM */; \
159 inc %x7 /* IEU0 Group */; \
160 inc %x8 /* IEU1 */; \
161 srl %x7, 1, %x7 /* IEU0 Group */; \
162 inc %x1 /* IEU1 */; \
163 fpadd32 %S0, %S1, %T0 /* FPA */; \
164 fpadd32 %S2, %S3, %T1 /* FPA Group */; \
165 add %sum, %x7, %sum /* IEU0 */; \
166 fcmpgt32 %f12, %S3, %x4 /* FPM */; \
167 srl %x8, 1, %x8 /* IEU0 Group */; \
168 inc %x2 /* IEU1 */; \
169 srl %x1, 1, %x1 /* IEU0 Group */; \
170 add %sum, %x8, %sum /* IEU1 */; \
171 add %sum, %x1, %sum /* IEU0 Group */; \
172 fcmpgt32 %S0, %T0, %x5 /* FPM */; \
173 srl %x2, 1, %x2 /* IEU0 Group */; \
174 fcmpgt32 %S2, %T1, %x6 /* FPM */; \
175 inc %x3 /* IEU0 Group */; \
176 add %sum, %x2, %sum /* IEU1 */; \
177 srl %x3, 1, %x3 /* IEU0 Group */; \
178 inc %x4 /* IEU1 */; \
179 fpadd32 %T0, %T1, %U0 /* FPA Group */; \
180 add %sum, %x3, %sum /* IEU0 */; \
181 fcmpgt32 %fz, %f2, %x7 /* FPM */; \
182 srl %x4, 1, %x4 /* IEU0 Group */; \
183 fcmpgt32 %fz, %f6, %x8 /* FPM */; \
184 inc %x5 /* IEU0 Group */; \
185 add %sum, %x4, %sum /* IEU1 */; \
186 srl %x5, 1, %x5 /* IEU0 Group */; \
187 fcmpgt32 %fz, %f10, %x1 /* FPM */; \
188 inc %x6 /* IEU0 Group */; \
189 add %sum, %x5, %sum /* IEU1 */; \
190 fmovd %FA, %FB /* FPA Group */; \
191 fcmpgt32 %fz, %f14, %x2 /* FPM */; \
192 srl %x6, 1, %x6 /* IEU0 Group */; \
193 ba,pt %xcc, ett /* CTI */; \
/*
 * END_THE_TRICK1 - END_THE_TRICK with the scratch registers fixed:
 * S0..S3 = %f48..%f54, T0/T1 = %f56/%f58, U0 = %f60, fz = %f62.
 * The caller supplies only the eight partial sums and the FA -> FB copy.
 */
196 #define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB) \
197 END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62)
/*
 * END_THE_TRICK2 - second stage of the final reduction.
 *
 * V0 = U0 + U1 (fpadd32) is spilled with std to [%sp + STACKOFF] and read
 * back as one 64-bit integer into %x8 with ldx. The remaining fcmpgt32
 * results are folded into %sum after inc / srl 1: results of compares
 * between an addend and its sum (S2/T1, T0/U0, U0/V0) are added, results
 * of compares with fz on the left-hand side are subtracted.
 * Finally %x8 is added with addcc; on carry out of %xcc the branch is
 * taken and its annulled delay slot adds 1 to %sum.
 *
 * NOTE(review): the local label 33 targeted by the final branch is not
 * visible in the macro as shown, and the last line ends in a
 * continuation backslash - confirm against the full file.
 */
199 #define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz) \
200 fpadd32 %U0, %U1, %V0 /* FPA Group */; \
201 srl %x7, 1, %x7 /* IEU0 */; \
202 add %sum, %x6, %sum /* IEU1 */; \
203 std %V0, [%sp + STACKOFF] /* Store Group */; \
204 inc %x8 /* IEU0 */; \
205 sub %sum, %x7, %sum /* IEU1 */; \
206 srl %x8, 1, %x8 /* IEU0 Group */; \
207 fcmpgt32 %fz, %S1, %x3 /* FPM */; \
208 inc %x1 /* IEU0 Group */; \
209 fcmpgt32 %fz, %S3, %x4 /* FPM */; \
210 srl %x1, 1, %x1 /* IEU0 Group */; \
211 sub %sum, %x8, %sum /* IEU1 */; \
212 ldx [%sp + STACKOFF], %x8 /* Load Group */; \
213 inc %x2 /* IEU0 */; \
214 sub %sum, %x1, %sum /* IEU1 */; \
215 srl %x2, 1, %x2 /* IEU0 Group */; \
216 fcmpgt32 %fz, %T1, %x5 /* FPM */; \
217 inc %x3 /* IEU0 Group */; \
218 fcmpgt32 %T0, %U0, %x6 /* FPM */; \
219 srl %x3, 1, %x3 /* IEU0 Group */; \
220 sub %sum, %x2, %sum /* IEU1 */; \
221 inc %x4 /* IEU0 Group */; \
222 sub %sum, %x3, %sum /* IEU1 */; \
223 srl %x4, 1, %x4 /* IEU0 Group */; \
224 fcmpgt32 %fz, %U1, %x7 /* FPM */; \
225 inc %x5 /* IEU0 Group */; \
226 fcmpgt32 %U0, %V0, %x1 /* FPM */; \
227 srl %x5, 1, %x5 /* IEU0 Group */; \
228 sub %sum, %x4, %sum /* IEU1 */; \
229 sub %sum, %x5, %sum /* IEU0 Group */; \
230 fcmpgt32 %fz, %V0, %x2 /* FPM */; \
231 inc %x6 /* IEU0 Group */; \
232 inc %x7 /* IEU1 */; \
233 srl %x6, 1, %x6 /* IEU0 Group */; \
234 inc %x1 /* IEU1 */; \
235 srl %x7, 1, %x7 /* IEU0 Group */; \
236 add %sum, %x6, %sum /* IEU1 */; \
237 srl %x1, 1, %x1 /* IEU0 Group */; \
238 sub %sum, %x7, %sum /* IEU1 */; \
239 inc %x2 /* IEU0 Group */; \
240 add %sum, %x1, %sum /* IEU1 */; \
241 srl %x2, 1, %x2 /* IEU0 Group */; \
242 sub %sum, %x2, %sum /* IEU0 Group */; \
243 addcc %sum, %x8, %sum /* IEU1 Group */; \
244 bcs,a,pn %xcc, 33f /* CTI */; \
245 add %sum, 1, %sum /* IEU0 (Group) */; \
/*
 * csum_partial_copy_vis - entry point and alignment prologue.
 *
 * Copies data from %src (loaded through %asi) to %dst while accumulating
 * it into %sum; %len is the remaining byte count. %src, %dst, %len, %sum
 * and %x1..%x8 are register aliases defined outside the lines shown.
 *
 * This part copies a halfword, a word and then 8-, 16- and 32-byte pieces
 * as needed to bring %dst up to a 64-byte boundary, and finally jumps to
 * vis0s + ((%src & 0x38) << 8).
 *
 * NOTE(review): some lines appear to be missing from this listing (for
 * example the label the first "4f" branches are meant to reach, and the
 * conditional-compilation directives around the two alternative dispatch
 * sequences near the end). The comments below describe only what is
 * visible - confirm against the full file.
 */
249 .globl csum_partial_copy_vis
251 /* %asi should be either ASI_P or ASI_AIUS, for csum_partial_copy and
252 * csum_partial_copy_from_user respectively.
253 * This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256
255 csum_partial_copy_vis:
256 andcc %dst, 7, %g0 /* IEU1 Group */
257 be,pt %icc, 4f /* CTI */
258 and %dst, 0x38, %o4 /* IEU0 */
/* %dst is not 8-byte aligned. If bit 1 of %dst is set, copy one halfword
 * and add it (shifted left by 16) to %sum; on carry add %g5 = 1 << 16. */
259 mov 1, %g5 /* IEU0 Group */
260 andcc %dst, 2, %g0 /* IEU1 */
261 be,pt %icc, 1f /* CTI */
262 and %dst, 4, %g7 /* IEU0 Group */
263 lduha [%src] %asi, %g2 /* Load */
264 sub %len, 2, %len /* IEU0 Group */
265 add %dst, 2, %dst /* IEU1 */
266 andcc %dst, 4, %g7 /* IEU1 Group */
267 sll %g5, 16, %g5 /* IEU0 */
268 sth %g2, [%dst - 2] /* Store Group */
269 sll %g2, 16, %g2 /* IEU0 */
270 add %src, 2, %src /* IEU1 */
271 addcc %g2, %sum, %sum /* IEU1 Group */
272 bcs,a,pn %icc, 1f /* CTI */
273 add %sum, %g5, %sum /* IEU0 */
/* If bit 2 of %dst (%g7) is clear, skip ahead (the annulled slot
 * recomputes %o4); otherwise copy one word and add it with carry. */
274 1: lduwa [%src] %asi, %g2 /* Load */
275 brz,a,pn %g7, 4f /* CTI+IEU1 Group */
276 and %dst, 0x38, %o4 /* IEU0 */
277 add %dst, 4, %dst /* IEU0 Group */
278 sub %len, 4, %len /* IEU1 */
279 addcc %g2, %sum, %sum /* IEU1 Group */
280 bcs,a,pn %icc, 1f /* CTI */
281 add %sum, 1, %sum /* IEU0 */
282 1: and %dst, 0x38, %o4 /* IEU0 Group */
283 stw %g2, [%dst - 4] /* Store */
284 add %src, 4, %src /* IEU1 */
/* Round %src down with alignaddr; %g7 = original %src - rounded %src.
 * If %src moved, the word at the rounded address is subtracted from %sum
 * (with borrow correction) - presumably because the doubleword loads
 * below start there and would otherwise count it; TODO confirm.
 * %g1 = 0x40 is set in the branch delay slot. */
289 mov %src, %g7 /* IEU1 Group */
291 alignaddr %src, %g0, %src /* Single Group */
292 subcc %g7, %src, %g7 /* IEU1 Group */
293 be,pt %xcc, 1f /* CTI */
294 mov 0x40, %g1 /* IEU0 */
295 lduwa [%src] %asi, %g2 /* Load Group */
296 subcc %sum, %g2, %sum /* IEU1 Group+load stall*/
297 bcs,a,pn %icc, 1f /* CTI */
298 sub %sum, 1, %sum /* IEU0 */
/* Keep only the low 32 bits of %sum. */
299 1: srl %sum, 0, %sum /* IEU0 Group */
/* %o4 == 0: %dst is already on a 64-byte boundary, go to 3f.
 * Delay slot: %g1 = 0x40 - (%dst & 0x38), the bytes left to the boundary. */
301 brz,pn %o4, 3f /* CTI+IEU1 Group */
302 sub %g1, %o4, %g1 /* IEU0 */
/* 8-byte piece, taken when bit 3 of %dst is set. %f48 is the running
 * sum; the fcmpgt32 result goes to %o4. */
303 ldda [%src] %asi, %f0 /* Load */
304 clr %o4 /* IEU0 Group */
305 andcc %dst, 8, %g0 /* IEU1 */
306 be,pn %icc, 1f /* CTI */
307 ldda [%src + 8] %asi, %f2 /* Load Group */
308 add %src, 8, %src /* IEU0 */
309 sub %len, 8, %len /* IEU1 */
310 fpadd32 %f0, %f48, %f50 /* FPA */
311 addcc %dst, 8, %dst /* IEU1 Group */
312 faligndata %f0, %f2, %f16 /* FPA */
313 fcmpgt32 %f48, %f50, %o4 /* FPM Group */
314 fmovd %f2, %f0 /* FPA Group */
315 ldda [%src + 8] %asi, %f2 /* Load */
316 std %f16, [%dst - 8] /* Store */
317 fmovd %f50, %f48 /* FPA */
/* 16-byte piece, taken when bit 4 of %g1 is set. The delay slot reduces
 * %g1 to its 0x20 bit for the next test. */
318 1: andcc %g1, 0x10, %g0 /* IEU1 Group */
319 be,pn %icc, 1f /* CTI */
320 and %g1, 0x20, %g1 /* IEU0 */
321 fpadd32 %f0, %f48, %f50 /* FPA */
322 ldda [%src + 16] %asi, %f4 /* Load Group */
323 add %src, 16, %src /* IEU0 */
324 add %dst, 16, %dst /* IEU1 */
325 faligndata %f0, %f2, %f16 /* FPA */
326 fcmpgt32 %f48, %f50, %g5 /* FPM Group */
327 sub %len, 16, %len /* IEU0 */
329 std %f16, [%dst - 16] /* Store Group */
330 fpadd32 %f2, %f50, %f48 /* FPA */
331 srl %o4, 1, %o5 /* IEU0 */
332 faligndata %f2, %f4, %f18 /* FPA Group */
333 std %f18, [%dst - 8] /* Store */
334 fcmpgt32 %f50, %f48, %o4 /* FPM Group */
335 add %o5, %sum, %sum /* IEU0 */
336 ldda [%src + 8] %asi, %f2 /* Load */
337 fmovd %f4, %f0 /* FPA */
/* 32-byte piece, taken when %g1 (now just the 0x20 bit) is non-zero;
 * otherwise branch to 4f, reading %asi into %g2 in the annulled slot. */
338 1: brz,a,pn %g1, 4f /* CTI+IEU1 Group */
339 rd %asi, %g2 /* LSU Group + 4 bubbles*/
341 fpadd32 %f0, %f48, %f50 /* FPA */
342 ldda [%src + 16] %asi, %f4 /* Load Group */
343 srl %g5, 1, %g5 /* IEU0 */
344 add %dst, 32, %dst /* IEU1 */
345 faligndata %f0, %f2, %f16 /* FPA */
346 fcmpgt32 %f48, %f50, %o5 /* FPM Group */
348 ldda [%src + 24] %asi, %f6 /* Load */
349 srl %o4, 1, %o4 /* IEU0 Group */
350 add %g5, %sum, %sum /* IEU1 */
351 ldda [%src + 32] %asi, %f8 /* Load */
352 fpadd32 %f2, %f50, %f48 /* FPA */
353 faligndata %f2, %f4, %f18 /* FPA Group */
354 sub %len, 32, %len /* IEU0 */
355 std %f16, [%dst - 32] /* Store */
356 fcmpgt32 %f50, %f48, %g3 /* FPM Group */
358 add %o4, %sum, %sum /* IEU1 */
359 fpadd32 %f4, %f48, %f50 /* FPA */
360 faligndata %f4, %f6, %f20 /* FPA Group */
361 srl %o5, 1, %o5 /* IEU0 */
362 fcmpgt32 %f48, %f50, %g5 /* FPM Group */
363 add %o5, %sum, %sum /* IEU0 */
364 std %f18, [%dst - 24] /* Store */
365 fpadd32 %f6, %f50, %f48 /* FPA */
366 inc %g3 /* IEU0 Group */
367 std %f20, [%dst - 16] /* Store */
368 add %src, 32, %src /* IEU1 */
369 faligndata %f6, %f8, %f22 /* FPA */
370 fcmpgt32 %f50, %f48, %o4 /* FPM Group */
371 srl %g3, 1, %g3 /* IEU0 */
372 std %f22, [%dst - 8] /* Store */
373 add %g3, %sum, %sum /* IEU0 Group */
/* Dispatch: %g2 = current %asi, %g3 = (%src & 0x38) << 8, the pending
 * compare results in %g5 and %o4 are folded into %sum, %len is reduced
 * by 0xc0, and control jumps to vis0s + %g3.
 * NOTE(review): two alternative sequences appear back to back here
 * (sethi / %lo(vis0s) with "or %g2, ASI_BLK_OR", and "rd %pc" with
 * vis0s - 4b), each with its own "4:" label and jmpl; presumably only
 * one is assembled in a given build - confirm. */
374 3: rd %asi, %g2 /* LSU Group + 4 bubbles*/
376 4: sethi %hi(vis0s), %g7 /* IEU0 Group */
377 or %g2, ASI_BLK_OR, %g2 /* IEU1 */
379 4: rd %pc, %g7 /* LSU Group + 4 bubbles*/
381 inc %g5 /* IEU0 Group */
382 and %src, 0x38, %g3 /* IEU1 */
383 membar #StoreLoad /* LSU Group */
384 srl %g5, 1, %g5 /* IEU0 */
386 sll %g3, 8, %g3 /* IEU0 Group */
387 sub %len, 0xc0, %len /* IEU1 */
388 addcc %g5, %sum, %sum /* IEU1 Group */
389 srl %o4, 1, %o4 /* IEU0 */
390 add %g7, %g3, %g7 /* IEU0 Group */
391 add %o4, %sum, %sum /* IEU1 */
393 jmpl %g7 + %lo(vis0s), %g0 /* CTI+IEU1 Group */
395 jmpl %g7 + (vis0s - 4b), %g0 /* CTI+IEU1 Group */
/*
 * vis0s - main-loop entry reached from the computed jmpl (presumably the
 * (%src & 0x38) == 0 case; the spacing between the visNs entries is not
 * visible in this listing).
 *
 * Sets %asi to %g2 XOR ASI_BLK_XOR, loads %f0.. and %f16.. through the new
 * %asi from %src-128 and %src-64, primes the output registers %f48..%f62
 * with faligndata and starts the fcmpgt32 pipeline in %x1..%x8.
 *
 * vis0 is the three-step unrolled loop, rotating through the %f0 / %f16 /
 * %f32 register banks; every step stores one block with STBLK as STORE1.
 * vis0e1..vis0e3 are the matching exit steps: they use SYNC instead of a
 * load, store the remaining doublewords with ST(), adjust %dst and %len
 * and branch to one of e1..e3.
 */
400 vis0s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
401 add %src, 128, %src /* IEU0 Group */
402 ldda [%src-128] %asi, %f0 /* Load Group */
403 ldda [%src-64] %asi, %f16 /* Load Group */
404 fmovd %f48, %f62 /* FPA Group f0 available*/
405 faligndata %f0, %f2, %f48 /* FPA Group f2 available*/
406 fcmpgt32 %f32, %f2, %x1 /* FPM Group f4 available*/
407 fpadd32 %f0, %f62, %f0 /* FPA */
408 fcmpgt32 %f32, %f4, %x2 /* FPM Group f6 available*/
409 faligndata %f2, %f4, %f50 /* FPA */
410 fcmpgt32 %f62, %f0, %x3 /* FPM Group f8 available*/
411 faligndata %f4, %f6, %f52 /* FPA */
412 fcmpgt32 %f32, %f6, %x4 /* FPM Group f10 available*/
414 faligndata %f6, %f8, %f54 /* FPA */
415 fcmpgt32 %f32, %f8, %x5 /* FPM Group f12 available*/
416 srl %x1, 1, %x1 /* IEU0 */
418 faligndata %f8, %f10, %f56 /* FPA */
419 fcmpgt32 %f32, %f10, %x6 /* FPM Group f14 available*/
420 srl %x2, 1, %x2 /* IEU0 */
421 add %sum, %x1, %sum /* IEU1 */
422 faligndata %f10, %f12, %f58 /* FPA */
423 fcmpgt32 %f32, %f12, %x7 /* FPM Group */
425 add %sum, %x2, %sum /* IEU1 */
426 faligndata %f12, %f14, %f60 /* FPA */
427 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
428 srl %x3, 1, %x3 /* IEU0 */
430 fmovd %f14, %f62 /* FPA */
431 srl %x4, 1, %x4 /* IEU0 Group */
432 add %sum, %x3, %sum /* IEU1 */
433 vis0: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
434 ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
435 ,LDBLK(f32), STBLK,,,,,,,,
436 ,bcs,pn %icc, vis0e1)
437 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
438 ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
439 ,LDBLK(f0), STBLK,,,,,,,,
440 ,bcs,pn %icc, vis0e2)
441 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
442 ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
443 ,LDBLK(f16), STBLK,,,,,,,,
445 vis0e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
446 ,f48,f50,f52,f54,f56,f58,f60,f62,f32,
447 ,SYNC, STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
448 ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e2)
449 vis0e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
450 ,f48,f50,f52,f54,f56,f58,f60,f62,f0,
451 ,SYNC, STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
452 ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e3)
453 vis0e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
454 ,f48,f50,f52,f54,f56,f58,f60,f62,f16,
455 ,SYNC, STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
456 ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1)
/*
 * vis1s - same structure as vis0s, for a source that starts one doubleword
 * into the loaded block (presumably the (%src & 0x38) == 8 case).
 *
 * %src is advanced by 128 - 8. After the two loads %f0 is copied to %f58
 * and then overwritten with the running sum from %f48, so the first data
 * doubleword that counts is %f2. The output registers are rotated by one
 * position (A0 is %f62, A2..A14 are %f48..%f60) and STBLK moves to STORE2.
 * The exit steps store six further doublewords and advance %dst by 48.
 */
458 vis1s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
459 add %src, 128 - 8, %src /* IEU0 Group */
460 ldda [%src-128] %asi, %f0 /* Load Group */
461 ldda [%src-64] %asi, %f16 /* Load Group */
462 fmovd %f0, %f58 /* FPA Group */
463 fmovd %f48, %f0 /* FPA Group */
464 fcmpgt32 %f32, %f2, %x2 /* FPM Group */
465 faligndata %f2, %f4, %f48 /* FPA */
466 fcmpgt32 %f32, %f4, %x3 /* FPM Group */
467 faligndata %f4, %f6, %f50 /* FPA */
468 fcmpgt32 %f32, %f6, %x4 /* FPM Group */
469 faligndata %f6, %f8, %f52 /* FPA */
470 fcmpgt32 %f32, %f8, %x5 /* FPM Group */
472 faligndata %f8, %f10, %f54 /* FPA */
473 fcmpgt32 %f32, %f10, %x6 /* FPM Group */
474 srl %x2, 1, %x2 /* IEU0 */
475 faligndata %f10, %f12, %f56 /* FPA */
476 fcmpgt32 %f32, %f12, %x7 /* FPM Group */
478 add %sum, %x2, %sum /* IEU1 */
479 faligndata %f12, %f14, %f58 /* FPA */
480 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
481 srl %x3, 1, %x3 /* IEU0 */
483 fmovd %f14, %f60 /* FPA */
484 srl %x4, 1, %x4 /* IEU0 Group */
485 add %sum, %x3, %sum /* IEU1 */
486 vis1: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
487 ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
488 ,LDBLK(f32), ,STBLK,,,,,,,
489 ,bcs,pn %icc, vis1e1)
490 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
491 ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
492 ,LDBLK(f0), ,STBLK,,,,,,,
493 ,bcs,pn %icc, vis1e2)
494 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
495 ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
496 ,LDBLK(f16), ,STBLK,,,,,,,
498 vis1e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
499 ,f62,f48,f50,f52,f54,f56,f58,f60,f32,
500 ,SYNC, ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
501 ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e2)
502 vis1e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
503 ,f62,f48,f50,f52,f54,f56,f58,f60,f0,
504 ,SYNC, ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
505 ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e3)
506 vis1e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
507 ,f62,f48,f50,f52,f54,f56,f58,f60,f16,
508 ,SYNC, ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
509 ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1)
/*
 * vis2s - same structure as vis0s, for a source that starts two
 * doublewords into the loaded block (presumably (%src & 0x38) == 16).
 *
 * %src is advanced by 128 - 16 and %dst is moved back by 64 (each loop
 * step adds 64 to %dst before its block store, which is STORE3 here).
 * %f0 is replaced by the running sum from %f48 and %f2 is zeroed with
 * fpsub32, so the first data doubleword that counts is %f4. The exit
 * steps store five further doublewords at offsets 64..96 and advance
 * %dst by 104.
 */
511 vis2s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
512 add %src, 128 - 16, %src /* IEU0 Group */
513 ldda [%src-128] %asi, %f0 /* Load Group */
514 ldda [%src-64] %asi, %f16 /* Load Group */
515 fmovd %f0, %f56 /* FPA Group */
516 fmovd %f48, %f0 /* FPA Group */
517 sub %dst, 64, %dst /* IEU0 */
518 fpsub32 %f2, %f2, %f2 /* FPA Group */
519 fcmpgt32 %f32, %f4, %x3 /* FPM Group */
520 faligndata %f4, %f6, %f48 /* FPA */
521 fcmpgt32 %f32, %f6, %x4 /* FPM Group */
522 faligndata %f6, %f8, %f50 /* FPA */
523 fcmpgt32 %f32, %f8, %x5 /* FPM Group */
524 faligndata %f8, %f10, %f52 /* FPA */
525 fcmpgt32 %f32, %f10, %x6 /* FPM Group */
526 faligndata %f10, %f12, %f54 /* FPA */
527 fcmpgt32 %f32, %f12, %x7 /* FPM Group */
529 faligndata %f12, %f14, %f56 /* FPA */
530 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
531 srl %x3, 1, %x3 /* IEU0 */
533 fmovd %f14, %f58 /* FPA */
534 srl %x4, 1, %x4 /* IEU0 Group */
535 add %sum, %x3, %sum /* IEU1 */
536 vis2: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
537 ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
538 ,LDBLK(f32), ,,STBLK,,,,,,
539 ,bcs,pn %icc, vis2e1)
540 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
541 ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
542 ,LDBLK(f0), ,,STBLK,,,,,,
543 ,bcs,pn %icc, vis2e2)
544 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
545 ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
546 ,LDBLK(f16), ,,STBLK,,,,,,
548 vis2e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
549 ,f60,f62,f48,f50,f52,f54,f56,f58,f32,
550 ,SYNC, ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
551 ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e2)
552 vis2e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
553 ,f60,f62,f48,f50,f52,f54,f56,f58,f0,
554 ,SYNC, ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
555 ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e3)
556 vis2e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
557 ,f60,f62,f48,f50,f52,f54,f56,f58,f16,
558 ,SYNC, ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
559 ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1)
/*
 * vis3s - same structure as vis2s, for a source that starts three
 * doublewords into the loaded block (presumably (%src & 0x38) == 24).
 *
 * %src is advanced by 128 - 24 and %dst is moved back by 64. %f0 takes
 * the running sum from %f48; %f2 and %f4 are zeroed with fpsub32, so the
 * first data doubleword that counts is %f6. STBLK is STORE4; the exit
 * steps store four further doublewords and advance %dst by 96.
 */
561 vis3s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
562 add %src, 128 - 24, %src /* IEU0 Group */
563 ldda [%src-128] %asi, %f0 /* Load Group */
564 ldda [%src-64] %asi, %f16 /* Load Group */
565 fmovd %f0, %f54 /* FPA Group */
566 fmovd %f48, %f0 /* FPA Group */
567 sub %dst, 64, %dst /* IEU0 */
568 fpsub32 %f2, %f2, %f2 /* FPA Group */
569 fpsub32 %f4, %f4, %f4 /* FPA Group */
570 fcmpgt32 %f32, %f6, %x4 /* FPM Group */
571 faligndata %f6, %f8, %f48 /* FPA */
572 fcmpgt32 %f32, %f8, %x5 /* FPM Group */
573 faligndata %f8, %f10, %f50 /* FPA */
574 fcmpgt32 %f32, %f10, %x6 /* FPM Group */
575 faligndata %f10, %f12, %f52 /* FPA */
576 fcmpgt32 %f32, %f12, %x7 /* FPM Group */
577 faligndata %f12, %f14, %f54 /* FPA */
578 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
579 fmovd %f14, %f56 /* FPA */
581 srl %x4, 1, %x4 /* IEU0 Group */
582 vis3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
583 ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
584 ,LDBLK(f32), ,,,STBLK,,,,,
585 ,bcs,pn %icc, vis3e1)
586 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
587 ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
588 ,LDBLK(f0), ,,,STBLK,,,,,
589 ,bcs,pn %icc, vis3e2)
590 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
591 ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
592 ,LDBLK(f16), ,,,STBLK,,,,,
594 vis3e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
595 ,f58,f60,f62,f48,f50,f52,f54,f56,f32,
596 ,SYNC, ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
597 ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e2)
598 vis3e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
599 ,f58,f60,f62,f48,f50,f52,f54,f56,f0,
600 ,SYNC, ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
601 ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e3)
602 vis3e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
603 ,f58,f60,f62,f48,f50,f52,f54,f56,f16,
604 ,SYNC, ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
605 ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1)
/*
 * vis4s - same structure as vis2s, for a source that starts four
 * doublewords into the loaded block (presumably (%src & 0x38) == 32).
 *
 * %src is advanced by 128 - 32 and %dst is moved back by 64. %f0 takes
 * the running sum from %f48; %f2, %f4 and %f6 are zeroed with fpsub32, so
 * the first data doubleword that counts is %f8. STBLK is STORE5; the exit
 * steps store three further doublewords and advance %dst by 88.
 */
607 vis4s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
608 add %src, 128 - 32, %src /* IEU0 Group */
609 ldda [%src-128] %asi, %f0 /* Load Group */
610 ldda [%src-64] %asi, %f16 /* Load Group */
611 fmovd %f0, %f52 /* FPA Group */
612 fmovd %f48, %f0 /* FPA Group */
613 sub %dst, 64, %dst /* IEU0 */
614 fpsub32 %f2, %f2, %f2 /* FPA Group */
615 fpsub32 %f4, %f4, %f4 /* FPA Group */
616 fpsub32 %f6, %f6, %f6 /* FPA Group */
618 fcmpgt32 %f32, %f8, %x5 /* FPM Group */
619 faligndata %f8, %f10, %f48 /* FPA */
620 fcmpgt32 %f32, %f10, %x6 /* FPM Group */
621 faligndata %f10, %f12, %f50 /* FPA */
622 fcmpgt32 %f32, %f12, %x7 /* FPM Group */
623 faligndata %f12, %f14, %f52 /* FPA */
624 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
625 fmovd %f14, %f54 /* FPA */
626 vis4: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
627 ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
628 ,LDBLK(f32), ,,,,STBLK,,,,
629 ,bcs,pn %icc, vis4e1)
630 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
631 ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
632 ,LDBLK(f0), ,,,,STBLK,,,,
633 ,bcs,pn %icc, vis4e2)
634 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
635 ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
636 ,LDBLK(f16), ,,,,STBLK,,,,
638 vis4e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
639 ,f56,f58,f60,f62,f48,f50,f52,f54,f32,
640 ,SYNC, ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
641 ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e2)
642 vis4e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
643 ,f56,f58,f60,f62,f48,f50,f52,f54,f0,
644 ,SYNC, ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
645 ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e3)
646 vis4e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
647 ,f56,f58,f60,f62,f48,f50,f52,f54,f16,
648 ,SYNC, ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
649 ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1)
/*
 * vis5s - main-loop entry for a source that starts five doublewords into
 * its block (presumably (%src & 0x38) == 40).
 *
 * Unlike vis0s..vis4s, the first block is not loaded whole: the three
 * doublewords at %src-88, %src-80 and %src-72 are loaded individually
 * into %f10..%f14 through the incoming %asi, and only then is %asi
 * switched (%g2 XOR ASI_BLK_XOR) for the load of %f16.. from %src-64.
 * %f0 takes the running sum from %f48; %f2..%f8 are produced from %f32
 * with fmuld / faddd (presumably %f32 holds zero here, which would make
 * them zero - TODO confirm). %dst is moved back by 64, STBLK is STORE6,
 * and the exit steps store two further doublewords and advance %dst by 80.
 */
651 vis5s: add %src, 128 - 40, %src /* IEU0 Group */
652 ldda [%src-88] %asi, %f10 /* Load Group */
653 ldda [%src-80] %asi, %f12 /* Load Group */
654 ldda [%src-72] %asi, %f14 /* Load Group */
655 wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
656 ldda [%src-64] %asi, %f16 /* Load Group */
657 fmovd %f48, %f0 /* FPA Group */
658 fmuld %f32, %f32, %f2 /* FPM */
660 faddd %f32, %f32, %f4 /* FPA Group */
661 fmuld %f32, %f32, %f6 /* FPM */
663 faddd %f32, %f32, %f8 /* FPA Group */
664 fcmpgt32 %f32, %f10, %x6 /* FPM Group */
665 sub %dst, 64, %dst /* IEU0 */
666 faligndata %f10, %f12, %f48 /* FPA */
667 fcmpgt32 %f32, %f12, %x7 /* FPM Group */
668 faligndata %f12, %f14, %f50 /* FPA */
669 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
670 fmovd %f14, %f52 /* FPA */
671 vis5: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
672 ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
673 ,LDBLK(f32), ,,,,,STBLK,,,
674 ,bcs,pn %icc, vis5e1)
675 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
676 ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
677 ,LDBLK(f0), ,,,,,STBLK,,,
678 ,bcs,pn %icc, vis5e2)
679 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
680 ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
681 ,LDBLK(f16), ,,,,,STBLK,,,
683 vis5e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
684 ,f54,f56,f58,f60,f62,f48,f50,f52,f32,
685 ,SYNC, ,,,,,STBLK,ST(f48,64),ST(f50,72),
686 ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e2)
687 vis5e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
688 ,f54,f56,f58,f60,f62,f48,f50,f52,f0,
689 ,SYNC, ,,,,,STBLK,ST(f48,64),ST(f50,72),
690 ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e3)
691 vis5e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
692 ,f54,f56,f58,f60,f62,f48,f50,f52,f16,
693 ,SYNC, ,,,,,STBLK,ST(f48,64),ST(f50,72),
694 ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1)
/*
 * vis6s - main-loop entry for a source that starts six doublewords into
 * its block (presumably (%src & 0x38) == 48).
 *
 * Same scheme as vis5s with one doubleword fewer: %f12 and %f14 are
 * loaded individually from %src-80 and %src-72 before %asi is switched,
 * %f0 takes the running sum from %f48, and %f2..%f10 are produced from
 * %f32 with fmuld / faddd (presumably zero - TODO confirm). %dst is moved
 * back by 64, STBLK is STORE7, and the exit steps store one further
 * doubleword and advance %dst by 72.
 */
696 vis6s: add %src, 128 - 48, %src /* IEU0 Group */
697 ldda [%src-80] %asi, %f12 /* Load Group */
698 ldda [%src-72] %asi, %f14 /* Load Group */
699 wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
700 ldda [%src-64] %asi, %f16 /* Load Group */
701 fmovd %f48, %f0 /* FPA Group */
702 fmuld %f32, %f32, %f2 /* FPM */
704 faddd %f32, %f32, %f4 /* FPA Group */
705 fmuld %f32, %f32, %f6 /* FPM */
707 faddd %f32, %f32, %f8 /* FPA Group */
708 fmuld %f32, %f32, %f10 /* FPM */
710 fcmpgt32 %f32, %f12, %x7 /* FPM Group */
711 sub %dst, 64, %dst /* IEU0 */
712 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
713 faligndata %f12, %f14, %f48 /* FPA */
714 fmovd %f14, %f50 /* FPA Group */
715 vis6: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
716 ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
717 ,LDBLK(f32), ,,,,,,STBLK,,
718 ,bcs,pn %icc, vis6e1)
719 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
720 ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
721 ,LDBLK(f0), ,,,,,,STBLK,,
722 ,bcs,pn %icc, vis6e2)
723 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
724 ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
725 ,LDBLK(f16), ,,,,,,STBLK,,
727 vis6e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
728 ,f52,f54,f56,f58,f60,f62,f48,f50,f32,
729 ,SYNC, ,,,,,,STBLK,ST(f48,64),
730 ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e2)
731 vis6e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
732 ,f52,f54,f56,f58,f60,f62,f48,f50,f0,
733 ,SYNC, ,,,,,,STBLK,ST(f48,64),
734 ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e3)
735 vis6e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
736 ,f52,f54,f56,f58,f60,f62,f48,f50,f16,
737 ,SYNC, ,,,,,,STBLK,ST(f48,64),
738 ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1)
/*
 * vis7s - main-loop entry for a source that starts seven doublewords into
 * its block (presumably (%src & 0x38) == 56).
 *
 * Same scheme as vis5s with only one leading doubleword: %f14 is loaded
 * from %src-72 before %asi is switched, %f0 takes the running sum from
 * %f48, and %f2..%f12 are produced from %f32 with fmuld / faddd
 * (presumably zero - TODO confirm). %dst is moved back by 64 and STBLK is
 * STORE8; the exit steps store nothing beyond the block and advance %dst
 * by 64.
 */
740 vis7s: add %src, 128 - 56, %src /* IEU0 Group */
741 ldda [%src-72] %asi, %f14 /* Load Group */
742 wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
743 ldda [%src-64] %asi, %f16 /* Load Group */
744 fmovd %f48, %f0 /* FPA Group */
745 fmuld %f32, %f32, %f2 /* FPM */
747 faddd %f32, %f32, %f4 /* FPA Group */
748 fmuld %f32, %f32, %f6 /* FPM */
750 faddd %f32, %f32, %f8 /* FPA Group */
751 fmuld %f32, %f32, %f10 /* FPM */
753 faddd %f32, %f32, %f12 /* FPA Group */
755 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
756 sub %dst, 64, %dst /* IEU0 */
757 fmovd %f14, %f48 /* FPA */
758 vis7: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
759 ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
760 ,LDBLK(f32), ,,,,,,,STBLK,
761 ,bcs,pn %icc, vis7e1)
762 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
763 ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
764 ,LDBLK(f0), ,,,,,,,STBLK,
765 ,bcs,pn %icc, vis7e2)
766 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
767 ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
768 ,LDBLK(f16), ,,,,,,,STBLK,
770 vis7e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
771 ,f50,f52,f54,f56,f58,f60,f62,f48,f32,
772 ,SYNC, ,,,,,,,STBLK,
773 ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e2)
774 vis7e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
775 ,f50,f52,f54,f56,f58,f60,f62,f48,f0,
776 ,SYNC, ,,,,,,,STBLK,
777 ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e3)
778 vis7e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
779 ,f50,f52,f54,f56,f58,f60,f62,f48,f16,
780 ,SYNC, ,,,,,,,STBLK,
781 ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e1)
/*
 * e1 / e2 / e3 - loop exits, one per register bank holding the live
 * partial sums. Each expands END_THE_TRICK1 (which branches to ett) and
 * copies %f16, %f32 or %f0 respectively into %f6.
 *
 * ett - common tail after the 64-byte loop: restores %asi, copies the
 * remaining whole doublewords, stores the last pending data and runs the
 * final reduction (END_THE_TRICK2).
 *
 * NOTE(review): some lines appear to be missing from this listing: the
 * "bne,pn" below is followed directly by another branch (its delay-slot
 * instruction is not visible), the "2:" label targeted by the "2f"
 * branches is not visible, and %f10 is used as an accumulator without a
 * visible initialisation. Two alternative "wr ... %asi" sequences also
 * appear back to back, presumably selected by conditional compilation.
 * Confirm against the full file.
 */
782 e1: END_THE_TRICK1( f0,f2,f4,f6,f8,f10,f12,f14,f16,f6)
783 e2: END_THE_TRICK1( f16,f18,f20,f22,f24,f26,f28,f30,f32,f6)
784 e3: END_THE_TRICK1( f32,f34,f36,f38,f40,f42,f44,f46,f0,f6)
/* %x4 = current %asi, %x3 = %gsr (its low three bits are kept below). */
785 ett: rd %asi, %x4 /* LSU Group+4bubbles */
786 rd %gsr, %x3 /* LSU Group+4bubbles */
788 srl %x4, 3, %x5 /* IEU0 Group */
789 xor %x4, ASI_BLK_XOR1, %x4 /* IEU1 */
790 wr %x4, %x5, %asi /* LSU Group+4bubbles */
792 wr %x4, ASI_BLK_XOR, %asi /* LSU Group+4bubbles */
/* %x3 = %gsr & 7. If it is zero and nothing is left (%len == 0), store
 * %f6 and go to 2f. */
794 andcc %x3, 7, %x3 /* IEU1 Group */
795 add %dst, 8, %dst /* IEU0 */
796 bne,pn %icc, 1f /* CTI */
798 brz,a,pn %len, 2f /* CTI+IEU1 Group */
799 std %f6, [%dst - 8] /* Store */
/* Fewer than 8 bytes left: skip the doubleword loop. The delay slot moves
 * %src back by 64. */
800 1: cmp %len, 8 /* IEU1 */
801 blu,pn %icc, 3f /* CTI */
802 sub %src, 64, %src /* IEU0 Group */
/* Doubleword loop: load 8 bytes into %f2, accumulate into %f10 via %f12,
 * realign %f6:%f2 into %f14 and store it, then carry %f2 over in %f6.
 * The fcmpgt32 result in %x5 is shifted and added to %sum. Repeats while
 * %len >= 8. */
803 1: ldda [%src] %asi, %f2 /* Load Group */
804 fpadd32 %f10, %f2, %f12 /* FPA Group+load stall*/
805 add %src, 8, %src /* IEU0 */
806 add %dst, 8, %dst /* IEU1 */
807 faligndata %f6, %f2, %f14 /* FPA Group */
808 fcmpgt32 %f10, %f12, %x5 /* FPM Group */
809 std %f14, [%dst - 16] /* Store */
810 fmovd %f2, %f6 /* FPA */
811 fmovd %f12, %f10 /* FPA Group */
812 sub %len, 8, %len /* IEU1 */
813 fzero %f16 /* FPA Group - FPU nop */
814 fzero %f18 /* FPA Group - FPU nop */
816 srl %x5, 1, %x5 /* IEU0 Group (regdep) */
817 cmp %len, 8 /* IEU1 */
818 bgeu,pt %icc, 1b /* CTI */
819 add %x5, %sum, %sum /* IEU0 Group */
/* %x3 == 0: store all of %f6 (annulled slot) and go to 2f. Otherwise
 * store only the word in %f7, step %dst back by 4 and give 4 bytes back
 * to %len. */
820 3: brz,a,pt %x3, 2f /* CTI+IEU1 */
821 std %f6, [%dst - 8] /* Store Group */
822 st %f7, [%dst - 8] /* Store Group */
823 sub %dst, 4, %dst /* IEU0 */
824 add %len, 4, %len /* IEU1 */
/* Reserve 8 bytes of stack around END_THE_TRICK2, which stores a
 * doubleword at [%sp + STACKOFF] and loads it back. */
827 sub %sp, 8, %sp /* IEU0 Group */
829 END_THE_TRICK2( f48,f50,f52,f54,f56,f58,f60,f10,f12,f62)
830 membar #Sync /* LSU Group */
833 add %sp, 8, %sp /* IEU0 Group */
/*
 * Final fold and residual bytes.
 *
 * 23: if bytes remain (%len != 0) go to 26 first; otherwise fall into the
 *     fold. The delay slot at 24 precomputes %g1 = %sum << 32.
 * 25: %src = %sum + (%sum << 32); the upper 32 bits of that addition,
 *     plus 1 on carry out of %xcc, are the value left in %src.
 * 26: copies the remaining 8 / 4 / 2 / 1 bytes selected by the low bits
 *     of %len. The 8-byte piece is added to %sum directly; the 4-, 2- and
 *     1-byte pieces are gathered in %g2 (word << 32), %g3 (halfword << 16)
 *     and %o5 (byte << 8), ORed together and added once with end-around
 *     carry, and control returns to 25.
 *
 * Fix: the 4-byte step's "be,a,pn" was followed directly by its lduwa,
 * which put the load in the annulled delay slot - skipped exactly when
 * the word has to be copied - and left %g2 uncleared on the skip path
 * although it is ORed into the sum at the end. "clr %g2" is restored as
 * the delay slot, matching the 2-byte (clr %g3) and 1-byte (clr %o5) steps.
 *
 * NOTE(review): two "1: retl" sequences appear back to back (one
 * zero-extends %src, the other reloads %g4 from the thread info);
 * presumably only one is assembled in a given build. The trailing .word
 * line looks like an exception-table entry whose section directives and
 * "end" label are not visible here. Confirm against the full file.
 */
835 23: brnz,pn %len, 26f /* CTI+IEU1 Group */
836 24: sllx %sum, 32, %g1 /* IEU0 */
837 25: addcc %sum, %g1, %src /* IEU1 Group */
838 srlx %src, 32, %src /* IEU0 Group (regdep) */
839 bcs,a,pn %xcc, 1f /* CTI */
840 add %src, 1, %src /* IEU1 */
842 1: retl /* CTI Group brk forced*/
843 srl %src, 0, %src /* IEU0 */
845 1: retl /* CTI Group brk forced*/
846 ldx [%g6 + TI_TASK], %g4 /* Load */
/* 8-byte piece: two words, stored separately, added as one 64-bit value. */
848 26: andcc %len, 8, %g0 /* IEU1 Group */
849 be,pn %icc, 1f /* CTI */
850 lduwa [%src] %asi, %o4 /* Load */
851 lduwa [%src+4] %asi, %g2 /* Load Group */
852 add %src, 8, %src /* IEU0 */
853 add %dst, 8, %dst /* IEU1 */
854 sllx %o4, 32, %g5 /* IEU0 Group */
855 stw %o4, [%dst - 8] /* Store */
856 or %g5, %g2, %g5 /* IEU0 Group */
857 stw %g2, [%dst - 4] /* Store */
858 addcc %g5, %sum, %sum /* IEU1 Group */
859 bcs,a,pn %xcc, 1f /* CTI */
860 add %sum, 1, %sum /* IEU0 */
/* 4-byte piece: %g2 = word << 32, or 0 when skipped (annulled slot). */
861 1: andcc %len, 4, %g0 /* IEU1 Group */
862 be,a,pn %icc, 1f /* CTI */
863 clr %g2 /* IEU0 Group */
864 lduwa [%src] %asi, %g7 /* Load */
865 add %src, 4, %src /* IEU0 Group */
866 add %dst, 4, %dst /* IEU1 */
867 sllx %g7, 32, %g2 /* IEU0 Group */
868 stw %g7, [%dst - 4] /* Store */
/* 2-byte piece: %g3 = halfword << 16, or 0 when skipped. */
869 1: andcc %len, 2, %g0 /* IEU1 */
870 be,a,pn %icc, 1f /* CTI */
871 clr %g3 /* IEU0 Group */
872 lduha [%src] %asi, %g7 /* Load */
873 add %src, 2, %src /* IEU1 */
874 add %dst, 2, %dst /* IEU0 Group */
875 sll %g7, 16, %g3 /* IEU0 Group */
876 sth %g7, [%dst - 2] /* Store */
/* 1-byte piece: %o5 = byte << 8, or 0 when skipped. */
877 1: andcc %len, 1, %g0 /* IEU1 */
878 be,a,pn %icc, 1f /* CTI */
879 clr %o5 /* IEU0 Group */
880 lduba [%src] %asi, %g7 /* Load */
881 sll %g7, 8, %o5 /* IEU0 Group */
882 stb %g7, [%dst] /* Store */
/* Combine the three pieces and add them with end-around carry. */
883 1: or %g2, %g3, %g3 /* IEU1 */
884 or %o5, %g3, %g3 /* IEU0 Group (regdep) */
885 addcc %g3, %sum, %sum /* IEU1 Group (regdep) */
886 bcs,a,pn %xcc, 1f /* CTI */
887 add %sum, 1, %sum /* IEU0 */
888 1: ba,pt %xcc, 25b /* CTI Group */
889 sllx %sum, 32, %g1 /* IEU0 */
896 .word csum_partial_copy_vis, 0, end, cpc_handler