ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-2.6.6.tar.bz2
[linux-2.6.git] / arch / sparc64 / lib / VIScsumcopy.S
1 /* $Id: VIScsumcopy.S,v 1.8 2000/02/20 23:21:39 davem Exp $
2  * VIScsumcopy.S: High bandwidth IP checksumming with simultaneous
3  *            copying utilizing the UltraSparc Visual Instruction Set.
4  *
5  * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
6  *
7  * Based on older sparc32/sparc64 checksum.S, which is:
8  *
9  *      Copyright(C) 1995 Linus Torvalds
10  *      Copyright(C) 1995 Miguel de Icaza
11  *      Copyright(C) 1996,1997 David S. Miller
12  *    derived from:
13  *        Linux/Alpha checksum c-code
14  *        Linux/ix86 inline checksum assembly
15  *        RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
16  *        David Mosberger-Tang for optimized reference c-code
17  *        BSD4.4 portable checksum routine
18  */
19
/*
 * STACKOFF: offset from %sp of the scratch doubleword that END_THE_TRICK2
 * uses to move a value from an FP register into an integer register
 * (std to [%sp + STACKOFF], then ldx from the same address).
 * NOTE(review): 0x7ff looks like the 64-bit SPARC V9 stack bias, and 128 / 64
 * like the size of the register-window save area - confirm against the ABI.
 */
20 #ifdef __sparc_v9__
21 #define STACKOFF        0x7ff+128
22 #else
23 #define STACKOFF        64
24 #endif
25
/*
 * ASI / FPRS helper constants.
 *
 * The routine is entered with a plain ASI in %asi and turns it into the
 * matching block-transfer ASI before the main loop:
 *   - kernel build:     %g2 = %asi | ASI_BLK_OR, then "wr %g2, ASI_BLK_XOR, %asi"
 *                       with ASI_BLK_XOR == 0, so the OR alone does the job.
 *   - standalone build: "wr %g2, ASI_BLK_XOR, %asi" with
 *                       ASI_BLK_XOR == ASI_BLK_P ^ ASI_P.
 * ASI_BLK_XOR1 is only defined for the kernel build and is not referenced
 * in the part of the file reviewed here.
 *
 * In the standalone build the ASI and FPRS values come from these local
 * definitions instead of the kernel headers.
 */
26 #ifdef __KERNEL__
27 #include <asm/head.h>
28 #include <asm/asi.h>
29 #include <asm/page.h>
30 #include <asm/visasm.h>
31 #include <asm/thread_info.h>
32 #define ASI_BLK_XOR     0
33 #define ASI_BLK_XOR1    (ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P)
34 #define ASI_BLK_OR      (ASI_BLK_P & ~ASI_P)
35 #else
36 #define ASI_P           0x80
37 #define ASI_BLK_P       0xf0
/* FRPS_FEF is a historical misspelling; it is kept so that existing users
 * still assemble.  FPRS_FEF is the correctly spelled name (same value),
 * consistent with FPRS_DU / FPRS_DL below. */
38 #define FRPS_FEF        0x04
#define FPRS_FEF        0x04
39 #define FPRS_DU         0x02
40 #define FPRS_DL         0x01
41 #define ASI_BLK_XOR     (ASI_BLK_P ^ ASI_P)
42 #endif
43
/*
 * Symbolic register names used throughout the file (written as %src, %x1, ...).
 *   src, dst, len, sum - %o0..%o3: source pointer, destination pointer,
 *                        byte count and running checksum.
 *   x1..x8             - integer scratch registers.  The macros below write
 *                        fcmpgt32 result masks into them and fold those
 *                        masks into sum.
 * Note that x3 and x8 live in %o4 / %o5, the other x registers in globals
 * (%g1-%g5, %g7), and that the numbering is not sequential (x6 = %g7,
 * x7 = %g3).
 */
44 #define src             o0
45 #define dst             o1
46 #define len             o2
47 #define sum             o3
48 #define x1              g1
49 #define x2              g2
50 #define x3              o4
51 #define x4              g4
52 #define x5              g5
53 #define x6              g7
54 #define x7              g3
55 #define x8              o5
56
57 /* Dobrou noc, SunSoft engineers. Spete sladce.
 * (Czech for "Good night, SunSoft engineers. Sleep sweetly.")
58  * This has a couple of tricks in it, and those
59  * tricks are UltraLinux trade secrets :))
60  * Once AGAIN, the SunSoft engineers are caught
61  * asleep at the keyboard :)).
62  * The main loop does about 20 superscalar cycles
63  * per 64bytes checksummed/copied.
64  */
65
/* LDBLK(O0): ldda from [%src] through the ASI currently held in %asi into
 * FP register O0 (given without the leading '%').  Used as the LOAD slot of
 * DO_THE_TRICK after %asi has been switched to a block ASI. */
66 #define LDBLK(O0)                                                                       \
67         ldda            [%src] %asi, %O0        /*  Load        Group           */
68
/* STBLK: stda starting at %f48 to [%dst] with the fixed ASI ASI_BLK_P.
 * %f48..%f62 are the faligndata output registers at the visible call sites. */
69 #define STBLK                                                                           \
70         stda            %f48, [%dst] ASI_BLK_P  /*  Store                       */
71
/* ST(fx,off): plain doubleword store of FP register fx to [%dst + off]. */
72 #define ST(fx,off)                                                                      \
73         std             %fx, [%dst + off]       /*  Store                       */
74
/* SYNC: full "membar #Sync" barrier. */
75 #define SYNC                                                                            \
76         membar          #Sync
77
78
/*
 * DO_THE_TRICK: one 64-byte step of the main copy + checksum loop.
 *
 * All register arguments are names without the leading '%'.  Note that the
 * lower-case parameters f0..f14 are macro parameters, not fixed registers.
 *   f0..f14   eight double registers holding the running 32-bit-lane sums
 *             left by the previous step (only read here).
 *   F0..F14   eight double registers holding the current 64 source bytes.
 *             Each is replaced by F + f (fpadd32), so after the step they
 *             hold the running sums for the next step.
 *   A0..A14   faligndata destinations, i.e. the realigned output data.
 *             A14 is also an input: the first faligndata combines it with
 *             F0 and thereby completes the PREVIOUS step's output block,
 *             which is why the visible call sites put STBLK in STORE1.
 *   B14       receives a copy of F14, taken before F14 is summed.
 *   LOAD      instruction slot executed first (e.g. LDBLK of a later block).
 *   STORE1..8 optional store slots interleaved with the arithmetic.
 *   BRANCH    branch placed right after "subcc %len, 64, %len"; the final
 *             fcmpgt32 sits in its delay slot.
 *   DUMMY1..3 always-empty placeholders that absorb the leading commas the
 *             call sites use to start their continuation lines.
 *
 * Carry handling: after every fpadd32, "fcmpgt32 old, new" produces a
 * per-lane mask in an x register.  "inc; srl 1" maps a two-bit mask
 * 0,1,2,3 to 0,1,1,2 (the number of set bits), and that count is added to
 * %sum.  The sequence is software-pipelined across steps: on entry x4 must
 * already hold a finished count and x5..x8 raw masks; on exit the same
 * registers are left in the same state for the next step.
 * NOTE(review): fcmpgt32 is a signed compare, so the masks are presumably
 * corrected by the "fcmpgt32 %fz, ..." terms in END_THE_TRICK /
 * END_THE_TRICK2 - confirm against the VIS documentation.
 *
 * Side effects: %src and %dst advance by 64, %len decreases by 64, x1..x8
 * and the integer condition codes are overwritten.
 */
79 #define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...)  \
80         LOAD                                    /*  Load        (Group)         */;     \
81         faligndata      %A14, %F0, %A14         /*  FPA         Group           */;     \
82         inc             %x5                     /*  IEU0                        */;     \
83         STORE1                                  /*  Store (optional)            */;     \
84         faligndata      %F0, %F2, %A0           /*  FPA         Group           */;     \
85         srl             %x5, 1, %x5             /*  IEU0                        */;     \
86         add             %sum, %x4, %sum         /*  IEU1                        */;     \
87         fpadd32         %F0, %f0, %F0           /*  FPA         Group           */;     \
88         inc             %x6                     /*  IEU0                        */;     \
89         STORE2                                  /*  Store (optional)            */;     \
90         faligndata      %F2, %F4, %A2           /*  FPA         Group           */;     \
91         srl             %x6, 1, %x6             /*  IEU0                        */;     \
92         add             %sum, %x5, %sum         /*  IEU1                        */;     \
93         fpadd32         %F2, %f2, %F2           /*  FPA         Group           */;     \
94         add             %src, 64, %src          /*  IEU0                        */;     \
95         fcmpgt32        %f0, %F0, %x1           /*  FPM                         */;     \
96         add             %dst, 64, %dst          /*  IEU1        Group           */;     \
97         inc             %x7                     /*  IEU0                        */;     \
98         STORE3                                  /*  Store (optional)            */;     \
99         faligndata      %F4, %F6, %A4           /*  FPA                         */;     \
100         fpadd32         %F4, %f4, %F4           /*  FPA         Group           */;     \
101         add             %sum, %x6, %sum         /*  IEU1                        */;     \
102         fcmpgt32        %f2, %F2, %x2           /*  FPM                         */;     \
103         srl             %x7, 1, %x7             /*  IEU0        Group           */;     \
104         inc             %x8                     /*  IEU1                        */;     \
105         STORE4                                  /*  Store (optional)            */;     \
106         faligndata      %F6, %F8, %A6           /*  FPA                         */;     \
107         fpadd32         %F6, %f6, %F6           /*  FPA         Group           */;     \
108         srl             %x8, 1, %x8             /*  IEU0                        */;     \
109         fcmpgt32        %f4, %F4, %x3           /*  FPM                         */;     \
110         add             %sum, %x7, %sum         /*  IEU0        Group           */;     \
111         inc             %x1                     /*  IEU1                        */;     \
112         STORE5                                  /*  Store (optional)            */;     \
113         faligndata      %F8, %F10, %A8          /*  FPA                         */;     \
114         fpadd32         %F8, %f8, %F8           /*  FPA         Group           */;     \
115         srl             %x1, 1, %x1             /*  IEU0                        */;     \
116         fcmpgt32        %f6, %F6, %x4           /*  FPM                         */;     \
117         add             %sum, %x8, %sum         /*  IEU0        Group           */;     \
118         inc             %x2                     /*  IEU1                        */;     \
119         STORE6                                  /*  Store (optional)            */;     \
120         faligndata      %F10, %F12, %A10        /*  FPA                         */;     \
121         fpadd32         %F10, %f10, %F10        /*  FPA         Group           */;     \
122         srl             %x2, 1, %x2             /*  IEU0                        */;     \
123         fcmpgt32        %f8, %F8, %x5           /*  FPM                         */;     \
124         add             %sum, %x1, %sum         /*  IEU0        Group           */;     \
125         inc             %x3                     /*  IEU1                        */;     \
126         STORE7                                  /*  Store (optional)            */;     \
127         faligndata      %F12, %F14, %A12        /*  FPA                         */;     \
128         fpadd32         %F12, %f12, %F12        /*  FPA         Group           */;     \
129         srl             %x3, 1, %x3             /*  IEU0                        */;     \
130         fcmpgt32        %f10, %F10, %x6         /*  FPM                         */;     \
131         add             %sum, %x2, %sum         /*  IEU0        Group           */;     \
132         inc             %x4                     /*  IEU1                        */;     \
133         STORE8                                  /*  Store (optional)            */;     \
134         fmovd           %F14, %B14              /*  FPA                         */;     \
135         fpadd32         %F14, %f14, %F14        /*  FPA         Group           */;     \
136         srl             %x4, 1, %x4             /*  IEU0                        */;     \
137         fcmpgt32        %f12, %F12, %x7         /*  FPM                         */;     \
138         add             %sum, %x3, %sum         /*  IEU0        Group           */;     \
139         subcc           %len, 64, %len          /*  IEU1                        */;     \
140         BRANCH                                  /*  CTI                         */;     \
141         fcmpgt32        %f14, %F14, %x8         /*  FPM         Group           */;
142
/*
 * END_THE_TRICK: first stage of the final reduction.
 *
 * Folds the eight running-sum registers f0..f14 pairwise with fpadd32:
 *   S0 = f2 + f0,  S1 = f6 + f4,  S2 = f10 + f8,  S3 = f14 + f12,
 *   T0 = S0 + S1,  T1 = S2 + S3,  U0 = T0 + T1
 * while continuing the "inc; srl 1; add to %sum" fix-up, first for the
 * state left in x4..x8 by DO_THE_TRICK (x4 a finished count, x5..x8 raw
 * masks) and then for the "fcmpgt32 old, new" masks produced here.
 *
 * fz is cleared and compared against f2, f6, f10 and f14; those masks are
 * NOT folded into %sum here.  State handed on when the macro ends:
 *   x6      halved count from "fcmpgt32 S2, T1", not yet added
 *   x7      mask from "fcmpgt32 fz, f2", incremented (delay slot) but not
 *           yet halved
 *   x8, x1, x2  raw masks from the compares of fz with f6, f10, f14
 * This matches what END_THE_TRICK2 expects on entry.
 *
 * FA is copied to FB, and control transfers to the label "ett", which is
 * defined outside this macro.
 */
143 #define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \
144         inc             %x5                     /*  IEU0        Group           */;     \
145         fpadd32         %f2, %f0, %S0           /*  FPA                         */;     \
146         add             %sum, %x4, %sum         /*  IEU1                        */;     \
147         srl             %x5, 1, %x5             /*  IEU0        Group           */;     \
148         fpadd32         %f6, %f4, %S1           /*  FPA                         */;     \
149         inc             %x6                     /*  IEU1                        */;     \
150         fpadd32         %f10, %f8, %S2          /*  FPA         Group           */;     \
151         add             %sum, %x5, %sum         /*  IEU0                        */;     \
152         fcmpgt32        %f0, %S0, %x1           /*  FPM                         */;     \
153         fpadd32         %f14, %f12, %S3         /*  FPA         Group           */;     \
154         srl             %x6, 1, %x6             /*  IEU0                        */;     \
155         fcmpgt32        %f4, %S1, %x2           /*  FPM                         */;     \
156         add             %sum, %x6, %sum         /*  IEU0        Group           */;     \
157         fzero           %fz                     /*  FPA                         */;     \
158         fcmpgt32        %f8, %S2, %x3           /*  FPM                         */;     \
159         inc             %x7                     /*  IEU0        Group           */;     \
160         inc             %x8                     /*  IEU1                        */;     \
161         srl             %x7, 1, %x7             /*  IEU0        Group           */;     \
162         inc             %x1                     /*  IEU1                        */;     \
163         fpadd32         %S0, %S1, %T0           /*  FPA                         */;     \
164         fpadd32         %S2, %S3, %T1           /*  FPA         Group           */;     \
165         add             %sum, %x7, %sum         /*  IEU0                        */;     \
166         fcmpgt32        %f12, %S3, %x4          /*  FPM                         */;     \
167         srl             %x8, 1, %x8             /*  IEU0        Group           */;     \
168         inc             %x2                     /*  IEU1                        */;     \
169         srl             %x1, 1, %x1             /*  IEU0        Group           */;     \
170         add             %sum, %x8, %sum         /*  IEU1                        */;     \
171         add             %sum, %x1, %sum         /*  IEU0        Group           */;     \
172         fcmpgt32        %S0, %T0, %x5           /*  FPM                         */;     \
173         srl             %x2, 1, %x2             /*  IEU0        Group           */;     \
174         fcmpgt32        %S2, %T1, %x6           /*  FPM                         */;     \
175         inc             %x3                     /*  IEU0        Group           */;     \
176         add             %sum, %x2, %sum         /*  IEU1                        */;     \
177         srl             %x3, 1, %x3             /*  IEU0        Group           */;     \
178         inc             %x4                     /*  IEU1                        */;     \
179         fpadd32         %T0, %T1, %U0           /*  FPA         Group           */;     \
180         add             %sum, %x3, %sum         /*  IEU0                        */;     \
181         fcmpgt32        %fz, %f2, %x7           /*  FPM                         */;     \
182         srl             %x4, 1, %x4             /*  IEU0        Group           */;     \
183         fcmpgt32        %fz, %f6, %x8           /*  FPM                         */;     \
184         inc             %x5                     /*  IEU0        Group           */;     \
185         add             %sum, %x4, %sum         /*  IEU1                        */;     \
186         srl             %x5, 1, %x5             /*  IEU0        Group           */;     \
187         fcmpgt32        %fz, %f10, %x1          /*  FPM                         */;     \
188         inc             %x6                     /*  IEU0        Group           */;     \
189         add             %sum, %x5, %sum         /*  IEU1                        */;     \
190         fmovd           %FA, %FB                /*  FPA         Group           */;     \
191         fcmpgt32        %fz, %f14, %x2          /*  FPM                         */;     \
192         srl             %x6, 1, %x6             /*  IEU0        Group           */;     \
193         ba,pt           %xcc, ett               /*  CTI                         */;     \
194          inc            %x7                     /*  IEU1                        */;
195
/*
 * END_THE_TRICK1: END_THE_TRICK with its temporaries fixed to the upper FP
 * registers: S0..S3 = f48,f50,f52,f54; T0,T1 = f56,f58; U0 = f60; fz = f62.
 * The caller only chooses the eight sum registers and the FA -> FB copy.
 */
196 #define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB)                                \
197         END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62)
198
/*
 * END_THE_TRICK2: last stage of the reduction; leaves the result in %sum.
 *
 *   V0 = U0 + U1 (fpadd32) is spilled to the scratch slot at
 *   [%sp + STACKOFF] and reloaded into x8 as one 64-bit integer.
 *   Counts derived from "fcmpgt32 old, new" masks (T0/U0, U0/V0, and the
 *   pending x6) are ADDED to %sum; counts derived from "fcmpgt32 %fz, reg"
 *   masks (S1, S3, T1, U1, V0, and the pending x7, x8, x1, x2) are
 *   SUBTRACTED.  Finally x8 is added with an end-around carry: the annulled
 *   "bcs,a" on %xcc executes its "add %sum, 1" delay slot only when the
 *   64-bit addcc carried, and its target 33: is the very next instruction.
 *
 * Expected on entry (this is the state END_THE_TRICK leaves behind):
 *   x6 = finished count, x7 = mask already incremented but not halved,
 *   x8, x1, x2 = raw masks.
 * Parameters S0, S2 are accepted but not referenced in the body.
 * The numeric local label 33 is defined inside the macro.
 */
199 #define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz)                                   \
200         fpadd32         %U0, %U1, %V0           /*  FPA         Group           */;     \
201         srl             %x7, 1, %x7             /*  IEU0                        */;     \
202         add             %sum, %x6, %sum         /*  IEU1                        */;     \
203         std             %V0, [%sp + STACKOFF]   /*  Store       Group           */;     \
204         inc             %x8                     /*  IEU0                        */;     \
205         sub             %sum, %x7, %sum         /*  IEU1                        */;     \
206         srl             %x8, 1, %x8             /*  IEU0        Group           */;     \
207         fcmpgt32        %fz, %S1, %x3           /*  FPM                         */;     \
208         inc             %x1                     /*  IEU0        Group           */;     \
209         fcmpgt32        %fz, %S3, %x4           /*  FPM                         */;     \
210         srl             %x1, 1, %x1             /*  IEU0        Group           */;     \
211         sub             %sum, %x8, %sum         /*  IEU1                        */;     \
212         ldx             [%sp + STACKOFF], %x8   /*  Load        Group           */;     \
213         inc             %x2                     /*  IEU0                        */;     \
214         sub             %sum, %x1, %sum         /*  IEU1                        */;     \
215         srl             %x2, 1, %x2             /*  IEU0        Group           */;     \
216         fcmpgt32        %fz, %T1, %x5           /*  FPM                         */;     \
217         inc             %x3                     /*  IEU0        Group           */;     \
218         fcmpgt32        %T0, %U0, %x6           /*  FPM                         */;     \
219         srl             %x3, 1, %x3             /*  IEU0        Group           */;     \
220         sub             %sum, %x2, %sum         /*  IEU1                        */;     \
221         inc             %x4                     /*  IEU0        Group           */;     \
222         sub             %sum, %x3, %sum         /*  IEU1                        */;     \
223         srl             %x4, 1, %x4             /*  IEU0        Group           */;     \
224         fcmpgt32        %fz, %U1, %x7           /*  FPM                         */;     \
225         inc             %x5                     /*  IEU0        Group           */;     \
226         fcmpgt32        %U0, %V0, %x1           /*  FPM                         */;     \
227         srl             %x5, 1, %x5             /*  IEU0        Group           */;     \
228         sub             %sum, %x4, %sum         /*  IEU1                        */;     \
229         sub             %sum, %x5, %sum         /*  IEU0        Group           */;     \
230         fcmpgt32        %fz, %V0, %x2           /*  FPM                         */;     \
231         inc             %x6                     /*  IEU0        Group           */;     \
232         inc             %x7                     /*  IEU1                        */;     \
233         srl             %x6, 1, %x6             /*  IEU0        Group           */;     \
234         inc             %x1                     /*  IEU1                        */;     \
235         srl             %x7, 1, %x7             /*  IEU0        Group           */;     \
236         add             %sum, %x6, %sum         /*  IEU1                        */;     \
237         srl             %x1, 1, %x1             /*  IEU0        Group           */;     \
238         sub             %sum, %x7, %sum         /*  IEU1                        */;     \
239         inc             %x2                     /*  IEU0        Group           */;     \
240         add             %sum, %x1, %sum         /*  IEU1                        */;     \
241         srl             %x2, 1, %x2             /*  IEU0        Group           */;     \
242         sub             %sum, %x2, %sum         /*  IEU0        Group           */;     \
243         addcc           %sum, %x8, %sum         /*  IEU1        Group           */;     \
244         bcs,a,pn        %xcc, 33f               /*  CTI                         */;     \
245          add            %sum, 1, %sum           /*  IEU0        (Group)         */;     \
246 33:                                             /*  That's it                   */;
247
/*
 * csum_partial_copy_vis - copy len bytes from src to dst while accumulating
 * a checksum in sum, using VIS instructions.
 *
 * In:      %o0 = src (read through %asi), %o1 = dst, %o2 = len, %o3 = sum
 * Scratch: %g1-%g5, %g7, %o4, %o5, FP registers, integer condition codes
 *
 * Structure of the part shown here:
 *   1. Copy a leading halfword and/or word so that dst is 8-byte aligned.
 *   2. Round src down to 8 bytes with alignaddr and compensate sum.
 *   3. Copy 8, 16 and/or 32 bytes so that dst is 64-byte aligned.
 *   4. Select the block ASI and jump to vis0s + ((src & 0x38) << 8).
 * The main loops and the return sequence follow after the dispatch.
 */
248         .text
249         .globl          csum_partial_copy_vis
250         .align          32
251 /* %asi should be either ASI_P (for csum_partial_copy) or ASI_AIUS
252  * (for csum_partial_copy_from_user).
253  * This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256
254  */
255 csum_partial_copy_vis:
        /* Step 1: bring dst to an 8-byte boundary.  If (dst & 7) == 0 go
         * straight to 4: with %o4 = dst & 0x38 (computed in the delay slot).
         * %g5 = 1 is the carry increment; it becomes 1 << 16 when a halfword
         * is copied, because that halfword is added in the upper 16 bits. */
256         andcc           %dst, 7, %g0            /*  IEU1        Group           */
257         be,pt           %icc, 4f                /*  CTI                         */
258          and            %dst, 0x38, %o4         /*  IEU0                        */
259         mov             1, %g5                  /*  IEU0        Group           */
260         andcc           %dst, 2, %g0            /*  IEU1                        */
261         be,pt           %icc, 1f                /*  CTI                         */
262          and            %dst, 4, %g7            /*  IEU0        Group           */
        /* dst & 2: copy one halfword; %g7 = dst & 4 is recomputed after the
         * pointer has advanced. */
263         lduha           [%src] %asi, %g2        /*  Load                        */
264         sub             %len, 2, %len           /*  IEU0        Group           */
265         add             %dst, 2, %dst           /*  IEU1                        */
266         andcc           %dst, 4, %g7            /*  IEU1        Group           */
267         sll             %g5, 16, %g5            /*  IEU0                        */
268         sth             %g2, [%dst - 2]         /*  Store       Group           */
269         sll             %g2, 16, %g2            /*  IEU0                        */
270         add             %src, 2, %src           /*  IEU1                        */
271         addcc           %g2, %sum, %sum         /*  IEU1        Group           */
272         bcs,a,pn        %icc, 1f                /*  CTI                         */
273          add            %sum, %g5, %sum         /*  IEU0                        */
        /* dst & 4 (cached in %g7): copy one word and add it to sum with an
         * end-around carry.  The load is issued before %g7 is tested; if
         * %g7 is zero the annulled brz goes to 4: and only sets %o4. */
274 1:      lduwa           [%src] %asi, %g2        /*  Load                        */
275         brz,a,pn        %g7, 4f                 /*  CTI+IEU1    Group           */
276          and            %dst, 0x38, %o4         /*  IEU0                        */
277         add             %dst, 4, %dst           /*  IEU0        Group           */
278         sub             %len, 4, %len           /*  IEU1                        */
279         addcc           %g2, %sum, %sum         /*  IEU1        Group           */
280         bcs,a,pn        %icc, 1f                /*  CTI                         */
281          add            %sum, 1, %sum           /*  IEU0                        */
282 1:      and             %dst, 0x38, %o4         /*  IEU0        Group           */
283         stw             %g2, [%dst - 4]         /*  Store                       */
284         add             %src, 4, %src           /*  IEU1                        */
285 4:
286 #ifdef __KERNEL__
287         VISEntry
288 #endif
        /* Step 2: %g7 = offset of src inside its doubleword; alignaddr
         * leaves src rounded down to 8 bytes.  If the offset is non-zero the
         * word at the rounded-down address is subtracted from sum (with a
         * borrow fix-up).  %f48 starts out as the zero FP accumulator and
         * %g1 = 0x40 is prepared for step 3.
         * NOTE(review): presumably the subtraction cancels the bytes in
         * front of the real start that the doubleword-based code below adds
         * in - confirm. */
289         mov             %src, %g7               /*  IEU1        Group           */
290         fzero           %f48                    /*  FPA                         */
291         alignaddr       %src, %g0, %src         /*  Single      Group           */
292         subcc           %g7, %src, %g7          /*  IEU1        Group           */
293         be,pt           %xcc, 1f                /*  CTI                         */
294          mov            0x40, %g1               /*  IEU0                        */
295         lduwa           [%src] %asi, %g2        /*  Load        Group           */
296         subcc           %sum, %g2, %sum         /*  IEU1        Group+load stall*/
297         bcs,a,pn        %icc, 1f                /*  CTI                         */
298          sub            %sum, 1, %sum           /*  IEU0                        */
        /* Step 3: sum is truncated to 32 bits.  %o4 = dst & 0x38; if it is
         * zero dst is already 64-byte aligned and we skip to 3:.  Otherwise
         * %g1 = 0x40 - %o4 is the number of bytes up to the next 64-byte
         * boundary, copied below in pieces of 8, 16 and 32 bytes.  From here
         * on %g5, %o4, %o5 and %g3 carry fcmpgt32 masks that are folded into
         * sum with the "inc; srl 1; add" sequence. */
299 1:      srl             %sum, 0, %sum           /*  IEU0        Group           */
300         clr             %g5                     /*  IEU1                        */
301         brz,pn          %o4, 3f                 /*  CTI+IEU1    Group           */
302          sub            %g1, %o4, %g1           /*  IEU0                        */
303         ldda            [%src] %asi, %f0        /*  Load                        */
304         clr             %o4                     /*  IEU0        Group           */
305         andcc           %dst, 8, %g0            /*  IEU1                        */
306         be,pn           %icc, 1f                /*  CTI                         */
307          ldda           [%src + 8] %asi, %f2    /*  Load        Group           */
        /* 8-byte piece (dst & 8). */
308         add             %src, 8, %src           /*  IEU0                        */
309         sub             %len, 8, %len           /*  IEU1                        */
310         fpadd32         %f0, %f48, %f50         /*  FPA                         */
311         addcc           %dst, 8, %dst           /*  IEU1        Group           */
312         faligndata      %f0, %f2, %f16          /*  FPA                         */
313         fcmpgt32        %f48, %f50, %o4         /*  FPM         Group           */
314         fmovd           %f2, %f0                /*  FPA         Group           */
315         ldda            [%src + 8] %asi, %f2    /*  Load                        */
316         std             %f16, [%dst - 8]        /*  Store                       */
317         fmovd           %f50, %f48              /*  FPA                         */
        /* 16-byte piece (%g1 & 0x10).  The delay slot reduces %g1 to its
         * 0x20 bit for the test that follows. */
318 1:      andcc           %g1, 0x10, %g0          /*  IEU1        Group           */
319         be,pn           %icc, 1f                /*  CTI                         */
320          and            %g1, 0x20, %g1          /*  IEU0                        */
321         fpadd32         %f0, %f48, %f50         /*  FPA                         */
322         ldda            [%src + 16] %asi, %f4   /*  Load        Group           */
323         add             %src, 16, %src          /*  IEU0                        */
324         add             %dst, 16, %dst          /*  IEU1                        */
325         faligndata      %f0, %f2, %f16          /*  FPA                         */
326         fcmpgt32        %f48, %f50, %g5         /*  FPM         Group           */
327         sub             %len, 16, %len          /*  IEU0                        */
328         inc             %o4                     /*  IEU1                        */
329         std             %f16, [%dst - 16]       /*  Store       Group           */
330         fpadd32         %f2, %f50, %f48         /*  FPA                         */
331         srl             %o4, 1, %o5             /*  IEU0                        */
332         faligndata      %f2, %f4, %f18          /*  FPA         Group           */
333         std             %f18, [%dst - 8]        /*  Store                       */
334         fcmpgt32        %f50, %f48, %o4         /*  FPM         Group           */
335         add             %o5, %sum, %sum         /*  IEU0                        */
336         ldda            [%src + 8] %asi, %f2    /*  Load                        */
337         fmovd           %f4, %f0                /*  FPA                         */
        /* 32-byte piece (%g1 & 0x20).  If it is not needed, the annulled
         * brz reads %asi in its delay slot and goes directly to 4:. */
338 1:      brz,a,pn        %g1, 4f                 /*  CTI+IEU1    Group           */
339          rd             %asi, %g2               /*  LSU         Group + 4 bubbles*/
340         inc             %g5                     /*  IEU0                        */
341         fpadd32         %f0, %f48, %f50         /*  FPA                         */
342         ldda            [%src + 16] %asi, %f4   /*  Load        Group           */
343         srl             %g5, 1, %g5             /*  IEU0                        */
344         add             %dst, 32, %dst          /*  IEU1                        */
345         faligndata      %f0, %f2, %f16          /*  FPA                         */
346         fcmpgt32        %f48, %f50, %o5         /*  FPM         Group           */
347         inc             %o4                     /*  IEU0                        */
348         ldda            [%src + 24] %asi, %f6   /*  Load                        */
349         srl             %o4, 1, %o4             /*  IEU0        Group           */
350         add             %g5, %sum, %sum         /*  IEU1                        */
351         ldda            [%src + 32] %asi, %f8   /*  Load                        */
352         fpadd32         %f2, %f50, %f48         /*  FPA                         */
353         faligndata      %f2, %f4, %f18          /*  FPA         Group           */
354         sub             %len, 32, %len          /*  IEU0                        */
355         std             %f16, [%dst - 32]       /*  Store                       */
356         fcmpgt32        %f50, %f48, %g3         /*  FPM         Group           */
357         inc             %o5                     /*  IEU0                        */
358         add             %o4, %sum, %sum         /*  IEU1                        */
359         fpadd32         %f4, %f48, %f50         /*  FPA                         */
360         faligndata      %f4, %f6, %f20          /*  FPA         Group           */
361         srl             %o5, 1, %o5             /*  IEU0                        */
362         fcmpgt32        %f48, %f50, %g5         /*  FPM         Group           */
363         add             %o5, %sum, %sum         /*  IEU0                        */
364         std             %f18, [%dst - 24]       /*  Store                       */
365         fpadd32         %f6, %f50, %f48         /*  FPA                         */
366         inc             %g3                     /*  IEU0        Group           */
367         std             %f20, [%dst - 16]       /*  Store                       */
368         add             %src, 32, %src          /*  IEU1                        */
369         faligndata      %f6, %f8, %f22          /*  FPA                         */
370         fcmpgt32        %f50, %f48, %o4         /*  FPM         Group           */
371         srl             %g3, 1, %g3             /*  IEU0                        */
372         std             %f22, [%dst - 8]        /*  Store                       */      
373         add             %g3, %sum, %sum         /*  IEU0        Group           */
        /* Step 4: dispatch.  %g2 = current %asi (in the kernel build it is
         * OR-ed with ASI_BLK_OR here; the wr at vis0s applies ASI_BLK_XOR).
         * %g3 = (src & 0x38) << 8 is the offset of the code variant to run,
         * relative to vis0s, in steps of 2048 bytes.  The masks still
         * pending in %g5 and %o4 are folded into sum, len is reduced by
         * 0xc0, and %f32 is cleared in the jmpl delay slot.
         * NOTE(review): 0xc0 is presumably the amount of data the loop keeps
         * in flight - confirm against the loop exit code. */
374 3:      rd              %asi, %g2               /*  LSU         Group + 4 bubbles*/
375 #ifdef __KERNEL__
376 4:      sethi           %hi(vis0s), %g7         /*  IEU0        Group           */
377         or              %g2, ASI_BLK_OR, %g2    /*  IEU1                        */
378 #else
379 4:      rd              %pc, %g7                /*  LSU         Group + 4 bubbles*/
380 #endif
381         inc             %g5                     /*  IEU0        Group           */
382         and             %src, 0x38, %g3         /*  IEU1                        */      
383         membar          #StoreLoad              /*  LSU         Group           */
384         srl             %g5, 1, %g5             /*  IEU0                        */
385         inc             %o4                     /*  IEU1                        */
386         sll             %g3, 8, %g3             /*  IEU0        Group           */
387         sub             %len, 0xc0, %len        /*  IEU1                        */
388         addcc           %g5, %sum, %sum         /*  IEU1        Group           */
389         srl             %o4, 1, %o4             /*  IEU0                        */
390         add             %g7, %g3, %g7           /*  IEU0        Group           */
391         add             %o4, %sum, %sum         /*  IEU1                        */
392 #ifdef __KERNEL__
393         jmpl            %g7 + %lo(vis0s), %g0   /*  CTI+IEU1    Group           */
394 #else
395         jmpl            %g7 + (vis0s - 4b), %g0 /*  CTI+IEU1    Group           */
396 #endif
397          fzero          %f32                    /*  FPA                         */
398
/*
 * vis0s: start-up code of the variant reached when (src & 0x38) == 0, i.e.
 * offset 0 of the computed jump in csum_partial_copy_vis.  The 2048-byte
 * alignment matches the (src & 0x38) << 8 stride of that jump.
 *
 * Expects: %g2 = ASI value prepared by the dispatch, %f32 = 0 (cleared in
 *          the jmpl delay slot), %f48 = running FP sum from the prologue.
 * Does:
 *   - %asi = %g2 ^ ASI_BLK_XOR, then loads the first two source blocks
 *     through %asi into %f0.. and %f16.. (src is advanced by 128 first).
 *   - Moves the running sum to %f62 and adds it into %f0; the compare
 *     "fcmpgt32 %f62, %f0" goes to x3.
 *   - faligndata produces the first output doublewords in %f48..%f60;
 *     %f62 then receives a copy of %f14 as input for the next faligndata.
 *   - Compares the zero register %f32 against %f2..%f14 (masks in x1, x2,
 *     x4..x8) and starts the "inc; srl 1; add to %sum" pipeline.
 * Leaves x4 as a finished count and x5..x8 as raw masks, which is the entry
 * state DO_THE_TRICK expects, and falls through into the vis0 loop.
 */
399         .align          2048
400 vis0s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
401         add             %src, 128, %src         /*  IEU0        Group           */
402         ldda            [%src-128] %asi, %f0    /*  Load        Group           */
403         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
404         fmovd           %f48, %f62              /*  FPA         Group   f0 available*/
405         faligndata      %f0, %f2, %f48          /*  FPA         Group   f2 available*/
406         fcmpgt32        %f32, %f2, %x1          /*  FPM         Group   f4 available*/
407         fpadd32         %f0, %f62, %f0          /*  FPA                         */
408         fcmpgt32        %f32, %f4, %x2          /*  FPM         Group   f6 available*/
409         faligndata      %f2, %f4, %f50          /*  FPA                         */
410         fcmpgt32        %f62, %f0, %x3          /*  FPM         Group   f8 available*/
411         faligndata      %f4, %f6, %f52          /*  FPA                         */
412         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group   f10 available*/
413         inc             %x1                     /*  IEU0                        */
414         faligndata      %f6, %f8, %f54          /*  FPA                         */
415         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group   f12 available*/
416         srl             %x1, 1, %x1             /*  IEU0                        */
417         inc             %x2                     /*  IEU1                        */
418         faligndata      %f8, %f10, %f56         /*  FPA                         */
419         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group   f14 available*/
420         srl             %x2, 1, %x2             /*  IEU0                        */
421         add             %sum, %x1, %sum         /*  IEU1                        */
422         faligndata      %f10, %f12, %f58        /*  FPA                         */
423         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
424         inc             %x3                     /*  IEU0                        */
425         add             %sum, %x2, %sum         /*  IEU1                        */
426         faligndata      %f12, %f14, %f60        /*  FPA                         */
427         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
428         srl             %x3, 1, %x3             /*  IEU0                        */
429         inc             %x4                     /*  IEU1                        */
430         fmovd           %f14, %f62              /*  FPA                         */
431         srl             %x4, 1, %x4             /*  IEU0        Group           */
432         add             %sum, %x3, %sum         /*  IEU1                        */
433 vis0:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
434                         ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
435                         ,LDBLK(f32),    STBLK,,,,,,,,
436                         ,bcs,pn %icc, vis0e1)
437         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
438                         ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
439                         ,LDBLK(f0),     STBLK,,,,,,,,
440                         ,bcs,pn %icc, vis0e2)
441         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
442                         ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
443                         ,LDBLK(f16),    STBLK,,,,,,,,
444                         ,bcc,pt %icc, vis0)
445 vis0e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
446                         ,f48,f50,f52,f54,f56,f58,f60,f62,f32,
447                         ,SYNC,          STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
448                         ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e2)
449 vis0e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
450                         ,f48,f50,f52,f54,f56,f58,f60,f62,f0,
451                         ,SYNC,          STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
452                         ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e3)
453 vis0e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
454                         ,f48,f50,f52,f54,f56,f58,f60,f62,f16,
455                         ,SYNC,          STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
456                         ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1)
457         .align          2048
    /*
     * vis1s -- variant 1 of the block checksum+copy code (next 2048-byte
     * aligned slot after vis0s).
     *
     * Differences from variant 0 that are visible here:
     *   - %src is advanced by 128 - 8 instead of 128;
     *   - %f0 is copied away and then replaced by the incoming %f48, so the
     *     first loaded doubleword takes no part in the masks below, which
     *     start at %f2 / %x2;
     *   - six faligndata results go to %f48-%f58 and %f14 is carried over in
     *     %f60;
     *   - the output register list handed to DO_THE_TRICK is rotated by one
     *     (it starts at %f62) and STBLK is the second store argument.
     * NOTE(review): "fmovd %f0, %f58" is overwritten by a later faligndata
     * with no visible reader in between; presumably it only forces a wait
     * for the load into %f0 before %f0 is overwritten -- confirm.
     *
     * %f32 is zero on entry (fzero in the delay slot of the dispatching
     * jmpl), so the fcmpgt32 masks flag negative 32-bit halves, and
     * "inc; srl 1" converts a 2-bit mask to its bit count.  Counts from %x2
     * and %x3 are added to %sum here; %x4 is reduced but not added and
     * %x5-%x8 stay raw, presumably for DO_THE_TRICK (macro not visible).
     */
458 vis1s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
459         add             %src, 128 - 8, %src     /*  IEU0        Group           */
460         ldda            [%src-128] %asi, %f0    /*  Load        Group           */
461         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
462         fmovd           %f0, %f58               /*  FPA         Group           */
463         fmovd           %f48, %f0               /*  FPA         Group           */
464         fcmpgt32        %f32, %f2, %x2          /*  FPM         Group           */
465         faligndata      %f2, %f4, %f48          /*  FPA                         */
466         fcmpgt32        %f32, %f4, %x3          /*  FPM         Group           */
467         faligndata      %f4, %f6, %f50          /*  FPA                         */
468         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group           */
469         faligndata      %f6, %f8, %f52          /*  FPA                         */
470         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
471         inc             %x2                     /*  IEU1                        */
472         faligndata      %f8, %f10, %f54         /*  FPA                         */
473         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
474         srl             %x2, 1, %x2             /*  IEU0                        */
475         faligndata      %f10, %f12, %f56        /*  FPA                         */
476         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
477         inc             %x3                     /*  IEU0                        */
478         add             %sum, %x2, %sum         /*  IEU1                        */
479         faligndata      %f12, %f14, %f58        /*  FPA                         */
480         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
481         srl             %x3, 1, %x3             /*  IEU0                        */
482         inc             %x4                     /*  IEU1                        */
483         fmovd           %f14, %f60              /*  FPA                         */
484         srl             %x4, 1, %x4             /*  IEU0        Group           */
485         add             %sum, %x3, %sum         /*  IEU1                        */
    /*
     * Main loop: three expansions rotating the input banks; exits through
     * bcs to vis1e1 / vis1e2, loops through bcc to vis1, falls into vis1e3.
     */
486 vis1:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
487                         ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
488                         ,LDBLK(f32),    ,STBLK,,,,,,,
489                         ,bcs,pn %icc, vis1e1)
490         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
491                         ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
492                         ,LDBLK(f0),     ,STBLK,,,,,,,
493                         ,bcs,pn %icc, vis1e2)
494         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
495                         ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
496                         ,LDBLK(f16),    ,STBLK,,,,,,,
497                         ,bcc,pt %icc, vis1)
    /*
     * Loop exits: SYNC in place of LDBLK, six ST() stores after STBLK, then
     * %dst += 48, %len += 192 - 7*8 and a branch to e2 / e3 / e1.
     */
498 vis1e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
499                         ,f62,f48,f50,f52,f54,f56,f58,f60,f32,
500                         ,SYNC,          ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
501                         ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e2)
502 vis1e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
503                         ,f62,f48,f50,f52,f54,f56,f58,f60,f0,
504                         ,SYNC,          ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
505                         ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e3)
506 vis1e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
507                         ,f62,f48,f50,f52,f54,f56,f58,f60,f16,
508                         ,SYNC,          ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
509                         ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1)
510         .align          2048
    /*
     * vis2s -- variant 2 of the block checksum+copy code.
     *
     * Visible set-up for this variant:
     *   - %src is advanced by 128 - 16 and %dst is moved back by 64;
     *   - %f0 is copied away and replaced by the incoming %f48, and %f2 is
     *     cleared (fpsub32 of a register with itself), so the masks start
     *     at %f4 / %x3;
     *   - five faligndata results go to %f48-%f56 and %f14 is carried over
     *     in %f58;
     *   - the output register list starts at %f60 and STBLK is the third
     *     store argument.
     * NOTE(review): "fmovd %f0, %f56" has no visible reader before %f56 is
     * rewritten; presumably it only waits for the load into %f0 -- confirm.
     *
     * %f32 is zero on entry (fzero in the delay slot of the dispatching
     * jmpl); "inc; srl 1" converts a 2-bit fcmpgt32 mask to its bit count.
     * Only the %x3 count is added to %sum here; %x4 is reduced but not
     * added and %x5-%x8 stay raw, presumably for DO_THE_TRICK.
     */
511 vis2s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
512         add             %src, 128 - 16, %src    /*  IEU0        Group           */
513         ldda            [%src-128] %asi, %f0    /*  Load        Group           */
514         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
515         fmovd           %f0, %f56               /*  FPA         Group           */
516         fmovd           %f48, %f0               /*  FPA         Group           */      
517         sub             %dst, 64, %dst          /*  IEU0                        */
518         fpsub32         %f2, %f2, %f2           /*  FPA         Group           */
519         fcmpgt32        %f32, %f4, %x3          /*  FPM         Group           */
520         faligndata      %f4, %f6, %f48          /*  FPA                         */
521         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group           */
522         faligndata      %f6, %f8, %f50          /*  FPA                         */
523         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
524         faligndata      %f8, %f10, %f52         /*  FPA                         */
525         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
526         faligndata      %f10, %f12, %f54        /*  FPA                         */
527         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
528         inc             %x3                     /*  IEU0                        */
529         faligndata      %f12, %f14, %f56        /*  FPA                         */
530         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
531         srl             %x3, 1, %x3             /*  IEU0                        */
532         inc             %x4                     /*  IEU1                        */
533         fmovd           %f14, %f58              /*  FPA                         */
534         srl             %x4, 1, %x4             /*  IEU0        Group           */
535         add             %sum, %x3, %sum         /*  IEU1                        */
    /*
     * Main loop: three expansions rotating the input banks; exits through
     * bcs to vis2e1 / vis2e2, loops through bcc to vis2, falls into vis2e3.
     */
536 vis2:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
537                         ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
538                         ,LDBLK(f32),    ,,STBLK,,,,,,
539                         ,bcs,pn %icc, vis2e1)
540         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
541                         ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
542                         ,LDBLK(f0),     ,,STBLK,,,,,,
543                         ,bcs,pn %icc, vis2e2)
544         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
545                         ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
546                         ,LDBLK(f16),    ,,STBLK,,,,,,
547                         ,bcc,pt %icc, vis2)
    /*
     * Loop exits: SYNC in place of LDBLK, five ST() stores after STBLK,
     * then %dst += 104, %len += 192 - 6*8 and a branch to e2 / e3 / e1.
     */
548 vis2e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
549                         ,f60,f62,f48,f50,f52,f54,f56,f58,f32,
550                         ,SYNC,          ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
551                         ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e2)
552 vis2e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
553                         ,f60,f62,f48,f50,f52,f54,f56,f58,f0,
554                         ,SYNC,          ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
555                         ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e3)
556 vis2e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
557                         ,f60,f62,f48,f50,f52,f54,f56,f58,f16,
558                         ,SYNC,          ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
559                         ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1)
560         .align          2048
    /*
     * vis3s -- variant 3 of the block checksum+copy code.
     *
     * Visible set-up for this variant:
     *   - %src is advanced by 128 - 24 and %dst is moved back by 64;
     *   - %f0 is copied away and replaced by the incoming %f48; %f2 and %f4
     *     are cleared with fpsub32, so the masks start at %f6 / %x4;
     *   - four faligndata results go to %f48-%f54 and %f14 is carried over
     *     in %f56;
     *   - the output register list starts at %f58 and STBLK is the fourth
     *     store argument.
     * NOTE(review): "fmovd %f0, %f54" has no visible reader before %f54 is
     * rewritten; presumably it only waits for the load into %f0 -- confirm.
     *
     * %f32 is zero on entry (fzero in the delay slot of the dispatching
     * jmpl).  Nothing is added to %sum in this prologue: %x4 is reduced to
     * a bit count by "inc; srl 1" and %x5-%x8 stay raw, presumably for
     * DO_THE_TRICK to consume.
     */
561 vis3s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
562         add             %src, 128 - 24, %src    /*  IEU0        Group           */
563         ldda            [%src-128] %asi, %f0    /*  Load        Group           */
564         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
565         fmovd           %f0, %f54               /*  FPA         Group           */
566         fmovd           %f48, %f0               /*  FPA         Group           */
567         sub             %dst, 64, %dst          /*  IEU0                        */
568         fpsub32         %f2, %f2, %f2           /*  FPA         Group           */
569         fpsub32         %f4, %f4, %f4           /*  FPA         Group           */
570         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group           */
571         faligndata      %f6, %f8, %f48          /*  FPA                         */
572         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
573         faligndata      %f8, %f10, %f50         /*  FPA                         */
574         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
575         faligndata      %f10, %f12, %f52        /*  FPA                         */
576         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
577         faligndata      %f12, %f14, %f54        /*  FPA                         */
578         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
579         fmovd           %f14, %f56              /*  FPA                         */
580         inc             %x4                     /*  IEU0                        */
581         srl             %x4, 1, %x4             /*  IEU0        Group           */
    /*
     * Main loop: three expansions rotating the input banks; exits through
     * bcs to vis3e1 / vis3e2, loops through bcc to vis3, falls into vis3e3.
     */
582 vis3:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
583                         ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
584                         ,LDBLK(f32),    ,,,STBLK,,,,,
585                         ,bcs,pn %icc, vis3e1)
586         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
587                         ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
588                         ,LDBLK(f0),     ,,,STBLK,,,,,
589                         ,bcs,pn %icc, vis3e2)
590         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
591                         ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
592                         ,LDBLK(f16),    ,,,STBLK,,,,,
593                         ,bcc,pt %icc, vis3)
    /*
     * Loop exits: SYNC in place of LDBLK, four ST() stores after STBLK,
     * then %dst += 96, %len += 192 - 5*8 and a branch to e2 / e3 / e1.
     */
594 vis3e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
595                         ,f58,f60,f62,f48,f50,f52,f54,f56,f32,
596                         ,SYNC,          ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
597                         ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e2)
598 vis3e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
599                         ,f58,f60,f62,f48,f50,f52,f54,f56,f0,
600                         ,SYNC,          ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
601                         ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e3)
602 vis3e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
603                         ,f58,f60,f62,f48,f50,f52,f54,f56,f16,
604                         ,SYNC,          ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
605                         ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1)
606         .align          2048
    /*
     * vis4s -- variant 4 of the block checksum+copy code.
     *
     * Visible set-up for this variant:
     *   - %src is advanced by 128 - 32 and %dst is moved back by 64;
     *   - %f0 is copied away and replaced by the incoming %f48; %f2, %f4
     *     and %f6 are cleared with fpsub32 and %x4 is cleared, so the masks
     *     start at %f8 / %x5;
     *   - three faligndata results go to %f48-%f52 and %f14 is carried over
     *     in %f54;
     *   - the output register list starts at %f56 and STBLK is the fifth
     *     store argument.
     * NOTE(review): "fmovd %f0, %f52" has no visible reader before %f52 is
     * rewritten; presumably it only waits for the load into %f0 -- confirm.
     *
     * %f32 is zero on entry (fzero in the delay slot of the dispatching
     * jmpl), so the fcmpgt32 masks flag negative 32-bit halves.  Nothing is
     * added to %sum here; %x5-%x8 are left as raw masks, presumably for
     * DO_THE_TRICK to consume.
     */
607 vis4s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
608         add             %src, 128 - 32, %src    /*  IEU0        Group           */
609         ldda            [%src-128] %asi, %f0    /*  Load        Group           */
610         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
611         fmovd           %f0, %f52               /*  FPA         Group           */
612         fmovd           %f48, %f0               /*  FPA         Group           */
613         sub             %dst, 64, %dst          /*  IEU0                        */
614         fpsub32         %f2, %f2, %f2           /*  FPA         Group           */
615         fpsub32         %f4, %f4, %f4           /*  FPA         Group           */
616         fpsub32         %f6, %f6, %f6           /*  FPA         Group           */
617         clr             %x4                     /*  IEU0                        */
618         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
619         faligndata      %f8, %f10, %f48         /*  FPA                         */
620         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
621         faligndata      %f10, %f12, %f50        /*  FPA                         */
622         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
623         faligndata      %f12, %f14, %f52        /*  FPA                         */
624         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
625         fmovd           %f14, %f54              /*  FPA                         */
    /*
     * Main loop: three expansions rotating the input banks; exits through
     * bcs to vis4e1 / vis4e2, loops through bcc to vis4, falls into vis4e3.
     */
626 vis4:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
627                         ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
628                         ,LDBLK(f32),    ,,,,STBLK,,,,
629                         ,bcs,pn %icc, vis4e1)
630         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
631                         ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
632                         ,LDBLK(f0),     ,,,,STBLK,,,,
633                         ,bcs,pn %icc, vis4e2)
634         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
635                         ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
636                         ,LDBLK(f16),    ,,,,STBLK,,,,
637                         ,bcc,pt %icc, vis4)
    /*
     * Loop exits: SYNC in place of LDBLK, three ST() stores after STBLK,
     * then %dst += 88, %len += 192 - 4*8 and a branch to e2 / e3 / e1.
     */
638 vis4e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
639                         ,f56,f58,f60,f62,f48,f50,f52,f54,f32,
640                         ,SYNC,          ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
641                         ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e2)
642 vis4e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
643                         ,f56,f58,f60,f62,f48,f50,f52,f54,f0,
644                         ,SYNC,          ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
645                         ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e3)
646 vis4e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
647                         ,f56,f58,f60,f62,f48,f50,f52,f54,f16,
648                         ,SYNC,          ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
649                         ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1)
650         .align          2048
    /*
     * vis5s -- variant 5 of the block checksum+copy code.
     *
     * Unlike variants 0-4, the leading data is fetched with three
     * individual loads into %f10, %f12 and %f14 while %asi still holds its
     * incoming value; only then is %asi switched (%g2 ^ ASI_BLK_XOR) and
     * the second chunk loaded at [%src-64] into %f16...
     *
     * Visible set-up for this variant:
     *   - %src is advanced by 128 - 40 and %dst is moved back by 64;
     *   - %f0 takes the incoming %f48; %f2-%f8 are set to zero by
     *     multiplying or adding %f32 with itself (%f32 is zero on entry,
     *     cleared in the delay slot of the dispatching jmpl), presumably
     *     split between FPM and FPA to pair up in issue groups;
     *   - %x4 and %x5 are cleared; masks for %f10, %f12, %f14 land in
     *     %x6-%x8 and are left raw, presumably for DO_THE_TRICK;
     *   - two faligndata results go to %f48 and %f50, %f14 is carried over
     *     in %f52;
     *   - the output register list starts at %f54 and STBLK is the sixth
     *     store argument.
     */
651 vis5s:  add             %src, 128 - 40, %src    /*  IEU0        Group           */
652         ldda            [%src-88] %asi, %f10    /*  Load        Group           */
653         ldda            [%src-80] %asi, %f12    /*  Load        Group           */
654         ldda            [%src-72] %asi, %f14    /*  Load        Group           */
655         wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
656         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
657         fmovd           %f48, %f0               /*  FPA         Group           */
658         fmuld           %f32, %f32, %f2         /*  FPM                         */
659         clr             %x4                     /*  IEU0                        */
660         faddd           %f32, %f32, %f4         /*  FPA         Group           */
661         fmuld           %f32, %f32, %f6         /*  FPM                         */
662         clr             %x5                     /*  IEU0                        */
663         faddd           %f32, %f32, %f8         /*  FPA         Group           */
664         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
665         sub             %dst, 64, %dst          /*  IEU0                        */
666         faligndata      %f10, %f12, %f48        /*  FPA                         */
667         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
668         faligndata      %f12, %f14, %f50        /*  FPA                         */
669         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
670         fmovd           %f14, %f52              /*  FPA                         */
    /*
     * Main loop: three expansions rotating the input banks; exits through
     * bcs to vis5e1 / vis5e2, loops through bcc to vis5, falls into vis5e3.
     */
671 vis5:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
672                         ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
673                         ,LDBLK(f32),    ,,,,,STBLK,,,
674                         ,bcs,pn %icc, vis5e1)
675         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
676                         ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
677                         ,LDBLK(f0),     ,,,,,STBLK,,,
678                         ,bcs,pn %icc, vis5e2)
679         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
680                         ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
681                         ,LDBLK(f16),    ,,,,,STBLK,,,
682                         ,bcc,pt %icc, vis5)
    /*
     * Loop exits: SYNC in place of LDBLK, two ST() stores after STBLK, then
     * %dst += 80, %len += 192 - 3*8 and a branch to e2 / e3 / e1.
     */
683 vis5e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
684                         ,f54,f56,f58,f60,f62,f48,f50,f52,f32,
685                         ,SYNC,          ,,,,,STBLK,ST(f48,64),ST(f50,72),
686                         ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e2)
687 vis5e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
688                         ,f54,f56,f58,f60,f62,f48,f50,f52,f0,
689                         ,SYNC,          ,,,,,STBLK,ST(f48,64),ST(f50,72),
690                         ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e3)
691 vis5e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
692                         ,f54,f56,f58,f60,f62,f48,f50,f52,f16,
693                         ,SYNC,          ,,,,,STBLK,ST(f48,64),ST(f50,72),
694                         ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1)
695         .align          2048
    /*
     * vis6s -- variant 6 of the block checksum+copy code.
     *
     * The leading data is fetched with two individual loads into %f12 and
     * %f14 while %asi still holds its incoming value; %asi is then switched
     * (%g2 ^ ASI_BLK_XOR) and the second chunk loaded at [%src-64] into
     * %f16...
     *
     * Visible set-up for this variant:
     *   - %src is advanced by 128 - 48 and %dst is moved back by 64;
     *   - %f0 takes the incoming %f48; %f2-%f10 are set to zero from %f32
     *     (zero on entry, cleared in the delay slot of the dispatching
     *     jmpl) using fmuld / faddd;
     *   - %x4, %x5 and %x6 are cleared; masks for %f12 and %f14 land in %x7
     *     and %x8 and are left raw, presumably for DO_THE_TRICK;
     *   - one faligndata result goes to %f48, %f14 is carried over in %f50;
     *   - the output register list starts at %f52 and STBLK is the seventh
     *     store argument.
     */
696 vis6s:  add             %src, 128 - 48, %src    /*  IEU0        Group           */
697         ldda            [%src-80] %asi, %f12    /*  Load        Group           */
698         ldda            [%src-72] %asi, %f14    /*  Load        Group           */
699         wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
700         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
701         fmovd           %f48, %f0               /*  FPA         Group           */
702         fmuld           %f32, %f32, %f2         /*  FPM                         */
703         clr             %x4                     /*  IEU0                        */
704         faddd           %f32, %f32, %f4         /*  FPA         Group           */
705         fmuld           %f32, %f32, %f6         /*  FPM                         */
706         clr             %x5                     /*  IEU0                        */
707         faddd           %f32, %f32, %f8         /*  FPA         Group           */
708         fmuld           %f32, %f32, %f10        /*  FPM                         */
709         clr             %x6                     /*  IEU0                        */
710         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
711         sub             %dst, 64, %dst          /*  IEU0                        */
712         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
713         faligndata      %f12, %f14, %f48        /*  FPA                         */
714         fmovd           %f14, %f50              /*  FPA         Group           */
    /*
     * Main loop: three expansions rotating the input banks; exits through
     * bcs to vis6e1 / vis6e2, loops through bcc to vis6, falls into vis6e3.
     */
715 vis6:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
716                         ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
717                         ,LDBLK(f32),    ,,,,,,STBLK,,
718                         ,bcs,pn %icc, vis6e1)
719         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
720                         ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
721                         ,LDBLK(f0),     ,,,,,,STBLK,,
722                         ,bcs,pn %icc, vis6e2)
723         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
724                         ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
725                         ,LDBLK(f16),    ,,,,,,STBLK,,
726                         ,bcc,pt %icc, vis6)
    /*
     * Loop exits: SYNC in place of LDBLK, one ST() store after STBLK, then
     * %dst += 72, %len += 192 - 2*8 and a branch to e2 / e3 / e1.
     */
727 vis6e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
728                         ,f52,f54,f56,f58,f60,f62,f48,f50,f32,
729                         ,SYNC,          ,,,,,,STBLK,ST(f48,64),
730                         ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e2)
731 vis6e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
732                         ,f52,f54,f56,f58,f60,f62,f48,f50,f0,
733                         ,SYNC,          ,,,,,,STBLK,ST(f48,64),
734                         ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e3)
735 vis6e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
736                         ,f52,f54,f56,f58,f60,f62,f48,f50,f16,
737                         ,SYNC,          ,,,,,,STBLK,ST(f48,64),
738                         ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1)
739         .align          2048
    /*
     * vis7s -- variant 7, the last of the block checksum+copy variants.
     *
     * The leading data is a single load into %f14 made while %asi still
     * holds its incoming value; %asi is then switched (%g2 ^ ASI_BLK_XOR)
     * and the second chunk loaded at [%src-64] into %f16...
     *
     * Visible set-up for this variant:
     *   - %src is advanced by 128 - 56 and %dst is moved back by 64;
     *   - %f0 takes the incoming %f48; %f2-%f12 are set to zero from %f32
     *     (zero on entry, cleared in the delay slot of the dispatching
     *     jmpl) using fmuld / faddd;
     *   - %x4-%x7 are cleared; the mask for %f14 lands in %x8 and is left
     *     raw, presumably for DO_THE_TRICK;
     *   - no faligndata is needed here: %f14 is simply carried over in %f48;
     *   - the output register list starts at %f50 and STBLK is the eighth
     *     store argument.
     */
740 vis7s:  add             %src, 128 - 56, %src    /*  IEU0        Group           */
741         ldda            [%src-72] %asi, %f14    /*  Load        Group           */
742         wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
743         ldda            [%src-64] %asi, %f16    /*  Load        Group           */
744         fmovd           %f48, %f0               /*  FPA         Group           */
745         fmuld           %f32, %f32, %f2         /*  FPM                         */
746         clr             %x4                     /*  IEU0                        */
747         faddd           %f32, %f32, %f4         /*  FPA         Group           */
748         fmuld           %f32, %f32, %f6         /*  FPM                         */
749         clr             %x5                     /*  IEU0                        */
750         faddd           %f32, %f32, %f8         /*  FPA         Group           */
751         fmuld           %f32, %f32, %f10        /*  FPM                         */
752         clr             %x6                     /*  IEU0                        */
753         faddd           %f32, %f32, %f12        /*  FPA         Group           */
754         clr             %x7                     /*  IEU0                        */
755         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
756         sub             %dst, 64, %dst          /*  IEU0                        */
757         fmovd           %f14, %f48              /*  FPA                         */
    /*
     * Main loop: three expansions rotating the input banks; exits through
     * bcs to vis7e1 / vis7e2, loops through bcc to vis7, falls into vis7e3.
     */
758 vis7:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
759                         ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
760                         ,LDBLK(f32),    ,,,,,,,STBLK,
761                         ,bcs,pn %icc, vis7e1)
762         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
763                         ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
764                         ,LDBLK(f0),     ,,,,,,,STBLK,
765                         ,bcs,pn %icc, vis7e2)
766         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
767                         ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
768                         ,LDBLK(f16),    ,,,,,,,STBLK,
769                         ,bcc,pt %icc, vis7)
    /*
     * Loop exits: SYNC in place of LDBLK and no extra ST() stores after
     * STBLK, then %dst += 64, %len += 192 - 1*8 and a branch to e2 / e3 / e1.
     */
770 vis7e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
771                         ,f50,f52,f54,f56,f58,f60,f62,f48,f32,
772                         ,SYNC,          ,,,,,,,STBLK,
773                         ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e2)
774 vis7e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
775                         ,f50,f52,f54,f56,f58,f60,f62,f48,f0,
776                         ,SYNC,          ,,,,,,,STBLK,
777                         ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e3)
778 vis7e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
779                         ,f50,f52,f54,f56,f58,f60,f62,f48,f16,
780                         ,SYNC,          ,,,,,,,STBLK,
781                         ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e1)
    /*
     * e1 / e2 / e3: targets of the final "ba" in the vis*e* exit stubs, one
     * per input-register rotation.  END_THE_TRICK1 is defined before this
     * chunk; presumably each expansion finishes by transferring to ett --
     * TODO confirm against the macro.
     */
782 e1:     END_THE_TRICK1( f0,f2,f4,f6,f8,f10,f12,f14,f16,f6)
783 e2:     END_THE_TRICK1( f16,f18,f20,f22,f24,f26,f28,f30,f32,f6)
784 e3:     END_THE_TRICK1( f32,f34,f36,f38,f40,f42,f44,f46,f0,f6)
    /*
     * ett: tail of the copy.  %x4 = current %asi, %x3 = %gsr.
     */
785 ett:    rd              %asi, %x4               /*  LSU         Group+4bubbles  */
786         rd              %gsr, %x3               /*  LSU         Group+4bubbles  */
787 #ifdef __KERNEL__
    /*
     * New %asi = (%x4 ^ ASI_BLK_XOR1) ^ (%x4 >> 3).  Given the definition
     * of ASI_BLK_XOR1 at the top of the file, this evaluates to ASI_P when
     * %x4 is ASI_BLK_P.
     */
788         srl             %x4, 3, %x5             /*  IEU0        Group           */
789         xor             %x4, ASI_BLK_XOR1, %x4  /*  IEU1                        */
790         wr              %x4, %x5, %asi          /*  LSU         Group+4bubbles  */
791 #else
    /* New %asi = %x4 ^ ASI_BLK_XOR, with ASI_BLK_XOR = ASI_BLK_P ^ ASI_P. */
792         wr              %x4, ASI_BLK_XOR, %asi  /*  LSU         Group+4bubbles  */
793 #endif
    /*
     * %x3 = low three bits of %gsr (the faligndata alignment offset in the
     * VIS definition of %gsr); the andcc also sets %icc.  %dst is advanced
     * by 8.  fzero %f10 sits in the non-annulled delay slot of the bne, so
     * it runs on both paths; %f10 is the running sum in the loop below.
     * With a zero offset and %len == 0, the annulled delay slot stores %f6
     * at %dst - 8 and control goes to "2:".
     */
794         andcc           %x3, 7, %x3             /*  IEU1        Group           */
795         add             %dst, 8, %dst           /*  IEU0                        */
796         bne,pn          %icc, 1f                /*  CTI                         */
797          fzero          %f10                    /*  FPA                         */
798         brz,a,pn        %len, 2f                /*  CTI+IEU1    Group           */
799          std            %f6, [%dst - 8]         /*  Store                       */
    /*
     * Skip the doubleword loop when fewer than 8 bytes remain (unsigned
     * compare).  The delay slot moves %src back by 64 on both paths.
     */
800 1:      cmp             %len, 8                 /*  IEU1                        */
801         blu,pn          %icc, 3f                /*  CTI                         */
802          sub            %src, 64, %src          /*  IEU0        Group           */
    /*
     * Doubleword tail loop, repeated while %len >= 8:
     *   %f2  = next doubleword from [%src] through %asi;
     *   %f12 = %f10 + %f2 per 32-bit half (fpadd32), later copied to %f10;
     *   %f14 = faligndata(%f6, %f2), stored at %dst - 16 after %dst has
     *          been advanced by 8; %f2 then becomes the new %f6;
     *   %x5  = mask of halves where the old %f10 is greater than the new
     *          sum (presumably wrap-around detection -- confirm), reduced
     *          by "inc; srl 1" to its bit count and added to %sum in the
     *          branch delay slot.
     * The two fzero instructions are annotated as FPU nops; note that they
     * do clear %f16 and %f18.
     */
803 1:      ldda            [%src] %asi, %f2        /*  Load        Group           */
804         fpadd32         %f10, %f2, %f12         /*  FPA         Group+load stall*/
805         add             %src, 8, %src           /*  IEU0                        */
806         add             %dst, 8, %dst           /*  IEU1                        */
807         faligndata      %f6, %f2, %f14          /*  FPA         Group           */
808         fcmpgt32        %f10, %f12, %x5         /*  FPM         Group           */
809         std             %f14, [%dst - 16]       /*  Store                       */
810         fmovd           %f2, %f6                /*  FPA                         */
811         fmovd           %f12, %f10              /*  FPA         Group           */
812         sub             %len, 8, %len           /*  IEU1                        */
813         fzero           %f16                    /*  FPA         Group - FPU nop */
814         fzero           %f18                    /*  FPA         Group - FPU nop */
815         inc             %x5                     /*  IEU0                        */
816         srl             %x5, 1, %x5             /*  IEU0        Group (regdep)  */
817         cmp             %len, 8                 /*  IEU1                        */
818         bgeu,pt         %icc, 1b                /*  CTI                         */
819          add            %x5, %sum, %sum         /*  IEU0        Group           */
    /*
     * 3: fewer than 8 bytes left.  If %x3 (the %gsr offset) is zero, the
     * annulled delay slot stores all of %f6 at %dst - 8 and control goes to
     * "2:".  Otherwise only %f7, the low-order word of %f6, is stored
     * there, %dst is moved back by 4 and %len is increased by 4.
     */
820 3:      brz,a,pt        %x3, 2f                 /*  CTI+IEU1                    */
821          std            %f6, [%dst - 8]         /*  Store       Group           */
822         st              %f7, [%dst - 8]         /*  Store       Group           */
823         sub             %dst, 4, %dst           /*  IEU0                        */
824         add             %len, 4, %len           /*  IEU1                        */
    /*
     * 2: wrap-up.  END_THE_TRICK2 is defined before this chunk (presumably
     * it folds the vector partial sums into %sum -- TODO confirm).  In the
     * kernel build %sp is lowered by 8 before the macro and restored after
     * VISExit (presumably scratch space used by the macro -- confirm).
     * membar #Sync is issued before VISExit.
     */
825 2:
826 #ifdef __KERNEL__
827         sub             %sp, 8, %sp             /*  IEU0        Group           */
828 #endif
829         END_THE_TRICK2( f48,f50,f52,f54,f56,f58,f60,f10,f12,f62)
830         membar          #Sync                   /*  LSU         Group           */
831 #ifdef __KERNEL__
832         VISExit
833         add             %sp, 8, %sp             /*  IEU0        Group           */
834 #endif
/*
 * Final fold and return.
 *
 * 23: if bytes are still left (%len != 0) branch to 26f, which copies and
 *     sums the tail and then comes back to 25.  The branch is not annulled,
 *     so the sllx in its delay slot (label 24) executes on both paths.
 * 24/25: also local labels; 25 is re-entered from the tail code at 26.
 *     NOTE(review): 24 is presumably a target for code outside this view.
 *
 * Fold: %src = %sum + (%sum << 32).  The low half of the shifted operand is
 * zero, so the upper 32 bits of the result are (high32 + low32) of %sum and
 * the 64-bit carry (%xcc) is the carry out of that 32-bit addition.
 * The result is returned in %src (%o0).
 */
835 23:     brnz,pn         %len, 26f               /*  CTI+IEU1    Group           */
836 24:      sllx           %sum, 32, %g1           /*  IEU0                        */
837 25:     addcc           %sum, %g1, %src         /*  IEU1        Group           */
        /* Bring the folded 32-bit value down; srlx leaves the condition codes alone. */
838         srlx            %src, 32, %src          /*  IEU0        Group (regdep)  */
        /* End-around carry: the annulled delay slot adds 1 only when carry was set. */
839         bcs,a,pn        %xcc, 1f                /*  CTI                         */
840          add            %src, 1, %src           /*  IEU1                        */
841 #ifndef __KERNEL__
        /* Non-kernel build: zero-extend the 32-bit result in the return delay slot. */
842 1:      retl                                    /*  CTI         Group brk forced*/
843          srl            %src, 0, %src           /*  IEU0                        */
844 #else
        /*
         * Kernel build: the delay slot reloads %g4 from [%g6 + TI_TASK].
         * %g4 is aliased as scratch register x4 at the top of the file.
         * NOTE(review): presumably %g6 is the thread_info pointer and %g4 must
         * hold the current task on return -- confirm against asm/thread_info.h.
         */
845 1:      retl                                    /*  CTI         Group brk forced*/
846          ldx            [%g6 + TI_TASK], %g4    /*  Load                        */
847 #endif
/*
 * 26: copy and checksum the remaining tail, reached from 23 when %len != 0.
 *
 * Bits 8, 4, 2 and 1 of %len are tested one after another.  Source data is
 * read through %asi (lduwa/lduha/lduba); destination stores are plain
 * stw/sth/stb.  Every addition into %sum is 64 bits wide and is followed by
 * an end-around carry (bcs,a on %xcc with "add %sum, 1" in the annulled slot).
 *
 * The 4-, 2- and 1-byte pieces are packed into one 64-bit value before being
 * added:  word << 32 | halfword << 16 | byte << 8.  A piece whose %len bit is
 * clear contributes 0, via the "clr" in the annulled delay slot of its be,a.
 *
 * Exit: branches back to 25 with %g1 = %sum << 32 recomputed in the delay slot.
 */
848 26:     andcc           %len, 8, %g0            /*  IEU1        Group           */
        /*
         * This branch is not annulled, so the word load in its delay slot runs
         * even when bit 8 is clear; %o4 is simply unused in that case.
         * NOTE(review): this relies on [%src] being safe to read as a full word
         * whenever %len != 0 -- confirm the alignment guarantees of the callers.
         */
849         be,pn           %icc, 1f                /*  CTI                         */
850          lduwa          [%src] %asi, %o4        /*  Load                        */
851         lduwa           [%src+4] %asi, %g2      /*  Load        Group           */
852         add             %src, 8, %src           /*  IEU0                        */
853         add             %dst, 8, %dst           /*  IEU1                        */
        /* %g5 = (first word << 32) | second word; both words are also stored to dst. */
854         sllx            %o4, 32, %g5            /*  IEU0        Group           */
855         stw             %o4, [%dst - 8]         /*  Store                       */
856         or              %g5, %g2, %g5           /*  IEU0        Group           */
857         stw             %g2, [%dst - 4]         /*  Store                       */
858         addcc           %g5, %sum, %sum         /*  IEU1        Group           */
859         bcs,a,pn        %xcc, 1f                /*  CTI                         */
860          add            %sum, 1, %sum           /*  IEU0                        */
        /* 4-byte piece: %g2 = word << 32, or 0 when bit 4 of %len is clear. */
861 1:      andcc           %len, 4, %g0            /*  IEU1        Group           */
862         be,a,pn         %icc, 1f                /*  CTI                         */
863          clr            %g2                     /*  IEU0                        */
864         lduwa           [%src] %asi, %g7        /*  Load                        */
865         add             %src, 4, %src           /*  IEU0        Group           */
866         add             %dst, 4, %dst           /*  IEU1                        */
867         sllx            %g7, 32, %g2            /*  IEU0        Group           */
868         stw             %g7, [%dst - 4]         /*  Store                       */
        /* 2-byte piece: %g3 = halfword << 16, or 0 when bit 2 of %len is clear. */
869 1:      andcc           %len, 2, %g0            /*  IEU1                        */
870         be,a,pn         %icc, 1f                /*  CTI                         */
871          clr            %g3                     /*  IEU0        Group           */
872         lduha           [%src] %asi, %g7        /*  Load                        */
873         add             %src, 2, %src           /*  IEU1                        */
874         add             %dst, 2, %dst           /*  IEU0        Group           */
875         sll             %g7, 16, %g3            /*  IEU0        Group           */
876         sth             %g7, [%dst - 2]         /*  Store                       */
        /*
         * 1-byte piece: %o5 = byte << 8, or 0 when bit 0 of %len is clear.
         * This is the last piece, so %src and %dst are not advanced.
         */
877 1:      andcc           %len, 1, %g0            /*  IEU1                        */
878         be,a,pn         %icc, 1f                /*  CTI                         */
879          clr            %o5                     /*  IEU0        Group           */
880         lduba           [%src] %asi, %g7        /*  Load                        */
881         sll             %g7, 8, %o5             /*  IEU0        Group           */
882         stb             %g7, [%dst]             /*  Store                       */
        /* Merge the three pieces into %g3 and add them to %sum with end-around carry. */
883 1:      or              %g2, %g3, %g3           /*  IEU1                        */
884         or              %o5, %g3, %g3           /*  IEU0        Group (regdep)  */
885         addcc           %g3, %sum, %sum         /*  IEU1        Group (regdep)  */
886         bcs,a,pn        %xcc, 1f                /*  CTI                         */
887          add            %sum, 1, %sum           /*  IEU0                        */
        /* Back to the final fold at 25; the delay slot refreshes %g1 from the updated %sum. */
888 1:      ba,pt           %xcc, 25b               /*  CTI         Group           */
889          sllx           %sum, 32, %g1           /*  IEU0                        */
890
/*
 * Kernel build only: define the "end" label after the last instruction above
 * and emit one four-word record into the __ex_table section:
 *     csum_partial_copy_vis, 0, end, cpc_handler
 * NOTE(review): presumably this is a range-style exception-table entry that
 * sends faults taken between csum_partial_copy_vis and end to cpc_handler
 * (defined outside this view) -- confirm against the sparc64 fault-fixup code.
 */
891 #ifdef __KERNEL__
892 end:
893
894         .section        __ex_table
895         .align          4
896         .word           csum_partial_copy_vis, 0, end, cpc_handler
897 #endif