ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-2.6.6.tar.bz2
[linux-2.6.git] / arch / sparc64 / lib / VIScsumcopyusr.S
1 /* $Id: VIScsumcopyusr.S,v 1.2 2000/02/20 23:21:40 davem Exp $
2  * VIScsumcopyusr.S: High bandwidth IP checksumming with simultaneous
3  *                   copying utilizing the UltraSparc Visual Instruction Set.
4  *
5  * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
6  * Copyright (C) 2000 David S. Miller (davem@redhat.com)
7  *
8  * Based on older sparc32/sparc64 checksum.S, which is:
9  *
10  *      Copyright(C) 1995 Linus Torvalds
11  *      Copyright(C) 1995 Miguel de Icaza
12  *      Copyright(C) 1996,1997 David S. Miller
13  *    derived from:
14  *        Linux/Alpha checksum c-code
15  *        Linux/ix86 inline checksum assembly
16  *        RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
17  *        David Mosberger-Tang for optimized reference c-code
18  *        BSD4.4 portable checksum routine
19  */
20
/* STACKOFF: offset from %sp of the 8-byte scratch slot that END_THE_TRICK2
 * uses (std followed by ldx) to move a floating point register into an
 * integer register.
 * NOTE(review): 0x7ff is presumably the V9 stack bias - confirm.  The
 * expansion is not parenthesised, so STACKOFF must only be used in additive
 * contexts such as [%sp + STACKOFF].
 */
21 #ifdef __sparc_v9__
22 #define STACKOFF        0x7ff+128
23 #else
24 #define STACKOFF        64
25 #endif
26
/* ASI helper constants.
 *
 * Kernel build: ASI_P and ASI_BLK_P are not defined here (presumably they
 * come from <asm/asi.h> - confirm).  ASI_BLK_XOR is 0, and two extra masks
 * are derived: ASI_BLK_XOR1 (used by the kernel STBLK_XORASI) and ASI_BLK_OR
 * (OR-ed into the value read from %asi before the main loop is entered).
 *
 * Non-kernel build: the ASI and FPRS constants are defined locally and
 * ASI_BLK_XOR is ASI_BLK_P ^ ASI_P (0xf0 ^ 0x80 = 0x70).
 *
 * NOTE(review): "FRPS_FEF" looks like a typo for "FPRS_FEF" (compare FPRS_DU
 * and FPRS_DL next to it) - check which spelling the rest of the file uses
 * before renaming.
 */
27 #ifdef __KERNEL__
28 #include <asm/head.h>
29 #include <asm/asi.h>
30 #include <asm/page.h>
31 #include <asm/visasm.h>
32 #include <asm/thread_info.h>
33 #define ASI_BLK_XOR     0
34 #define ASI_BLK_XOR1    (ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P)
35 #define ASI_BLK_OR      (ASI_BLK_P & ~ASI_P)
36 #else
37 #define ASI_P           0x80
38 #define ASI_BLK_P       0xf0
39 #define FRPS_FEF        0x04
40 #define FPRS_DU         0x02
41 #define FPRS_DL         0x01
42 #define ASI_BLK_XOR     (ASI_BLK_P ^ ASI_P)
43 #endif
44
/* Symbolic register names; the code below writes them with a leading '%'
 * (%src, %x1, ...).
 *
 *   src, dst  - source / destination pointers, advanced by 64 per
 *               DO_THE_TRICK expansion
 *   len       - remaining byte count, decremented by 64 per DO_THE_TRICK
 *   sum       - integer checksum accumulator
 *   x1 .. x8  - integer scratch registers that receive fcmpgt32 results and
 *               are then folded into %sum
 *
 * Note that the x<N> numbering does not follow the hardware register
 * numbering (x6 is %g7, x7 is %g3, x3 is %o4, x8 is %o5).
 */
45 #define src             o0
46 #define dst             o1
47 #define len             o2
48 #define sum             o3
49 #define x1              g1
50 #define x2              g2
51 #define x3              o4
52 #define x4              g4
53 #define x5              g5
54 #define x6              g7
55 #define x7              g3
56 #define x8              o5
57
58 /* Dobrou noc, SunSoft engineers. Spete sladce. (Czech: "Good night ... Sleep sweetly.")
59  * This has a couple of tricks in and those
60  * tricks are UltraLinux trade secrets :))
61  * Once AGAIN, the SunSoft engineers are caught
62  * asleep at the keyboard :)).
63  * The main loop does about 20 superscalar cycles
64  * per 64bytes checksummed/copied.
65  */
66
/* LDBLK(O0): load from [%src] through ASI_BLK_P into FP register %O0.
 * Presumably a 64-byte VIS block load (going by the ASI name and the
 * 64-byte stride used in DO_THE_TRICK) - confirm against the VIS manual.
 */
67 #define LDBLK(O0)                                                                               \
68         ldda            [%src] ASI_BLK_P, %O0   /*  Load        Group                   */
69
/* STBLK: store %f48 to [%dst] using whatever ASI is currently in %asi. */
70 #define STBLK                                                                                   \
71         stda            %f48, [%dst] %asi       /*  Store                               */
72
/* STBLK_XORASI(tmpreg1,tmpreg2): same store as STBLK, then rewrite %asi.
 * The wr instruction XORs its two source operands (SPARC V9), so:
 *   kernel build:      %asi = %asi ^ ASI_BLK_XOR1 ^ (%asi >> 3)
 *                      clobbers tmpreg1 and tmpreg2
 *   non-kernel build:  %asi = %asi ^ ASI_BLK_XOR
 *                      clobbers tmpreg1 only; tmpreg2 is unused
 * NOTE(review): presumably this switches %asi from the block-store ASI back
 * to the ordinary one for the tail stores - confirm at the call sites.
 */
73 #ifdef __KERNEL__
74 #define STBLK_XORASI(tmpreg1,tmpreg2)                                                           \
75         stda            %f48, [%dst] %asi       /*  Store                               */;     \
76         rd              %asi, %tmpreg1;                                                         \
77         srl             %tmpreg1, 3, %tmpreg2;                                                  \
78         xor             %tmpreg1, ASI_BLK_XOR1, %tmpreg1;                                       \
79         wr              %tmpreg1, %tmpreg2, %asi;
80 #else
81 #define STBLK_XORASI(tmpreg1,tmpreg2)                                                           \
82         stda            %f48, [%dst] %asi       /*  Store                               */;     \
83         rd              %asi, %tmpreg1;                                                         \
84         wr              %tmpreg1, ASI_BLK_XOR, %asi;
85 #endif
86
/* ST(fx,off): store FP register %fx to [%dst + off] using the ASI in %asi. */
87 #define ST(fx,off)                                                                              \
88         stda            %fx, [%dst + off] %asi  /*  Store                               */
89
/* SYNC: full memory barrier (membar #Sync). */
90 #define SYNC                                                                                    \
91         membar          #Sync
92
93
/* DO_THE_TRICK: one 64-byte step of the checksum-and-copy main loop.
 *
 * Arguments:
 *   f0..f14      eight FP registers holding the running 32-bit lane sums
 *   F0..F14      eight FP registers that hold the newly loaded data; each is
 *                overwritten with (Fn + fn) by fpadd32, i.e. the updated
 *                sums end up in the F set
 *   A0..A14      destinations of the faligndata results (realigned data)
 *   B14          receives a copy of F14 (fmovd), taken before F14 is
 *                overwritten.  NOTE(review): presumably the "previous
 *                dword" for the first faligndata of the next step - confirm.
 *   LOAD         statement expanded first (may be empty)
 *   STORE1..8    statements interleaved into store slots (may be empty)
 *   BRANCH...    statement placed after "subcc %len, 64, %len"; declared as a
 *                GNU cpp named variadic parameter so it may contain commas
 *   DUMMY1..3    not referenced in the body; they only separate argument
 *                groups at the call site
 *
 * What the body does:
 *   - A14 = faligndata(A14, F0), then An = faligndata(Fn, Fn+2) for the
 *     remaining pairs.
 *   - Fn = fpadd32(Fn, fn) followed by fcmpgt32(fn, Fn) into one of x1..x8.
 *     Each such result is later turned into (x + 1) >> 1 and added to %sum.
 *     NOTE(review): presumably fcmpgt32 yields a 2-bit per-lane mask marking
 *     lanes that wrapped, and (x + 1) >> 1 counts the set bits - confirm
 *     against the VIS manual.
 *   - %src and %dst are advanced by 64; %len is reduced by 64 with subcc, so
 *     BRANCH can test the resulting condition codes.
 *
 * Carry bookkeeping is pipelined across expansions:
 *   on entry  x4 is added to %sum as-is (already in (x + 1) >> 1 form) and
 *             x5..x8 are taken as raw compare results, which the first
 *             instructions increment, shift and add;
 *   on exit   x1..x3 of this step have been fully added, x4 has been
 *             incremented and shifted but not added, and x5..x8 hold raw
 *             compare results.
 *
 * The final fcmpgt32 follows BRANCH; when BRANCH is a control transfer
 * (it is annotated CTI) that instruction sits in its delay slot.
 */
94 #define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...)  \
95         LOAD                                    /*  Load        (Group)         */;     \
96         faligndata      %A14, %F0, %A14         /*  FPA         Group           */;     \
97         inc             %x5                     /*  IEU0                        */;     \
98         STORE1                                  /*  Store (optional)            */;     \
99         faligndata      %F0, %F2, %A0           /*  FPA         Group           */;     \
100         srl             %x5, 1, %x5             /*  IEU0                        */;     \
101         add             %sum, %x4, %sum         /*  IEU1                        */;     \
102         fpadd32         %F0, %f0, %F0           /*  FPA         Group           */;     \
103         inc             %x6                     /*  IEU0                        */;     \
104         STORE2                                  /*  Store (optional)            */;     \
105         faligndata      %F2, %F4, %A2           /*  FPA         Group           */;     \
106         srl             %x6, 1, %x6             /*  IEU0                        */;     \
107         add             %sum, %x5, %sum         /*  IEU1                        */;     \
108         fpadd32         %F2, %f2, %F2           /*  FPA         Group           */;     \
109         add             %src, 64, %src          /*  IEU0                        */;     \
110         fcmpgt32        %f0, %F0, %x1           /*  FPM                         */;     \
111         add             %dst, 64, %dst          /*  IEU1        Group           */;     \
112         inc             %x7                     /*  IEU0                        */;     \
113         STORE3                                  /*  Store (optional)            */;     \
114         faligndata      %F4, %F6, %A4           /*  FPA                         */;     \
115         fpadd32         %F4, %f4, %F4           /*  FPA         Group           */;     \
116         add             %sum, %x6, %sum         /*  IEU1                        */;     \
117         fcmpgt32        %f2, %F2, %x2           /*  FPM                         */;     \
118         srl             %x7, 1, %x7             /*  IEU0        Group           */;     \
119         inc             %x8                     /*  IEU1                        */;     \
120         STORE4                                  /*  Store (optional)            */;     \
121         faligndata      %F6, %F8, %A6           /*  FPA                         */;     \
122         fpadd32         %F6, %f6, %F6           /*  FPA         Group           */;     \
123         srl             %x8, 1, %x8             /*  IEU0                        */;     \
124         fcmpgt32        %f4, %F4, %x3           /*  FPM                         */;     \
125         add             %sum, %x7, %sum         /*  IEU0        Group           */;     \
126         inc             %x1                     /*  IEU1                        */;     \
127         STORE5                                  /*  Store (optional)            */;     \
128         faligndata      %F8, %F10, %A8          /*  FPA                         */;     \
129         fpadd32         %F8, %f8, %F8           /*  FPA         Group           */;     \
130         srl             %x1, 1, %x1             /*  IEU0                        */;     \
131         fcmpgt32        %f6, %F6, %x4           /*  FPM                         */;     \
132         add             %sum, %x8, %sum         /*  IEU0        Group           */;     \
133         inc             %x2                     /*  IEU1                        */;     \
134         STORE6                                  /*  Store (optional)            */;     \
135         faligndata      %F10, %F12, %A10        /*  FPA                         */;     \
136         fpadd32         %F10, %f10, %F10        /*  FPA         Group           */;     \
137         srl             %x2, 1, %x2             /*  IEU0                        */;     \
138         fcmpgt32        %f8, %F8, %x5           /*  FPM                         */;     \
139         add             %sum, %x1, %sum         /*  IEU0        Group           */;     \
140         inc             %x3                     /*  IEU1                        */;     \
141         STORE7                                  /*  Store (optional)            */;     \
142         faligndata      %F12, %F14, %A12        /*  FPA                         */;     \
143         fpadd32         %F12, %f12, %F12        /*  FPA         Group           */;     \
144         srl             %x3, 1, %x3             /*  IEU0                        */;     \
145         fcmpgt32        %f10, %F10, %x6         /*  FPM                         */;     \
146         add             %sum, %x2, %sum         /*  IEU0        Group           */;     \
147         inc             %x4                     /*  IEU1                        */;     \
148         STORE8                                  /*  Store (optional)            */;     \
149         fmovd           %F14, %B14              /*  FPA                         */;     \
150         fpadd32         %F14, %f14, %F14        /*  FPA         Group           */;     \
151         srl             %x4, 1, %x4             /*  IEU0                        */;     \
152         fcmpgt32        %f12, %F12, %x7         /*  FPM                         */;     \
153         add             %sum, %x3, %sum         /*  IEU0        Group           */;     \
154         subcc           %len, 64, %len          /*  IEU1                        */;     \
155         BRANCH                                  /*  CTI                         */;     \
156         fcmpgt32        %f14, %F14, %x8         /*  FPM         Group           */;
157
/* END_THE_TRICK: first half of the loop epilogue.  It reduces the eight FP
 * lane accumulators to one register and then branches to the label "ett",
 * which is defined outside this macro.
 *
 * Arguments:
 *   f0..f14      the eight FP accumulators to reduce
 *   FA, FB       FB = FA (fmovd) just before the branch
 *   S0..S3       pair sums:  S0 = f2 + f0,   S1 = f6 + f4,
 *                            S2 = f10 + f8,  S3 = f14 + f12
 *   T0, T1       T0 = S0 + S1,  T1 = S2 + S3
 *   U0           U0 = T0 + T1
 *   fz           cleared with fzero and used as the zero operand of compares
 *
 * Entry state of x4..x8 matches the exit state of DO_THE_TRICK: x4 is added
 * to %sum as-is, x5..x8 are incremented, shifted right by one and added.
 *
 * For every fpadd32 an fcmpgt32(operand, result) is issued; its result goes
 * through the same (x + 1) >> 1 step before being added to %sum.  In
 * addition fz is compared against f2, f6, f10 and f14.
 * NOTE(review): those fz compares are presumably sign-bit corrections for
 * the signed fcmpgt32; END_THE_TRICK2 subtracts the matching counts -
 * confirm against the VIS manual.
 *
 * State left for the code at "ett" (END_THE_TRICK2 expects exactly this):
 *   x6   S2 > T1, already incremented and shifted, not yet added
 *   x7   fz > f2, incremented in the delay slot of the branch, not shifted
 *   x8   fz > f6,  raw
 *   x1   fz > f10, raw
 *   x2   fz > f14, raw
 */
158 #define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \
159         inc             %x5                     /*  IEU0        Group           */;     \
160         fpadd32         %f2, %f0, %S0           /*  FPA                         */;     \
161         add             %sum, %x4, %sum         /*  IEU1                        */;     \
162         srl             %x5, 1, %x5             /*  IEU0        Group           */;     \
163         fpadd32         %f6, %f4, %S1           /*  FPA                         */;     \
164         inc             %x6                     /*  IEU1                        */;     \
165         fpadd32         %f10, %f8, %S2          /*  FPA         Group           */;     \
166         add             %sum, %x5, %sum         /*  IEU0                        */;     \
167         fcmpgt32        %f0, %S0, %x1           /*  FPM                         */;     \
168         fpadd32         %f14, %f12, %S3         /*  FPA         Group           */;     \
169         srl             %x6, 1, %x6             /*  IEU0                        */;     \
170         fcmpgt32        %f4, %S1, %x2           /*  FPM                         */;     \
171         add             %sum, %x6, %sum         /*  IEU0        Group           */;     \
172         fzero           %fz                     /*  FPA                         */;     \
173         fcmpgt32        %f8, %S2, %x3           /*  FPM                         */;     \
174         inc             %x7                     /*  IEU0        Group           */;     \
175         inc             %x8                     /*  IEU1                        */;     \
176         srl             %x7, 1, %x7             /*  IEU0        Group           */;     \
177         inc             %x1                     /*  IEU1                        */;     \
178         fpadd32         %S0, %S1, %T0           /*  FPA                         */;     \
179         fpadd32         %S2, %S3, %T1           /*  FPA         Group           */;     \
180         add             %sum, %x7, %sum         /*  IEU0                        */;     \
181         fcmpgt32        %f12, %S3, %x4          /*  FPM                         */;     \
182         srl             %x8, 1, %x8             /*  IEU0        Group           */;     \
183         inc             %x2                     /*  IEU1                        */;     \
184         srl             %x1, 1, %x1             /*  IEU0        Group           */;     \
185         add             %sum, %x8, %sum         /*  IEU1                        */;     \
186         add             %sum, %x1, %sum         /*  IEU0        Group           */;     \
187         fcmpgt32        %S0, %T0, %x5           /*  FPM                         */;     \
188         srl             %x2, 1, %x2             /*  IEU0        Group           */;     \
189         fcmpgt32        %S2, %T1, %x6           /*  FPM                         */;     \
190         inc             %x3                     /*  IEU0        Group           */;     \
191         add             %sum, %x2, %sum         /*  IEU1                        */;     \
192         srl             %x3, 1, %x3             /*  IEU0        Group           */;     \
193         inc             %x4                     /*  IEU1                        */;     \
194         fpadd32         %T0, %T1, %U0           /*  FPA         Group           */;     \
195         add             %sum, %x3, %sum         /*  IEU0                        */;     \
196         fcmpgt32        %fz, %f2, %x7           /*  FPM                         */;     \
197         srl             %x4, 1, %x4             /*  IEU0        Group           */;     \
198         fcmpgt32        %fz, %f6, %x8           /*  FPM                         */;     \
199         inc             %x5                     /*  IEU0        Group           */;     \
200         add             %sum, %x4, %sum         /*  IEU1                        */;     \
201         srl             %x5, 1, %x5             /*  IEU0        Group           */;     \
202         fcmpgt32        %fz, %f10, %x1          /*  FPM                         */;     \
203         inc             %x6                     /*  IEU0        Group           */;     \
204         add             %sum, %x5, %sum         /*  IEU1                        */;     \
205         fmovd           %FA, %FB                /*  FPA         Group           */;     \
206         fcmpgt32        %fz, %f14, %x2          /*  FPM                         */;     \
207         srl             %x6, 1, %x6             /*  IEU0        Group           */;     \
208         ba,pt           %xcc, ett               /*  CTI                         */;     \
209          inc            %x7                     /*  IEU1                        */;
210
/* END_THE_TRICK1: END_THE_TRICK with the temporaries bound to fixed
 * registers: S0..S3 = %f48,%f50,%f52,%f54, T0/T1 = %f56/%f58, U0 = %f60 and
 * fz = %f62.  The caller supplies only the eight accumulators and FA/FB.
 */
211 #define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB)                                \
212         END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62)
213
/* END_THE_TRICK2: second half of the loop epilogue.  It folds the remaining
 * FP partial sums and all outstanding carry counts into the integer %sum.
 *
 * Arguments:
 *   S1, S3, T0, T1   partial sums; used only as compare operands here
 *   U0, U1           the two values added last:  V0 = U0 + U1
 *   V0               receives the final FP sum
 *   fz               FP register expected to hold zero
 *   S0, S2           accepted for symmetry but not referenced in the body
 *
 * Expected entry state of the integer scratch registers (this is what
 * END_THE_TRICK leaves behind):
 *   x6 ready to be added, x7 incremented but not yet shifted,
 *   x8 / x1 / x2 raw compare results.
 *
 * Steps:
 *   - V0 is stored to [%sp + STACKOFF] with std and read back with ldx as a
 *     64-bit integer into x8 (after the old x8 has been consumed).
 *   - Counts from "fz > X" compares (entry x7, x8, x1, x2 and, computed
 *     here, X = S1, S3, T1, U1, V0) are SUBTRACTED from %sum; counts from
 *     "operand > result" compares (entry x6, then T0 > U0 and U0 > V0) are
 *     ADDED.  Every count goes through the (x + 1) >> 1 step first.
 *     NOTE(review): the subtraction presumably undoes the effect of
 *     fcmpgt32 being a signed compare - confirm against the VIS manual.
 *   - Finally %sum += x8 with addcc; if that carries out of 64 bits (%xcc)
 *     the annulled delay slot adds 1, i.e. an end-around carry.  Local label
 *     33 directly follows the macro body.
 */
214 #define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz)                                   \
215         fpadd32         %U0, %U1, %V0           /*  FPA         Group           */;     \
216         srl             %x7, 1, %x7             /*  IEU0                        */;     \
217         add             %sum, %x6, %sum         /*  IEU1                        */;     \
218         std             %V0, [%sp + STACKOFF]   /*  Store       Group           */;     \
219         inc             %x8                     /*  IEU0                        */;     \
220         sub             %sum, %x7, %sum         /*  IEU1                        */;     \
221         srl             %x8, 1, %x8             /*  IEU0        Group           */;     \
222         fcmpgt32        %fz, %S1, %x3           /*  FPM                         */;     \
223         inc             %x1                     /*  IEU0        Group           */;     \
224         fcmpgt32        %fz, %S3, %x4           /*  FPM                         */;     \
225         srl             %x1, 1, %x1             /*  IEU0        Group           */;     \
226         sub             %sum, %x8, %sum         /*  IEU1                        */;     \
227         ldx             [%sp + STACKOFF], %x8   /*  Load        Group           */;     \
228         inc             %x2                     /*  IEU0                        */;     \
229         sub             %sum, %x1, %sum         /*  IEU1                        */;     \
230         srl             %x2, 1, %x2             /*  IEU0        Group           */;     \
231         fcmpgt32        %fz, %T1, %x5           /*  FPM                         */;     \
232         inc             %x3                     /*  IEU0        Group           */;     \
233         fcmpgt32        %T0, %U0, %x6           /*  FPM                         */;     \
234         srl             %x3, 1, %x3             /*  IEU0        Group           */;     \
235         sub             %sum, %x2, %sum         /*  IEU1                        */;     \
236         inc             %x4                     /*  IEU0        Group           */;     \
237         sub             %sum, %x3, %sum         /*  IEU1                        */;     \
238         srl             %x4, 1, %x4             /*  IEU0        Group           */;     \
239         fcmpgt32        %fz, %U1, %x7           /*  FPM                         */;     \
240         inc             %x5                     /*  IEU0        Group           */;     \
241         fcmpgt32        %U0, %V0, %x1           /*  FPM                         */;     \
242         srl             %x5, 1, %x5             /*  IEU0        Group           */;     \
243         sub             %sum, %x4, %sum         /*  IEU1                        */;     \
244         sub             %sum, %x5, %sum         /*  IEU0        Group           */;     \
245         fcmpgt32        %fz, %V0, %x2           /*  FPM                         */;     \
246         inc             %x6                     /*  IEU0        Group           */;     \
247         inc             %x7                     /*  IEU1                        */;     \
248         srl             %x6, 1, %x6             /*  IEU0        Group           */;     \
249         inc             %x1                     /*  IEU1                        */;     \
250         srl             %x7, 1, %x7             /*  IEU0        Group           */;     \
251         add             %sum, %x6, %sum         /*  IEU1                        */;     \
252         srl             %x1, 1, %x1             /*  IEU0        Group           */;     \
253         sub             %sum, %x7, %sum         /*  IEU1                        */;     \
254         inc             %x2                     /*  IEU0        Group           */;     \
255         add             %sum, %x1, %sum         /*  IEU1                        */;     \
256         srl             %x2, 1, %x2             /*  IEU0        Group           */;     \
257         sub             %sum, %x2, %sum         /*  IEU0        Group           */;     \
258         addcc           %sum, %x8, %sum         /*  IEU1        Group           */;     \
259         bcs,a,pn        %xcc, 33f               /*  CTI                         */;     \
260          add            %sum, 1, %sum           /*  IEU0        (Group)         */;     \
261 33:                                             /*  That's it                   */;
262
263         .text
264         .globl          csum_partial_copy_user_vis
265         .align          32
266 /* %asi should be either ASI_P (for csum_partial_copy) or ASI_AIUS (for
267  * csum_partial_copy_from_user).
268  * This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256
269  */
270 csum_partial_copy_user_vis:
271         andcc           %dst, 7, %g0            /*  IEU1        Group           */
272         be,pt           %icc, 4f                /*  CTI                         */
273          and            %dst, 0x38, %o4         /*  IEU0                        */
274         mov             1, %g5                  /*  IEU0        Group           */
275         andcc           %dst, 2, %g0            /*  IEU1                        */
276         be,pt           %icc, 1f                /*  CTI                         */
277          and            %dst, 4, %g7            /*  IEU0        Group           */
278         lduh            [%src], %g2             /*  Load                        */
279         sub             %len, 2, %len           /*  IEU0        Group           */
280         add             %dst, 2, %dst           /*  IEU1                        */
281         andcc           %dst, 4, %g7            /*  IEU1        Group           */
282         sll             %g5, 16, %g5            /*  IEU0                        */
283         stha            %g2, [%dst - 2] %asi    /*  Store       Group           */
284         sll             %g2, 16, %g2            /*  IEU0                        */
285         add             %src, 2, %src           /*  IEU1                        */
286         addcc           %g2, %sum, %sum         /*  IEU1        Group           */
287         bcs,a,pn        %icc, 1f                /*  CTI                         */
288          add            %sum, %g5, %sum         /*  IEU0                        */
289 1:      lduw            [%src], %g2             /*  Load                        */
290         brz,a,pn        %g7, 4f                 /*  CTI+IEU1    Group           */
291          and            %dst, 0x38, %o4         /*  IEU0                        */
292         add             %dst, 4, %dst           /*  IEU0        Group           */
293         sub             %len, 4, %len           /*  IEU1                        */
294         addcc           %g2, %sum, %sum         /*  IEU1        Group           */
295         bcs,a,pn        %icc, 1f                /*  CTI                         */
296          add            %sum, 1, %sum           /*  IEU0                        */
297 1:      and             %dst, 0x38, %o4         /*  IEU0        Group           */
298         stwa            %g2, [%dst - 4] %asi    /*  Store                       */
299         add             %src, 4, %src           /*  IEU1                        */
300 4:
301 #ifdef __KERNEL__
302         VISEntry
303 #endif
304         mov             %src, %g7               /*  IEU1        Group           */
305         fzero           %f48                    /*  FPA                         */
306         alignaddr       %src, %g0, %src         /*  Single      Group           */
307         subcc           %g7, %src, %g7          /*  IEU1        Group           */
308         be,pt           %xcc, 1f                /*  CTI                         */
309          mov            0x40, %g1               /*  IEU0                        */
310         lduw            [%src], %g2             /*  Load        Group           */
311         subcc           %sum, %g2, %sum         /*  IEU1        Group+load stall*/
312         bcs,a,pn        %icc, 1f                /*  CTI                         */
313          sub            %sum, 1, %sum           /*  IEU0                        */
314 1:      srl             %sum, 0, %sum           /*  IEU0        Group           */
315         clr             %g5                     /*  IEU1                        */
316         brz,pn          %o4, 3f                 /*  CTI+IEU1    Group           */
317          sub            %g1, %o4, %g1           /*  IEU0                        */
318         ldd             [%src], %f0             /*  Load                        */
319         clr             %o4                     /*  IEU0        Group           */
320         andcc           %dst, 8, %g0            /*  IEU1                        */
321         be,pn           %icc, 1f                /*  CTI                         */
322          ldd            [%src + 8], %f2         /*  Load        Group           */
323         add             %src, 8, %src           /*  IEU0                        */
324         sub             %len, 8, %len           /*  IEU1                        */
325         fpadd32         %f0, %f48, %f50         /*  FPA                         */
326         addcc           %dst, 8, %dst           /*  IEU1        Group           */
327         faligndata      %f0, %f2, %f16          /*  FPA                         */
328         fcmpgt32        %f48, %f50, %o4         /*  FPM         Group           */
329         fmovd           %f2, %f0                /*  FPA         Group           */
330         ldd             [%src + 8], %f2         /*  Load                        */
331         stda            %f16, [%dst - 8] %asi   /*  Store                       */
332         fmovd           %f50, %f48              /*  FPA                         */
333 1:      andcc           %g1, 0x10, %g0          /*  IEU1        Group           */
334         be,pn           %icc, 1f                /*  CTI                         */
335          and            %g1, 0x20, %g1          /*  IEU0                        */
336         fpadd32         %f0, %f48, %f50         /*  FPA                         */
337         ldd             [%src + 16], %f4        /*  Load        Group           */
338         add             %src, 16, %src          /*  IEU0                        */
339         add             %dst, 16, %dst          /*  IEU1                        */
340         faligndata      %f0, %f2, %f16          /*  FPA                         */
341         fcmpgt32        %f48, %f50, %g5         /*  FPM         Group           */
342         sub             %len, 16, %len          /*  IEU0                        */
343         inc             %o4                     /*  IEU1                        */
344         stda            %f16, [%dst - 16] %asi  /*  Store       Group           */
345         fpadd32         %f2, %f50, %f48         /*  FPA                         */
346         srl             %o4, 1, %o5             /*  IEU0                        */
347         faligndata      %f2, %f4, %f18          /*  FPA         Group           */
348         stda            %f18, [%dst - 8] %asi   /*  Store                       */
349         fcmpgt32        %f50, %f48, %o4         /*  FPM         Group           */
350         add             %o5, %sum, %sum         /*  IEU0                        */
351         ldd             [%src + 8], %f2         /*  Load                        */
352         fmovd           %f4, %f0                /*  FPA                         */
353 1:      brz,a,pn        %g1, 4f                 /*  CTI+IEU1    Group           */
354          rd             %asi, %g2               /*  LSU         Group + 4 bubbles*/
355         inc             %g5                     /*  IEU0                        */
356         fpadd32         %f0, %f48, %f50         /*  FPA                         */
357         ldd             [%src + 16], %f4        /*  Load        Group           */
358         srl             %g5, 1, %g5             /*  IEU0                        */
359         add             %dst, 32, %dst          /*  IEU1                        */
360         faligndata      %f0, %f2, %f16          /*  FPA                         */
361         fcmpgt32        %f48, %f50, %o5         /*  FPM         Group           */
362         inc             %o4                     /*  IEU0                        */
363         ldd             [%src + 24], %f6        /*  Load                        */
364         srl             %o4, 1, %o4             /*  IEU0        Group           */
365         add             %g5, %sum, %sum         /*  IEU1                        */
366         ldd             [%src + 32], %f8        /*  Load                        */
367         fpadd32         %f2, %f50, %f48         /*  FPA                         */
368         faligndata      %f2, %f4, %f18          /*  FPA         Group           */
369         sub             %len, 32, %len          /*  IEU0                        */
370         stda            %f16, [%dst - 32] %asi  /*  Store                       */
371         fcmpgt32        %f50, %f48, %g3         /*  FPM         Group           */
372         inc             %o5                     /*  IEU0                        */
373         add             %o4, %sum, %sum         /*  IEU1                        */
374         fpadd32         %f4, %f48, %f50         /*  FPA                         */
375         faligndata      %f4, %f6, %f20          /*  FPA         Group           */
376         srl             %o5, 1, %o5             /*  IEU0                        */
377         fcmpgt32        %f48, %f50, %g5         /*  FPM         Group           */
378         add             %o5, %sum, %sum         /*  IEU0                        */
379         stda            %f18, [%dst - 24] %asi  /*  Store                       */
380         fpadd32         %f6, %f50, %f48         /*  FPA                         */
381         inc             %g3                     /*  IEU0        Group           */
382         stda            %f20, [%dst - 16] %asi  /*  Store                       */
383         add             %src, 32, %src          /*  IEU1                        */
384         faligndata      %f6, %f8, %f22          /*  FPA                         */
385         fcmpgt32        %f50, %f48, %o4         /*  FPM         Group           */
386         srl             %g3, 1, %g3             /*  IEU0                        */
387         stda            %f22, [%dst - 8] %asi   /*  Store                       */      
388         add             %g3, %sum, %sum         /*  IEU0        Group           */
389 3:      rd              %asi, %g2               /*  LSU         Group + 4 bubbles*/
390 #ifdef __KERNEL__
391 4:      sethi           %hi(vis0s), %g7         /*  IEU0        Group           */
392         or              %g2, ASI_BLK_OR, %g2    /*  IEU1                        */
393 #else
394 4:      rd              %pc, %g7                /*  LSU         Group + 4 bubbles*/
395 #endif
396         inc             %g5                     /*  IEU0        Group           */
397         and             %src, 0x38, %g3         /*  IEU1                        */      
398         membar          #StoreLoad              /*  LSU         Group           */
399         srl             %g5, 1, %g5             /*  IEU0                        */
400         inc             %o4                     /*  IEU1                        */
401         sll             %g3, 8, %g3             /*  IEU0        Group           */
402         sub             %len, 0xc0, %len        /*  IEU1                        */
403         addcc           %g5, %sum, %sum         /*  IEU1        Group           */
404         srl             %o4, 1, %o4             /*  IEU0                        */
405         add             %g7, %g3, %g7           /*  IEU0        Group           */
406         add             %o4, %sum, %sum         /*  IEU1                        */
407 #ifdef __KERNEL__
408         jmpl            %g7 + %lo(vis0s), %g0   /*  CTI+IEU1    Group           */
409 #else
410         jmpl            %g7 + (vis0s - 4b), %g0 /*  CTI+IEU1    Group           */
411 #endif
412          fzero          %f32                    /*  FPA                         */
413
/* vis0s: entry point for (src & 0x38) == 0.
 *
 * The computed jump just above adds (src & 0x38) << 8 to the address
 * of vis0s, i.e. 2048 bytes per doubleword of source offset, so the
 * eight visNs entry points have to be exactly 2048 bytes apart.  That
 * is why every variant is preceded by .align 2048.
 *
 * Prologue:
 *  - %asi is written with %g2 ^ ASI_BLK_XOR.
 *  - Two 64-byte block loads fill %f0-%f14 and %f16-%f30; %src ends up
 *    128 bytes further on.
 *  - %f48 is copied to %f62 and added into %f0 with fpadd32; x3 is the
 *    fcmpgt32 of that addend against the sum.
 *    NOTE(review): %f48 is read before anything here writes it, so it
 *    must be prepared by code ahead of the jump - not visible in this
 *    chunk, confirm what it carries.
 *  - faligndata builds the realigned output doublewords in %f48-%f60.
 *  - fcmpgt32 against %f32 (zeroed in the delay slot of the jump) gives
 *    a 2-bit mask per input doubleword in x1, x2, x4..x8.
 *  - "inc; srl 1" maps a mask of 0,1,2,3 to 0,1,1,2, i.e. the number of
 *    mask bits set.  x1, x2 and x3 are added to %sum here; x4 is
 *    converted but not yet added, x5..x8 are left as raw masks,
 *    presumably for DO_THE_TRICK to consume (macro not visible here).
 */
414         .align          2048
415 vis0s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
416         ldda            [%src] ASI_BLK_P, %f0   /*  Load        Group           */
417         add             %src, 64, %src          /*  IEU0        Group           */
418         ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
419         add             %src, 64, %src          /*  IEU0        Group           */
420         fmovd           %f48, %f62              /*  FPA         Group   f0 available*/
421         faligndata      %f0, %f2, %f48          /*  FPA         Group   f2 available*/
422         fcmpgt32        %f32, %f2, %x1          /*  FPM         Group   f4 available*/
423         fpadd32         %f0, %f62, %f0          /*  FPA                         */
424         fcmpgt32        %f32, %f4, %x2          /*  FPM         Group   f6 available*/
425         faligndata      %f2, %f4, %f50          /*  FPA                         */
426         fcmpgt32        %f62, %f0, %x3          /*  FPM         Group   f8 available*/
427         faligndata      %f4, %f6, %f52          /*  FPA                         */
428         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group   f10 available*/
429         inc             %x1                     /*  IEU0                        */
430         faligndata      %f6, %f8, %f54          /*  FPA                         */
431         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group   f12 available*/
432         srl             %x1, 1, %x1             /*  IEU0                        */
433         inc             %x2                     /*  IEU1                        */
434         faligndata      %f8, %f10, %f56         /*  FPA                         */
435         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group   f14 available*/
436         srl             %x2, 1, %x2             /*  IEU0                        */
437         add             %sum, %x1, %sum         /*  IEU1                        */
438         faligndata      %f10, %f12, %f58        /*  FPA                         */
439         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
440         inc             %x3                     /*  IEU0                        */
441         add             %sum, %x2, %sum         /*  IEU1                        */
442         faligndata      %f12, %f14, %f60        /*  FPA                         */
443         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
444         srl             %x3, 1, %x3             /*  IEU0                        */
445         inc             %x4                     /*  IEU1                        */
446         fmovd           %f14, %f62              /*  FPA                         */
447         srl             %x4, 1, %x4             /*  IEU0        Group           */
448         add             %sum, %x3, %sum         /*  IEU1                        */
/* Main loop, unrolled three times so that the three input register
 * banks %f0-%f14, %f16-%f30 and %f32-%f46 rotate through the first two
 * argument groups of DO_THE_TRICK while the output group stays
 * %f48-%f62.  The first two stages pass "bcs,pn" to their exit
 * variants (vis0e1, vis0e2); the third passes "bcc,pt" back to vis0.
 * DO_THE_TRICK, LDBLK and STBLK are defined outside this chunk, so the
 * meaning of the individual argument slots is not restated here.
 */
449 vis0:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
450                         ,f48,f50,f52,f54,f56,f58,f60,f62,f62,                                                           
451                         ,LDBLK(f32),    STBLK,,,,,,,,                                                                   
452                         ,bcs,pn %icc, vis0e1)
453         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
454                         ,f48,f50,f52,f54,f56,f58,f60,f62,f62,                                                           
455                         ,LDBLK(f0),     STBLK,,,,,,,,                                                                   
456                         ,bcs,pn %icc, vis0e2)
457         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
458                         ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
459                         ,LDBLK(f16),    STBLK,,,,,,,,
460                         ,bcc,pt %icc, vis0)
/* Exit variants, one per loop stage.  They use the same bank rotation
 * but pass SYNC instead of LDBLK, STBLK_XORASI(x1,x2) plus explicit
 * ST() slots instead of a bare STBLK, and a final argument that adds
 * 56 to %dst, adds 192 - 8*8 to %len and branches to e2, e3 or e1
 * respectively.
 */
461 vis0e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
462                         ,f48,f50,f52,f54,f56,f58,f60,f62,f32,
463                         ,SYNC,          STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
464                         ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e2)
465 vis0e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
466                         ,f48,f50,f52,f54,f56,f58,f60,f62,f0,
467                         ,SYNC,          STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
468                         ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e3)
469 vis0e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
470                         ,f48,f50,f52,f54,f56,f58,f60,f62,f16,
471                         ,SYNC,          STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
472                         ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1)
/* vis1s: entry point for (src & 0x38) == 8; the .align 2048 keeps it
 * at vis0s + 2048, which is where the computed jump lands for that
 * offset.
 *
 * Prologue: %src is moved back by 8 before the two 64-byte block loads
 * (into %f0-%f14 and %f16-%f30), so %f0 receives the doubleword just
 * below the incoming %src.  %f0 is then replaced by the incoming %f48
 * and no mask is computed for it; masks are taken for %f2..%f14 only
 * (x2..x8).  The realigned output starts at %f48 = align(%f2,%f4) and
 * runs through %f58, with %f60 holding a copy of %f14.
 * x2 and x3 are converted with "inc; srl 1" (mask 0,1,2,3 -> 0,1,1,2)
 * and added to %sum; x4 is converted but not added here.
 * NOTE(review): the copy of %f0 into %f58 is overwritten by the
 * faligndata into %f58 further down before any visible read - it may
 * exist only for scheduling / load synchronisation; confirm.
 */
473         .align          2048
474 vis1s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
475         sub             %src, 8, %src           /*  IEU0        Group           */
476         ldda            [%src] ASI_BLK_P, %f0   /*  Load        Group           */
477         add             %src, 64, %src          /*  IEU0        Group           */
478         ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
479         add             %src, 64, %src          /*  IEU0        Group           */
480         fmovd           %f0, %f58               /*  FPA         Group           */
481         fmovd           %f48, %f0               /*  FPA         Group           */
482         fcmpgt32        %f32, %f2, %x2          /*  FPM         Group           */
483         faligndata      %f2, %f4, %f48          /*  FPA                         */
484         fcmpgt32        %f32, %f4, %x3          /*  FPM         Group           */
485         faligndata      %f4, %f6, %f50          /*  FPA                         */
486         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group           */
487         faligndata      %f6, %f8, %f52          /*  FPA                         */
488         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
489         inc             %x2                     /*  IEU1                        */
490         faligndata      %f8, %f10, %f54         /*  FPA                         */
491         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
492         srl             %x2, 1, %x2             /*  IEU0                        */
493         faligndata      %f10, %f12, %f56        /*  FPA                         */
494         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
495         inc             %x3                     /*  IEU0                        */
496         add             %sum, %x2, %sum         /*  IEU1                        */
497         faligndata      %f12, %f14, %f58        /*  FPA                         */
498         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
499         srl             %x3, 1, %x3             /*  IEU0                        */
500         inc             %x4                     /*  IEU1                        */
501         fmovd           %f14, %f60              /*  FPA                         */
502         srl             %x4, 1, %x4             /*  IEU0        Group           */
503         add             %sum, %x3, %sum         /*  IEU1                        */
/* Three-stage unrolled loop as in the other variants; compared with
 * vis0 the output register list is rotated by one (%f62 first) and
 * STBLK sits one argument slot later.
 */
504 vis1:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
505                         ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
506                         ,LDBLK(f32),    ,STBLK,,,,,,,
507                         ,bcs,pn %icc, vis1e1)
508         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
509                         ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
510                         ,LDBLK(f0),     ,STBLK,,,,,,,
511                         ,bcs,pn %icc, vis1e2)
512         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
513                         ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
514                         ,LDBLK(f16),    ,STBLK,,,,,,,
515                         ,bcc,pt %icc, vis1)
/* Exit variants: SYNC instead of LDBLK, STBLK_XORASI(x1,x2) followed
 * by six ST() slots; the tail adds 48 to %dst and 192 - 7*8 to %len
 * before branching to e2 / e3 / e1.
 */
516 vis1e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
517                         ,f62,f48,f50,f52,f54,f56,f58,f60,f32,
518                         ,SYNC,          ,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
519                         ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e2)
520 vis1e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
521                         ,f62,f48,f50,f52,f54,f56,f58,f60,f0,
522                         ,SYNC,          ,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
523                         ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e3)
524 vis1e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
525                         ,f62,f48,f50,f52,f54,f56,f58,f60,f16,
526                         ,SYNC,          ,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
527                         ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1)
/* vis2s: entry point for (src & 0x38) == 16 (vis0s + 2 * 2048).
 *
 * Prologue: %src is moved back by 16 before the two block loads, so
 * %f0 and %f2 hold the two doublewords below the incoming %src.  %f0
 * is replaced by the incoming %f48 and %f2 is zeroed (fpsub32 of a
 * register with itself); masks are computed for %f4..%f14 only
 * (x3..x8).  %dst is moved back by 64; the exit variants below add
 * 104 to it again.
 * x3 is converted with "inc; srl 1" (mask 0,1,2,3 -> 0,1,1,2) and
 * added to %sum; x4 is converted but not added here.
 * NOTE(review): the copy of %f0 into %f56 is overwritten by the
 * faligndata into %f56 before any visible read - possibly only there
 * for scheduling / load synchronisation; confirm.
 */
528         .align          2048
529 vis2s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
530         sub             %src, 16, %src          /*  IEU0        Group           */
531         ldda            [%src] ASI_BLK_P, %f0   /*  Load        Group           */
532         add             %src, 64, %src          /*  IEU0        Group           */
533         ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
534         add             %src, 64, %src          /*  IEU0        Group           */
535         fmovd           %f0, %f56               /*  FPA         Group           */
536         fmovd           %f48, %f0               /*  FPA         Group           */      
537         sub             %dst, 64, %dst          /*  IEU0                        */
538         fpsub32         %f2, %f2, %f2           /*  FPA         Group           */
539         fcmpgt32        %f32, %f4, %x3          /*  FPM         Group           */
540         faligndata      %f4, %f6, %f48          /*  FPA                         */
541         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group           */
542         faligndata      %f6, %f8, %f50          /*  FPA                         */
543         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
544         faligndata      %f8, %f10, %f52         /*  FPA                         */
545         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
546         faligndata      %f10, %f12, %f54        /*  FPA                         */
547         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
548         inc             %x3                     /*  IEU0                        */
549         faligndata      %f12, %f14, %f56        /*  FPA                         */
550         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
551         srl             %x3, 1, %x3             /*  IEU0                        */
552         inc             %x4                     /*  IEU1                        */
553         fmovd           %f14, %f58              /*  FPA                         */
554         srl             %x4, 1, %x4             /*  IEU0        Group           */
555         add             %sum, %x3, %sum         /*  IEU1                        */
/* Three-stage unrolled loop; output register list rotated by two
 * (%f60, %f62 first) and STBLK in the third store slot.
 */
556 vis2:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
557                         ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
558                         ,LDBLK(f32),    ,,STBLK,,,,,,
559                         ,bcs,pn %icc, vis2e1)
560         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
561                         ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
562                         ,LDBLK(f0),     ,,STBLK,,,,,,
563                         ,bcs,pn %icc, vis2e2)
564         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
565                         ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
566                         ,LDBLK(f16),    ,,STBLK,,,,,,
567                         ,bcc,pt %icc, vis2)
/* Exit variants: SYNC instead of LDBLK, STBLK_XORASI(x2,x3) followed
 * by five ST() slots at offsets 64..96; the tail adds 104 to %dst and
 * 192 - 6*8 to %len before branching to e2 / e3 / e1.
 */
568 vis2e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
569                         ,f60,f62,f48,f50,f52,f54,f56,f58,f32,
570                         ,SYNC,          ,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
571                         ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e2)
572 vis2e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
573                         ,f60,f62,f48,f50,f52,f54,f56,f58,f0,
574                         ,SYNC,          ,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
575                         ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e3)
576 vis2e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
577                         ,f60,f62,f48,f50,f52,f54,f56,f58,f16,
578                         ,SYNC,          ,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
579                         ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1)
/* vis3s: entry point for (src & 0x38) == 24 (vis0s + 3 * 2048).
 *
 * Prologue: %src is moved back by 24 before the two block loads, so
 * %f0..%f4 hold the three doublewords below the incoming %src.  %f0 is
 * replaced by the incoming %f48, %f2 and %f4 are zeroed with fpsub32,
 * and masks are computed for %f6..%f14 only (x4..x8).  %dst is moved
 * back by 64; the exit variants add 96 to it again.
 * Only x4 is converted with "inc; srl 1" here (mask 0,1,2,3 ->
 * 0,1,1,2); nothing is added to %sum in this prologue.
 * NOTE(review): the copy of %f0 into %f54 is overwritten by the
 * faligndata into %f54 before any visible read - possibly only there
 * for scheduling / load synchronisation; confirm.
 */
580         .align          2048
581 vis3s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
582         sub             %src, 24, %src          /*  IEU0        Group           */
583         ldda            [%src] ASI_BLK_P, %f0   /*  Load        Group           */
584         add             %src, 64, %src          /*  IEU0        Group           */
585         ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
586         add             %src, 64, %src          /*  IEU0        Group           */
587         fmovd           %f0, %f54               /*  FPA         Group           */
588         fmovd           %f48, %f0               /*  FPA         Group           */
589         sub             %dst, 64, %dst          /*  IEU0                        */
590         fpsub32         %f2, %f2, %f2           /*  FPA         Group           */
591         fpsub32         %f4, %f4, %f4           /*  FPA         Group           */
592         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group           */
593         faligndata      %f6, %f8, %f48          /*  FPA                         */
594         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
595         faligndata      %f8, %f10, %f50         /*  FPA                         */
596         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
597         faligndata      %f10, %f12, %f52        /*  FPA                         */
598         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
599         faligndata      %f12, %f14, %f54        /*  FPA                         */
600         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
601         fmovd           %f14, %f56              /*  FPA                         */
602         inc             %x4                     /*  IEU0                        */
603         srl             %x4, 1, %x4             /*  IEU0        Group           */
/* Three-stage unrolled loop; output register list rotated by three
 * (%f58, %f60, %f62 first) and STBLK in the fourth store slot.
 */
604 vis3:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
605                         ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
606                         ,LDBLK(f32),    ,,,STBLK,,,,,
607                         ,bcs,pn %icc, vis3e1)
608         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
609                         ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
610                         ,LDBLK(f0),     ,,,STBLK,,,,,
611                         ,bcs,pn %icc, vis3e2)
612         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
613                         ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
614                         ,LDBLK(f16),    ,,,STBLK,,,,,
615                         ,bcc,pt %icc, vis3)
/* Exit variants: SYNC instead of LDBLK, STBLK_XORASI(x3,x4) followed
 * by four ST() slots at offsets 64..88; the tail adds 96 to %dst and
 * 192 - 5*8 to %len before branching to e2 / e3 / e1.
 */
616 vis3e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
617                         ,f58,f60,f62,f48,f50,f52,f54,f56,f32,
618                         ,SYNC,          ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
619                         ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e2)
620 vis3e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
621                         ,f58,f60,f62,f48,f50,f52,f54,f56,f0,
622                         ,SYNC,          ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
623                         ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e3)
624 vis3e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
625                         ,f58,f60,f62,f48,f50,f52,f54,f56,f16,
626                         ,SYNC,          ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
627                         ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1)
/* vis4s: entry point for (src & 0x38) == 32 (vis0s + 4 * 2048).
 *
 * Prologue: %src is moved back by 32 before the two block loads, so
 * %f0..%f6 hold the four doublewords below the incoming %src.  %f0 is
 * replaced by the incoming %f48, %f2/%f4/%f6 are zeroed with fpsub32
 * and x4 is cleared; masks are computed for %f8..%f14 only (x5..x8)
 * and are left unconverted here.  %dst is moved back by 64; the exit
 * variants add 88 to it again.
 * NOTE(review): the copy of %f0 into %f52 is overwritten by the
 * faligndata into %f52 before any visible read - possibly only there
 * for scheduling / load synchronisation; confirm.
 */
628         .align          2048
629 vis4s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
630         sub             %src, 32, %src          /*  IEU0        Group           */
631         ldda            [%src] ASI_BLK_P, %f0   /*  Load        Group           */
632         add             %src, 64, %src          /*  IEU0        Group           */
633         ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
634         add             %src, 64, %src          /*  IEU0        Group           */
635         fmovd           %f0, %f52               /*  FPA         Group           */
636         fmovd           %f48, %f0               /*  FPA         Group           */
637         sub             %dst, 64, %dst          /*  IEU0                        */
638         fpsub32         %f2, %f2, %f2           /*  FPA         Group           */
639         fpsub32         %f4, %f4, %f4           /*  FPA         Group           */
640         fpsub32         %f6, %f6, %f6           /*  FPA         Group           */
641         clr             %x4                     /*  IEU0                        */
642         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
643         faligndata      %f8, %f10, %f48         /*  FPA                         */
644         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
645         faligndata      %f10, %f12, %f50        /*  FPA                         */
646         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
647         faligndata      %f12, %f14, %f52        /*  FPA                         */
648         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
649         fmovd           %f14, %f54              /*  FPA                         */
/* Three-stage unrolled loop; output register list rotated by four
 * (%f56..%f62 first) and STBLK in the fifth store slot.
 */
650 vis4:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
651                         ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
652                         ,LDBLK(f32),    ,,,,STBLK,,,,
653                         ,bcs,pn %icc, vis4e1)
654         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
655                         ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
656                         ,LDBLK(f0),     ,,,,STBLK,,,,
657                         ,bcs,pn %icc, vis4e2)
658         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
659                         ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
660                         ,LDBLK(f16),    ,,,,STBLK,,,,
661                         ,bcc,pt %icc, vis4)
/* Exit variants: SYNC instead of LDBLK, STBLK_XORASI(x4,x5) followed
 * by three ST() slots at offsets 64..80; the tail adds 88 to %dst and
 * 192 - 4*8 to %len before branching to e2 / e3 / e1.
 */
662 vis4e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
663                         ,f56,f58,f60,f62,f48,f50,f52,f54,f32,
664                         ,SYNC,          ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80),
665                         ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e2)
666 vis4e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
667                         ,f56,f58,f60,f62,f48,f50,f52,f54,f0,
668                         ,SYNC,          ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80),
669                         ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e3)
670 vis4e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
671                         ,f56,f58,f60,f62,f48,f50,f52,f54,f16,
672                         ,SYNC,          ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80),
673                         ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1)
/* vis5s: entry point for (src & 0x38) == 40 (vis0s + 5 * 2048).
 *
 * Unlike vis0s-vis4s this variant does not back %src up.  It loads the
 * three doublewords up to the next 64-byte boundary individually with
 * ldd into %f10/%f12/%f14, advances %src by 24, and only then does a
 * single block load into %f16-%f30 (advancing %src by another 64).
 * %f0 takes the incoming %f48.  %f2..%f8 are zeroed by multiplying /
 * adding %f32 with itself (%f32 was zeroed in the delay slot of the
 * computed jump); the existing unit annotations show these alternate
 * between FPM and FPA.  x4 and x5 are cleared; masks are computed for
 * %f10..%f14 only (x6..x8) and are left unconverted here.  %dst is
 * moved back by 64; the exit variants add 80 to it again.
 */
674         .align          2048
675 vis5s:  ldd             [%src+0], %f10          /*  Load        Group           */
676         ldd             [%src+8], %f12          /*  Load        Group           */
677         ldd             [%src+16], %f14         /*  Load        Group           */
678         add             %src, 24, %src          /*  IEU0        Group           */
679         wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
680         ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
681         add             %src, 64, %src          /*  IEU0        Group           */
682         fmovd           %f48, %f0               /*  FPA         Group           */
683         fmuld           %f32, %f32, %f2         /*  FPM                         */
684         clr             %x4                     /*  IEU0                        */
685         faddd           %f32, %f32, %f4         /*  FPA         Group           */
686         fmuld           %f32, %f32, %f6         /*  FPM                         */
687         clr             %x5                     /*  IEU0                        */
688         faddd           %f32, %f32, %f8         /*  FPA         Group           */
689         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
690         sub             %dst, 64, %dst          /*  IEU0                        */
691         faligndata      %f10, %f12, %f48        /*  FPA                         */
692         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
693         faligndata      %f12, %f14, %f50        /*  FPA                         */
694         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
695         fmovd           %f14, %f52              /*  FPA                         */
/* Three-stage unrolled loop; output register list rotated by five
 * (%f54..%f62 first) and STBLK in the sixth store slot.
 */
696 vis5:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
697                         ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
698                         ,LDBLK(f32),    ,,,,,STBLK,,,
699                         ,bcs,pn %icc, vis5e1)
700         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
701                         ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
702                         ,LDBLK(f0),     ,,,,,STBLK,,,
703                         ,bcs,pn %icc, vis5e2)
704         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
705                         ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
706                         ,LDBLK(f16),    ,,,,,STBLK,,,
707                         ,bcc,pt %icc, vis5)
/* Exit variants: SYNC instead of LDBLK, STBLK_XORASI(x5,x6) followed
 * by two ST() slots at offsets 64 and 72; the tail adds 80 to %dst and
 * 192 - 3*8 to %len before branching to e2 / e3 / e1.
 */
708 vis5e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
709                         ,f54,f56,f58,f60,f62,f48,f50,f52,f32,
710                         ,SYNC,          ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72),
711                         ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e2)
712 vis5e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
713                         ,f54,f56,f58,f60,f62,f48,f50,f52,f0,
714                         ,SYNC,          ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72),
715                         ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e3)
716 vis5e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
717                         ,f54,f56,f58,f60,f62,f48,f50,f52,f16,
718                         ,SYNC,          ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72),
719                         ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1)
/* vis6s: entry point for (src & 0x38) == 48 (vis0s + 6 * 2048).
 *
 * Loads the two doublewords up to the next 64-byte boundary with ldd
 * into %f12/%f14, advances %src by 16, then block-loads %f16-%f30
 * (advancing %src by another 64).  %f0 takes the incoming %f48;
 * %f2..%f10 are zeroed from %f32 (itself zeroed in the delay slot of
 * the computed jump) via fmuld/faddd; x4, x5 and x6 are cleared.
 * Masks are computed for %f12 and %f14 only (x7, x8) and are left
 * unconverted here.  %dst is moved back by 64; the exit variants add
 * 72 to it again.
 */
720         .align          2048
721 vis6s:  ldd             [%src+0], %f12          /*  Load        Group           */
722         ldd             [%src+8], %f14          /*  Load        Group           */
723         add             %src, 16, %src          /*  IEU0        Group           */
724         wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
725         ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
726         add             %src, 64, %src          /*  IEU0        Group           */
727         fmovd           %f48, %f0               /*  FPA         Group           */
728         fmuld           %f32, %f32, %f2         /*  FPM                         */
729         clr             %x4                     /*  IEU0                        */
730         faddd           %f32, %f32, %f4         /*  FPA         Group           */
731         fmuld           %f32, %f32, %f6         /*  FPM                         */
732         clr             %x5                     /*  IEU0                        */
733         faddd           %f32, %f32, %f8         /*  FPA         Group           */
734         fmuld           %f32, %f32, %f10        /*  FPM                         */
735         clr             %x6                     /*  IEU0                        */
736         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
737         sub             %dst, 64, %dst          /*  IEU0                        */
738         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
739         faligndata      %f12, %f14, %f48        /*  FPA                         */
740         fmovd           %f14, %f50              /*  FPA         Group           */
/* Three-stage unrolled loop; output register list rotated by six
 * (%f52..%f62 first) and STBLK in the seventh store slot.
 */
741 vis6:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
742                         ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
743                         ,LDBLK(f32),    ,,,,,,STBLK,,
744                         ,bcs,pn %icc, vis6e1)
745         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
746                         ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
747                         ,LDBLK(f0),     ,,,,,,STBLK,,
748                         ,bcs,pn %icc, vis6e2)
749         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
750                         ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
751                         ,LDBLK(f16),    ,,,,,,STBLK,,
752                         ,bcc,pt %icc, vis6)
/* Exit variants: SYNC instead of LDBLK, STBLK_XORASI(x6,x7) followed
 * by a single ST() slot at offset 64; the tail adds 72 to %dst and
 * 192 - 2*8 to %len before branching to e2 / e3 / e1.
 */
753 vis6e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
754                         ,f52,f54,f56,f58,f60,f62,f48,f50,f32,
755                         ,SYNC,          ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64),
756                         ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e2)
757 vis6e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
758                         ,f52,f54,f56,f58,f60,f62,f48,f50,f0,
759                         ,SYNC,          ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64),
760                         ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e3)
761 vis6e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
762                         ,f52,f54,f56,f58,f60,f62,f48,f50,f16,
763                         ,SYNC,          ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64),
764                         ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1)
/* vis7s: entry point for (src & 0x38) == 56 (vis0s + 7 * 2048).
 *
 * Loads the single doubleword up to the next 64-byte boundary with ldd
 * into %f14, advances %src by 8, then block-loads %f16-%f30 (advancing
 * %src by another 64).  %f0 takes the incoming %f48; %f2..%f12 are
 * zeroed from %f32 (itself zeroed in the delay slot of the computed
 * jump) via fmuld/faddd; x4..x7 are cleared.  Only %f14 gets a mask
 * (x8), left unconverted here, and %f48 receives a copy of %f14.
 * %dst is moved back by 64; the exit variants add 64 to it again.
 */
765         .align          2048
766 vis7s:  ldd             [%src+0], %f14          /*  Load        Group           */
767         add             %src, 8, %src           /*  IEU0        Group           */
768         wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
769         ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
770         add             %src, 64, %src          /*  IEU0        Group           */
771         fmovd           %f48, %f0               /*  FPA         Group           */
772         fmuld           %f32, %f32, %f2         /*  FPM                         */
773         clr             %x4                     /*  IEU0                        */
774         faddd           %f32, %f32, %f4         /*  FPA         Group           */
775         fmuld           %f32, %f32, %f6         /*  FPM                         */
776         clr             %x5                     /*  IEU0                        */
777         faddd           %f32, %f32, %f8         /*  FPA         Group           */
778         fmuld           %f32, %f32, %f10        /*  FPM                         */
779         clr             %x6                     /*  IEU0                        */
780         faddd           %f32, %f32, %f12        /*  FPA         Group           */
781         clr             %x7                     /*  IEU0                        */
782         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
783         sub             %dst, 64, %dst          /*  IEU0                        */
784         fmovd           %f14, %f48              /*  FPA                         */
/* Three-stage unrolled loop; output register list rotated by seven
 * (%f50..%f62 first, %f48 last) and STBLK in the eighth store slot.
 */
785 vis7:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
786                         ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
787                         ,LDBLK(f32),    ,,,,,,,STBLK,
788                         ,bcs,pn %icc, vis7e1)
789         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
790                         ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
791                         ,LDBLK(f0),     ,,,,,,,STBLK,
792                         ,bcs,pn %icc, vis7e2)
793         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
794                         ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
795                         ,LDBLK(f16),    ,,,,,,,STBLK,
796                         ,bcc,pt %icc, vis7)
/* Exit variants: SYNC instead of LDBLK and STBLK_XORASI(x7,x8) with no
 * trailing ST() slots; the tail adds 64 to %dst and 192 - 1*8 to %len
 * before branching to e2 / e3 / e1.
 */
797 vis7e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
798                         ,f50,f52,f54,f56,f58,f60,f62,f48,f32,
799                         ,SYNC,          ,,,,,,,STBLK_XORASI(x7,x8),
800                         ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e2)
801 vis7e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
802                         ,f50,f52,f54,f56,f58,f60,f62,f48,f0,
803                         ,SYNC,          ,,,,,,,STBLK_XORASI(x7,x8),
804                         ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e3)
805 vis7e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
806                         ,f50,f52,f54,f56,f58,f60,f62,f48,f16,
807                         ,SYNC,          ,,,,,,,STBLK_XORASI(x7,x8),
808                         ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e1)
809 e1:     END_THE_TRICK1( f0,f2,f4,f6,f8,f10,f12,f14,f16,f6)
810 e2:     END_THE_TRICK1( f16,f18,f20,f22,f24,f26,f28,f30,f32,f6)
811 e3:     END_THE_TRICK1( f32,f34,f36,f38,f40,f42,f44,f46,f0,f6)
/*
 * ett: finish the VIS part of the copy after the 64-byte block loop.
 *
 *   - x3  = low three bits of %gsr (the faligndata byte offset in the VIS
 *           GSR layout).
 *   - f6  = last doubleword loaded but not yet stored.
 *   - f10 = running 32-bit-lane partial sum for this tail (zeroed here).
 *
 * While at least 8 bytes remain, one doubleword per iteration is loaded
 * from src, summed, realigned against f6 and stored to dst through %asi.
 * The pending doubleword is then flushed and the VIS state torn down.
 */
812 ett:    rd              %gsr, %x3               /*  LSU         Group+4bubbles  */
813         andcc           %x3, 7, %x3             /*  IEU1        Group           */
814         add             %dst, 8, %dst           /*  IEU0                        */
            /* Non-zero offset: skip the early-out.  The delay slot is not
             * annulled, so f10 is cleared on both paths. */
815         bne,pn          %icc, 1f                /*  CTI                         */
816          fzero          %f10                    /*  FPA                         */
            /* Offset zero and len == 0: store f6 and finish.  The slot is
             * annulled, so the store only happens when the branch is taken. */
817         brz,a,pn        %len, 2f                /*  CTI+IEU1    Group           */
818          stda           %f6, [%dst - 8] %asi    /*  Store                       */
819 1:      cmp             %len, 8                 /*  IEU1                        */
            /* Fewer than 8 bytes left: skip the doubleword loop.  src is
             * moved back by 64 on both paths (non-annulled slot).
             * NOTE(review): presumably undoes a read-ahead by the block
             * loop - confirm against the loop setup outside this chunk. */
820         blu,pn          %icc, 3f                /*  CTI                         */
821          sub            %src, 64, %src          /*  IEU0        Group           */
            /* Doubleword loop: f2 = new data, f12 = f10 + f2 per 32-bit
             * lane, f14 = faligndata(f6, f2) is what gets stored. */
822 1:      ldd             [%src], %f2             /*  Load        Group           */
823         fpadd32         %f10, %f2, %f12         /*  FPA         Group+load stall*/
824         add             %src, 8, %src           /*  IEU0                        */
825         add             %dst, 8, %dst           /*  IEU1                        */
826         faligndata      %f6, %f2, %f14          /*  FPA         Group           */
            /* x5 = per-lane mask of "old sum > new sum", i.e. the lanes
             * whose 32-bit add wrapped around. */
827         fcmpgt32        %f10, %f12, %x5         /*  FPM         Group           */
828         stda            %f14, [%dst - 16] %asi  /*  Store                       */
829         fmovd           %f2, %f6                /*  FPA                         */
830         fmovd           %f12, %f10              /*  FPA         Group           */
831         sub             %len, 8, %len           /*  IEU1                        */
832         fzero           %f16                    /*  FPA         Group - FPU nop */
833         fzero           %f18                    /*  FPA         Group - FPU nop */
            /* (x5 + 1) >> 1 maps 0,1,2,3 to 0,1,1,2: the number of set bits
             * in a two-bit mask, i.e. the number of carries to add to sum. */
834         inc             %x5                     /*  IEU0                        */
835         srl             %x5, 1, %x5             /*  IEU0        Group (regdep)  */
836         cmp             %len, 8                 /*  IEU1                        */
837         bgeu,pt         %icc, 1b                /*  CTI                         */
838          add            %x5, %sum, %sum         /*  IEU0        Group           */
            /* Offset zero: store the whole pending doubleword (annulled
             * slot) and go to 2.  Otherwise store only the word in f7, then
             * move dst back by 4 and give 4 bytes back to len.
             * NOTE(review): presumably so the integer tail code copies that
             * word - confirm. */
839 3:      brz,a,pt        %x3, 2f                 /*  CTI+IEU1                    */
840          stda           %f6, [%dst - 8] %asi    /*  Store       Group           */
841         sta             %f7, [%dst - 8] %asi    /*  Store       Group           */
842         sub             %dst, 4, %dst           /*  IEU0                        */
843         add             %len, 4, %len           /*  IEU1                        */
844 2:
            /* Kernel build: %sp is lowered by 8 across END_THE_TRICK2 and
             * VISExit and restored afterwards.
             * NOTE(review): both macros are defined outside this chunk; the
             * reason for the adjustment is not visible here. */
845 #ifdef __KERNEL__
846         sub             %sp, 8, %sp             /*  IEU0        Group           */
847 #endif
848         END_THE_TRICK2( f48,f50,f52,f54,f56,f58,f60,f10,f12,f62)
849         membar          #Sync                   /*  LSU         Group           */
850 #ifdef __KERNEL__
851         VISExit
852         add             %sp, 8, %sp             /*  IEU0        Group           */
853 #endif
/*
 * Common exit: fold the 64-bit accumulator in sum down to 32 bits and
 * return it in the register aliased as src (%o0).
 *
 * 23: if bytes remain (len != 0) go to the integer tail at 26 first.  The
 *     delay slot is not annulled, so g1 = sum << 32 is computed on both
 *     paths; the tail recomputes it before jumping back to 25.
 * 25: src = sum + (sum << 32); the upper 32 bits then hold high + low half
 *     of sum, and the 64-bit carry flags a wrap that is added back in.
 */
854 23:     brnz,pn         %len, 26f               /*  CTI+IEU1    Group           */
855 24:      sllx           %sum, 32, %g1           /*  IEU0                        */
856 25:     addcc           %sum, %g1, %src         /*  IEU1        Group           */
857         srlx            %src, 32, %src          /*  IEU0        Group (regdep)  */
            /* End-around carry: the annulled slot adds 1 only when the
             * branch is taken, i.e. when the addcc above carried out. */
858         bcs,a,pn        %xcc, 1f                /*  CTI                         */
859          add            %src, 1, %src           /*  IEU1                        */
            /* Userland build: the return delay slot zero-extends the
             * 32-bit result. */
860 #ifndef __KERNEL__
861 1:      retl                                    /*  CTI         Group brk forced*/
862          srl            %src, 0, %src           /*  IEU0                        */
863 #else
            /* Kernel build: the return delay slot reloads %g4 from
             * [%g6 + TI_TASK].  %g4 is used as scratch (x4) by this file.
             * NOTE(review): presumably %g4 normally holds the current task
             * pointer - confirm. */
864 1:      retl                                    /*  CTI         Group brk forced*/
865          ldx            [%g6 + TI_TASK], %g4    /*  Load                        */
866 #endif
/*
 * 26: integer tail.  Copies and sums the remaining bytes selected by bits
 * 8, 4, 2 and 1 of len, then jumps back to 25 to fold and return.
 * Loads from src are plain loads; every store to dst goes through %asi.
 *
 * The 8-byte piece is added to sum immediately.  The 4-, 2- and 1-byte
 * pieces are packed into one value (word << 32 | half << 16 | byte << 8)
 * and added in a single step.  Both additions use end-around carry.
 */
            /* The first lduw sits in a non-annulled delay slot, so it is
             * issued even when bit 8 of len is clear. */
867 26:     andcc           %len, 8, %g0            /*  IEU1        Group           */
868         be,pn           %icc, 1f                /*  CTI                         */
869          lduw           [%src], %o4             /*  Load                        */
870         lduw            [%src+4], %g2           /*  Load        Group           */
871         add             %src, 8, %src           /*  IEU0                        */
872         add             %dst, 8, %dst           /*  IEU1                        */
            /* g5 = (first word << 32) | second word */
873         sllx            %o4, 32, %g5            /*  IEU0        Group           */
874         stwa            %o4, [%dst - 8] %asi    /*  Store                       */
875         or              %g5, %g2, %g5           /*  IEU0        Group           */
876         stwa            %g2, [%dst - 4] %asi    /*  Store                       */
877         addcc           %g5, %sum, %sum         /*  IEU1        Group           */
878         bcs,a,pn        %xcc, 1f                /*  CTI                         */
879          add            %sum, 1, %sum           /*  IEU0                        */
            /* 4-byte piece.  Each be,a below clears its piece register in
             * the annulled slot only when the corresponding bit is absent. */
880 1:      andcc           %len, 4, %g0            /*  IEU1        Group           */
881         be,a,pn         %icc, 1f                /*  CTI                         */
882          clr            %g2                     /*  IEU0                        */
883         lduw            [%src], %g7             /*  Load                        */
884         add             %src, 4, %src           /*  IEU0        Group           */
885         add             %dst, 4, %dst           /*  IEU1                        */
886         sllx            %g7, 32, %g2            /*  IEU0        Group           */
887         stwa            %g7, [%dst - 4] %asi    /*  Store                       */
            /* 2-byte piece, placed in bits 31:16 of g3. */
888 1:      andcc           %len, 2, %g0            /*  IEU1                        */
889         be,a,pn         %icc, 1f                /*  CTI                         */
890          clr            %g3                     /*  IEU0        Group           */
891         lduh            [%src], %g7             /*  Load                        */
892         add             %src, 2, %src           /*  IEU1                        */
893         add             %dst, 2, %dst           /*  IEU0        Group           */
894         sll             %g7, 16, %g3            /*  IEU0        Group           */
895         stha            %g7, [%dst - 2] %asi    /*  Store                       */
            /* Final byte, placed in bits 15:8 of o5; src and dst are not
             * advanced because nothing follows. */
896 1:      andcc           %len, 1, %g0            /*  IEU1                        */
897         be,a,pn         %icc, 1f                /*  CTI                         */
898          clr            %o5                     /*  IEU0        Group           */
899         ldub            [%src], %g7             /*  Load                        */
900         sll             %g7, 8, %o5             /*  IEU0        Group           */
901         stba            %g7, [%dst] %asi        /*  Store                       */
            /* Merge the three pieces and add them to sum with end-around
             * carry. */
902 1:      or              %g2, %g3, %g3           /*  IEU1                        */
903         or              %o5, %g3, %g3           /*  IEU0        Group (regdep)  */
904         addcc           %g3, %sum, %sum         /*  IEU1        Group (regdep)  */
905         bcs,a,pn        %xcc, 1f                /*  CTI                         */
906          add            %sum, 1, %sum           /*  IEU0                        */
            /* Back to the fold at 25; the delay slot supplies g1 = sum << 32
             * as that code expects. */
907 1:      ba,pt           %xcc, 25b               /*  CTI         Group           */
908          sllx           %sum, 32, %g1           /*  IEU0                        */
909
/*
 * Kernel build only: `end` marks the address just past the routine, and a
 * single four-word record is emitted into the __ex_table section:
 * csum_partial_copy_user_vis, 0, end, cpc_handler.
 * NOTE(review): presumably a range-style exception entry, so that a fault
 * anywhere between the routine's start and `end` is redirected to
 * cpc_handler - confirm against the sparc64 exception-table format.
 */
910 #ifdef __KERNEL__
911 end:
912
913         .section        __ex_table
914         .align          4
915         .word           csum_partial_copy_user_vis, 0, end, cpc_handler
916 #endif