1 /* $Id: U3copy_to_user.S,v 1.3 2000/11/01 09:29:19 davem Exp $
2 * U3copy_to_user.S: UltraSparc-III optimized copy to userspace.
4 * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com)
8 #include <asm/visasm.h>
11 #include <asm/spitfire.h>
12 #undef SMALL_COPY_USES_FPU
13 #define EXNV(x,y,a,b) \
19 .section __ex_table; \
24 #define EXNV2(x,y,a,b) \
31 .section __ex_table; \
36 #define EXNV3(x,y,a,b) \
43 .section __ex_table; \
55 .section __ex_table; \
65 add %o4, 0x1c0, %o1; \
66 and %o2, (0x40 - 1), %o2; \
69 .section __ex_table; \
80 and %o2, (0x40 - 1), %o2; \
84 .section __ex_table; \
94 and %o2, (0x40 - 1), %o2; \
97 .section __ex_table; \
102 #define EXBLK4(x,y) \
107 and %o2, (0x40 - 1), %o2; \
109 add %o2, 0x40, %o0; \
110 .section __ex_table; \
/* Userland-build variants: plain constant definitions plus EX* /EXBLK*
 * macros that expand to the bare instruction with NO __ex_table fixup
 * entries.  Presumably this is the #else half of a __KERNEL__
 * conditional whose #if side (the truncated multi-line macros above)
 * emits real exception-table records -- TODO confirm against full file.
 */
116 #define ASI_AIUS 0x80
117 #define ASI_BLK_AIUS 0xf0
118 #define FPRS_FEF 0x04
/* Minimal VISEntry/VISExit: save %fprs in %o5, enable FPU lower half;
 * restore on exit.  %o5 must stay live in between (see note below). */
119 #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
120 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
121 #define SMALL_COPY_USES_FPU
/* Fault-fixup args (a,b / y) are ignored in this variant. */
122 #define EXNV(x,y,a,b) x,y;
123 #define EXNV2(x,y,a,b) x,y;
124 #define EXNV3(x,y,a,b) x,y;
125 #define EX(x,y,a,b) x,y;
126 #define EXBLK1(x,y) x,y;
127 #define EXBLK2(x,y) x,y;
128 #define EXBLK3(x,y) x,y;
129 #define EXBLK4(x,y) x,y;
132 /* Special/non-trivial issues of this code:
134 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
135 * 2) Only low 32 FPU registers are used so that only the
136 * lower half of the FPU register set is dirtied by this
137 * code. This is especially important in the kernel.
138 * 3) This code never prefetches cachelines past the end
139 * of the source buffer.
145 /* The cheetah's flexible spine, oversized liver, enlarged heart,
146 * slender muscular body, and claws make it the swiftest hunter
147 * in Africa and the fastest animal on land. Can reach speeds
148 * of up to 2.4GB per second.
/* U3copy_to_user entry: %o0 = user dst, %o1 = kernel src, %o2 = len.
 * Dispatch on length:
 *   len <= 0    -> short_ret  (nothing to copy)
 *   len <= 31   -> short      (byte-at-a-time loop)
 *   len >= 6*64 -> enter      (unrolled block-prefetch loops)
 *   otherwise   -> toosmall
 * NOTE(review): the original line numbering is discontinuous here, so
 * some compare/delay-slot instructions are not visible in this excerpt.
 */
151 .globl U3copy_to_user
152 U3copy_to_user: /* %o0=dst, %o1=src, %o2=len */
153 /* Writing to %asi is _expensive_ so we hardcode it.
154 * Reading %asi to check for KERNEL_DS is comparatively
157 rd %asi, %g1 ! MS Group (4 cycles)
158 cmp %g1, ASI_AIUS ! A0 Group
162 /* Save away original 'dst' for memcpy return value. */
163 mov %o0, %g3 ! A0 Group
165 /* Anything to copy at all? */
167 ble,pn %icc, U3copy_to_user_short_ret ! BR
169 /* Extremely small copy? */
170 cmp %o2, 31 ! A0 Group
171 ble,pn %icc, U3copy_to_user_short ! BR
173 /* Large enough to use unrolled prefetch loops? */
175 bge,a,pt %icc, U3copy_to_user_enter ! BR Group
/* Annulled delay slot: precompute dst & 0x3f for the 64-byte
 * alignment fixup performed at U3copy_to_user_enter. */
176 andcc %o0, 0x3f, %g2 ! A0
178 ba,pt %xcc, U3copy_to_user_toosmall ! BR Group
/* Delay slot: precompute dst & 7 for the 8-byte alignment check. */
179 andcc %o0, 0x7, %g2 ! A0
/* Byte-at-a-time copy loop, used for len <= 31 and for trailing
 * "cruft" bytes.  Loads from kernel src, stores to user dst via %asi.
 * The EXNV fixup arg ("add %o2, 1") presumably recomputes the
 * bytes-not-yet-copied count if the user store faults -- TODO confirm
 * against the (truncated) EXNV macro body above.
 */
182 U3copy_to_user_short:
183 /* Copy %o2 bytes from src to dst, one byte at a time. */
184 ldub [%o1 + 0x00], %o3 ! MS Group
185 add %o1, 0x1, %o1 ! A0
186 add %o0, 0x1, %o0 ! A1
187 subcc %o2, 1, %o2 ! A0 Group
189 bg,pt %icc, U3copy_to_user_short ! BR
/* Delay slot: store the byte just loaded (dst already incremented). */
190 EXNV(stba %o3, [%o0 + -1] %asi, add %o2, 1) ! MS Group (1-cycle stall)
192 U3copy_to_user_short_ret:
/* Two return points; the delay-slot instructions that set up the
 * return value (and any VISExitHalf on the FPU path) are not visible
 * in this excerpt -- original line numbering skips 193, 195-196, 198. */
194 retl ! BR Group (0-4 cycle stall)
197 retl ! BR Group (0-4 cycle stall)
201 /* Here len >= (6 * 64) and condition codes reflect execution
202 * of "andcc %o0, 0x3f, %g2", done by caller.
205 U3copy_to_user_enter:
206 /* Is 'dst' already aligned on an 64-byte boundary? */
209 /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
210 * of bytes to copy to make 'dst' 64-byte aligned. We pre-
211 * subtract this from 'len'.
213 sub %g2, 0x40, %g2 ! A0 Group
214 sub %g0, %g2, %g2 ! A0 Group
215 sub %o2, %g2, %o2 ! A0 Group
217 /* Copy %g2 bytes from src to dst, one byte at a time. */
218 1: ldub [%o1 + 0x00], %o3 ! MS (Group)
219 add %o1, 0x1, %o1 ! A1
220 add %o0, 0x1, %o0 ! A0 Group
221 subcc %g2, 0x1, %g2 ! A1
223 bg,pt %icc, 1b ! BR Group
224 EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group
/* dst is now 64-byte aligned.  Enable the lower FPU register half,
 * save src & 7 in %g1 (consumed later by endcruft), and 8-byte-align
 * src for the faligndata pipeline via alignaddr. */
226 2: VISEntryHalf ! MS+MS
227 and %o1, 0x7, %g1 ! A1
228 ba,pt %xcc, U3copy_to_user_begin ! BR
229 alignaddr %o1, %g0, %o1 ! MS (Break-after)
/* Prefetch pipeline warm-up: enable the P-cache, issue the initial
 * #one_read prefetches, load the first 64-byte source chunk into
 * %f0-%f14, and run faligndata to queue %f16-%f26.  Then choose
 * loop1 (len > 7*64, keeps prefetching) or loop2 (no prefetch).
 * The bge guards around the prefetches keep us from prefetching past
 * the end of src; their compare instructions are elided in this excerpt.
 */
232 U3copy_to_user_begin:
234 .globl U3copy_to_user_nop_1_6
235 U3copy_to_user_nop_1_6:
/* NOTE(review): %g3 held the original dst saved at entry "for memcpy
 * return value", but is overwritten here with the DCU control register
 * contents (restored to the DCU at nop_2_3 below) -- potential
 * conflict; confirm against the full file. */
236 ldxa [%g0] ASI_DCU_CONTROL_REG, %g3
237 sethi %uhi(DCU_PE), %o3
240 stxa %o3, [%g0] ASI_DCU_CONTROL_REG ! Enable P-cache
243 prefetch [%o1 + 0x000], #one_read ! MS Group1
244 prefetch [%o1 + 0x040], #one_read ! MS Group2
/* %o4 = len rounded down to a multiple of 64 (block-copy byte count). */
245 andn %o2, (0x40 - 1), %o4 ! A0
246 prefetch [%o1 + 0x080], #one_read ! MS Group3
248 prefetch [%o1 + 0x0c0], #one_read ! MS Group4
249 ldd [%o1 + 0x000], %f0 ! MS Group5 (%f0 results at G8)
250 bge,a,pt %icc, 1f ! BR
252 prefetch [%o1 + 0x100], #one_read ! MS Group6
253 1: ldd [%o1 + 0x008], %f2 ! AX (%f2 results at G9)
255 bge,a,pt %icc, 1f ! BR
256 prefetch [%o1 + 0x140], #one_read ! MS Group7
257 1: ldd [%o1 + 0x010], %f4 ! AX (%f4 results at G10)
259 bge,a,pt %icc, 1f ! BR
261 prefetch [%o1 + 0x180], #one_read ! MS Group8
/* Interleave the remaining chunk loads with faligndata so %f16-%f26
 * hold dst-aligned data ready for the first block store. */
262 1: faligndata %f0, %f2, %f16 ! FGA Group9 (%f16 at G12)
263 ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G12)
264 faligndata %f2, %f4, %f18 ! FGA Group10 (%f18 results at G13)
265 ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G13)
266 faligndata %f4, %f6, %f20 ! FGA Group12 (1-cycle stall,%f20 at G15)
267 ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G15)
268 faligndata %f6, %f8, %f22 ! FGA Group13 (%f22 results at G16)
270 ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G16)
271 faligndata %f8, %f10, %f24 ! FGA Group15 (1-cycle stall,%f24 at G18)
272 ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18)
273 faligndata %f10, %f12, %f26 ! FGA Group16 (%f26 results at G19)
274 ldd [%o1 + 0x040], %f0 ! MS (%f0 results at G19)
276 /* We only use the first loop if len > (7 * 64). */
277 subcc %o4, 0x1c0, %o4 ! A0 Group17
278 bg,pt %icc, U3copy_to_user_loop1 ! BR
279 add %o1, 0x40, %o1 ! A1
/* Otherwise rebias %o4 and enter loop2 with %o3 = 64-byte block count. */
281 add %o4, 0x140, %o4 ! A0 Group18
282 ba,pt %xcc, U3copy_to_user_loop2 ! BR
283 srl %o4, 6, %o3 ! A0 Group19
293 /* This loop performs the copy and queues new prefetches.
294 * We drop into the second loop when len <= (5 * 64). Note
295 * that this (5 * 64) factor has been subtracted from len
/* Per iteration: load the next 64-byte src chunk into %f2-%f14/%f0,
 * block-store the previously aligned 64 bytes (%f16-%f30) to userspace
 * via stda ASI_BLK_AIUS (EXBLK1 supplies the fault fixup), faligndata
 * the fresh data into %f16-%f30, and prefetch 0x180 bytes ahead. */
298 U3copy_to_user_loop1:
299 ldd [%o1 + 0x008], %f2 ! MS Group2 (%f2 results at G5)
300 faligndata %f12, %f14, %f28 ! FGA (%f28 results at G5)
301 ldd [%o1 + 0x010], %f4 ! MS Group3 (%f4 results at G6)
302 faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall, %f30 at G7)
303 EXBLK1(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
304 ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G7)
306 faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
307 ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G15)
308 faligndata %f2, %f4, %f18 ! FGA Group13 (%f18 results at G16)
309 ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G16)
310 faligndata %f4, %f6, %f20 ! FGA Group14 (%f20 results at G17)
311 ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G17)
312 faligndata %f6, %f8, %f22 ! FGA Group15 (%f22 results at G18)
313 ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18)
315 faligndata %f8, %f10, %f24 ! FGA Group16 (%f24 results at G19)
316 ldd [%o1 + 0x040], %f0 ! AX (%f0 results at G19)
317 prefetch [%o1 + 0x180], #one_read ! MS
318 faligndata %f10, %f12, %f26 ! FGA Group17 (%f26 results at G20)
/* Advance src/dst by 64 and loop while the (pre-biased) count %o4 > 0. */
319 subcc %o4, 0x40, %o4 ! A0
320 add %o1, 0x40, %o1 ! A1
321 bg,pt %xcc, U3copy_to_user_loop1 ! BR
322 add %o0, 0x40, %o0 ! A0 Group18
324 U3copy_to_user_loop2_enter:
327 /* This loop performs on the copy, no new prefetches are
328 * queued. We do things this way so that we do not perform
329 * any spurious prefetches past the end of the src buffer.
/* Same 64-bytes-per-iteration structure as loop1, minus the prefetch.
 * Counted by %o3, the 64-byte block count computed at loop entry
 * ("srl %o4, 6, %o3" above). */
331 U3copy_to_user_loop2:
332 ldd [%o1 + 0x008], %f2 ! MS
333 faligndata %f12, %f14, %f28 ! FGA Group2
334 ldd [%o1 + 0x010], %f4 ! MS
335 faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall)
336 EXBLK2(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
337 ldd [%o1 + 0x018], %f6 ! AX
338 faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
340 ldd [%o1 + 0x020], %f8 ! MS
341 faligndata %f2, %f4, %f18 ! FGA Group13
342 ldd [%o1 + 0x028], %f10 ! MS
343 faligndata %f4, %f6, %f20 ! FGA Group14
344 ldd [%o1 + 0x030], %f12 ! MS
345 faligndata %f6, %f8, %f22 ! FGA Group15
346 ldd [%o1 + 0x038], %f14 ! MS
347 faligndata %f8, %f10, %f24 ! FGA Group16
349 ldd [%o1 + 0x040], %f0 ! AX
350 faligndata %f10, %f12, %f26 ! FGA Group17
/* One fewer block remaining; advance src/dst by 64 and loop. */
351 subcc %o3, 0x01, %o3 ! A0
352 add %o1, 0x40, %o1 ! A1
353 bg,pt %xcc, U3copy_to_user_loop2 ! BR
354 add %o0, 0x40, %o0 ! A0 Group18
356 /* Finally we copy the last full 64-byte block. */
/* Drain the software pipeline: store the two block-copy chunks still
 * queued in the FP registers (EXBLK3, then EXBLK4), then restore the
 * DCU control register and synchronize. */
357 U3copy_to_user_loopfini:
358 ldd [%o1 + 0x008], %f2 ! MS
359 faligndata %f12, %f14, %f28 ! FGA
360 ldd [%o1 + 0x010], %f4 ! MS Group19
361 faligndata %f14, %f0, %f30 ! FGA
362 EXBLK3(stda %f16, [%o0] ASI_BLK_AIUS) ! MS Group20
363 ldd [%o1 + 0x018], %f6 ! AX
364 faligndata %f0, %f2, %f16 ! FGA Group11 (7-cycle stall)
365 ldd [%o1 + 0x020], %f8 ! MS
366 faligndata %f2, %f4, %f18 ! FGA Group12
367 ldd [%o1 + 0x028], %f10 ! MS
368 faligndata %f4, %f6, %f20 ! FGA Group13
369 ldd [%o1 + 0x030], %f12 ! MS
370 faligndata %f6, %f8, %f22 ! FGA Group14
371 ldd [%o1 + 0x038], %f14 ! MS
372 faligndata %f8, %f10, %f24 ! FGA Group15
375 add %o0, 0x40, %o0 ! A1
376 ldd [%o1 + 0x040], %f0 ! MS
377 1: faligndata %f10, %f12, %f26 ! FGA Group16
378 faligndata %f12, %f14, %f28 ! FGA Group17
379 faligndata %f14, %f0, %f30 ! FGA Group18
380 EXBLK4(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
381 add %o0, 0x40, %o0 ! A0
382 add %o1, 0x40, %o1 ! A1
384 .globl U3copy_to_user_nop_2_3
385 U3copy_to_user_nop_2_3:
/* Undo the P-cache enable done at nop_1_6: %g3 holds the DCU control
 * register value saved there. */
386 mov PRIMARY_CONTEXT, %o3
387 stxa %g0, [%o3] ASI_DMMU ! Flush P-cache
388 stxa %g3, [%g0] ASI_DCU_CONTROL_REG ! Disable P-cache
390 membar #Sync ! MS Group26 (7-cycle stall)
392 /* Now we copy the (len modulo 64) bytes at the end.
393 * Note how we borrow the %f0 loaded above.
395 * Also notice how this code is careful not to perform a
396 * load past the end of the src buffer just like similar
397 * code found in U3copy_to_user_toosmall processing.
/* Copy the remaining (len % 64) bytes, 8 at a time, alternating the
 * %f0/%f2 pair through faligndata.  %g2 counts the full 8-byte words
 * (len & 0x38); any final 1-7 bytes fall through to endcruft.  The EX
 * fixup args presumably recompute bytes-not-copied on a store fault. */
399 U3copy_to_user_loopend:
400 and %o2, 0x3f, %o2 ! A0 Group
401 andcc %o2, 0x38, %g2 ! A0 Group
402 be,pn %icc, U3copy_to_user_endcruft ! BR
403 subcc %g2, 0x8, %g2 ! A1
404 be,pn %icc, U3copy_to_user_endcruft ! BR Group
407 be,a,pt %icc, 1f ! BR Group
408 ldd [%o1 + 0x00], %f0 ! MS
410 1: ldd [%o1 + 0x08], %f2 ! MS Group
411 add %o1, 0x8, %o1 ! A0
412 sub %o2, 0x8, %o2 ! A1
413 subcc %g2, 0x8, %g2 ! A0 Group
414 faligndata %f0, %f2, %f8 ! FGA Group
415 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX)
416 be,pn %icc, U3copy_to_user_endcruft ! BR
417 add %o0, 0x8, %o0 ! A0
/* Second half of the unrolled pair: roles of %f0 and %f2 swapped. */
418 ldd [%o1 + 0x08], %f0 ! MS Group
419 add %o1, 0x8, %o1 ! A0
420 sub %o2, 0x8, %o2 ! A1
421 subcc %g2, 0x8, %g2 ! A0 Group
422 faligndata %f2, %f0, %f8 ! FGA
423 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX)
425 add %o0, 0x8, %o0 ! A0 Group
427 /* If anything is left, we copy it one byte at a time.
428 * Note that %g1 is (src & 0x7) saved above before the
429 * alignaddr was performed.
/* Exit the FPU path: if nothing remains go straight to short_ret,
 * otherwise tail-jump into the byte-copy loop.  The compare and the
 * VISExitHalf that the comment block implies are elided here. */
431 U3copy_to_user_endcruft:
435 be,pn %icc, U3copy_to_user_short_ret
437 ba,a,pt %xcc, U3copy_to_user_short
439 /* If we get here, then 32 <= len < (6 * 64) */
/* Medium-copy path, FPU flavor: 8-byte-align dst by byte copy, then
 * copy 8 bytes per iteration with an alternating %f0/%f2 faligndata
 * pair (unrolled x2), then hand any 1-7 leftover bytes to short. */
440 U3copy_to_user_toosmall:
442 #ifdef SMALL_COPY_USES_FPU
444 /* Is 'dst' already aligned on an 8-byte boundary? */
445 be,pt %xcc, 2f ! BR Group
447 /* Compute abs((dst & 7) - 8) into %g2. This is the number
448 * of bytes to copy to make 'dst' 8-byte aligned. We pre-
449 * subtract this from 'len'.
451 sub %g2, 0x8, %g2 ! A0
452 sub %g0, %g2, %g2 ! A0 Group (reg-dep)
453 sub %o2, %g2, %o2 ! A0 Group (reg-dep)
455 /* Copy %g2 bytes from src to dst, one byte at a time. */
456 1: ldub [%o1 + 0x00], %o3 ! MS (Group) (%o3 in 3 cycles)
457 add %o1, 0x1, %o1 ! A1
458 add %o0, 0x1, %o0 ! A0 Group
459 subcc %g2, 0x1, %g2 ! A1
461 bg,pt %icc, 1b ! BR Group
462 EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group
464 2: VISEntryHalf ! MS+MS
466 /* Compute (len - (len % 8)) into %g2. This is guaranteed
469 andn %o2, 0x7, %g2 ! A0 Group
471 /* You may read this and believe that it allows reading
472 * one 8-byte longword past the end of src. It actually
473 * does not, as %g2 is subtracted as loads are done from
474 * src, so we always stop before running off the end.
475 * Also, we are guaranteed to have at least 0x10 bytes
/* Prime the pipeline: %g1 = 8-byte-aligned src pointer, %f0 = first
 * longword.  %g2 pre-decremented by 8 so the loop stops one load early
 * (see the comment above about not overrunning src). */
478 sub %g2, 0x8, %g2 ! A0 Group (reg-dep)
479 alignaddr %o1, %g0, %g1 ! MS (Break-after)
480 ldd [%g1 + 0x00], %f0 ! MS Group (1-cycle stall)
481 add %g1, 0x8, %g1 ! A0
483 1: ldd [%g1 + 0x00], %f2 ! MS Group
484 add %g1, 0x8, %g1 ! A0
485 sub %o2, 0x8, %o2 ! A1
486 subcc %g2, 0x8, %g2 ! A0 Group
488 faligndata %f0, %f2, %f8 ! FGA Group (1-cycle stall)
489 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall)
490 add %o1, 0x8, %o1 ! A0
493 add %o0, 0x8, %o0 ! A1
/* Second half of the unrolled pair: %f0 and %f2 roles swapped. */
494 ldd [%g1 + 0x00], %f0 ! MS Group
495 add %g1, 0x8, %g1 ! A0
496 sub %o2, 0x8, %o2 ! A1
498 subcc %g2, 0x8, %g2 ! A0 Group
499 faligndata %f2, %f0, %f8 ! FGA Group (1-cycle stall)
500 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall)
501 add %o1, 0x8, %o1 ! A0
504 add %o0, 0x8, %o0 ! A1
506 /* Nothing left to copy? */
507 2: cmp %o2, 0 ! A0 Group
509 be,pn %icc, U3copy_to_user_short_ret ! BR Group
511 ba,a,pt %xcc, U3copy_to_user_short ! BR Group
513 #else /* !(SMALL_COPY_USES_FPU) */
/* Integer-only medium-copy variant: byte-align dst (loop 1), copy
 * 8 bytes at a time with ldx/stxa (loop 3), finish leftovers via
 * short.  NOTE(review): heavily elided in this excerpt -- the
 * alignment computation and loop bookkeeping instructions between
 * these lines are not visible. */
517 bne,pn %icc, U3copy_to_user_short
525 1: ldub [%o1 + 0x00], %o3
530 EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2)
532 2: andn %o2, 0x7, %g2
535 3: ldx [%o1 + 0x00], %o3
540 EXNV3(stxa %o3, [%o0 + -8] %asi, add %o2, %g2)
543 bne,pn %icc, U3copy_to_user_short
545 ba,a,pt %xcc, U3copy_to_user_short_ret
547 #endif /* !(SMALL_COPY_USES_FPU) */