arch/sparc64/lib/U3memcpy.S

   1 /* U3memcpy.S: UltraSparc-III optimized memcpy.
   2  *
   3  * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
   4  */
   5
   6 #ifdef __KERNEL__
   7 #include <asm/visasm.h>
   8 #include <asm/asi.h>
   9 #include <asm/dcu.h>
  10 #include <asm/spitfire.h>
  11 #else
  12 #define ASI_BLK_P 0xf0  /* ASI passed to the stda block stores in the copy loop */
  13 #define FPRS_FEF  0x04  /* %fprs value/mask used by the two macros below */
  14 #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs  /* save %fprs in %o5, then set %fprs = FPRS_FEF */
  15 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs  /* write back the saved %fprs, masked with FPRS_FEF */
  16 #endif
  17
  18 #ifndef XCC
  19 #define XCC xcc
  20 #endif
  21
  22         .register       %g2,#scratch
  23         .register       %g3,#scratch
  24
  25         /* Special/non-trivial issues of this code:
  26          *
  27          * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
  28          * 2) Only low 32 FPU registers are used so that only the
  29          *    lower half of the FPU register set is dirtied by this
  30          *    code.  This is especially important in the kernel.
  31          * 3) This code never prefetches cachelines past the end
  32          *    of the source buffer.
  33          */
  34
  35         .text
  36         .align  32
  37
  38         /* The cheetah's flexible spine, oversized liver, enlarged heart,
  39          * slender muscular body, and claws make it the swiftest hunter
  40          * in Africa and the fastest animal on land.  Can reach speeds
  41          * of up to 2.4GB per second.
  42          */
  43
  44         .globl  U3memcpy
  45 U3memcpy:       /* %o0=dst, %o1=src, %o2=len; returns the original dst in %o0 */
  46         mov             %o0, %g5        /* %g5 = dst, moved back into %o0 on exit */
  47         cmp             %o2, 0
  48         be,pn           %XCC, 85f       /* len == 0: nothing to copy */
  49          or             %o0, %o1, %o3   /* delay slot: %o3 = dst | src */
  50         cmp             %o2, 16
  51         bleu,a,pn       %XCC, 80f       /* len <= 16: word/byte copy at 80 */
  52          or             %o3, %o2, %o3   /* annulled unless taken: %o3 = dst | src | len, tested with 0x3 at 80 */
  53
  54         cmp             %o2, 256
  55         blu,pt          %XCC, 70f       /* 16 < len < 256: doubleword copy at 70 */
  56          andcc          %o3, 0x7, %g0   /* delay slot: cc = (dst | src) & 7, consumed by the bne at 70 */
  57
  58         ba,pt           %xcc, 1f        /* len >= 256: VIS block copy */
  59          andcc          %o0, 0x3f, %g2  /* delay slot: %g2 = dst & 0x3f, cc consumed at 1f */
  60
  61         /* Here len >= 256 and condition codes reflect execution
  62          * of "andcc %o0, 0x3f, %g2", done by caller.
  63          */
  64         .align          64
  65 1:
  66         /* Is 'dst' already aligned on a 64-byte boundary? */
  67         be,pt           %XCC, 2f
  68
  69         /* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
  70          * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
  71          * subtract this from 'len'.
  72          */
  73          sub            %g2, 0x40, %g2  /* delay slot of the be above: runs whether or not it is taken */
  74         sub             %g0, %g2, %g2   /* %g2 = 0x40 - (dst & 0x3f) */
  75         sub             %o2, %g2, %o2   /* len -= %g2 */
  76
  77         /* Copy %g2 bytes from src to dst, one byte at a time. */
  78 1:      ldub            [%o1 + 0x00], %o3
  79         add             %o1, 0x1, %o1
  80         add             %o0, 0x1, %o0
  81         subcc           %g2, 0x1, %g2
  82
  83         bg,pt           %XCC, 1b
  84          stb            %o3, [%o0 + -1] /* delay slot: dst was already advanced, hence the -1 */
  85
  86 2:      VISEntryHalf                    /* %o5 must now stay untouched until VISExitHalf */
  87         and             %o1, 0x7, %g1   /* %g1 = src & 7, saved before alignaddr modifies %o1 */
  88         ba,pt           %xcc, 1f
  89          alignaddr      %o1, %g0, %o1   /* delay slot: %o1 = src rounded down to 8 bytes; offset kept for faligndata */
  90
  91         .align          64
  92 1:
  93         membar          #StoreLoad | #StoreStore | #LoadStore
  94         prefetch        [%o1 + 0x000], #one_read
  95         prefetch        [%o1 + 0x040], #one_read
  96         andn            %o2, (0x40 - 1), %o4    /* %o4 = len rounded down to a multiple of 64 */
  97         prefetch        [%o1 + 0x080], #one_read
  98         prefetch        [%o1 + 0x0c0], #one_read
  99         ldd             [%o1 + 0x000], %f0      /* start filling %f0-%f14 with the first 64 source bytes */
 100         prefetch        [%o1 + 0x100], #one_read
 101         ldd             [%o1 + 0x008], %f2
 102         prefetch        [%o1 + 0x140], #one_read
 103         ldd             [%o1 + 0x010], %f4
 104         prefetch        [%o1 + 0x180], #one_read
 105         faligndata      %f0, %f2, %f16          /* realigned output accumulates in %f16-%f30 */
 106         ldd             [%o1 + 0x018], %f6
 107         faligndata      %f2, %f4, %f18
 108         ldd             [%o1 + 0x020], %f8
 109         faligndata      %f4, %f6, %f20
 110         ldd             [%o1 + 0x028], %f10
 111         faligndata      %f6, %f8, %f22
 112
 113         ldd             [%o1 + 0x030], %f12
 114         faligndata      %f8, %f10, %f24
 115         ldd             [%o1 + 0x038], %f14
 116         faligndata      %f10, %f12, %f26
 117         ldd             [%o1 + 0x040], %f0      /* first doubleword of the next 64-byte chunk */
 118
 119         sub             %o4, 0x80, %o4          /* the two block stores after the loop cover the last 0x80 bytes */
 120         add             %o1, 0x40, %o1
 121         ba,pt           %xcc, 1f
 122          srl            %o4, 6, %o3             /* delay slot: %o3 = number of loop iterations */
 123
 124         .align          64
 125 1:
 126         ldd             [%o1 + 0x008], %f2
 127         faligndata      %f12, %f14, %f28        /* finish the output block started on the previous pass */
 128         ldd             [%o1 + 0x010], %f4
 129         faligndata      %f14, %f0, %f30
 130         stda            %f16, [%o0] ASI_BLK_P   /* block store of %f16-%f30 to dst */
 131         ldd             [%o1 + 0x018], %f6
 132         faligndata      %f0, %f2, %f16          /* begin building the next output block */
 133
 134         ldd             [%o1 + 0x020], %f8
 135         faligndata      %f2, %f4, %f18
 136         ldd             [%o1 + 0x028], %f10
 137         faligndata      %f4, %f6, %f20
 138         ldd             [%o1 + 0x030], %f12
 139         faligndata      %f6, %f8, %f22
 140         ldd             [%o1 + 0x038], %f14
 141         faligndata      %f8, %f10, %f24
 142
 143         ldd             [%o1 + 0x040], %f0
 144         prefetch        [%o1 + 0x180], #one_read
 145         faligndata      %f10, %f12, %f26
 146         subcc           %o3, 0x01, %o3
 147         add             %o1, 0x40, %o1
 148         bg,pt           %XCC, 1b
 149          add            %o0, 0x40, %o0          /* delay slot: advance dst by one block */
 150
 151         /* Finally we copy the last full 64-byte block. */
 152         ldd             [%o1 + 0x008], %f2
 153         faligndata      %f12, %f14, %f28
 154         ldd             [%o1 + 0x010], %f4
 155         faligndata      %f14, %f0, %f30
 156         stda            %f16, [%o0] ASI_BLK_P   /* store the block left pending by the loop */
 157         ldd             [%o1 + 0x018], %f6
 158         faligndata      %f0, %f2, %f16
 159         ldd             [%o1 + 0x020], %f8
 160         faligndata      %f2, %f4, %f18
 161         ldd             [%o1 + 0x028], %f10
 162         faligndata      %f4, %f6, %f20
 163         ldd             [%o1 + 0x030], %f12
 164         faligndata      %f6, %f8, %f22
 165         ldd             [%o1 + 0x038], %f14
 166         faligndata      %f8, %f10, %f24
 167         cmp             %g1, 0
 168         be,pt           %XCC, 1f                /* (src & 7) == 0: skip the load of the following doubleword */
 169          add            %o0, 0x40, %o0          /* delay slot: advance dst */
 170         ldd             [%o1 + 0x040], %f0      /* only reached when src was not 8-byte aligned */
 171 1:      faligndata      %f10, %f12, %f26
 172         faligndata      %f12, %f14, %f28
 173         faligndata      %f14, %f0, %f30
 174         stda            %f16, [%o0] ASI_BLK_P   /* final block store */
 175         add             %o0, 0x40, %o0
 176         add             %o1, 0x40, %o1
 177         membar          #Sync
 178
 179         /* Now we copy the (len modulo 64) bytes at the end.
 180          * Note how we borrow the %f0 loaded above.
 181          *
 182          * Also notice how this code is careful not to perform a
 183          * load past the end of the src buffer.
 184          */
 185         and             %o2, 0x3f, %o2          /* %o2 = len modulo 64 */
 186         andcc           %o2, 0x38, %g2          /* %g2 = bytes of whole doublewords left */
 187         be,pn           %XCC, 2f                /* none: go straight to the sub-doubleword tail */
 188          subcc          %g2, 0x8, %g2           /* delay slot: hold one doubleword back for the code at 2 */
 189         be,pn           %XCC, 2f
 190          cmp            %g1, 0                  /* delay slot: was src 8-byte aligned? */
 191
 192         be,a,pt         %XCC, 1f
 193          ldd            [%o1 + 0x00], %f0       /* annulled unless taken: aligned src skipped the %f0 preload above */
 194
 195 1:      ldd             [%o1 + 0x08], %f2
 196         add             %o1, 0x8, %o1
 197         sub             %o2, 0x8, %o2
 198         subcc           %g2, 0x8, %g2
 199         faligndata      %f0, %f2, %f8
 200         std             %f8, [%o0 + 0x00]
 201         be,pn           %XCC, 2f                /* cc from the subcc above */
 202          add            %o0, 0x8, %o0
 203         ldd             [%o1 + 0x08], %f0       /* second half of the unrolled pair: %f0/%f2 swap roles */
 204         add             %o1, 0x8, %o1
 205         sub             %o2, 0x8, %o2
 206         subcc           %g2, 0x8, %g2
 207         faligndata      %f2, %f0, %f8
 208         std             %f8, [%o0 + 0x00]
 209         bne,pn          %XCC, 1b
 210          add            %o0, 0x8, %o0
 211
 212         /* If anything is left, we copy it one byte at a time.
 213          * Note that %g1 is (src & 0x7) saved above before the
 214          * alignaddr was performed.
 215          */
 216 2:
 217         cmp             %o2, 0
 218         add             %o1, %g1, %o1           /* add back the low src bits saved in %g1 */
 219         VISExitHalf                             /* the be below relies on this leaving the cc from cmp intact */
 220         be,pn           %XCC, 85f               /* nothing left: return */
 221          sub            %o0, %o1, %o3           /* delay slot: %o3 = dst - src, so [%o1 + %o3] addresses dst */
 222
 223         andcc           %g1, 0x7, %g0
 224         bne,pn          %icc, 90f               /* src not 8-byte aligned: finish in the byte loop */
 225          andcc          %o2, 0x8, %g0           /* delay slot: is an 8-byte piece pending? */
 226         be,pt           %icc, 1f
 227          nop
 228         ldx             [%o1], %o5
 229         stx             %o5, [%o1 + %o3]
 230         add             %o1, 0x8, %o1
 231
 232 1:      andcc           %o2, 0x4, %g0           /* 4-byte piece pending? */
 233         be,pt           %icc, 1f
 234          nop
 235         lduw            [%o1], %o5
 236         stw             %o5, [%o1 + %o3]
 237         add             %o1, 0x4, %o1
 238
 239 1:      andcc           %o2, 0x2, %g0           /* 2-byte piece pending? */
 240         be,pt           %icc, 1f
 241          nop
 242         lduh            [%o1], %o5
 243         sth             %o5, [%o1 + %o3]
 244         add             %o1, 0x2, %o1
 245
 246 1:      andcc           %o2, 0x1, %g0           /* final single byte pending? */
 247         be,pt           %icc, 85f
 248          nop
 249         ldub            [%o1], %o5
 250         ba,pt           %xcc, 85f
 251          stb            %o5, [%o1 + %o3]        /* delay slot: store the last byte on the way out */
 252
 253 70: /* 16 < len <= 64 -- NOTE(review): the entry code compares len against 16 and 256, never 64; confirm this range */
 254         bne,pn          %XCC, 90f       /* cc were set by the code that branched here; non-zero: byte loop */
 255          sub            %o0, %o1, %o3   /* delay slot: %o3 = dst - src, so [%o1 + %o3] addresses dst */
 256
 257         andn            %o2, 0x7, %o4   /* %o4 = bytes to move as doublewords */
 258         and             %o2, 0x7, %o2   /* %o2 = leftover bytes (0-7) */
 259 1:      subcc           %o4, 0x8, %o4
 260         ldx             [%o1], %o5
 261         stx             %o5, [%o1 + %o3]
 262         bgu,pt          %XCC, 1b        /* repeat while doublewords remain (cc from the subcc) */
 263          add            %o1, 0x8, %o1
 264         andcc           %o2, 0x4, %g0   /* 4-byte piece in the leftover? */
 265         be,pt           %XCC, 1f
 266          nop
 267         sub             %o2, 0x4, %o2
 268         lduw            [%o1], %o5
 269         stw             %o5, [%o1 + %o3]
 270         add             %o1, 0x4, %o1
 271 1:      cmp             %o2, 0
 272         be,pt           %XCC, 85f       /* nothing left: return */
 273          nop
 274         ba,pt           %xcc, 90f       /* remaining 1-3 bytes go through the byte loop */
 275          nop
 276
 277 80: /* 0 < len <= 16 */
 278         andcc           %o3, 0x3, %g0   /* low two bits of the OR value built in %o3 by the entry code */
 279         bne,pn          %XCC, 90f       /* any bit set: byte loop */
 280          sub            %o0, %o1, %o3   /* delay slot: %o3 = dst - src, so [%o1 + %o3] addresses dst */
 281
 282 1:
 283         subcc           %o2, 4, %o2     /* moves whole words only: assumes %o2 is a multiple of 4 here */
 284         lduw            [%o1], %g1
 285         stw             %g1, [%o1 + %o3]
 286         bgu,pt          %XCC, 1b        /* repeat while bytes remain (cc from the subcc) */
 287          add            %o1, 4, %o1
 288
 289 85:     retl                            /* common exit */
 290          mov            %g5, %o0        /* delay slot: return the dst saved in %g5 at entry */
 291
 292         .align  32
 293 90:                                     /* byte loop: expects %o2 > 0 and %o3 = dst - src */
 294         subcc           %o2, 1, %o2
 295         ldub            [%o1], %g1
 296         stb             %g1, [%o1 + %o3]
 297         bgu,pt          %XCC, 90b       /* repeat while bytes remain (cc from the subcc) */
 298          add            %o1, 1, %o1
 299         retl
 300          mov            %g5, %o0        /* delay slot: return the dst saved in %g5 at entry */
 301
 302         /* Stub for copy_{to,in}_user() when %asi turns out to be
 303          * kernel space: run U3memcpy in a fresh register window and
 304          * hand back zero rather than the destination pointer.
 305          */
 306         .globl  U3memcpy_user_stub
 307 U3memcpy_user_stub:
 308         save            %sp, -192, %sp  /* new window: caller's %o0-%o2 are now %i0-%i2 */
 309         mov             %i2, %o2        /* len */
 310         mov             %i1, %o1        /* src */
 311         call            U3memcpy
 312          mov            %i0, %o0        /* delay slot: dst */
 313         ret
 314          restore        %g0, %g0, %o0   /* delay slot: caller sees %o0 = 0 */