1 /* U3copy_to_user.S: UltraSparc-III optimized memcpy.
3 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
6 #include <asm/visasm.h>
9 #include <asm/spitfire.h>
/* Exception-table helpers (bodies partially elided in this excerpt).
 * Each EX*() macro wraps a userland access "x" and emits a __ex_table
 * entry so that a fault while touching user memory branches to fixup
 * code instead of oopsing; the trailing operands appear to compute the
 * count of bytes not yet copied, per the copy_to_user return contract.
 * NOTE(review): fixup labels/branches are elided here — confirm the
 * exact fixup contract against the complete file.
 */
13 #define EXNV(x,y,a,b) \
19 .section __ex_table; /* record faulting PC -> fixup entry */ \
24 #define EXNV2(x,y,a,b) \
31 .section __ex_table; \
36 #define EXNV3(x,y,a,b) \
43 .section __ex_table; \
48 #define EXNV4(x,y,a,b) \
55 .section __ex_table; \
67 .section __ex_table; \
77 add %o4, 0x1c0, %o1; /* presumably bytes-remaining incl. 0x1c0 of prefetch lead — verify */ \
78 and %o2, (0x40 - 1), %o2; /* keep len % 64 (sub-block remainder) */ \
81 .section __ex_table; \
92 and %o2, (0x40 - 1), %o2; /* keep len % 64 */ \
96 .section __ex_table; \
101 #define EXBLK3(x,y) \
106 and %o2, (0x40 - 1), %o2; /* keep len % 64 */ \
108 add %o2, 0x80, %o0; /* plus two 64-byte blocks still in flight — verify */ \
109 .section __ex_table; \
114 #define EXBLK4(x,y) \
119 and %o2, (0x40 - 1), %o2; /* keep len % 64 */ \
121 add %o2, 0x40, %o0; /* plus one 64-byte block still in flight — verify */ \
122 .section __ex_table; \
128 .register %g2,#scratch
129 .register %g3,#scratch
131 /* Special/non-trivial issues of this code:
133 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
134 * 2) Only low 32 FPU registers are used so that only the
135 * lower half of the FPU register set is dirtied by this
136 * code. This is especially important in the kernel.
137 * 3) This code never prefetches cachelines past the end
138 * of the source buffer.
144 /* The cheetah's flexible spine, oversized liver, enlarged heart,
145 * slender muscular body, and claws make it the swiftest hunter
146 * in Africa and the fastest animal on land. Can reach speeds
147 * of up to 2.4GB per second.
/* U3copy_to_user: UltraSPARC-III copy of 'len' bytes from kernel 'src'
 * to userland 'dst'.  Bulk data moves through the lower half of the FPU
 * register file using VIS alignaddr/faligndata and 64-byte block stores
 * (ASI_BLK_AIUS); every store that touches userland goes through an
 * EX*() wrapper so a fault is fixed up via __ex_table.
 * NOTE(review): many lines are elided in this excerpt; register-role
 * comments below are inferred from visible code and should be confirmed
 * against the complete file.
 */
150 .globl U3copy_to_user
151 U3copy_to_user: /* %o0=dst, %o1=src, %o2=len */
152 /* Writing to %asi is _expensive_ so we hardcode it.
153 * Reading %asi to check for KERNEL_DS is comparatively
158 bne,pn %icc, U3memcpy_user_stub	! non-default ASI: fall back to stub copy
175 /* Here len >= 256 and condition codes reflect execution
176 * of "andcc %o0, 0x7, %g2", done by caller.
180 /* Is 'dst' already aligned on an 64-byte boundary? */
183 /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
184 * of bytes to copy to make 'dst' 64-byte aligned. We pre-
185 * subtract this from 'len'.
191 /* Copy %g2 bytes from src to dst, one byte at a time. */
192 1: ldub [%o1 + 0x00], %o3	! byte loop until dst hits a 64-byte boundary
198 EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2)	! user store, fault -> fixup
203 alignaddr %o1, %g0, %o1	! set GSR.align from src offset; %o1 rounded down
/* Drain prior stores before the block-store pipeline, then prime it:
 * prefetch several cachelines ahead and load the first 64 bytes of src
 * into %f0-%f14. */
207 membar #StoreLoad | #StoreStore | #LoadStore
208 prefetch [%o1 + 0x000], #one_read
209 prefetch [%o1 + 0x040], #one_read
210 andn %o2, (0x40 - 1), %o4	! %o4 = len rounded down to whole 64-byte blocks
211 prefetch [%o1 + 0x080], #one_read
212 prefetch [%o1 + 0x0c0], #one_read
213 ldd [%o1 + 0x000], %f0
214 prefetch [%o1 + 0x100], #one_read
215 ldd [%o1 + 0x008], %f2
216 prefetch [%o1 + 0x140], #one_read
217 ldd [%o1 + 0x010], %f4
218 prefetch [%o1 + 0x180], #one_read
219 faligndata %f0, %f2, %f16	! merge adjacent dwords per GSR.align -> aligned stream
220 ldd [%o1 + 0x018], %f6
221 faligndata %f2, %f4, %f18
222 ldd [%o1 + 0x020], %f8
223 faligndata %f4, %f6, %f20
224 ldd [%o1 + 0x028], %f10
225 faligndata %f6, %f8, %f22
/* Main loop body (head visible here): load the NEXT cacheline into
 * %f0-%f14 while %f16-%f30 collect the realigned previous 64 bytes. */
227 ldd [%o1 + 0x030], %f12
228 faligndata %f8, %f10, %f24
229 ldd [%o1 + 0x038], %f14
230 faligndata %f10, %f12, %f26
231 ldd [%o1 + 0x040], %f0	! first dword of the next block
240 ldd [%o1 + 0x008], %f2
241 faligndata %f12, %f14, %f28
242 ldd [%o1 + 0x010], %f4
243 faligndata %f14, %f0, %f30
244 EXBLK2(stda %f16, [%o0] ASI_BLK_AIUS)	! 64-byte block store to userland
245 ldd [%o1 + 0x018], %f6
246 faligndata %f0, %f2, %f16
248 ldd [%o1 + 0x020], %f8
249 faligndata %f2, %f4, %f18
250 ldd [%o1 + 0x028], %f10
251 faligndata %f4, %f6, %f20
252 ldd [%o1 + 0x030], %f12
253 faligndata %f6, %f8, %f22
254 ldd [%o1 + 0x038], %f14
255 faligndata %f8, %f10, %f24
257 ldd [%o1 + 0x040], %f0
258 prefetch [%o1 + 0x180], #one_read	! never prefetches past src end (see file header)
259 faligndata %f10, %f12, %f26
265 /* Finally we copy the last full 64-byte block. */
266 ldd [%o1 + 0x008], %f2
267 faligndata %f12, %f14, %f28
268 ldd [%o1 + 0x010], %f4
269 faligndata %f14, %f0, %f30
270 EXBLK3(stda %f16, [%o0] ASI_BLK_AIUS)
271 ldd [%o1 + 0x018], %f6
272 faligndata %f0, %f2, %f16
273 ldd [%o1 + 0x020], %f8
274 faligndata %f2, %f4, %f18
275 ldd [%o1 + 0x028], %f10
276 faligndata %f4, %f6, %f20
277 ldd [%o1 + 0x030], %f12
278 faligndata %f6, %f8, %f22
279 ldd [%o1 + 0x038], %f14
280 faligndata %f8, %f10, %f24
284 ldd [%o1 + 0x040], %f0
285 1: faligndata %f10, %f12, %f26
286 faligndata %f12, %f14, %f28
287 faligndata %f14, %f0, %f30
288 EXBLK4(stda %f16, [%o0] ASI_BLK_AIUS)	! store the final full block
294 /* Now we copy the (len modulo 64) bytes at the end.
295 * Note how we borrow the %f0 loaded above.
297 * Also notice how this code is careful not to perform a
298 * load past the end of the src buffer.
308 ldd [%o1 + 0x00], %f0
310 1: ldd [%o1 + 0x08], %f2	! 8 bytes per iteration over the < 64-byte tail
314 faligndata %f0, %f2, %f8
315 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8)
318 ldd [%o1 + 0x08], %f0	! ping-pong %f0/%f2 so only one new load per step
322 faligndata %f2, %f0, %f8
323 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8)
327 /* If anything is left, we copy it one byte at a time.
328 * Note that %g1 is (src & 0x3) saved above before the
329 * alignaddr was performed.
/* NOTE(review): the tails below store via [%o1 + %o3] ASI_AIUS, i.e.
 * %o1 indexes the *destination* here — registers were reshuffled by
 * code elided from this excerpt; confirm roles against the full file. */
344 EXNV(stxa %o5, [%o1 + %o3] ASI_AIUS, add %o2, %g0)
347 1: andcc %o2, 0x4, %g0	! a 4-byte word still pending?
351 EXNV(stwa %o5, [%o1 + %o3] ASI_AIUS, and %o2, 0x7)
354 1: andcc %o2, 0x2, %g0	! a halfword still pending?
358 EXNV(stha %o5, [%o1 + %o3] ASI_AIUS, and %o2, 0x3)
361 1: andcc %o2, 0x1, %g0	! a final byte?
366 EXNV(stba %o5, [%o1 + %o3] ASI_AIUS, and %o2, 0x1)
368 70: /* 16 < len <= 64 */
374 1: subcc %o4, 0x8, %o4	! 8 bytes per loop iteration
376 EXNV4(stxa %o5, [%o1 + %o3] ASI_AIUS, add %o2, %o4)
384 EXNV3(stwa %o5, [%o1 + %o3] ASI_AIUS, add %o2, %g0)
392 80: /* 0 < len <= 16 */
400 EXNV3(stwa %g1, [%o1 + %o3] ASI_AIUS, add %o2, %g0)
411 EXNV2(stba %g1, [%o1 + %o3] ASI_AIUS, add %o2, %g0)