arch/sh64/lib/copy_user_memcpy.S

   1 !
   2 ! Fast SH memcpy
   3 !
   4 ! by Toshiyasu Morita (tm@netcom.com)
   5 ! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
   6 ! SH5 code Copyright 2002 SuperH Ltd.
   7 !
   8 ! Entry: ARG0: destination pointer
   9 !        ARG1: source pointer
  10 !        ARG2: byte count
  11 !
  12 ! Exit:  RESULT: destination pointer
  13 !        any other registers in the range r0-r7: trashed
  14 !
  15 ! Notes: Usually one wants to do small reads and write a longword, but
  16 !        unfortunately it is difficult in some cases to concatenate bytes
  17 !        into a longword on the SH, so this does a longword read and small
  18 !        writes.
  19 !
  20 ! This implementation makes two assumptions about how it is called:
  21 !
  22 ! 1.: If the byte count is nonzero, the address of the last byte to be
  23 !     copied is unsigned greater than the address of the first byte to
  24 !     be copied.  This could be easily swapped for a signed comparison,
  25 !     but the algorithm used needs some comparison.
  26 !
  27 ! 2.: When there are two or three bytes in the last word of an 11-or-more
  28 !     bytes memory chunk to be copied, the rest of the word can be read
  29 !     without side effects.
  30 !     This could be easily changed by increasing the minimum size of
  31 !     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
  32 !     however, this would cost a few extra cycles on average.
  33 !     For SHmedia, the assumption is that any quadword can be read in its
  34 !     entirety if at least one byte is included in the copy.
  35
  36 /* Imported into Linux kernel by Richard Curnow.  This is used to implement the
  37    __copy_user function in the general case, so it has to be a distinct
  38    function from intra-kernel memcpy to allow for exception fix-ups in the
  39    event that the user pointer is bad somewhere in the copy (e.g. due to
  40    running off the end of the vma).
  41
  42    Note, this algorithm will be slightly wasteful in the case where the source
  43    and destination pointers are equally aligned, because the stlo/sthi pairs
  44    could then be merged back into single stores.  If there are a lot of cache
  45    misses, this is probably offset by the stall lengths on the preloads.
  46
  47 */
  48
  49         .section .text..SHmedia32,"ax"
  50         .little
  51         .balign 32
  52         .global copy_user_memcpy
  53         .global copy_user_memcpy_end
  54 copy_user_memcpy: /* entry: picks the Large path or one of the small-size slots that follow L1 */
  55 /* Register use visible below: r2 = destination, r3 = source, r4 = byte count, r18 = return target (read by ptabs). r2 is never written, so it still holds the destination on return. */
  56 #define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
  57 #define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
  58 #define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
  59 #define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
  60 /* The macros above pair a lo/hi access at offsets O and O+7 (quadword) or O+3 (longword); users of the load forms OR D0 and D1 together to obtain one unaligned value. STUAQ and STUAL are defined but not invoked in the code shown here. */
  61         ld.b r3,0,r63 /* read the first source byte into r63; the value is not used (presumably a preload, r63 being the SHmedia always-zero register - confirm) */
  62         pta/l Large,tr0 /* tr0 = Large */
  63         movi 25,r0 /* threshold between the small-size slots and the Large path */
  64         bgeu/u r4,r0,tr0 /* count >= 25 (unsigned): go to Large */
  65         nsb r4,r0 /* r0 = nsb(count); for counts 0..24 this is larger for smaller counts and selects the size class */
  66         shlli r0,5,r0 /* scale by 32: each size class owns a 32-byte slot */
  67         movi (L1-L0+63*32 + 1) & 0xffff,r1 /* distance from L0 to L1, biased by 63 slots; NOTE(review): the +1 presumably sets the SHmedia mode bit of the branch target - confirm */
  68         sub r1, r0, r0 /* r0 = (L1 - L0) + (63 - nsb) * 32 + 1 */
  69 L0:     ptrel r0,tr0 /* tr0 = L0 + r0: 0 -> L1, 1 -> L1+32, 2..3 -> L1+64, 4..7 -> L1+96, 8..15 -> L1+128, 16..24 -> L1+160 */
  70         add r2,r4,r5 /* r5 = destination end (dst + count) */
  71         ptabs r18,tr1 /* tr1 = return target taken from r18; every small-size path ends with blink tr1 */
  72         add r3,r4,r6 /* r6 = source end (src + count) */
  73         blink tr0,r63 /* jump to the selected slot; the link value is discarded into r63 */
  74
  75 /* Rearranged to make cut2 safe */
  76         .balign 8
  77 L4_7:   /* 4..7 byte memcpy cntd. */
  78         stlo.l r2, 0, r0 /* completes the unaligned store of the first 4 bytes (the matching sthi.l is issued in the 4..7 slot) */
  79         or r6, r7, r6 /* merge the ldhi (r6) and ldlo (r7) halves of the last 4 source bytes */
  80         sthi.l r5, -1, r6 /* unaligned store of those bytes to dst_end-4 .. dst_end-1 */
  81         stlo.l r5, -4, r6
  82         blink tr1,r63 /* return through tr1 (set from r18 in the entry code) */
  83 /* Slot table: the entry code jumps to L1 + 32 * size_class. Each slot from L1 up to the 8..15 case is 8 instructions long; L2_3 and L8_15 occupy the space left over by the short 0-byte and 1-byte slots. Re-check the offsets before adding or removing any instruction. */
  84         .balign 8
  85 L1:     /* 0 byte memcpy */
  86         nop
  87         blink tr1,r63 /* nothing to copy: return */
  88         nop /* this nop and the three after it pad the slot: 6 instructions here plus 2 in L2_3 make 8 */
  89         nop
  90         nop
  91         nop
  92
  93 L2_3:   /* 2 or 3 byte memcpy cntd. */
  94         st.b r5,-1,r6 /* store the last source byte (loaded into r6 by the 2-or-3 slot) at dst_end-1 */
  95         blink tr1,r63 /* return */
  96
  97         /* 1 byte memcpy */ /* slot at L1+32: 3 instructions here plus 5 in L8_15 make 8 */
  98         ld.b r3,0,r0 /* r0 = the single source byte */
  99         st.b r2,0,r0 /* store it at dst */
 100         blink tr1,r63 /* return */
 101
 102 L8_15:  /* 8..15 byte memcpy cntd. */
 103         stlo.q r2, 0, r0 /* completes the unaligned store of the first 8 bytes (the matching sthi.q is issued in the 8..15 slot) */
 104         or r6, r7, r6 /* merge the ldhi (r6) and ldlo (r7) halves of the last 8 source bytes */
 105         sthi.q r5, -1, r6 /* unaligned store of those bytes to dst_end-8 .. dst_end-1; overlaps the first store when count < 16 */
 106         stlo.q r5, -8, r6
 107         blink tr1,r63 /* return */
 108
 109         /* 2 or 3 byte memcpy */ /* slot at L1+64, exactly 8 instructions */
 110         ld.b r3,0,r0 /* r0 = source byte 0 */
 111         ld.b r2,0,r63 /* read the first destination byte; value discarded into r63 */
 112         ld.b r3,1,r1 /* r1 = source byte 1 */
 113         st.b r2,0,r0 /* dst[0] = byte 0 */
 114         pta/l L2_3,tr0 /* continuation stores the final byte */
 115         ld.b r6,-1,r6 /* r6 = last source byte (src_end-1); for count 2 this is byte 1 again */
 116         st.b r2,1,r1 /* dst[1] = byte 1 */
 117         blink tr0, r63 /* continue at L2_3 */
 118
 119         /* 4 .. 7 byte memcpy */ /* slot at L1+96: 2 macro instructions plus 6 make 8 */
 120         LDUAL (r3, 0, r0, r1) /* r0, r1 = lo/hi halves of the first 4 source bytes */
 121         pta L4_7, tr0 /* continuation finishes both stores */
 122         ldlo.l r6, -4, r7 /* r7 = lo half of the last 4 source bytes (src_end-4 ..) */
 123         or r0, r1, r0 /* r0 = first 4 source bytes */
 124         sthi.l r2, 3, r0 /* hi half of the store to dst .. dst+3; L4_7 issues the lo half */
 125         ldhi.l r6, -1, r6 /* r6 = hi half of the last 4 source bytes; r6 no longer holds src_end */
 126         blink tr0, r63 /* continue at L4_7 */
 127
 128         /* 8 .. 15 byte memcpy */ /* slot at L1+128: 2 macro instructions plus 6 make 8 */
 129         LDUAQ (r3, 0, r0, r1) /* r0, r1 = lo/hi halves of the first 8 source bytes */
 130         pta L8_15, tr0 /* continuation finishes both stores */
 131         ldlo.q r6, -8, r7 /* r7 = lo half of the last 8 source bytes (src_end-8 ..) */
 132         or r0, r1, r0 /* r0 = first 8 source bytes */
 133         sthi.q r2, 7, r0 /* hi half of the store to dst .. dst+7; L8_15 issues the lo half */
 134         ldhi.q r6, -1, r6 /* r6 = hi half of the last 8 source bytes; r6 no longer holds src_end */
 135         blink tr0, r63 /* continue at L8_15 */
 136
 137         /* 16 .. 24 byte memcpy */ /* last slot, at L1+160; it is longer than 8 instructions and nothing is dispatched beyond it. NOTE(review): writes r8 and r9, outside the r0-r7 range named in the file header */
 138         LDUAQ (r3, 0, r0, r1) /* halves of source bytes 0..7 */
 139         LDUAQ (r3, 8, r8, r9) /* halves of source bytes 8..15 */
 140         or r0, r1, r0 /* r0 = source bytes 0..7 */
 141         sthi.q r2, 7, r0 /* hi half of the store to dst .. dst+7 */
 142         or r8, r9, r8 /* r8 = source bytes 8..15 */
 143         sthi.q r2, 15, r8 /* hi half of the store to dst+8 .. dst+15 */
 144         ldlo.q r6, -8, r7 /* halves of the last 8 source bytes (src_end-8 .. src_end-1) */
 145         ldhi.q r6, -1, r6
 146         stlo.q r2, 8, r8 /* lo half of the store to dst+8 .. dst+15 */
 147         stlo.q r2, 0, r0 /* lo half of the store to dst .. dst+7 */
 148         or r6, r7, r6 /* r6 = last 8 source bytes */
 149         sthi.q r5, -1, r6 /* store them to dst_end-8 .. dst_end-1; overlaps the previous store when count < 24 */
 150         stlo.q r5, -8, r6
 151         blink tr1,r63 /* return */
 152
 153 Large: /* reached from the entry code when count >= 25; r2 = dst, r3 = src, r4 = count. NOTE(review): this path and its loops write r19-r25, r27 and r36, outside the r0-r7 range named in the file header */
 154         ld.b r2, 0, r63 /* read the first destination byte; value discarded into r63 */
 155         pta/l  Loop_ua, tr1 /* tr1 = Loop_ua (the return target is reloaded from r18 in the tail) */
 156         ori r3, -8, r7 /* r7 = (src & 7) - 8, a value in -8 .. -1 */
 157         sub r2, r7, r22 /* r22 = dst + 8 - (src & 7): destination cursor that corresponds to the next 8-byte-aligned source address */
 158         sub r3, r2, r6 /* r6 = src - dst, so r22 + r6 is the (8-byte aligned) source address for cursor r22 */
 159         add r2, r4, r5 /* r5 = dst_end */
 160         ldlo.q r3, 0, r0 /* r0 = leading source bytes, from src up to the next 8-byte boundary (lo part only) */
 161         addi r5, -16, r5 /* r5 = dst_end - 16: bound tested by Loop_ua */
 162         movi 64+8, r27 ! could subtract r7 from that.
 163         stlo.q r2, 0, r0 /* unaligned 8-byte store at dst; only the leading 8 - (src & 7) bytes are final, the rest is rewritten by the stores that start at r22 */
 164         sthi.q r2, 7, r0
 165         ldx.q r22, r6, r0 /* r0 = first aligned source quadword; it is pending, to be stored at r22 */
 166         bgtu/l r27, r4, tr1 /* count < 72: skip the 32-byte loop and go straight to Loop_ua */
 167
 168         addi r5, -48, r27 /* r27 = dst_end - 64: bound tested by Loop_line */
 169         pta/l Loop_line, tr0 /* tr0 = Loop_line */
 170         addi r6, 64, r36 /* offset for the look-ahead load 64 bytes past the current source position */
 171         addi r6, -24, r19 /* r19, r20, r21: offsets used after r22 has advanced by 32, addressing source quadwords at old position +8, +16, +24 */
 172         addi r6, -16, r20
 173         addi r6, -8, r21
 174
 175 Loop_line: /* copies 32 bytes per iteration; on entry r0 holds the aligned source quadword that belongs at destination cursor r22 */
 176         ldx.q r22, r36, r63 /* read the source quadword 64 bytes ahead; value discarded (presumably a preload, see the file header) */
 177         synco
 178         alloco r22, 32 /* cache operation on destination address r22+32; presumably allocates the block without fetching it - confirm against the SH-5 manual */
 179         synco
 180         addi r22, 32, r22 /* advance the cursor; the negative offsets below address the 32 bytes just passed */
 181         ldx.q r22, r19, r23 /* r23 = source quadword for bytes -24 .. -17 */
 182         sthi.q r22, -25, r0 /* hi half of the pending quadword, destination bytes -32 .. -25 */
 183         ldx.q r22, r20, r24 /* r24 = source quadword for bytes -16 .. -9 */
 184         ldx.q r22, r21, r25 /* r25 = source quadword for bytes -8 .. -1 */
 185         stlo.q r22, -32, r0 /* lo half of the pending quadword */
 186         ldx.q r22, r6,  r0 /* r0 = new pending quadword for the updated cursor */
 187         sthi.q r22, -17, r23 /* hi halves of the three remaining unaligned stores */
 188         sthi.q r22,  -9, r24
 189         sthi.q r22,  -1, r25
 190         stlo.q r22, -24, r23 /* matching lo halves */
 191         stlo.q r22, -16, r24
 192         stlo.q r22,  -8, r25
 193         bgeu r27, r22, tr0 /* repeat while r22 <= dst_end - 64 (r27); otherwise fall through to Loop_ua */
 194
 195 Loop_ua: /* copies 8 bytes per iteration: aligned source load, unaligned destination store; r0 holds the pending quadword for cursor r22 */
 196         addi r22, 8, r22 /* advance the cursor by one quadword */
 197         sthi.q r22, -1, r0 /* store the pending quadword to the 8 bytes just passed */
 198         stlo.q r22, -8, r0
 199         ldx.q r22, r6, r0 /* r0 = next aligned source quadword (r6 = src - dst) */
 200         bgtu/l r5, r22, tr1 /* repeat while r22 < dst_end - 16 (r5); tr1 was set to Loop_ua in Large */
 201 /* Tail: store the pending quadword, then the last 8 source bytes at the very end of the destination; the two stores may overlap. */
 202         add r3, r4, r7 /* r7 = src_end */
 203         ldlo.q r7, -8, r1 /* r1 = lo half of the last 8 source bytes */
 204         sthi.q r22, 7, r0 /* hi half of the pending quadword at r22 .. r22+7 */
 205         ldhi.q r7, -1, r7 /* r7 = hi half of the last 8 source bytes */
 206         ptabs r18,tr1 /* tr1 = return target taken from r18 */
 207         stlo.q r22, 0, r0 /* lo half of the pending quadword */
 208         or r1, r7, r1 /* r1 = last 8 source bytes */
 209         sthi.q r5, 15, r1 /* r5 = dst_end - 16, so offsets 8 .. 15 address dst_end-8 .. dst_end-1 */
 210         stlo.q r5, 8, r1
 211         blink tr1, r63 /* return; r2 still holds the destination pointer */
 212 copy_user_memcpy_end: /* exported end marker; presumably bounds the address range the fault fix-up code checks (see the file header) - confirm against the caller */
 213         nop