arch/sparc/lib/urem.S

   1 /* $Id: urem.S,v 1.4 1996/09/30 02:22:42 davem Exp $
   2  * urem.S:      This routine was taken from glibc-1.09 and is covered
   3  *              by the GNU Library General Public License Version 2.
   4  */
   5
   6 /* This file is generated from divrem.m4; DO NOT EDIT! */
   7 /*
   8  * Division and remainder, from Appendix E of the Sparc Version 8
   9  * Architecture Manual, with fixes from Gordon Irlam.
  10  */
  11
  12 /*
  13  * Input: dividend and divisor in %o0 and %o1 respectively.
  14  *
  15  * m4 parameters (shown here with the values substituted for this file):
  16  *  .urem       name of function to generate
  17  *  rem         operation: div => %o0 / %o1; rem => %o0 % %o1
  18  *  false       signedness: true => signed; false => unsigned
  19  *
  20  * Algorithm parameters:
  21  *  N           how many bits per iteration we try to get (4)
  22  *  WORDSIZE    total number of bits (32)
  23  *
  24  * Derived constants:
  25  *  TOPBITS     number of bits in the top decade of a number
  26  *
  27  * Important variables:
  28  *  Q           the partial quotient under development (initially 0)
  29  *  R           the remainder so far, initially the dividend
  30  *  ITER        number of main division loop iterations required;
  31  *              equal to ceil(log2(quotient) / N).  Note that this
  32  *              is the log base (2^N) of the quotient.
  33  *  V           the current comparand, initially divisor*2^(ITER*N-1)
  34  *
  35  * Cost:
  36  *  Current estimate for non-large dividend is
  37  *      ceil(log2(quotient) / N) * (10 + 7N/2) + C
  38  *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
  39  *  different path, as the upper bits of the quotient must be developed
  40  *  one bit at a time.
  41  */
  42
  43         .globl .urem
  44 .urem:
             /*
              * .urem -- remainder of %o0 divided by %o1, returned in %o0.
              *
              * In:   %o0 = dividend, %o1 = divisor.
              * Out:  %o0 = remainder.  If the divisor is zero the routine
              *       traps with "ta ST_DIV0" and, should the trap return,
              *       hands back 0.  ST_DIV0 is not defined in this file.
              * Written here: %o2, %o3, %o4, %o5, %g1, %g7 and the integer
              *       condition codes.  %o1 is only read.
              * The routine returns with retl and never touches the stack.
              *
              * Register roles, matching the "Important variables" listed in
              * the file header:
              *   %o2 = Q, the partial quotient (built up but never returned)
              *   %o3 = R, the running remainder, which may go negative
              *   %o4 = ITER, count of 4-bit passes through Ldivloop
              *   %o5 = V, the comparand (the divisor, scaled)
              *   %g7 = count of single-bit steps on the large-dividend path
              *   %g1 = 1 << 27, threshold for the large-dividend path
              *
              * Branches without ",a" always execute their delay slot (the
              * instruction indented by one extra space), taken or not.
              */
  45
  46         ! Ready to divide.  Compute size of quotient; scale comparand.
  47         orcc    %o1, %g0, %o5           /* V = divisor; Z set if divisor is 0 */
  48         bne     1f
  49          mov    %o0, %o3               /* R = dividend (runs on both paths) */
  50
  51                 ! Divide by zero trap.  If it returns, return 0 (about as
  52                 ! wrong as possible, but that is what SunOS does...).
  53                 ta      ST_DIV0
  54                 retl
  55                  clr    %o0
  56
  57 1:
  58         cmp     %o3, %o5                        ! if %o1 exceeds %o0, done
  59         blu     Lgot_result             ! (and algorithm fails otherwise)
  60          clr    %o2
             /* Q = 0.  On the early exit R still holds the dividend, which */
             /* is the remainder whenever the dividend is below the divisor. */
  61
  62         sethi   %hi(1 << (32 - 4 - 1)), %g1     /* %g1 = 1 << 27 */
  63
  64         cmp     %o3, %g1
  65         blu     Lnot_really_big
  66          clr    %o4                    /* ITER = 0 */
  67
  68         ! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
  69         ! as our usual N-at-a-shot divide step will cause overflow and havoc.
  70         ! The number of bits in the result here is N*ITER+SC, where SC <= N.
  71         ! Compute ITER in an unorthodox manner: know we need to shift V into
  72         ! the top decade: so do not even bother to compare to R.
             /* Loop: while V is below 1 << 27, scale V by 16 and count one */
             /* more 4-bit pass in ITER.  The delay slot reloads %g7 with 1 */
             /* on every pass, so %g7 is 1 when control reaches label 3.    */
  73         1:
  74                 cmp     %o5, %g1
  75                 bgeu    3f
  76                  mov    1, %g7
  77
  78                 sll     %o5, 4, %o5
  79
  80                 b       1b
  81                  add    %o4, 1, %o4
  82
  83         ! Now compute %g7.
             /* Double V until it is no longer below R, counting each       */
             /* doubling in %g7.  addcc sets carry if the top bit of V was  */
             /* shifted out; bcc continues the loop only when it was not.   */
  84         2:
  85                 addcc   %o5, %o5, %o5
  86                 bcc     Lnot_too_big
  87                  add    %g7, 1, %g7
  88
  89                 ! We get here if the %o1 overflowed while shifting.
  90                 ! This means that %o3 has the high-order bit set.
  91                 ! Restore %o5 and subtract from %o3.
                     /* Note: the register that was doubled is %o5, the     */
                     /* scaled copy of the divisor; %o1 itself is unchanged. */
                     /* %g1 becomes 1 << 31 and is added back onto the      */
                     /* halved %o5 to restore the lost top bit.  The sub in */
                     /* the delay slot below undoes the add at label 2.     */
  92                 sll     %g1, 4, %g1     ! high order bit
  93                 srl     %o5, 1, %o5             ! rest of %o5
  94                 add     %o5, %g1, %o5
  95
  96                 b       Ldo_single_div
  97                  sub    %g7, 1, %g7
  98
  99         Lnot_too_big:
 100         3:
 101                 cmp     %o5, %o3
 102                 blu     2b
 103                  nop
 104
 105                 be      Ldo_single_div
 106                  nop
             /* When V > R neither branch is taken and control falls through */
             /* to Ldo_single_div as well.                                   */
 107         /* NB: these are commented out in the V8-Sparc manual as well */
 108         /* (I do not understand this) */
 109         ! %o5 > %o3: went too far: back up 1 step
 110         !       srl     %o5, 1, %o5
 111         !       dec     %g7
 112         ! do single-bit divide steps
 113         !
 114         ! We have to be careful here.  We know that %o3 >= %o5, so we can do the
 115         ! first divide step without thinking.  BUT, the others are conditional,
 116         ! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
 117         ! order bit set in the first step, just falling into the regular
 118         ! division loop will mess up the first time around.
 119         ! So we unroll slightly...
 120         Ldo_single_div:
 121                 subcc   %g7, 1, %g7
 122                 bl      Lend_regular_divide     /* no single-bit steps left */
 123                  nop
 124
 125                 sub     %o3, %o5, %o3           /* first step: R -= V */
 126                 mov     1, %o2                  /* first quotient digit is 1 */
 127
 128                 b       Lend_single_divloop
 129                  nop
             /* One non-restoring step per pass: halve V, then subtract it  */
             /* from R if R is non-negative, otherwise add it back.  Q is   */
             /* shifted left and adjusted by +1 or -1 to match.  The bl     */
             /* tests the condition codes left by the tst %o3 in the delay  */
             /* slot of the loop branch; sll and srl leave them untouched.  */
 130         Lsingle_divloop:
 131                 sll     %o2, 1, %o2
 132                 bl      1f
 133                  srl    %o5, 1, %o5
 134                 ! %o3 >= 0
 135                 sub     %o3, %o5, %o3
 136                 b       2f
 137                  add    %o2, 1, %o2
 138         1:      ! %o3 < 0
 139                 add     %o3, %o5, %o3
 140                 sub     %o2, 1, %o2
 141         2:
 142         Lend_single_divloop:
 143                 subcc   %g7, 1, %g7
 144                 bge     Lsingle_divloop
 145                  tst    %o3
 146
 147                 b,a     Lend_regular_divide     /* annulled: no delay slot runs */
 148
             /* Small-dividend path: scale V by 16 until it exceeds R,      */
             /* counting passes in %o4.  The addcc in the delay slot runs   */
             /* on every pass, including the one that leaves the loop.      */
 149 Lnot_really_big:
 150 1:
 151         sll     %o5, 4, %o5
 152
 153         cmp     %o5, %o3
 154         bleu    1b
 155          addcc  %o4, 1, %o4
 156
             /* "be" tests the Z flag left by the addcc above, so it is     */
             /* taken only if %o4 became zero.  The sub in its delay slot   */
             /* runs either way.                                            */
 157         be      Lgot_result
 158          sub    %o4, 1, %o4
 159
 160         tst     %o3     ! set up for initial iteration
             /*
              * Main loop: each pass produces four quotient bits through an
              * unrolled binary tree of non-restoring steps.  At every node V
              * is halved in the branch delay slot, then subtracted from R if
              * R was non-negative (fall through) or added back if R was
              * negative (the bl target).  subcc/addcc set the condition
              * codes for the next bl; srl does not disturb them.
              *
              * Labels read L.<depth>.<16 + accumulated bits>, where the
              * accumulated value is the signed digit sum given in the
              * neighbouring "depth/accumulated bits" comments.  Each leaf
              * adds (2*accumulated + 1) to Q when its last step subtracted,
              * or (2*accumulated - 1) when it added, then jumps to 9.
              */
 161 Ldivloop:
 162         sll     %o2, 4, %o2
 163                 ! depth 1, accumulated bits 0
 164         bl      L.1.16
 165          srl    %o5,1,%o5
 166         ! remainder is positive
 167         subcc   %o3,%o5,%o3
 168                         ! depth 2, accumulated bits 1
 169         bl      L.2.17
 170          srl    %o5,1,%o5
 171         ! remainder is positive
 172         subcc   %o3,%o5,%o3
 173                         ! depth 3, accumulated bits 3
 174         bl      L.3.19
 175          srl    %o5,1,%o5
 176         ! remainder is positive
 177         subcc   %o3,%o5,%o3
 178                         ! depth 4, accumulated bits 7
 179         bl      L.4.23
 180          srl    %o5,1,%o5
 181         ! remainder is positive
 182         subcc   %o3,%o5,%o3
 183         b       9f
 184          add    %o2, (7*2+1), %o2
 185
 186 L.4.23:
 187         ! remainder is negative
 188         addcc   %o3,%o5,%o3
 189         b       9f
 190          add    %o2, (7*2-1), %o2
 191
 192 L.3.19:
 193         ! remainder is negative
 194         addcc   %o3,%o5,%o3
 195                         ! depth 4, accumulated bits 5
 196         bl      L.4.21
 197          srl    %o5,1,%o5
 198         ! remainder is positive
 199         subcc   %o3,%o5,%o3
 200         b       9f
 201          add    %o2, (5*2+1), %o2
 202
 203 L.4.21:
 204         ! remainder is negative
 205         addcc   %o3,%o5,%o3
 206         b       9f
 207          add    %o2, (5*2-1), %o2
 208
 209 L.2.17:
 210         ! remainder is negative
 211         addcc   %o3,%o5,%o3
 212                         ! depth 3, accumulated bits 1
 213         bl      L.3.17
 214          srl    %o5,1,%o5
 215         ! remainder is positive
 216         subcc   %o3,%o5,%o3
 217                         ! depth 4, accumulated bits 3
 218         bl      L.4.19
 219          srl    %o5,1,%o5
 220         ! remainder is positive
 221         subcc   %o3,%o5,%o3
 222         b       9f
 223          add    %o2, (3*2+1), %o2
 224
 225 L.4.19:
 226         ! remainder is negative
 227         addcc   %o3,%o5,%o3
 228         b       9f
 229          add    %o2, (3*2-1), %o2
 230
 231 L.3.17:
 232         ! remainder is negative
 233         addcc   %o3,%o5,%o3
 234                         ! depth 4, accumulated bits 1
 235         bl      L.4.17
 236          srl    %o5,1,%o5
 237         ! remainder is positive
 238         subcc   %o3,%o5,%o3
 239         b       9f
 240          add    %o2, (1*2+1), %o2
 241
 242 L.4.17:
 243         ! remainder is negative
 244         addcc   %o3,%o5,%o3
 245         b       9f
 246          add    %o2, (1*2-1), %o2
 247
 248 L.1.16:
 249         ! remainder is negative
 250         addcc   %o3,%o5,%o3
 251                         ! depth 2, accumulated bits -1
 252         bl      L.2.15
 253          srl    %o5,1,%o5
 254         ! remainder is positive
 255         subcc   %o3,%o5,%o3
 256                         ! depth 3, accumulated bits -1
 257         bl      L.3.15
 258          srl    %o5,1,%o5
 259         ! remainder is positive
 260         subcc   %o3,%o5,%o3
 261                         ! depth 4, accumulated bits -1
 262         bl      L.4.15
 263          srl    %o5,1,%o5
 264         ! remainder is positive
 265         subcc   %o3,%o5,%o3
 266         b       9f
 267          add    %o2, (-1*2+1), %o2
 268
 269 L.4.15:
 270         ! remainder is negative
 271         addcc   %o3,%o5,%o3
 272         b       9f
 273          add    %o2, (-1*2-1), %o2
 274
 275 L.3.15:
 276         ! remainder is negative
 277         addcc   %o3,%o5,%o3
 278                         ! depth 4, accumulated bits -3
 279         bl      L.4.13
 280          srl    %o5,1,%o5
 281         ! remainder is positive
 282         subcc   %o3,%o5,%o3
 283         b       9f
 284          add    %o2, (-3*2+1), %o2
 285
 286 L.4.13:
 287         ! remainder is negative
 288         addcc   %o3,%o5,%o3
 289         b       9f
 290          add    %o2, (-3*2-1), %o2
 291
 292 L.2.15:
 293         ! remainder is negative
 294         addcc   %o3,%o5,%o3
 295                         ! depth 3, accumulated bits -3
 296         bl      L.3.13
 297          srl    %o5,1,%o5
 298         ! remainder is positive
 299         subcc   %o3,%o5,%o3
 300                         ! depth 4, accumulated bits -5
 301         bl      L.4.11
 302          srl    %o5,1,%o5
 303         ! remainder is positive
 304         subcc   %o3,%o5,%o3
 305         b       9f
 306          add    %o2, (-5*2+1), %o2
 307
 308 L.4.11:
 309         ! remainder is negative
 310         addcc   %o3,%o5,%o3
 311         b       9f
 312          add    %o2, (-5*2-1), %o2
 313
 314 L.3.13:
 315         ! remainder is negative
 316         addcc   %o3,%o5,%o3
 317                         ! depth 4, accumulated bits -7
 318         bl      L.4.9
 319          srl    %o5,1,%o5
 320         ! remainder is positive
 321         subcc   %o3,%o5,%o3
 322         b       9f
 323          add    %o2, (-7*2+1), %o2
 324
 325 L.4.9:
 326         ! remainder is negative
 327         addcc   %o3,%o5,%o3
 328         b       9f
 329          add    %o2, (-7*2-1), %o2
 330
             /* Common tail: run another 4-bit pass while the decremented   */
             /* %o4 is still >= 0.  The tst in the delay slot sets the      */
             /* condition codes for the first bl of Ldivloop and for the    */
             /* final fixup below.                                          */
 331         9:
 332 Lend_regular_divide:
 333         subcc   %o4, 1, %o4
 334         bge     Ldivloop
 335          tst    %o3
 336
             /* Annulled branch: the add in the delay slot runs only when   */
             /* the branch is taken, i.e. when R ended up negative.  It     */
             /* adds the original divisor (%o1) back once.  When R is       */
             /* non-negative the add is skipped and control simply falls    */
             /* through to Lgot_result.                                     */
 337         bl,a    Lgot_result
 338         ! non-restoring fixup here (one instruction only!)
 339         add     %o3, %o1, %o3
 340
 341 Lgot_result:
 342
 343         retl
 344          mov %o3, %o0                  /* return R as the remainder */
 345
 346         .globl  .urem_patch
 347 .urem_patch:
             /*
              * .urem_patch -- the same remainder (%o0 % %o1, returned in
              * %o0) computed with the hardware udiv and umul instructions
              * instead of the software loop in .urem.  Writes %o2 and %y.
              * There is no check for a zero divisor in this sequence.
              *
              * NOTE(review): the name suggests these eight instructions are
              * copied over .urem on CPUs that implement udiv/umul.  The code
              * doing so is not visible in this file -- confirm before
              * changing the instruction count or order.
              */
 348         wr      %g0, 0x0, %y            /* %y = 0: upper half of the udiv dividend */
 349         nop                             /* three nops: a write to %y is a  */
 350         nop                             /* delayed write on SPARC V8 and   */
 351         nop                             /* needs up to three instructions  */
 352         udiv    %o0, %o1, %o2           /* %o2 = dividend / divisor */
 353         umul    %o2, %o1, %o2           /* %o2 = quotient * divisor (low 32 bits) */
 354         retl
 355          sub    %o0, %o2, %o0          /* delay slot: remainder = dividend - product */