vserver 1.9.5.x5
[linux-2.6.git] / arch / i386 / crypto / aes-i586-asm.S
index e8a0471..7b73c67 100644 (file)
@@ -61,7 +61,6 @@
 #define r3  edx
 #define r4  esi
 #define r5  edi
-#define r6  ebp
 
 #define eaxl  al
 #define eaxh  ah
 // output registers r0, r1, r4 or r5.  
 
 // Parameters:
+// table table base address
 //   %1  out_state[0]
 //   %2  out_state[1]
 //   %3  out_state[2]
 //   %4  out_state[3]
-//   %5  table base address
-//   %6  input register for the round (destroyed)
-//   %7  scratch register for the round
-
-#define do_col(a1, a2, a3, a4, a5, a6, a7)     \
-       movzx   %l(a6),%a7;                     \
-       xor     a5(,%a7,4),%a1;                 \
-       movzx   %h(a6),%a7;                     \
-       shr     $16,%a6;                        \
-       xor     a5+tlen(,%a7,4),%a2;            \
-       movzx   %l(a6),%a7;                     \
-       movzx   %h(a6),%a6;                     \
-       xor     a5+2*tlen(,%a7,4),%a3;          \
-       xor     a5+3*tlen(,%a6,4),%a4;
+//   idx input register for the round (destroyed)
+//   tmp scratch register for the round
+// sched key schedule
+
+#define do_col(table, a1,a2,a3,a4, idx, tmp)   \
+       movzx   %l(idx),%tmp;                   \
+       xor     table(,%tmp,4),%a1;             \
+       movzx   %h(idx),%tmp;                   \
+       shr     $16,%idx;                       \
+       xor     table+tlen(,%tmp,4),%a2;        \
+       movzx   %l(idx),%tmp;                   \
+       movzx   %h(idx),%idx;                   \
+       xor     table+2*tlen(,%tmp,4),%a3;      \
+       xor     table+3*tlen(,%idx,4),%a4;
 
 // initialise output registers from the key schedule
-
-#define do_fcol(a1, a2, a3, a4, a5, a6, a7, a8)        \
-       mov     0 a8,%a1;                       \
-       movzx   %l(a6),%a7;                     \
-       mov     12 a8,%a2;                      \
-       xor     a5(,%a7,4),%a1;                 \
-       mov     4 a8,%a4;                       \
-       movzx   %h(a6),%a7;                     \
-       shr     $16,%a6;                        \
-       xor     a5+tlen(,%a7,4),%a2;            \
-       movzx   %l(a6),%a7;                     \
-       movzx   %h(a6),%a6;                     \
-       xor     a5+3*tlen(,%a6,4),%a4;          \
-       mov     %a3,%a6;                        \
-       mov     8 a8,%a3;                       \
-       xor     a5+2*tlen(,%a7,4),%a3;
+// NB1: original value of a3 is in idx on exit
+// NB2: original values of a1,a2,a4 aren't used
+#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
+       mov     0 sched,%a1;                    \
+       movzx   %l(idx),%tmp;                   \
+       mov     12 sched,%a2;                   \
+       xor     table(,%tmp,4),%a1;             \
+       mov     4 sched,%a4;                    \
+       movzx   %h(idx),%tmp;                   \
+       shr     $16,%idx;                       \
+       xor     table+tlen(,%tmp,4),%a2;        \
+       movzx   %l(idx),%tmp;                   \
+       movzx   %h(idx),%idx;                   \
+       xor     table+3*tlen(,%idx,4),%a4;      \
+       mov     %a3,%idx;                       \
+       mov     8 sched,%a3;                    \
+       xor     table+2*tlen(,%tmp,4),%a3;
 
 // initialise output registers from the key schedule
-
-#define do_icol(a1, a2, a3, a4, a5, a6, a7, a8)        \
-       mov     0 a8,%a1;                       \
-       movzx   %l(a6),%a7;                     \
-       mov     4 a8,%a2;                       \
-       xor     a5(,%a7,4),%a1;                 \
-       mov     12 a8,%a4;                      \
-       movzx   %h(a6),%a7;                     \
-       shr     $16,%a6;                        \
-       xor     a5+tlen(,%a7,4),%a2;            \
-       movzx   %l(a6),%a7;                     \
-       movzx   %h(a6),%a6;                     \
-       xor     a5+3*tlen(,%a6,4),%a4;          \
-       mov     %a3,%a6;                        \
-       mov     8 a8,%a3;                       \
-       xor     a5+2*tlen(,%a7,4),%a3;
+// NB1: original value of a3 is in idx on exit
+// NB2: original values of a1,a2,a4 aren't used
+#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
+       mov     0 sched,%a1;                    \
+       movzx   %l(idx),%tmp;                   \
+       mov     4 sched,%a2;                    \
+       xor     table(,%tmp,4),%a1;             \
+       mov     12 sched,%a4;                   \
+       movzx   %h(idx),%tmp;                   \
+       shr     $16,%idx;                       \
+       xor     table+tlen(,%tmp,4),%a2;        \
+       movzx   %l(idx),%tmp;                   \
+       movzx   %h(idx),%idx;                   \
+       xor     table+3*tlen(,%idx,4),%a4;      \
+       mov     %a3,%idx;                       \
+       mov     8 sched,%a3;                    \
+       xor     table+2*tlen(,%tmp,4),%a3;
 
 
 // original Gladman had conditional saves to MMX regs.
 #define restore(a1, a2)                \
        mov     4*a2(%esp),%a1
 
-// This macro performs a forward encryption cycle. It is entered with
-// the first previous round column values in r0, r1, r4 and r5 and
-// exits with the final values in the same registers, using the MMX
-// registers mm0-mm1 or the stack for temporary storage
-
-// mov current column values into the MMX registers
-#define fwd_rnd(arg, table)                                    \
-       /* mov current column values into the MMX registers */  \
-       mov     %r0,%r2;                                        \
-       save   (0,r1);                                          \
-       save   (1,r5);                                          \
-                                                               \
-       /* compute new column values */                         \
-       do_fcol(r0,r5,r4,r1,table, r2,r3, arg);                 \
-       do_col (r4,r1,r0,r5,table, r2,r3);                      \
-       restore(r2,0);                                          \
-       do_col (r1,r0,r5,r4,table, r2,r3);                      \
-       restore(r2,1);                                          \
-       do_col (r5,r4,r1,r0,table, r2,r3);
-
-// This macro performs an inverse encryption cycle. It is entered with
-// the first previous round column values in r0, r1, r4 and r5 and
-// exits with the final values in the same registers, using the MMX
-// registers mm0-mm1 or the stack for temporary storage
-
-#define inv_rnd(arg, table)                                    \
-       /* mov current column values into the MMX registers */  \
-       mov     %r0,%r2;                                        \
-       save    (0,r1);                                         \
-       save    (1,r5);                                         \
-                                                               \
-       /* compute new column values */                         \
-       do_icol(r0,r1,r4,r5, table, r2,r3, arg);                \
-       do_col (r4,r5,r0,r1, table, r2,r3);                     \
-       restore(r2,0);                                          \
-       do_col (r1,r4,r5,r0, table, r2,r3);                     \
-       restore(r2,1);                                          \
-       do_col (r5,r0,r1,r4, table, r2,r3);
+// These macros perform a forward encryption cycle. They are entered with
+// the first previous round column values in r0,r1,r4,r5 and
+// exit with the final values in the same registers, using stack
+// for temporary storage.
+
+// round column values
+// on entry: r0,r1,r4,r5
+// on exit:  r2,r1,r4,r5
+#define fwd_rnd1(arg, table)                                           \
+       save   (0,r1);                                                  \
+       save   (1,r5);                                                  \
+                                                                       \
+       /* compute new column values */                                 \
+       do_fcol(table, r2,r5,r4,r1, r0,r3, arg);        /* idx=r0 */    \
+       do_col (table, r4,r1,r2,r5, r0,r3);             /* idx=r4 */    \
+       restore(r0,0);                                                  \
+       do_col (table, r1,r2,r5,r4, r0,r3);             /* idx=r1 */    \
+       restore(r0,1);                                                  \
+       do_col (table, r5,r4,r1,r2, r0,r3);             /* idx=r5 */
+
+// round column values
+// on entry: r2,r1,r4,r5
+// on exit:  r0,r1,r4,r5
+#define fwd_rnd2(arg, table)                                           \
+       save   (0,r1);                                                  \
+       save   (1,r5);                                                  \
+                                                                       \
+       /* compute new column values */                                 \
+       do_fcol(table, r0,r5,r4,r1, r2,r3, arg);        /* idx=r2 */    \
+       do_col (table, r4,r1,r0,r5, r2,r3);             /* idx=r4 */    \
+       restore(r2,0);                                                  \
+       do_col (table, r1,r0,r5,r4, r2,r3);             /* idx=r1 */    \
+       restore(r2,1);                                                  \
+       do_col (table, r5,r4,r1,r0, r2,r3);             /* idx=r5 */
+
+// These macros performs an inverse encryption cycle. They are entered with
+// the first previous round column values in r0,r1,r4,r5 and
+// exit with the final values in the same registers, using stack
+// for temporary storage
+
+// round column values
+// on entry: r0,r1,r4,r5
+// on exit:  r2,r1,r4,r5
+#define inv_rnd1(arg, table)                                           \
+       save    (0,r1);                                                 \
+       save    (1,r5);                                                 \
+                                                                       \
+       /* compute new column values */                                 \
+       do_icol(table, r2,r1,r4,r5, r0,r3, arg);        /* idx=r0 */    \
+       do_col (table, r4,r5,r2,r1, r0,r3);             /* idx=r4 */    \
+       restore(r0,0);                                                  \
+       do_col (table, r1,r4,r5,r2, r0,r3);             /* idx=r1 */    \
+       restore(r0,1);                                                  \
+       do_col (table, r5,r2,r1,r4, r0,r3);             /* idx=r5 */
+
+// round column values
+// on entry: r2,r1,r4,r5
+// on exit:  r0,r1,r4,r5
+#define inv_rnd2(arg, table)                                           \
+       save    (0,r1);                                                 \
+       save    (1,r5);                                                 \
+                                                                       \
+       /* compute new column values */                                 \
+       do_icol(table, r0,r1,r4,r5, r2,r3, arg);        /* idx=r2 */    \
+       do_col (table, r4,r5,r0,r1, r2,r3);             /* idx=r4 */    \
+       restore(r2,0);                                                  \
+       do_col (table, r1,r4,r5,r0, r2,r3);             /* idx=r1 */    \
+       restore(r2,1);                                                  \
+       do_col (table, r5,r0,r1,r4, r2,r3);             /* idx=r5 */
 
 // AES (Rijndael) Encryption Subroutine
 
 aes_enc_blk:
        push    %ebp
        mov     ctx(%esp),%ebp      // pointer to context
-       xor     %eax,%eax
 
 // CAUTION: the order and the values used in these assigns 
 // rely on the register mappings
@@ -208,7 +240,9 @@ aes_enc_blk:
        push    %esi
        mov     nrnd(%ebp),%r3   // number of rounds
        push    %edi
-       lea     ekey(%ebp),%r6   // key pointer
+#if ekey != 0
+       lea     ekey(%ebp),%ebp  // key pointer
+#endif
 
 // input four columns and xor in first round key
 
@@ -216,47 +250,47 @@ aes_enc_blk:
        mov     4(%r2),%r1
        mov     8(%r2),%r4
        mov     12(%r2),%r5
-       xor     (%r6),%r0
-       xor     4(%r6),%r1
-       xor     8(%r6),%r4
-       xor     12(%r6),%r5
+       xor     (%ebp),%r0
+       xor     4(%ebp),%r1
+       xor     8(%ebp),%r4
+       xor     12(%ebp),%r5
 
        sub     $8,%esp           // space for register saves on stack
-       add     $16,%r6           // increment to next round key   
+       add     $16,%ebp          // increment to next round key
        sub     $10,%r3          
        je      4f              // 10 rounds for 128-bit key
-       add     $32,%r6
+       add     $32,%ebp
        sub     $2,%r3
        je      3f              // 12 rounds for 128-bit key
-       add     $32,%r6
-
-2:     fwd_rnd( -64(%r6) ,ft_tab)      // 14 rounds for 128-bit key
-       fwd_rnd( -48(%r6) ,ft_tab)
-3:     fwd_rnd( -32(%r6) ,ft_tab)      // 12 rounds for 128-bit key
-       fwd_rnd( -16(%r6) ,ft_tab)
-4:     fwd_rnd(    (%r6) ,ft_tab)      // 10 rounds for 128-bit key
-       fwd_rnd( +16(%r6) ,ft_tab)
-       fwd_rnd( +32(%r6) ,ft_tab)
-       fwd_rnd( +48(%r6) ,ft_tab)
-       fwd_rnd( +64(%r6) ,ft_tab)
-       fwd_rnd( +80(%r6) ,ft_tab)
-       fwd_rnd( +96(%r6) ,ft_tab)
-       fwd_rnd(+112(%r6) ,ft_tab)
-       fwd_rnd(+128(%r6) ,ft_tab)
-       fwd_rnd(+144(%r6) ,fl_tab)      // last round uses a different table
+       add     $32,%ebp
+
+2:     fwd_rnd1( -64(%ebp) ,ft_tab)    // 14 rounds for 128-bit key
+       fwd_rnd2( -48(%ebp) ,ft_tab)
+3:     fwd_rnd1( -32(%ebp) ,ft_tab)    // 12 rounds for 128-bit key
+       fwd_rnd2( -16(%ebp) ,ft_tab)
+4:     fwd_rnd1(    (%ebp) ,ft_tab)    // 10 rounds for 128-bit key
+       fwd_rnd2( +16(%ebp) ,ft_tab)
+       fwd_rnd1( +32(%ebp) ,ft_tab)
+       fwd_rnd2( +48(%ebp) ,ft_tab)
+       fwd_rnd1( +64(%ebp) ,ft_tab)
+       fwd_rnd2( +80(%ebp) ,ft_tab)
+       fwd_rnd1( +96(%ebp) ,ft_tab)
+       fwd_rnd2(+112(%ebp) ,ft_tab)
+       fwd_rnd1(+128(%ebp) ,ft_tab)
+       fwd_rnd2(+144(%ebp) ,fl_tab)    // last round uses a different table
 
 // move final values to the output array.  CAUTION: the 
 // order of these assigns rely on the register mappings
 
        add     $8,%esp
-       mov     out_blk+12(%esp),%r6
-       mov     %r5,12(%r6)
+       mov     out_blk+12(%esp),%ebp
+       mov     %r5,12(%ebp)
        pop     %edi
-       mov     %r4,8(%r6)
+       mov     %r4,8(%ebp)
        pop     %esi
-       mov     %r1,4(%r6)
+       mov     %r1,4(%ebp)
        pop     %ebx
-       mov     %r0,(%r6)
+       mov     %r0,(%ebp)
        pop     %ebp
        mov     $1,%eax
        ret
@@ -273,7 +307,6 @@ aes_enc_blk:
 aes_dec_blk:
        push    %ebp
        mov     ctx(%esp),%ebp       // pointer to context
-       xor     %eax,%eax
 
 // CAUTION: the order and the values used in these assigns 
 // rely on the register mappings
@@ -283,10 +316,12 @@ aes_dec_blk:
        push    %esi
        mov     nrnd(%ebp),%r3   // number of rounds
        push    %edi
-       lea     dkey(%ebp),%r6   // key pointer
+#if dkey != 0
+       lea     dkey(%ebp),%ebp  // key pointer
+#endif
        mov     %r3,%r0
        shl     $4,%r0
-       add     %r0,%r6
+       add     %r0,%ebp
        
 // input four columns and xor in first round key
 
@@ -294,47 +329,47 @@ aes_dec_blk:
        mov     4(%r2),%r1
        mov     8(%r2),%r4
        mov     12(%r2),%r5
-       xor     (%r6),%r0
-       xor     4(%r6),%r1
-       xor     8(%r6),%r4
-       xor     12(%r6),%r5
+       xor     (%ebp),%r0
+       xor     4(%ebp),%r1
+       xor     8(%ebp),%r4
+       xor     12(%ebp),%r5
 
-       sub     $8,%esp           // space for register saves on stack
-       sub     $16,%r6           // increment to next round key   
+       sub     $8,%esp         // space for register saves on stack
+       sub     $16,%ebp        // increment to next round key
        sub     $10,%r3          
        je      4f              // 10 rounds for 128-bit key
-       sub     $32,%r6
+       sub     $32,%ebp
        sub     $2,%r3
        je      3f              // 12 rounds for 128-bit key
-       sub     $32,%r6
-
-2:     inv_rnd( +64(%r6), it_tab)      // 14 rounds for 128-bit key 
-       inv_rnd( +48(%r6), it_tab)
-3:     inv_rnd( +32(%r6), it_tab)      // 12 rounds for 128-bit key
-       inv_rnd( +16(%r6), it_tab)
-4:     inv_rnd(    (%r6), it_tab)      // 10 rounds for 128-bit key
-       inv_rnd( -16(%r6), it_tab)
-       inv_rnd( -32(%r6), it_tab)
-       inv_rnd( -48(%r6), it_tab)
-       inv_rnd( -64(%r6), it_tab)
-       inv_rnd( -80(%r6), it_tab)
-       inv_rnd( -96(%r6), it_tab)
-       inv_rnd(-112(%r6), it_tab)
-       inv_rnd(-128(%r6), it_tab)
-       inv_rnd(-144(%r6), il_tab)      // last round uses a different table
+       sub     $32,%ebp
+
+2:     inv_rnd1( +64(%ebp), it_tab)    // 14 rounds for 128-bit key
+       inv_rnd2( +48(%ebp), it_tab)
+3:     inv_rnd1( +32(%ebp), it_tab)    // 12 rounds for 128-bit key
+       inv_rnd2( +16(%ebp), it_tab)
+4:     inv_rnd1(    (%ebp), it_tab)    // 10 rounds for 128-bit key
+       inv_rnd2( -16(%ebp), it_tab)
+       inv_rnd1( -32(%ebp), it_tab)
+       inv_rnd2( -48(%ebp), it_tab)
+       inv_rnd1( -64(%ebp), it_tab)
+       inv_rnd2( -80(%ebp), it_tab)
+       inv_rnd1( -96(%ebp), it_tab)
+       inv_rnd2(-112(%ebp), it_tab)
+       inv_rnd1(-128(%ebp), it_tab)
+       inv_rnd2(-144(%ebp), il_tab)    // last round uses a different table
 
 // move final values to the output array.  CAUTION: the 
 // order of these assigns rely on the register mappings
 
        add     $8,%esp
-       mov     out_blk+12(%esp),%r6
-       mov     %r5,12(%r6)
+       mov     out_blk+12(%esp),%ebp
+       mov     %r5,12(%ebp)
        pop     %edi
-       mov     %r4,8(%r6)
+       mov     %r4,8(%ebp)
        pop     %esi
-       mov     %r1,4(%r6)
+       mov     %r1,4(%ebp)
        pop     %ebx
-       mov     %r0,(%r6)
+       mov     %r0,(%ebp)
        pop     %ebp
        mov     $1,%eax
        ret