This commit was manufactured by cvs2svn to create tag
[linux-2.6.git] / arch / i386 / crypto / aes-i586-asm.S
index 7b73c67..e8a0471 100644 (file)
@@ -61,6 +61,7 @@
 #define r3  edx
 #define r4  esi
 #define r5  edi
+#define r6  ebp
 
 #define eaxl  al
 #define eaxh  ah
 // output registers r0, r1, r4 or r5.  
 
 // Parameters:
-// table table base address
 //   %1  out_state[0]
 //   %2  out_state[1]
 //   %3  out_state[2]
 //   %4  out_state[3]
-//   idx input register for the round (destroyed)
-//   tmp scratch register for the round
-// sched key schedule
-
-#define do_col(table, a1,a2,a3,a4, idx, tmp)   \
-       movzx   %l(idx),%tmp;                   \
-       xor     table(,%tmp,4),%a1;             \
-       movzx   %h(idx),%tmp;                   \
-       shr     $16,%idx;                       \
-       xor     table+tlen(,%tmp,4),%a2;        \
-       movzx   %l(idx),%tmp;                   \
-       movzx   %h(idx),%idx;                   \
-       xor     table+2*tlen(,%tmp,4),%a3;      \
-       xor     table+3*tlen(,%idx,4),%a4;
+//   %5  table base address
+//   %6  input register for the round (destroyed)
+//   %7  scratch register for the round
+
+#define do_col(a1, a2, a3, a4, a5, a6, a7)     \
+       movzx   %l(a6),%a7;                     \
+       xor     a5(,%a7,4),%a1;                 \
+       movzx   %h(a6),%a7;                     \
+       shr     $16,%a6;                        \
+       xor     a5+tlen(,%a7,4),%a2;            \
+       movzx   %l(a6),%a7;                     \
+       movzx   %h(a6),%a6;                     \
+       xor     a5+2*tlen(,%a7,4),%a3;          \
+       xor     a5+3*tlen(,%a6,4),%a4;
 
 // initialise output registers from the key schedule
-// NB1: original value of a3 is in idx on exit
-// NB2: original values of a1,a2,a4 aren't used
-#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
-       mov     0 sched,%a1;                    \
-       movzx   %l(idx),%tmp;                   \
-       mov     12 sched,%a2;                   \
-       xor     table(,%tmp,4),%a1;             \
-       mov     4 sched,%a4;                    \
-       movzx   %h(idx),%tmp;                   \
-       shr     $16,%idx;                       \
-       xor     table+tlen(,%tmp,4),%a2;        \
-       movzx   %l(idx),%tmp;                   \
-       movzx   %h(idx),%idx;                   \
-       xor     table+3*tlen(,%idx,4),%a4;      \
-       mov     %a3,%idx;                       \
-       mov     8 sched,%a3;                    \
-       xor     table+2*tlen(,%tmp,4),%a3;
+
+#define do_fcol(a1, a2, a3, a4, a5, a6, a7, a8)        \
+       mov     0 a8,%a1;                       \
+       movzx   %l(a6),%a7;                     \
+       mov     12 a8,%a2;                      \
+       xor     a5(,%a7,4),%a1;                 \
+       mov     4 a8,%a4;                       \
+       movzx   %h(a6),%a7;                     \
+       shr     $16,%a6;                        \
+       xor     a5+tlen(,%a7,4),%a2;            \
+       movzx   %l(a6),%a7;                     \
+       movzx   %h(a6),%a6;                     \
+       xor     a5+3*tlen(,%a6,4),%a4;          \
+       mov     %a3,%a6;                        \
+       mov     8 a8,%a3;                       \
+       xor     a5+2*tlen(,%a7,4),%a3;
 
 // initialise output registers from the key schedule
-// NB1: original value of a3 is in idx on exit
-// NB2: original values of a1,a2,a4 aren't used
-#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
-       mov     0 sched,%a1;                    \
-       movzx   %l(idx),%tmp;                   \
-       mov     4 sched,%a2;                    \
-       xor     table(,%tmp,4),%a1;             \
-       mov     12 sched,%a4;                   \
-       movzx   %h(idx),%tmp;                   \
-       shr     $16,%idx;                       \
-       xor     table+tlen(,%tmp,4),%a2;        \
-       movzx   %l(idx),%tmp;                   \
-       movzx   %h(idx),%idx;                   \
-       xor     table+3*tlen(,%idx,4),%a4;      \
-       mov     %a3,%idx;                       \
-       mov     8 sched,%a3;                    \
-       xor     table+2*tlen(,%tmp,4),%a3;
+
+#define do_icol(a1, a2, a3, a4, a5, a6, a7, a8)        \
+       mov     0 a8,%a1;                       \
+       movzx   %l(a6),%a7;                     \
+       mov     4 a8,%a2;                       \
+       xor     a5(,%a7,4),%a1;                 \
+       mov     12 a8,%a4;                      \
+       movzx   %h(a6),%a7;                     \
+       shr     $16,%a6;                        \
+       xor     a5+tlen(,%a7,4),%a2;            \
+       movzx   %l(a6),%a7;                     \
+       movzx   %h(a6),%a6;                     \
+       xor     a5+3*tlen(,%a6,4),%a4;          \
+       mov     %a3,%a6;                        \
+       mov     8 a8,%a3;                       \
+       xor     a5+2*tlen(,%a7,4),%a3;
 
 
 // original Gladman had conditional saves to MMX regs.
 #define restore(a1, a2)                \
        mov     4*a2(%esp),%a1
 
-// These macros perform a forward encryption cycle. They are entered with
-// the first previous round column values in r0,r1,r4,r5 and
-// exit with the final values in the same registers, using stack
-// for temporary storage.
-
-// round column values
-// on entry: r0,r1,r4,r5
-// on exit:  r2,r1,r4,r5
-#define fwd_rnd1(arg, table)                                           \
-       save   (0,r1);                                                  \
-       save   (1,r5);                                                  \
-                                                                       \
-       /* compute new column values */                                 \
-       do_fcol(table, r2,r5,r4,r1, r0,r3, arg);        /* idx=r0 */    \
-       do_col (table, r4,r1,r2,r5, r0,r3);             /* idx=r4 */    \
-       restore(r0,0);                                                  \
-       do_col (table, r1,r2,r5,r4, r0,r3);             /* idx=r1 */    \
-       restore(r0,1);                                                  \
-       do_col (table, r5,r4,r1,r2, r0,r3);             /* idx=r5 */
-
-// round column values
-// on entry: r2,r1,r4,r5
-// on exit:  r0,r1,r4,r5
-#define fwd_rnd2(arg, table)                                           \
-       save   (0,r1);                                                  \
-       save   (1,r5);                                                  \
-                                                                       \
-       /* compute new column values */                                 \
-       do_fcol(table, r0,r5,r4,r1, r2,r3, arg);        /* idx=r2 */    \
-       do_col (table, r4,r1,r0,r5, r2,r3);             /* idx=r4 */    \
-       restore(r2,0);                                                  \
-       do_col (table, r1,r0,r5,r4, r2,r3);             /* idx=r1 */    \
-       restore(r2,1);                                                  \
-       do_col (table, r5,r4,r1,r0, r2,r3);             /* idx=r5 */
-
-// These macros performs an inverse encryption cycle. They are entered with
-// the first previous round column values in r0,r1,r4,r5 and
-// exit with the final values in the same registers, using stack
-// for temporary storage
-
-// round column values
-// on entry: r0,r1,r4,r5
-// on exit:  r2,r1,r4,r5
-#define inv_rnd1(arg, table)                                           \
-       save    (0,r1);                                                 \
-       save    (1,r5);                                                 \
-                                                                       \
-       /* compute new column values */                                 \
-       do_icol(table, r2,r1,r4,r5, r0,r3, arg);        /* idx=r0 */    \
-       do_col (table, r4,r5,r2,r1, r0,r3);             /* idx=r4 */    \
-       restore(r0,0);                                                  \
-       do_col (table, r1,r4,r5,r2, r0,r3);             /* idx=r1 */    \
-       restore(r0,1);                                                  \
-       do_col (table, r5,r2,r1,r4, r0,r3);             /* idx=r5 */
-
-// round column values
-// on entry: r2,r1,r4,r5
-// on exit:  r0,r1,r4,r5
-#define inv_rnd2(arg, table)                                           \
-       save    (0,r1);                                                 \
-       save    (1,r5);                                                 \
-                                                                       \
-       /* compute new column values */                                 \
-       do_icol(table, r0,r1,r4,r5, r2,r3, arg);        /* idx=r2 */    \
-       do_col (table, r4,r5,r0,r1, r2,r3);             /* idx=r4 */    \
-       restore(r2,0);                                                  \
-       do_col (table, r1,r4,r5,r0, r2,r3);             /* idx=r1 */    \
-       restore(r2,1);                                                  \
-       do_col (table, r5,r0,r1,r4, r2,r3);             /* idx=r5 */
+// This macro performs a forward encryption cycle. It is entered with
+// the first previous round column values in r0, r1, r4 and r5 and
+// exits with the final values in the same registers, using the MMX
+// registers mm0-mm1 or the stack for temporary storage
+
+// mov current column values into the MMX registers
+#define fwd_rnd(arg, table)                                    \
+       /* mov current column values into the MMX registers */  \
+       mov     %r0,%r2;                                        \
+       save   (0,r1);                                          \
+       save   (1,r5);                                          \
+                                                               \
+       /* compute new column values */                         \
+       do_fcol(r0,r5,r4,r1,table, r2,r3, arg);                 \
+       do_col (r4,r1,r0,r5,table, r2,r3);                      \
+       restore(r2,0);                                          \
+       do_col (r1,r0,r5,r4,table, r2,r3);                      \
+       restore(r2,1);                                          \
+       do_col (r5,r4,r1,r0,table, r2,r3);
+
+// This macro performs an inverse encryption cycle. It is entered with
+// the first previous round column values in r0, r1, r4 and r5 and
+// exits with the final values in the same registers, using the MMX
+// registers mm0-mm1 or the stack for temporary storage
+
+#define inv_rnd(arg, table)                                    \
+       /* mov current column values into the MMX registers */  \
+       mov     %r0,%r2;                                        \
+       save    (0,r1);                                         \
+       save    (1,r5);                                         \
+                                                               \
+       /* compute new column values */                         \
+       do_icol(r0,r1,r4,r5, table, r2,r3, arg);                \
+       do_col (r4,r5,r0,r1, table, r2,r3);                     \
+       restore(r2,0);                                          \
+       do_col (r1,r4,r5,r0, table, r2,r3);                     \
+       restore(r2,1);                                          \
+       do_col (r5,r0,r1,r4, table, r2,r3);
 
 // AES (Rijndael) Encryption Subroutine
 
 aes_enc_blk:
        push    %ebp
        mov     ctx(%esp),%ebp      // pointer to context
+       xor     %eax,%eax
 
 // CAUTION: the order and the values used in these assigns 
 // rely on the register mappings
@@ -240,9 +208,7 @@ aes_enc_blk:
        push    %esi
        mov     nrnd(%ebp),%r3   // number of rounds
        push    %edi
-#if ekey != 0
-       lea     ekey(%ebp),%ebp  // key pointer
-#endif
+       lea     ekey(%ebp),%r6   // key pointer
 
 // input four columns and xor in first round key
 
@@ -250,47 +216,47 @@ aes_enc_blk:
        mov     4(%r2),%r1
        mov     8(%r2),%r4
        mov     12(%r2),%r5
-       xor     (%ebp),%r0
-       xor     4(%ebp),%r1
-       xor     8(%ebp),%r4
-       xor     12(%ebp),%r5
+       xor     (%r6),%r0
+       xor     4(%r6),%r1
+       xor     8(%r6),%r4
+       xor     12(%r6),%r5
 
        sub     $8,%esp           // space for register saves on stack
-       add     $16,%ebp          // increment to next round key
+       add     $16,%r6           // increment to next round key   
        sub     $10,%r3          
        je      4f              // 10 rounds for 128-bit key
-       add     $32,%ebp
+       add     $32,%r6
        sub     $2,%r3
        je      3f              // 12 rounds for 128-bit key
-       add     $32,%ebp
-
-2:     fwd_rnd1( -64(%ebp) ,ft_tab)    // 14 rounds for 128-bit key
-       fwd_rnd2( -48(%ebp) ,ft_tab)
-3:     fwd_rnd1( -32(%ebp) ,ft_tab)    // 12 rounds for 128-bit key
-       fwd_rnd2( -16(%ebp) ,ft_tab)
-4:     fwd_rnd1(    (%ebp) ,ft_tab)    // 10 rounds for 128-bit key
-       fwd_rnd2( +16(%ebp) ,ft_tab)
-       fwd_rnd1( +32(%ebp) ,ft_tab)
-       fwd_rnd2( +48(%ebp) ,ft_tab)
-       fwd_rnd1( +64(%ebp) ,ft_tab)
-       fwd_rnd2( +80(%ebp) ,ft_tab)
-       fwd_rnd1( +96(%ebp) ,ft_tab)
-       fwd_rnd2(+112(%ebp) ,ft_tab)
-       fwd_rnd1(+128(%ebp) ,ft_tab)
-       fwd_rnd2(+144(%ebp) ,fl_tab)    // last round uses a different table
+       add     $32,%r6
+
+2:     fwd_rnd( -64(%r6) ,ft_tab)      // 14 rounds for 128-bit key
+       fwd_rnd( -48(%r6) ,ft_tab)
+3:     fwd_rnd( -32(%r6) ,ft_tab)      // 12 rounds for 128-bit key
+       fwd_rnd( -16(%r6) ,ft_tab)
+4:     fwd_rnd(    (%r6) ,ft_tab)      // 10 rounds for 128-bit key
+       fwd_rnd( +16(%r6) ,ft_tab)
+       fwd_rnd( +32(%r6) ,ft_tab)
+       fwd_rnd( +48(%r6) ,ft_tab)
+       fwd_rnd( +64(%r6) ,ft_tab)
+       fwd_rnd( +80(%r6) ,ft_tab)
+       fwd_rnd( +96(%r6) ,ft_tab)
+       fwd_rnd(+112(%r6) ,ft_tab)
+       fwd_rnd(+128(%r6) ,ft_tab)
+       fwd_rnd(+144(%r6) ,fl_tab)      // last round uses a different table
 
 // move final values to the output array.  CAUTION: the 
 // order of these assigns rely on the register mappings
 
        add     $8,%esp
-       mov     out_blk+12(%esp),%ebp
-       mov     %r5,12(%ebp)
+       mov     out_blk+12(%esp),%r6
+       mov     %r5,12(%r6)
        pop     %edi
-       mov     %r4,8(%ebp)
+       mov     %r4,8(%r6)
        pop     %esi
-       mov     %r1,4(%ebp)
+       mov     %r1,4(%r6)
        pop     %ebx
-       mov     %r0,(%ebp)
+       mov     %r0,(%r6)
        pop     %ebp
        mov     $1,%eax
        ret
@@ -307,6 +273,7 @@ aes_enc_blk:
 aes_dec_blk:
        push    %ebp
        mov     ctx(%esp),%ebp       // pointer to context
+       xor     %eax,%eax
 
 // CAUTION: the order and the values used in these assigns 
 // rely on the register mappings
@@ -316,12 +283,10 @@ aes_dec_blk:
        push    %esi
        mov     nrnd(%ebp),%r3   // number of rounds
        push    %edi
-#if dkey != 0
-       lea     dkey(%ebp),%ebp  // key pointer
-#endif
+       lea     dkey(%ebp),%r6   // key pointer
        mov     %r3,%r0
        shl     $4,%r0
-       add     %r0,%ebp
+       add     %r0,%r6
        
 // input four columns and xor in first round key
 
@@ -329,47 +294,47 @@ aes_dec_blk:
        mov     4(%r2),%r1
        mov     8(%r2),%r4
        mov     12(%r2),%r5
-       xor     (%ebp),%r0
-       xor     4(%ebp),%r1
-       xor     8(%ebp),%r4
-       xor     12(%ebp),%r5
+       xor     (%r6),%r0
+       xor     4(%r6),%r1
+       xor     8(%r6),%r4
+       xor     12(%r6),%r5
 
-       sub     $8,%esp         // space for register saves on stack
-       sub     $16,%ebp        // increment to next round key
+       sub     $8,%esp           // space for register saves on stack
+       sub     $16,%r6           // increment to next round key   
        sub     $10,%r3          
        je      4f              // 10 rounds for 128-bit key
-       sub     $32,%ebp
+       sub     $32,%r6
        sub     $2,%r3
        je      3f              // 12 rounds for 128-bit key
-       sub     $32,%ebp
-
-2:     inv_rnd1( +64(%ebp), it_tab)    // 14 rounds for 128-bit key
-       inv_rnd2( +48(%ebp), it_tab)
-3:     inv_rnd1( +32(%ebp), it_tab)    // 12 rounds for 128-bit key
-       inv_rnd2( +16(%ebp), it_tab)
-4:     inv_rnd1(    (%ebp), it_tab)    // 10 rounds for 128-bit key
-       inv_rnd2( -16(%ebp), it_tab)
-       inv_rnd1( -32(%ebp), it_tab)
-       inv_rnd2( -48(%ebp), it_tab)
-       inv_rnd1( -64(%ebp), it_tab)
-       inv_rnd2( -80(%ebp), it_tab)
-       inv_rnd1( -96(%ebp), it_tab)
-       inv_rnd2(-112(%ebp), it_tab)
-       inv_rnd1(-128(%ebp), it_tab)
-       inv_rnd2(-144(%ebp), il_tab)    // last round uses a different table
+       sub     $32,%r6
+
+2:     inv_rnd( +64(%r6), it_tab)      // 14 rounds for 128-bit key 
+       inv_rnd( +48(%r6), it_tab)
+3:     inv_rnd( +32(%r6), it_tab)      // 12 rounds for 128-bit key
+       inv_rnd( +16(%r6), it_tab)
+4:     inv_rnd(    (%r6), it_tab)      // 10 rounds for 128-bit key
+       inv_rnd( -16(%r6), it_tab)
+       inv_rnd( -32(%r6), it_tab)
+       inv_rnd( -48(%r6), it_tab)
+       inv_rnd( -64(%r6), it_tab)
+       inv_rnd( -80(%r6), it_tab)
+       inv_rnd( -96(%r6), it_tab)
+       inv_rnd(-112(%r6), it_tab)
+       inv_rnd(-128(%r6), it_tab)
+       inv_rnd(-144(%r6), il_tab)      // last round uses a different table
 
 // move final values to the output array.  CAUTION: the 
 // order of these assigns rely on the register mappings
 
        add     $8,%esp
-       mov     out_blk+12(%esp),%ebp
-       mov     %r5,12(%ebp)
+       mov     out_blk+12(%esp),%r6
+       mov     %r5,12(%r6)
        pop     %edi
-       mov     %r4,8(%ebp)
+       mov     %r4,8(%r6)
        pop     %esi
-       mov     %r1,4(%ebp)
+       mov     %r1,4(%r6)
        pop     %ebx
-       mov     %r0,(%ebp)
+       mov     %r0,(%r6)
        pop     %ebp
        mov     $1,%eax
        ret