X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=include%2Fasm-i386%2Fstring.h;h=bb5f88a27f7a8e23711afa0aa8ca5f30d1ed95c7;hb=43bc926fffd92024b46cafaf7350d669ba9ca884;hp=fc66609bb76142e19c3adf129d26243640a84299;hpb=9bf4aaab3e101692164d49b7ca357651eb691cb6;p=linux-2.6.git diff --git a/include/asm-i386/string.h b/include/asm-i386/string.h index fc66609bb..bb5f88a27 100644 --- a/include/asm-i386/string.h +++ b/include/asm-i386/string.h @@ -25,7 +25,6 @@ /* AK: in fact I bet it would be better to move this stuff all out of line. */ -#if !defined(IN_STRING_C) #define __HAVE_ARCH_STRCPY static inline char * strcpy(char * dest,const char *src) @@ -117,7 +116,8 @@ __asm__ __volatile__( "orb $1,%%al\n" "3:" :"=a" (__res), "=&S" (d0), "=&D" (d1) - :"1" (cs),"2" (ct)); + :"1" (cs),"2" (ct) + :"memory"); return __res; } @@ -139,8 +139,9 @@ __asm__ __volatile__( "3:\tsbbl %%eax,%%eax\n\t" "orb $1,%%al\n" "4:" - :"=a" (__res), "=&S" (d0), "=&D" (d1), "=&c" (d2) - :"1" (cs),"2" (ct),"3" (count)); + :"=a" (__res), "=&S" (d0), "=&D" (d1), "=&c" (d2) + :"1" (cs),"2" (ct),"3" (count) + :"memory"); return __res; } @@ -159,7 +160,9 @@ __asm__ __volatile__( "movl $1,%1\n" "2:\tmovl %1,%0\n\t" "decl %0" - :"=a" (__res), "=&S" (d0) : "1" (s),"0" (c)); + :"=a" (__res), "=&S" (d0) + :"1" (s),"0" (c) + :"memory"); return __res; } @@ -176,12 +179,12 @@ __asm__ __volatile__( "leal -1(%%esi),%0\n" "2:\ttestb %%al,%%al\n\t" "jne 1b" - :"=g" (__res), "=&S" (d0), "=&a" (d1) :"0" (0),"1" (s),"2" (c)); + :"=g" (__res), "=&S" (d0), "=&a" (d1) + :"0" (0),"1" (s),"2" (c) + :"memory"); return __res; } -#endif - #define __HAVE_ARCH_STRLEN static inline size_t strlen(const char * s) { @@ -192,56 +195,91 @@ __asm__ __volatile__( "scasb\n\t" "notl %0\n\t" "decl %0" - :"=c" (__res), "=&D" (d0) :"1" (s),"a" (0), "0" (0xffffffffu)); + :"=c" (__res), "=&D" (d0) + :"1" (s),"a" (0), "0" (0xffffffffu) + :"memory"); return __res; } -static inline void * __memcpy(void * to, const void * from, size_t n) +static __always_inline void * __memcpy(void * to, const void * from, size_t n) { int d0, d1, d2; __asm__ __volatile__( "rep ; movsl\n\t" - "testb $2,%b4\n\t" - "je 1f\n\t" - "movsw\n" - "1:\ttestb $1,%b4\n\t" - "je 2f\n\t" - "movsb\n" - "2:" + "movl %4,%%ecx\n\t" + "andl $3,%%ecx\n\t" +#if 1 /* want to pay 2 byte penalty for a chance to skip microcoded rep? */ + "jz 1f\n\t" +#endif + "rep ; movsb\n\t" + "1:" : "=&c" (d0), "=&D" (d1), "=&S" (d2) - :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from) + : "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from) : "memory"); return (to); } /* - * This looks horribly ugly, but the compiler can optimize it totally, + * This looks ugly, but the compiler can optimize it totally, * as the count is constant. */ -static inline void * __constant_memcpy(void * to, const void * from, size_t n) +static __always_inline void * __constant_memcpy(void * to, const void * from, size_t n) { - if (n <= 128) - return __builtin_memcpy(to, from, n); - -#define COMMON(x) \ -__asm__ __volatile__( \ - "rep ; movsl" \ - x \ - : "=&c" (d0), "=&D" (d1), "=&S" (d2) \ - : "0" (n/4),"1" ((long) to),"2" ((long) from) \ - : "memory"); -{ - int d0, d1, d2; + long esi, edi; + if (!n) return to; +#if 1 /* want to do small copies with non-string ops? */ + switch (n) { + case 1: *(char*)to = *(char*)from; return to; + case 2: *(short*)to = *(short*)from; return to; + case 4: *(int*)to = *(int*)from; return to; +#if 1 /* including those doable with two moves? */ + case 3: *(short*)to = *(short*)from; + *((char*)to+2) = *((char*)from+2); return to; + case 5: *(int*)to = *(int*)from; + *((char*)to+4) = *((char*)from+4); return to; + case 6: *(int*)to = *(int*)from; + *((short*)to+2) = *((short*)from+2); return to; + case 8: *(int*)to = *(int*)from; + *((int*)to+1) = *((int*)from+1); return to; +#endif + } +#endif + esi = (long) from; + edi = (long) to; + if (n >= 5*4) { + /* large block: use rep prefix */ + int ecx; + __asm__ __volatile__( + "rep ; movsl" + : "=&c" (ecx), "=&D" (edi), "=&S" (esi) + : "0" (n/4), "1" (edi),"2" (esi) + : "memory" + ); + } else { + /* small block: don't clobber ecx + smaller code */ + if (n >= 4*4) __asm__ __volatile__("movsl" + :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); + if (n >= 3*4) __asm__ __volatile__("movsl" + :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); + if (n >= 2*4) __asm__ __volatile__("movsl" + :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); + if (n >= 1*4) __asm__ __volatile__("movsl" + :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); + } switch (n % 4) { - case 0: COMMON(""); return to; - case 1: COMMON("\n\tmovsb"); return to; - case 2: COMMON("\n\tmovsw"); return to; - default: COMMON("\n\tmovsw\n\tmovsb"); return to; + /* tail */ + case 0: return to; + case 1: __asm__ __volatile__("movsb" + :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); + return to; + case 2: __asm__ __volatile__("movsw" + :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); + return to; + default: __asm__ __volatile__("movsw\n\tmovsb" + :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); + return to; } } - -#undef COMMON -} #define __HAVE_ARCH_MEMCPY @@ -303,7 +341,9 @@ __asm__ __volatile__( "je 1f\n\t" "movl $1,%0\n" "1:\tdecl %0" - :"=D" (__res), "=&c" (d0) : "a" (c),"0" (cs),"1" (count)); + :"=D" (__res), "=&c" (d0) + :"a" (c),"0" (cs),"1" (count) + :"memory"); return __res; } @@ -327,7 +367,7 @@ return s; * things 32 bits at a time even when we don't know the size of the * area at compile-time.. */ -static inline void * __constant_c_memset(void * s, unsigned long c, size_t count) +static __always_inline void * __constant_c_memset(void * s, unsigned long c, size_t count) { int d0, d1; __asm__ __volatile__( @@ -339,7 +379,7 @@ __asm__ __volatile__( "je 2f\n\t" "stosb\n" "2:" - : "=&c" (d0), "=&D" (d1) + :"=&c" (d0), "=&D" (d1) :"a" (c), "q" (count), "0" (count/4), "1" ((long) s) :"memory"); return (s); @@ -362,7 +402,8 @@ __asm__ __volatile__( "jne 1b\n" "3:\tsubl %2,%0" :"=a" (__res), "=&d" (d0) - :"c" (s),"1" (count)); + :"c" (s),"1" (count) + :"memory"); return __res; } /* end of additional stuff */ @@ -375,7 +416,7 @@ extern char *strstr(const char *cs, const char *ct); * This looks horribly ugly, but the compiler can optimize it totally, * as we by now know that both pattern and count is constant.. */ -static inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count) +static __always_inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count) { switch (count) { case 0: @@ -443,7 +484,8 @@ static inline void * memscan(void * addr, int c, size_t size) "dec %%edi\n" "1:" : "=D" (addr), "=c" (size) - : "0" (addr), "1" (size), "a" (c)); + : "0" (addr), "1" (size), "a" (c) + : "memory"); return addr; }