diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index c6c46494f..0ea0ddc87 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -1,6 +1,9 @@
 /* Copyright 2002 Andi Kleen */
-
-	#include <asm/cpufeature.h>
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+
 /*
  * memcpy - Copy a memory block.
  *
@@ -13,22 +16,36 @@
  * rax original destination
  */
 
-	.globl __memcpy
-	.globl memcpy
-	.p2align 4
-__memcpy:
-memcpy:
+	ALIGN
+memcpy_c:
+	CFI_STARTPROC
+	movq %rdi,%rax
+	movl %edx,%ecx
+	shrl $3,%ecx
+	andl $7,%edx
+	rep movsq
+	movl %edx,%ecx
+	rep movsb
+	ret
+	CFI_ENDPROC
+ENDPROC(memcpy_c)
+
+ENTRY(__memcpy)
+ENTRY(memcpy)
+	CFI_STARTPROC
 	pushq %rbx
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rbx, 0
 	movq %rdi,%rax
 
 	movl %edx,%ecx
 	shrl $6,%ecx
 	jz .Lhandle_tail
-	
+
 	.p2align 4
 .Lloop_64:
 	decl %ecx
-	
+
 	movq (%rsi),%r11
 	movq 8(%rsi),%r8
 
@@ -40,7 +57,7 @@ memcpy:
 
 	movq %r9,2*8(%rdi)
 	movq %r10,3*8(%rdi)
-	
+
 	movq 4*8(%rsi),%r11
 	movq 5*8(%rsi),%r8
 
@@ -63,10 +80,10 @@
 	shrl $3,%ecx
 	jz .Lhandle_7
 	.p2align 4
-.Lloop_8: 
+.Lloop_8:
 	decl %ecx
 	movq (%rsi),%r8
-	movq %r8,(%rdi) 
+	movq %r8,(%rdi)
 	leaq 8(%rdi),%rdi
 	leaq 8(%rsi),%rsi
 	jnz .Lloop_8
@@ -78,44 +95,35 @@
 	.p2align 4
 .Lloop_1:
 	movb (%rsi),%r8b
-	movb %r8b,(%rdi) 
+	movb %r8b,(%rdi)
 	incq %rdi
 	incq %rsi
 	decl %ecx
 	jnz .Lloop_1
-	
-.Lende: 
+
+.Lende:
 	popq %rbx
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE rbx
 	ret
 .Lfinal:
-	
-	/* C stepping K8 run faster using the string copy instructions.
+	CFI_ENDPROC
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
+
+	/* Some CPUs run faster using the string copy instructions.
 	   It is also a lot simpler. Use this when possible */
-	
-	.section .altinstructions,"a"
-	.align 8
-	.quad memcpy
-	.quad memcpy_c
-	.byte X86_FEATURE_K8_C
-	.byte .Lfinal-memcpy
-	.byte memcpy_c_end-memcpy_c
-	.previous
-
 
 	.section .altinstr_replacement,"ax"
-	/* rdi destination
-	 * rsi source
-	 * rdx count
-	 */
-memcpy_c:
-	movq %rdi,%rax
-	movl %edx,%ecx
-	shrl $3,%ecx
-	andl $7,%edx
-	rep
-	movsq
-	movl %edx,%ecx
-	rep
-	movsb
-	ret
-memcpy_c_end:
+1:	.byte 0xeb				/* jmp <disp8> */
+	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
+2:
+	.previous
+	.section .altinstructions,"a"
+	.align 8
+	.quad memcpy
+	.quad 1b
+	.byte X86_FEATURE_REP_GOOD
+	.byte .Lfinal - memcpy
+	.byte 2b - 1b
+	.previous
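
For readers who want to see what the patched-in variant does, here is a minimal userspace C sketch of the rep-string strategy used by memcpy_c: copy n/8 qwords with rep movsq, then the 0-7 remaining bytes with rep movsb. The helper name rep_string_copy is hypothetical, and the inline asm is GCC-specific and x86-64 only; this is an illustration of the technique, not kernel code.

```c
#include <stdio.h>
#include <string.h>

/*
 * Userspace sketch of memcpy_c above: move the bulk of the buffer
 * eight bytes at a time with "rep movsq", then the 0-7 leftover
 * bytes with "rep movsb".  rep_string_copy is a hypothetical name.
 */
static void *rep_string_copy(void *dst, const void *src, unsigned long n)
{
	void *ret = dst;		/* movq %rdi,%rax */
	unsigned long qwords = n >> 3;	/* shrl $3,%ecx   */
	unsigned long tail = n & 7;	/* andl $7,%edx   */

	/* rep movsq implicitly uses rdi, rsi and rcx, all updated in place */
	asm volatile("rep movsq"
		     : "+D" (dst), "+S" (src), "+c" (qwords)
		     : : "memory");
	asm volatile("rep movsb"
		     : "+D" (dst), "+S" (src), "+c" (tail)
		     : : "memory");
	return ret;
}

int main(void)
{
	char src[20] = "rep-string memcpy";
	char dst[20];

	rep_string_copy(dst, src, sizeof(src));
	printf("%s\n", dst);
	return memcmp(src, dst, sizeof(src)) ? 1 : 0;
}
```

On CPUs that set X86_FEATURE_REP_GOOD the microcoded rep movs sequence is competitive with the unrolled 64-byte loop while taking far fewer instruction bytes, which is the trade-off the comment in the diff alludes to.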
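The replacement stub in the diff is just a two-byte short jump: 0xeb is the opcode for jmp rel8, and the displacement is (memcpy_c - memcpy) - (2f - 1b), i.e. the distance from the end of the two-byte instruction to memcpy_c once the stub has been copied over the start of memcpy. Each .altinstructions entry emitted above corresponds roughly to a record like the following; the field names follow struct alt_instr from this kernel era's alternative.h, though the exact trailing padding is an assumption here, shown only for illustration.

```c
#include <stdint.h>

/*
 * Rough C view of one .altinstructions record as emitted in the
 * diff above.  Field names follow struct alt_instr of this era;
 * the pad bytes are an assumption implied by the ".align 8".
 */
struct alt_instr {
	uint8_t *instr;		/* .quad memcpy: code site to be patched    */
	uint8_t *replacement;	/* .quad 1b: the 2-byte jmp to memcpy_c     */
	uint8_t  cpuid;		/* .byte X86_FEATURE_REP_GOOD: gating bit   */
	uint8_t  instrlen;	/* .byte .Lfinal - memcpy: patchable length */
	uint8_t  replacementlen;/* .byte 2b - 1b: bytes actually copied in  */
	uint8_t  pad[5];	/* keeps each entry 8-byte aligned          */
};
```

At boot, apply_alternatives() walks these records; when the CPU advertises the cpuid feature bit, it copies replacementlen bytes from replacement over instr and fills the remaining instrlen - replacementlen bytes with NOPs, so REP_GOOD machines jump straight into memcpy_c while everything else keeps the unrolled loop.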