movl %edx,%ecx
shrl $6,%ecx
jz .Lhandle_tail
-
+
.p2align 4
.Lloop_64:
decl %ecx
-
+
movq (%rsi),%r11
movq 8(%rsi),%r8
movq %r9,2*8(%rdi)
movq %r10,3*8(%rdi)
-
+
movq 4*8(%rsi),%r11
movq 5*8(%rsi),%r8
shrl $3,%ecx
jz .Lhandle_7
.p2align 4
-.Lloop_8:
+.Lloop_8:
decl %ecx
movq (%rsi),%r8
- movq %r8,(%rdi)
+ movq %r8,(%rdi)
leaq 8(%rdi),%rdi
leaq 8(%rsi),%rsi
jnz .Lloop_8
.p2align 4
.Lloop_1:
movb (%rsi),%r8b
- movb %r8b,(%rdi)
+ movb %r8b,(%rdi)
incq %rdi
incq %rsi
decl %ecx
jnz .Lloop_1
-
-.Lende:
+
+.Lende:
popq %rbx
ret
.Lfinal:
-
- /* C stepping K8 run faster using the string copy instructions.
+
+ /* Some CPUs run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */
-
+
.section .altinstructions,"a"
.align 8
.quad memcpy
.quad memcpy_c
- .byte X86_FEATURE_K8_C
+ .byte X86_FEATURE_REP_GOOD	/* feature byte stored in this entry; presumably the CPU flag that selects memcpy_c over memcpy - confirm against the code that consumes .altinstructions */
.byte .Lfinal-memcpy
- .byte memcpy_c_end-memcpy_c
+ .byte memcpy_c_end-memcpy_c	/* byte distance from memcpy_c to memcpy_c_end, i.e. size of the code placed in .altinstr_replacement (the line above is the matching distance from memcpy to .Lfinal) */
.previous
.section .altinstr_replacement,"ax"
/* rdi destination
* rsi source
* rdx count
- */
+ */
memcpy_c:
movq %rdi,%rax
movl %edx,%ecx