X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=arch%2Fx86_64%2Flib%2Fmemset.S;h=2c5948116bd21731d4a85712fddd00d3943fd728;hb=refs%2Fheads%2Fvserver;hp=4b4c40638640ee8128510155b36dc5550044098a;hpb=5273a3df6485dc2ad6aa7ddd441b9a21970f003b;p=linux-2.6.git

diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 4b4c40638..2c5948116 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -1,4 +1,8 @@
 /* Copyright 2002 Andi Kleen, SuSE Labs */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
 /*
  * ISO C memset - set a memory block to a byte value.
  *
@@ -8,11 +12,29 @@
  *
  * rax   original destination
  */
-	.globl __memset
-	.globl memset
-	.p2align 4
-memset:
-__memset:
+	ALIGN
+memset_c:
+	CFI_STARTPROC
+	movq %rdi,%r9
+	movl %edx,%r8d
+	andl $7,%r8d
+	movl %edx,%ecx
+	shrl $3,%ecx
+	/* expand byte value */
+	movzbl %sil,%esi
+	movabs $0x0101010101010101,%rax
+	mulq %rsi		/* with rax, clobbers rdx */
+	rep stosq
+	movl %r8d,%ecx
+	rep stosb
+	movq %r9,%rax
+	ret
+	CFI_ENDPROC
+ENDPROC(memset_c)
+
+ENTRY(memset)
+ENTRY(__memset)
+	CFI_STARTPROC
 	movq %rdi,%r10
 	movq %rdx,%r11
 
@@ -22,32 +44,33 @@ __memset:
 	mul %rcx		/* with rax, clobbers rdx */
 
 	/* align dst */
-	movl %edi,%r9d
-	andl $7,%r9d
+	movl %edi,%r9d
+	andl $7,%r9d
 	jnz .Lbad_alignment
+	CFI_REMEMBER_STATE
 .Lafter_bad_alignment:
-
+
 	movl %r11d,%ecx
 	shrl $6,%ecx
 	jz .Lhandle_tail
 
 	.p2align 4
-.Lloop_64:
+.Lloop_64:
 	decl %ecx
-	movq %rax,(%rdi)
-	movq %rax,8(%rdi)
-	movq %rax,16(%rdi)
-	movq %rax,24(%rdi)
-	movq %rax,32(%rdi)
-	movq %rax,40(%rdi)
-	movq %rax,48(%rdi)
-	movq %rax,56(%rdi)
+	movq %rax,(%rdi)
+	movq %rax,8(%rdi)
+	movq %rax,16(%rdi)
+	movq %rax,24(%rdi)
+	movq %rax,32(%rdi)
+	movq %rax,40(%rdi)
+	movq %rax,48(%rdi)
+	movq %rax,56(%rdi)
 	leaq 64(%rdi),%rdi
 	jnz .Lloop_64
 
 	/* Handle tail in loops. The loops should be faster than hard
-	   to predict jump tables. */
-	.p2align 4
+	   to predict jump tables. */
+	.p2align 4
 .Lhandle_tail:
 	movl %r11d,%ecx
 	andl $63&(~7),%ecx
@@ -70,56 +93,41 @@ __memset:
 	movb %al,(%rdi)
 	leaq 1(%rdi),%rdi
 	jnz .Lloop_1
-
-.Lende:
+
+.Lende:
 	movq %r10,%rax
 	ret
 
+	CFI_RESTORE_STATE
 .Lbad_alignment:
 	cmpq $7,%r11
 	jbe .Lhandle_7
 	movq %rax,(%rdi)	/* unaligned store */
-	movq $8,%r8
-	subq %r9,%r8
+	movq $8,%r8
+	subq %r9,%r8
 	addq %r8,%rdi
 	subq %r8,%r11
 	jmp .Lafter_bad_alignment
 
+.Lfinal:
+	CFI_ENDPROC
+ENDPROC(memset)
+ENDPROC(__memset)
-	/* C stepping K8 run faster using the string instructions.
+	/* Some CPUs run faster using the string instructions.
 	   It is also a lot simpler. Use this when possible */
 
-#include <asm/cpufeature.h>
-
-	.section .altinstructions,"a"
-	.align 8
-	.quad memset
-	.quad memset_c
-	.byte X86_FEATURE_K8_C
-	.byte memset_c_end-memset_c
-	.byte memset_c_end-memset_c
-	.previous
+#include <asm/cpufeature.h>
 
 	.section .altinstr_replacement,"ax"
-	/* rdi	destination
-	 * rsi	value
-	 * rdx	count
-	 */
-memset_c:
-	movq %rdi,%r9
-	movl %edx,%r8d
-	andl $7,%r8d
-	movl %edx,%ecx
-	shrl $3,%ecx
-	/* expand byte value */
-	movzbl %sil,%esi
-	movabs $0x0101010101010101,%rax
-	mulq %rsi		/* with rax, clobbers rdx */
-	rep
-	stosq
-	movl %r8d,%ecx
-	rep
-	stosb
-	movq %r9,%rax
-	ret
-memset_c_end:
+1:	.byte 0xeb				/* jmp <disp8> */
+	.byte (memset_c - memset) - (2f - 1b)	/* offset */
+2:
+	.previous
+	.section .altinstructions,"a"
+	.align 8
+	.quad memset
+	.quad 1b
+	.byte X86_FEATURE_REP_GOOD
+	.byte .Lfinal - memset
+	.byte 2b - 1b
 	.previous
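
Note on the replacement routine: memset_c broadcasts the fill byte into all eight
byte lanes of a 64-bit register with `movabs $0x0101010101010101,%rax; mulq %rsi`
(a byte value times 0x0101010101010101 never carries between lanes), then lets
`rep stosq` store whole qwords and `rep stosb` finish the remaining 0-7 bytes.
A minimal C sketch of the same split follows; it is illustrative only, and the
helper name and the use of memcpy for the 8-byte store are mine, not part of
the patch.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Illustrative only: mirrors the structure of memset_c above.
	 * Expand the fill byte into a 64-bit pattern, store whole qwords
	 * (what `rep stosq` does), then finish the 0-7 leftover bytes one
	 * at a time (what `rep stosb` does). */
	static void *memset_qword_sketch(void *dst, int c, size_t count)
	{
		uint64_t pattern = (uint8_t)c * 0x0101010101010101ULL; /* byte -> 8 lanes */
		unsigned char *p = dst;
		size_t qwords = count >> 3;	/* shrl $3,%ecx */
		size_t tail   = count & 7;	/* andl $7,%r8d */

		while (qwords--) {
			memcpy(p, &pattern, 8);	/* one 8-byte store */
			p += 8;
		}
		while (tail--)
			*p++ = (unsigned char)c;

		return dst;			/* rax = original destination */
	}

	int main(void)
	{
		char buf[37];
		memset_qword_sketch(buf, 0xAB, sizeof(buf));
		printf("%02x %02x\n", (unsigned char)buf[0], (unsigned char)buf[36]);
		return 0;
	}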
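
Note on the patching mechanism: the new `.altinstructions` record pairs the
`memset` entry point with a 2-byte replacement (the short `jmp` into memset_c
emitted in `.altinstr_replacement`), keyed on the X86_FEATURE_REP_GOOD bit.
At boot the alternatives code walks this table and, on CPUs that report the
feature, copies the replacement bytes over the start of `memset` and pads the
rest of the site with NOPs, so `memset` effectively becomes a jump straight
into the `rep stos` version. The record emitted by the `.quad`/`.byte`
directives roughly corresponds to the struct below; the field names follow the
kernel's struct alt_instr of this era, but treat this as an illustration of
the layout rather than the authoritative definition.

	#include <stdint.h>

	/*
	 * Sketch of one .altinstructions record as emitted above:
	 *   .quad memset             -> instr          (code site to patch)
	 *   .quad 1b                 -> replacement    (the 2-byte short jmp)
	 *   .byte X86_FEATURE_...    -> cpuid          (feature bit that enables it)
	 *   .byte .Lfinal - memset   -> instrlen       (bytes available at the site)
	 *   .byte 2b - 1b            -> replacementlen (bytes actually copied in)
	 */
	struct alt_instr_sketch {
		uint8_t *instr;          /* original instruction sequence */
		uint8_t *replacement;    /* bytes to copy over it */
		uint8_t  cpuid;          /* CPU feature bit required */
		uint8_t  instrlen;       /* length of the original site */
		uint8_t  replacementlen; /* length of the replacement, <= instrlen */
		uint8_t  pad[5];         /* keeps records 8-byte aligned, per .align 8 */
	};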