@@ -1,30 +1,38 @@
 /* Copyright 2002 Andi Kleen */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
+
 #include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
 
 /*
  * memcpy - Copy a memory block.
  *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
+ * Input:
+ *	rdi destination
+ *	rsi source
+ *	rdx count
+ *
  * Output:
  * rax original destination
- */
+ */
+
+/*
+ * memcpy_c() - fast string ops (REP MOVSQ) based variant.
+ *
+ * Calls to this get patched into the kernel image via the
+ * alternative instructions framework:
+ */
 	ALIGN
 memcpy_c:
 	CFI_STARTPROC
-	movq %rdi,%rax
-	movl %edx,%ecx
-	shrl $3,%ecx
-	andl $7,%edx
+	movq %rdi, %rax
+
+	movl %edx, %ecx
+	shrl $3, %ecx
+	andl $7, %edx
 	rep movsq
-	movl %edx,%ecx
+	movl %edx, %ecx
 	rep movsb
 	ret
 	CFI_ENDPROC
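
[ Worked example, for illustration: with a byte count of 100 in %rdx,
  "shrl $3" leaves %ecx = 12, so REP MOVSQ copies twelve quadwords
  (96 bytes), and "andl $7" leaves %edx = 4, so the trailing REP MOVSB
  copies the remaining 4 bytes. ]
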
@@ -33,92 +41,110 @@ ENDPROC(memcpy_c)
 
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
-	movq %rdi,%rax
 
-	movl %edx,%ecx
-	shrl $6,%ecx
+	/*
+	 * Put the number of full 64-byte blocks into %ecx.
+	 * Tail portion is handled at the end:
+	 */
+	movq %rdi, %rax
+	movl %edx, %ecx
+	shrl $6, %ecx
 	jz .Lhandle_tail
 
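[ For illustration: with a count of 200, "shrl $6" yields %ecx = 3, so
  .Lloop_64 below copies 3 x 64 = 192 bytes; the remaining 8 bytes stay
  in the low 6 bits of %edx for .Lhandle_tail. ]
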
 	.p2align 4
 .Lloop_64:
+	/*
+	 * We decrement the loop index here - and the zero-flag is
+	 * checked at the end of the loop (instructions in between do
+	 * not change the zero flag):
+	 */
 	decl %ecx
 
-	movq (%rsi),%r11
-	movq 8(%rsi),%r8
+	/*
+	 * Move in blocks of 4x16 bytes:
+	 */
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r8
+	movq %r11, 0*8(%rdi)
+	movq %r8, 1*8(%rdi)
 
-	movq %r11,(%rdi)
-	movq %r8,1*8(%rdi)
+	movq 2*8(%rsi), %r9
+	movq 3*8(%rsi), %r10
+	movq %r9, 2*8(%rdi)
+	movq %r10, 3*8(%rdi)
 
-	movq 2*8(%rsi),%r9
-	movq 3*8(%rsi),%r10
+	movq 4*8(%rsi), %r11
+	movq 5*8(%rsi), %r8
+	movq %r11, 4*8(%rdi)
+	movq %r8, 5*8(%rdi)
 
-	movq %r9,2*8(%rdi)
-	movq %r10,3*8(%rdi)
+	movq 6*8(%rsi), %r9
+	movq 7*8(%rsi), %r10
+	movq %r9, 6*8(%rdi)
+	movq %r10, 7*8(%rdi)
 
-	movq 4*8(%rsi),%r11
-	movq 5*8(%rsi),%r8
+	leaq 64(%rsi), %rsi
+	leaq 64(%rdi), %rdi
 
-	movq %r11,4*8(%rdi)
-	movq %r8,5*8(%rdi)
-
-	movq 6*8(%rsi),%r9
-	movq 7*8(%rsi),%r10
-
-	movq %r9,6*8(%rdi)
-	movq %r10,7*8(%rdi)
-
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
 	jnz .Lloop_64
 
 .Lhandle_tail:
-	movl %edx,%ecx
-	andl $63,%ecx
-	shrl $3,%ecx
+	movl %edx, %ecx
+	andl $63, %ecx
+	shrl $3, %ecx
 	jz .Lhandle_7
+
 	.p2align 4
 .Lloop_8:
 	decl %ecx
-	movq (%rsi),%r8
-	movq %r8,(%rdi)
-	leaq 8(%rdi),%rdi
-	leaq 8(%rsi),%rsi
+	movq (%rsi), %r8
+	movq %r8, (%rdi)
+	leaq 8(%rdi), %rdi
+	leaq 8(%rsi), %rsi
 	jnz .Lloop_8
 
 .Lhandle_7:
-	movl %edx,%ecx
-	andl $7,%ecx
-	jz .Lende
+	movl %edx, %ecx
+	andl $7, %ecx
+	jz .Lend
+
 	.p2align 4
 .Lloop_1:
-	movb (%rsi),%r8b
-	movb %r8b,(%rdi)
+	movb (%rsi), %r8b
+	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
 	decl %ecx
 	jnz .Lloop_1
 
-.Lende:
+.Lend:
 	ret
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
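[ For illustration: the tail path splits the low six bits of the count
  into qwords plus bytes - e.g. 29 leftover bytes are copied as three
  qwords by .Lloop_8 (29 >> 3 = 3) and five single bytes by .Lloop_1
  (29 & 7 = 5). ]
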
-	/* Some CPUs run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
+	/*
+	 * Some CPUs run faster using the string copy instructions.
+	 * It is also a lot simpler. Use this when possible:
+	 */
 
-	.section .altinstr_replacement,"ax"
+	.section .altinstr_replacement, "ax"
 1:	.byte 0xeb				/* jmp <disp8> */
 	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
 2:
 	.previous
-	.section .altinstructions,"a"
+
+	.section .altinstructions, "a"
 	.align 8
 	.quad memcpy
 	.quad 1b
 	.byte X86_FEATURE_REP_GOOD
-	/* Replace only beginning, memcpy is used to apply alternatives, so it
-	 * is silly to overwrite itself with nops - reboot is only outcome... */
+
+	/*
+	 * Replace only beginning, memcpy is used to apply alternatives,
+	 * so it is silly to overwrite itself with nops - reboot is the
+	 * only outcome...
+	 */
 	.byte 2b - 1b
 	.byte 2b - 1b
 	.previous
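
[ A minimal sketch of the net effect, assuming a CPU with REP_GOOD set:
  the alternatives patcher copies the two replacement bytes over the
  start of memcpy, so the patched entry effectively begins with

	memcpy:
		jmp memcpy_c	/* two-byte jmp <disp8> */

  The disp8 operand "(memcpy_c - memcpy) - (2f - 1b)" is the distance
  minus the size of the jmp itself, which is exactly what a relative
  jump placed at memcpy needs in order to land on memcpy_c. The two
  ".byte 2b - 1b" fields give the patched-over and replacement lengths,
  both 2 bytes. ]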