@@ -40,84 +40,132 @@
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
+	movq %rdi, %rax
 
 	/*
-	 * Put the number of full 64-byte blocks into %ecx.
-	 * Tail portion is handled at the end:
+	 * Use a 32-bit CMP here to avoid long NOP padding.
 	 */
-	movq %rdi, %rax
-	movl %edx, %ecx
-	shrl $6, %ecx
-	jz .Lhandle_tail
+	cmp $0x20, %edx
+	jb .Lhandle_tail
 
-	.p2align 4
-.Lloop_64:
 	/*
-	 * We decrement the loop index here - and the zero-flag is
-	 * checked at the end of the loop (instructions inbetween do
-	 * not change the zero flag):
+	 * We check whether a memory false dependence could occur,
+	 * then jump to the corresponding copy mode.
 	 */
-	decl %ecx
+	cmp %dil, %sil
+	jl .Lcopy_backward
+	subl $0x20, %edx
+.Lcopy_forward_loop:
+	subq $0x20, %rdx
 
 	/*
-	 * Move in blocks of 4x16 bytes:
+	 * Move in blocks of 4x8 bytes:
 	 */
-	movq 0*8(%rsi), %r11
-	movq 1*8(%rsi), %r8
-	movq %r11, 0*8(%rdi)
-	movq %r8, 1*8(%rdi)
-
-	movq 2*8(%rsi), %r9
-	movq 3*8(%rsi), %r10
-	movq %r9, 2*8(%rdi)
-	movq %r10, 3*8(%rdi)
-
-	movq 4*8(%rsi), %r11
-	movq 5*8(%rsi), %r8
-	movq %r11, 4*8(%rdi)
-	movq %r8, 5*8(%rdi)
-
-	movq 6*8(%rsi), %r9
-	movq 7*8(%rsi), %r10
-	movq %r9, 6*8(%rdi)
-	movq %r10, 7*8(%rdi)
-
-	leaq 64(%rsi), %rsi
-	leaq 64(%rdi), %rdi
-
-	jnz .Lloop_64
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq 2*8(%rsi), %r10
+	movq 3*8(%rsi), %r11
+	leaq 4*8(%rsi), %rsi
+
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, 2*8(%rdi)
+	movq %r11, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae .Lcopy_forward_loop
+	addq $0x20, %rdx
+	jmp .Lhandle_tail
+
+.Lcopy_backward:
+	/*
+	 * Calculate the copy position at the tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * At most 3 ALU operations can issue in one cycle,
+	 * so append NOPs within the same 16-byte chunk.
+	 */
+	.p2align 4
+.Lcopy_backward_loop:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r8
+	movq -2*8(%rsi), %r9
+	movq -3*8(%rsi), %r10
+	movq -4*8(%rsi), %r11
+	leaq -4*8(%rsi), %rsi
+	movq %r8, -1*8(%rdi)
+	movq %r9, -2*8(%rdi)
+	movq %r10, -3*8(%rdi)
+	movq %r11, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae .Lcopy_backward_loop
+
+	/*
+	 * Calculate the copy position at the head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
 .Lhandle_tail:
-	movl %edx, %ecx
-	andl $63, %ecx
-	shrl $3, %ecx
-	jz .Lhandle_7
+	cmpq $16, %rdx
+	jb .Lless_16bytes
+
+	/*
+	 * Move 16 to 31 bytes of data.
+	 */
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq -2*8(%rsi, %rdx), %r10
+	movq -1*8(%rsi, %rdx), %r11
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, -2*8(%rdi, %rdx)
+	movq %r11, -1*8(%rdi, %rdx)
+	retq
 	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi), %r8
-	movq %r8, (%rdi)
-	leaq 8(%rdi), %rdi
-	leaq 8(%rsi), %rsi
-	jnz .Lloop_8
-
-.Lhandle_7:
-	movl %edx, %ecx
-	andl $7, %ecx
-	jz .Lend
+.Lless_16bytes:
+	cmpq $8, %rdx
+	jb .Lless_8bytes
+	/*
+	 * Move 8 to 15 bytes of data.
+	 */
+	movq 0*8(%rsi), %r8
+	movq -1*8(%rsi, %rdx), %r9
+	movq %r8, 0*8(%rdi)
+	movq %r9, -1*8(%rdi, %rdx)
+	retq
+	.p2align 4
+.Lless_8bytes:
+	cmpq $4, %rdx
+	jb .Lless_3bytes
+
+	/*
+	 * Move 4 to 7 bytes of data.
+	 */
+	movl (%rsi), %ecx
+	movl -4(%rsi, %rdx), %r8d
+	movl %ecx, (%rdi)
+	movl %r8d, -4(%rdi, %rdx)
+	retq
 	.p2align 4
+.Lless_3bytes:
+	cmpl $0, %edx
+	je .Lend
+	/*
+	 * Move 1 to 3 bytes of data.
+	 */
 .Lloop_1:
 	movb (%rsi), %r8b
 	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
-	decl %ecx
+	decl %edx
 	jnz .Lloop_1
 
 .Lend:
-	ret
+	retq
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
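
For orientation, the control flow the patch introduces can be sketched in C. This is an illustrative sketch only, not the kernel's implementation: the function name memcpy_sketch and all variable names are invented, the inner byte loops stand in for the 4x8-byte movq sequences, and the tail is reduced to a plain byte copy (the real tail paths are sketched further below). The low-byte comparison mirrors "cmp %dil, %sil; jl .Lcopy_backward".

#include <stddef.h>
#include <stdint.h>

static void *memcpy_sketch(void *dest, const void *src, size_t len)
{
	uint8_t *d = dest;
	const uint8_t *s = src;
	size_t blocks = len / 32;	/* full 32-byte blocks */
	size_t tail = len % 32;		/* 0..31 leftover bytes */

	if (len < 32)
		goto handle_tail;	/* "jb .Lhandle_tail" */

	if ((int8_t)(uintptr_t)s < (int8_t)(uintptr_t)d) {
		/*
		 * "jl .Lcopy_backward": a forward copy's stores could land
		 * just ahead of later loads and stall them on a false
		 * store-to-load dependence, so copy blocks from the end.
		 */
		const uint8_t *se = s + len;
		uint8_t *de = d + len;
		while (blocks--) {
			se -= 32;
			de -= 32;
			for (int i = 0; i < 32; i++)	/* 4x8-byte moves */
				de[i] = se[i];
		}
		/* the remaining bytes are now the first 'tail' bytes;
		 * d and s already point at the head */
	} else {
		/* ".Lcopy_forward_loop" */
		while (blocks--) {
			for (int i = 0; i < 32; i++)
				d[i] = s[i];
			d += 32;
			s += 32;
		}
	}
handle_tail:
	for (size_t i = 0; i < tail; i++)
		d[i] = s[i];
	return dest;
}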
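
The loop control in both block loops relies on a pre-biased count: %rdx is reduced by 0x20 before the loop is entered, the subq at the top of each iteration both counts down and sets the carry flag, and jae keeps looping until the first borrow; the final addq $0x20 then restores the 0-to-31-byte remainder for .Lhandle_tail. An equivalent C sketch with the borrow made explicit (copy_block is a stand-in for the 4x8-byte move sequence; requires len >= 32, as the jb guard ensures):

#include <stddef.h>

static size_t biased_loop(size_t len, void (*copy_block)(void))
{
	size_t rdx = len - 32;		/* subl $0x20, %edx (pre-bias) */
	int borrow;

	do {
		borrow = rdx < 32;	/* the CF that "subq $0x20, %rdx" sets */
		rdx -= 32;		/* wraps on the last iteration */
		copy_block();		/* the block is copied even then */
	} while (!borrow);		/* jae .Lcopy_forward_loop */

	return rdx + 32;		/* addq $0x20, %rdx: tail length */
}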
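
All three larger tail paths use the same overlapping-window trick: for a length known to lie in [N, 2N-1], load the first N bytes and the last N bytes, then store both windows; they overlap in the middle, so every byte is covered with no loop and no exact length computation. A sketch of the 8-to-15-byte case (.Lless_16bytes), with invented names:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void copy_8_to_15(uint8_t *d, const uint8_t *s, size_t len)
{
	uint64_t head, tail;

	memcpy(&head, s, 8);		/* movq 0*8(%rsi), %r8       */
	memcpy(&tail, s + len - 8, 8);	/* movq -1*8(%rsi,%rdx), %r9 */

	memcpy(d, &head, 8);		/* movq %r8, 0*8(%rdi)       */
	memcpy(d + len - 8, &tail, 8);	/* movq %r9, -1*8(%rdi,%rdx) */
	/* the two stored windows overlap whenever len < 16, so bytes
	 * in the middle are written once or twice but never missed */
}

The 16-to-31-byte path applies the same pattern with two pairs of 8-byte windows, and the 4-to-7-byte path with 4-byte movl windows; only lengths 1 to 3 fall through to the byte loop at .Lloop_1.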