@@ -40,84 +40,132 @@
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
+	movq %rdi, %rax
 
 	/*
-	 * Put the number of full 64-byte blocks into %ecx.
-	 * Tail portion is handled at the end:
+	 * Use a 32-bit CMP here to avoid long NOP padding.
 	 */
-	movq %rdi, %rax
-	movl %edx, %ecx
-	shrl $6, %ecx
-	jz .Lhandle_tail
+	cmp $0x20, %edx
+	jb .Lhandle_tail
 
-	.p2align 4
-.Lloop_64:
 	/*
-	 * We decrement the loop index here - and the zero-flag is
-	 * checked at the end of the loop (instructions inbetween do
-	 * not change the zero flag):
+	 * We check whether a memory false dependence could occur,
+	 * then jump to the corresponding copy mode.
 	 */
-	decl %ecx
+	cmp %dil, %sil
+	jl .Lcopy_backward
+	subl $0x20, %edx
+.Lcopy_forward_loop:
+	subq $0x20, %rdx
 
 	/*
-	 * Move in blocks of 4x16 bytes:
+	 * Move in blocks of 4x8 bytes:
 	 */
-	movq 0*8(%rsi), %r11
-	movq 1*8(%rsi), %r8
-	movq %r11, 0*8(%rdi)
-	movq %r8, 1*8(%rdi)
-
-	movq 2*8(%rsi), %r9
-	movq 3*8(%rsi), %r10
-	movq %r9, 2*8(%rdi)
-	movq %r10, 3*8(%rdi)
-
-	movq 4*8(%rsi), %r11
-	movq 5*8(%rsi), %r8
-	movq %r11, 4*8(%rdi)
-	movq %r8, 5*8(%rdi)
-
-	movq 6*8(%rsi), %r9
-	movq 7*8(%rsi), %r10
-	movq %r9, 6*8(%rdi)
-	movq %r10, 7*8(%rdi)
-
-	leaq 64(%rsi), %rsi
-	leaq 64(%rdi), %rdi
-
-	jnz .Lloop_64
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq 2*8(%rsi), %r10
+	movq 3*8(%rsi), %r11
+	leaq 4*8(%rsi), %rsi
+
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, 2*8(%rdi)
+	movq %r11, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae .Lcopy_forward_loop
+	addq $0x20, %rdx
+	jmp .Lhandle_tail
+
+.Lcopy_backward:
+	/*
+	 * Calculate the copy position at the tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * At most 3 ALU operations can issue in one cycle,
+	 * so append NOPs within the same 16-byte chunk.
+	 */
+	.p2align 4
+.Lcopy_backward_loop:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r8
+	movq -2*8(%rsi), %r9
+	movq -3*8(%rsi), %r10
+	movq -4*8(%rsi), %r11
+	leaq -4*8(%rsi), %rsi
+	movq %r8, -1*8(%rdi)
+	movq %r9, -2*8(%rdi)
+	movq %r10, -3*8(%rdi)
+	movq %r11, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae .Lcopy_backward_loop
+
+	/*
+	 * Calculate the copy position at the head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
 .Lhandle_tail:
-	movl %edx, %ecx
-	andl $63, %ecx
-	shrl $3, %ecx
-	jz .Lhandle_7
+	cmpq $16, %rdx
+	jb .Lless_16bytes
+
+	/*
+	 * Move 16 to 31 bytes of data.
+	 */
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq -2*8(%rsi, %rdx), %r10
+	movq -1*8(%rsi, %rdx), %r11
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, -2*8(%rdi, %rdx)
+	movq %r11, -1*8(%rdi, %rdx)
+	retq
 	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi), %r8
-	movq %r8, (%rdi)
-	leaq 8(%rdi), %rdi
-	leaq 8(%rsi), %rsi
-	jnz .Lloop_8
-
-.Lhandle_7:
-	movl %edx, %ecx
-	andl $7, %ecx
-	jz .Lend
+.Lless_16bytes:
+	cmpq $8, %rdx
+	jb .Lless_8bytes
+	/*
+	 * Move 8 to 15 bytes of data.
+	 */
+	movq 0*8(%rsi), %r8
+	movq -1*8(%rsi, %rdx), %r9
+	movq %r8, 0*8(%rdi)
+	movq %r9, -1*8(%rdi, %rdx)
+	retq
+	.p2align 4
+.Lless_8bytes:
+	cmpq $4, %rdx
+	jb .Lless_3bytes
+
+	/*
+	 * Move 4 to 7 bytes of data.
+	 */
+	movl (%rsi), %ecx
+	movl -4(%rsi, %rdx), %r8d
+	movl %ecx, (%rdi)
+	movl %r8d, -4(%rdi, %rdx)
+	retq
 	.p2align 4
+.Lless_3bytes:
+	cmpl $0, %edx
+	je .Lend
+	/*
+	 * Move 1 to 3 bytes of data.
+	 */
 .Lloop_1:
 	movb (%rsi), %r8b
 	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
-	decl %ecx
+	decl %edx
 	jnz .Lloop_1
 
 .Lend:
-	ret
+	retq
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
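
For orientation, the control flow the patch introduces can be sketched in C. This is an illustrative sketch only, not the kernel's implementation: the function name memcpy_sketch and all variable names are invented, the inner byte loops stand in for the 4x8-byte movq sequences, and the tail is reduced to a plain byte copy (the real tail paths are sketched further below). The low-byte comparison mirrors "cmp %dil, %sil; jl .Lcopy_backward".

#include <stddef.h>
#include <stdint.h>

static void *memcpy_sketch(void *dest, const void *src, size_t len)
{
	uint8_t *d = dest;
	const uint8_t *s = src;
	size_t blocks = len / 32;	/* full 32-byte blocks */
	size_t tail = len % 32;		/* 0..31 leftover bytes */

	if (len < 32)
		goto handle_tail;	/* "jb .Lhandle_tail" */

	if ((int8_t)(uintptr_t)s < (int8_t)(uintptr_t)d) {
		/*
		 * "jl .Lcopy_backward": a forward copy's stores could land
		 * just ahead of later loads and stall them on a false
		 * store-to-load dependence, so copy blocks from the end.
		 */
		const uint8_t *se = s + len;
		uint8_t *de = d + len;
		while (blocks--) {
			se -= 32;
			de -= 32;
			for (int i = 0; i < 32; i++)	/* 4x8-byte moves */
				de[i] = se[i];
		}
		/* the remaining bytes are now the first 'tail' bytes;
		 * d and s already point at the head */
	} else {
		/* ".Lcopy_forward_loop" */
		while (blocks--) {
			for (int i = 0; i < 32; i++)
				d[i] = s[i];
			d += 32;
			s += 32;
		}
	}
handle_tail:
	for (size_t i = 0; i < tail; i++)
		d[i] = s[i];
	return dest;
}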
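
The loop control in both block loops relies on a pre-biased count: %rdx is reduced by 0x20 before the loop is entered, the subq at the top of each iteration both counts down and sets the carry flag, and jae keeps looping until the first borrow; the final addq $0x20 then restores the 0-to-31-byte remainder for .Lhandle_tail. An equivalent C sketch with the borrow made explicit (copy_block is a stand-in for the 4x8-byte move sequence; requires len >= 32, as the jb guard ensures):

#include <stddef.h>

static size_t biased_loop(size_t len, void (*copy_block)(void))
{
	size_t rdx = len - 32;		/* subl $0x20, %edx (pre-bias) */
	int borrow;

	do {
		borrow = rdx < 32;	/* the CF that "subq $0x20, %rdx" sets */
		rdx -= 32;		/* wraps on the last iteration */
		copy_block();		/* the block is copied even then */
	} while (!borrow);		/* jae .Lcopy_forward_loop */

	return rdx + 32;		/* addq $0x20, %rdx: tail length */
}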
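
All three larger tail paths use the same overlapping-window trick: for a length known to lie in [N, 2N-1], load the first N bytes and the last N bytes, then store both windows; they overlap in the middle, so every byte is covered with no loop and no exact length computation. A sketch of the 8-to-15-byte case (.Lless_16bytes), with invented names:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void copy_8_to_15(uint8_t *d, const uint8_t *s, size_t len)
{
	uint64_t head, tail;

	memcpy(&head, s, 8);		/* movq 0*8(%rsi), %r8       */
	memcpy(&tail, s + len - 8, 8);	/* movq -1*8(%rsi,%rdx), %r9 */

	memcpy(d, &head, 8);		/* movq %r8, 0*8(%rdi)       */
	memcpy(d + len - 8, &tail, 8);	/* movq %r9, -1*8(%rdi,%rdx) */
	/* the two stored windows overlap whenever len < 16, so bytes
	 * in the middle are written once or twice but never missed */
}

The 16-to-31-byte path applies the same pattern with two pairs of 8-byte windows, and the 4-to-7-byte path with 4-byte movl windows; only lengths 1 to 3 fall through to the byte loop at .Lloop_1.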