@@ -1,30 +1,38 @@
 /* Copyright 2002 Andi Kleen */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
+
 #include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
 
 /*
  * memcpy - Copy a memory block.
  *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
+ * Input:
+ *	rdi destination
+ *	rsi source
+ *	rdx count
+ *
  * Output:
  * rax original destination
- */
+ */
+
+/*
+ * memcpy_c() - fast string ops (REP MOVSQ) based variant.
+ *
+ * Calls to this get patched into the kernel image via the
+ * alternative instructions framework:
+ */
 	ALIGN
 memcpy_c:
 	CFI_STARTPROC
-	movq %rdi,%rax
-	movl %edx,%ecx
-	shrl $3,%ecx
-	andl $7,%edx
+	movq %rdi, %rax
+
+	movl %edx, %ecx
+	shrl $3, %ecx
+	andl $7, %edx
 	rep movsq
-	movl %edx,%ecx
+	movl %edx, %ecx
 	rep movsb
 	ret
 	CFI_ENDPROC
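
[ Worked example, for illustration: with a byte count of 100 in %rdx,
  "shrl $3" leaves %ecx = 12, so REP MOVSQ copies twelve quadwords
  (96 bytes), and "andl $7" leaves %edx = 4, so the trailing REP MOVSB
  copies the remaining 4 bytes. ]
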
@@ -33,92 +41,110 @@ ENDPROC(memcpy_c)
 
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
-	movq %rdi,%rax
 
-	movl %edx,%ecx
-	shrl $6,%ecx
+	/*
+	 * Put the number of full 64-byte blocks into %ecx.
+	 * Tail portion is handled at the end:
+	 */
+	movq %rdi, %rax
+	movl %edx, %ecx
+	shrl $6, %ecx
 	jz .Lhandle_tail
 
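[ For illustration: with a count of 200, "shrl $6" yields %ecx = 3, so
  .Lloop_64 below copies 3 x 64 = 192 bytes; the remaining 8 bytes stay
  in the low 6 bits of %edx for .Lhandle_tail. ]
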
 	.p2align 4
 .Lloop_64:
+	/*
+	 * We decrement the loop index here - and the zero-flag is
+	 * checked at the end of the loop (instructions in between do
+	 * not change the zero flag):
+	 */
 	decl %ecx
 
-	movq (%rsi),%r11
-	movq 8(%rsi),%r8
+	/*
+	 * Move in blocks of 4x16 bytes:
+	 */
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r8
+	movq %r11, 0*8(%rdi)
+	movq %r8, 1*8(%rdi)
 
-	movq %r11,(%rdi)
-	movq %r8,1*8(%rdi)
+	movq 2*8(%rsi), %r9
+	movq 3*8(%rsi), %r10
+	movq %r9, 2*8(%rdi)
+	movq %r10, 3*8(%rdi)
 
-	movq 2*8(%rsi),%r9
-	movq 3*8(%rsi),%r10
+	movq 4*8(%rsi), %r11
+	movq 5*8(%rsi), %r8
+	movq %r11, 4*8(%rdi)
+	movq %r8, 5*8(%rdi)
 
-	movq %r9,2*8(%rdi)
-	movq %r10,3*8(%rdi)
+	movq 6*8(%rsi), %r9
+	movq 7*8(%rsi), %r10
+	movq %r9, 6*8(%rdi)
+	movq %r10, 7*8(%rdi)
 
-	movq 4*8(%rsi),%r11
-	movq 5*8(%rsi),%r8
+	leaq 64(%rsi), %rsi
+	leaq 64(%rdi), %rdi
 
-	movq %r11,4*8(%rdi)
-	movq %r8,5*8(%rdi)
-
-	movq 6*8(%rsi),%r9
-	movq 7*8(%rsi),%r10
-
-	movq %r9,6*8(%rdi)
-	movq %r10,7*8(%rdi)
-
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
 	jnz .Lloop_64
 
 .Lhandle_tail:
-	movl %edx,%ecx
-	andl $63,%ecx
-	shrl $3,%ecx
+	movl %edx, %ecx
+	andl $63, %ecx
+	shrl $3, %ecx
 	jz .Lhandle_7
+
 	.p2align 4
 .Lloop_8:
 	decl %ecx
-	movq (%rsi),%r8
-	movq %r8,(%rdi)
-	leaq 8(%rdi),%rdi
-	leaq 8(%rsi),%rsi
+	movq (%rsi), %r8
+	movq %r8, (%rdi)
+	leaq 8(%rdi), %rdi
+	leaq 8(%rsi), %rsi
 	jnz .Lloop_8
 
 .Lhandle_7:
-	movl %edx,%ecx
-	andl $7,%ecx
-	jz .Lende
+	movl %edx, %ecx
+	andl $7, %ecx
+	jz .Lend
+
 	.p2align 4
 .Lloop_1:
-	movb (%rsi),%r8b
-	movb %r8b,(%rdi)
+	movb (%rsi), %r8b
+	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
 	decl %ecx
 	jnz .Lloop_1
 
-.Lende:
+.Lend:
 	ret
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
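[ For illustration: the tail path splits the low six bits of the count
  into qwords plus bytes - e.g. 29 leftover bytes are copied as three
  qwords by .Lloop_8 (29 >> 3 = 3) and five single bytes by .Lloop_1
  (29 & 7 = 5). ]
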
-	/* Some CPUs run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
+	/*
+	 * Some CPUs run faster using the string copy instructions.
+	 * It is also a lot simpler. Use this when possible:
+	 */
 
-	.section .altinstr_replacement,"ax"
+	.section .altinstr_replacement, "ax"
 1:	.byte 0xeb				/* jmp <disp8> */
 	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
 2:
 	.previous
-	.section .altinstructions,"a"
+
+	.section .altinstructions, "a"
 	.align 8
 	.quad memcpy
 	.quad 1b
 	.byte X86_FEATURE_REP_GOOD
-	/* Replace only beginning, memcpy is used to apply alternatives, so it
-	 * is silly to overwrite itself with nops - reboot is only outcome... */
+
+	/*
+	 * Replace only beginning, memcpy is used to apply alternatives,
+	 * so it is silly to overwrite itself with nops - reboot is the
+	 * only outcome...
+	 */
 	.byte 2b - 1b
 	.byte 2b - 1b
 	.previous
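
[ A minimal sketch of the net effect, assuming a CPU with REP_GOOD set:
  the alternatives patcher copies the two replacement bytes over the
  start of memcpy, so the patched entry effectively begins with

	memcpy:
		jmp memcpy_c	/* two-byte jmp <disp8> */

  The disp8 operand "(memcpy_c - memcpy) - (2f - 1b)" is the distance
  minus the size of the jmp itself, which is exactly what a relative
  jump placed at memcpy needs in order to land on memcpy_c. The two
  ".byte 2b - 1b" fields give the patched-over and replacement lengths,
  both 2 bytes. ]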