/* arch/x86/lib/memcpy_64.S */
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *	rdi	destination
 *	rsi	source
 *	rdx	count
 *
 * Output:
 *	rax	original destination
 */
  16. ALIGN
  17. memcpy_c:
  18. CFI_STARTPROC
  19. movq %rdi,%rax
  20. movl %edx,%ecx
  21. shrl $3,%ecx
  22. andl $7,%edx
  23. rep movsq
  24. movl %edx,%ecx
  25. rep movsb
  26. ret
  27. CFI_ENDPROC
  28. ENDPROC(memcpy_c)
  29. ENTRY(__memcpy)
  30. ENTRY(memcpy)
  31. CFI_STARTPROC
  32. pushq %rbx
  33. CFI_ADJUST_CFA_OFFSET 8
  34. CFI_REL_OFFSET rbx, 0
  35. movq %rdi,%rax
  36. movl %edx,%ecx
  37. shrl $6,%ecx
  38. jz .Lhandle_tail
  39. .p2align 4
  40. .Lloop_64:
  41. decl %ecx
  42. movq (%rsi),%r11
  43. movq 8(%rsi),%r8
  44. movq %r11,(%rdi)
  45. movq %r8,1*8(%rdi)
  46. movq 2*8(%rsi),%r9
  47. movq 3*8(%rsi),%r10
  48. movq %r9,2*8(%rdi)
  49. movq %r10,3*8(%rdi)
  50. movq 4*8(%rsi),%r11
  51. movq 5*8(%rsi),%r8
  52. movq %r11,4*8(%rdi)
  53. movq %r8,5*8(%rdi)
  54. movq 6*8(%rsi),%r9
  55. movq 7*8(%rsi),%r10
  56. movq %r9,6*8(%rdi)
  57. movq %r10,7*8(%rdi)
  58. leaq 64(%rsi),%rsi
  59. leaq 64(%rdi),%rdi
  60. jnz .Lloop_64
  61. .Lhandle_tail:
  62. movl %edx,%ecx
  63. andl $63,%ecx
  64. shrl $3,%ecx
  65. jz .Lhandle_7
  66. .p2align 4
  67. .Lloop_8:
  68. decl %ecx
  69. movq (%rsi),%r8
  70. movq %r8,(%rdi)
  71. leaq 8(%rdi),%rdi
  72. leaq 8(%rsi),%rsi
  73. jnz .Lloop_8
  74. .Lhandle_7:
  75. movl %edx,%ecx
  76. andl $7,%ecx
  77. jz .Lende
  78. .p2align 4
  79. .Lloop_1:
  80. movb (%rsi),%r8b
  81. movb %r8b,(%rdi)
  82. incq %rdi
  83. incq %rsi
  84. decl %ecx
  85. jnz .Lloop_1
  86. .Lende:
  87. popq %rbx
  88. CFI_ADJUST_CFA_OFFSET -8
  89. CFI_RESTORE rbx
  90. ret
  91. .Lfinal:
  92. CFI_ENDPROC
  93. ENDPROC(memcpy)
  94. ENDPROC(__memcpy)
  95. /* Some CPUs run faster using the string copy instructions.
  96. It is also a lot simpler. Use this when possible */
  97. .section .altinstr_replacement,"ax"
  98. 1: .byte 0xeb /* jmp <disp8> */
  99. .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
  100. 2:
  101. .previous
  102. .section .altinstructions,"a"
  103. .align 8
  104. .quad memcpy
  105. .quad 1b
  106. .byte X86_FEATURE_REP_GOOD
  107. /* Replace only beginning, memcpy is used to apply alternatives, so it
  108. * is silly to overwrite itself with nops - reboot is only outcome... */
  109. .byte 2b - 1b
  110. .byte 2b - 1b
  111. .previous