memcpy_64.S

/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *	rdi destination
 *	rsi source
 *	rdx count
 *
 * Output:
 *	rax original destination
 */
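
/*
 * For reference, the C-level contract this implements is the standard
 * memcpy prototype; under the x86-64 calling convention dest arrives in
 * %rdi, src in %rsi, count in %rdx, and the return value goes in %rax:
 *
 *	void *memcpy(void *dest, const void *src, size_t count);
 */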

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * Calls to this get patched into the kernel image via the
 * alternative instructions framework:
 */
	ALIGN
memcpy_c:
	CFI_STARTPROC
	movq %rdi, %rax

	movl %edx, %ecx
	shrl $3, %ecx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb

	ret
	CFI_ENDPROC
ENDPROC(memcpy_c)
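
/*
 * A rough C sketch of what memcpy_c does (illustrative only, not part
 * of the build, and the name is made up; the real loops are executed by
 * the two REP string instructions):
 *
 *	void *memcpy_c_sketch(void *dest, const void *src, unsigned long count)
 *	{
 *		unsigned long *dq = dest;
 *		const unsigned long *sq = src;
 *		unsigned long qwords = count >> 3;	// copied by rep movsq
 *		unsigned long bytes  = count & 7;	// copied by rep movsb
 *		unsigned char *db;
 *		const unsigned char *sb;
 *
 *		while (qwords--)
 *			*dq++ = *sq++;
 *		db = (unsigned char *)dq;
 *		sb = (const unsigned char *)sq;
 *		while (bytes--)
 *			*db++ = *sb++;
 *		return dest;
 *	}
 */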

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Put the number of full 64-byte blocks into %ecx.
	 * Tail portion is handled at the end:
	 */
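	/*
	 * For example, with count = 200: 200 >> 6 = 3 full 64-byte blocks
	 * go through .Lloop_64, and the remaining 200 & 63 = 8 bytes are
	 * handled in the tail (one 8-byte move in .Lloop_8, no single
	 * bytes left over for .Lloop_1).
	 */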
	movq %rdi, %rax

	movl %edx, %ecx
	shrl $6, %ecx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero-flag is
	 * checked at the end of the loop (instructions in between do
	 * not change the zero flag):
	 */
	decl %ecx

	/*
	 * Move in blocks of 4x16 bytes:
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r8
	movq %r11, 0*8(%rdi)
	movq %r8, 1*8(%rdi)

	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r10
	movq %r9, 2*8(%rdi)
	movq %r10, 3*8(%rdi)

	movq 4*8(%rsi), %r11
	movq 5*8(%rsi), %r8
	movq %r11, 4*8(%rdi)
	movq %r8, 5*8(%rdi)

	movq 6*8(%rsi), %r9
	movq 7*8(%rsi), %r10
	movq %r9, 6*8(%rdi)
	movq %r10, 7*8(%rdi)

	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi
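	/*
	 * Note: leaq (rather than addq) advances the pointers because lea
	 * does not modify the flags, so the ZF set by the decl above is
	 * still what the jnz below tests.
	 */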
	jnz .Lloop_64

.Lhandle_tail:
	movl %edx, %ecx
	andl $63, %ecx
	shrl $3, %ecx
	jz .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi), %r8
	movq %r8, (%rdi)
	leaq 8(%rdi), %rdi
	leaq 8(%rsi), %rsi
	jnz .Lloop_8

.Lhandle_7:
	movl %edx, %ecx
	andl $7, %ecx
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
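
/*
 * A rough C sketch of the open-coded copy above (illustrative only,
 * with a hypothetical name; alignment and aliasing are glossed over):
 *
 *	void *memcpy_sketch(void *dest, const void *src, unsigned long count)
 *	{
 *		unsigned char *d = dest;
 *		const unsigned char *s = src;
 *		unsigned long i, j;
 *
 *		for (i = count >> 6; i; i--) {		// .Lloop_64
 *			for (j = 0; j < 8; j++)		// eight 8-byte moves
 *				((unsigned long *)d)[j] = ((const unsigned long *)s)[j];
 *			d += 64; s += 64;
 *		}
 *		for (i = (count & 63) >> 3; i; i--) {	// .Lloop_8
 *			*(unsigned long *)d = *(const unsigned long *)s;
 *			d += 8; s += 8;
 *		}
 *		for (i = count & 7; i; i--)		// .Lloop_1
 *			*d++ = *s++;
 *		return dest;
 *	}
 */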

/*
 * Some CPUs run faster using the string copy instructions.
 * It is also a lot simpler. Use this when possible:
 */
	.section .altinstr_replacement, "ax"
1:	.byte 0xeb				/* jmp <disp8> */
	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
2:
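	/*
	 * 0xeb is a two-byte short jump whose 8-bit displacement is counted
	 * from the end of the jump.  Once these bytes are patched in at
	 * memcpy, the jump ends at memcpy + (2b - 1b), so the displacement
	 * needed to reach memcpy_c is (memcpy_c - memcpy) - (2b - 1b).
	 */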
	.previous

	.section .altinstructions, "a"
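	/*
	 * One alternatives record; the fields below appear to mirror the
	 * kernel's struct alt_instr: original instruction address,
	 * replacement address, required CPU feature bit, length of the
	 * original to patch, and length of the replacement.
	 */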
	.align 8
	.quad memcpy
	.quad 1b
	.byte X86_FEATURE_REP_GOOD

	/*
	 * Replace only beginning, memcpy is used to apply alternatives,
	 * so it is silly to overwrite itself with nops - reboot is the
	 * only outcome...
	 */
	.byte 2b - 1b
	.byte 2b - 1b
	.previous