memcpy_64.S

/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *	rdi destination
 *	rsi source
 *	rdx count
 *
 * Output:
 *	rax original destination
 */
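
/*
 * Note: rdi/rsi/rdx and rax above are simply the x86-64 C calling
 * convention (first three arguments and the return value), so this
 * routine is directly callable from C as memcpy(dest, src, count).
 */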

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
        .section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
        movq %rdi, %rax

        movl %edx, %ecx
        shrl $3, %ecx
        andl $7, %edx
        rep movsq
        movl %edx, %ecx
        rep movsb
        ret
.Lmemcpy_e:
        .previous
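
/*
 * The replacement above boils down to (dest/src/count in %rdi/%rsi/%rdx):
 *
 *	save the original destination in %rax (the return value),
 *	copy count >> 3 quadwords with REP MOVSQ,
 *	then copy the remaining count & 7 bytes with REP MOVSB.
 */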

ENTRY(__memcpy)
ENTRY(memcpy)
        CFI_STARTPROC

        /*
         * Put the number of full 64-byte blocks into %ecx.
         * Tail portion is handled at the end:
         */
        movq %rdi, %rax
        movl %edx, %ecx
        shrl $6, %ecx
        jz .Lhandle_tail
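
        /*
         * Copies shorter than 64 bytes never enter the unrolled loop;
         * they are handled entirely by the tail code below.
         */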

        .p2align 4
.Lloop_64:
        /*
         * We decrement the loop index here - and the zero-flag is
         * checked at the end of the loop (the instructions in between do
         * not change the zero flag):
         */
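        /*
         * (The MOVQ and LEAQ instructions below leave the flags alone,
         * which is why the ZF set by DECL survives until the JNZ.)
         */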
        decl %ecx

        /*
         * Move in blocks of 4x16 bytes:
         */
        movq 0*8(%rsi), %r11
        movq 1*8(%rsi), %r8
        movq %r11, 0*8(%rdi)
        movq %r8,  1*8(%rdi)

        movq 2*8(%rsi), %r9
        movq 3*8(%rsi), %r10
        movq %r9,  2*8(%rdi)
        movq %r10, 3*8(%rdi)

        movq 4*8(%rsi), %r11
        movq 5*8(%rsi), %r8
        movq %r11, 4*8(%rdi)
        movq %r8,  5*8(%rdi)

        movq 6*8(%rsi), %r9
        movq 7*8(%rsi), %r10
        movq %r9,  6*8(%rdi)
        movq %r10, 7*8(%rdi)

        leaq 64(%rsi), %rsi
        leaq 64(%rdi), %rdi

        jnz .Lloop_64
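
        /*
         * Tail handling: at most 63 bytes remain (count & 63). Copy them
         * as up to seven quadwords (.Lloop_8) and then up to seven single
         * bytes (.Lloop_1):
         */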
.Lhandle_tail:
        movl %edx, %ecx
        andl $63, %ecx
        shrl $3, %ecx
        jz .Lhandle_7

        .p2align 4
.Lloop_8:
        decl %ecx
        movq (%rsi), %r8
        movq %r8, (%rdi)
        leaq 8(%rdi), %rdi
        leaq 8(%rsi), %rsi
        jnz .Lloop_8

.Lhandle_7:
        movl %edx, %ecx
        andl $7, %ecx
        jz .Lend

        .p2align 4
.Lloop_1:
        movb (%rsi), %r8b
        movb %r8b, (%rdi)
        incq %rdi
        incq %rsi
        decl %ecx
        jnz .Lloop_1

.Lend:
        ret
        CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

        /*
         * Some CPUs run faster using the string copy instructions.
         * It is also a lot simpler. Use this when possible:
         */
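
        /*
         * Each entry in the .altinstructions section below describes one
         * patch site: the address of the original code, the address of
         * the replacement, the CPU feature bit that enables the patch,
         * the length of the original region to patch, and the length of
         * the replacement (this lines up with struct alt_instr in the
         * kernel's alternatives code).
         */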
        .section .altinstructions, "a"
        .align 8
        .quad memcpy
        .quad .Lmemcpy_c
        .byte X86_FEATURE_REP_GOOD
        /*
         * Replace only the beginning: memcpy is itself used while
         * applying the alternatives, so padding the rest of it with
         * NOPs would be silly - a reboot would be the only outcome...
         */
        .byte .Lmemcpy_e - .Lmemcpy_c   /* length of original region to patch */
        .byte .Lmemcpy_e - .Lmemcpy_c   /* length of the replacement */
        .previous