/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
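	/*
	 * Copy rdx/8 qwords with REP MOVSQ, then the remaining rdx%8
	 * bytes with REP MOVSB.  rax is loaded up front so the original
	 * destination is returned, as memcpy requires.
	 */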
	movq %rdi, %rax

	movl %edx, %ecx
	shrl $3, %ecx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
.Lmemcpy_e:
	.previous

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq %rdi, %rax

	/*
	 * Use 32bit CMP here to avoid long NOP padding.
	 */
	cmp $0x20, %edx
	jb .Lhandle_tail
	/*
	 * We check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
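	/*
	 * Only the low byte of each pointer is compared: a cheap
	 * heuristic for "source below destination", where a forward
	 * copy's loads could alias stores just issued.
	 */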
	cmp %dil, %sil
	jl .Lcopy_backward
	subl $0x20, %edx
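	/*
	 * The count is biased down by 32 so the subq at the top of the
	 * loop both advances the count and sets the flags; mov and lea
	 * leave the flags untouched, so the jae at the bottom still
	 * tests that same subtraction.
	 */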
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi

	jae .Lcopy_forward_loop
	addq $0x20, %rdx
	jmp .Lhandle_tail
.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
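	/*
	 * rsi/rdi now point one past the end of their buffers, and the
	 * count carries the same -32 bias as the forward path.
	 */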
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi

	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi

	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
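	/*
	 * rsi/rdi are rewound to the start of the buffers; the uncopied
	 * remainder (< 32 bytes) at the front falls through to the
	 * common tail handling below.
	 */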
.Lhandle_tail:
	cmpq $16, %rdx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
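	/*
	 * Both the leading and the trailing 16 bytes are loaded before
	 * any store, so the two halves may safely overlap in the middle.
	 */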
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq

	.p2align 4
.Lless_16bytes:
	cmpq $8, %rdx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq

	.p2align 4
.Lless_8bytes:
	cmpq $4, %rdx
	jb .Lless_3bytes
	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq

	.p2align 4
.Lless_3bytes:
	cmpl $0, %edx
	je .Lend
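	/*
	 * Zero-length copies have already branched to .Lend; at most
	 * three bytes remain here.
	 */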
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %edx
	jnz .Lloop_1

.Lend:
	retq
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler. Use this when possible:
	 */

	.section .altinstructions, "a"
	.align 8
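	/*
	 * One struct alt_instr record: address to be patched,
	 * replacement address, required CPU feature bit, plus the two
	 * length bytes emitted below.
	 */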
	.quad memcpy
	.quad .Lmemcpy_c
	.word X86_FEATURE_REP_GOOD

	/*
	 * Replace only the beginning: memcpy is used to apply the
	 * alternatives themselves, so it would be silly to overwrite
	 * the rest of it with NOPs - a reboot would be the only
	 * outcome...
	 */
	.byte .Lmemcpy_e - .Lmemcpy_c
	.byte .Lmemcpy_e - .Lmemcpy_c
	.previous