memmove_64.S

/*
 * Normally compiler builtins are used, but sometimes the compiler emits
 * calls to out-of-line code. Based on asm-i386/string.h.
 *
 * This assembly file is rewritten from the memmove_64.c file.
 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
 */
#define _STRING_C
#include <linux/linkage.h>
#include <asm/dwarf2.h>

#undef memmove

/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 */
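
/*
 * Strategy (summary of the code below): copies of 32 bytes or more are
 * dispatched on the relative position of src and dst.  When src is not
 * below dst the copy runs forward; otherwise it runs backward from the
 * end so that overlapping source bytes are read before they are
 * overwritten.  Copies of 680 bytes or more whose pointers share the
 * same low address byte (and hence the same alignment) use rep movsq;
 * everything else uses a 32-byte unrolled load/store loop.  The final
 * 0-31 bytes are finished at label 1 with overlapping head/tail moves.
 */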
ENTRY(memmove)
	CFI_STARTPROC

	/* Handle copies of 32 bytes or more in the loops below */
	mov %rdi, %rax
	cmp $0x20, %rdx
	jb 1f

	/* Decide forward/backward copy mode */
	cmp %rdi, %rsi
	jb 2f
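
	/*
	 * Forward copy: with src at or above dst, every source byte is read
	 * before the corresponding destination byte can be overwritten, so
	 * copying in ascending order is safe even when the areas overlap.
	 */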
	/*
	 * The movsq instruction has a relatively high startup latency, so
	 * small sizes are handled with general-purpose registers instead.
	 */
	cmp $680, %rdx
	jb 3f
	/*
	 * movsq only pays off when src and dst are mutually aligned; the
	 * low bytes of the two pointers are compared as a cheap check.
	 */
	cmpb %dil, %sil
	je 4f
3:
	sub $0x20, %rdx
	/*
	 * We gobble 32 bytes forward in each loop iteration.
	 */
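	/*
	 * Loop control: %rdx was biased down by 0x20 above, and each
	 * iteration subtracts another 0x20 before copying, so the jae
	 * below keeps looping only while at least 32 unread bytes remain.
	 * The addq after the loop removes the bias, leaving the 0-31
	 * remaining bytes in %rdx for the tail code at label 1.
	 */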
5:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi

	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi

	jae 5b
	addq $0x20, %rdx
	jmp 1f
	/*
	 * Handle data forward by movsq.
	 */
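	/*
	 * %rcx receives the quadword count (%rdx >> 3); the last eight
	 * source bytes are staged in %r11 before rep movsq runs, so the
	 * final store through %r10 covers the %rdx & 7 tail bytes even if
	 * the string copy already overwrote that region of an overlapping
	 * source.
	 */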
	.p2align 4
4:
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11
	lea -8(%rdi, %rdx), %r10
	shrq $3, %rcx
	rep movsq
	movq %r11, (%r10)
	jmp 13f
	/*
	 * Handle data backward by movsq.
	 */
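	/*
	 * Mirror image of the forward movsq path: the first eight source
	 * bytes are staged in %r11, %rsi/%rdi are pointed at the last
	 * quadword, and rep movsq runs descending under the direction flag
	 * (std ... cld).  The staged quadword is stored last to cover the
	 * %rdx & 7 bytes at the head.
	 */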
	.p2align 4
7:
	movq %rdx, %rcx
	movq (%rsi), %r11
	movq %rdi, %r10
	leaq -8(%rsi, %rdx), %rsi
	leaq -8(%rdi, %rdx), %rdi
	shrq $3, %rcx
	std
	rep movsq
	cld
	movq %r11, (%r10)
	jmp 13f

	/*
	 * Prepare for the backward copy.
	 */
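	/*
	 * As in the forward case, large (>= 680 byte) copies with mutually
	 * aligned pointers are sent to the descending rep movsq at label 7;
	 * everything else falls through to the 32-byte backward loop.
	 */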
	.p2align 4
2:
	cmp $680, %rdx
	jb 6f
	cmp %dil, %sil
	je 7b
6:
	/*
	 * Point src and dst at the end of the buffers so the loop below
	 * can copy backward using negative offsets.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * We gobble 32 bytes backward in each loop iteration.
	 */
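	/*
	 * Same carry-flag termination as the forward loop at label 5: the
	 * count is biased down by 0x20 and restored after the loop.
	 */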
8:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi

	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi

	jae 8b
	/*
	 * Undo the 32-byte bias and point src and dst back at the start of
	 * the buffers; %rdx now holds the 0-31 uncopied head bytes.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
1:
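	/*
	 * At most 31 bytes remain at this point.  Each case below finishes
	 * the copy with a few loads and stores taken from both ends of the
	 * remaining region; the accesses may overlap in the middle, which
	 * avoids a byte-by-byte loop.
	 */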
	cmpq $16, %rdx
	jb 9f
	/*
	 * Move 16 to 31 bytes of data.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp 13f
	.p2align 4
9:
	cmpq $8, %rdx
	jb 10f
	/*
	 * Move 8 to 15 bytes of data.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp 13f
10:
	cmpq $4, %rdx
	jb 11f
	/*
	 * Move 4 to 7 bytes of data.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp 13f
11:
	cmp $2, %rdx
	jb 12f
	/*
	 * Move 2 to 3 bytes of data.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp 13f
12:
	cmp $1, %rdx
	jb 13f
	/*
	 * Move a single byte.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
13:
	retq
	CFI_ENDPROC
ENDPROC(memmove)