/* memcpy_64.S */
/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>

/*
 * void *memcpy(void *to, void *from, size_t n)
 *
 * PPC64 ABI: r3 = dest, r4 = src, r5 = length in bytes.
 * The original dest pointer is saved on entry and reloaded into r3
 * before every blr, so memcpy returns dest as C callers expect.
 *
 * Strategy: get the destination 8-byte aligned (.Ldst_unaligned),
 * then move 16 bytes per loop iteration.  If the source is not also
 * 8-byte aligned (and the CPU lacks fast unaligned ld/std), use the
 * shift-and-merge loop at .Lsrc_unaligned.  Copies shorter than 16
 * bytes go straight to .Lshort_copy.
 *
 * cr7 is loaded (via PPC_MTOCRF 0x01) with the low 4 bits of a byte
 * count; bits cr7*4+0..3 then mean "8/4/2/1 bytes remain" and drive
 * the tail/alignment code.
 */
	.align	7
_GLOBAL(memcpy)
	std	r3,48(r1)	/* save destination pointer for return value */
	PPC_MTOCRF 0x01,r5	/* low 4 bits of len -> cr7 for the tail code */
	cmpldi	cr1,r5,16	/* cr1 = (len < 16)?  tested here and in loops */
	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
	andi.	r6,r6,7
	dcbt	0,r4		/* prefetch the first source cache line */
	blt	cr1,.Lshort_copy
/* Below we want to nop out the bne if we're on a CPU that has the
   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
   cleared.
   At the time of writing the only CPU that has this combination of bits
   set is Power6. */
BEGIN_FTR_SECTION
	nop			/* fast unaligned ld/std: skip dest alignment */
FTR_SECTION_ELSE
	bne	.Ldst_unaligned	/* cr0 still from andi.: dest not 8B aligned */
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
                    CPU_FTR_UNALIGNED_LD_STD)
.Ldst_aligned:
	addi	r3,r3,-16	/* bias dest for the stdu/ldu 16-byte stride */
BEGIN_FTR_SECTION
	andi.	r0,r4,7		/* r0 = src misalignment within 8 bytes */
	bne	.Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
	/* Aligned (or unaligned-capable) main path: 16 bytes/iteration,
	   software-pipelined so each store uses the previous load pair. */
	srdi	r7,r5,4		/* r7 = number of 16-byte chunks */
	ld	r9,0(r4)
	addi	r4,r4,-8	/* bias src to match ldu stride */
	mtctr	r7
	andi.	r5,r5,7		/* r5 = leftover bytes (<8) for .Ldo_tail */
	bf	cr7*4+0,2f	/* no odd 8-byte doubleword? enter loop at 2 */
	addi	r3,r3,8		/* odd doubleword: shift phase by 8 bytes */
	addi	r4,r4,8
	mr	r8,r9
	blt	cr1,3f		/* len < 16: only the one doubleword + tail */
1:	ld	r9,8(r4)
	std	r8,8(r3)
2:	ldu	r8,16(r4)
	stdu	r9,16(r3)
	bdnz	1b
3:	std	r8,8(r3)	/* drain the last pipelined doubleword */
	beq	3f		/* cr0 from andi. r5: no tail bytes left */
	addi	r3,r3,16
	ld	r9,8(r4)	/* fetch doubleword holding the tail bytes */
.Ldo_tail:
	/* Store the final 1-7 bytes from the top of r9; cr7 bits 1/2/3
	   say whether 4/2/1 bytes (respectively) are needed. */
	bf	cr7*4+1,1f
	rotldi	r9,r9,32
	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
	rotldi	r9,r9,16
	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
	rotldi	r9,r9,8
	stb	r9,0(r3)
3:	ld	r3,48(r1)	/* return dest pointer */
	blr

.Lsrc_unaligned:
	/* Dest is 8B aligned, src is not.  Read aligned doublewords and
	   merge adjacent pairs: r10 = 8*misalignment = left-shift count,
	   r11 = 64-r10 = right-shift count, so each output doubleword is
	   (prev << r10) | (next >> r11). */
	srdi	r6,r5,3		/* r6 = number of 8-byte doublewords */
	addi	r5,r5,-16
	subf	r4,r0,r4	/* round src down to 8-byte boundary */
	srdi	r7,r5,4		/* r7 = main-loop iteration count */
	sldi	r10,r0,3	/* r10 = misalignment in bits */
	cmpdi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64	/* r11 = 64 - r10 */
	add	r5,r5,r0	/* r5 = tail bytes incl. src misalignment */

	bt	cr7*4+0,0f	/* odd number of doublewords? take 0: prologue */

	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
	ld	r0,8(r4)
	sld	r6,r9,r10
	ldu	r9,16(r4)
	srd	r7,r0,r11
	sld	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,4f
	ld	r0,8(r4)
	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
	b	2f

0:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
	ldu	r9,8(r4)
	sld	r8,r0,r10
	addi	r3,r3,-8	/* odd-doubleword phase: back dest up by 8 */
	blt	cr6,5f
	ld	r0,8(r4)
	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	srd	r7,r0,r11
	sld	r8,r0,r10
	addi	r3,r3,16
	beq	cr6,3f

	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1:	or	r7,r7,r6
	ld	r0,8(r4)
	std	r12,8(r3)
2:	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	stdu	r7,16(r3)
	srd	r7,r0,r11
	sld	r8,r0,r10
	bdnz	1b

3:	std	r12,8(r3)	/* epilogue: flush the pipelined doublewords */
	or	r7,r7,r6
4:	std	r7,16(r3)
5:	srd	r12,r9,r11
	or	r12,r8,r12
	std	r12,24(r3)
	beq	4f		/* cr0 from andi. r5: no tail bytes */
	cmpwi	cr1,r5,8
	addi	r3,r3,32
	sld	r9,r9,r10	/* shift remaining bytes to the top of r9 */
	ble	cr1,.Ldo_tail
	ld	r0,8(r4)	/* tail spans one more source doubleword */
	srd	r7,r0,r11
	or	r9,r7,r9
	b	.Ldo_tail

.Ldst_unaligned:
	/* Copy 1-7 bytes so the destination becomes 8-byte aligned,
	   then fall back into the aligned path.  r6 = bytes to 8B bdry. */
	PPC_MTOCRF 0x01,r6	# put #bytes to 8B bdry into cr7
	subf	r5,r6,r5	/* shrink remaining length by alignment bytes */
	li	r7,0		/* r7 = running offset into src/dest */
	cmpldi	cr1,r5,16	/* recompute cr1 for the aligned path */
	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r4
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
	lwzx	r0,r7,r4
	stwx	r0,r7,r3
3:	PPC_MTOCRF 0x01,r5	/* reload cr7 with low bits of remaining len */
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned

.Lshort_copy:
	/* len < 16: copy 8/4/2/1 bytes as selected by cr7 bits 0..3. */
	bf	cr7*4+0,1f
	lwz	r0,0(r4)	/* copy 8 bytes as two words (no alignment
				   assumed on src or dest) */
	lwz	r9,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
	lbz	r0,0(r4)
	stb	r0,0(r3)
4:	ld	r3,48(r1)	/* return dest pointer */
	blr