/* memcpy_64.S */
/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
  11. .align 7
  12. _GLOBAL(memcpy)
  13. std r3,48(r1) /* save destination pointer for return value */
  14. PPC_MTOCRF 0x01,r5
  15. cmpldi cr1,r5,16
  16. neg r6,r3 # LS 3 bits = # bytes to 8-byte dest bdry
  17. andi. r6,r6,7
  18. dcbt 0,r4
  19. blt cr1,.Lshort_copy
  20. bne .Ldst_unaligned
  21. .Ldst_aligned:
  22. andi. r0,r4,7
  23. addi r3,r3,-16
  24. bne .Lsrc_unaligned
  25. srdi r7,r5,4
  26. ld r9,0(r4)
  27. addi r4,r4,-8
  28. mtctr r7
  29. andi. r5,r5,7
  30. bf cr7*4+0,2f
  31. addi r3,r3,8
  32. addi r4,r4,8
  33. mr r8,r9
  34. blt cr1,3f
  35. 1: ld r9,8(r4)
  36. std r8,8(r3)
  37. 2: ldu r8,16(r4)
  38. stdu r9,16(r3)
  39. bdnz 1b
  40. 3: std r8,8(r3)
  41. beq 3f
  42. addi r3,r3,16
  43. ld r9,8(r4)
  44. .Ldo_tail:
  45. bf cr7*4+1,1f
  46. rotldi r9,r9,32
  47. stw r9,0(r3)
  48. addi r3,r3,4
  49. 1: bf cr7*4+2,2f
  50. rotldi r9,r9,16
  51. sth r9,0(r3)
  52. addi r3,r3,2
  53. 2: bf cr7*4+3,3f
  54. rotldi r9,r9,8
  55. stb r9,0(r3)
  56. 3: ld r3,48(r1) /* return dest pointer */
  57. blr
  58. .Lsrc_unaligned:
  59. srdi r6,r5,3
  60. addi r5,r5,-16
  61. subf r4,r0,r4
  62. srdi r7,r5,4
  63. sldi r10,r0,3
  64. cmpdi cr6,r6,3
  65. andi. r5,r5,7
  66. mtctr r7
  67. subfic r11,r10,64
  68. add r5,r5,r0
  69. bt cr7*4+0,0f
  70. ld r9,0(r4) # 3+2n loads, 2+2n stores
  71. ld r0,8(r4)
  72. sld r6,r9,r10
  73. ldu r9,16(r4)
  74. srd r7,r0,r11
  75. sld r8,r0,r10
  76. or r7,r7,r6
  77. blt cr6,4f
  78. ld r0,8(r4)
  79. # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
  80. b 2f
  81. 0: ld r0,0(r4) # 4+2n loads, 3+2n stores
  82. ldu r9,8(r4)
  83. sld r8,r0,r10
  84. addi r3,r3,-8
  85. blt cr6,5f
  86. ld r0,8(r4)
  87. srd r12,r9,r11
  88. sld r6,r9,r10
  89. ldu r9,16(r4)
  90. or r12,r8,r12
  91. srd r7,r0,r11
  92. sld r8,r0,r10
  93. addi r3,r3,16
  94. beq cr6,3f
  95. # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
  96. 1: or r7,r7,r6
  97. ld r0,8(r4)
  98. std r12,8(r3)
  99. 2: srd r12,r9,r11
  100. sld r6,r9,r10
  101. ldu r9,16(r4)
  102. or r12,r8,r12
  103. stdu r7,16(r3)
  104. srd r7,r0,r11
  105. sld r8,r0,r10
  106. bdnz 1b
  107. 3: std r12,8(r3)
  108. or r7,r7,r6
  109. 4: std r7,16(r3)
  110. 5: srd r12,r9,r11
  111. or r12,r8,r12
  112. std r12,24(r3)
  113. beq 4f
  114. cmpwi cr1,r5,8
  115. addi r3,r3,32
  116. sld r9,r9,r10
  117. ble cr1,.Ldo_tail
  118. ld r0,8(r4)
  119. srd r7,r0,r11
  120. or r9,r7,r9
  121. b .Ldo_tail
  122. .Ldst_unaligned:
  123. PPC_MTOCRF 0x01,r6 # put #bytes to 8B bdry into cr7
  124. subf r5,r6,r5
  125. li r7,0
  126. cmpldi r1,r5,16
  127. bf cr7*4+3,1f
  128. lbz r0,0(r4)
  129. stb r0,0(r3)
  130. addi r7,r7,1
  131. 1: bf cr7*4+2,2f
  132. lhzx r0,r7,r4
  133. sthx r0,r7,r3
  134. addi r7,r7,2
  135. 2: bf cr7*4+1,3f
  136. lwzx r0,r7,r4
  137. stwx r0,r7,r3
  138. 3: PPC_MTOCRF 0x01,r5
  139. add r4,r6,r4
  140. add r3,r6,r3
  141. b .Ldst_aligned
  142. .Lshort_copy:
  143. bf cr7*4+0,1f
  144. lwz r0,0(r4)
  145. lwz r9,4(r4)
  146. addi r4,r4,8
  147. stw r0,0(r3)
  148. stw r9,4(r3)
  149. addi r3,r3,8
  150. 1: bf cr7*4+1,2f
  151. lwz r0,0(r4)
  152. addi r4,r4,4
  153. stw r0,0(r3)
  154. addi r3,r3,4
  155. 2: bf cr7*4+2,3f
  156. lhz r0,0(r4)
  157. addi r4,r4,2
  158. sth r0,0(r3)
  159. addi r3,r3,2
  160. 3: bf cr7*4+3,4f
  161. lbz r0,0(r4)
  162. stb r0,0(r3)
  163. 4: ld r3,48(r1) /* return dest pointer */
  164. blr