memcpy_64.S

/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
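
/*
 * memcpy for 64-bit PowerPC.
 *
 * Arguments follow the standard PPC64 ELF ABI:
 *   r3 = destination, r4 = source, r5 = number of bytes to copy.
 */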
	.align	7
_GLOBAL(memcpy)
	mtcrf	0x01,r5		# low 4 bits of length -> cr7
	cmpldi	cr1,r5,16	# fewer than 16 bytes?
	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
	andi.	r6,r6,7
	dcbt	0,r4		# prefetch first source cache line
	blt	cr1,.Lshort_copy
	bne	.Ldst_unaligned
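
# Destination is 8-byte aligned.  If the source is also 8-byte aligned,
# copy 16 bytes per loop iteration with doubleword loads and stores.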
.Ldst_aligned:
	andi.	r0,r4,7
	addi	r3,r3,-16
	bne	.Lsrc_unaligned
	srdi	r7,r5,4
	ld	r9,0(r4)
	addi	r4,r4,-8
	mtctr	r7
	andi.	r5,r5,7
	bf	cr7*4+0,2f
	addi	r3,r3,8
	addi	r4,r4,8
	mr	r8,r9
	blt	cr1,3f
1:	ld	r9,8(r4)
	std	r8,8(r3)
2:	ldu	r8,16(r4)
	stdu	r9,16(r3)
	bdnz	1b
3:	std	r8,8(r3)
	beqlr
	addi	r3,r3,16
	ld	r9,8(r4)
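
# Store the remaining tail bytes held in r9; cr7 bits select 4-, 2-
# and 1-byte stores.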
.Ldo_tail:
	bf	cr7*4+1,1f
	rotldi	r9,r9,32
	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
	rotldi	r9,r9,16
	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
	rotldi	r9,r9,8
	stb	r9,0(r3)
3:	blr
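
# The destination is now 8-byte aligned but the source is not: round the
# source down to an 8-byte boundary, then merge each pair of adjacent
# doublewords with sld/srd (shift counts in r10 and r11) before storing.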
.Lsrc_unaligned:
	srdi	r6,r5,3
	addi	r5,r5,-16
	subf	r4,r0,r4
	srdi	r7,r5,4
	sldi	r10,r0,3
	cmpdi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64
	add	r5,r5,r0

	bt	cr7*4+0,0f

	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
	ld	r0,8(r4)
	sld	r6,r9,r10
	ldu	r9,16(r4)
	srd	r7,r0,r11
	sld	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,4f
	ld	r0,8(r4)
	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
	b	2f

0:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
	ldu	r9,8(r4)
	sld	r8,r0,r10
	addi	r3,r3,-8
	blt	cr6,5f
	ld	r0,8(r4)
	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	srd	r7,r0,r11
	sld	r8,r0,r10
	addi	r3,r3,16
	beq	cr6,3f

	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1:	or	r7,r7,r6
	ld	r0,8(r4)
	std	r12,8(r3)
2:	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	stdu	r7,16(r3)
	srd	r7,r0,r11
	sld	r8,r0,r10
	bdnz	1b

3:	std	r12,8(r3)
	or	r7,r7,r6
4:	std	r7,16(r3)
5:	srd	r12,r9,r11
	or	r12,r8,r12
	std	r12,24(r3)
	beqlr
	cmpwi	cr1,r5,8
	addi	r3,r3,32
	sld	r9,r9,r10
	ble	cr1,.Ldo_tail
	ld	r0,8(r4)
	srd	r7,r0,r11
	or	r9,r7,r9
	b	.Ldo_tail
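
# Destination is not 8-byte aligned: copy 1, 2 and/or 4 bytes (as
# selected by the alignment bits in cr7, loaded from r6 below) to bring
# it to an 8-byte boundary, then rejoin the aligned path.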
.Ldst_unaligned:
	mtcrf	0x01,r6		# put #bytes to 8B bdry into cr7
	subf	r5,r6,r5
	li	r7,0
	cmpldi	cr1,r5,16
	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r4
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
	lwzx	r0,r7,r4
	stwx	r0,r7,r3
3:	mtcrf	0x01,r5
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned
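
# Fewer than 16 bytes in total: copy 8, 4, 2 and 1 bytes as selected by
# the low bits of the length in cr7.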
.Lshort_copy:
	bf	cr7*4+0,1f
	lwz	r0,0(r4)
	lwz	r9,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
	lbz	r0,0(r4)
	stb	r0,0(r3)
4:	blr