/*
 * arch/ppc64/lib/memcpy.S
 *
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
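
/*
 * memcpy(r3 = dest, r4 = src, r5 = len), arguments per the ppc64 ELF ABI.
 * mtcrf 0x01,r5 copies the low 4 bits of the length into cr7, so the
 * cr7*4+0..3 bits tested below select the 8-, 4-, 2- and 1-byte tail
 * moves; cr1 holds the "len < 16" test for the short-copy path, and
 * dcbt prefetches the first source cache line.
 */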
	.align	7
_GLOBAL(memcpy)
	mtcrf	0x01,r5
	cmpldi	cr1,r5,16
	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
	andi.	r6,r6,7
	dcbt	0,r4
	blt	cr1,.Lshort_copy
	bne	.Ldst_unaligned
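
/*
 * Destination is 8-byte aligned.  If the source is too, copy 16 bytes
 * per iteration with interleaved ld/std pairs; otherwise branch to the
 * shifted-merge path at .Lsrc_unaligned.
 */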
.Ldst_aligned:
	andi.	r0,r4,7
	addi	r3,r3,-16
	bne	.Lsrc_unaligned
	srdi	r7,r5,4
	ld	r9,0(r4)
	addi	r4,r4,-8
	mtctr	r7
	andi.	r5,r5,7
	bf	cr7*4+0,2f
	addi	r3,r3,8
	addi	r4,r4,8
	mr	r8,r9
	blt	cr1,3f
1:	ld	r9,8(r4)
	std	r8,8(r3)
2:	ldu	r8,16(r4)
	stdu	r9,16(r3)
	bdnz	1b
3:	std	r8,8(r3)
	beqlr
	addi	r3,r3,16
	ld	r9,8(r4)
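
/*
 * Store the final 1-7 bytes.  r9 holds the remaining data left-justified
 * (big-endian layout assumed here); the cr7 bits set from the length
 * select a word, halfword and/or byte store, rotating the next piece
 * into position before each one.
 */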
.Ldo_tail:
	bf	cr7*4+1,1f
	rotldi	r9,r9,32
	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
	rotldi	r9,r9,16
	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
	rotldi	r9,r9,8
	stb	r9,0(r3)
3:	blr
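
/*
 * Source not 8-byte aligned (destination is).  Round the source pointer
 * down to an 8-byte boundary, then build each aligned destination
 * doubleword by shifting two consecutive source doublewords with
 * r10 = 8 * misalignment and r11 = 64 - r10 and OR-ing the halves
 * together, 16 bytes per loop iteration.
 */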
.Lsrc_unaligned:
	srdi	r6,r5,3
	addi	r5,r5,-16
	subf	r4,r0,r4
	srdi	r7,r5,4
	sldi	r10,r0,3
	cmpdi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64
	add	r5,r5,r0
	bt	cr7*4+0,0f

	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
	ld	r0,8(r4)
	sld	r6,r9,r10
	ldu	r9,16(r4)
	srd	r7,r0,r11
	sld	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,4f
	ld	r0,8(r4)
	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
	b	2f

0:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
	ldu	r9,8(r4)
	sld	r8,r0,r10
	addi	r3,r3,-8
	blt	cr6,5f
	ld	r0,8(r4)
	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	srd	r7,r0,r11
	sld	r8,r0,r10
	addi	r3,r3,16
	beq	cr6,3f

	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1:	or	r7,r7,r6
	ld	r0,8(r4)
	std	r12,8(r3)
2:	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	stdu	r7,16(r3)
	srd	r7,r0,r11
	sld	r8,r0,r10
	bdnz	1b
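
	# Loop done: flush the doublewords still being assembled in r12, r7/r6 and r8/r9.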
3:	std	r12,8(r3)
	or	r7,r7,r6
4:	std	r7,16(r3)
5:	srd	r12,r9,r11
	or	r12,r8,r12
	std	r12,24(r3)
	beqlr
	cmpwi	cr1,r5,8
	addi	r3,r3,32
	sld	r9,r9,r10
	ble	cr1,.Ldo_tail
	ld	r0,8(r4)
	srd	r7,r0,r11
	or	r9,r7,r9
	b	.Ldo_tail
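
/*
 * Destination not 8-byte aligned: copy a leading byte, halfword and/or
 * word (selected by the cr7 bits of r6, the distance to the next 8-byte
 * boundary), advance both pointers, then rejoin the aligned path.
 */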
.Ldst_unaligned:
	mtcrf	0x01,r6		# put #bytes to 8B bdry into cr7
	subf	r5,r6,r5
	li	r7,0
	cmpldi	cr1,r5,16
	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r4
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
	lwzx	r0,r7,r4
	stwx	r0,r7,r3
3:	mtcrf	0x01,r5
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned
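
/*
 * Fewer than 16 bytes in total: the cr7 bits of the length select 8-,
 * 4-, 2- and 1-byte moves.
 */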
.Lshort_copy:
	bf	cr7*4+0,1f
	lwz	r0,0(r4)
	lwz	r9,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
	lbz	r0,0(r4)
	stb	r0,0(r3)
4:	blr