string.c 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
  1. /*#************************************************************************#*/
  2. /*#-------------------------------------------------------------------------*/
  3. /*# */
  4. /*# FUNCTION NAME: memcpy() */
  5. /*# */
  6. /*# PARAMETERS: void* dst; Destination address. */
  7. /*# void* src; Source address. */
  8. /*# int len; Number of bytes to copy. */
  9. /*# */
  10. /*# RETURNS: dst. */
  11. /*# */
  12. /*# DESCRIPTION: Copies len bytes of memory from src to dst. No guarantees */
  13. /*# about copying of overlapping memory areas. This routine is */
  14. /*# very sensitive to compiler changes in register allocation. */
  15. /*# Should really be rewritten to avoid this problem. */
  16. /*# */
  17. /*#-------------------------------------------------------------------------*/
  18. /*# */
  19. /*# HISTORY */
  20. /*# */
  21. /*# DATE NAME CHANGES */
  22. /*# ---- ---- ------- */
  23. /*# 941007 Kenny R Creation */
  24. /*# 941011 Kenny R Lots of optimizations and inlining. */
  25. /*# 941129 Ulf A Adapted for use in libc. */
  26. /*# 950216 HP N==0 forgotten if non-aligned src/dst. */
  27. /*# Added some optimizations. */
  28. /*# 001025 HP Make src and dst char *. Align dst to */
  29. /*# dword, not just word-if-both-src-and-dst- */
  30. /*# are-misaligned. */
  31. /*# */
  32. /*#-------------------------------------------------------------------------*/
  33. #include <linux/types.h>
  34. void *memcpy(void *pdst,
  35. const void *psrc,
  36. size_t pn)
  37. {
  38. /* Ok. Now we want the parameters put in special registers.
  39. Make sure the compiler is able to make something useful of this.
  40. As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
  41. If gcc was allright, it really would need no temporaries, and no
  42. stack space to save stuff on. */
  43. register void *return_dst __asm__ ("r10") = pdst;
  44. register char *dst __asm__ ("r13") = pdst;
  45. register const char *src __asm__ ("r11") = psrc;
  46. register int n __asm__ ("r12") = pn;
  47. /* When src is aligned but not dst, this makes a few extra needless
  48. cycles. I believe it would take as many to check that the
  49. re-alignment was unnecessary. */
  50. if (((unsigned long) dst & 3) != 0
  51. /* Don't align if we wouldn't copy more than a few bytes; so we
  52. don't have to check further for overflows. */
  53. && n >= 3)
  54. {
  55. if ((unsigned long) dst & 1)
  56. {
  57. n--;
  58. *(char*)dst = *(char*)src;
  59. src++;
  60. dst++;
  61. }
  62. if ((unsigned long) dst & 2)
  63. {
  64. n -= 2;
  65. *(short*)dst = *(short*)src;
  66. src += 2;
  67. dst += 2;
  68. }
  69. }
  70. /* Decide which copying method to use. */
  71. if (n >= 44*2) /* Break even between movem and
  72. move16 is at 38.7*2, but modulo 44. */
  73. {
  74. /* For large copies we use 'movem' */
  75. /* It is not optimal to tell the compiler about clobbering any
  76. registers; that will move the saving/restoring of those registers
  77. to the function prologue/epilogue, and make non-movem sizes
  78. suboptimal.
  79. This method is not foolproof; it assumes that the "asm reg"
  80. declarations at the beginning of the function really are used
  81. here (beware: they may be moved to temporary registers).
  82. This way, we do not have to save/move the registers around into
  83. temporaries; we can safely use them straight away.
  84. If you want to check that the allocation was right; then
  85. check the equalities in the first comment. It should say
  86. "r13=r13, r11=r11, r12=r12" */
  87. __asm__ volatile ("
  88. ;; Check that the following is true (same register names on
  89. ;; both sides of equal sign, as in r8=r8):
  90. ;; %0=r13, %1=r11, %2=r12
  91. ;;
  92. ;; Save the registers we'll use in the movem process
  93. ;; on the stack.
  94. subq 11*4,$sp
  95. movem $r10,[$sp]
  96. ;; Now we've got this:
  97. ;; r11 - src
  98. ;; r13 - dst
  99. ;; r12 - n
  100. ;; Update n for the first loop
  101. subq 44,$r12
  102. 0:
  103. movem [$r11+],$r10
  104. subq 44,$r12
  105. bge 0b
  106. movem $r10,[$r13+]
  107. addq 44,$r12 ;; compensate for last loop underflowing n
  108. ;; Restore registers from stack
  109. movem [$sp+],$r10"
  110. /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n)
  111. /* Inputs */ : "0" (dst), "1" (src), "2" (n));
  112. }
  113. /* Either we directly starts copying, using dword copying
  114. in a loop, or we copy as much as possible with 'movem'
  115. and then the last block (<44 bytes) is copied here.
  116. This will work since 'movem' will have updated src,dst,n. */
  117. while ( n >= 16 )
  118. {
  119. *((long*)dst)++ = *((long*)src)++;
  120. *((long*)dst)++ = *((long*)src)++;
  121. *((long*)dst)++ = *((long*)src)++;
  122. *((long*)dst)++ = *((long*)src)++;
  123. n -= 16;
  124. }
  125. /* A switch() is definitely the fastest although it takes a LOT of code.
  126. * Particularly if you inline code this.
  127. */
  128. switch (n)
  129. {
  130. case 0:
  131. break;
  132. case 1:
  133. *(char*)dst = *(char*)src;
  134. break;
  135. case 2:
  136. *(short*)dst = *(short*)src;
  137. break;
  138. case 3:
  139. *((short*)dst)++ = *((short*)src)++;
  140. *(char*)dst = *(char*)src;
  141. break;
  142. case 4:
  143. *((long*)dst)++ = *((long*)src)++;
  144. break;
  145. case 5:
  146. *((long*)dst)++ = *((long*)src)++;
  147. *(char*)dst = *(char*)src;
  148. break;
  149. case 6:
  150. *((long*)dst)++ = *((long*)src)++;
  151. *(short*)dst = *(short*)src;
  152. break;
  153. case 7:
  154. *((long*)dst)++ = *((long*)src)++;
  155. *((short*)dst)++ = *((short*)src)++;
  156. *(char*)dst = *(char*)src;
  157. break;
  158. case 8:
  159. *((long*)dst)++ = *((long*)src)++;
  160. *((long*)dst)++ = *((long*)src)++;
  161. break;
  162. case 9:
  163. *((long*)dst)++ = *((long*)src)++;
  164. *((long*)dst)++ = *((long*)src)++;
  165. *(char*)dst = *(char*)src;
  166. break;
  167. case 10:
  168. *((long*)dst)++ = *((long*)src)++;
  169. *((long*)dst)++ = *((long*)src)++;
  170. *(short*)dst = *(short*)src;
  171. break;
  172. case 11:
  173. *((long*)dst)++ = *((long*)src)++;
  174. *((long*)dst)++ = *((long*)src)++;
  175. *((short*)dst)++ = *((short*)src)++;
  176. *(char*)dst = *(char*)src;
  177. break;
  178. case 12:
  179. *((long*)dst)++ = *((long*)src)++;
  180. *((long*)dst)++ = *((long*)src)++;
  181. *((long*)dst)++ = *((long*)src)++;
  182. break;
  183. case 13:
  184. *((long*)dst)++ = *((long*)src)++;
  185. *((long*)dst)++ = *((long*)src)++;
  186. *((long*)dst)++ = *((long*)src)++;
  187. *(char*)dst = *(char*)src;
  188. break;
  189. case 14:
  190. *((long*)dst)++ = *((long*)src)++;
  191. *((long*)dst)++ = *((long*)src)++;
  192. *((long*)dst)++ = *((long*)src)++;
  193. *(short*)dst = *(short*)src;
  194. break;
  195. case 15:
  196. *((long*)dst)++ = *((long*)src)++;
  197. *((long*)dst)++ = *((long*)src)++;
  198. *((long*)dst)++ = *((long*)src)++;
  199. *((short*)dst)++ = *((short*)src)++;
  200. *(char*)dst = *(char*)src;
  201. break;
  202. }
  203. return return_dst; /* destination pointer. */
  204. } /* memcpy() */