string.c 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  /*#************************************************************************#*/
  /*#-------------------------------------------------------------------------*/
  /*#                                                                         */
  /*# FUNCTION NAME: memcpy()                                                 */
  /*#                                                                         */
  /*# PARAMETERS:  void *pdst;        Destination address.                    */
  /*#              const void *psrc;  Source address.                         */
  /*#              size_t pn;         Number of bytes to copy.                */
  /*#                                                                         */
  /*# RETURNS:     pdst.                                                      */
  /*#                                                                         */
  /*# DESCRIPTION: Copies pn bytes of memory from psrc to pdst. No guarantees */
  /*#              about copying of overlapping memory areas. This routine is */
  /*#              very sensitive to compiler changes in register allocation. */
  /*#              Should really be rewritten to avoid this problem.          */
  /*#                                                                         */
  /*#-------------------------------------------------------------------------*/
  /*#                                                                         */
  /*# HISTORY                                                                 */
  /*#                                                                         */
  /*# DATE    NAME          CHANGES                                           */
  /*# ----    ----          -------                                           */
  /*# 941007  Kenny R       Creation                                          */
  /*# 941011  Kenny R       Lots of optimizations and inlining.               */
  /*# 941129  Ulf A         Adapted for use in libc.                          */
  /*# 950216  HP            N==0 forgotten if non-aligned src/dst.            */
  /*#                       Added some optimizations.                         */
  /*# 001025  HP            Make src and dst char *. Align dst to             */
  /*#                       dword, not just word-if-both-src-and-dst-         */
  /*#                       are-misaligned.                                   */
  /*#                                                                         */
  /*#-------------------------------------------------------------------------*/
  33. #include <linux/types.h>
  34. void *memcpy(void *pdst,
  35. const void *psrc,
  36. size_t pn)
  37. {
  38. /* Ok. Now we want the parameters put in special registers.
  39. Make sure the compiler is able to make something useful of this.
  40. As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
  41. If gcc was allright, it really would need no temporaries, and no
  42. stack space to save stuff on. */
  43. register void *return_dst __asm__ ("r10") = pdst;
  44. register char *dst __asm__ ("r13") = pdst;
  45. register const char *src __asm__ ("r11") = psrc;
  46. register int n __asm__ ("r12") = pn;
  47. /* When src is aligned but not dst, this makes a few extra needless
  48. cycles. I believe it would take as many to check that the
  49. re-alignment was unnecessary. */
  50. if (((unsigned long) dst & 3) != 0
  51. /* Don't align if we wouldn't copy more than a few bytes; so we
  52. don't have to check further for overflows. */
  53. && n >= 3)
  54. {
  55. if ((unsigned long) dst & 1)
  56. {
  57. n--;
  58. *(char*)dst = *(char*)src;
  59. src++;
  60. dst++;
  61. }
  62. if ((unsigned long) dst & 2)
  63. {
  64. n -= 2;
  65. *(short*)dst = *(short*)src;
  66. src += 2;
  67. dst += 2;
  68. }
  69. }
  70. /* Decide which copying method to use. Movem is dirt cheap, so the
  71. overheap is low enough to always use the minimum block size as the
  72. threshold. */
  73. if (n >= 44)
  74. {
  75. /* For large copies we use 'movem' */
  76. /* It is not optimal to tell the compiler about clobbering any
  77. registers; that will move the saving/restoring of those registers
  78. to the function prologue/epilogue, and make non-movem sizes
  79. suboptimal. */
  80. __asm__ volatile (" \n\
  81. ;; Check that the register asm declaration got right. \n\
  82. ;; The GCC manual explicitly says TRT will happen. \n\
  83. .ifnc %0-%1-%2,$r13-$r11-$r12 \n\
  84. .err \n\
  85. .endif \n\
  86. \n\
  87. ;; Save the registers we'll use in the movem process \n\
  88. \n\
  89. ;; on the stack. \n\
  90. subq 11*4,$sp \n\
  91. movem $r10,[$sp] \n\
  92. \n\
  93. ;; Now we've got this: \n\
  94. ;; r11 - src \n\
  95. ;; r13 - dst \n\
  96. ;; r12 - n \n\
  97. \n\
  98. ;; Update n for the first loop \n\
  99. subq 44,$r12 \n\
  100. 0: \n\
  101. movem [$r11+],$r10 \n\
  102. subq 44,$r12 \n\
  103. bge 0b \n\
  104. movem $r10,[$r13+] \n\
  105. \n\
  106. addq 44,$r12 ;; compensate for last loop underflowing n \n\
  107. \n\
  108. ;; Restore registers from stack \n\
  109. movem [$sp+],$r10"
  110. /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n)
  111. /* Inputs */ : "0" (dst), "1" (src), "2" (n));
  112. }
  113. /* Either we directly starts copying, using dword copying
  114. in a loop, or we copy as much as possible with 'movem'
  115. and then the last block (<44 bytes) is copied here.
  116. This will work since 'movem' will have updated src,dst,n. */
  117. while ( n >= 16 )
  118. {
  119. *((long*)dst)++ = *((long*)src)++;
  120. *((long*)dst)++ = *((long*)src)++;
  121. *((long*)dst)++ = *((long*)src)++;
  122. *((long*)dst)++ = *((long*)src)++;
  123. n -= 16;
  124. }
  125. /* A switch() is definitely the fastest although it takes a LOT of code.
  126. * Particularly if you inline code this.
  127. */
  128. switch (n)
  129. {
  130. case 0:
  131. break;
  132. case 1:
  133. *(char*)dst = *(char*)src;
  134. break;
  135. case 2:
  136. *(short*)dst = *(short*)src;
  137. break;
  138. case 3:
  139. *((short*)dst)++ = *((short*)src)++;
  140. *(char*)dst = *(char*)src;
  141. break;
  142. case 4:
  143. *((long*)dst)++ = *((long*)src)++;
  144. break;
  145. case 5:
  146. *((long*)dst)++ = *((long*)src)++;
  147. *(char*)dst = *(char*)src;
  148. break;
  149. case 6:
  150. *((long*)dst)++ = *((long*)src)++;
  151. *(short*)dst = *(short*)src;
  152. break;
  153. case 7:
  154. *((long*)dst)++ = *((long*)src)++;
  155. *((short*)dst)++ = *((short*)src)++;
  156. *(char*)dst = *(char*)src;
  157. break;
  158. case 8:
  159. *((long*)dst)++ = *((long*)src)++;
  160. *((long*)dst)++ = *((long*)src)++;
  161. break;
  162. case 9:
  163. *((long*)dst)++ = *((long*)src)++;
  164. *((long*)dst)++ = *((long*)src)++;
  165. *(char*)dst = *(char*)src;
  166. break;
  167. case 10:
  168. *((long*)dst)++ = *((long*)src)++;
  169. *((long*)dst)++ = *((long*)src)++;
  170. *(short*)dst = *(short*)src;
  171. break;
  172. case 11:
  173. *((long*)dst)++ = *((long*)src)++;
  174. *((long*)dst)++ = *((long*)src)++;
  175. *((short*)dst)++ = *((short*)src)++;
  176. *(char*)dst = *(char*)src;
  177. break;
  178. case 12:
  179. *((long*)dst)++ = *((long*)src)++;
  180. *((long*)dst)++ = *((long*)src)++;
  181. *((long*)dst)++ = *((long*)src)++;
  182. break;
  183. case 13:
  184. *((long*)dst)++ = *((long*)src)++;
  185. *((long*)dst)++ = *((long*)src)++;
  186. *((long*)dst)++ = *((long*)src)++;
  187. *(char*)dst = *(char*)src;
  188. break;
  189. case 14:
  190. *((long*)dst)++ = *((long*)src)++;
  191. *((long*)dst)++ = *((long*)src)++;
  192. *((long*)dst)++ = *((long*)src)++;
  193. *(short*)dst = *(short*)src;
  194. break;
  195. case 15:
  196. *((long*)dst)++ = *((long*)src)++;
  197. *((long*)dst)++ = *((long*)src)++;
  198. *((long*)dst)++ = *((long*)src)++;
  199. *((short*)dst)++ = *((short*)src)++;
  200. *(char*)dst = *(char*)src;
  201. break;
  202. }
  203. return return_dst; /* destination pointer. */
  204. } /* memcpy() */