/* memset.c */
  /*#************************************************************************#*/
  /*#-------------------------------------------------------------------------*/
  /*# */
  /*# FUNCTION NAME: memset() */
  /*# */
  /*# PARAMETERS: void *pdst; Destination address. */
  /*# int c; Value of byte to write. */
  /*# size_t plen; Number of bytes to write. */
  /*# */
  /*# RETURNS: pdst. */
  /*# */
  /*# DESCRIPTION: Sets the memory at pdst, plen bytes long, to c, as standard. */
  /*# Framework taken from memcpy. This routine is */
  /*# very sensitive to compiler changes in register allocation. */
  /*# Should really be rewritten to avoid this problem. */
  /*# */
  /*#-------------------------------------------------------------------------*/
  /*# */
  /*# HISTORY */
  /*# */
  /*# DATE NAME CHANGES */
  /*# ---- ---- ------- */
  /*# 990713 HP Tired of watching this function (or */
  /*# really, the nonoptimized generic */
  /*# implementation) take up 90% of simulator */
  /*# output. Measurements needed. */
  /*# */
  /*#-------------------------------------------------------------------------*/
  #include <linux/types.h>
  /* Size in bytes of one block written by the 'movem' fast path in memset():
     12 registers of 4 bytes each.

     No, there's no macro saying 12*4, since it is "hard" to get it into
     the asm in a good way.  Thus better to expose the problem everywhere;
     the inline assembly spells out 12*4 literally and must be kept in sync
     with this value by hand.  */

  /* Assuming 1 cycle per dword written or read (ok, not really true), and
     one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1)
     so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set.  */
  #define ZERO_BLOCK_SIZE (1*12*4)
  37. void *memset(void *pdst,
  38. int c,
  39. size_t plen)
  40. {
  41. /* Ok. Now we want the parameters put in special registers.
  42. Make sure the compiler is able to make something useful of this. */
  43. register char *return_dst __asm__ ("r10") = pdst;
  44. register int n __asm__ ("r12") = plen;
  45. register int lc __asm__ ("r11") = c;
  46. /* Most apps use memset sanely. Only those memsetting about 3..4
  47. bytes or less get penalized compared to the generic implementation
  48. - and that's not really sane use. */
  49. /* Ugh. This is fragile at best. Check with newer GCC releases, if
  50. they compile cascaded "x |= x << 8" sanely! */
  51. __asm__("movu.b %0,$r13\n\t"
  52. "lslq 8,$r13\n\t"
  53. "move.b %0,$r13\n\t"
  54. "move.d $r13,%0\n\t"
  55. "lslq 16,$r13\n\t"
  56. "or.d $r13,%0"
  57. : "=r" (lc) : "0" (lc) : "r13");
  58. {
  59. register char *dst __asm__ ("r13") = pdst;
  60. /* This is NONPORTABLE, but since this whole routine is */
  61. /* grossly nonportable that doesn't matter. */
  62. if (((unsigned long) pdst & 3) != 0
  63. /* Oops! n=0 must be a legal call, regardless of alignment. */
  64. && n >= 3)
  65. {
  66. if ((unsigned long)dst & 1)
  67. {
  68. *dst = (char) lc;
  69. n--;
  70. dst++;
  71. }
  72. if ((unsigned long)dst & 2)
  73. {
  74. *(short *)dst = lc;
  75. n -= 2;
  76. dst += 2;
  77. }
  78. }
  79. /* Now the fun part. For the threshold value of this, check the equation
  80. above. */
  81. /* Decide which copying method to use. */
  82. if (n >= ZERO_BLOCK_SIZE)
  83. {
  84. /* For large copies we use 'movem' */
  85. /* It is not optimal to tell the compiler about clobbering any
  86. registers; that will move the saving/restoring of those registers
  87. to the function prologue/epilogue, and make non-movem sizes
  88. suboptimal.
  89. This method is not foolproof; it assumes that the "asm reg"
  90. declarations at the beginning of the function really are used
  91. here (beware: they may be moved to temporary registers).
  92. This way, we do not have to save/move the registers around into
  93. temporaries; we can safely use them straight away.
  94. If you want to check that the allocation was right; then
  95. check the equalities in the first comment. It should say
  96. "r13=r13, r12=r12, r11=r11" */
  97. __asm__ volatile ("
  98. ;; Check that the following is true (same register names on
  99. ;; both sides of equal sign, as in r8=r8):
  100. ;; %0=r13, %1=r12, %4=r11
  101. ;;
  102. ;; Save the registers we'll clobber in the movem process
  103. ;; on the stack. Don't mention them to gcc, it will only be
  104. ;; upset.
  105. subq 11*4,$sp
  106. movem $r10,[$sp]
  107. move.d $r11,$r0
  108. move.d $r11,$r1
  109. move.d $r11,$r2
  110. move.d $r11,$r3
  111. move.d $r11,$r4
  112. move.d $r11,$r5
  113. move.d $r11,$r6
  114. move.d $r11,$r7
  115. move.d $r11,$r8
  116. move.d $r11,$r9
  117. move.d $r11,$r10
  118. ;; Now we've got this:
  119. ;; r13 - dst
  120. ;; r12 - n
  121. ;; Update n for the first loop
  122. subq 12*4,$r12
  123. 0:
  124. subq 12*4,$r12
  125. bge 0b
  126. movem $r11,[$r13+]
  127. addq 12*4,$r12 ;; compensate for last loop underflowing n
  128. ;; Restore registers from stack
  129. movem [$sp+],$r10"
  130. /* Outputs */ : "=r" (dst), "=r" (n)
  131. /* Inputs */ : "0" (dst), "1" (n), "r" (lc));
  132. }
  133. /* Either we directly starts copying, using dword copying
  134. in a loop, or we copy as much as possible with 'movem'
  135. and then the last block (<44 bytes) is copied here.
  136. This will work since 'movem' will have updated src,dst,n. */
  137. while ( n >= 16 )
  138. {
  139. *((long*)dst)++ = lc;
  140. *((long*)dst)++ = lc;
  141. *((long*)dst)++ = lc;
  142. *((long*)dst)++ = lc;
  143. n -= 16;
  144. }
  145. /* A switch() is definitely the fastest although it takes a LOT of code.
  146. * Particularly if you inline code this.
  147. */
  148. switch (n)
  149. {
  150. case 0:
  151. break;
  152. case 1:
  153. *(char*)dst = (char) lc;
  154. break;
  155. case 2:
  156. *(short*)dst = (short) lc;
  157. break;
  158. case 3:
  159. *((short*)dst)++ = (short) lc;
  160. *(char*)dst = (char) lc;
  161. break;
  162. case 4:
  163. *((long*)dst)++ = lc;
  164. break;
  165. case 5:
  166. *((long*)dst)++ = lc;
  167. *(char*)dst = (char) lc;
  168. break;
  169. case 6:
  170. *((long*)dst)++ = lc;
  171. *(short*)dst = (short) lc;
  172. break;
  173. case 7:
  174. *((long*)dst)++ = lc;
  175. *((short*)dst)++ = (short) lc;
  176. *(char*)dst = (char) lc;
  177. break;
  178. case 8:
  179. *((long*)dst)++ = lc;
  180. *((long*)dst)++ = lc;
  181. break;
  182. case 9:
  183. *((long*)dst)++ = lc;
  184. *((long*)dst)++ = lc;
  185. *(char*)dst = (char) lc;
  186. break;
  187. case 10:
  188. *((long*)dst)++ = lc;
  189. *((long*)dst)++ = lc;
  190. *(short*)dst = (short) lc;
  191. break;
  192. case 11:
  193. *((long*)dst)++ = lc;
  194. *((long*)dst)++ = lc;
  195. *((short*)dst)++ = (short) lc;
  196. *(char*)dst = (char) lc;
  197. break;
  198. case 12:
  199. *((long*)dst)++ = lc;
  200. *((long*)dst)++ = lc;
  201. *((long*)dst)++ = lc;
  202. break;
  203. case 13:
  204. *((long*)dst)++ = lc;
  205. *((long*)dst)++ = lc;
  206. *((long*)dst)++ = lc;
  207. *(char*)dst = (char) lc;
  208. break;
  209. case 14:
  210. *((long*)dst)++ = lc;
  211. *((long*)dst)++ = lc;
  212. *((long*)dst)++ = lc;
  213. *(short*)dst = (short) lc;
  214. break;
  215. case 15:
  216. *((long*)dst)++ = lc;
  217. *((long*)dst)++ = lc;
  218. *((long*)dst)++ = lc;
  219. *((short*)dst)++ = (short) lc;
  220. *(char*)dst = (char) lc;
  221. break;
  222. }
  223. }
  224. return return_dst; /* destination pointer. */
  225. } /* memset() */