memset.c 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. /*#************************************************************************#*/
  2. /*#-------------------------------------------------------------------------*/
  3. /*# */
  4. /*# FUNCTION NAME: memset() */
  5. /*# */
  6. /*# PARAMETERS: void* pdst; Destination address. */
  7. /*# int c; Value of byte to write. */
  8. /*# size_t plen; Number of bytes to write. */
  9. /*# */
  10. /*# RETURNS: dst. */
  11. /*# */
  12. /*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard. */
  13. /*# Framework taken from memcpy. This routine is */
  14. /*# very sensitive to compiler changes in register allocation. */
  15. /*# Should really be rewritten to avoid this problem. */
  16. /*# */
  17. /*#-------------------------------------------------------------------------*/
  18. /*# */
  19. /*# HISTORY */
  20. /*# */
  21. /*# DATE NAME CHANGES */
  22. /*# ---- ---- ------- */
  23. /*# 990713 HP Tired of watching this function (or */
  24. /*# really, the nonoptimized generic */
  25. /*# implementation) take up 90% of simulator */
  26. /*# output. Measurements needed. */
  27. /*# */
  28. /*#-------------------------------------------------------------------------*/
  29. #include <linux/types.h>
  30. /* No, there's no macro saying 12*4, since it is "hard" to get it into
  31. the asm in a good way. Thus better to expose the problem everywhere.
  32. */
  33. /* Assuming 1 cycle per dword written or read (ok, not really true), and
  34. one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1)
  35. so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. */
  36. #define ZERO_BLOCK_SIZE (1*12*4) /* 48 bytes (12 dwords); memset() takes its movem block loop only when at least this many bytes remain. The asm itself spells the size as a literal 12*4, so keep the two in sync. */
  37. void *memset(void *pdst,
  38. int c,
  39. size_t plen)
  40. {
  41. /* Ok. Now we want the parameters put in special registers.
  42. Make sure the compiler is able to make something useful of this. */
  43. register char *return_dst __asm__ ("r10") = pdst;
  44. register int n __asm__ ("r12") = plen;
  45. register int lc __asm__ ("r11") = c;
  46. /* Most apps use memset sanely. Only those memsetting about 3..4
  47. bytes or less get penalized compared to the generic implementation
  48. - and that's not really sane use. */
  49. /* Ugh. This is fragile at best. Check with newer GCC releases, if
  50. they compile cascaded "x |= x << 8" sanely! */
  51. __asm__("movu.b %0,$r13 \n\
  52. lslq 8,$r13 \n\
  53. move.b %0,$r13 \n\
  54. move.d $r13,%0 \n\
  55. lslq 16,$r13 \n\
  56. or.d $r13,%0"
  57. : "=r" (lc) : "0" (lc) : "r13");
  58. {
  59. register char *dst __asm__ ("r13") = pdst;
  60. /* This is NONPORTABLE, but since this whole routine is */
  61. /* grossly nonportable that doesn't matter. */
  62. if (((unsigned long) pdst & 3) != 0
  63. /* Oops! n=0 must be a legal call, regardless of alignment. */
  64. && n >= 3)
  65. {
  66. if ((unsigned long)dst & 1)
  67. {
  68. *dst = (char) lc;
  69. n--;
  70. dst++;
  71. }
  72. if ((unsigned long)dst & 2)
  73. {
  74. *(short *)dst = lc;
  75. n -= 2;
  76. dst += 2;
  77. }
  78. }
  79. /* Now the fun part. For the threshold value of this, check the equation
  80. above. */
  81. /* Decide which copying method to use. */
  82. if (n >= ZERO_BLOCK_SIZE)
  83. {
  84. /* For large copies we use 'movem' */
  85. /* It is not optimal to tell the compiler about clobbering any
  86. registers; that will move the saving/restoring of those registers
  87. to the function prologue/epilogue, and make non-movem sizes
  88. suboptimal.
  89. This method is not foolproof; it assumes that the "asm reg"
  90. declarations at the beginning of the function really are used
  91. here (beware: they may be moved to temporary registers).
  92. This way, we do not have to save/move the registers around into
  93. temporaries; we can safely use them straight away.
  94. If you want to check that the allocation was right; then
  95. check the equalities in the first comment. It should say
  96. "r13=r13, r12=r12, r11=r11" */
  97. __asm__ volatile (" \n\
  98. ;; Check that the register asm declaration got right. \n\
  99. ;; The GCC manual says it will work, but there *has* been bugs. \n\
  100. .ifnc %0-%1-%4,$r13-$r12-$r11 \n\
  101. .err \n\
  102. .endif \n\
  103. \n\
  104. ;; Save the registers we'll clobber in the movem process \n\
  105. ;; on the stack. Don't mention them to gcc, it will only be \n\
  106. ;; upset. \n\
  107. subq 11*4,$sp \n\
  108. movem $r10,[$sp] \n\
  109. \n\
  110. move.d $r11,$r0 \n\
  111. move.d $r11,$r1 \n\
  112. move.d $r11,$r2 \n\
  113. move.d $r11,$r3 \n\
  114. move.d $r11,$r4 \n\
  115. move.d $r11,$r5 \n\
  116. move.d $r11,$r6 \n\
  117. move.d $r11,$r7 \n\
  118. move.d $r11,$r8 \n\
  119. move.d $r11,$r9 \n\
  120. move.d $r11,$r10 \n\
  121. \n\
  122. ;; Now we've got this: \n\
  123. ;; r13 - dst \n\
  124. ;; r12 - n \n\
  125. \n\
  126. ;; Update n for the first loop \n\
  127. subq 12*4,$r12 \n\
  128. 0: \n\
  129. subq 12*4,$r12 \n\
  130. bge 0b \n\
  131. movem $r11,[$r13+] \n\
  132. \n\
  133. addq 12*4,$r12 ;; compensate for last loop underflowing n \n\
  134. \n\
  135. ;; Restore registers from stack \n\
  136. movem [$sp+],$r10"
  137. /* Outputs */ : "=r" (dst), "=r" (n)
  138. /* Inputs */ : "0" (dst), "1" (n), "r" (lc));
  139. }
  140. /* Either we directly starts copying, using dword copying
  141. in a loop, or we copy as much as possible with 'movem'
  142. and then the last block (<44 bytes) is copied here.
  143. This will work since 'movem' will have updated src,dst,n. */
  144. while ( n >= 16 )
  145. {
  146. *((long*)dst)++ = lc;
  147. *((long*)dst)++ = lc;
  148. *((long*)dst)++ = lc;
  149. *((long*)dst)++ = lc;
  150. n -= 16;
  151. }
  152. /* A switch() is definitely the fastest although it takes a LOT of code.
  153. * Particularly if you inline code this.
  154. */
  155. switch (n)
  156. {
  157. case 0:
  158. break;
  159. case 1:
  160. *(char*)dst = (char) lc;
  161. break;
  162. case 2:
  163. *(short*)dst = (short) lc;
  164. break;
  165. case 3:
  166. *((short*)dst)++ = (short) lc;
  167. *(char*)dst = (char) lc;
  168. break;
  169. case 4:
  170. *((long*)dst)++ = lc;
  171. break;
  172. case 5:
  173. *((long*)dst)++ = lc;
  174. *(char*)dst = (char) lc;
  175. break;
  176. case 6:
  177. *((long*)dst)++ = lc;
  178. *(short*)dst = (short) lc;
  179. break;
  180. case 7:
  181. *((long*)dst)++ = lc;
  182. *((short*)dst)++ = (short) lc;
  183. *(char*)dst = (char) lc;
  184. break;
  185. case 8:
  186. *((long*)dst)++ = lc;
  187. *((long*)dst)++ = lc;
  188. break;
  189. case 9:
  190. *((long*)dst)++ = lc;
  191. *((long*)dst)++ = lc;
  192. *(char*)dst = (char) lc;
  193. break;
  194. case 10:
  195. *((long*)dst)++ = lc;
  196. *((long*)dst)++ = lc;
  197. *(short*)dst = (short) lc;
  198. break;
  199. case 11:
  200. *((long*)dst)++ = lc;
  201. *((long*)dst)++ = lc;
  202. *((short*)dst)++ = (short) lc;
  203. *(char*)dst = (char) lc;
  204. break;
  205. case 12:
  206. *((long*)dst)++ = lc;
  207. *((long*)dst)++ = lc;
  208. *((long*)dst)++ = lc;
  209. break;
  210. case 13:
  211. *((long*)dst)++ = lc;
  212. *((long*)dst)++ = lc;
  213. *((long*)dst)++ = lc;
  214. *(char*)dst = (char) lc;
  215. break;
  216. case 14:
  217. *((long*)dst)++ = lc;
  218. *((long*)dst)++ = lc;
  219. *((long*)dst)++ = lc;
  220. *(short*)dst = (short) lc;
  221. break;
  222. case 15:
  223. *((long*)dst)++ = lc;
  224. *((long*)dst)++ = lc;
  225. *((long*)dst)++ = lc;
  226. *((short*)dst)++ = (short) lc;
  227. *(char*)dst = (char) lc;
  228. break;
  229. }
  230. }
  231. return return_dst; /* destination pointer. */
  232. } /* memset() */