mmx_32.c

#include <linux/types.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/module.h>

#include <asm/asm.h>
#include <asm/i387.h>

/*
 * MMX 3DNow! library helper functions
 *
 * To do:
 *  We can use MMX just for prefetch in IRQs. This may be a win.
 *      (reported so on K6-III)
 *  We should use a better code-neutral filler for the short jump:
 *      "leal ebx,[ebx]" is apparently best for K6-2, but Cyrix??
 *  We also want to clobber the filler register so we don't get any
 *      register forwarding stalls on the filler.
 *
 *  Add *user handling. Checksums are not a win with MMX on any CPU
 *      tested so far for any MMX solution figured.
 *
 * 22/09/2000 - Arjan van de Ven
 *      Improved for non-engineering-sample Athlons
 */

void *_mmx_memcpy(void *to, const void *from, size_t len)
{
        void *p;
        int i;

        if (unlikely(in_interrupt()))
                return __memcpy(to, from, len);

        p = to;
        i = len >> 6;   /* len/64 */

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "1: prefetch (%0)\n"    /* This set is 28 bytes */
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2:  \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b)
                : : "r" (from));
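
        /*
         * How the fixup above works: if one of the five prefetches at
         * label 1 takes a fault, _ASM_EXTABLE() sends control to label 3,
         * which patches the first prefetch with the word 0x1AEB.  Those
         * bytes (EB 1A) are a two-byte short "jmp +26" that hops over the
         * remaining 26 bytes of the 28-byte prefetch block, so the
         * prefetches are skipped on subsequent passes; execution then
         * resumes at label 2.  The copy loops below use the same trick
         * with a "jmp +5" (0x05EB) over their single prefetch.
         */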

        for ( ; i > 5; i--) {
                __asm__ __volatile__ (
                "1: prefetch 320(%0)\n"
                "2: movq (%0), %%mm0\n"
                "   movq 8(%0), %%mm1\n"
                "   movq 16(%0), %%mm2\n"
                "   movq 24(%0), %%mm3\n"
                "   movq %%mm0, (%1)\n"
                "   movq %%mm1, 8(%1)\n"
                "   movq %%mm2, 16(%1)\n"
                "   movq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm0\n"
                "   movq 40(%0), %%mm1\n"
                "   movq 48(%0), %%mm2\n"
                "   movq 56(%0), %%mm3\n"
                "   movq %%mm0, 32(%1)\n"
                "   movq %%mm1, 40(%1)\n"
                "   movq %%mm2, 48(%1)\n"
                "   movq %%mm3, 56(%1)\n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
                "   jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b)
                : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }

        for ( ; i > 0; i--) {
                __asm__ __volatile__ (
                "   movq (%0), %%mm0\n"
                "   movq 8(%0), %%mm1\n"
                "   movq 16(%0), %%mm2\n"
                "   movq 24(%0), %%mm3\n"
                "   movq %%mm0, (%1)\n"
                "   movq %%mm1, 8(%1)\n"
                "   movq %%mm2, 16(%1)\n"
                "   movq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm0\n"
                "   movq 40(%0), %%mm1\n"
                "   movq 48(%0), %%mm2\n"
                "   movq 56(%0), %%mm3\n"
                "   movq %%mm0, 32(%1)\n"
                "   movq %%mm1, 40(%1)\n"
                "   movq %%mm2, 48(%1)\n"
                "   movq %%mm3, 56(%1)\n"
                : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }

        /*
         * Now do the tail of the block:
         */
        __memcpy(to, from, len & 63);
        kernel_fpu_end();

        return p;
}
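
/*
 * _mmx_memcpy() behaves like memcpy(): it returns the destination pointer,
 * copies the bulk in 64-byte MMX chunks and hands the tail (len & 63) to
 * __memcpy().  Because it must not touch the FPU/MMX state from interrupt
 * context (note the in_interrupt() check above), it falls back to a plain
 * __memcpy() there.  A caller (hypothetical, not part of this file) would
 * use it exactly like memcpy():
 *
 *      static void copy_block(void *dst, const void *src, size_t n)
 *      {
 *              _mmx_memcpy(dst, src, n);
 *      }
 */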

#ifdef CONFIG_MK7

/*
 * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
 * other MMX-using processors do not.
 */
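
/*
 * "Streaming cache bypass" here refers to the movntq non-temporal store
 * used below: it writes through the write-combining buffers without
 * pulling the destination cache lines into the cache, which is why the
 * K7 variants pair it with a closing sfence, while the generic variants
 * further down use ordinary movq stores.
 */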
static void fast_clear_page(void *page)
{
        int i;

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "  pxor %%mm0, %%mm0\n" : :
        );

        for (i = 0; i < 4096/64; i++) {
                __asm__ __volatile__ (
                "  movntq %%mm0, (%0)\n"
                "  movntq %%mm0, 8(%0)\n"
                "  movntq %%mm0, 16(%0)\n"
                "  movntq %%mm0, 24(%0)\n"
                "  movntq %%mm0, 32(%0)\n"
                "  movntq %%mm0, 40(%0)\n"
                "  movntq %%mm0, 48(%0)\n"
                "  movntq %%mm0, 56(%0)\n"
                : : "r" (page) : "memory");
                page += 64;
        }

        /*
         * Since movntq is weakly-ordered, an "sfence" is needed to order
         * the stores with respect to what follows.
         */
        __asm__ __volatile__ (
                "  sfence\n" : :
        );

        kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
        int i;

        kernel_fpu_begin();

        /*
         * Maybe the prefetch stuff can go before the expensive fnsave...
         * but that is for later. -AV
         */
        __asm__ __volatile__ (
                "1: prefetch (%0)\n"
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2:  \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b)
                : : "r" (from));

        for (i = 0; i < (4096 - 320) / 64; i++) {
                __asm__ __volatile__ (
                "1: prefetch 320(%0)\n"
                "2: movq (%0), %%mm0\n"
                "   movntq %%mm0, (%1)\n"
                "   movq 8(%0), %%mm1\n"
                "   movntq %%mm1, 8(%1)\n"
                "   movq 16(%0), %%mm2\n"
                "   movntq %%mm2, 16(%1)\n"
                "   movq 24(%0), %%mm3\n"
                "   movntq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm4\n"
                "   movntq %%mm4, 32(%1)\n"
                "   movq 40(%0), %%mm5\n"
                "   movntq %%mm5, 40(%1)\n"
                "   movq 48(%0), %%mm6\n"
                "   movntq %%mm6, 48(%1)\n"
                "   movq 56(%0), %%mm7\n"
                "   movntq %%mm7, 56(%1)\n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
                "   jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b)
                : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }

        for (i = (4096 - 320) / 64; i < 4096 / 64; i++) {
                __asm__ __volatile__ (
                "2: movq (%0), %%mm0\n"
                "   movntq %%mm0, (%1)\n"
                "   movq 8(%0), %%mm1\n"
                "   movntq %%mm1, 8(%1)\n"
                "   movq 16(%0), %%mm2\n"
                "   movntq %%mm2, 16(%1)\n"
                "   movq 24(%0), %%mm3\n"
                "   movntq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm4\n"
                "   movntq %%mm4, 32(%1)\n"
                "   movq 40(%0), %%mm5\n"
                "   movntq %%mm5, 40(%1)\n"
                "   movq 48(%0), %%mm6\n"
                "   movntq %%mm6, 48(%1)\n"
                "   movq 56(%0), %%mm7\n"
                "   movntq %%mm7, 56(%1)\n"
                : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }

        /*
         * Since movntq is weakly-ordered, an "sfence" is needed to order
         * the stores with respect to what follows.
         */
        __asm__ __volatile__ (
                "  sfence\n" : :
        );

        kernel_fpu_end();
}

#else

/*
 * Generic MMX implementation without K7 specific streaming.
 */
static void fast_clear_page(void *page)
{
        int i;

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "  pxor %%mm0, %%mm0\n" : :
        );

        for (i = 0; i < 4096/128; i++) {
                __asm__ __volatile__ (
                "  movq %%mm0, (%0)\n"
                "  movq %%mm0, 8(%0)\n"
                "  movq %%mm0, 16(%0)\n"
                "  movq %%mm0, 24(%0)\n"
                "  movq %%mm0, 32(%0)\n"
                "  movq %%mm0, 40(%0)\n"
                "  movq %%mm0, 48(%0)\n"
                "  movq %%mm0, 56(%0)\n"
                "  movq %%mm0, 64(%0)\n"
                "  movq %%mm0, 72(%0)\n"
                "  movq %%mm0, 80(%0)\n"
                "  movq %%mm0, 88(%0)\n"
                "  movq %%mm0, 96(%0)\n"
                "  movq %%mm0, 104(%0)\n"
                "  movq %%mm0, 112(%0)\n"
                "  movq %%mm0, 120(%0)\n"
                : : "r" (page) : "memory");
                page += 128;
        }

        kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
        int i;

        kernel_fpu_begin();

        __asm__ __volatile__ (
                "1: prefetch (%0)\n"
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2:  \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b)
                : : "r" (from));

        for (i = 0; i < 4096/64; i++) {
                __asm__ __volatile__ (
                "1: prefetch 320(%0)\n"
                "2: movq (%0), %%mm0\n"
                "   movq 8(%0), %%mm1\n"
                "   movq 16(%0), %%mm2\n"
                "   movq 24(%0), %%mm3\n"
                "   movq %%mm0, (%1)\n"
                "   movq %%mm1, 8(%1)\n"
                "   movq %%mm2, 16(%1)\n"
                "   movq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm0\n"
                "   movq 40(%0), %%mm1\n"
                "   movq 48(%0), %%mm2\n"
                "   movq 56(%0), %%mm3\n"
                "   movq %%mm0, 32(%1)\n"
                "   movq %%mm1, 40(%1)\n"
                "   movq %%mm2, 48(%1)\n"
                "   movq %%mm3, 56(%1)\n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
                "   jmp 2b\n"
                ".previous\n"
                _ASM_EXTABLE(1b, 3b)
                : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }

        kernel_fpu_end();
}

#endif

/*
 * Favour MMX for page clear and copy.
 */
static void slow_zero_page(void *page)
{
        int d0, d1;

        __asm__ __volatile__(
                "cld\n\t"
                "rep ; stosl"
                : "=&c" (d0), "=&D" (d1)
                : "a" (0), "1" (page), "0" (1024)
                : "memory");
}
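
/*
 * slow_zero_page() stores 1024 zero dwords with "rep ; stosl" (ECX = 1024,
 * EAX = 0), i.e. exactly one 4096-byte page; slow_copy_page() below moves
 * 1024 dwords the same way with "rep ; movsl".  Neither touches MMX state,
 * so both are safe to use from interrupt context.
 */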

void mmx_clear_page(void *page)
{
        if (unlikely(in_interrupt()))
                slow_zero_page(page);
        else
                fast_clear_page(page);
}

static void slow_copy_page(void *to, void *from)
{
        int d0, d1, d2;

        __asm__ __volatile__(
                "cld\n\t"
                "rep ; movsl"
                : "=&c" (d0), "=&D" (d1), "=&S" (d2)
                : "0" (1024), "1" ((long) to), "2" ((long) from)
                : "memory");
}

void mmx_copy_page(void *to, void *from)
{
        if (unlikely(in_interrupt()))
                slow_copy_page(to, from);
        else
                fast_copy_page(to, from);
}
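
/*
 * The EXPORT_SYMBOL()s below make these helpers available to modules.  On
 * 32-bit kernels built with 3DNow! page operations (CONFIG_X86_USE_3DNOW)
 * the arch headers typically route the generic page primitives through
 * them, along the lines of (illustrative sketch, not part of this file):
 *
 *      static inline void clear_page(void *page)
 *      {
 *              mmx_clear_page(page);
 *      }
 *
 *      static inline void copy_page(void *to, void *from)
 *      {
 *              mmx_copy_page(to, from);
 *      }
 */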

EXPORT_SYMBOL(_mmx_memcpy);
EXPORT_SYMBOL(mmx_clear_page);
EXPORT_SYMBOL(mmx_copy_page);